From 4ec0b0d453c58fe6257ed0af34a86a83d5b058cd Mon Sep 17 00:00:00 2001 From: erikhuck Date: Wed, 22 May 2024 10:10:22 -0400 Subject: [PATCH 1/9] Continues work on the next release --- docs/conf.py | 4 ++-- docs/notebook/tutorial.ipynb | 6 +++--- docs/tutorial.rst | 20 ++++++++--------- src/gpu_tracker/__main__.py | 1 + src/gpu_tracker/tracker.py | 42 +++++++++++++++++++++++++++--------- tests/test_cli.py | 12 +++++++++++ 6 files changed, 60 insertions(+), 25 deletions(-) create mode 100644 tests/test_cli.py diff --git a/docs/conf.py b/docs/conf.py index b605b15..966388d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,8 +14,8 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = 'gpu_tracker' -copyright = '2024, Erik Huckvale, Hunter Moseley' -author = 'Erik Huckvale, Hunter Moseley' +copyright = '2024, Erik Huckvale, Hunter N. B. Moseley' +author = 'Erik Huckvale, Hunter N. B. Moseley' version = __version__ release = __version__ diff --git a/docs/notebook/tutorial.ipynb b/docs/notebook/tutorial.ipynb index 0b1b5e2..a07555c 100644 --- a/docs/notebook/tutorial.ipynb +++ b/docs/notebook/tutorial.ipynb @@ -21,7 +21,7 @@ "id": "2bb9e84a-8523-4e5f-bc01-1d6b234c19a6", "metadata": {}, "source": [ - "The `gpu_tracker` package provides the `Tracker` class which uses a subprocess to measure computational resource usage, namely the compute time, CPU utilization, maximum RAM used, and maximum GPU RAM used. The `start()` method starts this process which tracks usage in the background. After calling `start()`, one can write the code for which resource usage is measured, followed by calling the `stop()` method. The compute time will be the time from the call to `start()` to the call to `stop()` and the RAM, GPU RAM, and CPU utilization quantities will be the respective computational resources used by the code that's in between `start()` and `stop()`." 
+ "The `gpu_tracker` package provides the `Tracker` class which uses a subprocess to measure computational resource usage, namely the compute time, maximum CPU utilization, mean CPU utilization, maximum RAM used, and maximum GPU RAM used. The `start()` method starts this process which tracks usage in the background. After calling `start()`, one can write the code for which resource usage is measured, followed by calling the `stop()` method. The compute time will be the time from the call to `start()` to the call to `stop()` and the RAM, GPU RAM, and CPU utilization quantities will be the respective computational resources used by the code that's in between `start()` and `stop()`." ] }, { @@ -131,9 +131,9 @@ "source": [ "The output is organized by computational resource followed by information specific to that resource. The system capacity is a constant for the total RAM capacity across the entire operating system. There is a system capacity field both for RAM and GPU RAM. This is not to be confused with the system field, which measures the maximum RAM / GPU RAM (operating system wide) that was actually used over the duration of the computational-resource tracking. Both the RAM and GPU RAM have 3 additional fields, namely the usage of the main process itself followed by the summed usage of any descendent processes it may have (i.e. child processes, grandchild processes, etc.), and combined usage which is the sum of the main and its descendent processes. RAM is divided further to include the private RSS (RAM usage unique to the process), shared RSS (RAM that's shared by a process and at least one other process), and total RSS (the sum of private and shared RSS). The private and shared RSS values are only available on Linux distributions. So for non-linux operating systems, the private and shared RSS will remain 0 and only the total RSS will be reported. 
Theoretically, the combined total RSS would never exceed the overall system RAM usage, but inaccuracies resulting from shared RSS can cause this to happen, especially for non-linux operating systems (see note below).\n", "\n", - "The `Tracker` assumes that GPU memory is not shared accross multiple processes and if it is, the reported GPU RAM of \"descendent\" and \"combined\" may be an overestimation.\n", + "The `Tracker` assumes that GPU memory is not shared across multiple processes and if it is, the reported GPU RAM of \"descendent\" and \"combined\" may be an overestimation.\n", "\n", - "The CPU utilization includes the system core count field which is the total number of cores available system-wide. Utilization is measured for the main process, its descendents, the main process and its descendents combined, and CPU utilization across the entire system. The core percent is the sum of the percentages of all the cores being used. The CPU percent is that divided by the system core count. The max percent is the highest percentage detected through the duration of tracking while the mean percent is the average of all the percentages detected over that duration. The CPU utilization concludes with the maximum number of threads used at any time for the main process and the sum of the threads used accross its descendent processes and combined.\n", + "The CPU utilization includes the system core count field which is the total number of cores available system-wide. Utilization is measured for the main process, its descendents, the main process and its descendents combined, and CPU utilization across the entire system. The core percent is the sum of the percentages of all the cores being used. The CPU percent is that divided by the system core count. The max percent is the highest percentage detected through the duration of tracking while the mean percent is the average of all the percentages detected over that duration. 
The CPU utilization concludes with the maximum number of threads used at any time for the main process and the sum of the threads used across its descendent processes and combined.\n", "\n", "The compute time is the real time that the computational-resource tracking lasted (as compared to CPU time)." ] diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 37aa49d..e736521 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -8,14 +8,14 @@ API The ``gpu_tracker`` package provides the ``Tracker`` class which uses a subprocess to measure computational resource usage, namely the compute -time, CPU utilization, maximum RAM used, and maximum GPU RAM used. The -``start()`` method starts this process which tracks usage in the -background. After calling ``start()``, one can write the code for which -resource usage is measured, followed by calling the ``stop()`` method. -The compute time will be the time from the call to ``start()`` to the -call to ``stop()`` and the RAM, GPU RAM, and CPU utilization quantities -will be the respective computational resources used by the code that’s -in between ``start()`` and ``stop()``. +time, maximum CPU utilization, mean CPU utilization, maximum RAM used, +and maximum GPU RAM used. The ``start()`` method starts this process +which tracks usage in the background. After calling ``start()``, one can +write the code for which resource usage is measured, followed by calling +the ``stop()`` method. The compute time will be the time from the call +to ``start()`` to the call to ``stop()`` and the RAM, GPU RAM, and CPU +utilization quantities will be the respective computational resources +used by the code that’s in between ``start()`` and ``stop()``. .. code:: python3 @@ -114,7 +114,7 @@ never exceed the overall system RAM usage, but inaccuracies resulting from shared RSS can cause this to happen, especially for non-linux operating systems (see note below). 
-The ``Tracker`` assumes that GPU memory is not shared accross multiple +The ``Tracker`` assumes that GPU memory is not shared across multiple processes and if it is, the reported GPU RAM of “descendent” and “combined” may be an overestimation. @@ -128,7 +128,7 @@ highest percentage detected through the duration of tracking while the mean percent is the average of all the percentages detected over that duration. The CPU utilization concludes with the maximum number of threads used at any time for the main process and the sum of the threads -used accross its descendent processes and combined. +used across its descendent processes and combined. The compute time is the real time that the computational-resource tracking lasted (as compared to CPU time). diff --git a/src/gpu_tracker/__main__.py b/src/gpu_tracker/__main__.py index 1756345..9424003 100644 --- a/src/gpu_tracker/__main__.py +++ b/src/gpu_tracker/__main__.py @@ -29,6 +29,7 @@ def main(): args = doc.docopt(__doc__, version=__version__) + print(args['--execute']) command = args['--execute'].split(' ') output = args['--output'] output_format = args['--format'] if args['--format'] is not None else 'text' diff --git a/src/gpu_tracker/tracker.py b/src/gpu_tracker/tracker.py index 653a441..c7eb8e8 100644 --- a/src/gpu_tracker/tracker.py +++ b/src/gpu_tracker/tracker.py @@ -242,10 +242,10 @@ def _log_warning(self, warning: str): class Tracker: """ - Runs a sub-process that tracks computational resources of the calling process. Including the compute time, maximum RAM, and maximum GPU RAM usage within a context manager or explicit ``start()`` and ``stop()`` methods. + Runs a sub-process that tracks computational resources of the calling process. Including the compute time, maximum CPU utilization, mean CPU utilization, maximum RAM, and maximum GPU RAM used within a context manager or explicit calls to ``start()`` and ``stop()`` methods. Calculated quantities are scaled depending on the units chosen for them (e.g. 
megabytes vs. gigabytes, hours vs. days, etc.). - :ivar resource_usage: Data class containing the max_ram (Description of the maximum RAM usage of the process, any descendents it may have, and the operating system overall), max_gpu_ram (Description of the maximum GPU RAM usage of the process and any descendents it may have), and compute_time (Description of the real compute time i.e. the duration of tracking) attributes. + :ivar ResourceUsage resource_usage: Data class containing the computational resource usage data collected by the tracking process. """ _USAGE_FILE_TIME_DIFFERENCE = 10.0 @@ -382,6 +382,8 @@ def to_json(self) -> dict[str, dict]: @dclass.dataclass class RSSValues: """ + The resident set size (RSS) i.e. memory used by a process or processes. + :param total_rss: The sum of ``private_rss`` and ``shared_rss``. :param private_rss: The RAM usage exclusive to a process. :param shared_rss: The RAM usage of a process shared with at least one other process. @@ -394,6 +396,8 @@ class RSSValues: @dclass.dataclass class MaxRAM: """ + Information related to RAM including the maximum RAM used over a period of time. + :param unit: The unit of measurement for RAM e.g. gigabytes. :param system_capacity: A constant value for the RAM capacity of the entire operating system. :param system: The RAM usage across the entire operating system. @@ -412,7 +416,11 @@ class MaxRAM: @dclass.dataclass class MaxGPURAM: """ + Information related to GPU RAM including the maximum GPU RAM used over a period of time. + :param unit: The unit of measurement for GPU RAM e.g. gigabytes. + :param system_capacity: A constant value for the GPU RAM capacity of all the GPUs in the system. + :param system: The GPU RAM usage of all the GPUs in the system. :param main: The GPU RAM usage of the main process. :param descendents: The summed GPU RAM usage of the descendent processes (i.e. child processes, grandchild processes, etc.). 
:param combined: The summed GPU RAM usage of both the main process and any descendent processes it may have. @@ -425,18 +433,30 @@ class MaxGPURAM: combined: float = 0. +class GPUUtilization: + system_gpu_count: int + max_percent: float = 0 + mean_percent: float = 0 + + @dclass.dataclass class CPUPercentages: """ - :param max_core_percent: The maximum sum of utilization percentages of the cores used at any given time. - :param max_cpu_percent: The maximum percentage utilization of the entire CPU (core percentage divided by the number of cores in the system). - :param mean_core_percent: The mean sum of utilization percentages of the cores used over time. - :param mean_cpu_percent: The mean percentage utilization of the entire CPU (core percentage divided by the number of cores in the system). + Utilization percentages of a core or specified group of cores (e.g. all the cores in the system, cores allocated by a job manager, the cores used by workers in a task, etc.). + Max refers to the highest value measured over a duration of time. + Mean refers to the average of the measured values during this time. + Sum refers to the sum of the percentages of the core(s) involved. + Hardware refers to this sum divided by the number of cores involved. + + :param max_sum_percent: The maximum sum of utilization percentages of the core(s) used by the process(es) at any given time. + :param max_hardware_percent: The maximum utilization percentage of the core(s) as a whole (i.e. max_sum_percent divided by the number of cores involved). + :param mean_sum_percent: The mean sum of utilization percentages of the core(s) used by the process(es) over time. + :param mean_hardware_percent: The mean utilization percentage of the core(s) as a whole (i.e. mean_sum_percent divided by the number of cores involved). """ - max_core_percent: float = 0. - max_cpu_percent: float = 0. - mean_core_percent: float = 0. - mean_cpu_percent: float = 0. + max_sum_percent: float = 0. 
+ max_hardware_percent: float = 0. + mean_sum_percent: float = 0. + mean_hardware_percent: float = 0. @dclass.dataclass @@ -474,6 +494,8 @@ class ComputeTime: @dclass.dataclass class ResourceUsage: """ + Contains data for computational resource usage. + :param max_ram: The maximum RAM used at any point while tracking. :param max_gpu_ram: The maximum GPU RAM used at any point while tracking. :param cpu_utilization: The core and CPU utilization and maximum number of threads used while tracking. diff --git a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..70cfe3c --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,12 @@ +import gpu_tracker.__main__ as cli +import pytest as pt + +test_data = [['-e', ]] + + +@pt.mark.parametrize('argv', test_data) +def test_main(mocker, argv: list[str]): + argv = ['gpu-tracker'] + argv + mocker.patch('sys.argv', argv) + mocker.patch('gpu_tracker.__main__.subp', ) + cli.main() From d36990915d60e6517ac37c6b7989d1bcfe79e318 Mon Sep 17 00:00:00 2001 From: Erik Huckvale <42946548+erikhuck@users.noreply.github.com> Date: Thu, 23 May 2024 13:24:03 -0400 Subject: [PATCH 2/9] Completes work of next release but needs testing --- requirements.txt | 1 + src/gpu_tracker/tracker.py | 185 +++++++++++++++++++++++++------------ 2 files changed, 128 insertions(+), 58 deletions(-) diff --git a/requirements.txt b/requirements.txt index d3c5812..003656a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ psutil docopt +pandas diff --git a/src/gpu_tracker/tracker.py b/src/gpu_tracker/tracker.py index c7eb8e8..46ad62f 100644 --- a/src/gpu_tracker/tracker.py +++ b/src/gpu_tracker/tracker.py @@ -13,6 +13,8 @@ import enum import pickle as pkl import uuid +import io +import pandas as pd class _TrackingProcess(mproc.Process): @@ -38,9 +40,15 @@ class _TrackingProcess(mproc.Process): 'days': 1 / (60 * 60 * 24) } + # TODO use the uuid field to determine the IDs of the GPUs. + # Accept a list of GPUs by UUID. 
+ # If an invalid ID is provided, raise an exception informing the user that an ID was provided that is not available and then list the available UUIDs. + # If there is no failure, do an info log informing the user of the UUIDs that they provided and all the ones that are available. + def __init__( self, stop_event: mproc.Event, sleep_time: float, ram_unit: str, gpu_ram_unit: str, time_unit: str, - disable_logs: bool, main_process_id: int, resource_usage_file: str, extraneous_process_ids: set[int]): + n_expected_cores: int | None, gpu_uuids: set[str] | None, disable_logs: bool, main_process_id: int, + resource_usage_file: str, extraneous_process_ids: set[int]): super().__init__() self._stop_event = stop_event if sleep_time < _TrackingProcess._CPU_PERCENT_INTERVAL: @@ -55,8 +63,9 @@ def __init__( time_unit, _TrackingProcess._time_unit2coefficient, unit_type='time') self._disable_logs = disable_logs self._main_process_id = main_process_id - self._core_percent_sums = {key: 0. for key in ['system', 'main', 'descendents', 'combined']} - self._cpu_percent_sums = {key: 0. for key in ['system', 'main', 'descendents', 'combined']} + percent_keys = ['cpu_system', 'cpu_main', 'cpu_descendents', 'cpu_combined', 'gpu'] + self._sum_percent_sums = {key: 0. for key in percent_keys} + self._hardware_percent_sums = {key: 0. for key in percent_keys} self._tracking_iteration = 1 self._is_linux = platform.system().lower() == 'linux' self._nvidia_available = True @@ -68,12 +77,32 @@ def __init__( 'The nvidia-smi command is not available. Please install the Nvidia drivers to track GPU usage. 
' 'Otherwise the Max GPU RAM values will remain 0.0') max_ram = MaxRAM(unit=ram_unit, system_capacity=psutil.virtual_memory().total * self._ram_coefficient) - max_gpu_ram = MaxGPURAM( - unit=gpu_ram_unit, system_capacity=self._system_gpu_ram(measurement='total') if self._nvidia_available else 0.0) - cpu_utilization = CPUUtilization(system_core_count=psutil.cpu_count()) + system_core_count = psutil.cpu_count() + cpu_utilization = CPUUtilization( + system_core_count=system_core_count, + n_expected_cores=n_expected_cores if n_expected_cores is not None else system_core_count) + if self._nvidia_available: + gpu_info = _TrackingProcess._system_gpu_info(fields='uuid,memory.total') + gpu_ram_system_capacity = self._system_gpu_ram(gpu_info=gpu_info, measurement='total') + max_gpu_ram = MaxGPURAM(unit=gpu_ram_unit, system_capacity=gpu_ram_system_capacity) + all_uuids = set(gpu_info['uuid']) + if gpu_uuids is None: + self._gpu_uuids = all_uuids + else: + if len(gpu_uuids) == 0: + raise ValueError('gpu_uuids is not None but the set is empty. Please provide a set of at least one GPU UUID.') + for gpu_uuid in gpu_uuids: + if gpu_uuid not in all_uuids: + raise ValueError(f'GPU UUID of {gpu_uuid} is not valid. 
Available UUIDs are: {", ".join(all_uuids)}') + self._gpu_uuids = gpu_uuids + gpu_utilization = GPUUtilization(system_gpu_count=len(all_uuids), n_expected_gpus=len(self._gpu_uuids)) + else: + max_gpu_ram = MaxGPURAM(unit=gpu_ram_unit, system_capacity=0.0) + gpu_utilization = GPUUtilization(system_gpu_count=0, n_expected_gpus=0) compute_time = ComputeTime(unit=time_unit) self._resource_usage = ResourceUsage( - max_ram=max_ram, max_gpu_ram=max_gpu_ram, cpu_utilization=cpu_utilization, compute_time=compute_time) + max_ram=max_ram, max_gpu_ram=max_gpu_ram, cpu_utilization=cpu_utilization, gpu_utilization=gpu_utilization, + compute_time=compute_time) self._resource_usage_file = resource_usage_file self._extraneous_process_ids = extraneous_process_ids @@ -125,21 +154,39 @@ def run(self): nvidia_smi_output=nvidia_smi_output) process_ids.add(self._main_process_id) self._update_gpu_ram(attr='combined', process_ids=process_ids, nvidia_smi_output=nvidia_smi_output) - self._resource_usage.max_gpu_ram.system = max( - self._resource_usage.max_gpu_ram.system, self._system_gpu_ram(measurement='used')) + + gpu_info = _TrackingProcess._system_gpu_info(fields='uuid,memory.used,utilization.gpu') + system_gpu_ram = self._system_gpu_ram(gpu_info, measurement='used') + self._resource_usage.max_gpu_ram.system = max(self._resource_usage.max_gpu_ram.system, system_gpu_ram) + gpu_info = gpu_info.loc[gpu_info['uuid'].apply(lambda gpu_uuid: gpu_uuid in self._gpu_uuids)] + gpu_percentages = [float(percentage) for percentage in gpu_info['utilization.gpu']] + self._update_processing_unit_utilization( + current_percentages=gpu_percentages, + processing_unit_percentages=self._resource_usage.gpu_utilization.gpu_percentages, percent_key='gpu', + n_hardware_units=self._resource_usage.gpu_utilization.n_expected_gpus) + # Get the mean and maximum CPU usages. 
self._update_n_threads(processes=[main_process], attr='main') self._update_n_threads(processes=descendent_processes, attr='descendents') self._update_n_threads(processes=combined_processes, attr='combined') # noinspection PyTypeChecker system_core_percentages: list[float] = psutil.cpu_percent(percpu=True) - self._update_cpu_utilization(percentages=system_core_percentages, attr='system') + cpu_utilization = self._resource_usage.cpu_utilization + self._update_processing_unit_utilization( + current_percentages=system_core_percentages, processing_unit_percentages=cpu_utilization.system, + percent_key='cpu_system', n_hardware_units=cpu_utilization.system_core_count) time.sleep(_TrackingProcess._CPU_PERCENT_INTERVAL) main_percentage = main_process.cpu_percent() descendent_percentages = self._map_processes(processes=descendent_processes, map_func=get_cpu_percent) - self._update_cpu_utilization(percentages=[main_percentage], attr='main') - self._update_cpu_utilization(percentages=descendent_percentages, attr='descendents') - self._update_cpu_utilization(percentages=[main_percentage] + descendent_percentages, attr='combined') + self._update_processing_unit_utilization( + current_percentages=[main_percentage], processing_unit_percentages=cpu_utilization.main, percent_key='cpu_main', + n_hardware_units=cpu_utilization.n_expected_cores) + self._update_processing_unit_utilization( + current_percentages=descendent_percentages, processing_unit_percentages=cpu_utilization.descendents, + percent_key='cpu_descendents', n_hardware_units=cpu_utilization.n_expected_cores) + self._update_processing_unit_utilization( + current_percentages=[main_percentage] + descendent_percentages, processing_unit_percentages=cpu_utilization.combined, + percent_key='cpu_combined', n_hardware_units=cpu_utilization.n_expected_cores) # Update compute time. 
self._resource_usage.compute_time.time = (time.time() - start_time) * self._time_coefficient self._tracking_iteration += 1 @@ -198,28 +245,29 @@ def _update_gpu_ram(self, attr: str, process_ids: set[int], nvidia_smi_output: s max_gpu_ram = getattr(self._resource_usage.max_gpu_ram, attr) setattr(self._resource_usage.max_gpu_ram, attr, max(max_gpu_ram, curr_gpu_ram)) - def _system_gpu_ram(self, measurement: str) -> float: - command = f'nvidia-smi --query-gpu=memory.{measurement} --format=csv,noheader' + @staticmethod + def _system_gpu_info(fields: str) -> pd.DataFrame: + command = f'nvidia-smi --query-gpu={fields}, --format=csv' output = subp.check_output(command.split(), stderr=subp.STDOUT).decode() - output = output.strip().split('\n') - usages = [line.replace('MiB', '').strip() for line in output] - ram_sum = sum([int(usage) for usage in usages if usage != '']) - return ram_sum * self._gpu_ram_coefficient - - def _update_cpu_utilization(self, percentages: list[float], attr: str): - cpu_percentages: CPUPercentages = getattr(self._resource_usage.cpu_utilization, attr) - - def update_percentages(percent: float, percent_type: str, percent_sums: dict[str, float]): - percent_sums[attr] += percent - mean_percent = percent_sums[attr] / self._tracking_iteration - setattr(cpu_percentages, f'mean_{percent_type}_percent', mean_percent) - max_percent: float = getattr(cpu_percentages, f'max_{percent_type}_percent') - setattr(cpu_percentages, f'max_{percent_type}_percent', max(max_percent, percent)) - - core_percent = sum(percentages) - cpu_percent = core_percent / self._resource_usage.cpu_utilization.system_core_count - update_percentages(percent=core_percent, percent_type='core', percent_sums=self._core_percent_sums) - update_percentages(percent=cpu_percent, percent_type='cpu', percent_sums=self._cpu_percent_sums) + return pd.DataFrame(io.StringIO(output)) + + def _system_gpu_ram(self, gpu_info: pd.DataFrame, measurement: str) -> float: + gpu_rams = 
gpu_info[f'memory.{measurement}'] + gpu_rams = gpu_rams.apply(lambda ram: int(ram.replace('MiB', '').strip())) + return sum(gpu_rams) * self._gpu_ram_coefficient + + def _update_processing_unit_utilization( + self, current_percentages: list[float], processing_unit_percentages: ProcessingUnitPercentages, + percent_key: str, n_hardware_units: int): + sum_percent = sum(current_percentages) + hardware_percent = sum_percent / n_hardware_units + for percent, percent_sums, percent_type in ( + (sum_percent, self._sum_percent_sums, 'sum'), (hardware_percent, self._hardware_percent_sums, 'hardware')): + percent_sums[percent_key] += percent + mean_percent = percent_sums[percent_key] / self._tracking_iteration + setattr(processing_unit_percentages, f'mean_{percent_type}_percent', mean_percent) + max_percent: float = getattr(processing_unit_percentages, f'max_{percent_type}_percent') + setattr(processing_unit_percentages, f'max_{percent_type}_percent', max(max_percent, percent)) def _update_n_threads(self, processes: list[psutil.Process], attr: str): n_threads_list = self._map_processes(processes, map_func=lambda process: process.num_threads()) @@ -257,12 +305,15 @@ class State(enum.Enum): def __init__( self, sleep_time: float = 1.0, ram_unit: str = 'gigabytes', gpu_ram_unit: str = 'gigabytes', time_unit: str = 'hours', - disable_logs: bool = False, process_id: int | None = None, n_join_attempts: int = 5, join_timeout: float = 10.0): + n_expected_cores: int = None, gpu_uuids: set[str] = None, disable_logs: bool = False, process_id: int = None, # TODO add example with n_expected_cores and gpu_uuids to tutorial! + n_join_attempts: int = 5, join_timeout: float = 10.0): """ :param sleep_time: The number of seconds to sleep in between usage-collection iterations. :param ram_unit: One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. :param gpu_ram_unit: One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. 
:param time_unit: One of 'seconds', 'minutes', 'hours', or 'days'. + :param n_expected_cores: The number of cores expected to be used during tracking (e.g. number of processes spawned, number of parallelized threads, etc.). Used as the denominator when calculating the hardware percentages of the CPU utilization (except for system-wide CPU utilization which always divides by all the cores in the system). Defaults to all the cores in the system. + :param gpu_uuids: The UUIDs of the GPUs to track utilization for. The length of this set is used as the denominator when calculating the hardware percentages of the GPU utilization (i.e. n_expected_gpus). Defaults to all the GPUs in the system. :param disable_logs: If set, warnings are suppressed during tracking. Otherwise, the Tracker logs warnings as usual. :param process_id: The ID of the process to track. Defaults to the current process. :param n_join_attempts: The number of times the tracker attempts to join its underlying sub-process. @@ -279,7 +330,7 @@ def __init__( extraneous_ids = {process.pid for process in current_process.children()} - legit_child_ids self._resource_usage_file = f'.{uuid.uuid1()}.pkl' self._tracking_process = _TrackingProcess( - self._stop_event, sleep_time, ram_unit, gpu_ram_unit, time_unit, disable_logs, + self._stop_event, sleep_time, ram_unit, gpu_ram_unit, time_unit, n_expected_cores, gpu_uuids, disable_logs, process_id if process_id is not None else current_process_id, self._resource_usage_file, extraneous_ids) self.resource_usage = None self.n_join_attempts = n_join_attempts @@ -433,23 +484,17 @@ class MaxGPURAM: combined: float = 0. -class GPUUtilization: - system_gpu_count: int - max_percent: float = 0 - mean_percent: float = 0 - - @dclass.dataclass -class CPUPercentages: +class ProcessingUnitPercentages: """ - Utilization percentages of a core or specified group of cores (e.g. all the cores in the system, cores allocated by a job manager, the cores used by workers in a task, etc.). 
+ Utilization percentages of one or more processing units (i.e. GPUs or CPU cores). Max refers to the highest value measured over a duration of time. Mean refers to the average of the measured values during this time. - Sum refers to the sum of the percentages of the core(s) involved. - Hardware refers to this sum divided by the number of cores involved. + Sum refers to the sum of the percentages of the processing units involved. If there is only one unit in question, this is the percentage of just that unit. + Hardware refers to this sum divided by the number of units involved. If there is only one unit in question, this is the same as the sum. - :param max_sum_percent: The maximum sum of utilization percentages of the core(s) used by the process(es) at any given time. - :param max_hardware_percent: The maximum utilization percentage of the core(s) as a whole (i.e. max_sum_percent divided by the number of cores involved). + :param max_sum_percent: The maximum sum of utilization percentages of the processing units at any given time. + :param max_hardware_percent: The maximum utilization percentage of the group of units as a whole (i.e. max_sum_percent divided by the number of units involved). :param mean_sum_percent: The mean sum of utilization percentages of the core(s) used by the process(es) over time. :param mean_hardware_percent: The mean utilization percentage of the core(s) as a whole (i.e. mean_sum_percent divided by the number of cores involved). """ @@ -462,28 +507,50 @@ class CPUPercentages: @dclass.dataclass class CPUUtilization: """ - :param system_core_count: The number of cores available to the operating system. - :param system: The core and CPU utilization percentages of the entire system. - :param main: The core and CPU utilization percentages of the main process. - :param descendents: The core and CPU utilization percentages summed across descendent processes (i.e. child processes, grandchild processes, etc.). 
- :param combined: The core and CPU utilization percentages summed across both the descendent processes and the main process. + Information related to CPU usage, including core utilization percentages of the main process and any descendent processes it may have as well as system-wide utilization. + The system hardware utilization percentages are strictly divided by the total number of cores in the system while that of the main, descendent, and combined processes can be divided by the expected number of cores used in a task. + + :param system_core_count: The number of cores available to the entire operating system. + :param n_expected_cores: The number of cores expected to be used by the main process and/or any descendent processes it may have. + :param system: The utilization percentages of all the cores in the entire operating system. + :param main: The utilization percentages of the cores used by the main process. + :param descendents: The utilization percentages summed across descendent processes (i.e. child processes, grandchild processes, etc.). + :param combined: The utilization percentages summed across both the descendent processes and the main process. :param main_n_threads: The maximum detected number of threads used by the main process at any time. :param descendents_n_threads: The maximum sum of threads used across the descendent processes at any time. :param combined_n_threads: The maximum sum of threads used by both the main and descendent processes. 
""" system_core_count: int - system: CPUPercentages = dclass.field(default_factory=CPUPercentages) - main: CPUPercentages = dclass.field(default_factory=CPUPercentages) - descendents: CPUPercentages = dclass.field(default_factory=CPUPercentages) - combined: CPUPercentages = dclass.field(default_factory=CPUPercentages) + n_expected_cores: int + system: ProcessingUnitPercentages = dclass.field(default_factory=ProcessingUnitPercentages) + main: ProcessingUnitPercentages = dclass.field(default_factory=ProcessingUnitPercentages) + descendents: ProcessingUnitPercentages = dclass.field(default_factory=ProcessingUnitPercentages) + combined: ProcessingUnitPercentages = dclass.field(default_factory=ProcessingUnitPercentages) main_n_threads: int = 0 descendents_n_threads: int = 0 combined_n_threads: int = 0 +@dclass.dataclass +class GPUUtilization: + """ + Utilization percentages of one or more GPUs being tracked. + Hardware percentages are the summed percentages divided by the number of GPUs being tracked. + + :param system_gpu_count: The number of GPUs in the system. + :param n_expected_gpus: The number of GPUs to be tracked (e.g. GPUs actually used while there may be other GPUs in the system). + :param gpu_percentages: The utilization percentages of the GPU(s) being tracked. + """ + system_gpu_count: int + n_expected_gpus: int + gpu_percentages: ProcessingUnitPercentages = dclass.field(default_factory=ProcessingUnitPercentages) + + @dclass.dataclass class ComputeTime: """ + The time it takes for a task to complete. + :param unit: The unit of measurement for compute time e.g. hours. :param time: The real compute time. """ @@ -498,10 +565,12 @@ class ResourceUsage: :param max_ram: The maximum RAM used at any point while tracking. :param max_gpu_ram: The maximum GPU RAM used at any point while tracking. - :param cpu_utilization: The core and CPU utilization and maximum number of threads used while tracking. 
+ :param cpu_utilization: Core counts, utilization percentages of cores and maximum number of threads used while tracking. + :param gpu_utilization: GPU counts and utilization percentages of the GPU(s). :param compute_time: The real time spent tracking. """ max_ram: MaxRAM max_gpu_ram: MaxGPURAM cpu_utilization: CPUUtilization + gpu_utilization: GPUUtilization compute_time: ComputeTime From 402af80a522c00a6a206e97c172a5c29498bc03f Mon Sep 17 00:00:00 2001 From: Erik Huckvale <42946548+erikhuck@users.noreply.github.com> Date: Thu, 23 May 2024 14:29:43 -0400 Subject: [PATCH 3/9] Adds note to the README --- README.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.rst b/README.rst index f6b2fef..6178af6 100644 --- a/README.rst +++ b/README.rst @@ -11,6 +11,8 @@ The ``gpu-tracker`` command-line interface alternatively tracks the computationa **NOTE: The tracking occurs in a separate process. To maximize the accuracy of the reported resource usage, you may want to have a core available solely for the tracking process e.g. if your job uses 3 workers, you may want to allocate 4 cores.** +**NOTE: Since the tracking process is created using the Python multiprocessing library, if done so using the "spawn" start method (default on MacOS and Windows) or the "forkserver" method, you may get a runtime error after starting the tracking. To prevent this, you'll need to start the tracker after checking** ``if __name__ == '__main__'``. **See "Safe importing of main module" under** `The spawn and forkserver start methods `__ **for more information.** + Documentation ------------- The complete documentation for the ``gpu_tracker`` package, including tutorials, can be found `here `__. 
From 5ee6a6187d5209b6ca938a2ec5e614b417ee52ee Mon Sep 17 00:00:00 2001 From: Erik Huckvale <42946548+erikhuck@users.noreply.github.com> Date: Thu, 23 May 2024 14:50:00 -0400 Subject: [PATCH 4/9] Removes TODOs and corrects __str__ --- src/gpu_tracker/tracker.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/src/gpu_tracker/tracker.py b/src/gpu_tracker/tracker.py index 46ad62f..0e8b9b6 100644 --- a/src/gpu_tracker/tracker.py +++ b/src/gpu_tracker/tracker.py @@ -40,11 +40,6 @@ class _TrackingProcess(mproc.Process): 'days': 1 / (60 * 60 * 24) } - # TODO use the uuid field to determine the IDs of the GPUs. - # Accept a list of GPUs by UUID. - # If an invalid ID is provided, raise an exception informing the user that an ID was provided that is not available and then list the available UUIDs. - # If there is no failure, do an info log informing the user of the UUIDs that they provided and all the ones that are available. - def __init__( self, stop_event: mproc.Event, sleep_time: float, ram_unit: str, gpu_ram_unit: str, time_unit: str, n_expected_cores: int | None, gpu_uuids: set[str] | None, disable_logs: bool, main_process_id: int, @@ -305,7 +300,7 @@ class State(enum.Enum): def __init__( self, sleep_time: float = 1.0, ram_unit: str = 'gigabytes', gpu_ram_unit: str = 'gigabytes', time_unit: str = 'hours', - n_expected_cores: int = None, gpu_uuids: set[str] = None, disable_logs: bool = False, process_id: int = None, # TODO add example with n_expected_cores and gpu_uuids to tutorial! + n_expected_cores: int = None, gpu_uuids: set[str] = None, disable_logs: bool = False, process_id: int = None, n_join_attempts: int = 5, join_timeout: float = 10.0): """ :param sleep_time: The number of seconds to sleep in between usage-collection iterations. 
@@ -409,7 +404,7 @@ def __str__(self) -> str: 'compute', 'Compute').replace('time: ', 'Time: ').replace('rss', 'RSS').replace('total', 'Total').replace( 'private', 'Private').replace('shared', 'Shared').replace('main', 'Main').replace('descendents', 'Descendents').replace( 'combined', 'Combined').replace('gpu', 'GPU').replace('mean', 'Mean').replace('cpu', 'CPU').replace( - 'n threads', 'number of threads') + 'n threads', 'number of threads').replace('n expected', 'Number of expected') @staticmethod def _format_float(dictionary: dict): From 20382c62df132ad37f45c331a01e245d67bb5993 Mon Sep 17 00:00:00 2001 From: erikhuck Date: Tue, 4 Jun 2024 13:51:27 -0400 Subject: [PATCH 5/9] Adds GPU utilization to the tracking --- src/gpu_tracker/tracker.py | 58 ++++++++----------- .../False-Linux-bytes-megabytes-seconds.json | 47 +++++++++------ .../False-Linux-bytes-megabytes-seconds.txt | 45 ++++++++------ .../False-Linux-kilobytes-bytes-days.json | 47 +++++++++------ .../data/False-Linux-kilobytes-bytes-days.txt | 45 ++++++++------ ...lse-Linux-kilobytes-gigabytes-minutes.json | 47 +++++++++------ ...alse-Linux-kilobytes-gigabytes-minutes.txt | 45 ++++++++------ ...False-Linux-megabytes-kilobytes-hours.json | 47 +++++++++------ .../False-Linux-megabytes-kilobytes-hours.txt | 45 ++++++++------ ...lse-not-linux-bytes-megabytes-seconds.json | 47 +++++++++------ ...alse-not-linux-bytes-megabytes-seconds.txt | 45 ++++++++------ .../False-not-linux-kilobytes-bytes-days.json | 47 +++++++++------ .../False-not-linux-kilobytes-bytes-days.txt | 45 ++++++++------ ...not-linux-kilobytes-gigabytes-minutes.json | 47 +++++++++------ ...-not-linux-kilobytes-gigabytes-minutes.txt | 45 ++++++++------ ...e-not-linux-megabytes-kilobytes-hours.json | 47 +++++++++------ ...se-not-linux-megabytes-kilobytes-hours.txt | 45 ++++++++------ .../True-Linux-bytes-megabytes-seconds.json | 47 +++++++++------ .../True-Linux-bytes-megabytes-seconds.txt | 45 ++++++++------ 
.../data/True-Linux-kilobytes-bytes-days.json | 47 +++++++++------ .../data/True-Linux-kilobytes-bytes-days.txt | 45 ++++++++------ ...rue-Linux-kilobytes-gigabytes-minutes.json | 47 +++++++++------ ...True-Linux-kilobytes-gigabytes-minutes.txt | 45 ++++++++------ .../True-Linux-megabytes-kilobytes-hours.json | 47 +++++++++------ .../True-Linux-megabytes-kilobytes-hours.txt | 45 ++++++++------ ...rue-not-linux-bytes-megabytes-seconds.json | 47 +++++++++------ ...True-not-linux-bytes-megabytes-seconds.txt | 45 ++++++++------ .../True-not-linux-kilobytes-bytes-days.json | 47 +++++++++------ .../True-not-linux-kilobytes-bytes-days.txt | 45 ++++++++------ ...not-linux-kilobytes-gigabytes-minutes.json | 47 +++++++++------ ...-not-linux-kilobytes-gigabytes-minutes.txt | 45 ++++++++------ ...e-not-linux-megabytes-kilobytes-hours.json | 47 +++++++++------ ...ue-not-linux-megabytes-kilobytes-hours.txt | 45 ++++++++------ tests/test_tracker.py | 46 ++++++++++----- 34 files changed, 950 insertions(+), 626 deletions(-) diff --git a/src/gpu_tracker/tracker.py b/src/gpu_tracker/tracker.py index 0e8b9b6..857f147 100644 --- a/src/gpu_tracker/tracker.py +++ b/src/gpu_tracker/tracker.py @@ -77,8 +77,8 @@ def __init__( system_core_count=system_core_count, n_expected_cores=n_expected_cores if n_expected_cores is not None else system_core_count) if self._nvidia_available: - gpu_info = _TrackingProcess._system_gpu_info(fields='uuid,memory.total') - gpu_ram_system_capacity = self._system_gpu_ram(gpu_info=gpu_info, measurement='total') + gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-gpu=uuid,memory.total') + gpu_ram_system_capacity = self._get_gpu_ram(gpu_info=gpu_info, column='memory.total') max_gpu_ram = MaxGPURAM(unit=gpu_ram_unit, system_capacity=gpu_ram_system_capacity) all_uuids = set(gpu_info['uuid']) if gpu_uuids is None: @@ -88,7 +88,7 @@ def __init__( raise ValueError('gpu_uuids is not None but the set is empty. 
Please provide a set of at least one GPU UUID.') for gpu_uuid in gpu_uuids: if gpu_uuid not in all_uuids: - raise ValueError(f'GPU UUID of {gpu_uuid} is not valid. Available UUIDs are: {", ".join(all_uuids)}') + raise ValueError(f'GPU UUID of {gpu_uuid} is not valid. Available UUIDs are: {", ".join(sorted(all_uuids))}') self._gpu_uuids = gpu_uuids gpu_utilization = GPUUtilization(system_gpu_count=len(all_uuids), n_expected_gpus=len(self._gpu_uuids)) else: @@ -137,24 +137,19 @@ def run(self): self._resource_usage.max_ram.system, psutil.virtual_memory().used * self._ram_coefficient) # Get the maximum GPU RAM usage if available. if self._nvidia_available: - memory_used_command = 'nvidia-smi --query-compute-apps=pid,used_gpu_memory --format=csv,noheader' - nvidia_smi_output = subp.check_output(memory_used_command.split(), stderr=subp.STDOUT).decode() - if nvidia_smi_output: + gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-compute-apps=pid,used_gpu_memory') + if len(gpu_info): process_ids = {self._main_process_id} - self._update_gpu_ram(attr='main', process_ids=process_ids, nvidia_smi_output=nvidia_smi_output) - process_ids = { - process_id for process_id in self._map_processes( - processes=descendent_processes, map_func=lambda process: process.pid)} - self._update_gpu_ram(attr='descendents', process_ids=process_ids, - nvidia_smi_output=nvidia_smi_output) + self._update_gpu_ram(attr='main', process_ids=process_ids, gpu_info=gpu_info) + process_ids = set(self._map_processes(processes=descendent_processes, map_func=lambda process: process.pid)) + self._update_gpu_ram(attr='descendents', process_ids=process_ids, gpu_info=gpu_info) process_ids.add(self._main_process_id) - self._update_gpu_ram(attr='combined', process_ids=process_ids, nvidia_smi_output=nvidia_smi_output) - - gpu_info = _TrackingProcess._system_gpu_info(fields='uuid,memory.used,utilization.gpu') - system_gpu_ram = self._system_gpu_ram(gpu_info, measurement='used') + 
self._update_gpu_ram(attr='combined', process_ids=process_ids, gpu_info=gpu_info) + gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-gpu=uuid,memory.used,utilization.gpu') + system_gpu_ram = self._get_gpu_ram(gpu_info, column='memory.used') self._resource_usage.max_gpu_ram.system = max(self._resource_usage.max_gpu_ram.system, system_gpu_ram) gpu_info = gpu_info.loc[gpu_info['uuid'].apply(lambda gpu_uuid: gpu_uuid in self._gpu_uuids)] - gpu_percentages = [float(percentage) for percentage in gpu_info['utilization.gpu']] + gpu_percentages = [float(percentage.replace('%', '').strip()) for percentage in gpu_info['utilization.gpu']] self._update_processing_unit_utilization( current_percentages=gpu_percentages, processing_unit_percentages=self._resource_usage.gpu_utilization.gpu_percentages, percent_key='gpu', @@ -227,27 +222,22 @@ def _update_ram(self, rss_values: RSSValues, processes: list[psutil.Process]): total_rss *= self._ram_coefficient rss_values.total_rss = max(rss_values.total_rss, total_rss) - def _update_gpu_ram(self, attr: str, process_ids: set[int], nvidia_smi_output: str): - nvidia_smi_output = nvidia_smi_output.strip().split('\n') - curr_gpu_ram = 0 - for process_info in nvidia_smi_output: - pid, megabytes_used = process_info.strip().split(',') - pid = int(pid.strip()) - if pid in process_ids: - megabytes_used = int(megabytes_used.replace('MiB', '').strip()) - curr_gpu_ram += megabytes_used - curr_gpu_ram *= self._gpu_ram_coefficient + def _update_gpu_ram(self, attr: str, process_ids: set[int], gpu_info: pd.DataFrame): + gpu_info = gpu_info.loc[[pid in process_ids for pid in gpu_info['pid']]] + gpu_ram = self._get_gpu_ram(gpu_info, column='used_gpu_memory') max_gpu_ram = getattr(self._resource_usage.max_gpu_ram, attr) - setattr(self._resource_usage.max_gpu_ram, attr, max(max_gpu_ram, curr_gpu_ram)) + setattr(self._resource_usage.max_gpu_ram, attr, max(max_gpu_ram, gpu_ram)) @staticmethod - def _system_gpu_info(fields: str) -> pd.DataFrame: - 
command = f'nvidia-smi --query-gpu={fields}, --format=csv' + def _query_gpu(nvidia_command: str) -> pd.DataFrame: + command = f'nvidia-smi {nvidia_command} --format=csv' output = subp.check_output(command.split(), stderr=subp.STDOUT).decode() - return pd.DataFrame(io.StringIO(output)) + gpu_info = pd.read_csv(io.StringIO(output)) + gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns] + return gpu_info.applymap(lambda value: value.strip() if type(value) is str else value) - def _system_gpu_ram(self, gpu_info: pd.DataFrame, measurement: str) -> float: - gpu_rams = gpu_info[f'memory.{measurement}'] + def _get_gpu_ram(self, gpu_info: pd.DataFrame, column: str) -> float: + gpu_rams = gpu_info[column] gpu_rams = gpu_rams.apply(lambda ram: int(ram.replace('MiB', '').strip())) return sum(gpu_rams) * self._gpu_ram_coefficient @@ -323,7 +313,7 @@ def __init__( legit_child_ids = {process.pid for process in current_process.children()} self._stop_event = mproc.Event() extraneous_ids = {process.pid for process in current_process.children()} - legit_child_ids - self._resource_usage_file = f'.{uuid.uuid1()}.pkl' + self._resource_usage_file = f'.gpu-tracker_{uuid.uuid1()}.pkl' self._tracking_process = _TrackingProcess( self._stop_event, sleep_time, ram_unit, gpu_ram_unit, time_unit, n_expected_cores, gpu_uuids, disable_logs, process_id if process_id is not None else current_process_id, self._resource_usage_file, extraneous_ids) diff --git a/tests/data/False-Linux-bytes-megabytes-seconds.json b/tests/data/False-Linux-bytes-megabytes-seconds.json index 6328498..3093f55 100644 --- a/tests/data/False-Linux-bytes-megabytes-seconds.json +++ b/tests/data/False-Linux-bytes-megabytes-seconds.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "megabytes", - "system_capacity": 24396.0, - "system": 5800.0, + "system_capacity": 36594.0, + "system": 6500.0, "main": 1600.0, "descendents": 4300.0, "combined": 5800.0 }, "cpu_utilization": { 
"system_core_count": 4, + "n_expected_cores": 3, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 66.3, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 39.9 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 47.4, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 43.24444444444445 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 103.86666666666667, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 83.14444444444445 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "seconds", "time": 300.0 diff --git a/tests/data/False-Linux-bytes-megabytes-seconds.txt b/tests/data/False-Linux-bytes-megabytes-seconds.txt index 16dbb28..d7c8650 100644 --- a/tests/data/False-Linux-bytes-megabytes-seconds.txt +++ b/tests/data/False-Linux-bytes-megabytes-seconds.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22017.0 Max GPU RAM: Unit: megabytes - System capacity: 24396.0 - System: 5800.0 + System capacity: 36594.0 
+ System: 6500.0 Main: 1600.0 Descendents: 4300.0 Combined: 5800.0 CPU utilization: System core count: 4 + Number of expected cores: 3 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 66.3 + Mean sum percent: 119.7 + Mean hardware percent: 39.9 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 47.4 + Mean sum percent: 129.733 + Mean hardware percent: 43.244 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 103.867 + Mean sum percent: 249.433 + Mean hardware percent: 83.144 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: seconds Time: 300.0 \ No newline at end of file diff --git a/tests/data/False-Linux-kilobytes-bytes-days.json b/tests/data/False-Linux-kilobytes-bytes-days.json index 5279afa..d43d05a 100644 --- a/tests/data/False-Linux-kilobytes-bytes-days.json +++ b/tests/data/False-Linux-kilobytes-bytes-days.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "bytes", - "system_capacity": 24396000000.0, - "system": 5800000000.0, + "system_capacity": 36594000000.0, + "system": 6500000000.0, "main": 1600000000.0, "descendents": 4300000000.0, "combined": 5800000000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 4, 
"system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 49.725, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 29.925 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 35.55, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 32.43333333333333 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 77.9, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 62.35833333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "days", "time": 0.003472222222222222 diff --git a/tests/data/False-Linux-kilobytes-bytes-days.txt b/tests/data/False-Linux-kilobytes-bytes-days.txt index 21a8209..a81ccd0 100644 --- a/tests/data/False-Linux-kilobytes-bytes-days.txt +++ b/tests/data/False-Linux-kilobytes-bytes-days.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22.017 Max GPU RAM: Unit: bytes - System capacity: 24396000000.0 - System: 5800000000.0 + System capacity: 36594000000.0 + System: 6500000000.0 Main: 1600000000.0 
Descendents: 4300000000.0 Combined: 5800000000.0 CPU utilization: System core count: 4 + Number of expected cores: 4 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 49.725 + Mean sum percent: 119.7 + Mean hardware percent: 29.925 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 35.55 + Mean sum percent: 129.733 + Mean hardware percent: 32.433 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 77.9 + Mean sum percent: 249.433 + Mean hardware percent: 62.358 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: days Time: 0.003 \ No newline at end of file diff --git a/tests/data/False-Linux-kilobytes-gigabytes-minutes.json b/tests/data/False-Linux-kilobytes-gigabytes-minutes.json index 3e93fab..b970b71 100644 --- a/tests/data/False-Linux-kilobytes-gigabytes-minutes.json +++ b/tests/data/False-Linux-kilobytes-gigabytes-minutes.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 24.396, - "system": 5.8, + "system_capacity": 36.594, + "system": 6.5, "main": 1.6, "descendents": 4.3, "combined": 5.8 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 2, "system": { - "max_core_percent": 276.3, - 
"max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 99.45, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 59.85 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 71.1, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 64.86666666666666 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 155.8, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 124.71666666666665 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 75.0, + "max_hardware_percent": 75.0, + "mean_sum_percent": 43.333333333333336, + "mean_hardware_percent": 43.333333333333336 + } + }, "compute_time": { "unit": "minutes", "time": 5.0 diff --git a/tests/data/False-Linux-kilobytes-gigabytes-minutes.txt b/tests/data/False-Linux-kilobytes-gigabytes-minutes.txt index 8ca2677..53c1719 100644 --- a/tests/data/False-Linux-kilobytes-gigabytes-minutes.txt +++ b/tests/data/False-Linux-kilobytes-gigabytes-minutes.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22.017 Max GPU RAM: Unit: gigabytes - System capacity: 24.396 - System: 5.8 + System capacity: 36.594 + System: 6.5 Main: 1.6 Descendents: 4.3 Combined: 5.8 CPU utilization: System 
core count: 4 + Number of expected cores: 2 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 99.45 + Mean sum percent: 119.7 + Mean hardware percent: 59.85 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 71.1 + Mean sum percent: 129.733 + Mean hardware percent: 64.867 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 155.8 + Mean sum percent: 249.433 + Mean hardware percent: 124.717 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 75.0 + Max hardware percent: 75.0 + Mean sum percent: 43.333 + Mean hardware percent: 43.333 Compute time: Unit: minutes Time: 5.0 \ No newline at end of file diff --git a/tests/data/False-Linux-megabytes-kilobytes-hours.json b/tests/data/False-Linux-megabytes-kilobytes-hours.json index e5c30ec..e856451 100644 --- a/tests/data/False-Linux-megabytes-kilobytes-hours.json +++ b/tests/data/False-Linux-megabytes-kilobytes-hours.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "kilobytes", - "system_capacity": 24396000.0, - "system": 5800000.0, + "system_capacity": 36594000.0, + "system": 6500000.0, "main": 1600000.0, "descendents": 4300000.0, "combined": 5800000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 1, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 
261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 198.9, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 119.7 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 142.2, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 129.73333333333332 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 311.6, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 249.4333333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 2, + "gpu_percentages": { + "max_sum_percent": 125.0, + "max_hardware_percent": 62.5, + "mean_sum_percent": 75.0, + "mean_hardware_percent": 37.5 + } + }, "compute_time": { "unit": "hours", "time": 0.08333333333333333 diff --git a/tests/data/False-Linux-megabytes-kilobytes-hours.txt b/tests/data/False-Linux-megabytes-kilobytes-hours.txt index f6d8b0e..51f338b 100644 --- a/tests/data/False-Linux-megabytes-kilobytes-hours.txt +++ b/tests/data/False-Linux-megabytes-kilobytes-hours.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.022 Max GPU RAM: Unit: kilobytes - System capacity: 24396000.0 - System: 5800000.0 + System capacity: 36594000.0 + System: 6500000.0 Main: 1600000.0 Descendents: 4300000.0 Combined: 5800000.0 CPU utilization: System core count: 4 + Number of expected 
cores: 1 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 198.9 + Mean sum percent: 119.7 + Mean hardware percent: 119.7 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 142.2 + Mean sum percent: 129.733 + Mean hardware percent: 129.733 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 311.6 + Mean sum percent: 249.433 + Mean hardware percent: 249.433 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 2 + GPU percentages: + Max sum percent: 125.0 + Max hardware percent: 62.5 + Mean sum percent: 75.0 + Mean hardware percent: 37.5 Compute time: Unit: hours Time: 0.083 \ No newline at end of file diff --git a/tests/data/False-not-linux-bytes-megabytes-seconds.json b/tests/data/False-not-linux-bytes-megabytes-seconds.json index 014365f..62e2a04 100644 --- a/tests/data/False-not-linux-bytes-megabytes-seconds.json +++ b/tests/data/False-not-linux-bytes-megabytes-seconds.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "megabytes", - "system_capacity": 24396.0, - "system": 5800.0, + "system_capacity": 36594.0, + "system": 6500.0, "main": 1600.0, "descendents": 4300.0, "combined": 5800.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 3, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 
65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 66.3, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 39.9 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 47.4, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 43.24444444444445 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 103.86666666666667, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 83.14444444444445 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "seconds", "time": 300.0 diff --git a/tests/data/False-not-linux-bytes-megabytes-seconds.txt b/tests/data/False-not-linux-bytes-megabytes-seconds.txt index 3814148..fd05eb2 100644 --- a/tests/data/False-not-linux-bytes-megabytes-seconds.txt +++ b/tests/data/False-not-linux-bytes-megabytes-seconds.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: megabytes - System capacity: 24396.0 - System: 5800.0 + System capacity: 36594.0 + System: 6500.0 Main: 1600.0 Descendents: 4300.0 Combined: 5800.0 CPU utilization: System core count: 4 + Number of expected cores: 3 System: - Max core percent: 276.3 - 
Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 66.3 + Mean sum percent: 119.7 + Mean hardware percent: 39.9 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 47.4 + Mean sum percent: 129.733 + Mean hardware percent: 43.244 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 103.867 + Mean sum percent: 249.433 + Mean hardware percent: 83.144 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: seconds Time: 300.0 \ No newline at end of file diff --git a/tests/data/False-not-linux-kilobytes-bytes-days.json b/tests/data/False-not-linux-kilobytes-bytes-days.json index 539c545..f8be40a 100644 --- a/tests/data/False-not-linux-kilobytes-bytes-days.json +++ b/tests/data/False-not-linux-kilobytes-bytes-days.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "bytes", - "system_capacity": 24396000000.0, - "system": 5800000000.0, + "system_capacity": 36594000000.0, + "system": 6500000000.0, "main": 1600000000.0, "descendents": 4300000000.0, "combined": 5800000000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 4, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + 
"max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 49.725, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 29.925 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 35.55, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 32.43333333333333 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 77.9, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 62.35833333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "days", "time": 0.003472222222222222 diff --git a/tests/data/False-not-linux-kilobytes-bytes-days.txt b/tests/data/False-not-linux-kilobytes-bytes-days.txt index 74d156b..a26e309 100644 --- a/tests/data/False-not-linux-kilobytes-bytes-days.txt +++ b/tests/data/False-not-linux-kilobytes-bytes-days.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: bytes - System capacity: 24396000000.0 - System: 5800000000.0 + System capacity: 36594000000.0 + System: 6500000000.0 Main: 1600000000.0 Descendents: 4300000000.0 Combined: 5800000000.0 CPU utilization: System core count: 4 + Number of expected cores: 4 System: - Max core 
percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 49.725 + Mean sum percent: 119.7 + Mean hardware percent: 29.925 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 35.55 + Mean sum percent: 129.733 + Mean hardware percent: 32.433 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 77.9 + Mean sum percent: 249.433 + Mean hardware percent: 62.358 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: days Time: 0.003 \ No newline at end of file diff --git a/tests/data/False-not-linux-kilobytes-gigabytes-minutes.json b/tests/data/False-not-linux-kilobytes-gigabytes-minutes.json index ace2ffe..a01c201 100644 --- a/tests/data/False-not-linux-kilobytes-gigabytes-minutes.json +++ b/tests/data/False-not-linux-kilobytes-gigabytes-minutes.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 24.396, - "system": 5.8, + "system_capacity": 36.594, + "system": 6.5, "main": 1.6, "descendents": 4.3, "combined": 5.8 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 2, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + 
"max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 99.45, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 59.85 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 71.1, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 64.86666666666666 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 155.8, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 124.71666666666665 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 75.0, + "max_hardware_percent": 75.0, + "mean_sum_percent": 43.333333333333336, + "mean_hardware_percent": 43.333333333333336 + } + }, "compute_time": { "unit": "minutes", "time": 5.0 diff --git a/tests/data/False-not-linux-kilobytes-gigabytes-minutes.txt b/tests/data/False-not-linux-kilobytes-gigabytes-minutes.txt index 00d0142..e3143e7 100644 --- a/tests/data/False-not-linux-kilobytes-gigabytes-minutes.txt +++ b/tests/data/False-not-linux-kilobytes-gigabytes-minutes.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: gigabytes - System capacity: 24.396 - System: 5.8 + System capacity: 36.594 + System: 6.5 Main: 1.6 Descendents: 4.3 Combined: 5.8 CPU utilization: System core count: 4 + Number of expected cores: 2 System: - Max core percent: 276.3 - Max CPU percent: 
69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 99.45 + Mean sum percent: 119.7 + Mean hardware percent: 59.85 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 71.1 + Mean sum percent: 129.733 + Mean hardware percent: 64.867 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 155.8 + Mean sum percent: 249.433 + Mean hardware percent: 124.717 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 75.0 + Max hardware percent: 75.0 + Mean sum percent: 43.333 + Mean hardware percent: 43.333 Compute time: Unit: minutes Time: 5.0 \ No newline at end of file diff --git a/tests/data/False-not-linux-megabytes-kilobytes-hours.json b/tests/data/False-not-linux-megabytes-kilobytes-hours.json index 15d697b..cb82585 100644 --- a/tests/data/False-not-linux-megabytes-kilobytes-hours.json +++ b/tests/data/False-not-linux-megabytes-kilobytes-hours.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "kilobytes", - "system_capacity": 24396000.0, - "system": 5800000.0, + "system_capacity": 36594000.0, + "system": 6500000.0, "main": 1600000.0, "descendents": 4300000.0, "combined": 5800000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 1, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 
276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 198.9, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 119.7 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 142.2, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 129.73333333333332 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 311.6, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 249.4333333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 2, + "gpu_percentages": { + "max_sum_percent": 125.0, + "max_hardware_percent": 62.5, + "mean_sum_percent": 75.0, + "mean_hardware_percent": 37.5 + } + }, "compute_time": { "unit": "hours", "time": 0.08333333333333333 diff --git a/tests/data/False-not-linux-megabytes-kilobytes-hours.txt b/tests/data/False-not-linux-megabytes-kilobytes-hours.txt index ee50767..afdff2c 100644 --- a/tests/data/False-not-linux-megabytes-kilobytes-hours.txt +++ b/tests/data/False-not-linux-megabytes-kilobytes-hours.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: kilobytes - System capacity: 24396000.0 - System: 5800000.0 + System capacity: 36594000.0 + System: 6500000.0 Main: 1600000.0 Descendents: 4300000.0 Combined: 5800000.0 CPU utilization: System core count: 4 + Number of expected cores: 1 System: - Max core percent: 276.3 - Max CPU percent: 
69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 198.9 + Mean sum percent: 119.7 + Mean hardware percent: 119.7 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 142.2 + Mean sum percent: 129.733 + Mean hardware percent: 129.733 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 311.6 + Mean sum percent: 249.433 + Mean hardware percent: 249.433 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 2 + GPU percentages: + Max sum percent: 125.0 + Max hardware percent: 62.5 + Mean sum percent: 75.0 + Mean hardware percent: 37.5 Compute time: Unit: hours Time: 0.083 \ No newline at end of file diff --git a/tests/data/True-Linux-bytes-megabytes-seconds.json b/tests/data/True-Linux-bytes-megabytes-seconds.json index 6328498..3093f55 100644 --- a/tests/data/True-Linux-bytes-megabytes-seconds.json +++ b/tests/data/True-Linux-bytes-megabytes-seconds.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "megabytes", - "system_capacity": 24396.0, - "system": 5800.0, + "system_capacity": 36594.0, + "system": 6500.0, "main": 1600.0, "descendents": 4300.0, "combined": 5800.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 3, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + 
"mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 66.3, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 39.9 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 47.4, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 43.24444444444445 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 103.86666666666667, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 83.14444444444445 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "seconds", "time": 300.0 diff --git a/tests/data/True-Linux-bytes-megabytes-seconds.txt b/tests/data/True-Linux-bytes-megabytes-seconds.txt index 16dbb28..d7c8650 100644 --- a/tests/data/True-Linux-bytes-megabytes-seconds.txt +++ b/tests/data/True-Linux-bytes-megabytes-seconds.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22017.0 Max GPU RAM: Unit: megabytes - System capacity: 24396.0 - System: 5800.0 + System capacity: 36594.0 + System: 6500.0 Main: 1600.0 Descendents: 4300.0 Combined: 5800.0 CPU utilization: System core count: 4 + Number of expected cores: 3 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum 
percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 66.3 + Mean sum percent: 119.7 + Mean hardware percent: 39.9 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 47.4 + Mean sum percent: 129.733 + Mean hardware percent: 43.244 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 103.867 + Mean sum percent: 249.433 + Mean hardware percent: 83.144 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: seconds Time: 300.0 \ No newline at end of file diff --git a/tests/data/True-Linux-kilobytes-bytes-days.json b/tests/data/True-Linux-kilobytes-bytes-days.json index 5279afa..d43d05a 100644 --- a/tests/data/True-Linux-kilobytes-bytes-days.json +++ b/tests/data/True-Linux-kilobytes-bytes-days.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "bytes", - "system_capacity": 24396000000.0, - "system": 5800000000.0, + "system_capacity": 36594000000.0, + "system": 6500000000.0, "main": 1600000000.0, "descendents": 4300000000.0, "combined": 5800000000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 4, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + 
"mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 49.725, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 29.925 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 35.55, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 32.43333333333333 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 77.9, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 62.35833333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "days", "time": 0.003472222222222222 diff --git a/tests/data/True-Linux-kilobytes-bytes-days.txt b/tests/data/True-Linux-kilobytes-bytes-days.txt index 21a8209..a81ccd0 100644 --- a/tests/data/True-Linux-kilobytes-bytes-days.txt +++ b/tests/data/True-Linux-kilobytes-bytes-days.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22.017 Max GPU RAM: Unit: bytes - System capacity: 24396000000.0 - System: 5800000000.0 + System capacity: 36594000000.0 + System: 6500000000.0 Main: 1600000000.0 Descendents: 4300000000.0 Combined: 5800000000.0 CPU utilization: System core count: 4 + Number of expected cores: 4 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max 
hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 49.725 + Mean sum percent: 119.7 + Mean hardware percent: 29.925 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 35.55 + Mean sum percent: 129.733 + Mean hardware percent: 32.433 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 77.9 + Mean sum percent: 249.433 + Mean hardware percent: 62.358 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: days Time: 0.003 \ No newline at end of file diff --git a/tests/data/True-Linux-kilobytes-gigabytes-minutes.json b/tests/data/True-Linux-kilobytes-gigabytes-minutes.json index 3e93fab..b970b71 100644 --- a/tests/data/True-Linux-kilobytes-gigabytes-minutes.json +++ b/tests/data/True-Linux-kilobytes-gigabytes-minutes.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 24.396, - "system": 5.8, + "system_capacity": 36.594, + "system": 6.5, "main": 1.6, "descendents": 4.3, "combined": 5.8 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 2, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - 
"max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 99.45, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 59.85 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 71.1, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 64.86666666666666 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 155.8, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 124.71666666666665 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 75.0, + "max_hardware_percent": 75.0, + "mean_sum_percent": 43.333333333333336, + "mean_hardware_percent": 43.333333333333336 + } + }, "compute_time": { "unit": "minutes", "time": 5.0 diff --git a/tests/data/True-Linux-kilobytes-gigabytes-minutes.txt b/tests/data/True-Linux-kilobytes-gigabytes-minutes.txt index 8ca2677..53c1719 100644 --- a/tests/data/True-Linux-kilobytes-gigabytes-minutes.txt +++ b/tests/data/True-Linux-kilobytes-gigabytes-minutes.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22.017 Max GPU RAM: Unit: gigabytes - System capacity: 24.396 - System: 5.8 + System capacity: 36.594 + System: 6.5 Main: 1.6 Descendents: 4.3 Combined: 5.8 CPU utilization: System core count: 4 + Number of expected cores: 2 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 
Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 99.45 + Mean sum percent: 119.7 + Mean hardware percent: 59.85 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 71.1 + Mean sum percent: 129.733 + Mean hardware percent: 64.867 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 155.8 + Mean sum percent: 249.433 + Mean hardware percent: 124.717 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 75.0 + Max hardware percent: 75.0 + Mean sum percent: 43.333 + Mean hardware percent: 43.333 Compute time: Unit: minutes Time: 5.0 \ No newline at end of file diff --git a/tests/data/True-Linux-megabytes-kilobytes-hours.json b/tests/data/True-Linux-megabytes-kilobytes-hours.json index e5c30ec..e856451 100644 --- a/tests/data/True-Linux-megabytes-kilobytes-hours.json +++ b/tests/data/True-Linux-megabytes-kilobytes-hours.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "kilobytes", - "system_capacity": 24396000.0, - "system": 5800000.0, + "system_capacity": 36594000.0, + "system": 6500000.0, "main": 1600000.0, "descendents": 4300000.0, "combined": 5800000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 1, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - 
"mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 198.9, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 119.7 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 142.2, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 129.73333333333332 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 311.6, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 249.4333333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 2, + "gpu_percentages": { + "max_sum_percent": 125.0, + "max_hardware_percent": 62.5, + "mean_sum_percent": 75.0, + "mean_hardware_percent": 37.5 + } + }, "compute_time": { "unit": "hours", "time": 0.08333333333333333 diff --git a/tests/data/True-Linux-megabytes-kilobytes-hours.txt b/tests/data/True-Linux-megabytes-kilobytes-hours.txt index f6d8b0e..51f338b 100644 --- a/tests/data/True-Linux-megabytes-kilobytes-hours.txt +++ b/tests/data/True-Linux-megabytes-kilobytes-hours.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.022 Max GPU RAM: Unit: kilobytes - System capacity: 24396000.0 - System: 5800000.0 + System capacity: 36594000.0 + System: 6500000.0 Main: 1600000.0 Descendents: 4300000.0 Combined: 5800000.0 CPU utilization: System core count: 4 + Number of expected cores: 1 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU 
percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 198.9 + Mean sum percent: 119.7 + Mean hardware percent: 119.7 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 142.2 + Mean sum percent: 129.733 + Mean hardware percent: 129.733 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 311.6 + Mean sum percent: 249.433 + Mean hardware percent: 249.433 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 2 + GPU percentages: + Max sum percent: 125.0 + Max hardware percent: 62.5 + Mean sum percent: 75.0 + Mean hardware percent: 37.5 Compute time: Unit: hours Time: 0.083 \ No newline at end of file diff --git a/tests/data/True-not-linux-bytes-megabytes-seconds.json b/tests/data/True-not-linux-bytes-megabytes-seconds.json index 014365f..62e2a04 100644 --- a/tests/data/True-not-linux-bytes-megabytes-seconds.json +++ b/tests/data/True-not-linux-bytes-megabytes-seconds.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "megabytes", - "system_capacity": 24396.0, - "system": 5800.0, + "system_capacity": 36594.0, + "system": 6500.0, "main": 1600.0, "descendents": 4300.0, "combined": 5800.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 3, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 
29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 66.3, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 39.9 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 47.4, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 43.24444444444445 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 103.86666666666667, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 83.14444444444445 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "seconds", "time": 300.0 diff --git a/tests/data/True-not-linux-bytes-megabytes-seconds.txt b/tests/data/True-not-linux-bytes-megabytes-seconds.txt index 3814148..fd05eb2 100644 --- a/tests/data/True-not-linux-bytes-megabytes-seconds.txt +++ b/tests/data/True-not-linux-bytes-megabytes-seconds.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: megabytes - System capacity: 24396.0 - System: 5800.0 + System capacity: 36594.0 + System: 6500.0 Main: 1600.0 Descendents: 4300.0 Combined: 5800.0 CPU utilization: System core count: 4 + Number of expected cores: 3 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU 
percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 66.3 + Mean sum percent: 119.7 + Mean hardware percent: 39.9 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 47.4 + Mean sum percent: 129.733 + Mean hardware percent: 43.244 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 103.867 + Mean sum percent: 249.433 + Mean hardware percent: 83.144 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: seconds Time: 300.0 \ No newline at end of file diff --git a/tests/data/True-not-linux-kilobytes-bytes-days.json b/tests/data/True-not-linux-kilobytes-bytes-days.json index 539c545..f8be40a 100644 --- a/tests/data/True-not-linux-kilobytes-bytes-days.json +++ b/tests/data/True-not-linux-kilobytes-bytes-days.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "bytes", - "system_capacity": 24396000000.0, - "system": 5800000000.0, + "system_capacity": 36594000000.0, + "system": 6500000000.0, "main": 1600000000.0, "descendents": 4300000000.0, "combined": 5800000000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 4, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 
198.9, + "max_hardware_percent": 49.725, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 29.925 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 35.55, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 32.43333333333333 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 77.9, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 62.35833333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "days", "time": 0.003472222222222222 diff --git a/tests/data/True-not-linux-kilobytes-bytes-days.txt b/tests/data/True-not-linux-kilobytes-bytes-days.txt index 74d156b..a26e309 100644 --- a/tests/data/True-not-linux-kilobytes-bytes-days.txt +++ b/tests/data/True-not-linux-kilobytes-bytes-days.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: bytes - System capacity: 24396000000.0 - System: 5800000000.0 + System capacity: 36594000000.0 + System: 6500000000.0 Main: 1600000000.0 Descendents: 4300000000.0 Combined: 5800000000.0 CPU utilization: System core count: 4 + Number of expected cores: 4 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU 
percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 49.725 + Mean sum percent: 119.7 + Mean hardware percent: 29.925 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 35.55 + Mean sum percent: 129.733 + Mean hardware percent: 32.433 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 77.9 + Mean sum percent: 249.433 + Mean hardware percent: 62.358 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: days Time: 0.003 \ No newline at end of file diff --git a/tests/data/True-not-linux-kilobytes-gigabytes-minutes.json b/tests/data/True-not-linux-kilobytes-gigabytes-minutes.json index ace2ffe..a01c201 100644 --- a/tests/data/True-not-linux-kilobytes-gigabytes-minutes.json +++ b/tests/data/True-not-linux-kilobytes-gigabytes-minutes.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 24.396, - "system": 5.8, + "system_capacity": 36.594, + "system": 6.5, "main": 1.6, "descendents": 4.3, "combined": 5.8 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 2, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 
99.45, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 59.85 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 71.1, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 64.86666666666666 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 155.8, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 124.71666666666665 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 75.0, + "max_hardware_percent": 75.0, + "mean_sum_percent": 43.333333333333336, + "mean_hardware_percent": 43.333333333333336 + } + }, "compute_time": { "unit": "minutes", "time": 5.0 diff --git a/tests/data/True-not-linux-kilobytes-gigabytes-minutes.txt b/tests/data/True-not-linux-kilobytes-gigabytes-minutes.txt index 00d0142..e3143e7 100644 --- a/tests/data/True-not-linux-kilobytes-gigabytes-minutes.txt +++ b/tests/data/True-not-linux-kilobytes-gigabytes-minutes.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: gigabytes - System capacity: 24.396 - System: 5.8 + System capacity: 36.594 + System: 6.5 Main: 1.6 Descendents: 4.3 Combined: 5.8 CPU utilization: System core count: 4 + Number of expected cores: 2 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware 
percent: 99.45 + Mean sum percent: 119.7 + Mean hardware percent: 59.85 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 71.1 + Mean sum percent: 129.733 + Mean hardware percent: 64.867 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 155.8 + Mean sum percent: 249.433 + Mean hardware percent: 124.717 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 75.0 + Max hardware percent: 75.0 + Mean sum percent: 43.333 + Mean hardware percent: 43.333 Compute time: Unit: minutes Time: 5.0 \ No newline at end of file diff --git a/tests/data/True-not-linux-megabytes-kilobytes-hours.json b/tests/data/True-not-linux-megabytes-kilobytes-hours.json index 15d697b..cb82585 100644 --- a/tests/data/True-not-linux-megabytes-kilobytes-hours.json +++ b/tests/data/True-not-linux-megabytes-kilobytes-hours.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "kilobytes", - "system_capacity": 24396000.0, - "system": 5800000.0, + "system_capacity": 36594000.0, + "system": 6500000.0, "main": 1600000.0, "descendents": 4300000.0, "combined": 5800000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 1, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 198.9, + 
"mean_sum_percent": 119.7, + "mean_hardware_percent": 119.7 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 142.2, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 129.73333333333332 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 311.6, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 249.4333333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 2, + "gpu_percentages": { + "max_sum_percent": 125.0, + "max_hardware_percent": 62.5, + "mean_sum_percent": 75.0, + "mean_hardware_percent": 37.5 + } + }, "compute_time": { "unit": "hours", "time": 0.08333333333333333 diff --git a/tests/data/True-not-linux-megabytes-kilobytes-hours.txt b/tests/data/True-not-linux-megabytes-kilobytes-hours.txt index ee50767..afdff2c 100644 --- a/tests/data/True-not-linux-megabytes-kilobytes-hours.txt +++ b/tests/data/True-not-linux-megabytes-kilobytes-hours.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: kilobytes - System capacity: 24396000.0 - System: 5800000.0 + System capacity: 36594000.0 + System: 6500000.0 Main: 1600000.0 Descendents: 4300000.0 Combined: 5800000.0 CPU utilization: System core count: 4 + Number of expected cores: 1 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max 
hardware percent: 198.9 + Mean sum percent: 119.7 + Mean hardware percent: 119.7 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 142.2 + Mean sum percent: 129.733 + Mean hardware percent: 129.733 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 311.6 + Mean sum percent: 249.433 + Mean hardware percent: 249.433 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 2 + GPU percentages: + Max sum percent: 125.0 + Max hardware percent: 62.5 + Mean sum percent: 75.0 + Mean hardware percent: 37.5 Compute time: Unit: hours Time: 0.083 \ No newline at end of file diff --git a/tests/test_tracker.py b/tests/test_tracker.py index 8841aa7..2bb0105 100644 --- a/tests/test_tracker.py +++ b/tests/test_tracker.py @@ -19,15 +19,17 @@ def multiply_list(_list: list, multiple=2) -> list: test_tracker_data = [ - ('bytes', 'megabytes', 'seconds'), - ('kilobytes', 'gigabytes', 'minutes'), - ('megabytes', 'kilobytes', 'hours'), - ('kilobytes', 'bytes', 'days') + ('bytes', 'megabytes', 'seconds', None, 3), + ('kilobytes', 'gigabytes', 'minutes', {'gpu-id1'}, 2), + ('megabytes', 'kilobytes', 'hours', {'gpu-id1', 'gpu-id2'}, 1), + ('kilobytes', 'bytes', 'days', {'gpu-id1', 'gpu-id2', 'gpu-id3'}, None) ] -@pt.mark.parametrize('ram_unit,gpu_ram_unit,time_unit', test_tracker_data) -def test_tracker(mocker, use_context_manager: bool, operating_system: str, ram_unit: str, gpu_ram_unit: str, time_unit: str): +@pt.mark.parametrize('ram_unit,gpu_ram_unit,time_unit,gpu_uuids,n_expected_cores', test_tracker_data) +def test_tracker( + mocker, use_context_manager: bool, operating_system: str, ram_unit: str, gpu_ram_unit: str, time_unit: str, gpu_uuids: set[str], + 
n_expected_cores: int): class EventMock: def __init__(self): self.count = 0 @@ -126,13 +128,13 @@ def start_mock(self): mocker.MagicMock(used=29 * 1e9)]) nvidia_smi_outputs = [ b'', - b'12198 MiB\n12198 MiB', - b'', - b'', - b'12,1600 MiB\n21,700 MiB\n22,200 MiB', - b'1600 MiB\n900 MiB', - b'12,1500 MiB\n21,2100 MiB\n22,2200 MiB', - b'1500 MiB\n4300 MiB'] + b' uuid,memory.total [MiB]\ngpu-id1,12198 MiB\ngpu-id2,12198 MiB\ngpu-id3 , 12198MiB', + b'pid, used_gpu_memory [MiB]\n', + b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 0 MiB, 0 %\ngpu-id2 , 0 MiB, 0 %\ngpu-id3 , 0 MiB, 0 %', + b'pid, used_gpu_memory [MiB]\n12,1600 MiB\n21,700 MiB\n22,200 MiB', + b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1600 MiB,75 %\ngpu-id2,900 MiB , 50 %\n gpu-id3, 500 MiB, 25 %', + b'pid, used_gpu_memory [MiB]\n12,1500 MiB\n21,2100 MiB\n22,2200 MiB', + b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1500 MiB, 55 %\n gpu-id2, 4300 MiB, 45%\ngpu-id3,700MiB,35%'] check_output_mock = mocker.patch('gpu_tracker.tracker.subp.check_output', side_effect=nvidia_smi_outputs) cpu_count_mock = mocker.patch('gpu_tracker.tracker.psutil.cpu_count', return_value=4) cpu_percent_mock = mocker.patch( @@ -146,11 +148,12 @@ def start_mock(self): if use_context_manager: with gput.Tracker( sleep_time=sleep_time, join_timeout=join_timeout, ram_unit=ram_unit, gpu_ram_unit=gpu_ram_unit, - time_unit=time_unit) as tracker: + time_unit=time_unit, gpu_uuids=gpu_uuids, n_expected_cores=n_expected_cores) as tracker: pass else: tracker = gput.Tracker( - sleep_time=sleep_time, join_timeout=join_timeout, ram_unit=ram_unit, gpu_ram_unit=gpu_ram_unit, time_unit=time_unit) + sleep_time=sleep_time, join_timeout=join_timeout, ram_unit=ram_unit, gpu_ram_unit=gpu_ram_unit, time_unit=time_unit, + gpu_uuids=gpu_uuids, n_expected_cores=n_expected_cores) tracker.start() tracker.stop() assert start_mock.called @@ -233,13 +236,24 @@ def test_warnings(mocker, caplog): assert not 
os.path.isfile(tracker._resource_usage_file) -def test_validate_unit(): +def test_validate_arguments(mocker): with pt.raises(ValueError) as error: gput.Tracker(sleep_time=0.0) assert str(error.value) == 'Sleep time of 0.0 is invalid. Must be at least 0.1 seconds.' with pt.raises(ValueError) as error: gput.Tracker(ram_unit='milibytes') assert str(error.value) == '"milibytes" is not a valid RAM unit. Valid values are bytes, gigabytes, kilobytes, megabytes, terabytes' + subprocess_mock = mocker.patch( + 'gpu_tracker.tracker.subp', check_output=mocker.MagicMock( + side_effect=[b'', b'uuid ,memory.total [MiB] \ngpu-id1,2048 MiB\ngpu-id2,2048 MiB', b'', b'uuid ,memory.total [MiB] '])) + with pt.raises(ValueError) as error: + gput.Tracker(gpu_uuids={'invalid-id'}) + assert len(subprocess_mock.check_output.call_args_list) == 2 + assert str(error.value) == 'GPU UUID of invalid-id is not valid. Available UUIDs are: gpu-id1, gpu-id2' + with pt.raises(ValueError) as error: + gput.Tracker(gpu_uuids=set[str]()) + assert len(subprocess_mock.check_output.call_args_list) == 4 + assert str(error.value) == 'gpu_uuids is not None but the set is empty. Please provide a set of at least one GPU UUID.' 
def test_state(mocker): From b236d994cf7a086b3a94a44c4284206a0233978d Mon Sep 17 00:00:00 2001 From: erikhuck Date: Wed, 5 Jun 2024 11:48:59 -0400 Subject: [PATCH 6/9] Adds tests for the CLI --- src/gpu_tracker/__main__.py | 6 +++-- src/gpu_tracker/tracker.py | 2 +- tests/test_cli.py | 54 ++++++++++++++++++++++++++++++++++--- tests/test_tracker.py | 42 +++++++++++++---------------- tests/utils.py | 3 +++ 5 files changed, 77 insertions(+), 30 deletions(-) create mode 100644 tests/utils.py diff --git a/src/gpu_tracker/__main__.py b/src/gpu_tracker/__main__.py index 9424003..4054523 100644 --- a/src/gpu_tracker/__main__.py +++ b/src/gpu_tracker/__main__.py @@ -29,8 +29,7 @@ def main(): args = doc.docopt(__doc__, version=__version__) - print(args['--execute']) - command = args['--execute'].split(' ') + command = args['--execute'].split() output = args['--output'] output_format = args['--format'] if args['--format'] is not None else 'text' option_map = { @@ -45,6 +44,9 @@ def main(): '--execute', '--output', '--format', '--help', '--version'}} if 'sleep_time' in kwargs.keys(): kwargs['sleep_time'] = float(kwargs['sleep_time']) + if len(command) == 0: + log.error('Empty command provided.') + sys.exit(1) try: process = subp.Popen(command) except FileNotFoundError: diff --git a/src/gpu_tracker/tracker.py b/src/gpu_tracker/tracker.py index 857f147..2072d88 100644 --- a/src/gpu_tracker/tracker.py +++ b/src/gpu_tracker/tracker.py @@ -234,7 +234,7 @@ def _query_gpu(nvidia_command: str) -> pd.DataFrame: output = subp.check_output(command.split(), stderr=subp.STDOUT).decode() gpu_info = pd.read_csv(io.StringIO(output)) gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns] - return gpu_info.applymap(lambda value: value.strip() if type(value) is str else value) + return gpu_info.map(lambda value: value.strip() if type(value) is str else value) def _get_gpu_ram(self, gpu_info: pd.DataFrame, column: str) -> float: gpu_rams = 
gpu_info[column] diff --git a/tests/test_cli.py b/tests/test_cli.py index 70cfe3c..1187241 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -1,12 +1,58 @@ import gpu_tracker.__main__ as cli import pytest as pt +import os +import utils -test_data = [['-e', ]] +@pt.fixture(name='format_', params=['text', 'json', None]) +def get_format(request) -> str: + yield request.param -@pt.mark.parametrize('argv', test_data) -def test_main(mocker, argv: list[str]): + +@pt.fixture(name='output', params=['my-file', None]) +def get_output(request) -> str: + yield request.param + + +test_data = [ + (['-e', 'my-command', '--ru=kilobytes'], ['my-command'], {'disable_logs': False, 'ram_unit': 'kilobytes'}), + (['--execute', 'my-command arg1 ', '--disable-logs'], ['my-command', 'arg1'], {'disable_logs': True}), + (['--execute=my-command arg1 arg2', '--st=0.4'], ['my-command', 'arg1', 'arg2'], {'disable_logs': False, 'sleep_time': 0.4}), + ( + ['-e', 'my-command', '--gru=megabytes', '--tu=days'], ['my-command'], + {'disable_logs': False, 'gpu_ram_unit': 'megabytes', 'time_unit': 'days'} + )] + + +@pt.mark.parametrize('argv,command,kwargs', test_data) +def test_main(mocker, argv: list[str], command: list[str], kwargs: dict, format_: str | None, output: str | None): argv = ['gpu-tracker'] + argv + argv += ['-f', format_] if format_ else [] + argv += ['-o', output] if output else [] mocker.patch('sys.argv', argv) - mocker.patch('gpu_tracker.__main__.subp', ) + process_mock = mocker.MagicMock(returncode=0, pid=666) + subprocess_mock = mocker.patch('gpu_tracker.__main__.subp', Popen=mocker.MagicMock(return_value=process_mock)) + tracker_str = 'tracker-str' + tracker_json = {'tracker': 'json'} + tracker_mock = mocker.MagicMock( + __str__=mocker.MagicMock(return_value=tracker_str), to_json=mocker.MagicMock(return_value=tracker_json), __enter__=lambda self: self) + TrackerMock = mocker.patch('gpu_tracker.__main__.Tracker', return_value=tracker_mock) + print_mock = 
mocker.patch('builtins.print') cli.main() + TrackerMock.assert_called_with(process_id=process_mock.pid, **kwargs) + subprocess_mock.Popen.assert_called_once_with(command) + process_mock.wait.assert_called_once_with() + if format_ == 'text' or format_ is None: + tracker_mock.__str__.assert_called_once_with() + output_str = tracker_str + else: + tracker_mock.to_json.assert_called_once_with() + output_str = '{\n "tracker": "json"\n}' + print_args = [('Resource tracking complete. Process completed with status code: 0',)] + if output is None: + print_args.append((output_str,)) + else: + with open(output, 'r') as file: + assert output_str == file.read() + os.remove(output) + utils.assert_args_list(print_mock, print_args) diff --git a/tests/test_tracker.py b/tests/test_tracker.py index 2bb0105..0e1c7aa 100644 --- a/tests/test_tracker.py +++ b/tests/test_tracker.py @@ -2,6 +2,7 @@ import json import os import pytest as pt +import utils @pt.fixture(name='operating_system', params=['Linux', 'not-linux']) @@ -159,35 +160,35 @@ def start_mock(self): assert start_mock.called assert not os.path.isfile(tracker._resource_usage_file) assert not log_spy.called - _assert_args_list(virtual_memory_mock, [()] * 4) + utils.assert_args_list(virtual_memory_mock, [()] * 4) system_mock.assert_called_once_with() EventMock.assert_called_once_with() - _assert_args_list(mock=tracker._stop_event.is_set, expected_args_list=[()] * 4) - _assert_args_list(mock=PsProcessMock, expected_args_list=[(main_process_id,)] * 2) - _assert_args_list(current_process_mock.children, [()] * 2) - _assert_args_list(mock=main_process_mock.children, expected_args_list=[{'recursive': True}] * 3, use_kwargs=True) + utils.assert_args_list(mock=tracker._stop_event.is_set, expected_args_list=[()] * 4) + utils.assert_args_list(mock=PsProcessMock, expected_args_list=[(main_process_id,)] * 2) + utils.assert_args_list(current_process_mock.children, [()] * 2) + utils.assert_args_list(mock=main_process_mock.children, 
expected_args_list=[{'recursive': True}] * 3, use_kwargs=True) if operating_system == 'Linux': - _assert_args_list(mock=main_process_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) - _assert_args_list(mock=child1_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) - _assert_args_list(mock=child2_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) + utils.assert_args_list(mock=main_process_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) + utils.assert_args_list(mock=child1_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) + utils.assert_args_list(mock=child2_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) else: - _assert_args_list(mock=main_process_mock.memory_info, expected_args_list=[()] * 6) - _assert_args_list(mock=child1_mock.memory_info, expected_args_list=[()] * 6) - _assert_args_list(mock=child2_mock.memory_info, expected_args_list=[()] * 6) + utils.assert_args_list(mock=main_process_mock.memory_info, expected_args_list=[()] * 6) + utils.assert_args_list(mock=child1_mock.memory_info, expected_args_list=[()] * 6) + utils.assert_args_list(mock=child2_mock.memory_info, expected_args_list=[()] * 6) assert len(check_output_mock.call_args_list) == 8 os_mock.getpid.assert_called_once_with() - _assert_args_list(mock=time_mock.time, expected_args_list=[()] * 5) + utils.assert_args_list(mock=time_mock.time, expected_args_list=[()] * 5) cpu_percent_interval = gput.tracker._TrackingProcess._CPU_PERCENT_INTERVAL true_sleep_time = sleep_time - cpu_percent_interval - _assert_args_list( + utils.assert_args_list( mock=time_mock.sleep, expected_args_list=[(cpu_percent_interval,), (true_sleep_time,)] * 3) tracker._stop_event.set.assert_called_once_with() tracker._tracking_process.join.assert_called_once_with(timeout=join_timeout) - _assert_args_list(mock=tracker._tracking_process.is_alive, 
expected_args_list=[()] * 2) + utils.assert_args_list(mock=tracker._tracking_process.is_alive, expected_args_list=[()] * 2) assert not tracker._tracking_process.terminate.called tracker._tracking_process.close.assert_called_once_with() cpu_count_mock.assert_called_once_with() - _assert_args_list(cpu_percent_mock, [()] * 3) + utils.assert_args_list(cpu_percent_mock, [()] * 3) expected_measurements_file = f'tests/data/{use_context_manager}-{operating_system}-{ram_unit}-{gpu_ram_unit}-{time_unit}' with open(f'{expected_measurements_file}.txt', 'r') as file: expected_tracker_str = file.read() @@ -197,11 +198,6 @@ def start_mock(self): assert expected_measurements == tracker.to_json() -def _assert_args_list(mock, expected_args_list: list[tuple | dict], use_kwargs: bool = False): - actual_args_list = [call.kwargs if use_kwargs else call.args for call in mock.call_args_list] - assert actual_args_list == expected_args_list - - def test_warnings(mocker, caplog): n_join_attempts = 3 join_timeout = 5.2 @@ -215,10 +211,10 @@ def test_warnings(mocker, caplog): with gput.Tracker(n_join_attempts=n_join_attempts, join_timeout=join_timeout) as tracker: set_spy = mocker.spy(tracker._stop_event, 'set') subprocess_mock.check_output.assert_called_once() - _assert_args_list(mock=set_spy, expected_args_list=[()] * n_join_attempts) - _assert_args_list( + utils.assert_args_list(mock=set_spy, expected_args_list=[()] * n_join_attempts) + utils.assert_args_list( mock=join_spy, expected_args_list=[{'timeout': join_timeout}] * n_join_attempts, use_kwargs=True) - _assert_args_list(mock=tracker._tracking_process.is_alive, expected_args_list=[()] * (n_join_attempts + 1)) + utils.assert_args_list(mock=tracker._tracking_process.is_alive, expected_args_list=[()] * (n_join_attempts + 1)) terminate_spy.assert_called_once() close_spy.assert_called_once() expected_warnings = [ diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..cb9f7f9 --- /dev/null +++ b/tests/utils.py @@ 
-0,0 +1,3 @@ +def assert_args_list(mock, expected_args_list: list[tuple | dict], use_kwargs: bool = False): + actual_args_list = [call.kwargs if use_kwargs else call.args for call in mock.call_args_list] + assert actual_args_list == expected_args_list From e6e352e6f560215bedf3c40ff47fa81103f5a779 Mon Sep 17 00:00:00 2001 From: erikhuck Date: Wed, 5 Jun 2024 16:21:38 -0400 Subject: [PATCH 7/9] Completes test coverage --- src/gpu_tracker/__main__.py | 3 +- src/gpu_tracker/tracker.py | 8 +++--- tests/test_cli.py | 29 +++++++++++++++++-- tests/test_tracker.py | 57 +++++++++++++++++++++++++++++++++---- 4 files changed, 85 insertions(+), 12 deletions(-) diff --git a/src/gpu_tracker/__main__.py b/src/gpu_tracker/__main__.py index 4054523..d2837da 100644 --- a/src/gpu_tracker/__main__.py +++ b/src/gpu_tracker/__main__.py @@ -64,7 +64,8 @@ def main(): elif output_format == 'text': output_str = str(tracker) else: - raise ValueError(f'"{output_format} is not a valid format. Valid values are "json" or "text".') + log.error(f'"{output_format}" is not a valid format. Valid values are "json" or "text".') + sys.exit(1) if output is None: print(output_str) else: diff --git a/src/gpu_tracker/tracker.py b/src/gpu_tracker/tracker.py index 2072d88..027935c 100644 --- a/src/gpu_tracker/tracker.py +++ b/src/gpu_tracker/tracker.py @@ -136,7 +136,7 @@ def run(self): self._resource_usage.max_ram.system = max( self._resource_usage.max_ram.system, psutil.virtual_memory().used * self._ram_coefficient) # Get the maximum GPU RAM usage if available. 
- if self._nvidia_available: + if self._nvidia_available: # pragma: nocover gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-compute-apps=pid,used_gpu_memory') if len(gpu_info): process_ids = {self._main_process_id} @@ -193,8 +193,8 @@ def _map_processes(self, processes: list[psutil.Process], map_func: typ.Callable for process in processes: try: mapped_list.append(map_func(process)) - except psutil.NoSuchProcess: - self._log_warning('Attempted to obtain usage information of a process that no longer exists.') + except psutil.NoSuchProcess: # pragma: nocover + self._log_warning('Attempted to obtain usage information of a process that no longer exists.') # pragma: nocover return mapped_list def _update_ram(self, rss_values: RSSValues, processes: list[psutil.Process]): @@ -364,7 +364,7 @@ def __exit__(self, *_): f'last updated. Resource usage was not updated during that time.') os.remove(self._resource_usage_file) else: - raise RuntimeError('The temporary tracking results file does not exist. Tracking results cannot be obtained.') + raise RuntimeError('The temporary tracking results file does not exist. 
Tracking results cannot be obtained.') # pragma: nocover self.state = Tracker.State.STOPPED def start(self): diff --git a/tests/test_cli.py b/tests/test_cli.py index 1187241..73013f2 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -5,12 +5,12 @@ @pt.fixture(name='format_', params=['text', 'json', None]) -def get_format(request) -> str: +def get_format(request) -> str | None: yield request.param @pt.fixture(name='output', params=['my-file', None]) -def get_output(request) -> str: +def get_output(request) -> str | None: yield request.param @@ -56,3 +56,28 @@ def test_main(mocker, argv: list[str], command: list[str], kwargs: dict, format_ assert output_str == file.read() os.remove(output) utils.assert_args_list(print_mock, print_args) + + +error_data = [ + (['-e '], 'Empty command provided.'), (['-e', 'my-command'], 'Command not found: "my-command"'), + (['-e', 'my-command'], f'The following error occurred when starting the command "my-command":'), + (['-e', 'my-command', '-f', 'invalid-format'], '"invalid-format" is not a valid format. 
Valid values are "json" or "text".')] + + +@pt.mark.parametrize('argv,error_message', error_data) +def test_errors(mocker, argv: list[str], error_message: str): + argv = ['gpu-tracker'] + argv + mocker.patch('sys.argv', argv) + if 'Command not found' in error_message: + popen_side_effect = FileNotFoundError + elif 'The following error occurred' in error_message: + popen_side_effect = Exception + else: + popen_side_effect = mocker.MagicMock() + mocker.patch('gpu_tracker.__main__.subp.Popen', side_effect=popen_side_effect) + log_mock = mocker.patch('gpu_tracker.__main__.log', error=mocker.MagicMock()) + mocker.patch('gpu_tracker.__main__.Tracker') + with pt.raises(SystemExit) as error: + cli.main() + assert str(error.value) == '1' + log_mock.error.assert_called_once_with(error_message) diff --git a/tests/test_tracker.py b/tests/test_tracker.py index 0e1c7aa..10cbc7b 100644 --- a/tests/test_tracker.py +++ b/tests/test_tracker.py @@ -1,9 +1,13 @@ import gpu_tracker as gput +import psutil import json import os import pytest as pt import utils +nvidia_smi_unavailable_message = 'The nvidia-smi command is not available. Please install the Nvidia drivers to track GPU usage. ' \ + 'Otherwise the Max GPU RAM values will remain 0.0' + @pt.fixture(name='operating_system', params=['Linux', 'not-linux']) def get_operating_system(request) -> str: @@ -198,7 +202,7 @@ def start_mock(self): assert expected_measurements == tracker.to_json() -def test_warnings(mocker, caplog): +def test_main_process_warnings(mocker, caplog): n_join_attempts = 3 join_timeout = 5.2 subprocess_mock = mocker.patch('gpu_tracker.tracker.subp', check_output=mocker.MagicMock(side_effect=FileNotFoundError)) @@ -217,19 +221,62 @@ def test_warnings(mocker, caplog): utils.assert_args_list(mock=tracker._tracking_process.is_alive, expected_args_list=[()] * (n_join_attempts + 1)) terminate_spy.assert_called_once() close_spy.assert_called_once() - expected_warnings = [ - 'The nvidia-smi command is not available. 
Please install the Nvidia drivers to track GPU usage. ' - 'Otherwise the Max GPU RAM values will remain 0.0'] + expected_warnings = [nvidia_smi_unavailable_message] expected_warnings += ['The tracking process is still alive after join timout. Attempting to join again...'] * n_join_attempts expected_warnings.append( 'The tracking process is still alive after 3 attempts to join. Terminating the process by force...') expected_warnings.append( 'Tracking is stopping and it has been 11.0 seconds since the temporary tracking results file was last updated. ' 'Resource usage was not updated during that time.') + assert not os.path.isfile(tracker._resource_usage_file) + _assert_warnings(caplog, expected_warnings) + + +def _assert_warnings(caplog, expected_warnings: list[str]): for expected_warning, record in zip(expected_warnings, caplog.records): assert record.levelname == 'WARNING' assert record.message == expected_warning - assert not os.path.isfile(tracker._resource_usage_file) + + +@pt.fixture(name='disable_logs', params=[True, False]) +def get_disable_logs(request) -> bool: + yield request.param + + +def test_tracking_process_warnings(mocker, disable_logs: bool, caplog): + main_process_id = 666 + child_process_id = 777 + error_message = 'Unexpected error' + ProcessMock = mocker.patch( + 'gpu_tracker.tracker.psutil.Process', + side_effect=[ + mocker.MagicMock(), psutil.NoSuchProcess(pid=666), mocker.MagicMock(), + mocker.MagicMock(children=mocker.MagicMock( + side_effect=[psutil.NoSuchProcess(child_process_id), RuntimeError(error_message)]))]) + subprocess_mock = mocker.patch('gpu_tracker.tracker.subp', check_output=mocker.MagicMock(side_effect=FileNotFoundError)) + log_spy = mocker.spy(gput.tracker.log, 'warning') + tracker = gput.Tracker(process_id=main_process_id, disable_logs=disable_logs) + tracker._tracking_process.run() + os.remove(tracker._resource_usage_file) + mocker.patch( + 'gpu_tracker.tracker.mproc.Event', 
return_value=mocker.MagicMock(is_set=mocker.MagicMock(side_effect=[False, False, True]))) + print_mock = mocker.patch('builtins.print') + tracker = gput.Tracker(process_id=main_process_id, disable_logs=disable_logs) + tracker._tracking_process.run() + os.remove(tracker._resource_usage_file) + utils.assert_args_list(ProcessMock, [(os.getpid(),), (main_process_id,), (os.getpid(),), (main_process_id,)]) + [printed] = print_mock.call_args_list + [printed] = printed.args + assert error_message == str(printed) + assert len(subprocess_mock.check_output.call_args_list) == 2 + if disable_logs: + assert not log_spy.called + else: + expected_warnings = [ + nvidia_smi_unavailable_message, 'The target process of ID 666 ended before tracking could begin.', nvidia_smi_unavailable_message, + 'Failed to track a process (PID: 777) that does not exist. This possibly resulted from the process completing before it could be tracked.', + 'The following uncaught exception occurred in the tracking process:'] + _assert_warnings(caplog, expected_warnings) def test_validate_arguments(mocker): From e0ae0e24729c47ec5e6e8929ea47022516cbf24d Mon Sep 17 00:00:00 2001 From: erikhuck Date: Thu, 6 Jun 2024 15:47:10 -0400 Subject: [PATCH 8/9] Updates the documentation --- README.rst | 2 +- docs/notebook/tutorial.ipynb | 723 ++++++++++++++++++--------------- docs/tutorial.rst | 745 ++++++++++++++++++++--------------- src/gpu_tracker/__main__.py | 12 +- src/gpu_tracker/tracker.py | 4 +- tests/test_cli.py | 7 +- 6 files changed, 851 insertions(+), 642 deletions(-) diff --git a/README.rst b/README.rst index 6178af6..9e32de0 100644 --- a/README.rst +++ b/README.rst @@ -3,7 +3,7 @@ gpu_tracker ########### Description ----------- -The ``gpu_tracker`` package provides a ``Tracker`` class and a commandline-interface that tracks (profiles) the usage of compute time, CPU utilization, maximum RAM, and maximum GPU RAM. 
+The ``gpu_tracker`` package provides a ``Tracker`` class and a commandline-interface that tracks (profiles) the usage of compute time, CPU utilization, maximum RAM, GPU utilization, and maximum GPU RAM. The compute time is a measurement of the real time taken by the task as opposed to the CPU-utilization time. The GPU tracking is for Nvidia GPUs and uses the ``nvidia-smi`` command. If the Nvidia drivers have not been installed, then the max GPU RAM is not tracked and measurements are reported as 0. Computational resources are tracked throughout the duration of a context manager or the duration of explicit calls to the ``start()`` and ``stop()`` methods of the ``Tracker`` class. diff --git a/docs/notebook/tutorial.ipynb b/docs/notebook/tutorial.ipynb index a07555c..540f9fd 100644 --- a/docs/notebook/tutorial.ipynb +++ b/docs/notebook/tutorial.ipynb @@ -21,7 +21,7 @@ "id": "2bb9e84a-8523-4e5f-bc01-1d6b234c19a6", "metadata": {}, "source": [ - "The `gpu_tracker` package provides the `Tracker` class which uses a subprocess to measure computational resource usage, namely the compute time, maximum CPU utilization, mean CPU utilization, maximum RAM used, and maximum GPU RAM used. The `start()` method starts this process which tracks usage in the background. After calling `start()`, one can write the code for which resource usage is measured, followed by calling the `stop()` method. The compute time will be the time from the call to `start()` to the call to `stop()` and the RAM, GPU RAM, and CPU utilization quantities will be the respective computational resources used by the code that's in between `start()` and `stop()`." + "The `gpu_tracker` package provides the `Tracker` class which uses a subprocess to measure computational resource usage, namely the compute time, maximum CPU utilization, mean CPU utilization, maximum RAM used, maximum GPU utilization, mean GPU utilization, and maximum GPU RAM used. 
The `start()` method starts this process which tracks usage in the background. After calling `start()`, one can write the code for which resource usage is measured, followed by calling the `stop()` method. The compute time will be the time from the call to `start()` to the call to `stop()` and the RAM, GPU RAM, CPU utilization, and GPU utilization quantities will be the respective computational resources used by the code that's in between `start()` and `stop()`." ] }, { @@ -42,7 +42,7 @@ "metadata": {}, "outputs": [], "source": [ - "tracker = gput.Tracker()\n", + "tracker = gput.Tracker(n_expected_cores=1, sleep_time=0.1)\n", "tracker.start()\n", "example_function()\n", "tracker.stop()" @@ -68,52 +68,61 @@ "text": [ "Max RAM:\n", " Unit: gigabytes\n", - " System capacity: 67.254\n", - " System: 5.21\n", + " System capacity: 63.088\n", + " System: 1.899\n", " Main:\n", - " Total RSS: 0.827\n", - " Private RSS: 0.674\n", - " Shared RSS: 0.154\n", + " Total RSS: 0.914\n", + " Private RSS: 0.753\n", + " Shared RSS: 0.161\n", " Descendents:\n", " Total RSS: 0.0\n", " Private RSS: 0.0\n", " Shared RSS: 0.0\n", " Combined:\n", - " Total RSS: 0.834\n", - " Private RSS: 0.681\n", - " Shared RSS: 0.154\n", + " Total RSS: 0.883\n", + " Private RSS: 0.723\n", + " Shared RSS: 0.161\n", "Max GPU RAM:\n", " Unit: gigabytes\n", - " System capacity: 16.376\n", - " System: 0.535\n", - " Main: 0.314\n", + " System capacity: 2.048\n", + " System: 0.353\n", + " Main: 0.277\n", " Descendents: 0.0\n", - " Combined: 0.314\n", + " Combined: 0.277\n", "CPU utilization:\n", " System core count: 12\n", + " Number of expected cores: 1\n", " System:\n", - " Max core percent: 150.6\n", - " Max CPU percent: 12.55\n", - " Mean core percent: 122.9\n", - " Mean CPU percent: 10.242\n", + " Max sum percent: 169.7\n", + " Max hardware percent: 14.142\n", + " Mean sum percent: 150.183\n", + " Mean hardware percent: 12.515\n", " Main:\n", - " Max core percent: 98.6\n", - " Max CPU percent: 8.217\n", - " 
Mean core percent: 96.8\n", - " Mean CPU percent: 8.067\n", + " Max sum percent: 101.2\n", + " Max hardware percent: 101.2\n", + " Mean sum percent: 93.158\n", + " Mean hardware percent: 93.158\n", " Descendents:\n", - " Max core percent: 0.0\n", - " Max CPU percent: 0.0\n", - " Mean core percent: 0.0\n", - " Mean CPU percent: 0.0\n", + " Max sum percent: 0.0\n", + " Max hardware percent: 0.0\n", + " Mean sum percent: 0.0\n", + " Mean hardware percent: 0.0\n", " Combined:\n", - " Max core percent: 98.6\n", - " Max CPU percent: 8.217\n", - " Mean core percent: 96.8\n", - " Mean CPU percent: 8.067\n", - " Main number of threads: 15\n", + " Max sum percent: 101.2\n", + " Max hardware percent: 101.2\n", + " Mean sum percent: 93.158\n", + " Mean hardware percent: 93.158\n", + " Main number of threads: 24\n", " Descendents number of threads: 0\n", - " Combined number of threads: 15\n", + " Combined number of threads: 24\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 4.0\n", + " Max hardware percent: 4.0\n", + " Mean sum percent: 0.333\n", + " Mean hardware percent: 0.333\n", "Compute time:\n", " Unit: hours\n", " Time: 0.001\n" @@ -133,7 +142,9 @@ "\n", "The `Tracker` assumes that GPU memory is not shared across multiple processes and if it is, the reported GPU RAM of \"descendent\" and \"combined\" may be an overestimation.\n", "\n", - "The CPU utilization includes the system core count field which is the total number of cores available system-wide. Utilization is measured for the main process, its descendents, the main process and its descendents combined, and CPU utilization across the entire system. The core percent is the sum of the percentages of all the cores being used. The CPU percent is that divided by the system core count. 
The max percent is the highest percentage detected through the duration of tracking while the mean percent is the average of all the percentages detected over that duration. The CPU utilization concludes with the maximum number of threads used at any time for the main process and the sum of the threads used across its descendent processes and combined.\n", + "The CPU utilization includes the system core count field which is the total number of cores available system-wide. Utilization is measured for the main process, its descendents, the main process and its descendents combined, and CPU utilization across the entire system. The sum percent is the sum of the percentages of all the cores being used. The hardware percent is that divided by the expected number of cores being used i.e. the optional `n_expected_cores` parameter (defaults to the number of cores in the entire system) for the main, descendents, and combined measurements. For the system measurements, hardware percent is divided by the total number of cores in the system regardless of the value of `n_expected_cores`. The max percent is the highest percentage detected through the duration of tracking while the mean percent is the average of all the percentages detected over that duration. The CPU utilization concludes with the maximum number of threads used at any time for the main process and the sum of the threads used across its descendent processes and combined.\n", + "\n", + "The GPU utilization is similar to the CPU utilization but rather than being based on utilization of processes, it can only measure the utilization percentages of the GPUs themselves, regardless of what processes are using them. To ameliorate this limitation, the optional `gpu_uuids` parameter can be set to specify which GPUs to measure utilization for (defaults to all the GPUs in the system). The system GPU count is the total number of GPUs in the system. 
The sum percent is the sum of all the percentages of these GPUs and the hardware percent is that divided by the expected number of GPUs being used (i.e. `len(gpu_uuids)`). As with CPU utilization, the max and mean of both the sum and hardware percentages are provided.\n", "\n", "The compute time is the real time that the computational-resource tracking lasted (as compared to CPU time)." ] @@ -185,60 +196,69 @@ "text": [ "Max RAM:\n", " Unit: megabytes\n", - " System capacity: 67254.17\n", - " System: 5721.395\n", + " System capacity: 63088.23\n", + " System: 2399.92\n", " Main:\n", - " Total RSS: 850.399\n", - " Private RSS: 634.077\n", - " Shared RSS: 216.547\n", + " Total RSS: 890.704\n", + " Private RSS: 674.058\n", + " Shared RSS: 216.924\n", " Descendents:\n", " Total RSS: 0.0\n", " Private RSS: 0.0\n", " Shared RSS: 0.0\n", " Combined:\n", - " Total RSS: 858.763\n", - " Private RSS: 642.445\n", - " Shared RSS: 216.527\n", + " Total RSS: 901.263\n", + " Private RSS: 684.618\n", + " Shared RSS: 216.678\n", "Max GPU RAM:\n", " Unit: megabytes\n", - " System capacity: 16376.0\n", - " System: 727.0\n", - " Main: 506.0\n", + " System capacity: 2048.0\n", + " System: 353.0\n", + " Main: 277.0\n", " Descendents: 0.0\n", - " Combined: 506.0\n", + " Combined: 277.0\n", "CPU utilization:\n", " System core count: 12\n", + " Number of expected cores: 12\n", " System:\n", - " Max core percent: 148.9\n", - " Max CPU percent: 12.408\n", - " Mean core percent: 124.7\n", - " Mean CPU percent: 10.392\n", + " Max sum percent: 164.3\n", + " Max hardware percent: 13.692\n", + " Mean sum percent: 152.325\n", + " Mean hardware percent: 12.694\n", " Main:\n", - " Max core percent: 99.9\n", - " Max CPU percent: 8.325\n", - " Mean core percent: 97.533\n", - " Mean CPU percent: 8.128\n", + " Max sum percent: 102.6\n", + " Max hardware percent: 8.55\n", + " Mean sum percent: 91.258\n", + " Mean hardware percent: 7.605\n", " Descendents:\n", - " Max core percent: 0.0\n", - " Max CPU 
percent: 0.0\n", - " Mean core percent: 0.0\n", - " Mean CPU percent: 0.0\n", + " Max sum percent: 0.0\n", + " Max hardware percent: 0.0\n", + " Mean sum percent: 0.0\n", + " Mean hardware percent: 0.0\n", " Combined:\n", - " Max core percent: 99.9\n", - " Max CPU percent: 8.325\n", - " Mean core percent: 97.533\n", - " Mean CPU percent: 8.128\n", - " Main number of threads: 15\n", + " Max sum percent: 102.6\n", + " Max hardware percent: 8.55\n", + " Mean sum percent: 91.258\n", + " Mean hardware percent: 7.605\n", + " Main number of threads: 24\n", " Descendents number of threads: 0\n", - " Combined number of threads: 15\n", + " Combined number of threads: 24\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 6.0\n", + " Max hardware percent: 6.0\n", + " Mean sum percent: 0.5\n", + " Mean hardware percent: 0.5\n", "Compute time:\n", " Unit: seconds\n", - " Time: 2.52\n" + " Time: 3.346\n" ] } ], "source": [ - "with gput.Tracker(ram_unit='megabytes', gpu_ram_unit='megabytes', time_unit='seconds') as tracker:\n", + "with gput.Tracker(ram_unit='megabytes', gpu_ram_unit='megabytes', time_unit='seconds', sleep_time=0.1) as tracker:\n", " example_function()\n", "print(tracker)" ] @@ -264,12 +284,12 @@ "{\n", " \"max_ram\": {\n", " \"unit\": \"megabytes\",\n", - " \"system_capacity\": 67254.1696,\n", - " \"system\": 5721.3952,\n", + " \"system_capacity\": 63088.2304,\n", + " \"system\": 2399.9201279999997,\n", " \"main\": {\n", - " \"total_rss\": 850.399232,\n", - " \"private_rss\": 634.077184,\n", - " \"shared_rss\": 216.547328\n", + " \"total_rss\": 890.7038719999999,\n", + " \"private_rss\": 674.05824,\n", + " \"shared_rss\": 216.92416\n", " },\n", " \"descendents\": {\n", " \"total_rss\": 0.0,\n", @@ -277,52 +297,63 @@ " \"shared_rss\": 0.0\n", " },\n", " \"combined\": {\n", - " \"total_rss\": 858.7632639999999,\n", - " \"private_rss\": 642.445312,\n", - " \"shared_rss\": 
216.526848\n", + " \"total_rss\": 901.2633599999999,\n", + " \"private_rss\": 684.6177279999999,\n", + " \"shared_rss\": 216.67839999999998\n", " }\n", " },\n", " \"max_gpu_ram\": {\n", " \"unit\": \"megabytes\",\n", - " \"system_capacity\": 16376.0,\n", - " \"system\": 727.0,\n", - " \"main\": 506.0,\n", + " \"system_capacity\": 2048.0,\n", + " \"system\": 353.0,\n", + " \"main\": 277.0,\n", " \"descendents\": 0.0,\n", - " \"combined\": 506.0\n", + " \"combined\": 277.0\n", " },\n", " \"cpu_utilization\": {\n", " \"system_core_count\": 12,\n", + " \"n_expected_cores\": 12,\n", " \"system\": {\n", - " \"max_core_percent\": 148.90000000000003,\n", - " \"max_cpu_percent\": 12.408333333333337,\n", - " \"mean_core_percent\": 124.70000000000003,\n", - " \"mean_cpu_percent\": 10.39166666666667\n", + " \"max_sum_percent\": 164.3,\n", + " \"max_hardware_percent\": 13.691666666666668,\n", + " \"mean_sum_percent\": 152.325,\n", + " \"mean_hardware_percent\": 12.693750000000001\n", " },\n", " \"main\": {\n", - " \"max_core_percent\": 99.9,\n", - " \"max_cpu_percent\": 8.325000000000001,\n", - " \"mean_core_percent\": 97.53333333333335,\n", - " \"mean_cpu_percent\": 8.127777777777778\n", + " \"max_sum_percent\": 102.6,\n", + " \"max_hardware_percent\": 8.549999999999999,\n", + " \"mean_sum_percent\": 91.25833333333334,\n", + " \"mean_hardware_percent\": 7.604861111111112\n", " },\n", " \"descendents\": {\n", - " \"max_core_percent\": 0.0,\n", - " \"max_cpu_percent\": 0.0,\n", - " \"mean_core_percent\": 0.0,\n", - " \"mean_cpu_percent\": 0.0\n", + " \"max_sum_percent\": 0.0,\n", + " \"max_hardware_percent\": 0.0,\n", + " \"mean_sum_percent\": 0.0,\n", + " \"mean_hardware_percent\": 0.0\n", " },\n", " \"combined\": {\n", - " \"max_core_percent\": 99.9,\n", - " \"max_cpu_percent\": 8.325000000000001,\n", - " \"mean_core_percent\": 97.53333333333335,\n", - " \"mean_cpu_percent\": 8.127777777777778\n", + " \"max_sum_percent\": 102.6,\n", + " \"max_hardware_percent\": 
8.549999999999999,\n", + " \"mean_sum_percent\": 91.25833333333334,\n", + " \"mean_hardware_percent\": 7.604861111111112\n", " },\n", - " \"main_n_threads\": 15,\n", + " \"main_n_threads\": 24,\n", " \"descendents_n_threads\": 0,\n", - " \"combined_n_threads\": 15\n", + " \"combined_n_threads\": 24\n", + " },\n", + " \"gpu_utilization\": {\n", + " \"system_gpu_count\": 1,\n", + " \"n_expected_gpus\": 1,\n", + " \"gpu_percentages\": {\n", + " \"max_sum_percent\": 6.0,\n", + " \"max_hardware_percent\": 6.0,\n", + " \"mean_sum_percent\": 0.5,\n", + " \"mean_hardware_percent\": 0.5\n", + " }\n", " },\n", " \"compute_time\": {\n", " \"unit\": \"seconds\",\n", - " \"time\": 2.5198354721069336\n", + " \"time\": 3.345628023147583\n", " }\n", "}\n" ] @@ -350,7 +381,7 @@ { "data": { "text/plain": [ - "MaxRAM(unit='megabytes', system_capacity=67254.1696, system=5721.3952, main=RSSValues(total_rss=850.399232, private_rss=634.077184, shared_rss=216.547328), descendents=RSSValues(total_rss=0.0, private_rss=0.0, shared_rss=0.0), combined=RSSValues(total_rss=858.7632639999999, private_rss=642.445312, shared_rss=216.526848))" + "MaxRAM(unit='megabytes', system_capacity=63088.2304, system=2399.9201279999997, main=RSSValues(total_rss=890.7038719999999, private_rss=674.05824, shared_rss=216.92416), descendents=RSSValues(total_rss=0.0, private_rss=0.0, shared_rss=0.0), combined=RSSValues(total_rss=901.2633599999999, private_rss=684.6177279999999, shared_rss=216.67839999999998))" ] }, "execution_count": 7, @@ -392,7 +423,7 @@ { "data": { "text/plain": [ - "RSSValues(total_rss=850.399232, private_rss=634.077184, shared_rss=216.547328)" + "RSSValues(total_rss=890.7038719999999, private_rss=674.05824, shared_rss=216.92416)" ] }, "execution_count": 9, @@ -413,7 +444,7 @@ { "data": { "text/plain": [ - "850.399232" + "890.7038719999999" ] }, "execution_count": 10, @@ -434,7 +465,7 @@ { "data": { "text/plain": [ - "MaxGPURAM(unit='megabytes', system_capacity=16376.0, system=727.0, main=506.0, 
descendents=0.0, combined=506.0)" + "MaxGPURAM(unit='megabytes', system_capacity=2048.0, system=353.0, main=277.0, descendents=0.0, combined=277.0)" ] }, "execution_count": 11, @@ -455,7 +486,7 @@ { "data": { "text/plain": [ - "ComputeTime(unit='seconds', time=2.5198354721069336)" + "ComputeTime(unit='seconds', time=3.345628023147583)" ] }, "execution_count": 12, @@ -486,7 +517,7 @@ "output_type": "stream", "text": [ "The following error occured while tracking: AN ERROR\n", - "0.506\n" + "0.277\n" ] } ], @@ -511,7 +542,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 23, "id": "f429ced6-573b-4f0f-ad64-658e9c05242d", "metadata": {}, "outputs": [ @@ -521,52 +552,61 @@ "text": [ "Max RAM:\n", " Unit: gigabytes\n", - " System capacity: 67.254\n", - " System: 5.938\n", + " System capacity: 63.088\n", + " System: 2.877\n", " Main:\n", - " Total RSS: 0.798\n", - " Private RSS: 0.491\n", - " Shared RSS: 0.311\n", + " Total RSS: 0.844\n", + " Private RSS: 0.525\n", + " Shared RSS: 0.319\n", " Descendents:\n", - " Total RSS: 0.85\n", - " Private RSS: 0.728\n", - " Shared RSS: 0.122\n", + " Total RSS: 0.831\n", + " Private RSS: 0.704\n", + " Shared RSS: 0.127\n", " Combined:\n", - " Total RSS: 1.451\n", - " Private RSS: 1.144\n", - " Shared RSS: 0.311\n", + " Total RSS: 1.462\n", + " Private RSS: 1.148\n", + " Shared RSS: 0.32\n", "Max GPU RAM:\n", " Unit: gigabytes\n", - " System capacity: 16.376\n", - " System: 1.043\n", - " Main: 0.506\n", - " Descendents: 0.314\n", - " Combined: 0.82\n", + " System capacity: 2.048\n", + " System: 0.631\n", + " Main: 0.277\n", + " Descendents: 0.277\n", + " Combined: 0.554\n", "CPU utilization:\n", " System core count: 12\n", + " Number of expected cores: 2\n", " System:\n", - " Max core percent: 225.5\n", - " Max CPU percent: 18.792\n", - " Mean core percent: 187.575\n", - " Mean CPU percent: 15.631\n", + " Max sum percent: 398.9\n", + " Max hardware percent: 33.242\n", + " Mean sum percent: 222.255\n", + " 
Mean hardware percent: 18.521\n", " Main:\n", - " Max core percent: 99.6\n", - " Max CPU percent: 8.3\n", - " Mean core percent: 74.15\n", - " Mean CPU percent: 6.179\n", + " Max sum percent: 103.8\n", + " Max hardware percent: 51.9\n", + " Mean sum percent: 66.009\n", + " Mean hardware percent: 33.005\n", " Descendents:\n", - " Max core percent: 101.2\n", - " Max CPU percent: 8.433\n", - " Mean core percent: 74.125\n", - " Mean CPU percent: 6.177\n", + " Max sum percent: 308.5\n", + " Max hardware percent: 154.25\n", + " Mean sum percent: 117.109\n", + " Mean hardware percent: 58.555\n", " Combined:\n", - " Max core percent: 198.7\n", - " Max CPU percent: 16.558\n", - " Mean core percent: 148.275\n", - " Mean CPU percent: 12.356\n", - " Main number of threads: 15\n", - " Descendents number of threads: 5\n", - " Combined number of threads: 20\n", + " Max sum percent: 409.2\n", + " Max hardware percent: 204.6\n", + " Mean sum percent: 183.118\n", + " Mean hardware percent: 91.559\n", + " Main number of threads: 24\n", + " Descendents number of threads: 16\n", + " Combined number of threads: 40\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 6.0\n", + " Max hardware percent: 6.0\n", + " Mean sum percent: 0.545\n", + " Mean hardware percent: 0.545\n", "Compute time:\n", " Unit: hours\n", " Time: 0.001\n" @@ -577,11 +617,11 @@ "import multiprocessing as mp\n", "ctx = mp.get_context(method='spawn')\n", "child_process = ctx.Process(target=example_function)\n", - "with gput.Tracker() as tracker:\n", + "with gput.Tracker(n_expected_cores=2, sleep_time=0.2) as tracker:\n", " child_process.start()\n", " example_function()\n", " child_process.join()\n", - " child_process.close()\n", + "child_process.close()\n", "print(tracker)" ] }, @@ -611,12 +651,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tracks the computational resource usage (RAM, GPU RAM, and compute time) of a 
process corresponding to a given shell command.\n", + "Tracks the computational resource usage (RAM, GPU RAM, CPU utilization, GPU utilization, and compute time) of a process corresponding to a given shell command.\n", "\n", "Usage:\n", " gpu-tracker -h | --help\n", " gpu-tracker -v | --version\n", - " gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--disable-logs]\n", + " gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--nec=] [--guuids=] [--disable-logs]\n", "\n", "Options:\n", " -h --help Show this help message and exit.\n", @@ -628,6 +668,8 @@ " --ru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'.\n", " --gru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'.\n", " --tu= One of 'seconds', 'minutes', 'hours', or 'days'.\n", + " --nec= The number of cores expected to be used. Defaults to the number of cores in the entire operating system.\n", + " --guuids= Comma separated list of the UUIDs of the GPUs for which to track utilization e.g. gpu-uuid1,gpu-uuid2,etc. Defaults to all the GPUs in the system.\n", " --disable-logs If set, warnings are suppressed during tracking. Otherwise, the Tracker logs warnings as usual.\n" ] } @@ -646,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "id": "ea7c710f-a238-460d-836c-a979e1c72f4f", "metadata": {}, "outputs": [ @@ -657,52 +699,61 @@ "Resource tracking complete. 
Process completed with status code: 0\n", "Max RAM:\n", " Unit: gigabytes\n", - " System capacity: 67.254\n", - " System: 5.964\n", + " System capacity: 63.088\n", + " System: 2.3\n", " Main:\n", " Total RSS: 0.003\n", " Private RSS: 0.0\n", " Shared RSS: 0.003\n", " Descendents:\n", - " Total RSS: 0.847\n", - " Private RSS: 0.724\n", - " Shared RSS: 0.122\n", + " Total RSS: 0.917\n", + " Private RSS: 0.905\n", + " Shared RSS: 0.012\n", " Combined:\n", - " Total RSS: 0.856\n", - " Private RSS: 0.733\n", - " Shared RSS: 0.123\n", + " Total RSS: 0.925\n", + " Private RSS: 0.912\n", + " Shared RSS: 0.013\n", "Max GPU RAM:\n", " Unit: gigabytes\n", - " System capacity: 16.376\n", - " System: 1.043\n", + " System capacity: 2.048\n", + " System: 0.193\n", " Main: 0.0\n", - " Descendents: 0.314\n", - " Combined: 0.314\n", + " Descendents: 0.117\n", + " Combined: 0.117\n", "CPU utilization:\n", " System core count: 12\n", + " Number of expected cores: 12\n", " System:\n", - " Max core percent: 177.6\n", - " Max CPU percent: 14.8\n", - " Mean core percent: 134.375\n", - " Mean CPU percent: 11.198\n", + " Max sum percent: 309.5\n", + " Max hardware percent: 25.792\n", + " Mean sum percent: 159.073\n", + " Mean hardware percent: 13.256\n", " Main:\n", - " Max core percent: 0.0\n", - " Max CPU percent: 0.0\n", - " Mean core percent: 0.0\n", - " Mean CPU percent: 0.0\n", + " Max sum percent: 0.0\n", + " Max hardware percent: 0.0\n", + " Mean sum percent: 0.0\n", + " Mean hardware percent: 0.0\n", " Descendents:\n", - " Max core percent: 100.4\n", - " Max CPU percent: 8.367\n", - " Mean core percent: 95.45\n", - " Mean CPU percent: 7.954\n", + " Max sum percent: 493.1\n", + " Max hardware percent: 41.092\n", + " Mean sum percent: 134.427\n", + " Mean hardware percent: 11.202\n", " Combined:\n", - " Max core percent: 100.4\n", - " Max CPU percent: 8.367\n", - " Mean core percent: 95.45\n", - " Mean CPU percent: 7.954\n", + " Max sum percent: 493.1\n", + " Max hardware percent: 
41.092\n", + " Mean sum percent: 134.427\n", + " Mean hardware percent: 11.202\n", " Main number of threads: 1\n", - " Descendents number of threads: 4\n", - " Combined number of threads: 5\n", + " Descendents number of threads: 15\n", + " Combined number of threads: 16\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 4.0\n", + " Max hardware percent: 4.0\n", + " Mean sum percent: 0.364\n", + " Mean hardware percent: 0.364\n", "Compute time:\n", " Unit: hours\n", " Time: 0.001\n" @@ -710,7 +761,7 @@ } ], "source": [ - "!gpu-tracker -e \"bash example-script.sh\"" + "!gpu-tracker -e \"bash example-script.sh\" --st=0.3" ] }, { @@ -731,7 +782,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 8, "id": "cff099a7-1070-42ba-9f2a-008d58863fe6", "metadata": {}, "outputs": [ @@ -742,60 +793,69 @@ "Resource tracking complete. Process completed with status code: 0\n", "Max RAM:\n", " Unit: megabytes\n", - " System capacity: 67254.17\n", - " System: 5784.379\n", + " System capacity: 63088.23\n", + " System: 2242.593\n", " Main:\n", - " Total RSS: 3.076\n", - " Private RSS: 0.324\n", - " Shared RSS: 2.753\n", + " Total RSS: 3.039\n", + " Private RSS: 0.315\n", + " Shared RSS: 2.724\n", " Descendents:\n", - " Total RSS: 838.545\n", - " Private RSS: 716.681\n", - " Shared RSS: 121.864\n", + " Total RSS: 832.487\n", + " Private RSS: 705.831\n", + " Shared RSS: 126.657\n", " Combined:\n", - " Total RSS: 847.249\n", - " Private RSS: 724.492\n", - " Shared RSS: 122.757\n", + " Total RSS: 841.482\n", + " Private RSS: 713.867\n", + " Shared RSS: 127.992\n", "Max GPU RAM:\n", " Unit: megabytes\n", - " System capacity: 16376.0\n", - " System: 1043.0\n", + " System capacity: 2048.0\n", + " System: 631.0\n", " Main: 0.0\n", - " Descendents: 314.0\n", - " Combined: 314.0\n", + " Descendents: 277.0\n", + " Combined: 277.0\n", "CPU utilization:\n", " System core count: 12\n", + " 
Number of expected cores: 12\n", " System:\n", - " Max core percent: 188.7\n", - " Max CPU percent: 15.725\n", - " Mean core percent: 136.45\n", - " Mean CPU percent: 11.371\n", + " Max sum percent: 362.6\n", + " Max hardware percent: 30.217\n", + " Mean sum percent: 156.853\n", + " Mean hardware percent: 13.071\n", " Main:\n", - " Max core percent: 0.0\n", - " Max CPU percent: 0.0\n", - " Mean core percent: 0.0\n", - " Mean CPU percent: 0.0\n", + " Max sum percent: 0.0\n", + " Max hardware percent: 0.0\n", + " Mean sum percent: 0.0\n", + " Mean hardware percent: 0.0\n", " Descendents:\n", - " Max core percent: 96.2\n", - " Max CPU percent: 8.017\n", - " Mean core percent: 94.55\n", - " Mean CPU percent: 7.879\n", + " Max sum percent: 512.8\n", + " Max hardware percent: 42.733\n", + " Mean sum percent: 120.333\n", + " Mean hardware percent: 10.028\n", " Combined:\n", - " Max core percent: 96.2\n", - " Max CPU percent: 8.017\n", - " Mean core percent: 94.55\n", - " Mean CPU percent: 7.879\n", + " Max sum percent: 512.8\n", + " Max hardware percent: 42.733\n", + " Mean sum percent: 120.333\n", + " Mean hardware percent: 10.028\n", " Main number of threads: 1\n", - " Descendents number of threads: 4\n", - " Combined number of threads: 5\n", + " Descendents number of threads: 15\n", + " Combined number of threads: 16\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 4.0\n", + " Max hardware percent: 4.0\n", + " Mean sum percent: 0.267\n", + " Mean hardware percent: 0.267\n", "Compute time:\n", " Unit: seconds\n", - " Time: 3.566\n" + " Time: 4.931\n" ] } ], "source": [ - "!gpu-tracker -e 'bash example-script.sh' --tu=seconds --gru=megabytes --ru=megabytes" + "!gpu-tracker -e 'bash example-script.sh' --tu=seconds --gru=megabytes --ru=megabytes --st=0.2" ] }, { @@ -808,7 +868,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 12, "id": 
"a8520fd9-0907-4c0c-a68f-8fdaec040e1a", "metadata": {}, "outputs": [ @@ -821,12 +881,12 @@ } ], "source": [ - "!gpu-tracker -e 'bash example-script.sh' -o out.txt " + "!gpu-tracker -e 'bash example-script.sh' -o out.txt --st=0.2" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 13, "id": "213550b7-d808-4e11-be37-f2f892e4834b", "metadata": {}, "outputs": [ @@ -836,52 +896,61 @@ "text": [ "Max RAM:\n", " Unit: gigabytes\n", - " System capacity: 67.254\n", - " System: 5.584\n", + " System capacity: 63.088\n", + " System: 2.683\n", " Main:\n", " Total RSS: 0.003\n", " Private RSS: 0.0\n", " Shared RSS: 0.003\n", " Descendents:\n", - " Total RSS: 0.853\n", - " Private RSS: 0.731\n", - " Shared RSS: 0.122\n", + " Total RSS: 0.843\n", + " Private RSS: 0.717\n", + " Shared RSS: 0.127\n", " Combined:\n", - " Total RSS: 0.862\n", - " Private RSS: 0.739\n", - " Shared RSS: 0.123\n", + " Total RSS: 0.852\n", + " Private RSS: 0.725\n", + " Shared RSS: 0.128\n", "Max GPU RAM:\n", " Unit: gigabytes\n", - " System capacity: 16.376\n", - " System: 1.043\n", + " System capacity: 2.048\n", + " System: 0.631\n", " Main: 0.0\n", - " Descendents: 0.314\n", - " Combined: 0.314\n", + " Descendents: 0.277\n", + " Combined: 0.277\n", "CPU utilization:\n", " System core count: 12\n", + " Number of expected cores: 12\n", " System:\n", - " Max core percent: 187.6\n", - " Max CPU percent: 15.633\n", - " Mean core percent: 137.675\n", - " Mean CPU percent: 11.473\n", + " Max sum percent: 383.8\n", + " Max hardware percent: 31.983\n", + " Mean sum percent: 166.507\n", + " Mean hardware percent: 13.876\n", " Main:\n", - " Max core percent: 0.0\n", - " Max CPU percent: 0.0\n", - " Mean core percent: 0.0\n", - " Mean CPU percent: 0.0\n", + " Max sum percent: 0.0\n", + " Max hardware percent: 0.0\n", + " Mean sum percent: 0.0\n", + " Mean hardware percent: 0.0\n", " Descendents:\n", - " Max core percent: 101.3\n", - " Max CPU percent: 8.442\n", - " Mean core percent: 
97.675\n", - " Mean CPU percent: 8.14\n", + " Max sum percent: 528.4\n", + " Max hardware percent: 44.033\n", + " Mean sum percent: 128.014\n", + " Mean hardware percent: 10.668\n", " Combined:\n", - " Max core percent: 101.3\n", - " Max CPU percent: 8.442\n", - " Mean core percent: 97.675\n", - " Mean CPU percent: 8.14\n", + " Max sum percent: 528.4\n", + " Max hardware percent: 44.033\n", + " Mean sum percent: 128.014\n", + " Mean hardware percent: 10.668\n", " Main number of threads: 1\n", - " Descendents number of threads: 4\n", - " Combined number of threads: 5\n", + " Descendents number of threads: 15\n", + " Combined number of threads: 16\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 7.0\n", + " Max hardware percent: 7.0\n", + " Mean sum percent: 0.643\n", + " Mean hardware percent: 0.643\n", "Compute time:\n", " Unit: hours\n", " Time: 0.001" @@ -902,7 +971,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 26, "id": "f6fd29d2-cad6-4f9c-8af8-ccf4f0e721d3", "metadata": {}, "outputs": [ @@ -914,77 +983,88 @@ "{\n", " \"max_ram\": {\n", " \"unit\": \"gigabytes\",\n", - " \"system_capacity\": 67.2541696,\n", - " \"system\": 5.720379392000001,\n", + " \"system_capacity\": 63.0882304,\n", + " \"system\": 3.111936,\n", " \"main\": {\n", - " \"total_rss\": 0.003084288,\n", - " \"private_rss\": 0.00031948800000000004,\n", - " \"shared_rss\": 0.0027648\n", + " \"total_rss\": 0.003059712,\n", + " \"private_rss\": 0.000339968,\n", + " \"shared_rss\": 0.002719744\n", " },\n", " \"descendents\": {\n", - " \"total_rss\": 0.854237184,\n", - " \"private_rss\": 0.73218048,\n", - " \"shared_rss\": 0.122056704\n", + " \"total_rss\": 0.846565376,\n", + " \"private_rss\": 0.7198023680000001,\n", + " \"shared_rss\": 0.12713984\n", " },\n", " \"combined\": {\n", - " \"total_rss\": 0.863256576,\n", - " \"private_rss\": 0.7403069440000001,\n", - " \"shared_rss\": 
0.122949632\n", + " \"total_rss\": 0.8552325120000001,\n", + " \"private_rss\": 0.727576576,\n", + " \"shared_rss\": 0.12803276800000002\n", " }\n", " },\n", " \"max_gpu_ram\": {\n", " \"unit\": \"gigabytes\",\n", - " \"system_capacity\": 16.376,\n", - " \"system\": 1.043,\n", + " \"system_capacity\": 2.048,\n", + " \"system\": 0.631,\n", " \"main\": 0.0,\n", - " \"descendents\": 0.314,\n", - " \"combined\": 0.314\n", + " \"descendents\": 0.277,\n", + " \"combined\": 0.277\n", " },\n", " \"cpu_utilization\": {\n", " \"system_core_count\": 12,\n", + " \"n_expected_cores\": 12,\n", " \"system\": {\n", - " \"max_core_percent\": 260.00000000000006,\n", - " \"max_cpu_percent\": 21.66666666666667,\n", - " \"mean_core_percent\": 159.35000000000002,\n", - " \"mean_cpu_percent\": 13.279166666666669\n", + " \"max_sum_percent\": 384.5999999999999,\n", + " \"max_hardware_percent\": 32.04999999999999,\n", + " \"mean_sum_percent\": 167.49285714285716,\n", + " \"mean_hardware_percent\": 13.957738095238097\n", " },\n", " \"main\": {\n", - " \"max_core_percent\": 0.0,\n", - " \"max_cpu_percent\": 0.0,\n", - " \"mean_core_percent\": 0.0,\n", - " \"mean_cpu_percent\": 0.0\n", + " \"max_sum_percent\": 0.0,\n", + " \"max_hardware_percent\": 0.0,\n", + " \"mean_sum_percent\": 0.0,\n", + " \"mean_hardware_percent\": 0.0\n", " },\n", " \"descendents\": {\n", - " \"max_core_percent\": 102.9,\n", - " \"max_cpu_percent\": 8.575000000000001,\n", - " \"mean_core_percent\": 97.475,\n", - " \"mean_cpu_percent\": 8.122916666666667\n", + " \"max_sum_percent\": 526.0,\n", + " \"max_hardware_percent\": 43.833333333333336,\n", + " \"mean_sum_percent\": 128.65,\n", + " \"mean_hardware_percent\": 10.720833333333333\n", " },\n", " \"combined\": {\n", - " \"max_core_percent\": 102.9,\n", - " \"max_cpu_percent\": 8.575000000000001,\n", - " \"mean_core_percent\": 97.475,\n", - " \"mean_cpu_percent\": 8.122916666666667\n", + " \"max_sum_percent\": 526.0,\n", + " \"max_hardware_percent\": 
43.833333333333336,\n", + " \"mean_sum_percent\": 128.65,\n", + " \"mean_hardware_percent\": 10.720833333333333\n", " },\n", " \"main_n_threads\": 1,\n", - " \"descendents_n_threads\": 4,\n", - " \"combined_n_threads\": 5\n", + " \"descendents_n_threads\": 15,\n", + " \"combined_n_threads\": 16\n", + " },\n", + " \"gpu_utilization\": {\n", + " \"system_gpu_count\": 1,\n", + " \"n_expected_gpus\": 1,\n", + " \"gpu_percentages\": {\n", + " \"max_sum_percent\": 7.0,\n", + " \"max_hardware_percent\": 7.0,\n", + " \"mean_sum_percent\": 0.5,\n", + " \"mean_hardware_percent\": 0.5\n", + " }\n", " },\n", " \"compute_time\": {\n", " \"unit\": \"hours\",\n", - " \"time\": 0.001005272732840644\n", + " \"time\": 0.0012672905127207438\n", " }\n", "}\n" ] } ], "source": [ - "!gpu-tracker -e 'bash example-script.sh' -f json" + "!gpu-tracker -e 'bash example-script.sh' -f json --st=0.2" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 27, "id": "5c825e42-d100-4533-b218-c36f6380e6ed", "metadata": {}, "outputs": [ @@ -997,12 +1077,12 @@ } ], "source": [ - "!gpu-tracker -e 'bash example-script.sh' -f json -o out.json" + "!gpu-tracker -e 'bash example-script.sh' -f json -o out.json --st=0.3" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 28, "id": "c821972e-0bed-4245-8933-27b0b28589de", "metadata": {}, "outputs": [ @@ -1013,65 +1093,76 @@ "{\n", " \"max_ram\": {\n", " \"unit\": \"gigabytes\",\n", - " \"system_capacity\": 67.2541696,\n", - " \"system\": 5.560373248,\n", + " \"system_capacity\": 63.0882304,\n", + " \"system\": 2.878910464,\n", " \"main\": {\n", - " \"total_rss\": 0.002957312,\n", - " \"private_rss\": 0.000323584,\n", - " \"shared_rss\": 0.002633728\n", + " \"total_rss\": 0.0029777920000000004,\n", + " \"private_rss\": 0.00031948800000000004,\n", + " \"shared_rss\": 0.0026583040000000002\n", " },\n", " \"descendents\": {\n", - " \"total_rss\": 0.848539648,\n", - " \"private_rss\": 0.726519808,\n", - " \"shared_rss\": 
0.12201984\n", + " \"total_rss\": 0.8333844480000001,\n", + " \"private_rss\": 0.7066091520000001,\n", + " \"shared_rss\": 0.127152128\n", " },\n", " \"combined\": {\n", - " \"total_rss\": 0.857731072,\n", - " \"private_rss\": 0.734818304,\n", - " \"shared_rss\": 0.122912768\n", + " \"total_rss\": 0.841486336,\n", + " \"private_rss\": 0.713818112,\n", + " \"shared_rss\": 0.12804505600000002\n", " }\n", " },\n", " \"max_gpu_ram\": {\n", " \"unit\": \"gigabytes\",\n", - " \"system_capacity\": 16.376,\n", - " \"system\": 1.043,\n", + " \"system_capacity\": 2.048,\n", + " \"system\": 0.631,\n", " \"main\": 0.0,\n", - " \"descendents\": 0.314,\n", - " \"combined\": 0.314\n", + " \"descendents\": 0.277,\n", + " \"combined\": 0.277\n", " },\n", " \"cpu_utilization\": {\n", " \"system_core_count\": 12,\n", + " \"n_expected_cores\": 12,\n", " \"system\": {\n", - " \"max_core_percent\": 192.5,\n", - " \"max_cpu_percent\": 16.041666666666668,\n", - " \"mean_core_percent\": 154.22500000000002,\n", - " \"mean_cpu_percent\": 12.852083333333335\n", + " \"max_sum_percent\": 306.09999999999997,\n", + " \"max_hardware_percent\": 25.50833333333333,\n", + " \"mean_sum_percent\": 161.4272727272727,\n", + " \"mean_hardware_percent\": 13.452272727272724\n", " },\n", " \"main\": {\n", - " \"max_core_percent\": 0.0,\n", - " \"max_cpu_percent\": 0.0,\n", - " \"mean_core_percent\": 0.0,\n", - " \"mean_cpu_percent\": 0.0\n", + " \"max_sum_percent\": 0.0,\n", + " \"max_hardware_percent\": 0.0,\n", + " \"mean_sum_percent\": 0.0,\n", + " \"mean_hardware_percent\": 0.0\n", " },\n", " \"descendents\": {\n", - " \"max_core_percent\": 104.1,\n", - " \"max_cpu_percent\": 8.674999999999999,\n", - " \"mean_core_percent\": 97.7,\n", - " \"mean_cpu_percent\": 8.141666666666667\n", + " \"max_sum_percent\": 440.2,\n", + " \"max_hardware_percent\": 36.68333333333333,\n", + " \"mean_sum_percent\": 128.27272727272728,\n", + " \"mean_hardware_percent\": 10.68939393939394\n", " },\n", " \"combined\": {\n", - " 
\"max_core_percent\": 104.1,\n", - " \"max_cpu_percent\": 8.674999999999999,\n", - " \"mean_core_percent\": 97.7,\n", - " \"mean_cpu_percent\": 8.141666666666667\n", + " \"max_sum_percent\": 440.2,\n", + " \"max_hardware_percent\": 36.68333333333333,\n", + " \"mean_sum_percent\": 128.27272727272728,\n", + " \"mean_hardware_percent\": 10.68939393939394\n", " },\n", " \"main_n_threads\": 1,\n", - " \"descendents_n_threads\": 4,\n", - " \"combined_n_threads\": 5\n", + " \"descendents_n_threads\": 15,\n", + " \"combined_n_threads\": 16\n", + " },\n", + " \"gpu_utilization\": {\n", + " \"system_gpu_count\": 1,\n", + " \"n_expected_gpus\": 1,\n", + " \"gpu_percentages\": {\n", + " \"max_sum_percent\": 7.0,\n", + " \"max_hardware_percent\": 7.0,\n", + " \"mean_sum_percent\": 0.6363636363636364,\n", + " \"mean_hardware_percent\": 0.6363636363636364\n", + " }\n", " },\n", " \"compute_time\": {\n", " \"unit\": \"hours\",\n", - " \"time\": 0.000995432734489441\n", + " \"time\": 0.0012816817230648465\n", " }\n", "}" ] diff --git a/docs/tutorial.rst b/docs/tutorial.rst index e736521..1e202c6 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -9,11 +9,12 @@ API The ``gpu_tracker`` package provides the ``Tracker`` class which uses a subprocess to measure computational resource usage, namely the compute time, maximum CPU utilization, mean CPU utilization, maximum RAM used, -and maximum GPU RAM used. The ``start()`` method starts this process -which tracks usage in the background. After calling ``start()``, one can -write the code for which resource usage is measured, followed by calling -the ``stop()`` method. The compute time will be the time from the call -to ``start()`` to the call to ``stop()`` and the RAM, GPU RAM, and CPU +maximum GPU utilization, mean GPU utilization, and maximum GPU RAM used. +The ``start()`` method starts this process which tracks usage in the +background. 
After calling ``start()``, one can write the code for which +resource usage is measured, followed by calling the ``stop()`` method. +The compute time will be the time from the call to ``start()`` to the +call to ``stop()`` and the RAM, GPU RAM, CPU utilization, and GPU utilization quantities will be the respective computational resources used by the code that’s in between ``start()`` and ``stop()``. @@ -24,7 +25,7 @@ used by the code that’s in between ``start()`` and ``stop()``. .. code:: python3 - tracker = gput.Tracker() + tracker = gput.Tracker(n_expected_cores=1, sleep_time=0.1) tracker.start() example_function() tracker.stop() @@ -42,52 +43,61 @@ resource formatted. Max RAM: Unit: gigabytes - System capacity: 67.254 - System: 5.21 + System capacity: 63.088 + System: 1.899 Main: - Total RSS: 0.827 - Private RSS: 0.674 - Shared RSS: 0.154 + Total RSS: 0.914 + Private RSS: 0.753 + Shared RSS: 0.161 Descendents: Total RSS: 0.0 Private RSS: 0.0 Shared RSS: 0.0 Combined: - Total RSS: 0.834 - Private RSS: 0.681 - Shared RSS: 0.154 + Total RSS: 0.883 + Private RSS: 0.723 + Shared RSS: 0.161 Max GPU RAM: Unit: gigabytes - System capacity: 16.376 - System: 0.535 - Main: 0.314 + System capacity: 2.048 + System: 0.353 + Main: 0.277 Descendents: 0.0 - Combined: 0.314 + Combined: 0.277 CPU utilization: System core count: 12 + Number of expected cores: 1 System: - Max core percent: 150.6 - Max CPU percent: 12.55 - Mean core percent: 122.9 - Mean CPU percent: 10.242 + Max sum percent: 169.7 + Max hardware percent: 14.142 + Mean sum percent: 150.183 + Mean hardware percent: 12.515 Main: - Max core percent: 98.6 - Max CPU percent: 8.217 - Mean core percent: 96.8 - Mean CPU percent: 8.067 + Max sum percent: 101.2 + Max hardware percent: 101.2 + Mean sum percent: 93.158 + Mean hardware percent: 93.158 Descendents: - Max core percent: 0.0 - Max CPU percent: 0.0 - Mean core percent: 0.0 - Mean CPU percent: 0.0 + Max sum percent: 0.0 + Max hardware percent: 0.0 + Mean sum percent: 
0.0 + Mean hardware percent: 0.0 Combined: - Max core percent: 98.6 - Max CPU percent: 8.217 - Mean core percent: 96.8 - Mean CPU percent: 8.067 - Main number of threads: 15 + Max sum percent: 101.2 + Max hardware percent: 101.2 + Mean sum percent: 93.158 + Mean hardware percent: 93.158 + Main number of threads: 24 Descendents number of threads: 0 - Combined number of threads: 15 + Combined number of threads: 24 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 4.0 + Max hardware percent: 4.0 + Mean sum percent: 0.333 + Mean hardware percent: 0.333 Compute time: Unit: hours Time: 0.001 @@ -121,14 +131,31 @@ processes and if it is, the reported GPU RAM of “descendent” and The CPU utilization includes the system core count field which is the total number of cores available system-wide. Utilization is measured for the main process, its descendents, the main process and its descendents -combined, and CPU utilization across the entire system. The core percent -is the sum of the percentages of all the cores being used. The CPU -percent is that divided by the system core count. The max percent is the -highest percentage detected through the duration of tracking while the -mean percent is the average of all the percentages detected over that -duration. The CPU utilization concludes with the maximum number of -threads used at any time for the main process and the sum of the threads -used across its descendent processes and combined. +combined, and CPU utilization across the entire system. The sum percent +is the sum of the percentages of all the cores being used. The hardware +percent is that divided by the expected number of cores being used +i.e. the optional ``n_expected_cores`` parameter (defaults to the number +of cores in the entire system) for the main, descendents, and combined +measurements. 
For the system measurements, hardware percent is divided +by the total number of cores in the system regardless of the value of +``n_expected_cores``. The max percent is the highest percentage detected +through the duration of tracking while the mean percent is the average +of all the percentages detected over that duration. The CPU utilization +concludes with the maximum number of threads used at any time for the +main process and the sum of the threads used across its descendent +processes and combined. + +The GPU utilization is similar to the CPU utilization but rather than +being based on utilization of processes, it can only measure the +utilization percentages of the GPUs themselves, regardless of what +processes are using them. To ameliorate this limitation, the optional +``gpu_uuids`` parameter can be set to specify which GPUs to measure +utilization for (defaults to all the GPUs in the system). The system GPU +count is the total number of GPUs in the system. The sum percent is the +sum of all the percentages of these GPUs and the hardware percent is +that divided by the expected number of GPUs being used +(i.e. ``len(gpu_uuids)``). As with CPU utilization, the max and +mean of both the sum and hardware percentages are provided. The compute time is the real time that the computational-resource tracking lasted (as compared to CPU time). @@ -154,7 +181,7 @@ the compute time in seconds: .. 
code:: python3 - with gput.Tracker(ram_unit='megabytes', gpu_ram_unit='megabytes', time_unit='seconds') as tracker: + with gput.Tracker(ram_unit='megabytes', gpu_ram_unit='megabytes', time_unit='seconds', sleep_time=0.1) as tracker: example_function() print(tracker) @@ -163,55 +190,64 @@ the compute time in seconds: Max RAM: Unit: megabytes - System capacity: 67254.17 - System: 5721.395 + System capacity: 63088.23 + System: 2399.92 Main: - Total RSS: 850.399 - Private RSS: 634.077 - Shared RSS: 216.547 + Total RSS: 890.704 + Private RSS: 674.058 + Shared RSS: 216.924 Descendents: Total RSS: 0.0 Private RSS: 0.0 Shared RSS: 0.0 Combined: - Total RSS: 858.763 - Private RSS: 642.445 - Shared RSS: 216.527 + Total RSS: 901.263 + Private RSS: 684.618 + Shared RSS: 216.678 Max GPU RAM: Unit: megabytes - System capacity: 16376.0 - System: 727.0 - Main: 506.0 + System capacity: 2048.0 + System: 353.0 + Main: 277.0 Descendents: 0.0 - Combined: 506.0 + Combined: 277.0 CPU utilization: System core count: 12 + Number of expected cores: 12 System: - Max core percent: 148.9 - Max CPU percent: 12.408 - Mean core percent: 124.7 - Mean CPU percent: 10.392 + Max sum percent: 164.3 + Max hardware percent: 13.692 + Mean sum percent: 152.325 + Mean hardware percent: 12.694 Main: - Max core percent: 99.9 - Max CPU percent: 8.325 - Mean core percent: 97.533 - Mean CPU percent: 8.128 + Max sum percent: 102.6 + Max hardware percent: 8.55 + Mean sum percent: 91.258 + Mean hardware percent: 7.605 Descendents: - Max core percent: 0.0 - Max CPU percent: 0.0 - Mean core percent: 0.0 - Mean CPU percent: 0.0 + Max sum percent: 0.0 + Max hardware percent: 0.0 + Mean sum percent: 0.0 + Mean hardware percent: 0.0 Combined: - Max core percent: 99.9 - Max CPU percent: 8.325 - Mean core percent: 97.533 - Mean CPU percent: 8.128 - Main number of threads: 15 + Max sum percent: 102.6 + Max hardware percent: 8.55 + Mean sum percent: 91.258 + Mean hardware percent: 7.605 + Main number of threads: 24 
Descendents number of threads: 0 - Combined number of threads: 15 + Combined number of threads: 24 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 6.0 + Max hardware percent: 6.0 + Mean sum percent: 0.5 + Mean hardware percent: 0.5 Compute time: Unit: seconds - Time: 2.52 + Time: 3.346 The same information as the text format can be provided as a dictionary @@ -228,12 +264,12 @@ via the ``to_json()`` method of the ``Tracker``. { "max_ram": { "unit": "megabytes", - "system_capacity": 67254.1696, - "system": 5721.3952, + "system_capacity": 63088.2304, + "system": 2399.9201279999997, "main": { - "total_rss": 850.399232, - "private_rss": 634.077184, - "shared_rss": 216.547328 + "total_rss": 890.7038719999999, + "private_rss": 674.05824, + "shared_rss": 216.92416 }, "descendents": { "total_rss": 0.0, @@ -241,52 +277,63 @@ via the ``to_json()`` method of the ``Tracker``. "shared_rss": 0.0 }, "combined": { - "total_rss": 858.7632639999999, - "private_rss": 642.445312, - "shared_rss": 216.526848 + "total_rss": 901.2633599999999, + "private_rss": 684.6177279999999, + "shared_rss": 216.67839999999998 } }, "max_gpu_ram": { "unit": "megabytes", - "system_capacity": 16376.0, - "system": 727.0, - "main": 506.0, + "system_capacity": 2048.0, + "system": 353.0, + "main": 277.0, "descendents": 0.0, - "combined": 506.0 + "combined": 277.0 }, "cpu_utilization": { "system_core_count": 12, + "n_expected_cores": 12, "system": { - "max_core_percent": 148.90000000000003, - "max_cpu_percent": 12.408333333333337, - "mean_core_percent": 124.70000000000003, - "mean_cpu_percent": 10.39166666666667 + "max_sum_percent": 164.3, + "max_hardware_percent": 13.691666666666668, + "mean_sum_percent": 152.325, + "mean_hardware_percent": 12.693750000000001 }, "main": { - "max_core_percent": 99.9, - "max_cpu_percent": 8.325000000000001, - "mean_core_percent": 97.53333333333335, - "mean_cpu_percent": 8.127777777777778 + "max_sum_percent": 102.6, + 
"max_hardware_percent": 8.549999999999999, + "mean_sum_percent": 91.25833333333334, + "mean_hardware_percent": 7.604861111111112 }, "descendents": { - "max_core_percent": 0.0, - "max_cpu_percent": 0.0, - "mean_core_percent": 0.0, - "mean_cpu_percent": 0.0 + "max_sum_percent": 0.0, + "max_hardware_percent": 0.0, + "mean_sum_percent": 0.0, + "mean_hardware_percent": 0.0 }, "combined": { - "max_core_percent": 99.9, - "max_cpu_percent": 8.325000000000001, - "mean_core_percent": 97.53333333333335, - "mean_cpu_percent": 8.127777777777778 + "max_sum_percent": 102.6, + "max_hardware_percent": 8.549999999999999, + "mean_sum_percent": 91.25833333333334, + "mean_hardware_percent": 7.604861111111112 }, - "main_n_threads": 15, + "main_n_threads": 24, "descendents_n_threads": 0, - "combined_n_threads": 15 + "combined_n_threads": 24 + }, + "gpu_utilization": { + "system_gpu_count": 1, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 6.0, + "max_hardware_percent": 6.0, + "mean_sum_percent": 0.5, + "mean_hardware_percent": 0.5 + } }, "compute_time": { "unit": "seconds", - "time": 2.5198354721069336 + "time": 3.345628023147583 } } @@ -304,7 +351,7 @@ information for each individual computational resource. .. code:: none - MaxRAM(unit='megabytes', system_capacity=67254.1696, system=5721.3952, main=RSSValues(total_rss=850.399232, private_rss=634.077184, shared_rss=216.547328), descendents=RSSValues(total_rss=0.0, private_rss=0.0, shared_rss=0.0), combined=RSSValues(total_rss=858.7632639999999, private_rss=642.445312, shared_rss=216.526848)) + MaxRAM(unit='megabytes', system_capacity=63088.2304, system=2399.9201279999997, main=RSSValues(total_rss=890.7038719999999, private_rss=674.05824, shared_rss=216.92416), descendents=RSSValues(total_rss=0.0, private_rss=0.0, shared_rss=0.0), combined=RSSValues(total_rss=901.2633599999999, private_rss=684.6177279999999, shared_rss=216.67839999999998)) @@ -330,7 +377,7 @@ information for each individual computational resource. .. 
code:: none - RSSValues(total_rss=850.399232, private_rss=634.077184, shared_rss=216.547328) + RSSValues(total_rss=890.7038719999999, private_rss=674.05824, shared_rss=216.92416) @@ -343,7 +390,7 @@ information for each individual computational resource. .. code:: none - 850.399232 + 890.7038719999999 @@ -356,7 +403,7 @@ information for each individual computational resource. .. code:: none - MaxGPURAM(unit='megabytes', system_capacity=16376.0, system=727.0, main=506.0, descendents=0.0, combined=506.0) + MaxGPURAM(unit='megabytes', system_capacity=2048.0, system=353.0, main=277.0, descendents=0.0, combined=277.0) @@ -369,7 +416,7 @@ information for each individual computational resource. .. code:: none - ComputeTime(unit='seconds', time=2.5198354721069336) + ComputeTime(unit='seconds', time=3.345628023147583) @@ -391,7 +438,7 @@ to the point of failure, use a try/except block like so: .. code:: none The following error occured while tracking: AN ERROR - 0.506 + 0.277 Below is an example of using a child process. Notice the descendents @@ -402,11 +449,11 @@ fields are now non-zero. import multiprocessing as mp ctx = mp.get_context(method='spawn') child_process = ctx.Process(target=example_function) - with gput.Tracker() as tracker: + with gput.Tracker(n_expected_cores=2, sleep_time=0.2) as tracker: child_process.start() example_function() child_process.join() - child_process.close() + child_process.close() print(tracker) @@ -414,52 +461,61 @@ fields are now non-zero. 
Max RAM: Unit: gigabytes - System capacity: 67.254 - System: 5.938 + System capacity: 63.088 + System: 2.877 Main: - Total RSS: 0.798 - Private RSS: 0.491 - Shared RSS: 0.311 + Total RSS: 0.844 + Private RSS: 0.525 + Shared RSS: 0.319 Descendents: - Total RSS: 0.85 - Private RSS: 0.728 - Shared RSS: 0.122 + Total RSS: 0.831 + Private RSS: 0.704 + Shared RSS: 0.127 Combined: - Total RSS: 1.451 - Private RSS: 1.144 - Shared RSS: 0.311 + Total RSS: 1.462 + Private RSS: 1.148 + Shared RSS: 0.32 Max GPU RAM: Unit: gigabytes - System capacity: 16.376 - System: 1.043 - Main: 0.506 - Descendents: 0.314 - Combined: 0.82 + System capacity: 2.048 + System: 0.631 + Main: 0.277 + Descendents: 0.277 + Combined: 0.554 CPU utilization: System core count: 12 + Number of expected cores: 2 System: - Max core percent: 225.5 - Max CPU percent: 18.792 - Mean core percent: 187.575 - Mean CPU percent: 15.631 + Max sum percent: 398.9 + Max hardware percent: 33.242 + Mean sum percent: 222.255 + Mean hardware percent: 18.521 Main: - Max core percent: 99.6 - Max CPU percent: 8.3 - Mean core percent: 74.15 - Mean CPU percent: 6.179 + Max sum percent: 103.8 + Max hardware percent: 51.9 + Mean sum percent: 66.009 + Mean hardware percent: 33.005 Descendents: - Max core percent: 101.2 - Max CPU percent: 8.433 - Mean core percent: 74.125 - Mean CPU percent: 6.177 + Max sum percent: 308.5 + Max hardware percent: 154.25 + Mean sum percent: 117.109 + Mean hardware percent: 58.555 Combined: - Max core percent: 198.7 - Max CPU percent: 16.558 - Mean core percent: 148.275 - Mean CPU percent: 12.356 - Main number of threads: 15 - Descendents number of threads: 5 - Combined number of threads: 20 + Max sum percent: 409.2 + Max hardware percent: 204.6 + Mean sum percent: 183.118 + Mean hardware percent: 91.559 + Main number of threads: 24 + Descendents number of threads: 16 + Combined number of threads: 40 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum 
percent: 6.0 + Max hardware percent: 6.0 + Mean sum percent: 0.545 + Mean hardware percent: 0.545 Compute time: Unit: hours Time: 0.001 @@ -480,12 +536,12 @@ help message. .. code:: none - Tracks the computational resource usage (RAM, GPU RAM, and compute time) of a process corresponding to a given shell command. + Tracks the computational resource usage (RAM, GPU RAM, CPU utilization, GPU utilization, and compute time) of a process corresponding to a given shell command. Usage: gpu-tracker -h | --help gpu-tracker -v | --version - gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--disable-logs] + gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--nec=] [--guuids=] [--disable-logs] Options: -h --help Show this help message and exit. @@ -497,6 +553,8 @@ help message. --ru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. --gru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. --tu= One of 'seconds', 'minutes', 'hours', or 'days'. + --nec= The number of cores expected to be used. Defaults to the number of cores in the entire operating system. + --guuids= Comma separated list of the UUIDs of the GPUs for which to track utilization e.g. gpu-uuid1,gpu-uuid2,etc. Defaults to all the GPUs in the system. --disable-logs If set, warnings are suppressed during tracking. Otherwise, the Tracker logs warnings as usual. @@ -508,7 +566,7 @@ completes, its status code is reported. .. code:: none - $ gpu-tracker -e "bash example-script.sh" + $ gpu-tracker -e "bash example-script.sh" --st=0.3 .. code:: none @@ -516,52 +574,61 @@ completes, its status code is reported. Resource tracking complete. 
Process completed with status code: 0 Max RAM: Unit: gigabytes - System capacity: 67.254 - System: 5.964 + System capacity: 63.088 + System: 2.3 Main: Total RSS: 0.003 Private RSS: 0.0 Shared RSS: 0.003 Descendents: - Total RSS: 0.847 - Private RSS: 0.724 - Shared RSS: 0.122 + Total RSS: 0.917 + Private RSS: 0.905 + Shared RSS: 0.012 Combined: - Total RSS: 0.856 - Private RSS: 0.733 - Shared RSS: 0.123 + Total RSS: 0.925 + Private RSS: 0.912 + Shared RSS: 0.013 Max GPU RAM: Unit: gigabytes - System capacity: 16.376 - System: 1.043 + System capacity: 2.048 + System: 0.193 Main: 0.0 - Descendents: 0.314 - Combined: 0.314 + Descendents: 0.117 + Combined: 0.117 CPU utilization: System core count: 12 + Number of expected cores: 12 System: - Max core percent: 177.6 - Max CPU percent: 14.8 - Mean core percent: 134.375 - Mean CPU percent: 11.198 + Max sum percent: 309.5 + Max hardware percent: 25.792 + Mean sum percent: 159.073 + Mean hardware percent: 13.256 Main: - Max core percent: 0.0 - Max CPU percent: 0.0 - Mean core percent: 0.0 - Mean CPU percent: 0.0 + Max sum percent: 0.0 + Max hardware percent: 0.0 + Mean sum percent: 0.0 + Mean hardware percent: 0.0 Descendents: - Max core percent: 100.4 - Max CPU percent: 8.367 - Mean core percent: 95.45 - Mean CPU percent: 7.954 + Max sum percent: 493.1 + Max hardware percent: 41.092 + Mean sum percent: 134.427 + Mean hardware percent: 11.202 Combined: - Max core percent: 100.4 - Max CPU percent: 8.367 - Mean core percent: 95.45 - Mean CPU percent: 7.954 + Max sum percent: 493.1 + Max hardware percent: 41.092 + Mean sum percent: 134.427 + Mean hardware percent: 11.202 Main number of threads: 1 - Descendents number of threads: 4 - Combined number of threads: 5 + Descendents number of threads: 15 + Combined number of threads: 16 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 4.0 + Max hardware percent: 4.0 + Mean sum percent: 0.364 + Mean hardware percent: 0.364 
Compute time: Unit: hours Time: 0.001 @@ -577,7 +644,7 @@ for ram-unit. .. code:: none - $ gpu-tracker -e 'bash example-script.sh' --tu=seconds --gru=megabytes --ru=megabytes + $ gpu-tracker -e 'bash example-script.sh' --tu=seconds --gru=megabytes --ru=megabytes --st=0.2 .. code:: none @@ -585,55 +652,64 @@ for ram-unit. Resource tracking complete. Process completed with status code: 0 Max RAM: Unit: megabytes - System capacity: 67254.17 - System: 5784.379 + System capacity: 63088.23 + System: 2242.593 Main: - Total RSS: 3.076 - Private RSS: 0.324 - Shared RSS: 2.753 + Total RSS: 3.039 + Private RSS: 0.315 + Shared RSS: 2.724 Descendents: - Total RSS: 838.545 - Private RSS: 716.681 - Shared RSS: 121.864 + Total RSS: 832.487 + Private RSS: 705.831 + Shared RSS: 126.657 Combined: - Total RSS: 847.249 - Private RSS: 724.492 - Shared RSS: 122.757 + Total RSS: 841.482 + Private RSS: 713.867 + Shared RSS: 127.992 Max GPU RAM: Unit: megabytes - System capacity: 16376.0 - System: 1043.0 + System capacity: 2048.0 + System: 631.0 Main: 0.0 - Descendents: 314.0 - Combined: 314.0 + Descendents: 277.0 + Combined: 277.0 CPU utilization: System core count: 12 + Number of expected cores: 12 System: - Max core percent: 188.7 - Max CPU percent: 15.725 - Mean core percent: 136.45 - Mean CPU percent: 11.371 + Max sum percent: 362.6 + Max hardware percent: 30.217 + Mean sum percent: 156.853 + Mean hardware percent: 13.071 Main: - Max core percent: 0.0 - Max CPU percent: 0.0 - Mean core percent: 0.0 - Mean CPU percent: 0.0 + Max sum percent: 0.0 + Max hardware percent: 0.0 + Mean sum percent: 0.0 + Mean hardware percent: 0.0 Descendents: - Max core percent: 96.2 - Max CPU percent: 8.017 - Mean core percent: 94.55 - Mean CPU percent: 7.879 + Max sum percent: 512.8 + Max hardware percent: 42.733 + Mean sum percent: 120.333 + Mean hardware percent: 10.028 Combined: - Max core percent: 96.2 - Max CPU percent: 8.017 - Mean core percent: 94.55 - Mean CPU percent: 7.879 + Max sum percent: 
512.8 + Max hardware percent: 42.733 + Mean sum percent: 120.333 + Mean hardware percent: 10.028 Main number of threads: 1 - Descendents number of threads: 4 - Combined number of threads: 5 + Descendents number of threads: 15 + Combined number of threads: 16 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 4.0 + Max hardware percent: 4.0 + Mean sum percent: 0.267 + Mean hardware percent: 0.267 Compute time: Unit: seconds - Time: 3.566 + Time: 4.931 By default, the computational-resource-usage statistics are printed to @@ -642,7 +718,7 @@ that same content in a file. .. code:: none - $ gpu-tracker -e 'bash example-script.sh' -o out.txt + $ gpu-tracker -e 'bash example-script.sh' -o out.txt --st=0.2 .. code:: none @@ -659,52 +735,61 @@ that same content in a file. Max RAM: Unit: gigabytes - System capacity: 67.254 - System: 5.584 + System capacity: 63.088 + System: 2.683 Main: Total RSS: 0.003 Private RSS: 0.0 Shared RSS: 0.003 Descendents: - Total RSS: 0.853 - Private RSS: 0.731 - Shared RSS: 0.122 + Total RSS: 0.843 + Private RSS: 0.717 + Shared RSS: 0.127 Combined: - Total RSS: 0.862 - Private RSS: 0.739 - Shared RSS: 0.123 + Total RSS: 0.852 + Private RSS: 0.725 + Shared RSS: 0.128 Max GPU RAM: Unit: gigabytes - System capacity: 16.376 - System: 1.043 + System capacity: 2.048 + System: 0.631 Main: 0.0 - Descendents: 0.314 - Combined: 0.314 + Descendents: 0.277 + Combined: 0.277 CPU utilization: System core count: 12 + Number of expected cores: 12 System: - Max core percent: 187.6 - Max CPU percent: 15.633 - Mean core percent: 137.675 - Mean CPU percent: 11.473 + Max sum percent: 383.8 + Max hardware percent: 31.983 + Mean sum percent: 166.507 + Mean hardware percent: 13.876 Main: - Max core percent: 0.0 - Max CPU percent: 0.0 - Mean core percent: 0.0 - Mean CPU percent: 0.0 + Max sum percent: 0.0 + Max hardware percent: 0.0 + Mean sum percent: 0.0 + Mean hardware percent: 0.0 Descendents: - Max core 
percent: 101.3 - Max CPU percent: 8.442 - Mean core percent: 97.675 - Mean CPU percent: 8.14 + Max sum percent: 528.4 + Max hardware percent: 44.033 + Mean sum percent: 128.014 + Mean hardware percent: 10.668 Combined: - Max core percent: 101.3 - Max CPU percent: 8.442 - Mean core percent: 97.675 - Mean CPU percent: 8.14 + Max sum percent: 528.4 + Max hardware percent: 44.033 + Mean sum percent: 128.014 + Mean hardware percent: 10.668 Main number of threads: 1 - Descendents number of threads: 4 - Combined number of threads: 5 + Descendents number of threads: 15 + Combined number of threads: 16 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 7.0 + Max hardware percent: 7.0 + Mean sum percent: 0.643 + Mean hardware percent: 0.643 Compute time: Unit: hours Time: 0.001 @@ -714,7 +799,7 @@ By default, the format of the output is “text”. The ``-f`` or .. code:: none - $ gpu-tracker -e 'bash example-script.sh' -f json + $ gpu-tracker -e 'bash example-script.sh' -f json --st=0.2 .. code:: none @@ -723,72 +808,83 @@ By default, the format of the output is “text”. 
The ``-f`` or { "max_ram": { "unit": "gigabytes", - "system_capacity": 67.2541696, - "system": 5.720379392000001, + "system_capacity": 63.0882304, + "system": 3.111936, "main": { - "total_rss": 0.003084288, - "private_rss": 0.00031948800000000004, - "shared_rss": 0.0027648 + "total_rss": 0.003059712, + "private_rss": 0.000339968, + "shared_rss": 0.002719744 }, "descendents": { - "total_rss": 0.854237184, - "private_rss": 0.73218048, - "shared_rss": 0.122056704 + "total_rss": 0.846565376, + "private_rss": 0.7198023680000001, + "shared_rss": 0.12713984 }, "combined": { - "total_rss": 0.863256576, - "private_rss": 0.7403069440000001, - "shared_rss": 0.122949632 + "total_rss": 0.8552325120000001, + "private_rss": 0.727576576, + "shared_rss": 0.12803276800000002 } }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 16.376, - "system": 1.043, + "system_capacity": 2.048, + "system": 0.631, "main": 0.0, - "descendents": 0.314, - "combined": 0.314 + "descendents": 0.277, + "combined": 0.277 }, "cpu_utilization": { "system_core_count": 12, + "n_expected_cores": 12, "system": { - "max_core_percent": 260.00000000000006, - "max_cpu_percent": 21.66666666666667, - "mean_core_percent": 159.35000000000002, - "mean_cpu_percent": 13.279166666666669 + "max_sum_percent": 384.5999999999999, + "max_hardware_percent": 32.04999999999999, + "mean_sum_percent": 167.49285714285716, + "mean_hardware_percent": 13.957738095238097 }, "main": { - "max_core_percent": 0.0, - "max_cpu_percent": 0.0, - "mean_core_percent": 0.0, - "mean_cpu_percent": 0.0 + "max_sum_percent": 0.0, + "max_hardware_percent": 0.0, + "mean_sum_percent": 0.0, + "mean_hardware_percent": 0.0 }, "descendents": { - "max_core_percent": 102.9, - "max_cpu_percent": 8.575000000000001, - "mean_core_percent": 97.475, - "mean_cpu_percent": 8.122916666666667 + "max_sum_percent": 526.0, + "max_hardware_percent": 43.833333333333336, + "mean_sum_percent": 128.65, + "mean_hardware_percent": 10.720833333333333 }, "combined": { - 
"max_core_percent": 102.9, - "max_cpu_percent": 8.575000000000001, - "mean_core_percent": 97.475, - "mean_cpu_percent": 8.122916666666667 + "max_sum_percent": 526.0, + "max_hardware_percent": 43.833333333333336, + "mean_sum_percent": 128.65, + "mean_hardware_percent": 10.720833333333333 }, "main_n_threads": 1, - "descendents_n_threads": 4, - "combined_n_threads": 5 + "descendents_n_threads": 15, + "combined_n_threads": 16 + }, + "gpu_utilization": { + "system_gpu_count": 1, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 7.0, + "max_hardware_percent": 7.0, + "mean_sum_percent": 0.5, + "mean_hardware_percent": 0.5 + } }, "compute_time": { "unit": "hours", - "time": 0.001005272732840644 + "time": 0.0012672905127207438 } } .. code:: none - $ gpu-tracker -e 'bash example-script.sh' -f json -o out.json + $ gpu-tracker -e 'bash example-script.sh' -f json -o out.json --st=0.3 .. code:: none @@ -806,64 +902,75 @@ By default, the format of the output is “text”. The ``-f`` or { "max_ram": { "unit": "gigabytes", - "system_capacity": 67.2541696, - "system": 5.560373248, + "system_capacity": 63.0882304, + "system": 2.878910464, "main": { - "total_rss": 0.002957312, - "private_rss": 0.000323584, - "shared_rss": 0.002633728 + "total_rss": 0.0029777920000000004, + "private_rss": 0.00031948800000000004, + "shared_rss": 0.0026583040000000002 }, "descendents": { - "total_rss": 0.848539648, - "private_rss": 0.726519808, - "shared_rss": 0.12201984 + "total_rss": 0.8333844480000001, + "private_rss": 0.7066091520000001, + "shared_rss": 0.127152128 }, "combined": { - "total_rss": 0.857731072, - "private_rss": 0.734818304, - "shared_rss": 0.122912768 + "total_rss": 0.841486336, + "private_rss": 0.713818112, + "shared_rss": 0.12804505600000002 } }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 16.376, - "system": 1.043, + "system_capacity": 2.048, + "system": 0.631, "main": 0.0, - "descendents": 0.314, - "combined": 0.314 + "descendents": 0.277, + 
"combined": 0.277 }, "cpu_utilization": { "system_core_count": 12, + "n_expected_cores": 12, "system": { - "max_core_percent": 192.5, - "max_cpu_percent": 16.041666666666668, - "mean_core_percent": 154.22500000000002, - "mean_cpu_percent": 12.852083333333335 + "max_sum_percent": 306.09999999999997, + "max_hardware_percent": 25.50833333333333, + "mean_sum_percent": 161.4272727272727, + "mean_hardware_percent": 13.452272727272724 }, "main": { - "max_core_percent": 0.0, - "max_cpu_percent": 0.0, - "mean_core_percent": 0.0, - "mean_cpu_percent": 0.0 + "max_sum_percent": 0.0, + "max_hardware_percent": 0.0, + "mean_sum_percent": 0.0, + "mean_hardware_percent": 0.0 }, "descendents": { - "max_core_percent": 104.1, - "max_cpu_percent": 8.674999999999999, - "mean_core_percent": 97.7, - "mean_cpu_percent": 8.141666666666667 + "max_sum_percent": 440.2, + "max_hardware_percent": 36.68333333333333, + "mean_sum_percent": 128.27272727272728, + "mean_hardware_percent": 10.68939393939394 }, "combined": { - "max_core_percent": 104.1, - "max_cpu_percent": 8.674999999999999, - "mean_core_percent": 97.7, - "mean_cpu_percent": 8.141666666666667 + "max_sum_percent": 440.2, + "max_hardware_percent": 36.68333333333333, + "mean_sum_percent": 128.27272727272728, + "mean_hardware_percent": 10.68939393939394 }, "main_n_threads": 1, - "descendents_n_threads": 4, - "combined_n_threads": 5 + "descendents_n_threads": 15, + "combined_n_threads": 16 + }, + "gpu_utilization": { + "system_gpu_count": 1, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 7.0, + "max_hardware_percent": 7.0, + "mean_sum_percent": 0.6363636363636364, + "mean_hardware_percent": 0.6363636363636364 + } }, "compute_time": { "unit": "hours", - "time": 0.000995432734489441 + "time": 0.0012816817230648465 } } diff --git a/src/gpu_tracker/__main__.py b/src/gpu_tracker/__main__.py index d2837da..ae005c8 100644 --- a/src/gpu_tracker/__main__.py +++ b/src/gpu_tracker/__main__.py @@ -1,10 +1,10 @@ """ -Tracks the 
computational resource usage (RAM, GPU RAM, and compute time) of a process corresponding to a given shell command. +Tracks the computational resource usage (RAM, GPU RAM, CPU utilization, GPU utilization, and compute time) of a process corresponding to a given shell command. Usage: gpu-tracker -h | --help gpu-tracker -v | --version - gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--disable-logs] + gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--nec=] [--guuids=] [--disable-logs] Options: -h --help Show this help message and exit. @@ -16,6 +16,8 @@ --ru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. --gru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. --tu= One of 'seconds', 'minutes', 'hours', or 'days'. + --nec= The number of cores expected to be used. Defaults to the number of cores in the entire operating system. + --guuids= Comma separated list of the UUIDs of the GPUs for which to track utilization e.g. gpu-uuid1,gpu-uuid2,etc. Defaults to all the GPUs in the system. --disable-logs If set, warnings are suppressed during tracking. Otherwise, the Tracker logs warnings as usual. 
""" import docopt as doc @@ -37,6 +39,8 @@ def main(): '--ru': 'ram_unit', '--gru': 'gpu_ram_unit', '--tu': 'time_unit', + '--nec': 'n_expected_cores', + '--guuids': 'gpu_uuids', '--disable-logs': 'disable_logs' } kwargs = { @@ -44,6 +48,10 @@ def main(): '--execute', '--output', '--format', '--help', '--version'}} if 'sleep_time' in kwargs.keys(): kwargs['sleep_time'] = float(kwargs['sleep_time']) + if 'n_expected_cores' in kwargs.keys(): + kwargs['n_expected_cores'] = int(kwargs['n_expected_cores']) + if 'gpu_uuids' in kwargs.keys(): + kwargs['gpu_uuids'] = set(kwargs['gpu_uuids'].split(',')) if len(command) == 0: log.error('Empty command provided.') sys.exit(1) diff --git a/src/gpu_tracker/tracker.py b/src/gpu_tracker/tracker.py index 027935c..a1d8f73 100644 --- a/src/gpu_tracker/tracker.py +++ b/src/gpu_tracker/tracker.py @@ -480,8 +480,8 @@ class ProcessingUnitPercentages: :param max_sum_percent: The maximum sum of utilization percentages of the processing units at any given time. :param max_hardware_percent: The maximum utilization percentage of the group of units as a whole (i.e. max_sum_percent divided by the number of units involved). - :param mean_sum_percent: The mean sum of utilization percentages of the core(s) used by the process(es) over time. - :param mean_hardware_percent: The mean utilization percentage of the core(s) as a whole (i.e. mean_sum_percent divided by the number of cores involved). + :param mean_sum_percent: The mean sum of utilization percentages of the processing units used by the process(es) over time. + :param mean_hardware_percent: The mean utilization percentage of the group of units as a whole (i.e. mean_sum_percent divided by the number of units involved). """ max_sum_percent: float = 0. max_hardware_percent: float = 0. 
diff --git a/tests/test_cli.py b/tests/test_cli.py index 73013f2..e4aa805 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -20,8 +20,11 @@ def get_output(request) -> str | None: (['--execute=my-command arg1 arg2', '--st=0.4'], ['my-command', 'arg1', 'arg2'], {'disable_logs': False, 'sleep_time': 0.4}), ( ['-e', 'my-command', '--gru=megabytes', '--tu=days'], ['my-command'], - {'disable_logs': False, 'gpu_ram_unit': 'megabytes', 'time_unit': 'days'} - )] + {'disable_logs': False, 'gpu_ram_unit': 'megabytes', 'time_unit': 'days'}), + ( + ['-e', 'my-command', '--nec=3', '--guuids=gpu-id1,gpu-id2,gpu-id3'], ['my-command'], + {'disable_logs': False, 'n_expected_cores': 3, 'gpu_uuids': {'gpu-id1', 'gpu-id2', 'gpu-id3'}}), + (['-e', 'my-command', '--guuids=gpu-id1'], ['my-command'], {'disable_logs': False, 'gpu_uuids': {'gpu-id1'}})] @pt.mark.parametrize('argv,command,kwargs', test_data) From 4ec54d2444adda408a5a36838b3ba2a970c7e5db Mon Sep 17 00:00:00 2001 From: erikhuck Date: Tue, 11 Jun 2024 12:11:01 -0400 Subject: [PATCH 9/9] Fixes typo --- docs/notebook/tutorial.ipynb | 4 ++-- docs/tutorial.rst | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/notebook/tutorial.ipynb b/docs/notebook/tutorial.ipynb index 540f9fd..b03dd19 100644 --- a/docs/notebook/tutorial.ipynb +++ b/docs/notebook/tutorial.ipynb @@ -144,7 +144,7 @@ "\n", "The CPU utilization includes the system core count field which is the total number of cores available system-wide. Utilization is measured for the main process, its descendents, the main process and its descendents combined, and CPU utilization across the entire system. The sum percent is the sum of the percentages of all the cores being used. The hardware percent is that divided by the expected number of cores being used i.e. the optional `n_expected_cores` parameter (defaults to the number of cores in the entire system) for the main, descendents, and combined measurements. 
For the system measurements, hardware percent is divided by the total number of cores in the system regardless of the value of `n_expected_cores`. The max percent is the highest percentage detected through the duration of tracking while the mean percent is the average of all the percentages detected over that duration. The CPU utilization concludes with the maximum number of threads used at any time for the main process and the sum of the threads used across its descendent processes and combined.\n", "\n", - "The GPU utilization is similar to the CPU utilization but rather than being based on utilization of processes, it can only measure the utilization percentages of the GPUs themselves, regardless of what processes are using them. To ameliorate this limiation, the optional `gpu_uuids` parameter can be set to specify which GPUs to measure utilization for (defaults to all the GPUs in the system). The system GPU count is the total number of GPUs in the system. The sum percent is the sum of all the percentages of these GPUs and the hardware percent is that divided by the expected number of GPUs being used (i.e. `len(gpu_uuids)`). Likewise with CPU utilization, the max and mean of both the sum and hardware percentages are provided.\n", + "The GPU utilization is similar to the CPU utilization but rather than being based on utilization of processes, it can only measure the utilization percentages of the GPUs themselves, regardless of what processes are using them. To ameliorate this limitation, the optional `gpu_uuids` parameter can be set to specify which GPUs to measure utilization for (defaults to all the GPUs in the system). The system GPU count is the total number of GPUs in the system. The sum percent is the sum of all the percentages of these GPUs and the hardware percent is that divided by the expected number of GPUs being used (i.e. `len(gpu_uuids)`). 
Likewise with CPU utilization, the max and mean of both the sum and hardware percentages are provided.\n", "\n", "The compute time is the real time that the computational-resource tracking lasted (as compared to CPU time)." ] @@ -1189,7 +1189,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 1e202c6..a42c756 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -148,7 +148,7 @@ processes and combined. The GPU utilization is similar to the CPU utilization but rather than being based on utilization of processes, it can only measure the utilization percentages of the GPUs themselves, regardless of what -processes are using them. To ameliorate this limiation, the optional +processes are using them. To ameliorate this limitation, the optional ``gpu_uuids`` parameter can be set to specify which GPUs to measure utilization for (defaults to all the GPUs in the system). The system GPU count is the total number of GPUs in the system. The sum percent is the