diff --git a/README.rst b/README.rst index f6b2fef..9e32de0 100644 --- a/README.rst +++ b/README.rst @@ -3,7 +3,7 @@ gpu_tracker ########### Description ----------- -The ``gpu_tracker`` package provides a ``Tracker`` class and a commandline-interface that tracks (profiles) the usage of compute time, CPU utilization, maximum RAM, and maximum GPU RAM. +The ``gpu_tracker`` package provides a ``Tracker`` class and a command-line interface that tracks (profiles) the usage of compute time, CPU utilization, maximum RAM, GPU utilization, and maximum GPU RAM. The compute time is a measurement of the real time taken by the task as opposed to the CPU-utilization time. The GPU tracking is for Nvidia GPUs and uses the ``nvidia-smi`` command. If the Nvidia drivers have not been installed, then the max GPU RAM is not tracked and measurements are reported as 0. Computational resources are tracked throughout the duration of a context manager or the duration of explicit calls to the ``start()`` and ``stop()`` methods of the ``Tracker`` class. @@ -11,6 +11,8 @@ The ``gpu-tracker`` command-line interface alternatively tracks the computationa **NOTE: The tracking occurs in a separate process. To maximize the accuracy of the reported resource usage, you may want to have a core available solely for the tracking process e.g. if your job uses 3 workers, you may want to allocate 4 cores.** +**NOTE: Since the tracking process is created using the Python multiprocessing library, you may get a runtime error after starting the tracking if the "spawn" start method (the default on macOS and Windows) or the "forkserver" start method is used. To prevent this, start the tracker only after checking** ``if __name__ == '__main__'``. **See "Safe importing of main module" under** `The spawn and forkserver start methods <https://docs.python.org/3/library/multiprocessing.html#the-spawn-and-forkserver-start-methods>`__ **for more information.** + Documentation ------------- The complete documentation for the ``gpu_tracker`` package, including tutorials, can be found `here `__. 
diff --git a/docs/conf.py b/docs/conf.py index b605b15..966388d 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -14,8 +14,8 @@ # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information project = 'gpu_tracker' -copyright = '2024, Erik Huckvale, Hunter Moseley' -author = 'Erik Huckvale, Hunter Moseley' +copyright = '2024, Erik Huckvale, Hunter N. B. Moseley' +author = 'Erik Huckvale, Hunter N. B. Moseley' version = __version__ release = __version__ diff --git a/docs/notebook/tutorial.ipynb b/docs/notebook/tutorial.ipynb index 0b1b5e2..b03dd19 100644 --- a/docs/notebook/tutorial.ipynb +++ b/docs/notebook/tutorial.ipynb @@ -21,7 +21,7 @@ "id": "2bb9e84a-8523-4e5f-bc01-1d6b234c19a6", "metadata": {}, "source": [ - "The `gpu_tracker` package provides the `Tracker` class which uses a subprocess to measure computational resource usage, namely the compute time, CPU utilization, maximum RAM used, and maximum GPU RAM used. The `start()` method starts this process which tracks usage in the background. After calling `start()`, one can write the code for which resource usage is measured, followed by calling the `stop()` method. The compute time will be the time from the call to `start()` to the call to `stop()` and the RAM, GPU RAM, and CPU utilization quantities will be the respective computational resources used by the code that's in between `start()` and `stop()`." + "The `gpu_tracker` package provides the `Tracker` class which uses a subprocess to measure computational resource usage, namely the compute time, maximum CPU utilization, mean CPU utilization, maximum RAM used, maximum GPU utilization, mean GPU utilization, and maximum GPU RAM used. The `start()` method starts this process which tracks usage in the background. After calling `start()`, one can write the code for which resource usage is measured, followed by calling the `stop()` method. 
The compute time will be the time from the call to `start()` to the call to `stop()` and the RAM, GPU RAM, CPU utilization, and GPU utilization quantities will be the respective computational resources used by the code that's in between `start()` and `stop()`." ] }, { @@ -42,7 +42,7 @@ "metadata": {}, "outputs": [], "source": [ - "tracker = gput.Tracker()\n", + "tracker = gput.Tracker(n_expected_cores=1, sleep_time=0.1)\n", "tracker.start()\n", "example_function()\n", "tracker.stop()" @@ -68,52 +68,61 @@ "text": [ "Max RAM:\n", " Unit: gigabytes\n", - " System capacity: 67.254\n", - " System: 5.21\n", + " System capacity: 63.088\n", + " System: 1.899\n", " Main:\n", - " Total RSS: 0.827\n", - " Private RSS: 0.674\n", - " Shared RSS: 0.154\n", + " Total RSS: 0.914\n", + " Private RSS: 0.753\n", + " Shared RSS: 0.161\n", " Descendents:\n", " Total RSS: 0.0\n", " Private RSS: 0.0\n", " Shared RSS: 0.0\n", " Combined:\n", - " Total RSS: 0.834\n", - " Private RSS: 0.681\n", - " Shared RSS: 0.154\n", + " Total RSS: 0.883\n", + " Private RSS: 0.723\n", + " Shared RSS: 0.161\n", "Max GPU RAM:\n", " Unit: gigabytes\n", - " System capacity: 16.376\n", - " System: 0.535\n", - " Main: 0.314\n", + " System capacity: 2.048\n", + " System: 0.353\n", + " Main: 0.277\n", " Descendents: 0.0\n", - " Combined: 0.314\n", + " Combined: 0.277\n", "CPU utilization:\n", " System core count: 12\n", + " Number of expected cores: 1\n", " System:\n", - " Max core percent: 150.6\n", - " Max CPU percent: 12.55\n", - " Mean core percent: 122.9\n", - " Mean CPU percent: 10.242\n", + " Max sum percent: 169.7\n", + " Max hardware percent: 14.142\n", + " Mean sum percent: 150.183\n", + " Mean hardware percent: 12.515\n", " Main:\n", - " Max core percent: 98.6\n", - " Max CPU percent: 8.217\n", - " Mean core percent: 96.8\n", - " Mean CPU percent: 8.067\n", + " Max sum percent: 101.2\n", + " Max hardware percent: 101.2\n", + " Mean sum percent: 93.158\n", + " Mean hardware percent: 93.158\n", " 
Descendents:\n", - " Max core percent: 0.0\n", - " Max CPU percent: 0.0\n", - " Mean core percent: 0.0\n", - " Mean CPU percent: 0.0\n", + " Max sum percent: 0.0\n", + " Max hardware percent: 0.0\n", + " Mean sum percent: 0.0\n", + " Mean hardware percent: 0.0\n", " Combined:\n", - " Max core percent: 98.6\n", - " Max CPU percent: 8.217\n", - " Mean core percent: 96.8\n", - " Mean CPU percent: 8.067\n", - " Main number of threads: 15\n", + " Max sum percent: 101.2\n", + " Max hardware percent: 101.2\n", + " Mean sum percent: 93.158\n", + " Mean hardware percent: 93.158\n", + " Main number of threads: 24\n", " Descendents number of threads: 0\n", - " Combined number of threads: 15\n", + " Combined number of threads: 24\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 4.0\n", + " Max hardware percent: 4.0\n", + " Mean sum percent: 0.333\n", + " Mean hardware percent: 0.333\n", "Compute time:\n", " Unit: hours\n", " Time: 0.001\n" @@ -131,9 +140,11 @@ "source": [ "The output is organized by computational resource followed by information specific to that resource. The system capacity is a constant for the total RAM capacity across the entire operating system. There is a system capacity field both for RAM and GPU RAM. This is not to be confused with the system field, which measures the maximum RAM / GPU RAM (operating system wide) that was actually used over the duration of the computational-resource tracking. Both the RAM and GPU RAM have 3 additional fields, namely the usage of the main process itself followed by the summed usage of any descendent processes it may have (i.e. child processes, grandchild processes, etc.), and combined usage which is the sum of the main and its descendent processes. 
RAM is divided further to include the private RSS (RAM usage unique to the process), shared RSS (RAM that's shared by a process and at least one other process), and total RSS (the sum of private and shared RSS). The private and shared RSS values are only available on Linux distributions. So for non-linux operating systems, the private and shared RSS will remain 0 and only the total RSS will be reported. Theoretically, the combined total RSS would never exceed the overall system RAM usage, but inaccuracies resulting from shared RSS can cause this to happen, especially for non-linux operating systems (see note below).\n", "\n", - "The `Tracker` assumes that GPU memory is not shared accross multiple processes and if it is, the reported GPU RAM of \"descendent\" and \"combined\" may be an overestimation.\n", + "The `Tracker` assumes that GPU memory is not shared across multiple processes and if it is, the reported GPU RAM of \"descendent\" and \"combined\" may be an overestimation.\n", "\n", - "The CPU utilization includes the system core count field which is the total number of cores available system-wide. Utilization is measured for the main process, its descendents, the main process and its descendents combined, and CPU utilization across the entire system. The core percent is the sum of the percentages of all the cores being used. The CPU percent is that divided by the system core count. The max percent is the highest percentage detected through the duration of tracking while the mean percent is the average of all the percentages detected over that duration. The CPU utilization concludes with the maximum number of threads used at any time for the main process and the sum of the threads used accross its descendent processes and combined.\n", + "The CPU utilization includes the system core count field which is the total number of cores available system-wide. 
Utilization is measured for the main process, its descendents, the main process and its descendents combined, and the entire system. The sum percent is the sum of the percentages of all the cores being used. The hardware percent is that divided by the expected number of cores being used i.e. the optional `n_expected_cores` parameter (defaults to the number of cores in the entire system) for the main, descendents, and combined measurements. For the system measurements, the hardware percent is the sum percent divided by the total number of cores in the system regardless of the value of `n_expected_cores`. The max percent is the highest percentage detected through the duration of tracking while the mean percent is the average of all the percentages detected over that duration. The CPU utilization concludes with the maximum number of threads used at any time for the main process and the sum of the threads used across its descendent processes and combined.\n", + "\n", + "The GPU utilization is similar to the CPU utilization but rather than being based on utilization of processes, it can only measure the utilization percentages of the GPUs themselves, regardless of what processes are using them. To ameliorate this limitation, the optional `gpu_uuids` parameter can be set to specify which GPUs to measure utilization for (defaults to all the GPUs in the system). The system GPU count is the total number of GPUs in the system. The sum percent is the sum of all the percentages of these GPUs and the hardware percent is that divided by the expected number of GPUs being used (i.e. `len(gpu_uuids)`). As with CPU utilization, the max and mean of both the sum and hardware percentages are provided.\n", + "\n", "The compute time is the real time that the computational-resource tracking lasted (as compared to CPU time)." 
] @@ -185,60 +196,69 @@ "text": [ "Max RAM:\n", " Unit: megabytes\n", - " System capacity: 67254.17\n", - " System: 5721.395\n", + " System capacity: 63088.23\n", + " System: 2399.92\n", " Main:\n", - " Total RSS: 850.399\n", - " Private RSS: 634.077\n", - " Shared RSS: 216.547\n", + " Total RSS: 890.704\n", + " Private RSS: 674.058\n", + " Shared RSS: 216.924\n", " Descendents:\n", " Total RSS: 0.0\n", " Private RSS: 0.0\n", " Shared RSS: 0.0\n", " Combined:\n", - " Total RSS: 858.763\n", - " Private RSS: 642.445\n", - " Shared RSS: 216.527\n", + " Total RSS: 901.263\n", + " Private RSS: 684.618\n", + " Shared RSS: 216.678\n", "Max GPU RAM:\n", " Unit: megabytes\n", - " System capacity: 16376.0\n", - " System: 727.0\n", - " Main: 506.0\n", + " System capacity: 2048.0\n", + " System: 353.0\n", + " Main: 277.0\n", " Descendents: 0.0\n", - " Combined: 506.0\n", + " Combined: 277.0\n", "CPU utilization:\n", " System core count: 12\n", + " Number of expected cores: 12\n", " System:\n", - " Max core percent: 148.9\n", - " Max CPU percent: 12.408\n", - " Mean core percent: 124.7\n", - " Mean CPU percent: 10.392\n", + " Max sum percent: 164.3\n", + " Max hardware percent: 13.692\n", + " Mean sum percent: 152.325\n", + " Mean hardware percent: 12.694\n", " Main:\n", - " Max core percent: 99.9\n", - " Max CPU percent: 8.325\n", - " Mean core percent: 97.533\n", - " Mean CPU percent: 8.128\n", + " Max sum percent: 102.6\n", + " Max hardware percent: 8.55\n", + " Mean sum percent: 91.258\n", + " Mean hardware percent: 7.605\n", " Descendents:\n", - " Max core percent: 0.0\n", - " Max CPU percent: 0.0\n", - " Mean core percent: 0.0\n", - " Mean CPU percent: 0.0\n", + " Max sum percent: 0.0\n", + " Max hardware percent: 0.0\n", + " Mean sum percent: 0.0\n", + " Mean hardware percent: 0.0\n", " Combined:\n", - " Max core percent: 99.9\n", - " Max CPU percent: 8.325\n", - " Mean core percent: 97.533\n", - " Mean CPU percent: 8.128\n", - " Main number of threads: 15\n", + " Max 
sum percent: 102.6\n", + " Max hardware percent: 8.55\n", + " Mean sum percent: 91.258\n", + " Mean hardware percent: 7.605\n", + " Main number of threads: 24\n", " Descendents number of threads: 0\n", - " Combined number of threads: 15\n", + " Combined number of threads: 24\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 6.0\n", + " Max hardware percent: 6.0\n", + " Mean sum percent: 0.5\n", + " Mean hardware percent: 0.5\n", "Compute time:\n", " Unit: seconds\n", - " Time: 2.52\n" + " Time: 3.346\n" ] } ], "source": [ - "with gput.Tracker(ram_unit='megabytes', gpu_ram_unit='megabytes', time_unit='seconds') as tracker:\n", + "with gput.Tracker(ram_unit='megabytes', gpu_ram_unit='megabytes', time_unit='seconds', sleep_time=0.1) as tracker:\n", " example_function()\n", "print(tracker)" ] @@ -264,12 +284,12 @@ "{\n", " \"max_ram\": {\n", " \"unit\": \"megabytes\",\n", - " \"system_capacity\": 67254.1696,\n", - " \"system\": 5721.3952,\n", + " \"system_capacity\": 63088.2304,\n", + " \"system\": 2399.9201279999997,\n", " \"main\": {\n", - " \"total_rss\": 850.399232,\n", - " \"private_rss\": 634.077184,\n", - " \"shared_rss\": 216.547328\n", + " \"total_rss\": 890.7038719999999,\n", + " \"private_rss\": 674.05824,\n", + " \"shared_rss\": 216.92416\n", " },\n", " \"descendents\": {\n", " \"total_rss\": 0.0,\n", @@ -277,52 +297,63 @@ " \"shared_rss\": 0.0\n", " },\n", " \"combined\": {\n", - " \"total_rss\": 858.7632639999999,\n", - " \"private_rss\": 642.445312,\n", - " \"shared_rss\": 216.526848\n", + " \"total_rss\": 901.2633599999999,\n", + " \"private_rss\": 684.6177279999999,\n", + " \"shared_rss\": 216.67839999999998\n", " }\n", " },\n", " \"max_gpu_ram\": {\n", " \"unit\": \"megabytes\",\n", - " \"system_capacity\": 16376.0,\n", - " \"system\": 727.0,\n", - " \"main\": 506.0,\n", + " \"system_capacity\": 2048.0,\n", + " \"system\": 353.0,\n", + " \"main\": 277.0,\n", " 
\"descendents\": 0.0,\n", - " \"combined\": 506.0\n", + " \"combined\": 277.0\n", " },\n", " \"cpu_utilization\": {\n", " \"system_core_count\": 12,\n", + " \"n_expected_cores\": 12,\n", " \"system\": {\n", - " \"max_core_percent\": 148.90000000000003,\n", - " \"max_cpu_percent\": 12.408333333333337,\n", - " \"mean_core_percent\": 124.70000000000003,\n", - " \"mean_cpu_percent\": 10.39166666666667\n", + " \"max_sum_percent\": 164.3,\n", + " \"max_hardware_percent\": 13.691666666666668,\n", + " \"mean_sum_percent\": 152.325,\n", + " \"mean_hardware_percent\": 12.693750000000001\n", " },\n", " \"main\": {\n", - " \"max_core_percent\": 99.9,\n", - " \"max_cpu_percent\": 8.325000000000001,\n", - " \"mean_core_percent\": 97.53333333333335,\n", - " \"mean_cpu_percent\": 8.127777777777778\n", + " \"max_sum_percent\": 102.6,\n", + " \"max_hardware_percent\": 8.549999999999999,\n", + " \"mean_sum_percent\": 91.25833333333334,\n", + " \"mean_hardware_percent\": 7.604861111111112\n", " },\n", " \"descendents\": {\n", - " \"max_core_percent\": 0.0,\n", - " \"max_cpu_percent\": 0.0,\n", - " \"mean_core_percent\": 0.0,\n", - " \"mean_cpu_percent\": 0.0\n", + " \"max_sum_percent\": 0.0,\n", + " \"max_hardware_percent\": 0.0,\n", + " \"mean_sum_percent\": 0.0,\n", + " \"mean_hardware_percent\": 0.0\n", " },\n", " \"combined\": {\n", - " \"max_core_percent\": 99.9,\n", - " \"max_cpu_percent\": 8.325000000000001,\n", - " \"mean_core_percent\": 97.53333333333335,\n", - " \"mean_cpu_percent\": 8.127777777777778\n", + " \"max_sum_percent\": 102.6,\n", + " \"max_hardware_percent\": 8.549999999999999,\n", + " \"mean_sum_percent\": 91.25833333333334,\n", + " \"mean_hardware_percent\": 7.604861111111112\n", " },\n", - " \"main_n_threads\": 15,\n", + " \"main_n_threads\": 24,\n", " \"descendents_n_threads\": 0,\n", - " \"combined_n_threads\": 15\n", + " \"combined_n_threads\": 24\n", + " },\n", + " \"gpu_utilization\": {\n", + " \"system_gpu_count\": 1,\n", + " \"n_expected_gpus\": 1,\n", + 
" \"gpu_percentages\": {\n", + " \"max_sum_percent\": 6.0,\n", + " \"max_hardware_percent\": 6.0,\n", + " \"mean_sum_percent\": 0.5,\n", + " \"mean_hardware_percent\": 0.5\n", + " }\n", " },\n", " \"compute_time\": {\n", " \"unit\": \"seconds\",\n", - " \"time\": 2.5198354721069336\n", + " \"time\": 3.345628023147583\n", " }\n", "}\n" ] @@ -350,7 +381,7 @@ { "data": { "text/plain": [ - "MaxRAM(unit='megabytes', system_capacity=67254.1696, system=5721.3952, main=RSSValues(total_rss=850.399232, private_rss=634.077184, shared_rss=216.547328), descendents=RSSValues(total_rss=0.0, private_rss=0.0, shared_rss=0.0), combined=RSSValues(total_rss=858.7632639999999, private_rss=642.445312, shared_rss=216.526848))" + "MaxRAM(unit='megabytes', system_capacity=63088.2304, system=2399.9201279999997, main=RSSValues(total_rss=890.7038719999999, private_rss=674.05824, shared_rss=216.92416), descendents=RSSValues(total_rss=0.0, private_rss=0.0, shared_rss=0.0), combined=RSSValues(total_rss=901.2633599999999, private_rss=684.6177279999999, shared_rss=216.67839999999998))" ] }, "execution_count": 7, @@ -392,7 +423,7 @@ { "data": { "text/plain": [ - "RSSValues(total_rss=850.399232, private_rss=634.077184, shared_rss=216.547328)" + "RSSValues(total_rss=890.7038719999999, private_rss=674.05824, shared_rss=216.92416)" ] }, "execution_count": 9, @@ -413,7 +444,7 @@ { "data": { "text/plain": [ - "850.399232" + "890.7038719999999" ] }, "execution_count": 10, @@ -434,7 +465,7 @@ { "data": { "text/plain": [ - "MaxGPURAM(unit='megabytes', system_capacity=16376.0, system=727.0, main=506.0, descendents=0.0, combined=506.0)" + "MaxGPURAM(unit='megabytes', system_capacity=2048.0, system=353.0, main=277.0, descendents=0.0, combined=277.0)" ] }, "execution_count": 11, @@ -455,7 +486,7 @@ { "data": { "text/plain": [ - "ComputeTime(unit='seconds', time=2.5198354721069336)" + "ComputeTime(unit='seconds', time=3.345628023147583)" ] }, "execution_count": 12, @@ -486,7 +517,7 @@ "output_type": "stream", 
"text": [ "The following error occured while tracking: AN ERROR\n", - "0.506\n" + "0.277\n" ] } ], @@ -511,7 +542,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 23, "id": "f429ced6-573b-4f0f-ad64-658e9c05242d", "metadata": {}, "outputs": [ @@ -521,52 +552,61 @@ "text": [ "Max RAM:\n", " Unit: gigabytes\n", - " System capacity: 67.254\n", - " System: 5.938\n", + " System capacity: 63.088\n", + " System: 2.877\n", " Main:\n", - " Total RSS: 0.798\n", - " Private RSS: 0.491\n", - " Shared RSS: 0.311\n", + " Total RSS: 0.844\n", + " Private RSS: 0.525\n", + " Shared RSS: 0.319\n", " Descendents:\n", - " Total RSS: 0.85\n", - " Private RSS: 0.728\n", - " Shared RSS: 0.122\n", + " Total RSS: 0.831\n", + " Private RSS: 0.704\n", + " Shared RSS: 0.127\n", " Combined:\n", - " Total RSS: 1.451\n", - " Private RSS: 1.144\n", - " Shared RSS: 0.311\n", + " Total RSS: 1.462\n", + " Private RSS: 1.148\n", + " Shared RSS: 0.32\n", "Max GPU RAM:\n", " Unit: gigabytes\n", - " System capacity: 16.376\n", - " System: 1.043\n", - " Main: 0.506\n", - " Descendents: 0.314\n", - " Combined: 0.82\n", + " System capacity: 2.048\n", + " System: 0.631\n", + " Main: 0.277\n", + " Descendents: 0.277\n", + " Combined: 0.554\n", "CPU utilization:\n", " System core count: 12\n", + " Number of expected cores: 2\n", " System:\n", - " Max core percent: 225.5\n", - " Max CPU percent: 18.792\n", - " Mean core percent: 187.575\n", - " Mean CPU percent: 15.631\n", + " Max sum percent: 398.9\n", + " Max hardware percent: 33.242\n", + " Mean sum percent: 222.255\n", + " Mean hardware percent: 18.521\n", " Main:\n", - " Max core percent: 99.6\n", - " Max CPU percent: 8.3\n", - " Mean core percent: 74.15\n", - " Mean CPU percent: 6.179\n", + " Max sum percent: 103.8\n", + " Max hardware percent: 51.9\n", + " Mean sum percent: 66.009\n", + " Mean hardware percent: 33.005\n", " Descendents:\n", - " Max core percent: 101.2\n", - " Max CPU percent: 8.433\n", - " Mean core percent: 
74.125\n", - " Mean CPU percent: 6.177\n", + " Max sum percent: 308.5\n", + " Max hardware percent: 154.25\n", + " Mean sum percent: 117.109\n", + " Mean hardware percent: 58.555\n", " Combined:\n", - " Max core percent: 198.7\n", - " Max CPU percent: 16.558\n", - " Mean core percent: 148.275\n", - " Mean CPU percent: 12.356\n", - " Main number of threads: 15\n", - " Descendents number of threads: 5\n", - " Combined number of threads: 20\n", + " Max sum percent: 409.2\n", + " Max hardware percent: 204.6\n", + " Mean sum percent: 183.118\n", + " Mean hardware percent: 91.559\n", + " Main number of threads: 24\n", + " Descendents number of threads: 16\n", + " Combined number of threads: 40\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 6.0\n", + " Max hardware percent: 6.0\n", + " Mean sum percent: 0.545\n", + " Mean hardware percent: 0.545\n", "Compute time:\n", " Unit: hours\n", " Time: 0.001\n" @@ -577,11 +617,11 @@ "import multiprocessing as mp\n", "ctx = mp.get_context(method='spawn')\n", "child_process = ctx.Process(target=example_function)\n", - "with gput.Tracker() as tracker:\n", + "with gput.Tracker(n_expected_cores=2, sleep_time=0.2) as tracker:\n", " child_process.start()\n", " example_function()\n", " child_process.join()\n", - " child_process.close()\n", + "child_process.close()\n", "print(tracker)" ] }, @@ -611,12 +651,12 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tracks the computational resource usage (RAM, GPU RAM, and compute time) of a process corresponding to a given shell command.\n", + "Tracks the computational resource usage (RAM, GPU RAM, CPU utilization, GPU utilization, and compute time) of a process corresponding to a given shell command.\n", "\n", "Usage:\n", " gpu-tracker -h | --help\n", " gpu-tracker -v | --version\n", - " gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--disable-logs]\n", + " 
gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--nec=] [--guuids=] [--disable-logs]\n", "\n", "Options:\n", " -h --help Show this help message and exit.\n", @@ -628,6 +668,8 @@ " --ru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'.\n", " --gru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'.\n", " --tu= One of 'seconds', 'minutes', 'hours', or 'days'.\n", + " --nec= The number of cores expected to be used. Defaults to the number of cores in the entire operating system.\n", + " --guuids= Comma separated list of the UUIDs of the GPUs for which to track utilization e.g. gpu-uuid1,gpu-uuid2,etc. Defaults to all the GPUs in the system.\n", " --disable-logs If set, warnings are suppressed during tracking. Otherwise, the Tracker logs warnings as usual.\n" ] } @@ -646,7 +688,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "id": "ea7c710f-a238-460d-836c-a979e1c72f4f", "metadata": {}, "outputs": [ @@ -657,52 +699,61 @@ "Resource tracking complete. 
Process completed with status code: 0\n", "Max RAM:\n", " Unit: gigabytes\n", - " System capacity: 67.254\n", - " System: 5.964\n", + " System capacity: 63.088\n", + " System: 2.3\n", " Main:\n", " Total RSS: 0.003\n", " Private RSS: 0.0\n", " Shared RSS: 0.003\n", " Descendents:\n", - " Total RSS: 0.847\n", - " Private RSS: 0.724\n", - " Shared RSS: 0.122\n", + " Total RSS: 0.917\n", + " Private RSS: 0.905\n", + " Shared RSS: 0.012\n", " Combined:\n", - " Total RSS: 0.856\n", - " Private RSS: 0.733\n", - " Shared RSS: 0.123\n", + " Total RSS: 0.925\n", + " Private RSS: 0.912\n", + " Shared RSS: 0.013\n", "Max GPU RAM:\n", " Unit: gigabytes\n", - " System capacity: 16.376\n", - " System: 1.043\n", + " System capacity: 2.048\n", + " System: 0.193\n", " Main: 0.0\n", - " Descendents: 0.314\n", - " Combined: 0.314\n", + " Descendents: 0.117\n", + " Combined: 0.117\n", "CPU utilization:\n", " System core count: 12\n", + " Number of expected cores: 12\n", " System:\n", - " Max core percent: 177.6\n", - " Max CPU percent: 14.8\n", - " Mean core percent: 134.375\n", - " Mean CPU percent: 11.198\n", + " Max sum percent: 309.5\n", + " Max hardware percent: 25.792\n", + " Mean sum percent: 159.073\n", + " Mean hardware percent: 13.256\n", " Main:\n", - " Max core percent: 0.0\n", - " Max CPU percent: 0.0\n", - " Mean core percent: 0.0\n", - " Mean CPU percent: 0.0\n", + " Max sum percent: 0.0\n", + " Max hardware percent: 0.0\n", + " Mean sum percent: 0.0\n", + " Mean hardware percent: 0.0\n", " Descendents:\n", - " Max core percent: 100.4\n", - " Max CPU percent: 8.367\n", - " Mean core percent: 95.45\n", - " Mean CPU percent: 7.954\n", + " Max sum percent: 493.1\n", + " Max hardware percent: 41.092\n", + " Mean sum percent: 134.427\n", + " Mean hardware percent: 11.202\n", " Combined:\n", - " Max core percent: 100.4\n", - " Max CPU percent: 8.367\n", - " Mean core percent: 95.45\n", - " Mean CPU percent: 7.954\n", + " Max sum percent: 493.1\n", + " Max hardware percent: 
41.092\n", + " Mean sum percent: 134.427\n", + " Mean hardware percent: 11.202\n", " Main number of threads: 1\n", - " Descendents number of threads: 4\n", - " Combined number of threads: 5\n", + " Descendents number of threads: 15\n", + " Combined number of threads: 16\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 4.0\n", + " Max hardware percent: 4.0\n", + " Mean sum percent: 0.364\n", + " Mean hardware percent: 0.364\n", "Compute time:\n", " Unit: hours\n", " Time: 0.001\n" @@ -710,7 +761,7 @@ } ], "source": [ - "!gpu-tracker -e \"bash example-script.sh\"" + "!gpu-tracker -e \"bash example-script.sh\" --st=0.3" ] }, { @@ -731,7 +782,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 8, "id": "cff099a7-1070-42ba-9f2a-008d58863fe6", "metadata": {}, "outputs": [ @@ -742,60 +793,69 @@ "Resource tracking complete. Process completed with status code: 0\n", "Max RAM:\n", " Unit: megabytes\n", - " System capacity: 67254.17\n", - " System: 5784.379\n", + " System capacity: 63088.23\n", + " System: 2242.593\n", " Main:\n", - " Total RSS: 3.076\n", - " Private RSS: 0.324\n", - " Shared RSS: 2.753\n", + " Total RSS: 3.039\n", + " Private RSS: 0.315\n", + " Shared RSS: 2.724\n", " Descendents:\n", - " Total RSS: 838.545\n", - " Private RSS: 716.681\n", - " Shared RSS: 121.864\n", + " Total RSS: 832.487\n", + " Private RSS: 705.831\n", + " Shared RSS: 126.657\n", " Combined:\n", - " Total RSS: 847.249\n", - " Private RSS: 724.492\n", - " Shared RSS: 122.757\n", + " Total RSS: 841.482\n", + " Private RSS: 713.867\n", + " Shared RSS: 127.992\n", "Max GPU RAM:\n", " Unit: megabytes\n", - " System capacity: 16376.0\n", - " System: 1043.0\n", + " System capacity: 2048.0\n", + " System: 631.0\n", " Main: 0.0\n", - " Descendents: 314.0\n", - " Combined: 314.0\n", + " Descendents: 277.0\n", + " Combined: 277.0\n", "CPU utilization:\n", " System core count: 12\n", + " 
Number of expected cores: 12\n", " System:\n", - " Max core percent: 188.7\n", - " Max CPU percent: 15.725\n", - " Mean core percent: 136.45\n", - " Mean CPU percent: 11.371\n", + " Max sum percent: 362.6\n", + " Max hardware percent: 30.217\n", + " Mean sum percent: 156.853\n", + " Mean hardware percent: 13.071\n", " Main:\n", - " Max core percent: 0.0\n", - " Max CPU percent: 0.0\n", - " Mean core percent: 0.0\n", - " Mean CPU percent: 0.0\n", + " Max sum percent: 0.0\n", + " Max hardware percent: 0.0\n", + " Mean sum percent: 0.0\n", + " Mean hardware percent: 0.0\n", " Descendents:\n", - " Max core percent: 96.2\n", - " Max CPU percent: 8.017\n", - " Mean core percent: 94.55\n", - " Mean CPU percent: 7.879\n", + " Max sum percent: 512.8\n", + " Max hardware percent: 42.733\n", + " Mean sum percent: 120.333\n", + " Mean hardware percent: 10.028\n", " Combined:\n", - " Max core percent: 96.2\n", - " Max CPU percent: 8.017\n", - " Mean core percent: 94.55\n", - " Mean CPU percent: 7.879\n", + " Max sum percent: 512.8\n", + " Max hardware percent: 42.733\n", + " Mean sum percent: 120.333\n", + " Mean hardware percent: 10.028\n", " Main number of threads: 1\n", - " Descendents number of threads: 4\n", - " Combined number of threads: 5\n", + " Descendents number of threads: 15\n", + " Combined number of threads: 16\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 4.0\n", + " Max hardware percent: 4.0\n", + " Mean sum percent: 0.267\n", + " Mean hardware percent: 0.267\n", "Compute time:\n", " Unit: seconds\n", - " Time: 3.566\n" + " Time: 4.931\n" ] } ], "source": [ - "!gpu-tracker -e 'bash example-script.sh' --tu=seconds --gru=megabytes --ru=megabytes" + "!gpu-tracker -e 'bash example-script.sh' --tu=seconds --gru=megabytes --ru=megabytes --st=0.2" ] }, { @@ -808,7 +868,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 12, "id": 
"a8520fd9-0907-4c0c-a68f-8fdaec040e1a", "metadata": {}, "outputs": [ @@ -821,12 +881,12 @@ } ], "source": [ - "!gpu-tracker -e 'bash example-script.sh' -o out.txt " + "!gpu-tracker -e 'bash example-script.sh' -o out.txt --st=0.2" ] }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 13, "id": "213550b7-d808-4e11-be37-f2f892e4834b", "metadata": {}, "outputs": [ @@ -836,52 +896,61 @@ "text": [ "Max RAM:\n", " Unit: gigabytes\n", - " System capacity: 67.254\n", - " System: 5.584\n", + " System capacity: 63.088\n", + " System: 2.683\n", " Main:\n", " Total RSS: 0.003\n", " Private RSS: 0.0\n", " Shared RSS: 0.003\n", " Descendents:\n", - " Total RSS: 0.853\n", - " Private RSS: 0.731\n", - " Shared RSS: 0.122\n", + " Total RSS: 0.843\n", + " Private RSS: 0.717\n", + " Shared RSS: 0.127\n", " Combined:\n", - " Total RSS: 0.862\n", - " Private RSS: 0.739\n", - " Shared RSS: 0.123\n", + " Total RSS: 0.852\n", + " Private RSS: 0.725\n", + " Shared RSS: 0.128\n", "Max GPU RAM:\n", " Unit: gigabytes\n", - " System capacity: 16.376\n", - " System: 1.043\n", + " System capacity: 2.048\n", + " System: 0.631\n", " Main: 0.0\n", - " Descendents: 0.314\n", - " Combined: 0.314\n", + " Descendents: 0.277\n", + " Combined: 0.277\n", "CPU utilization:\n", " System core count: 12\n", + " Number of expected cores: 12\n", " System:\n", - " Max core percent: 187.6\n", - " Max CPU percent: 15.633\n", - " Mean core percent: 137.675\n", - " Mean CPU percent: 11.473\n", + " Max sum percent: 383.8\n", + " Max hardware percent: 31.983\n", + " Mean sum percent: 166.507\n", + " Mean hardware percent: 13.876\n", " Main:\n", - " Max core percent: 0.0\n", - " Max CPU percent: 0.0\n", - " Mean core percent: 0.0\n", - " Mean CPU percent: 0.0\n", + " Max sum percent: 0.0\n", + " Max hardware percent: 0.0\n", + " Mean sum percent: 0.0\n", + " Mean hardware percent: 0.0\n", " Descendents:\n", - " Max core percent: 101.3\n", - " Max CPU percent: 8.442\n", - " Mean core percent: 
97.675\n", - " Mean CPU percent: 8.14\n", + " Max sum percent: 528.4\n", + " Max hardware percent: 44.033\n", + " Mean sum percent: 128.014\n", + " Mean hardware percent: 10.668\n", " Combined:\n", - " Max core percent: 101.3\n", - " Max CPU percent: 8.442\n", - " Mean core percent: 97.675\n", - " Mean CPU percent: 8.14\n", + " Max sum percent: 528.4\n", + " Max hardware percent: 44.033\n", + " Mean sum percent: 128.014\n", + " Mean hardware percent: 10.668\n", " Main number of threads: 1\n", - " Descendents number of threads: 4\n", - " Combined number of threads: 5\n", + " Descendents number of threads: 15\n", + " Combined number of threads: 16\n", + "GPU utilization:\n", + " System GPU count: 1\n", + " Number of expected GPUs: 1\n", + " GPU percentages:\n", + " Max sum percent: 7.0\n", + " Max hardware percent: 7.0\n", + " Mean sum percent: 0.643\n", + " Mean hardware percent: 0.643\n", "Compute time:\n", " Unit: hours\n", " Time: 0.001" @@ -902,7 +971,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 26, "id": "f6fd29d2-cad6-4f9c-8af8-ccf4f0e721d3", "metadata": {}, "outputs": [ @@ -914,77 +983,88 @@ "{\n", " \"max_ram\": {\n", " \"unit\": \"gigabytes\",\n", - " \"system_capacity\": 67.2541696,\n", - " \"system\": 5.720379392000001,\n", + " \"system_capacity\": 63.0882304,\n", + " \"system\": 3.111936,\n", " \"main\": {\n", - " \"total_rss\": 0.003084288,\n", - " \"private_rss\": 0.00031948800000000004,\n", - " \"shared_rss\": 0.0027648\n", + " \"total_rss\": 0.003059712,\n", + " \"private_rss\": 0.000339968,\n", + " \"shared_rss\": 0.002719744\n", " },\n", " \"descendents\": {\n", - " \"total_rss\": 0.854237184,\n", - " \"private_rss\": 0.73218048,\n", - " \"shared_rss\": 0.122056704\n", + " \"total_rss\": 0.846565376,\n", + " \"private_rss\": 0.7198023680000001,\n", + " \"shared_rss\": 0.12713984\n", " },\n", " \"combined\": {\n", - " \"total_rss\": 0.863256576,\n", - " \"private_rss\": 0.7403069440000001,\n", - " \"shared_rss\": 
0.122949632\n", + " \"total_rss\": 0.8552325120000001,\n", + " \"private_rss\": 0.727576576,\n", + " \"shared_rss\": 0.12803276800000002\n", " }\n", " },\n", " \"max_gpu_ram\": {\n", " \"unit\": \"gigabytes\",\n", - " \"system_capacity\": 16.376,\n", - " \"system\": 1.043,\n", + " \"system_capacity\": 2.048,\n", + " \"system\": 0.631,\n", " \"main\": 0.0,\n", - " \"descendents\": 0.314,\n", - " \"combined\": 0.314\n", + " \"descendents\": 0.277,\n", + " \"combined\": 0.277\n", " },\n", " \"cpu_utilization\": {\n", " \"system_core_count\": 12,\n", + " \"n_expected_cores\": 12,\n", " \"system\": {\n", - " \"max_core_percent\": 260.00000000000006,\n", - " \"max_cpu_percent\": 21.66666666666667,\n", - " \"mean_core_percent\": 159.35000000000002,\n", - " \"mean_cpu_percent\": 13.279166666666669\n", + " \"max_sum_percent\": 384.5999999999999,\n", + " \"max_hardware_percent\": 32.04999999999999,\n", + " \"mean_sum_percent\": 167.49285714285716,\n", + " \"mean_hardware_percent\": 13.957738095238097\n", " },\n", " \"main\": {\n", - " \"max_core_percent\": 0.0,\n", - " \"max_cpu_percent\": 0.0,\n", - " \"mean_core_percent\": 0.0,\n", - " \"mean_cpu_percent\": 0.0\n", + " \"max_sum_percent\": 0.0,\n", + " \"max_hardware_percent\": 0.0,\n", + " \"mean_sum_percent\": 0.0,\n", + " \"mean_hardware_percent\": 0.0\n", " },\n", " \"descendents\": {\n", - " \"max_core_percent\": 102.9,\n", - " \"max_cpu_percent\": 8.575000000000001,\n", - " \"mean_core_percent\": 97.475,\n", - " \"mean_cpu_percent\": 8.122916666666667\n", + " \"max_sum_percent\": 526.0,\n", + " \"max_hardware_percent\": 43.833333333333336,\n", + " \"mean_sum_percent\": 128.65,\n", + " \"mean_hardware_percent\": 10.720833333333333\n", " },\n", " \"combined\": {\n", - " \"max_core_percent\": 102.9,\n", - " \"max_cpu_percent\": 8.575000000000001,\n", - " \"mean_core_percent\": 97.475,\n", - " \"mean_cpu_percent\": 8.122916666666667\n", + " \"max_sum_percent\": 526.0,\n", + " \"max_hardware_percent\": 
43.833333333333336,\n", + " \"mean_sum_percent\": 128.65,\n", + " \"mean_hardware_percent\": 10.720833333333333\n", " },\n", " \"main_n_threads\": 1,\n", - " \"descendents_n_threads\": 4,\n", - " \"combined_n_threads\": 5\n", + " \"descendents_n_threads\": 15,\n", + " \"combined_n_threads\": 16\n", + " },\n", + " \"gpu_utilization\": {\n", + " \"system_gpu_count\": 1,\n", + " \"n_expected_gpus\": 1,\n", + " \"gpu_percentages\": {\n", + " \"max_sum_percent\": 7.0,\n", + " \"max_hardware_percent\": 7.0,\n", + " \"mean_sum_percent\": 0.5,\n", + " \"mean_hardware_percent\": 0.5\n", + " }\n", " },\n", " \"compute_time\": {\n", " \"unit\": \"hours\",\n", - " \"time\": 0.001005272732840644\n", + " \"time\": 0.0012672905127207438\n", " }\n", "}\n" ] } ], "source": [ - "!gpu-tracker -e 'bash example-script.sh' -f json" + "!gpu-tracker -e 'bash example-script.sh' -f json --st=0.2" ] }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 27, "id": "5c825e42-d100-4533-b218-c36f6380e6ed", "metadata": {}, "outputs": [ @@ -997,12 +1077,12 @@ } ], "source": [ - "!gpu-tracker -e 'bash example-script.sh' -f json -o out.json" + "!gpu-tracker -e 'bash example-script.sh' -f json -o out.json --st=0.3" ] }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 28, "id": "c821972e-0bed-4245-8933-27b0b28589de", "metadata": {}, "outputs": [ @@ -1013,65 +1093,76 @@ "{\n", " \"max_ram\": {\n", " \"unit\": \"gigabytes\",\n", - " \"system_capacity\": 67.2541696,\n", - " \"system\": 5.560373248,\n", + " \"system_capacity\": 63.0882304,\n", + " \"system\": 2.878910464,\n", " \"main\": {\n", - " \"total_rss\": 0.002957312,\n", - " \"private_rss\": 0.000323584,\n", - " \"shared_rss\": 0.002633728\n", + " \"total_rss\": 0.0029777920000000004,\n", + " \"private_rss\": 0.00031948800000000004,\n", + " \"shared_rss\": 0.0026583040000000002\n", " },\n", " \"descendents\": {\n", - " \"total_rss\": 0.848539648,\n", - " \"private_rss\": 0.726519808,\n", - " \"shared_rss\": 
0.12201984\n", + " \"total_rss\": 0.8333844480000001,\n", + " \"private_rss\": 0.7066091520000001,\n", + " \"shared_rss\": 0.127152128\n", " },\n", " \"combined\": {\n", - " \"total_rss\": 0.857731072,\n", - " \"private_rss\": 0.734818304,\n", - " \"shared_rss\": 0.122912768\n", + " \"total_rss\": 0.841486336,\n", + " \"private_rss\": 0.713818112,\n", + " \"shared_rss\": 0.12804505600000002\n", " }\n", " },\n", " \"max_gpu_ram\": {\n", " \"unit\": \"gigabytes\",\n", - " \"system_capacity\": 16.376,\n", - " \"system\": 1.043,\n", + " \"system_capacity\": 2.048,\n", + " \"system\": 0.631,\n", " \"main\": 0.0,\n", - " \"descendents\": 0.314,\n", - " \"combined\": 0.314\n", + " \"descendents\": 0.277,\n", + " \"combined\": 0.277\n", " },\n", " \"cpu_utilization\": {\n", " \"system_core_count\": 12,\n", + " \"n_expected_cores\": 12,\n", " \"system\": {\n", - " \"max_core_percent\": 192.5,\n", - " \"max_cpu_percent\": 16.041666666666668,\n", - " \"mean_core_percent\": 154.22500000000002,\n", - " \"mean_cpu_percent\": 12.852083333333335\n", + " \"max_sum_percent\": 306.09999999999997,\n", + " \"max_hardware_percent\": 25.50833333333333,\n", + " \"mean_sum_percent\": 161.4272727272727,\n", + " \"mean_hardware_percent\": 13.452272727272724\n", " },\n", " \"main\": {\n", - " \"max_core_percent\": 0.0,\n", - " \"max_cpu_percent\": 0.0,\n", - " \"mean_core_percent\": 0.0,\n", - " \"mean_cpu_percent\": 0.0\n", + " \"max_sum_percent\": 0.0,\n", + " \"max_hardware_percent\": 0.0,\n", + " \"mean_sum_percent\": 0.0,\n", + " \"mean_hardware_percent\": 0.0\n", " },\n", " \"descendents\": {\n", - " \"max_core_percent\": 104.1,\n", - " \"max_cpu_percent\": 8.674999999999999,\n", - " \"mean_core_percent\": 97.7,\n", - " \"mean_cpu_percent\": 8.141666666666667\n", + " \"max_sum_percent\": 440.2,\n", + " \"max_hardware_percent\": 36.68333333333333,\n", + " \"mean_sum_percent\": 128.27272727272728,\n", + " \"mean_hardware_percent\": 10.68939393939394\n", " },\n", " \"combined\": {\n", - " 
\"max_core_percent\": 104.1,\n", - " \"max_cpu_percent\": 8.674999999999999,\n", - " \"mean_core_percent\": 97.7,\n", - " \"mean_cpu_percent\": 8.141666666666667\n", + " \"max_sum_percent\": 440.2,\n", + " \"max_hardware_percent\": 36.68333333333333,\n", + " \"mean_sum_percent\": 128.27272727272728,\n", + " \"mean_hardware_percent\": 10.68939393939394\n", " },\n", " \"main_n_threads\": 1,\n", - " \"descendents_n_threads\": 4,\n", - " \"combined_n_threads\": 5\n", + " \"descendents_n_threads\": 15,\n", + " \"combined_n_threads\": 16\n", + " },\n", + " \"gpu_utilization\": {\n", + " \"system_gpu_count\": 1,\n", + " \"n_expected_gpus\": 1,\n", + " \"gpu_percentages\": {\n", + " \"max_sum_percent\": 7.0,\n", + " \"max_hardware_percent\": 7.0,\n", + " \"mean_sum_percent\": 0.6363636363636364,\n", + " \"mean_hardware_percent\": 0.6363636363636364\n", + " }\n", " },\n", " \"compute_time\": {\n", " \"unit\": \"hours\",\n", - " \"time\": 0.000995432734489441\n", + " \"time\": 0.0012816817230648465\n", " }\n", "}" ] @@ -1098,7 +1189,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.12" + "version": "3.12.2" } }, "nbformat": 4, diff --git a/docs/tutorial.rst b/docs/tutorial.rst index 37aa49d..a42c756 100644 --- a/docs/tutorial.rst +++ b/docs/tutorial.rst @@ -8,14 +8,15 @@ API The ``gpu_tracker`` package provides the ``Tracker`` class which uses a subprocess to measure computational resource usage, namely the compute -time, CPU utilization, maximum RAM used, and maximum GPU RAM used. The -``start()`` method starts this process which tracks usage in the +time, maximum CPU utilization, mean CPU utilization, maximum RAM used, +maximum GPU utilization, mean GPU utilization, and maximum GPU RAM used. +The ``start()`` method starts this process which tracks usage in the background. After calling ``start()``, one can write the code for which resource usage is measured, followed by calling the ``stop()`` method. 
The compute time will be the time from the call to ``start()`` to the -call to ``stop()`` and the RAM, GPU RAM, and CPU utilization quantities -will be the respective computational resources used by the code that’s -in between ``start()`` and ``stop()``. +call to ``stop()`` and the RAM, GPU RAM, CPU utilization, and GPU +utilization quantities will be the respective computational resources +used by the code that’s in between ``start()`` and ``stop()``. .. code:: python3 @@ -24,7 +25,7 @@ in between ``start()`` and ``stop()``. .. code:: python3 - tracker = gput.Tracker() + tracker = gput.Tracker(n_expected_cores=1, sleep_time=0.1) tracker.start() example_function() tracker.stop() @@ -42,52 +43,61 @@ resource formatted. Max RAM: Unit: gigabytes - System capacity: 67.254 - System: 5.21 + System capacity: 63.088 + System: 1.899 Main: - Total RSS: 0.827 - Private RSS: 0.674 - Shared RSS: 0.154 + Total RSS: 0.914 + Private RSS: 0.753 + Shared RSS: 0.161 Descendents: Total RSS: 0.0 Private RSS: 0.0 Shared RSS: 0.0 Combined: - Total RSS: 0.834 - Private RSS: 0.681 - Shared RSS: 0.154 + Total RSS: 0.883 + Private RSS: 0.723 + Shared RSS: 0.161 Max GPU RAM: Unit: gigabytes - System capacity: 16.376 - System: 0.535 - Main: 0.314 + System capacity: 2.048 + System: 0.353 + Main: 0.277 Descendents: 0.0 - Combined: 0.314 + Combined: 0.277 CPU utilization: System core count: 12 + Number of expected cores: 1 System: - Max core percent: 150.6 - Max CPU percent: 12.55 - Mean core percent: 122.9 - Mean CPU percent: 10.242 + Max sum percent: 169.7 + Max hardware percent: 14.142 + Mean sum percent: 150.183 + Mean hardware percent: 12.515 Main: - Max core percent: 98.6 - Max CPU percent: 8.217 - Mean core percent: 96.8 - Mean CPU percent: 8.067 + Max sum percent: 101.2 + Max hardware percent: 101.2 + Mean sum percent: 93.158 + Mean hardware percent: 93.158 Descendents: - Max core percent: 0.0 - Max CPU percent: 0.0 - Mean core percent: 0.0 - Mean CPU percent: 0.0 + Max sum percent: 0.0 + 
Max hardware percent: 0.0 + Mean sum percent: 0.0 + Mean hardware percent: 0.0 Combined: - Max core percent: 98.6 - Max CPU percent: 8.217 - Mean core percent: 96.8 - Mean CPU percent: 8.067 - Main number of threads: 15 + Max sum percent: 101.2 + Max hardware percent: 101.2 + Mean sum percent: 93.158 + Mean hardware percent: 93.158 + Main number of threads: 24 Descendents number of threads: 0 - Combined number of threads: 15 + Combined number of threads: 24 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 4.0 + Max hardware percent: 4.0 + Mean sum percent: 0.333 + Mean hardware percent: 0.333 Compute time: Unit: hours Time: 0.001 @@ -114,21 +124,38 @@ never exceed the overall system RAM usage, but inaccuracies resulting from shared RSS can cause this to happen, especially for non-linux operating systems (see note below). -The ``Tracker`` assumes that GPU memory is not shared accross multiple +The ``Tracker`` assumes that GPU memory is not shared across multiple processes and if it is, the reported GPU RAM of “descendent” and “combined” may be an overestimation. The CPU utilization includes the system core count field which is the total number of cores available system-wide. Utilization is measured for the main process, its descendents, the main process and its descendents -combined, and CPU utilization across the entire system. The core percent -is the sum of the percentages of all the cores being used. The CPU -percent is that divided by the system core count. The max percent is the -highest percentage detected through the duration of tracking while the -mean percent is the average of all the percentages detected over that -duration. The CPU utilization concludes with the maximum number of -threads used at any time for the main process and the sum of the threads -used accross its descendent processes and combined. +combined, and CPU utilization across the entire system. 
The sum percent +is the sum of the percentages of all the cores being used. The hardware +percent is that divided by the expected number of cores being used, +i.e. the optional ``n_expected_cores`` parameter (defaults to the number +of cores in the entire system) for the main, descendents, and combined +measurements. For the system measurements, the sum percent is divided +by the total number of cores in the system regardless of the value of +``n_expected_cores``. The max percent is the highest percentage detected +through the duration of tracking while the mean percent is the average +of all the percentages detected over that duration. The CPU utilization +concludes with the maximum number of threads used at any time for the +main process and the sum of the threads used across its descendent +processes and combined. + +The GPU utilization is similar to the CPU utilization but rather than +being based on utilization of processes, it can only measure the +utilization percentages of the GPUs themselves, regardless of what +processes are using them. To ameliorate this limitation, the optional +``gpu_uuids`` parameter can be set to specify which GPUs to measure +utilization for (defaults to all the GPUs in the system). The system GPU +count is the total number of GPUs in the system. The sum percent is the +sum of all the percentages of these GPUs and the hardware percent is +that divided by the expected number of GPUs being used +(i.e. ``len(gpu_uuids)``). As with CPU utilization, the max and +mean of both the sum and hardware percentages are provided. The compute time is the real time that the computational-resource tracking lasted (as compared to CPU time). @@ -154,7 +181,7 @@ the compute time in seconds: ..
code:: python3 - with gput.Tracker(ram_unit='megabytes', gpu_ram_unit='megabytes', time_unit='seconds') as tracker: + with gput.Tracker(ram_unit='megabytes', gpu_ram_unit='megabytes', time_unit='seconds', sleep_time=0.1) as tracker: example_function() print(tracker) @@ -163,55 +190,64 @@ the compute time in seconds: Max RAM: Unit: megabytes - System capacity: 67254.17 - System: 5721.395 + System capacity: 63088.23 + System: 2399.92 Main: - Total RSS: 850.399 - Private RSS: 634.077 - Shared RSS: 216.547 + Total RSS: 890.704 + Private RSS: 674.058 + Shared RSS: 216.924 Descendents: Total RSS: 0.0 Private RSS: 0.0 Shared RSS: 0.0 Combined: - Total RSS: 858.763 - Private RSS: 642.445 - Shared RSS: 216.527 + Total RSS: 901.263 + Private RSS: 684.618 + Shared RSS: 216.678 Max GPU RAM: Unit: megabytes - System capacity: 16376.0 - System: 727.0 - Main: 506.0 + System capacity: 2048.0 + System: 353.0 + Main: 277.0 Descendents: 0.0 - Combined: 506.0 + Combined: 277.0 CPU utilization: System core count: 12 + Number of expected cores: 12 System: - Max core percent: 148.9 - Max CPU percent: 12.408 - Mean core percent: 124.7 - Mean CPU percent: 10.392 + Max sum percent: 164.3 + Max hardware percent: 13.692 + Mean sum percent: 152.325 + Mean hardware percent: 12.694 Main: - Max core percent: 99.9 - Max CPU percent: 8.325 - Mean core percent: 97.533 - Mean CPU percent: 8.128 + Max sum percent: 102.6 + Max hardware percent: 8.55 + Mean sum percent: 91.258 + Mean hardware percent: 7.605 Descendents: - Max core percent: 0.0 - Max CPU percent: 0.0 - Mean core percent: 0.0 - Mean CPU percent: 0.0 + Max sum percent: 0.0 + Max hardware percent: 0.0 + Mean sum percent: 0.0 + Mean hardware percent: 0.0 Combined: - Max core percent: 99.9 - Max CPU percent: 8.325 - Mean core percent: 97.533 - Mean CPU percent: 8.128 - Main number of threads: 15 + Max sum percent: 102.6 + Max hardware percent: 8.55 + Mean sum percent: 91.258 + Mean hardware percent: 7.605 + Main number of threads: 24 
Descendents number of threads: 0 - Combined number of threads: 15 + Combined number of threads: 24 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 6.0 + Max hardware percent: 6.0 + Mean sum percent: 0.5 + Mean hardware percent: 0.5 Compute time: Unit: seconds - Time: 2.52 + Time: 3.346 The same information as the text format can be provided as a dictionary @@ -228,12 +264,12 @@ via the ``to_json()`` method of the ``Tracker``. { "max_ram": { "unit": "megabytes", - "system_capacity": 67254.1696, - "system": 5721.3952, + "system_capacity": 63088.2304, + "system": 2399.9201279999997, "main": { - "total_rss": 850.399232, - "private_rss": 634.077184, - "shared_rss": 216.547328 + "total_rss": 890.7038719999999, + "private_rss": 674.05824, + "shared_rss": 216.92416 }, "descendents": { "total_rss": 0.0, @@ -241,52 +277,63 @@ via the ``to_json()`` method of the ``Tracker``. "shared_rss": 0.0 }, "combined": { - "total_rss": 858.7632639999999, - "private_rss": 642.445312, - "shared_rss": 216.526848 + "total_rss": 901.2633599999999, + "private_rss": 684.6177279999999, + "shared_rss": 216.67839999999998 } }, "max_gpu_ram": { "unit": "megabytes", - "system_capacity": 16376.0, - "system": 727.0, - "main": 506.0, + "system_capacity": 2048.0, + "system": 353.0, + "main": 277.0, "descendents": 0.0, - "combined": 506.0 + "combined": 277.0 }, "cpu_utilization": { "system_core_count": 12, + "n_expected_cores": 12, "system": { - "max_core_percent": 148.90000000000003, - "max_cpu_percent": 12.408333333333337, - "mean_core_percent": 124.70000000000003, - "mean_cpu_percent": 10.39166666666667 + "max_sum_percent": 164.3, + "max_hardware_percent": 13.691666666666668, + "mean_sum_percent": 152.325, + "mean_hardware_percent": 12.693750000000001 }, "main": { - "max_core_percent": 99.9, - "max_cpu_percent": 8.325000000000001, - "mean_core_percent": 97.53333333333335, - "mean_cpu_percent": 8.127777777777778 + "max_sum_percent": 102.6, + 
"max_hardware_percent": 8.549999999999999, + "mean_sum_percent": 91.25833333333334, + "mean_hardware_percent": 7.604861111111112 }, "descendents": { - "max_core_percent": 0.0, - "max_cpu_percent": 0.0, - "mean_core_percent": 0.0, - "mean_cpu_percent": 0.0 + "max_sum_percent": 0.0, + "max_hardware_percent": 0.0, + "mean_sum_percent": 0.0, + "mean_hardware_percent": 0.0 }, "combined": { - "max_core_percent": 99.9, - "max_cpu_percent": 8.325000000000001, - "mean_core_percent": 97.53333333333335, - "mean_cpu_percent": 8.127777777777778 + "max_sum_percent": 102.6, + "max_hardware_percent": 8.549999999999999, + "mean_sum_percent": 91.25833333333334, + "mean_hardware_percent": 7.604861111111112 }, - "main_n_threads": 15, + "main_n_threads": 24, "descendents_n_threads": 0, - "combined_n_threads": 15 + "combined_n_threads": 24 + }, + "gpu_utilization": { + "system_gpu_count": 1, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 6.0, + "max_hardware_percent": 6.0, + "mean_sum_percent": 0.5, + "mean_hardware_percent": 0.5 + } }, "compute_time": { "unit": "seconds", - "time": 2.5198354721069336 + "time": 3.345628023147583 } } @@ -304,7 +351,7 @@ information for each individual computational resource. .. code:: none - MaxRAM(unit='megabytes', system_capacity=67254.1696, system=5721.3952, main=RSSValues(total_rss=850.399232, private_rss=634.077184, shared_rss=216.547328), descendents=RSSValues(total_rss=0.0, private_rss=0.0, shared_rss=0.0), combined=RSSValues(total_rss=858.7632639999999, private_rss=642.445312, shared_rss=216.526848)) + MaxRAM(unit='megabytes', system_capacity=63088.2304, system=2399.9201279999997, main=RSSValues(total_rss=890.7038719999999, private_rss=674.05824, shared_rss=216.92416), descendents=RSSValues(total_rss=0.0, private_rss=0.0, shared_rss=0.0), combined=RSSValues(total_rss=901.2633599999999, private_rss=684.6177279999999, shared_rss=216.67839999999998)) @@ -330,7 +377,7 @@ information for each individual computational resource. .. 
code:: none - RSSValues(total_rss=850.399232, private_rss=634.077184, shared_rss=216.547328) + RSSValues(total_rss=890.7038719999999, private_rss=674.05824, shared_rss=216.92416) @@ -343,7 +390,7 @@ information for each individual computational resource. .. code:: none - 850.399232 + 890.7038719999999 @@ -356,7 +403,7 @@ information for each individual computational resource. .. code:: none - MaxGPURAM(unit='megabytes', system_capacity=16376.0, system=727.0, main=506.0, descendents=0.0, combined=506.0) + MaxGPURAM(unit='megabytes', system_capacity=2048.0, system=353.0, main=277.0, descendents=0.0, combined=277.0) @@ -369,7 +416,7 @@ information for each individual computational resource. .. code:: none - ComputeTime(unit='seconds', time=2.5198354721069336) + ComputeTime(unit='seconds', time=3.345628023147583) @@ -391,7 +438,7 @@ to the point of failure, use a try/except block like so: .. code:: none The following error occured while tracking: AN ERROR - 0.506 + 0.277 Below is an example of using a child process. Notice the descendents @@ -402,11 +449,11 @@ fields are now non-zero. import multiprocessing as mp ctx = mp.get_context(method='spawn') child_process = ctx.Process(target=example_function) - with gput.Tracker() as tracker: + with gput.Tracker(n_expected_cores=2, sleep_time=0.2) as tracker: child_process.start() example_function() child_process.join() - child_process.close() + child_process.close() print(tracker) @@ -414,52 +461,61 @@ fields are now non-zero. 
Max RAM: Unit: gigabytes - System capacity: 67.254 - System: 5.938 + System capacity: 63.088 + System: 2.877 Main: - Total RSS: 0.798 - Private RSS: 0.491 - Shared RSS: 0.311 + Total RSS: 0.844 + Private RSS: 0.525 + Shared RSS: 0.319 Descendents: - Total RSS: 0.85 - Private RSS: 0.728 - Shared RSS: 0.122 + Total RSS: 0.831 + Private RSS: 0.704 + Shared RSS: 0.127 Combined: - Total RSS: 1.451 - Private RSS: 1.144 - Shared RSS: 0.311 + Total RSS: 1.462 + Private RSS: 1.148 + Shared RSS: 0.32 Max GPU RAM: Unit: gigabytes - System capacity: 16.376 - System: 1.043 - Main: 0.506 - Descendents: 0.314 - Combined: 0.82 + System capacity: 2.048 + System: 0.631 + Main: 0.277 + Descendents: 0.277 + Combined: 0.554 CPU utilization: System core count: 12 + Number of expected cores: 2 System: - Max core percent: 225.5 - Max CPU percent: 18.792 - Mean core percent: 187.575 - Mean CPU percent: 15.631 + Max sum percent: 398.9 + Max hardware percent: 33.242 + Mean sum percent: 222.255 + Mean hardware percent: 18.521 Main: - Max core percent: 99.6 - Max CPU percent: 8.3 - Mean core percent: 74.15 - Mean CPU percent: 6.179 + Max sum percent: 103.8 + Max hardware percent: 51.9 + Mean sum percent: 66.009 + Mean hardware percent: 33.005 Descendents: - Max core percent: 101.2 - Max CPU percent: 8.433 - Mean core percent: 74.125 - Mean CPU percent: 6.177 + Max sum percent: 308.5 + Max hardware percent: 154.25 + Mean sum percent: 117.109 + Mean hardware percent: 58.555 Combined: - Max core percent: 198.7 - Max CPU percent: 16.558 - Mean core percent: 148.275 - Mean CPU percent: 12.356 - Main number of threads: 15 - Descendents number of threads: 5 - Combined number of threads: 20 + Max sum percent: 409.2 + Max hardware percent: 204.6 + Mean sum percent: 183.118 + Mean hardware percent: 91.559 + Main number of threads: 24 + Descendents number of threads: 16 + Combined number of threads: 40 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum 
percent: 6.0 + Max hardware percent: 6.0 + Mean sum percent: 0.545 + Mean hardware percent: 0.545 Compute time: Unit: hours Time: 0.001 @@ -480,12 +536,12 @@ help message. .. code:: none - Tracks the computational resource usage (RAM, GPU RAM, and compute time) of a process corresponding to a given shell command. + Tracks the computational resource usage (RAM, GPU RAM, CPU utilization, GPU utilization, and compute time) of a process corresponding to a given shell command. Usage: gpu-tracker -h | --help gpu-tracker -v | --version - gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--disable-logs] + gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--nec=] [--guuids=] [--disable-logs] Options: -h --help Show this help message and exit. @@ -497,6 +553,8 @@ help message. --ru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. --gru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. --tu= One of 'seconds', 'minutes', 'hours', or 'days'. + --nec= The number of cores expected to be used. Defaults to the number of cores in the entire operating system. + --guuids= Comma separated list of the UUIDs of the GPUs for which to track utilization e.g. gpu-uuid1,gpu-uuid2,etc. Defaults to all the GPUs in the system. --disable-logs If set, warnings are suppressed during tracking. Otherwise, the Tracker logs warnings as usual. @@ -508,7 +566,7 @@ completes, its status code is reported. .. code:: none - $ gpu-tracker -e "bash example-script.sh" + $ gpu-tracker -e "bash example-script.sh" --st=0.3 .. code:: none @@ -516,52 +574,61 @@ completes, its status code is reported. Resource tracking complete. 
Process completed with status code: 0 Max RAM: Unit: gigabytes - System capacity: 67.254 - System: 5.964 + System capacity: 63.088 + System: 2.3 Main: Total RSS: 0.003 Private RSS: 0.0 Shared RSS: 0.003 Descendents: - Total RSS: 0.847 - Private RSS: 0.724 - Shared RSS: 0.122 + Total RSS: 0.917 + Private RSS: 0.905 + Shared RSS: 0.012 Combined: - Total RSS: 0.856 - Private RSS: 0.733 - Shared RSS: 0.123 + Total RSS: 0.925 + Private RSS: 0.912 + Shared RSS: 0.013 Max GPU RAM: Unit: gigabytes - System capacity: 16.376 - System: 1.043 + System capacity: 2.048 + System: 0.193 Main: 0.0 - Descendents: 0.314 - Combined: 0.314 + Descendents: 0.117 + Combined: 0.117 CPU utilization: System core count: 12 + Number of expected cores: 12 System: - Max core percent: 177.6 - Max CPU percent: 14.8 - Mean core percent: 134.375 - Mean CPU percent: 11.198 + Max sum percent: 309.5 + Max hardware percent: 25.792 + Mean sum percent: 159.073 + Mean hardware percent: 13.256 Main: - Max core percent: 0.0 - Max CPU percent: 0.0 - Mean core percent: 0.0 - Mean CPU percent: 0.0 + Max sum percent: 0.0 + Max hardware percent: 0.0 + Mean sum percent: 0.0 + Mean hardware percent: 0.0 Descendents: - Max core percent: 100.4 - Max CPU percent: 8.367 - Mean core percent: 95.45 - Mean CPU percent: 7.954 + Max sum percent: 493.1 + Max hardware percent: 41.092 + Mean sum percent: 134.427 + Mean hardware percent: 11.202 Combined: - Max core percent: 100.4 - Max CPU percent: 8.367 - Mean core percent: 95.45 - Mean CPU percent: 7.954 + Max sum percent: 493.1 + Max hardware percent: 41.092 + Mean sum percent: 134.427 + Mean hardware percent: 11.202 Main number of threads: 1 - Descendents number of threads: 4 - Combined number of threads: 5 + Descendents number of threads: 15 + Combined number of threads: 16 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 4.0 + Max hardware percent: 4.0 + Mean sum percent: 0.364 + Mean hardware percent: 0.364 
Compute time: Unit: hours Time: 0.001 @@ -577,7 +644,7 @@ for ram-unit. .. code:: none - $ gpu-tracker -e 'bash example-script.sh' --tu=seconds --gru=megabytes --ru=megabytes + $ gpu-tracker -e 'bash example-script.sh' --tu=seconds --gru=megabytes --ru=megabytes --st=0.2 .. code:: none @@ -585,55 +652,64 @@ for ram-unit. Resource tracking complete. Process completed with status code: 0 Max RAM: Unit: megabytes - System capacity: 67254.17 - System: 5784.379 + System capacity: 63088.23 + System: 2242.593 Main: - Total RSS: 3.076 - Private RSS: 0.324 - Shared RSS: 2.753 + Total RSS: 3.039 + Private RSS: 0.315 + Shared RSS: 2.724 Descendents: - Total RSS: 838.545 - Private RSS: 716.681 - Shared RSS: 121.864 + Total RSS: 832.487 + Private RSS: 705.831 + Shared RSS: 126.657 Combined: - Total RSS: 847.249 - Private RSS: 724.492 - Shared RSS: 122.757 + Total RSS: 841.482 + Private RSS: 713.867 + Shared RSS: 127.992 Max GPU RAM: Unit: megabytes - System capacity: 16376.0 - System: 1043.0 + System capacity: 2048.0 + System: 631.0 Main: 0.0 - Descendents: 314.0 - Combined: 314.0 + Descendents: 277.0 + Combined: 277.0 CPU utilization: System core count: 12 + Number of expected cores: 12 System: - Max core percent: 188.7 - Max CPU percent: 15.725 - Mean core percent: 136.45 - Mean CPU percent: 11.371 + Max sum percent: 362.6 + Max hardware percent: 30.217 + Mean sum percent: 156.853 + Mean hardware percent: 13.071 Main: - Max core percent: 0.0 - Max CPU percent: 0.0 - Mean core percent: 0.0 - Mean CPU percent: 0.0 + Max sum percent: 0.0 + Max hardware percent: 0.0 + Mean sum percent: 0.0 + Mean hardware percent: 0.0 Descendents: - Max core percent: 96.2 - Max CPU percent: 8.017 - Mean core percent: 94.55 - Mean CPU percent: 7.879 + Max sum percent: 512.8 + Max hardware percent: 42.733 + Mean sum percent: 120.333 + Mean hardware percent: 10.028 Combined: - Max core percent: 96.2 - Max CPU percent: 8.017 - Mean core percent: 94.55 - Mean CPU percent: 7.879 + Max sum percent: 
512.8 + Max hardware percent: 42.733 + Mean sum percent: 120.333 + Mean hardware percent: 10.028 Main number of threads: 1 - Descendents number of threads: 4 - Combined number of threads: 5 + Descendents number of threads: 15 + Combined number of threads: 16 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 4.0 + Max hardware percent: 4.0 + Mean sum percent: 0.267 + Mean hardware percent: 0.267 Compute time: Unit: seconds - Time: 3.566 + Time: 4.931 By default, the computational-resource-usage statistics are printed to @@ -642,7 +718,7 @@ that same content in a file. .. code:: none - $ gpu-tracker -e 'bash example-script.sh' -o out.txt + $ gpu-tracker -e 'bash example-script.sh' -o out.txt --st=0.2 .. code:: none @@ -659,52 +735,61 @@ that same content in a file. Max RAM: Unit: gigabytes - System capacity: 67.254 - System: 5.584 + System capacity: 63.088 + System: 2.683 Main: Total RSS: 0.003 Private RSS: 0.0 Shared RSS: 0.003 Descendents: - Total RSS: 0.853 - Private RSS: 0.731 - Shared RSS: 0.122 + Total RSS: 0.843 + Private RSS: 0.717 + Shared RSS: 0.127 Combined: - Total RSS: 0.862 - Private RSS: 0.739 - Shared RSS: 0.123 + Total RSS: 0.852 + Private RSS: 0.725 + Shared RSS: 0.128 Max GPU RAM: Unit: gigabytes - System capacity: 16.376 - System: 1.043 + System capacity: 2.048 + System: 0.631 Main: 0.0 - Descendents: 0.314 - Combined: 0.314 + Descendents: 0.277 + Combined: 0.277 CPU utilization: System core count: 12 + Number of expected cores: 12 System: - Max core percent: 187.6 - Max CPU percent: 15.633 - Mean core percent: 137.675 - Mean CPU percent: 11.473 + Max sum percent: 383.8 + Max hardware percent: 31.983 + Mean sum percent: 166.507 + Mean hardware percent: 13.876 Main: - Max core percent: 0.0 - Max CPU percent: 0.0 - Mean core percent: 0.0 - Mean CPU percent: 0.0 + Max sum percent: 0.0 + Max hardware percent: 0.0 + Mean sum percent: 0.0 + Mean hardware percent: 0.0 Descendents: - Max core 
percent: 101.3 - Max CPU percent: 8.442 - Mean core percent: 97.675 - Mean CPU percent: 8.14 + Max sum percent: 528.4 + Max hardware percent: 44.033 + Mean sum percent: 128.014 + Mean hardware percent: 10.668 Combined: - Max core percent: 101.3 - Max CPU percent: 8.442 - Mean core percent: 97.675 - Mean CPU percent: 8.14 + Max sum percent: 528.4 + Max hardware percent: 44.033 + Mean sum percent: 128.014 + Mean hardware percent: 10.668 Main number of threads: 1 - Descendents number of threads: 4 - Combined number of threads: 5 + Descendents number of threads: 15 + Combined number of threads: 16 + GPU utilization: + System GPU count: 1 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 7.0 + Max hardware percent: 7.0 + Mean sum percent: 0.643 + Mean hardware percent: 0.643 Compute time: Unit: hours Time: 0.001 @@ -714,7 +799,7 @@ By default, the format of the output is “text”. The ``-f`` or .. code:: none - $ gpu-tracker -e 'bash example-script.sh' -f json + $ gpu-tracker -e 'bash example-script.sh' -f json --st=0.2 .. code:: none @@ -723,72 +808,83 @@ By default, the format of the output is “text”. 
The ``-f`` or { "max_ram": { "unit": "gigabytes", - "system_capacity": 67.2541696, - "system": 5.720379392000001, + "system_capacity": 63.0882304, + "system": 3.111936, "main": { - "total_rss": 0.003084288, - "private_rss": 0.00031948800000000004, - "shared_rss": 0.0027648 + "total_rss": 0.003059712, + "private_rss": 0.000339968, + "shared_rss": 0.002719744 }, "descendents": { - "total_rss": 0.854237184, - "private_rss": 0.73218048, - "shared_rss": 0.122056704 + "total_rss": 0.846565376, + "private_rss": 0.7198023680000001, + "shared_rss": 0.12713984 }, "combined": { - "total_rss": 0.863256576, - "private_rss": 0.7403069440000001, - "shared_rss": 0.122949632 + "total_rss": 0.8552325120000001, + "private_rss": 0.727576576, + "shared_rss": 0.12803276800000002 } }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 16.376, - "system": 1.043, + "system_capacity": 2.048, + "system": 0.631, "main": 0.0, - "descendents": 0.314, - "combined": 0.314 + "descendents": 0.277, + "combined": 0.277 }, "cpu_utilization": { "system_core_count": 12, + "n_expected_cores": 12, "system": { - "max_core_percent": 260.00000000000006, - "max_cpu_percent": 21.66666666666667, - "mean_core_percent": 159.35000000000002, - "mean_cpu_percent": 13.279166666666669 + "max_sum_percent": 384.5999999999999, + "max_hardware_percent": 32.04999999999999, + "mean_sum_percent": 167.49285714285716, + "mean_hardware_percent": 13.957738095238097 }, "main": { - "max_core_percent": 0.0, - "max_cpu_percent": 0.0, - "mean_core_percent": 0.0, - "mean_cpu_percent": 0.0 + "max_sum_percent": 0.0, + "max_hardware_percent": 0.0, + "mean_sum_percent": 0.0, + "mean_hardware_percent": 0.0 }, "descendents": { - "max_core_percent": 102.9, - "max_cpu_percent": 8.575000000000001, - "mean_core_percent": 97.475, - "mean_cpu_percent": 8.122916666666667 + "max_sum_percent": 526.0, + "max_hardware_percent": 43.833333333333336, + "mean_sum_percent": 128.65, + "mean_hardware_percent": 10.720833333333333 }, "combined": { - 
"max_core_percent": 102.9, - "max_cpu_percent": 8.575000000000001, - "mean_core_percent": 97.475, - "mean_cpu_percent": 8.122916666666667 + "max_sum_percent": 526.0, + "max_hardware_percent": 43.833333333333336, + "mean_sum_percent": 128.65, + "mean_hardware_percent": 10.720833333333333 }, "main_n_threads": 1, - "descendents_n_threads": 4, - "combined_n_threads": 5 + "descendents_n_threads": 15, + "combined_n_threads": 16 + }, + "gpu_utilization": { + "system_gpu_count": 1, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 7.0, + "max_hardware_percent": 7.0, + "mean_sum_percent": 0.5, + "mean_hardware_percent": 0.5 + } }, "compute_time": { "unit": "hours", - "time": 0.001005272732840644 + "time": 0.0012672905127207438 } } .. code:: none - $ gpu-tracker -e 'bash example-script.sh' -f json -o out.json + $ gpu-tracker -e 'bash example-script.sh' -f json -o out.json --st=0.3 .. code:: none @@ -806,64 +902,75 @@ By default, the format of the output is “text”. The ``-f`` or { "max_ram": { "unit": "gigabytes", - "system_capacity": 67.2541696, - "system": 5.560373248, + "system_capacity": 63.0882304, + "system": 2.878910464, "main": { - "total_rss": 0.002957312, - "private_rss": 0.000323584, - "shared_rss": 0.002633728 + "total_rss": 0.0029777920000000004, + "private_rss": 0.00031948800000000004, + "shared_rss": 0.0026583040000000002 }, "descendents": { - "total_rss": 0.848539648, - "private_rss": 0.726519808, - "shared_rss": 0.12201984 + "total_rss": 0.8333844480000001, + "private_rss": 0.7066091520000001, + "shared_rss": 0.127152128 }, "combined": { - "total_rss": 0.857731072, - "private_rss": 0.734818304, - "shared_rss": 0.122912768 + "total_rss": 0.841486336, + "private_rss": 0.713818112, + "shared_rss": 0.12804505600000002 } }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 16.376, - "system": 1.043, + "system_capacity": 2.048, + "system": 0.631, "main": 0.0, - "descendents": 0.314, - "combined": 0.314 + "descendents": 0.277, + 
"combined": 0.277 }, "cpu_utilization": { "system_core_count": 12, + "n_expected_cores": 12, "system": { - "max_core_percent": 192.5, - "max_cpu_percent": 16.041666666666668, - "mean_core_percent": 154.22500000000002, - "mean_cpu_percent": 12.852083333333335 + "max_sum_percent": 306.09999999999997, + "max_hardware_percent": 25.50833333333333, + "mean_sum_percent": 161.4272727272727, + "mean_hardware_percent": 13.452272727272724 }, "main": { - "max_core_percent": 0.0, - "max_cpu_percent": 0.0, - "mean_core_percent": 0.0, - "mean_cpu_percent": 0.0 + "max_sum_percent": 0.0, + "max_hardware_percent": 0.0, + "mean_sum_percent": 0.0, + "mean_hardware_percent": 0.0 }, "descendents": { - "max_core_percent": 104.1, - "max_cpu_percent": 8.674999999999999, - "mean_core_percent": 97.7, - "mean_cpu_percent": 8.141666666666667 + "max_sum_percent": 440.2, + "max_hardware_percent": 36.68333333333333, + "mean_sum_percent": 128.27272727272728, + "mean_hardware_percent": 10.68939393939394 }, "combined": { - "max_core_percent": 104.1, - "max_cpu_percent": 8.674999999999999, - "mean_core_percent": 97.7, - "mean_cpu_percent": 8.141666666666667 + "max_sum_percent": 440.2, + "max_hardware_percent": 36.68333333333333, + "mean_sum_percent": 128.27272727272728, + "mean_hardware_percent": 10.68939393939394 }, "main_n_threads": 1, - "descendents_n_threads": 4, - "combined_n_threads": 5 + "descendents_n_threads": 15, + "combined_n_threads": 16 + }, + "gpu_utilization": { + "system_gpu_count": 1, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 7.0, + "max_hardware_percent": 7.0, + "mean_sum_percent": 0.6363636363636364, + "mean_hardware_percent": 0.6363636363636364 + } }, "compute_time": { "unit": "hours", - "time": 0.000995432734489441 + "time": 0.0012816817230648465 } } diff --git a/requirements.txt b/requirements.txt index d3c5812..003656a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ psutil docopt +pandas diff --git a/src/gpu_tracker/__main__.py 
b/src/gpu_tracker/__main__.py index 1756345..ae005c8 100644 --- a/src/gpu_tracker/__main__.py +++ b/src/gpu_tracker/__main__.py @@ -1,10 +1,10 @@ """ -Tracks the computational resource usage (RAM, GPU RAM, and compute time) of a process corresponding to a given shell command. +Tracks the computational resource usage (RAM, GPU RAM, CPU utilization, GPU utilization, and compute time) of a process corresponding to a given shell command. Usage: gpu-tracker -h | --help gpu-tracker -v | --version - gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--disable-logs] + gpu-tracker --execute= [--output=] [--format=] [--st=] [--ru=] [--gru=] [--tu=] [--nec=] [--guuids=] [--disable-logs] Options: -h --help Show this help message and exit. @@ -16,6 +16,8 @@ --ru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. --gru= One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. --tu= One of 'seconds', 'minutes', 'hours', or 'days'. + --nec= The number of cores expected to be used. Defaults to the number of cores in the entire operating system. + --guuids= Comma separated list of the UUIDs of the GPUs for which to track utilization e.g. gpu-uuid1,gpu-uuid2,etc. Defaults to all the GPUs in the system. --disable-logs If set, warnings are suppressed during tracking. Otherwise, the Tracker logs warnings as usual. 
""" import docopt as doc @@ -29,7 +31,7 @@ def main(): args = doc.docopt(__doc__, version=__version__) - command = args['--execute'].split(' ') + command = args['--execute'].split() output = args['--output'] output_format = args['--format'] if args['--format'] is not None else 'text' option_map = { @@ -37,6 +39,8 @@ def main(): '--ru': 'ram_unit', '--gru': 'gpu_ram_unit', '--tu': 'time_unit', + '--nec': 'n_expected_cores', + '--guuids': 'gpu_uuids', '--disable-logs': 'disable_logs' } kwargs = { @@ -44,6 +48,13 @@ def main(): '--execute', '--output', '--format', '--help', '--version'}} if 'sleep_time' in kwargs.keys(): kwargs['sleep_time'] = float(kwargs['sleep_time']) + if 'n_expected_cores' in kwargs.keys(): + kwargs['n_expected_cores'] = int(kwargs['n_expected_cores']) + if 'gpu_uuids' in kwargs.keys(): + kwargs['gpu_uuids'] = set(kwargs['gpu_uuids'].split(',')) + if len(command) == 0: + log.error('Empty command provided.') + sys.exit(1) try: process = subp.Popen(command) except FileNotFoundError: @@ -61,7 +72,8 @@ def main(): elif output_format == 'text': output_str = str(tracker) else: - raise ValueError(f'"{output_format} is not a valid format. Valid values are "json" or "text".') + log.error(f'"{output_format}" is not a valid format. 
Valid values are "json" or "text".') + sys.exit(1) if output is None: print(output_str) else: diff --git a/src/gpu_tracker/tracker.py b/src/gpu_tracker/tracker.py index 653a441..a1d8f73 100644 --- a/src/gpu_tracker/tracker.py +++ b/src/gpu_tracker/tracker.py @@ -13,6 +13,8 @@ import enum import pickle as pkl import uuid +import io +import pandas as pd class _TrackingProcess(mproc.Process): @@ -40,7 +42,8 @@ class _TrackingProcess(mproc.Process): def __init__( self, stop_event: mproc.Event, sleep_time: float, ram_unit: str, gpu_ram_unit: str, time_unit: str, - disable_logs: bool, main_process_id: int, resource_usage_file: str, extraneous_process_ids: set[int]): + n_expected_cores: int | None, gpu_uuids: set[str] | None, disable_logs: bool, main_process_id: int, + resource_usage_file: str, extraneous_process_ids: set[int]): super().__init__() self._stop_event = stop_event if sleep_time < _TrackingProcess._CPU_PERCENT_INTERVAL: @@ -55,8 +58,9 @@ def __init__( time_unit, _TrackingProcess._time_unit2coefficient, unit_type='time') self._disable_logs = disable_logs self._main_process_id = main_process_id - self._core_percent_sums = {key: 0. for key in ['system', 'main', 'descendents', 'combined']} - self._cpu_percent_sums = {key: 0. for key in ['system', 'main', 'descendents', 'combined']} + percent_keys = ['cpu_system', 'cpu_main', 'cpu_descendents', 'cpu_combined', 'gpu'] + self._sum_percent_sums = {key: 0. for key in percent_keys} + self._hardware_percent_sums = {key: 0. for key in percent_keys} self._tracking_iteration = 1 self._is_linux = platform.system().lower() == 'linux' self._nvidia_available = True @@ -68,12 +72,32 @@ def __init__( 'The nvidia-smi command is not available. Please install the Nvidia drivers to track GPU usage. 
' 'Otherwise the Max GPU RAM values will remain 0.0') max_ram = MaxRAM(unit=ram_unit, system_capacity=psutil.virtual_memory().total * self._ram_coefficient) - max_gpu_ram = MaxGPURAM( - unit=gpu_ram_unit, system_capacity=self._system_gpu_ram(measurement='total') if self._nvidia_available else 0.0) - cpu_utilization = CPUUtilization(system_core_count=psutil.cpu_count()) + system_core_count = psutil.cpu_count() + cpu_utilization = CPUUtilization( + system_core_count=system_core_count, + n_expected_cores=n_expected_cores if n_expected_cores is not None else system_core_count) + if self._nvidia_available: + gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-gpu=uuid,memory.total') + gpu_ram_system_capacity = self._get_gpu_ram(gpu_info=gpu_info, column='memory.total') + max_gpu_ram = MaxGPURAM(unit=gpu_ram_unit, system_capacity=gpu_ram_system_capacity) + all_uuids = set(gpu_info['uuid']) + if gpu_uuids is None: + self._gpu_uuids = all_uuids + else: + if len(gpu_uuids) == 0: + raise ValueError('gpu_uuids is not None but the set is empty. Please provide a set of at least one GPU UUID.') + for gpu_uuid in gpu_uuids: + if gpu_uuid not in all_uuids: + raise ValueError(f'GPU UUID of {gpu_uuid} is not valid. 
Available UUIDs are: {", ".join(sorted(all_uuids))}') + self._gpu_uuids = gpu_uuids + gpu_utilization = GPUUtilization(system_gpu_count=len(all_uuids), n_expected_gpus=len(self._gpu_uuids)) + else: + max_gpu_ram = MaxGPURAM(unit=gpu_ram_unit, system_capacity=0.0) + gpu_utilization = GPUUtilization(system_gpu_count=0, n_expected_gpus=0) compute_time = ComputeTime(unit=time_unit) self._resource_usage = ResourceUsage( - max_ram=max_ram, max_gpu_ram=max_gpu_ram, cpu_utilization=cpu_utilization, compute_time=compute_time) + max_ram=max_ram, max_gpu_ram=max_gpu_ram, cpu_utilization=cpu_utilization, gpu_utilization=gpu_utilization, + compute_time=compute_time) self._resource_usage_file = resource_usage_file self._extraneous_process_ids = extraneous_process_ids @@ -112,34 +136,47 @@ def run(self): self._resource_usage.max_ram.system = max( self._resource_usage.max_ram.system, psutil.virtual_memory().used * self._ram_coefficient) # Get the maximum GPU RAM usage if available. - if self._nvidia_available: - memory_used_command = 'nvidia-smi --query-compute-apps=pid,used_gpu_memory --format=csv,noheader' - nvidia_smi_output = subp.check_output(memory_used_command.split(), stderr=subp.STDOUT).decode() - if nvidia_smi_output: + if self._nvidia_available: # pragma: nocover + gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-compute-apps=pid,used_gpu_memory') + if len(gpu_info): process_ids = {self._main_process_id} - self._update_gpu_ram(attr='main', process_ids=process_ids, nvidia_smi_output=nvidia_smi_output) - process_ids = { - process_id for process_id in self._map_processes( - processes=descendent_processes, map_func=lambda process: process.pid)} - self._update_gpu_ram(attr='descendents', process_ids=process_ids, - nvidia_smi_output=nvidia_smi_output) + self._update_gpu_ram(attr='main', process_ids=process_ids, gpu_info=gpu_info) + process_ids = set(self._map_processes(processes=descendent_processes, map_func=lambda process: process.pid)) + 
self._update_gpu_ram(attr='descendents', process_ids=process_ids, gpu_info=gpu_info) process_ids.add(self._main_process_id) - self._update_gpu_ram(attr='combined', process_ids=process_ids, nvidia_smi_output=nvidia_smi_output) - self._resource_usage.max_gpu_ram.system = max( - self._resource_usage.max_gpu_ram.system, self._system_gpu_ram(measurement='used')) + self._update_gpu_ram(attr='combined', process_ids=process_ids, gpu_info=gpu_info) + gpu_info = _TrackingProcess._query_gpu(nvidia_command='--query-gpu=uuid,memory.used,utilization.gpu') + system_gpu_ram = self._get_gpu_ram(gpu_info, column='memory.used') + self._resource_usage.max_gpu_ram.system = max(self._resource_usage.max_gpu_ram.system, system_gpu_ram) + gpu_info = gpu_info.loc[gpu_info['uuid'].apply(lambda gpu_uuid: gpu_uuid in self._gpu_uuids)] + gpu_percentages = [float(percentage.replace('%', '').strip()) for percentage in gpu_info['utilization.gpu']] + self._update_processing_unit_utilization( + current_percentages=gpu_percentages, + processing_unit_percentages=self._resource_usage.gpu_utilization.gpu_percentages, percent_key='gpu', + n_hardware_units=self._resource_usage.gpu_utilization.n_expected_gpus) + # Get the mean and maximum CPU usages. 
self._update_n_threads(processes=[main_process], attr='main') self._update_n_threads(processes=descendent_processes, attr='descendents') self._update_n_threads(processes=combined_processes, attr='combined') # noinspection PyTypeChecker system_core_percentages: list[float] = psutil.cpu_percent(percpu=True) - self._update_cpu_utilization(percentages=system_core_percentages, attr='system') + cpu_utilization = self._resource_usage.cpu_utilization + self._update_processing_unit_utilization( + current_percentages=system_core_percentages, processing_unit_percentages=cpu_utilization.system, + percent_key='cpu_system', n_hardware_units=cpu_utilization.system_core_count) time.sleep(_TrackingProcess._CPU_PERCENT_INTERVAL) main_percentage = main_process.cpu_percent() descendent_percentages = self._map_processes(processes=descendent_processes, map_func=get_cpu_percent) - self._update_cpu_utilization(percentages=[main_percentage], attr='main') - self._update_cpu_utilization(percentages=descendent_percentages, attr='descendents') - self._update_cpu_utilization(percentages=[main_percentage] + descendent_percentages, attr='combined') + self._update_processing_unit_utilization( + current_percentages=[main_percentage], processing_unit_percentages=cpu_utilization.main, percent_key='cpu_main', + n_hardware_units=cpu_utilization.n_expected_cores) + self._update_processing_unit_utilization( + current_percentages=descendent_percentages, processing_unit_percentages=cpu_utilization.descendents, + percent_key='cpu_descendents', n_hardware_units=cpu_utilization.n_expected_cores) + self._update_processing_unit_utilization( + current_percentages=[main_percentage] + descendent_percentages, processing_unit_percentages=cpu_utilization.combined, + percent_key='cpu_combined', n_hardware_units=cpu_utilization.n_expected_cores) # Update compute time. 
self._resource_usage.compute_time.time = (time.time() - start_time) * self._time_coefficient self._tracking_iteration += 1 @@ -156,8 +193,8 @@ def _map_processes(self, processes: list[psutil.Process], map_func: typ.Callable for process in processes: try: mapped_list.append(map_func(process)) - except psutil.NoSuchProcess: - self._log_warning('Attempted to obtain usage information of a process that no longer exists.') + except psutil.NoSuchProcess: # pragma: nocover + self._log_warning('Attempted to obtain usage information of a process that no longer exists.') # pragma: nocover return mapped_list def _update_ram(self, rss_values: RSSValues, processes: list[psutil.Process]): @@ -185,41 +222,37 @@ def _update_ram(self, rss_values: RSSValues, processes: list[psutil.Process]): total_rss *= self._ram_coefficient rss_values.total_rss = max(rss_values.total_rss, total_rss) - def _update_gpu_ram(self, attr: str, process_ids: set[int], nvidia_smi_output: str): - nvidia_smi_output = nvidia_smi_output.strip().split('\n') - curr_gpu_ram = 0 - for process_info in nvidia_smi_output: - pid, megabytes_used = process_info.strip().split(',') - pid = int(pid.strip()) - if pid in process_ids: - megabytes_used = int(megabytes_used.replace('MiB', '').strip()) - curr_gpu_ram += megabytes_used - curr_gpu_ram *= self._gpu_ram_coefficient + def _update_gpu_ram(self, attr: str, process_ids: set[int], gpu_info: pd.DataFrame): + gpu_info = gpu_info.loc[[pid in process_ids for pid in gpu_info['pid']]] + gpu_ram = self._get_gpu_ram(gpu_info, column='used_gpu_memory') max_gpu_ram = getattr(self._resource_usage.max_gpu_ram, attr) - setattr(self._resource_usage.max_gpu_ram, attr, max(max_gpu_ram, curr_gpu_ram)) + setattr(self._resource_usage.max_gpu_ram, attr, max(max_gpu_ram, gpu_ram)) - def _system_gpu_ram(self, measurement: str) -> float: - command = f'nvidia-smi --query-gpu=memory.{measurement} --format=csv,noheader' + @staticmethod + def _query_gpu(nvidia_command: str) -> pd.DataFrame: + 
command = f'nvidia-smi {nvidia_command} --format=csv' output = subp.check_output(command.split(), stderr=subp.STDOUT).decode() - output = output.strip().split('\n') - usages = [line.replace('MiB', '').strip() for line in output] - ram_sum = sum([int(usage) for usage in usages if usage != '']) - return ram_sum * self._gpu_ram_coefficient - - def _update_cpu_utilization(self, percentages: list[float], attr: str): - cpu_percentages: CPUPercentages = getattr(self._resource_usage.cpu_utilization, attr) - - def update_percentages(percent: float, percent_type: str, percent_sums: dict[str, float]): - percent_sums[attr] += percent - mean_percent = percent_sums[attr] / self._tracking_iteration - setattr(cpu_percentages, f'mean_{percent_type}_percent', mean_percent) - max_percent: float = getattr(cpu_percentages, f'max_{percent_type}_percent') - setattr(cpu_percentages, f'max_{percent_type}_percent', max(max_percent, percent)) - - core_percent = sum(percentages) - cpu_percent = core_percent / self._resource_usage.cpu_utilization.system_core_count - update_percentages(percent=core_percent, percent_type='core', percent_sums=self._core_percent_sums) - update_percentages(percent=cpu_percent, percent_type='cpu', percent_sums=self._cpu_percent_sums) + gpu_info = pd.read_csv(io.StringIO(output)) + gpu_info.columns = [col.replace('[MiB]', '').replace('[%]', '').strip() for col in gpu_info.columns] + return gpu_info.map(lambda value: value.strip() if type(value) is str else value) + + def _get_gpu_ram(self, gpu_info: pd.DataFrame, column: str) -> float: + gpu_rams = gpu_info[column] + gpu_rams = gpu_rams.apply(lambda ram: int(ram.replace('MiB', '').strip())) + return sum(gpu_rams) * self._gpu_ram_coefficient + + def _update_processing_unit_utilization( + self, current_percentages: list[float], processing_unit_percentages: ProcessingUnitPercentages, + percent_key: str, n_hardware_units: int): + sum_percent = sum(current_percentages) + hardware_percent = sum_percent / n_hardware_units + 
for percent, percent_sums, percent_type in ( + (sum_percent, self._sum_percent_sums, 'sum'), (hardware_percent, self._hardware_percent_sums, 'hardware')): + percent_sums[percent_key] += percent + mean_percent = percent_sums[percent_key] / self._tracking_iteration + setattr(processing_unit_percentages, f'mean_{percent_type}_percent', mean_percent) + max_percent: float = getattr(processing_unit_percentages, f'max_{percent_type}_percent') + setattr(processing_unit_percentages, f'max_{percent_type}_percent', max(max_percent, percent)) def _update_n_threads(self, processes: list[psutil.Process], attr: str): n_threads_list = self._map_processes(processes, map_func=lambda process: process.num_threads()) @@ -242,10 +275,10 @@ def _log_warning(self, warning: str): class Tracker: """ - Runs a sub-process that tracks computational resources of the calling process. Including the compute time, maximum RAM, and maximum GPU RAM usage within a context manager or explicit ``start()`` and ``stop()`` methods. + Runs a sub-process that tracks computational resources of the calling process. Including the compute time, maximum CPU utilization, mean CPU utilization, maximum RAM, and maximum GPU RAM used within a context manager or explicit calls to ``start()`` and ``stop()`` methods. Calculated quantities are scaled depending on the units chosen for them (e.g. megabytes vs. gigabytes, hours vs. days, etc.). - :ivar resource_usage: Data class containing the max_ram (Description of the maximum RAM usage of the process, any descendents it may have, and the operating system overall), max_gpu_ram (Description of the maximum GPU RAM usage of the process and any descendents it may have), and compute_time (Description of the real compute time i.e. the duration of tracking) attributes. + :ivar ResourceUsage resource_usage: Data class containing the computational resource usage data collected by the tracking process. 
""" _USAGE_FILE_TIME_DIFFERENCE = 10.0 @@ -257,12 +290,15 @@ class State(enum.Enum): def __init__( self, sleep_time: float = 1.0, ram_unit: str = 'gigabytes', gpu_ram_unit: str = 'gigabytes', time_unit: str = 'hours', - disable_logs: bool = False, process_id: int | None = None, n_join_attempts: int = 5, join_timeout: float = 10.0): + n_expected_cores: int = None, gpu_uuids: set[str] = None, disable_logs: bool = False, process_id: int = None, + n_join_attempts: int = 5, join_timeout: float = 10.0): """ :param sleep_time: The number of seconds to sleep in between usage-collection iterations. :param ram_unit: One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. :param gpu_ram_unit: One of 'bytes', 'kilobytes', 'megabytes', 'gigabytes', or 'terabytes'. :param time_unit: One of 'seconds', 'minutes', 'hours', or 'days'. + :param n_expected_cores: The number of cores expected to be used during tracking (e.g. number of processes spawned, number of parallelized threads, etc.). Used as the denominator when calculating the hardware percentages of the CPU utilization (except for system-wide CPU utilization which always divides by all the cores in the system). Defaults to all the cores in the system. + :param gpu_uuids: The UUIDs of the GPUs to track utilization for. The length of this set is used as the denominator when calculating the hardware percentages of the GPU utilization (i.e. n_expected_gpus). Defaults to all the GPUs in the system. :param disable_logs: If set, warnings are suppressed during tracking. Otherwise, the Tracker logs warnings as usual. :param process_id: The ID of the process to track. Defaults to the current process. :param n_join_attempts: The number of times the tracker attempts to join its underlying sub-process. 
@@ -277,9 +313,9 @@ def __init__( legit_child_ids = {process.pid for process in current_process.children()} self._stop_event = mproc.Event() extraneous_ids = {process.pid for process in current_process.children()} - legit_child_ids - self._resource_usage_file = f'.{uuid.uuid1()}.pkl' + self._resource_usage_file = f'.gpu-tracker_{uuid.uuid1()}.pkl' self._tracking_process = _TrackingProcess( - self._stop_event, sleep_time, ram_unit, gpu_ram_unit, time_unit, disable_logs, + self._stop_event, sleep_time, ram_unit, gpu_ram_unit, time_unit, n_expected_cores, gpu_uuids, disable_logs, process_id if process_id is not None else current_process_id, self._resource_usage_file, extraneous_ids) self.resource_usage = None self.n_join_attempts = n_join_attempts @@ -328,7 +364,7 @@ def __exit__(self, *_): f'last updated. Resource usage was not updated during that time.') os.remove(self._resource_usage_file) else: - raise RuntimeError('The temporary tracking results file does not exist. Tracking results cannot be obtained.') + raise RuntimeError('The temporary tracking results file does not exist. Tracking results cannot be obtained.') # pragma: nocover self.state = Tracker.State.STOPPED def start(self): @@ -358,7 +394,7 @@ def __str__(self) -> str: 'compute', 'Compute').replace('time: ', 'Time: ').replace('rss', 'RSS').replace('total', 'Total').replace( 'private', 'Private').replace('shared', 'Shared').replace('main', 'Main').replace('descendents', 'Descendents').replace( 'combined', 'Combined').replace('gpu', 'GPU').replace('mean', 'Mean').replace('cpu', 'CPU').replace( - 'n threads', 'number of threads') + 'n threads', 'number of threads').replace('n expected', 'Number of expected') @staticmethod def _format_float(dictionary: dict): @@ -382,6 +418,8 @@ def to_json(self) -> dict[str, dict]: @dclass.dataclass class RSSValues: """ + The resident set size (RSS) i.e. memory used by a process or processes. + :param total_rss: The sum of ``private_rss`` and ``shared_rss``. 
:param private_rss: The RAM usage exclusive to a process. :param shared_rss: The RAM usage of a process shared with at least one other process. @@ -394,6 +432,8 @@ class RSSValues: @dclass.dataclass class MaxRAM: """ + Information related to RAM including the maximum RAM used over a period of time. + :param unit: The unit of measurement for RAM e.g. gigabytes. :param system_capacity: A constant value for the RAM capacity of the entire operating system. :param system: The RAM usage across the entire operating system. @@ -412,7 +452,11 @@ class MaxRAM: @dclass.dataclass class MaxGPURAM: """ + Information related to GPU RAM including the maximum GPU RAM used over a period of time. + :param unit: The unit of measurement for GPU RAM e.g. gigabytes. + :param system_capacity: A constant value for the GPU RAM capacity of all the GPUs in the system. + :param system: The GPU RAM usage of all the GPUs in the system. :param main: The GPU RAM usage of the main process. :param descendents: The summed GPU RAM usage of the descendent processes (i.e. child processes, grandchild processes, etc.). :param combined: The summed GPU RAM usage of both the main process and any descendent processes it may have. @@ -426,44 +470,72 @@ class MaxGPURAM: @dclass.dataclass -class CPUPercentages: +class ProcessingUnitPercentages: """ - :param max_core_percent: The maximum sum of utilization percentages of the cores used at any given time. - :param max_cpu_percent: The maximum percentage utilization of the entire CPU (core percentage divided by the number of cores in the system). - :param mean_core_percent: The mean sum of utilization percentages of the cores used over time. - :param mean_cpu_percent: The mean percentage utilization of the entire CPU (core percentage divided by the number of cores in the system). + Utilization percentages of one or more processing units (i.e. GPUs or CPU cores). + Max refers to the highest value measured over a duration of time. 
+ Mean refers to the average of the measured values during this time. + Sum refers to the sum of the percentages of the processing units involved. If there is only one unit in question, this is the percentage of just that unit. + Hardware refers to this sum divided by the number of units involved. If there is only one unit in question, this is the same as the sum. + + :param max_sum_percent: The maximum sum of utilization percentages of the processing units at any given time. + :param max_hardware_percent: The maximum utilization percentage of the group of units as a whole (i.e. max_sum_percent divided by the number of units involved). + :param mean_sum_percent: The mean sum of utilization percentages of the processing units used by the process(es) over time. + :param mean_hardware_percent: The mean utilization percentage of the group of units as a whole (i.e. mean_sum_percent divided by the number of units involved). """ - max_core_percent: float = 0. - max_cpu_percent: float = 0. - mean_core_percent: float = 0. - mean_cpu_percent: float = 0. + max_sum_percent: float = 0. + max_hardware_percent: float = 0. + mean_sum_percent: float = 0. + mean_hardware_percent: float = 0. @dclass.dataclass class CPUUtilization: """ - :param system_core_count: The number of cores available to the operating system. - :param system: The core and CPU utilization percentages of the entire system. - :param main: The core and CPU utilization percentages of the main process. - :param descendents: The core and CPU utilization percentages summed across descendent processes (i.e. child processes, grandchild processes, etc.). - :param combined: The core and CPU utilization percentages summed across both the descendent processes and the main process. + Information related to CPU usage, including core utilization percentages of the main process and any descendent processes it may have as well as system-wide utilization. 
+ The system hardware utilization percentages are strictly divided by the total number of cores in the system while that of the main, descendent, and combined processes can be divided by the expected number of cores used in a task. + + :param system_core_count: The number of cores available to the entire operating system. + :param n_expected_cores: The number of cores expected to be used by the main process and/or any descendent processes it may have. + :param system: The utilization percentages of all the cores in the entire operating system. + :param main: The utilization percentages of the cores used by the main process. + :param descendents: The utilization percentages summed across descendent processes (i.e. child processes, grandchild processes, etc.). + :param combined: The utilization percentages summed across both the descendent processes and the main process. :param main_n_threads: The maximum detected number of threads used by the main process at any time. :param descendents_n_threads: The maximum sum of threads used across the descendent processes at any time. :param combined_n_threads: The maximum sum of threads used by both the main and descendent processes. 
""" system_core_count: int - system: CPUPercentages = dclass.field(default_factory=CPUPercentages) - main: CPUPercentages = dclass.field(default_factory=CPUPercentages) - descendents: CPUPercentages = dclass.field(default_factory=CPUPercentages) - combined: CPUPercentages = dclass.field(default_factory=CPUPercentages) + n_expected_cores: int + system: ProcessingUnitPercentages = dclass.field(default_factory=ProcessingUnitPercentages) + main: ProcessingUnitPercentages = dclass.field(default_factory=ProcessingUnitPercentages) + descendents: ProcessingUnitPercentages = dclass.field(default_factory=ProcessingUnitPercentages) + combined: ProcessingUnitPercentages = dclass.field(default_factory=ProcessingUnitPercentages) main_n_threads: int = 0 descendents_n_threads: int = 0 combined_n_threads: int = 0 +@dclass.dataclass +class GPUUtilization: + """ + Utilization percentages of one or more GPUs being tracked. + Hardware percentages are the summed percentages divided by the number of GPUs being tracked. + + :param system_gpu_count: The number of GPUs in the system. + :param n_expected_gpus: The number of GPUs to be tracked (e.g. GPUs actually used while there may be other GPUs in the system). + :param gpu_percentages: The utilization percentages of the GPU(s) being tracked. + """ + system_gpu_count: int + n_expected_gpus: int + gpu_percentages: ProcessingUnitPercentages = dclass.field(default_factory=ProcessingUnitPercentages) + + @dclass.dataclass class ComputeTime: """ + The time it takes for a task to complete. + :param unit: The unit of measurement for compute time e.g. hours. :param time: The real compute time. """ @@ -474,12 +546,16 @@ class ComputeTime: @dclass.dataclass class ResourceUsage: """ + Contains data for computational resource usage. + :param max_ram: The maximum RAM used at any point while tracking. :param max_gpu_ram: The maximum GPU RAM used at any point while tracking. 
- :param cpu_utilization: The core and CPU utilization and maximum number of threads used while tracking. + :param cpu_utilization: Core counts, utilization percentages of cores and maximum number of threads used while tracking. + :param gpu_utilization: GPU counts and utilization percentages of the GPU(s). :param compute_time: The real time spent tracking. """ max_ram: MaxRAM max_gpu_ram: MaxGPURAM cpu_utilization: CPUUtilization + gpu_utilization: GPUUtilization compute_time: ComputeTime diff --git a/tests/data/False-Linux-bytes-megabytes-seconds.json b/tests/data/False-Linux-bytes-megabytes-seconds.json index 6328498..3093f55 100644 --- a/tests/data/False-Linux-bytes-megabytes-seconds.json +++ b/tests/data/False-Linux-bytes-megabytes-seconds.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "megabytes", - "system_capacity": 24396.0, - "system": 5800.0, + "system_capacity": 36594.0, + "system": 6500.0, "main": 1600.0, "descendents": 4300.0, "combined": 5800.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 3, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 66.3, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 39.9 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 47.4, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 43.24444444444445 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - 
"mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 103.86666666666667, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 83.14444444444445 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "seconds", "time": 300.0 diff --git a/tests/data/False-Linux-bytes-megabytes-seconds.txt b/tests/data/False-Linux-bytes-megabytes-seconds.txt index 16dbb28..d7c8650 100644 --- a/tests/data/False-Linux-bytes-megabytes-seconds.txt +++ b/tests/data/False-Linux-bytes-megabytes-seconds.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22017.0 Max GPU RAM: Unit: megabytes - System capacity: 24396.0 - System: 5800.0 + System capacity: 36594.0 + System: 6500.0 Main: 1600.0 Descendents: 4300.0 Combined: 5800.0 CPU utilization: System core count: 4 + Number of expected cores: 3 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 66.3 + Mean sum percent: 119.7 + Mean hardware percent: 39.9 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 47.4 + Mean sum percent: 129.733 + Mean hardware percent: 43.244 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware 
percent: 103.867 + Mean sum percent: 249.433 + Mean hardware percent: 83.144 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: seconds Time: 300.0 \ No newline at end of file diff --git a/tests/data/False-Linux-kilobytes-bytes-days.json b/tests/data/False-Linux-kilobytes-bytes-days.json index 5279afa..d43d05a 100644 --- a/tests/data/False-Linux-kilobytes-bytes-days.json +++ b/tests/data/False-Linux-kilobytes-bytes-days.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "bytes", - "system_capacity": 24396000000.0, - "system": 5800000000.0, + "system_capacity": 36594000000.0, + "system": 6500000000.0, "main": 1600000000.0, "descendents": 4300000000.0, "combined": 5800000000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 4, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 49.725, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 29.925 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 35.55, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 32.43333333333333 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - 
"mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 77.9, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 62.35833333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "days", "time": 0.003472222222222222 diff --git a/tests/data/False-Linux-kilobytes-bytes-days.txt b/tests/data/False-Linux-kilobytes-bytes-days.txt index 21a8209..a81ccd0 100644 --- a/tests/data/False-Linux-kilobytes-bytes-days.txt +++ b/tests/data/False-Linux-kilobytes-bytes-days.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22.017 Max GPU RAM: Unit: bytes - System capacity: 24396000000.0 - System: 5800000000.0 + System capacity: 36594000000.0 + System: 6500000000.0 Main: 1600000000.0 Descendents: 4300000000.0 Combined: 5800000000.0 CPU utilization: System core count: 4 + Number of expected cores: 4 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 49.725 + Mean sum percent: 119.7 + Mean hardware percent: 29.925 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 35.55 + Mean sum percent: 129.733 + Mean hardware percent: 32.433 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 77.9 + 
Mean sum percent: 249.433 + Mean hardware percent: 62.358 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: days Time: 0.003 \ No newline at end of file diff --git a/tests/data/False-Linux-kilobytes-gigabytes-minutes.json b/tests/data/False-Linux-kilobytes-gigabytes-minutes.json index 3e93fab..b970b71 100644 --- a/tests/data/False-Linux-kilobytes-gigabytes-minutes.json +++ b/tests/data/False-Linux-kilobytes-gigabytes-minutes.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 24.396, - "system": 5.8, + "system_capacity": 36.594, + "system": 6.5, "main": 1.6, "descendents": 4.3, "combined": 5.8 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 2, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 99.45, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 59.85 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 71.1, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 64.86666666666666 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + 
"max_sum_percent": 311.6, + "max_hardware_percent": 155.8, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 124.71666666666665 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 75.0, + "max_hardware_percent": 75.0, + "mean_sum_percent": 43.333333333333336, + "mean_hardware_percent": 43.333333333333336 + } + }, "compute_time": { "unit": "minutes", "time": 5.0 diff --git a/tests/data/False-Linux-kilobytes-gigabytes-minutes.txt b/tests/data/False-Linux-kilobytes-gigabytes-minutes.txt index 8ca2677..53c1719 100644 --- a/tests/data/False-Linux-kilobytes-gigabytes-minutes.txt +++ b/tests/data/False-Linux-kilobytes-gigabytes-minutes.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22.017 Max GPU RAM: Unit: gigabytes - System capacity: 24.396 - System: 5.8 + System capacity: 36.594 + System: 6.5 Main: 1.6 Descendents: 4.3 Combined: 5.8 CPU utilization: System core count: 4 + Number of expected cores: 2 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 99.45 + Mean sum percent: 119.7 + Mean hardware percent: 59.85 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 71.1 + Mean sum percent: 129.733 + Mean hardware percent: 64.867 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 155.8 + Mean sum percent: 249.433 + Mean hardware percent: 124.717 Main 
number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 75.0 + Max hardware percent: 75.0 + Mean sum percent: 43.333 + Mean hardware percent: 43.333 Compute time: Unit: minutes Time: 5.0 \ No newline at end of file diff --git a/tests/data/False-Linux-megabytes-kilobytes-hours.json b/tests/data/False-Linux-megabytes-kilobytes-hours.json index e5c30ec..e856451 100644 --- a/tests/data/False-Linux-megabytes-kilobytes-hours.json +++ b/tests/data/False-Linux-megabytes-kilobytes-hours.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "kilobytes", - "system_capacity": 24396000.0, - "system": 5800000.0, + "system_capacity": 36594000.0, + "system": 6500000.0, "main": 1600000.0, "descendents": 4300000.0, "combined": 5800000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 1, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 198.9, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 119.7 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 142.2, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 129.73333333333332 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + 
"max_hardware_percent": 311.6, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 249.4333333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 2, + "gpu_percentages": { + "max_sum_percent": 125.0, + "max_hardware_percent": 62.5, + "mean_sum_percent": 75.0, + "mean_hardware_percent": 37.5 + } + }, "compute_time": { "unit": "hours", "time": 0.08333333333333333 diff --git a/tests/data/False-Linux-megabytes-kilobytes-hours.txt b/tests/data/False-Linux-megabytes-kilobytes-hours.txt index f6d8b0e..51f338b 100644 --- a/tests/data/False-Linux-megabytes-kilobytes-hours.txt +++ b/tests/data/False-Linux-megabytes-kilobytes-hours.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.022 Max GPU RAM: Unit: kilobytes - System capacity: 24396000.0 - System: 5800000.0 + System capacity: 36594000.0 + System: 6500000.0 Main: 1600000.0 Descendents: 4300000.0 Combined: 5800000.0 CPU utilization: System core count: 4 + Number of expected cores: 1 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 198.9 + Mean sum percent: 119.7 + Mean hardware percent: 119.7 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 142.2 + Mean sum percent: 129.733 + Mean hardware percent: 129.733 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 311.6 + Mean sum percent: 249.433 + Mean hardware percent: 249.433 Main number of 
threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 2 + GPU percentages: + Max sum percent: 125.0 + Max hardware percent: 62.5 + Mean sum percent: 75.0 + Mean hardware percent: 37.5 Compute time: Unit: hours Time: 0.083 \ No newline at end of file diff --git a/tests/data/False-not-linux-bytes-megabytes-seconds.json b/tests/data/False-not-linux-bytes-megabytes-seconds.json index 014365f..62e2a04 100644 --- a/tests/data/False-not-linux-bytes-megabytes-seconds.json +++ b/tests/data/False-not-linux-bytes-megabytes-seconds.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "megabytes", - "system_capacity": 24396.0, - "system": 5800.0, + "system_capacity": 36594.0, + "system": 6500.0, "main": 1600.0, "descendents": 4300.0, "combined": 5800.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 3, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 66.3, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 39.9 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 47.4, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 43.24444444444445 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 103.86666666666667, + 
"mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 83.14444444444445 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "seconds", "time": 300.0 diff --git a/tests/data/False-not-linux-bytes-megabytes-seconds.txt b/tests/data/False-not-linux-bytes-megabytes-seconds.txt index 3814148..fd05eb2 100644 --- a/tests/data/False-not-linux-bytes-megabytes-seconds.txt +++ b/tests/data/False-not-linux-bytes-megabytes-seconds.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: megabytes - System capacity: 24396.0 - System: 5800.0 + System capacity: 36594.0 + System: 6500.0 Main: 1600.0 Descendents: 4300.0 Combined: 5800.0 CPU utilization: System core count: 4 + Number of expected cores: 3 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 66.3 + Mean sum percent: 119.7 + Mean hardware percent: 39.9 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 47.4 + Mean sum percent: 129.733 + Mean hardware percent: 43.244 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 103.867 + Mean sum percent: 249.433 + Mean hardware percent: 83.144 Main number of threads: 2 Descendents number of threads: 6 Combined 
number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: seconds Time: 300.0 \ No newline at end of file diff --git a/tests/data/False-not-linux-kilobytes-bytes-days.json b/tests/data/False-not-linux-kilobytes-bytes-days.json index 539c545..f8be40a 100644 --- a/tests/data/False-not-linux-kilobytes-bytes-days.json +++ b/tests/data/False-not-linux-kilobytes-bytes-days.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "bytes", - "system_capacity": 24396000000.0, - "system": 5800000000.0, + "system_capacity": 36594000000.0, + "system": 6500000000.0, "main": 1600000000.0, "descendents": 4300000000.0, "combined": 5800000000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 4, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 49.725, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 29.925 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 35.55, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 32.43333333333333 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 77.9, + "mean_sum_percent": 
249.4333333333333, + "mean_hardware_percent": 62.35833333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "days", "time": 0.003472222222222222 diff --git a/tests/data/False-not-linux-kilobytes-bytes-days.txt b/tests/data/False-not-linux-kilobytes-bytes-days.txt index 74d156b..a26e309 100644 --- a/tests/data/False-not-linux-kilobytes-bytes-days.txt +++ b/tests/data/False-not-linux-kilobytes-bytes-days.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: bytes - System capacity: 24396000000.0 - System: 5800000000.0 + System capacity: 36594000000.0 + System: 6500000000.0 Main: 1600000000.0 Descendents: 4300000000.0 Combined: 5800000000.0 CPU utilization: System core count: 4 + Number of expected cores: 4 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 49.725 + Mean sum percent: 119.7 + Mean hardware percent: 29.925 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 35.55 + Mean sum percent: 129.733 + Mean hardware percent: 32.433 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 77.9 + Mean sum percent: 249.433 + Mean hardware percent: 62.358 Main number of threads: 2 Descendents number of 
threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: days Time: 0.003 \ No newline at end of file diff --git a/tests/data/False-not-linux-kilobytes-gigabytes-minutes.json b/tests/data/False-not-linux-kilobytes-gigabytes-minutes.json index ace2ffe..a01c201 100644 --- a/tests/data/False-not-linux-kilobytes-gigabytes-minutes.json +++ b/tests/data/False-not-linux-kilobytes-gigabytes-minutes.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 24.396, - "system": 5.8, + "system_capacity": 36.594, + "system": 6.5, "main": 1.6, "descendents": 4.3, "combined": 5.8 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 2, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 99.45, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 59.85 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 71.1, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 64.86666666666666 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 155.8, + "mean_sum_percent": 249.4333333333333, + 
"mean_hardware_percent": 124.71666666666665 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 75.0, + "max_hardware_percent": 75.0, + "mean_sum_percent": 43.333333333333336, + "mean_hardware_percent": 43.333333333333336 + } + }, "compute_time": { "unit": "minutes", "time": 5.0 diff --git a/tests/data/False-not-linux-kilobytes-gigabytes-minutes.txt b/tests/data/False-not-linux-kilobytes-gigabytes-minutes.txt index 00d0142..e3143e7 100644 --- a/tests/data/False-not-linux-kilobytes-gigabytes-minutes.txt +++ b/tests/data/False-not-linux-kilobytes-gigabytes-minutes.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: gigabytes - System capacity: 24.396 - System: 5.8 + System capacity: 36.594 + System: 6.5 Main: 1.6 Descendents: 4.3 Combined: 5.8 CPU utilization: System core count: 4 + Number of expected cores: 2 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 99.45 + Mean sum percent: 119.7 + Mean hardware percent: 59.85 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 71.1 + Mean sum percent: 129.733 + Mean hardware percent: 64.867 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 155.8 + Mean sum percent: 249.433 + Mean hardware percent: 124.717 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU 
utilization: + System GPU count: 3 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 75.0 + Max hardware percent: 75.0 + Mean sum percent: 43.333 + Mean hardware percent: 43.333 Compute time: Unit: minutes Time: 5.0 \ No newline at end of file diff --git a/tests/data/False-not-linux-megabytes-kilobytes-hours.json b/tests/data/False-not-linux-megabytes-kilobytes-hours.json index 15d697b..cb82585 100644 --- a/tests/data/False-not-linux-megabytes-kilobytes-hours.json +++ b/tests/data/False-not-linux-megabytes-kilobytes-hours.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "kilobytes", - "system_capacity": 24396000.0, - "system": 5800000.0, + "system_capacity": 36594000.0, + "system": 6500000.0, "main": 1600000.0, "descendents": 4300000.0, "combined": 5800000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 1, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 198.9, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 119.7 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 142.2, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 129.73333333333332 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 311.6, + "mean_sum_percent": 249.4333333333333, + 
"mean_hardware_percent": 249.4333333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 2, + "gpu_percentages": { + "max_sum_percent": 125.0, + "max_hardware_percent": 62.5, + "mean_sum_percent": 75.0, + "mean_hardware_percent": 37.5 + } + }, "compute_time": { "unit": "hours", "time": 0.08333333333333333 diff --git a/tests/data/False-not-linux-megabytes-kilobytes-hours.txt b/tests/data/False-not-linux-megabytes-kilobytes-hours.txt index ee50767..afdff2c 100644 --- a/tests/data/False-not-linux-megabytes-kilobytes-hours.txt +++ b/tests/data/False-not-linux-megabytes-kilobytes-hours.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: kilobytes - System capacity: 24396000.0 - System: 5800000.0 + System capacity: 36594000.0 + System: 6500000.0 Main: 1600000.0 Descendents: 4300000.0 Combined: 5800000.0 CPU utilization: System core count: 4 + Number of expected cores: 1 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 198.9 + Mean sum percent: 119.7 + Mean hardware percent: 119.7 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 142.2 + Mean sum percent: 129.733 + Mean hardware percent: 129.733 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 311.6 + Mean sum percent: 249.433 + Mean hardware percent: 249.433 Main number of threads: 2 Descendents number of threads: 6 Combined number of 
threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 2 + GPU percentages: + Max sum percent: 125.0 + Max hardware percent: 62.5 + Mean sum percent: 75.0 + Mean hardware percent: 37.5 Compute time: Unit: hours Time: 0.083 \ No newline at end of file diff --git a/tests/data/True-Linux-bytes-megabytes-seconds.json b/tests/data/True-Linux-bytes-megabytes-seconds.json index 6328498..3093f55 100644 --- a/tests/data/True-Linux-bytes-megabytes-seconds.json +++ b/tests/data/True-Linux-bytes-megabytes-seconds.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "megabytes", - "system_capacity": 24396.0, - "system": 5800.0, + "system_capacity": 36594.0, + "system": 6500.0, "main": 1600.0, "descendents": 4300.0, "combined": 5800.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 3, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 66.3, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 39.9 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 47.4, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 43.24444444444445 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 103.86666666666667, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 83.14444444444445 
}, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "seconds", "time": 300.0 diff --git a/tests/data/True-Linux-bytes-megabytes-seconds.txt b/tests/data/True-Linux-bytes-megabytes-seconds.txt index 16dbb28..d7c8650 100644 --- a/tests/data/True-Linux-bytes-megabytes-seconds.txt +++ b/tests/data/True-Linux-bytes-megabytes-seconds.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22017.0 Max GPU RAM: Unit: megabytes - System capacity: 24396.0 - System: 5800.0 + System capacity: 36594.0 + System: 6500.0 Main: 1600.0 Descendents: 4300.0 Combined: 5800.0 CPU utilization: System core count: 4 + Number of expected cores: 3 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 66.3 + Mean sum percent: 119.7 + Mean hardware percent: 39.9 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 47.4 + Mean sum percent: 129.733 + Mean hardware percent: 43.244 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 103.867 + Mean sum percent: 249.433 + Mean hardware percent: 83.144 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU 
percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: seconds Time: 300.0 \ No newline at end of file diff --git a/tests/data/True-Linux-kilobytes-bytes-days.json b/tests/data/True-Linux-kilobytes-bytes-days.json index 5279afa..d43d05a 100644 --- a/tests/data/True-Linux-kilobytes-bytes-days.json +++ b/tests/data/True-Linux-kilobytes-bytes-days.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "bytes", - "system_capacity": 24396000000.0, - "system": 5800000000.0, + "system_capacity": 36594000000.0, + "system": 6500000000.0, "main": 1600000000.0, "descendents": 4300000000.0, "combined": 5800000000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 4, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 49.725, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 29.925 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 35.55, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 32.43333333333333 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 77.9, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 62.35833333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, 
"combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "days", "time": 0.003472222222222222 diff --git a/tests/data/True-Linux-kilobytes-bytes-days.txt b/tests/data/True-Linux-kilobytes-bytes-days.txt index 21a8209..a81ccd0 100644 --- a/tests/data/True-Linux-kilobytes-bytes-days.txt +++ b/tests/data/True-Linux-kilobytes-bytes-days.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22.017 Max GPU RAM: Unit: bytes - System capacity: 24396000000.0 - System: 5800000000.0 + System capacity: 36594000000.0 + System: 6500000000.0 Main: 1600000000.0 Descendents: 4300000000.0 Combined: 5800000000.0 CPU utilization: System core count: 4 + Number of expected cores: 4 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 49.725 + Mean sum percent: 119.7 + Mean hardware percent: 29.925 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 35.55 + Mean sum percent: 129.733 + Mean hardware percent: 32.433 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 77.9 + Mean sum percent: 249.433 + Mean hardware percent: 62.358 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max 
sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: days Time: 0.003 \ No newline at end of file diff --git a/tests/data/True-Linux-kilobytes-gigabytes-minutes.json b/tests/data/True-Linux-kilobytes-gigabytes-minutes.json index 3e93fab..b970b71 100644 --- a/tests/data/True-Linux-kilobytes-gigabytes-minutes.json +++ b/tests/data/True-Linux-kilobytes-gigabytes-minutes.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 24.396, - "system": 5.8, + "system_capacity": 36.594, + "system": 6.5, "main": 1.6, "descendents": 4.3, "combined": 5.8 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 2, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 99.45, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 59.85 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 71.1, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 64.86666666666666 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 155.8, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 124.71666666666665 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + 
"system_gpu_count": 3, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 75.0, + "max_hardware_percent": 75.0, + "mean_sum_percent": 43.333333333333336, + "mean_hardware_percent": 43.333333333333336 + } + }, "compute_time": { "unit": "minutes", "time": 5.0 diff --git a/tests/data/True-Linux-kilobytes-gigabytes-minutes.txt b/tests/data/True-Linux-kilobytes-gigabytes-minutes.txt index 8ca2677..53c1719 100644 --- a/tests/data/True-Linux-kilobytes-gigabytes-minutes.txt +++ b/tests/data/True-Linux-kilobytes-gigabytes-minutes.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 22.017 Max GPU RAM: Unit: gigabytes - System capacity: 24.396 - System: 5.8 + System capacity: 36.594 + System: 6.5 Main: 1.6 Descendents: 4.3 Combined: 5.8 CPU utilization: System core count: 4 + Number of expected cores: 2 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 99.45 + Mean sum percent: 119.7 + Mean hardware percent: 59.85 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 71.1 + Mean sum percent: 129.733 + Mean hardware percent: 64.867 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 155.8 + Mean sum percent: 249.433 + Mean hardware percent: 124.717 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 75.0 + Max hardware percent: 75.0 + Mean sum percent: 43.333 + Mean 
hardware percent: 43.333 Compute time: Unit: minutes Time: 5.0 \ No newline at end of file diff --git a/tests/data/True-Linux-megabytes-kilobytes-hours.json b/tests/data/True-Linux-megabytes-kilobytes-hours.json index e5c30ec..e856451 100644 --- a/tests/data/True-Linux-megabytes-kilobytes-hours.json +++ b/tests/data/True-Linux-megabytes-kilobytes-hours.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "kilobytes", - "system_capacity": 24396000.0, - "system": 5800000.0, + "system_capacity": 36594000.0, + "system": 6500000.0, "main": 1600000.0, "descendents": 4300000.0, "combined": 5800000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 1, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 198.9, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 119.7 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 142.2, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 129.73333333333332 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 311.6, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 249.4333333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 2, + 
"gpu_percentages": { + "max_sum_percent": 125.0, + "max_hardware_percent": 62.5, + "mean_sum_percent": 75.0, + "mean_hardware_percent": 37.5 + } + }, "compute_time": { "unit": "hours", "time": 0.08333333333333333 diff --git a/tests/data/True-Linux-megabytes-kilobytes-hours.txt b/tests/data/True-Linux-megabytes-kilobytes-hours.txt index f6d8b0e..51f338b 100644 --- a/tests/data/True-Linux-megabytes-kilobytes-hours.txt +++ b/tests/data/True-Linux-megabytes-kilobytes-hours.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.022 Max GPU RAM: Unit: kilobytes - System capacity: 24396000.0 - System: 5800000.0 + System capacity: 36594000.0 + System: 6500000.0 Main: 1600000.0 Descendents: 4300000.0 Combined: 5800000.0 CPU utilization: System core count: 4 + Number of expected cores: 1 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 198.9 + Mean sum percent: 119.7 + Mean hardware percent: 119.7 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 142.2 + Mean sum percent: 129.733 + Mean hardware percent: 129.733 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 311.6 + Mean sum percent: 249.433 + Mean hardware percent: 249.433 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 2 + GPU percentages: + Max sum percent: 125.0 + Max hardware percent: 62.5 + Mean sum percent: 75.0 + Mean hardware percent: 37.5 Compute 
time: Unit: hours Time: 0.083 \ No newline at end of file diff --git a/tests/data/True-not-linux-bytes-megabytes-seconds.json b/tests/data/True-not-linux-bytes-megabytes-seconds.json index 014365f..62e2a04 100644 --- a/tests/data/True-not-linux-bytes-megabytes-seconds.json +++ b/tests/data/True-not-linux-bytes-megabytes-seconds.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "megabytes", - "system_capacity": 24396.0, - "system": 5800.0, + "system_capacity": 36594.0, + "system": 6500.0, "main": 1600.0, "descendents": 4300.0, "combined": 5800.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 3, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 66.3, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 39.9 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 47.4, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 43.24444444444445 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 103.86666666666667, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 83.14444444444445 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 
150.0, + "max_hardware_percent": 50.0, + "mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "seconds", "time": 300.0 diff --git a/tests/data/True-not-linux-bytes-megabytes-seconds.txt b/tests/data/True-not-linux-bytes-megabytes-seconds.txt index 3814148..fd05eb2 100644 --- a/tests/data/True-not-linux-bytes-megabytes-seconds.txt +++ b/tests/data/True-not-linux-bytes-megabytes-seconds.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: megabytes - System capacity: 24396.0 - System: 5800.0 + System capacity: 36594.0 + System: 6500.0 Main: 1600.0 Descendents: 4300.0 Combined: 5800.0 CPU utilization: System core count: 4 + Number of expected cores: 3 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 66.3 + Mean sum percent: 119.7 + Mean hardware percent: 39.9 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 47.4 + Mean sum percent: 129.733 + Mean hardware percent: 43.244 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 103.867 + Mean sum percent: 249.433 + Mean hardware percent: 83.144 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: seconds Time: 300.0 \ No newline at end of 
file diff --git a/tests/data/True-not-linux-kilobytes-bytes-days.json b/tests/data/True-not-linux-kilobytes-bytes-days.json index 539c545..f8be40a 100644 --- a/tests/data/True-not-linux-kilobytes-bytes-days.json +++ b/tests/data/True-not-linux-kilobytes-bytes-days.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "bytes", - "system_capacity": 24396000000.0, - "system": 5800000000.0, + "system_capacity": 36594000000.0, + "system": 6500000000.0, "main": 1600000000.0, "descendents": 4300000000.0, "combined": 5800000000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 4, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 49.725, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 29.925 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 35.55, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 32.43333333333333 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 77.9, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 62.35833333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 3, + "gpu_percentages": { + "max_sum_percent": 150.0, + "max_hardware_percent": 50.0, + 
"mean_sum_percent": 95.0, + "mean_hardware_percent": 31.666666666666668 + } + }, "compute_time": { "unit": "days", "time": 0.003472222222222222 diff --git a/tests/data/True-not-linux-kilobytes-bytes-days.txt b/tests/data/True-not-linux-kilobytes-bytes-days.txt index 74d156b..a26e309 100644 --- a/tests/data/True-not-linux-kilobytes-bytes-days.txt +++ b/tests/data/True-not-linux-kilobytes-bytes-days.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: bytes - System capacity: 24396000000.0 - System: 5800000000.0 + System capacity: 36594000000.0 + System: 6500000000.0 Main: 1600000000.0 Descendents: 4300000000.0 Combined: 5800000000.0 CPU utilization: System core count: 4 + Number of expected cores: 4 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 49.725 + Mean sum percent: 119.7 + Mean hardware percent: 29.925 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 35.55 + Mean sum percent: 129.733 + Mean hardware percent: 32.433 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 77.9 + Mean sum percent: 249.433 + Mean hardware percent: 62.358 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 3 + GPU percentages: + Max sum percent: 150.0 + Max hardware percent: 50.0 + Mean sum percent: 95.0 + Mean hardware percent: 31.667 Compute time: Unit: days Time: 0.003 \ No newline at end of file 
diff --git a/tests/data/True-not-linux-kilobytes-gigabytes-minutes.json b/tests/data/True-not-linux-kilobytes-gigabytes-minutes.json index ace2ffe..a01c201 100644 --- a/tests/data/True-not-linux-kilobytes-gigabytes-minutes.json +++ b/tests/data/True-not-linux-kilobytes-gigabytes-minutes.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "gigabytes", - "system_capacity": 24.396, - "system": 5.8, + "system_capacity": 36.594, + "system": 6.5, "main": 1.6, "descendents": 4.3, "combined": 5.8 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 2, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 99.45, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 59.85 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 71.1, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 64.86666666666666 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 155.8, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 124.71666666666665 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 1, + "gpu_percentages": { + "max_sum_percent": 75.0, + "max_hardware_percent": 75.0, + "mean_sum_percent": 
43.333333333333336, + "mean_hardware_percent": 43.333333333333336 + } + }, "compute_time": { "unit": "minutes", "time": 5.0 diff --git a/tests/data/True-not-linux-kilobytes-gigabytes-minutes.txt b/tests/data/True-not-linux-kilobytes-gigabytes-minutes.txt index 00d0142..e3143e7 100644 --- a/tests/data/True-not-linux-kilobytes-gigabytes-minutes.txt +++ b/tests/data/True-not-linux-kilobytes-gigabytes-minutes.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: gigabytes - System capacity: 24.396 - System: 5.8 + System capacity: 36.594 + System: 6.5 Main: 1.6 Descendents: 4.3 Combined: 5.8 CPU utilization: System core count: 4 + Number of expected cores: 2 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 99.45 + Mean sum percent: 119.7 + Mean hardware percent: 59.85 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 71.1 + Mean sum percent: 129.733 + Mean hardware percent: 64.867 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 155.8 + Mean sum percent: 249.433 + Mean hardware percent: 124.717 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 1 + GPU percentages: + Max sum percent: 75.0 + Max hardware percent: 75.0 + Mean sum percent: 43.333 + Mean hardware percent: 43.333 Compute time: Unit: minutes Time: 5.0 \ No newline at end of file diff --git 
a/tests/data/True-not-linux-megabytes-kilobytes-hours.json b/tests/data/True-not-linux-megabytes-kilobytes-hours.json index 15d697b..cb82585 100644 --- a/tests/data/True-not-linux-megabytes-kilobytes-hours.json +++ b/tests/data/True-not-linux-megabytes-kilobytes-hours.json @@ -21,42 +21,53 @@ }, "max_gpu_ram": { "unit": "kilobytes", - "system_capacity": 24396000.0, - "system": 5800000.0, + "system_capacity": 36594000.0, + "system": 6500000.0, "main": 1600000.0, "descendents": 4300000.0, "combined": 5800000.0 }, "cpu_utilization": { "system_core_count": 4, + "n_expected_cores": 1, "system": { - "max_core_percent": 276.3, - "max_cpu_percent": 69.075, - "mean_core_percent": 261.9666666666667, - "mean_cpu_percent": 65.49166666666667 + "max_sum_percent": 276.3, + "max_hardware_percent": 69.075, + "mean_sum_percent": 261.9666666666667, + "mean_hardware_percent": 65.49166666666667 }, "main": { - "max_core_percent": 198.9, - "max_cpu_percent": 49.725, - "mean_core_percent": 119.7, - "mean_cpu_percent": 29.925 + "max_sum_percent": 198.9, + "max_hardware_percent": 198.9, + "mean_sum_percent": 119.7, + "mean_hardware_percent": 119.7 }, "descendents": { - "max_core_percent": 142.2, - "max_cpu_percent": 35.55, - "mean_core_percent": 129.73333333333332, - "mean_cpu_percent": 32.43333333333333 + "max_sum_percent": 142.2, + "max_hardware_percent": 142.2, + "mean_sum_percent": 129.73333333333332, + "mean_hardware_percent": 129.73333333333332 }, "combined": { - "max_core_percent": 311.6, - "max_cpu_percent": 77.9, - "mean_core_percent": 249.4333333333333, - "mean_cpu_percent": 62.35833333333333 + "max_sum_percent": 311.6, + "max_hardware_percent": 311.6, + "mean_sum_percent": 249.4333333333333, + "mean_hardware_percent": 249.4333333333333 }, "main_n_threads": 2, "descendents_n_threads": 6, "combined_n_threads": 8 }, + "gpu_utilization": { + "system_gpu_count": 3, + "n_expected_gpus": 2, + "gpu_percentages": { + "max_sum_percent": 125.0, + "max_hardware_percent": 62.5, + 
"mean_sum_percent": 75.0, + "mean_hardware_percent": 37.5 + } + }, "compute_time": { "unit": "hours", "time": 0.08333333333333333 diff --git a/tests/data/True-not-linux-megabytes-kilobytes-hours.txt b/tests/data/True-not-linux-megabytes-kilobytes-hours.txt index ee50767..afdff2c 100644 --- a/tests/data/True-not-linux-megabytes-kilobytes-hours.txt +++ b/tests/data/True-not-linux-megabytes-kilobytes-hours.txt @@ -16,36 +16,45 @@ Max RAM: Shared RSS: 0.0 Max GPU RAM: Unit: kilobytes - System capacity: 24396000.0 - System: 5800000.0 + System capacity: 36594000.0 + System: 6500000.0 Main: 1600000.0 Descendents: 4300000.0 Combined: 5800000.0 CPU utilization: System core count: 4 + Number of expected cores: 1 System: - Max core percent: 276.3 - Max CPU percent: 69.075 - Mean core percent: 261.967 - Mean CPU percent: 65.492 + Max sum percent: 276.3 + Max hardware percent: 69.075 + Mean sum percent: 261.967 + Mean hardware percent: 65.492 Main: - Max core percent: 198.9 - Max CPU percent: 49.725 - Mean core percent: 119.7 - Mean CPU percent: 29.925 + Max sum percent: 198.9 + Max hardware percent: 198.9 + Mean sum percent: 119.7 + Mean hardware percent: 119.7 Descendents: - Max core percent: 142.2 - Max CPU percent: 35.55 - Mean core percent: 129.733 - Mean CPU percent: 32.433 + Max sum percent: 142.2 + Max hardware percent: 142.2 + Mean sum percent: 129.733 + Mean hardware percent: 129.733 Combined: - Max core percent: 311.6 - Max CPU percent: 77.9 - Mean core percent: 249.433 - Mean CPU percent: 62.358 + Max sum percent: 311.6 + Max hardware percent: 311.6 + Mean sum percent: 249.433 + Mean hardware percent: 249.433 Main number of threads: 2 Descendents number of threads: 6 Combined number of threads: 8 +GPU utilization: + System GPU count: 3 + Number of expected GPUs: 2 + GPU percentages: + Max sum percent: 125.0 + Max hardware percent: 62.5 + Mean sum percent: 75.0 + Mean hardware percent: 37.5 Compute time: Unit: hours Time: 0.083 \ No newline at end of file diff --git 
a/tests/test_cli.py b/tests/test_cli.py new file mode 100644 index 0000000..e4aa805 --- /dev/null +++ b/tests/test_cli.py @@ -0,0 +1,86 @@ +import gpu_tracker.__main__ as cli +import pytest as pt +import os +import utils + + +@pt.fixture(name='format_', params=['text', 'json', None]) +def get_format(request) -> str | None: + yield request.param + + +@pt.fixture(name='output', params=['my-file', None]) +def get_output(request) -> str | None: + yield request.param + + +test_data = [ + (['-e', 'my-command', '--ru=kilobytes'], ['my-command'], {'disable_logs': False, 'ram_unit': 'kilobytes'}), + (['--execute', 'my-command arg1 ', '--disable-logs'], ['my-command', 'arg1'], {'disable_logs': True}), + (['--execute=my-command arg1 arg2', '--st=0.4'], ['my-command', 'arg1', 'arg2'], {'disable_logs': False, 'sleep_time': 0.4}), + ( + ['-e', 'my-command', '--gru=megabytes', '--tu=days'], ['my-command'], + {'disable_logs': False, 'gpu_ram_unit': 'megabytes', 'time_unit': 'days'}), + ( + ['-e', 'my-command', '--nec=3', '--guuids=gpu-id1,gpu-id2,gpu-id3'], ['my-command'], + {'disable_logs': False, 'n_expected_cores': 3, 'gpu_uuids': {'gpu-id1', 'gpu-id2', 'gpu-id3'}}), + (['-e', 'my-command', '--guuids=gpu-id1'], ['my-command'], {'disable_logs': False, 'gpu_uuids': {'gpu-id1'}})] + + +@pt.mark.parametrize('argv,command,kwargs', test_data) +def test_main(mocker, argv: list[str], command: list[str], kwargs: dict, format_: str | None, output: str | None): + argv = ['gpu-tracker'] + argv + argv += ['-f', format_] if format_ else [] + argv += ['-o', output] if output else [] + mocker.patch('sys.argv', argv) + process_mock = mocker.MagicMock(returncode=0, pid=666) + subprocess_mock = mocker.patch('gpu_tracker.__main__.subp', Popen=mocker.MagicMock(return_value=process_mock)) + tracker_str = 'tracker-str' + tracker_json = {'tracker': 'json'} + tracker_mock = mocker.MagicMock( + __str__=mocker.MagicMock(return_value=tracker_str), to_json=mocker.MagicMock(return_value=tracker_json), 
__enter__=lambda self: self) + TrackerMock = mocker.patch('gpu_tracker.__main__.Tracker', return_value=tracker_mock) + print_mock = mocker.patch('builtins.print') + cli.main() + TrackerMock.assert_called_with(process_id=process_mock.pid, **kwargs) + subprocess_mock.Popen.assert_called_once_with(command) + process_mock.wait.assert_called_once_with() + if format_ == 'text' or format_ is None: + tracker_mock.__str__.assert_called_once_with() + output_str = tracker_str + else: + tracker_mock.to_json.assert_called_once_with() + output_str = '{\n "tracker": "json"\n}' + print_args = [('Resource tracking complete. Process completed with status code: 0',)] + if output is None: + print_args.append((output_str,)) + else: + with open(output, 'r') as file: + assert output_str == file.read() + os.remove(output) + utils.assert_args_list(print_mock, print_args) + + +error_data = [ + (['-e '], 'Empty command provided.'), (['-e', 'my-command'], 'Command not found: "my-command"'), + (['-e', 'my-command'], f'The following error occurred when starting the command "my-command":'), + (['-e', 'my-command', '-f', 'invalid-format'], '"invalid-format" is not a valid format. 
Valid values are "json" or "text".')] + + +@pt.mark.parametrize('argv,error_message', error_data) +def test_errors(mocker, argv: list[str], error_message: str): + argv = ['gpu-tracker'] + argv + mocker.patch('sys.argv', argv) + if 'Command not found' in error_message: + popen_side_effect = FileNotFoundError + elif 'The following error occurred' in error_message: + popen_side_effect = Exception + else: + popen_side_effect = mocker.MagicMock() + mocker.patch('gpu_tracker.__main__.subp.Popen', side_effect=popen_side_effect) + log_mock = mocker.patch('gpu_tracker.__main__.log', error=mocker.MagicMock()) + mocker.patch('gpu_tracker.__main__.Tracker') + with pt.raises(SystemExit) as error: + cli.main() + assert str(error.value) == '1' + log_mock.error.assert_called_once_with(error_message) diff --git a/tests/test_tracker.py b/tests/test_tracker.py index 8841aa7..10cbc7b 100644 --- a/tests/test_tracker.py +++ b/tests/test_tracker.py @@ -1,7 +1,12 @@ import gpu_tracker as gput +import psutil import json import os import pytest as pt +import utils + +nvidia_smi_unavailable_message = 'The nvidia-smi command is not available. Please install the Nvidia drivers to track GPU usage. 
' \ + 'Otherwise the Max GPU RAM values will remain 0.0' @pt.fixture(name='operating_system', params=['Linux', 'not-linux']) @@ -19,15 +24,17 @@ def multiply_list(_list: list, multiple=2) -> list: test_tracker_data = [ - ('bytes', 'megabytes', 'seconds'), - ('kilobytes', 'gigabytes', 'minutes'), - ('megabytes', 'kilobytes', 'hours'), - ('kilobytes', 'bytes', 'days') + ('bytes', 'megabytes', 'seconds', None, 3), + ('kilobytes', 'gigabytes', 'minutes', {'gpu-id1'}, 2), + ('megabytes', 'kilobytes', 'hours', {'gpu-id1', 'gpu-id2'}, 1), + ('kilobytes', 'bytes', 'days', {'gpu-id1', 'gpu-id2', 'gpu-id3'}, None) ] -@pt.mark.parametrize('ram_unit,gpu_ram_unit,time_unit', test_tracker_data) -def test_tracker(mocker, use_context_manager: bool, operating_system: str, ram_unit: str, gpu_ram_unit: str, time_unit: str): +@pt.mark.parametrize('ram_unit,gpu_ram_unit,time_unit,gpu_uuids,n_expected_cores', test_tracker_data) +def test_tracker( + mocker, use_context_manager: bool, operating_system: str, ram_unit: str, gpu_ram_unit: str, time_unit: str, gpu_uuids: set[str], + n_expected_cores: int): class EventMock: def __init__(self): self.count = 0 @@ -126,13 +133,13 @@ def start_mock(self): mocker.MagicMock(used=29 * 1e9)]) nvidia_smi_outputs = [ b'', - b'12198 MiB\n12198 MiB', - b'', - b'', - b'12,1600 MiB\n21,700 MiB\n22,200 MiB', - b'1600 MiB\n900 MiB', - b'12,1500 MiB\n21,2100 MiB\n22,2200 MiB', - b'1500 MiB\n4300 MiB'] + b' uuid,memory.total [MiB]\ngpu-id1,12198 MiB\ngpu-id2,12198 MiB\ngpu-id3 , 12198MiB', + b'pid, used_gpu_memory [MiB]\n', + b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 0 MiB, 0 %\ngpu-id2 , 0 MiB, 0 %\ngpu-id3 , 0 MiB, 0 %', + b'pid, used_gpu_memory [MiB]\n12,1600 MiB\n21,700 MiB\n22,200 MiB', + b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1600 MiB,75 %\ngpu-id2,900 MiB , 50 %\n gpu-id3, 500 MiB, 25 %', + b'pid, used_gpu_memory [MiB]\n12,1500 MiB\n21,2100 MiB\n22,2200 MiB', + b'uuid, memory.used [MiB], utilization.gpu [%]\ngpu-id1, 1500 
MiB, 55 %\n gpu-id2, 4300 MiB, 45%\ngpu-id3,700MiB,35%'] check_output_mock = mocker.patch('gpu_tracker.tracker.subp.check_output', side_effect=nvidia_smi_outputs) cpu_count_mock = mocker.patch('gpu_tracker.tracker.psutil.cpu_count', return_value=4) cpu_percent_mock = mocker.patch( @@ -146,45 +153,46 @@ def start_mock(self): if use_context_manager: with gput.Tracker( sleep_time=sleep_time, join_timeout=join_timeout, ram_unit=ram_unit, gpu_ram_unit=gpu_ram_unit, - time_unit=time_unit) as tracker: + time_unit=time_unit, gpu_uuids=gpu_uuids, n_expected_cores=n_expected_cores) as tracker: pass else: tracker = gput.Tracker( - sleep_time=sleep_time, join_timeout=join_timeout, ram_unit=ram_unit, gpu_ram_unit=gpu_ram_unit, time_unit=time_unit) + sleep_time=sleep_time, join_timeout=join_timeout, ram_unit=ram_unit, gpu_ram_unit=gpu_ram_unit, time_unit=time_unit, + gpu_uuids=gpu_uuids, n_expected_cores=n_expected_cores) tracker.start() tracker.stop() assert start_mock.called assert not os.path.isfile(tracker._resource_usage_file) assert not log_spy.called - _assert_args_list(virtual_memory_mock, [()] * 4) + utils.assert_args_list(virtual_memory_mock, [()] * 4) system_mock.assert_called_once_with() EventMock.assert_called_once_with() - _assert_args_list(mock=tracker._stop_event.is_set, expected_args_list=[()] * 4) - _assert_args_list(mock=PsProcessMock, expected_args_list=[(main_process_id,)] * 2) - _assert_args_list(current_process_mock.children, [()] * 2) - _assert_args_list(mock=main_process_mock.children, expected_args_list=[{'recursive': True}] * 3, use_kwargs=True) + utils.assert_args_list(mock=tracker._stop_event.is_set, expected_args_list=[()] * 4) + utils.assert_args_list(mock=PsProcessMock, expected_args_list=[(main_process_id,)] * 2) + utils.assert_args_list(current_process_mock.children, [()] * 2) + utils.assert_args_list(mock=main_process_mock.children, expected_args_list=[{'recursive': True}] * 3, use_kwargs=True) if operating_system == 'Linux': - 
_assert_args_list(mock=main_process_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) - _assert_args_list(mock=child1_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) - _assert_args_list(mock=child2_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) + utils.assert_args_list(mock=main_process_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) + utils.assert_args_list(mock=child1_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) + utils.assert_args_list(mock=child2_mock.memory_maps, expected_args_list=[{'grouped': False}] * 6, use_kwargs=True) else: - _assert_args_list(mock=main_process_mock.memory_info, expected_args_list=[()] * 6) - _assert_args_list(mock=child1_mock.memory_info, expected_args_list=[()] * 6) - _assert_args_list(mock=child2_mock.memory_info, expected_args_list=[()] * 6) + utils.assert_args_list(mock=main_process_mock.memory_info, expected_args_list=[()] * 6) + utils.assert_args_list(mock=child1_mock.memory_info, expected_args_list=[()] * 6) + utils.assert_args_list(mock=child2_mock.memory_info, expected_args_list=[()] * 6) assert len(check_output_mock.call_args_list) == 8 os_mock.getpid.assert_called_once_with() - _assert_args_list(mock=time_mock.time, expected_args_list=[()] * 5) + utils.assert_args_list(mock=time_mock.time, expected_args_list=[()] * 5) cpu_percent_interval = gput.tracker._TrackingProcess._CPU_PERCENT_INTERVAL true_sleep_time = sleep_time - cpu_percent_interval - _assert_args_list( + utils.assert_args_list( mock=time_mock.sleep, expected_args_list=[(cpu_percent_interval,), (true_sleep_time,)] * 3) tracker._stop_event.set.assert_called_once_with() tracker._tracking_process.join.assert_called_once_with(timeout=join_timeout) - _assert_args_list(mock=tracker._tracking_process.is_alive, expected_args_list=[()] * 2) + utils.assert_args_list(mock=tracker._tracking_process.is_alive, 
expected_args_list=[()] * 2) assert not tracker._tracking_process.terminate.called tracker._tracking_process.close.assert_called_once_with() cpu_count_mock.assert_called_once_with() - _assert_args_list(cpu_percent_mock, [()] * 3) + utils.assert_args_list(cpu_percent_mock, [()] * 3) expected_measurements_file = f'tests/data/{use_context_manager}-{operating_system}-{ram_unit}-{gpu_ram_unit}-{time_unit}' with open(f'{expected_measurements_file}.txt', 'r') as file: expected_tracker_str = file.read() @@ -194,12 +202,7 @@ def start_mock(self): assert expected_measurements == tracker.to_json() -def _assert_args_list(mock, expected_args_list: list[tuple | dict], use_kwargs: bool = False): - actual_args_list = [call.kwargs if use_kwargs else call.args for call in mock.call_args_list] - assert actual_args_list == expected_args_list - - -def test_warnings(mocker, caplog): +def test_main_process_warnings(mocker, caplog): n_join_attempts = 3 join_timeout = 5.2 subprocess_mock = mocker.patch('gpu_tracker.tracker.subp', check_output=mocker.MagicMock(side_effect=FileNotFoundError)) @@ -212,34 +215,88 @@ def test_warnings(mocker, caplog): with gput.Tracker(n_join_attempts=n_join_attempts, join_timeout=join_timeout) as tracker: set_spy = mocker.spy(tracker._stop_event, 'set') subprocess_mock.check_output.assert_called_once() - _assert_args_list(mock=set_spy, expected_args_list=[()] * n_join_attempts) - _assert_args_list( + utils.assert_args_list(mock=set_spy, expected_args_list=[()] * n_join_attempts) + utils.assert_args_list( mock=join_spy, expected_args_list=[{'timeout': join_timeout}] * n_join_attempts, use_kwargs=True) - _assert_args_list(mock=tracker._tracking_process.is_alive, expected_args_list=[()] * (n_join_attempts + 1)) + utils.assert_args_list(mock=tracker._tracking_process.is_alive, expected_args_list=[()] * (n_join_attempts + 1)) terminate_spy.assert_called_once() close_spy.assert_called_once() - expected_warnings = [ - 'The nvidia-smi command is not available. 
Please install the Nvidia drivers to track GPU usage. ' - 'Otherwise the Max GPU RAM values will remain 0.0'] + expected_warnings = [nvidia_smi_unavailable_message] expected_warnings += ['The tracking process is still alive after join timout. Attempting to join again...'] * n_join_attempts expected_warnings.append( 'The tracking process is still alive after 3 attempts to join. Terminating the process by force...') expected_warnings.append( 'Tracking is stopping and it has been 11.0 seconds since the temporary tracking results file was last updated. ' 'Resource usage was not updated during that time.') + assert not os.path.isfile(tracker._resource_usage_file) + _assert_warnings(caplog, expected_warnings) + + +def _assert_warnings(caplog, expected_warnings: list[str]): for expected_warning, record in zip(expected_warnings, caplog.records): assert record.levelname == 'WARNING' assert record.message == expected_warning - assert not os.path.isfile(tracker._resource_usage_file) -def test_validate_unit(): +@pt.fixture(name='disable_logs', params=[True, False]) +def get_disable_logs(request) -> bool: + yield request.param + + +def test_tracking_process_warnings(mocker, disable_logs: bool, caplog): + main_process_id = 666 + child_process_id = 777 + error_message = 'Unexpected error' + ProcessMock = mocker.patch( + 'gpu_tracker.tracker.psutil.Process', + side_effect=[ + mocker.MagicMock(), psutil.NoSuchProcess(pid=666), mocker.MagicMock(), + mocker.MagicMock(children=mocker.MagicMock( + side_effect=[psutil.NoSuchProcess(child_process_id), RuntimeError(error_message)]))]) + subprocess_mock = mocker.patch('gpu_tracker.tracker.subp', check_output=mocker.MagicMock(side_effect=FileNotFoundError)) + log_spy = mocker.spy(gput.tracker.log, 'warning') + tracker = gput.Tracker(process_id=main_process_id, disable_logs=disable_logs) + tracker._tracking_process.run() + os.remove(tracker._resource_usage_file) + mocker.patch( + 'gpu_tracker.tracker.mproc.Event', 
return_value=mocker.MagicMock(is_set=mocker.MagicMock(side_effect=[False, False, True]))) + print_mock = mocker.patch('builtins.print') + tracker = gput.Tracker(process_id=main_process_id, disable_logs=disable_logs) + tracker._tracking_process.run() + os.remove(tracker._resource_usage_file) + utils.assert_args_list(ProcessMock, [(os.getpid(),), (main_process_id,), (os.getpid(),), (main_process_id,)]) + [printed] = print_mock.call_args_list + [printed] = printed.args + assert error_message == str(printed) + assert len(subprocess_mock.check_output.call_args_list) == 2 + if disable_logs: + assert not log_spy.called + else: + expected_warnings = [ + nvidia_smi_unavailable_message, 'The target process of ID 666 ended before tracking could begin.', nvidia_smi_unavailable_message, + 'Failed to track a process (PID: 777) that does not exist. This possibly resulted from the process completing before it could be tracked.', + 'The following uncaught exception occurred in the tracking process:'] + _assert_warnings(caplog, expected_warnings) + + +def test_validate_arguments(mocker): with pt.raises(ValueError) as error: gput.Tracker(sleep_time=0.0) assert str(error.value) == 'Sleep time of 0.0 is invalid. Must be at least 0.1 seconds.' with pt.raises(ValueError) as error: gput.Tracker(ram_unit='milibytes') assert str(error.value) == '"milibytes" is not a valid RAM unit. Valid values are bytes, gigabytes, kilobytes, megabytes, terabytes' + subprocess_mock = mocker.patch( + 'gpu_tracker.tracker.subp', check_output=mocker.MagicMock( + side_effect=[b'', b'uuid ,memory.total [MiB] \ngpu-id1,2048 MiB\ngpu-id2,2048 MiB', b'', b'uuid ,memory.total [MiB] '])) + with pt.raises(ValueError) as error: + gput.Tracker(gpu_uuids={'invalid-id'}) + assert len(subprocess_mock.check_output.call_args_list) == 2 + assert str(error.value) == 'GPU UUID of invalid-id is not valid. 
Available UUIDs are: gpu-id1, gpu-id2' + with pt.raises(ValueError) as error: + gput.Tracker(gpu_uuids=set[str]()) + assert len(subprocess_mock.check_output.call_args_list) == 4 + assert str(error.value) == 'gpu_uuids is not None but the set is empty. Please provide a set of at least one GPU UUID.' def test_state(mocker): diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..cb9f7f9 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,3 @@ +def assert_args_list(mock, expected_args_list: list[tuple | dict], use_kwargs: bool = False): + actual_args_list = [call.kwargs if use_kwargs else call.args for call in mock.call_args_list] + assert actual_args_list == expected_args_list