# Name of the one-shot hand-off file; Tensile.py's store_max_frequency() writes
# the same relative path before library logic generation runs.
_MAX_FREQ_FILE = "tmp_max_frequency.txt"

# Frequency parsed from the hand-off file. read_max_freq() deletes the file
# after reading it, so the value is remembered here for any later call.
_cached_max_freq = None


def handle_frequency_issue(message):
    """Report a frequency problem and prompt until a usable frequency is entered.

    Args:
        message: Text printed (via print1) before the prompt is shown.

    Returns:
        float: The first entered value that is a finite number greater than zero.

    Raises:
        EOFError: Propagated from input() when stdin is closed; the prompt
            cannot be answered in that case.
    """
    print1(message)
    print1("Input the frequency manually to proceed.")
    while True:
        frequency_input = input("Frequency: ").strip()
        if not frequency_input:
            print1("Frequency cannot be empty")
            continue
        try:
            frequency = float(frequency_input)
        except ValueError:
            print1("Invalid frequency. Please input a valid frequency.")
            continue
        # float() also parses "nan" and "inf"; neither is usable as a divisor
        # for the efficiency metric, so treat them as invalid input.
        if not math.isfinite(frequency):
            print1("Invalid frequency. Please input a valid frequency.")
        elif frequency > 0:
            return frequency
        else:
            print1("Frequency cannot be negative or zero.")


def read_max_freq():
    """Return the device max frequency handed over through the temporary file.

    The file is consumed on the first successful read: it is deleted as soon as
    its content has been read, whether or not that content is usable. A valid
    value is cached so that later calls in the same process (presumably one per
    benchmark CSV -- confirm against the callers) still get it instead of
    finding the file gone.

    Returns:
        float | None: A finite frequency greater than zero, or None when the
        file is missing, unreadable, or does not contain such a number.
    """
    global _cached_max_freq
    if _cached_max_freq is not None:
        return _cached_max_freq

    try:
        with open(_MAX_FREQ_FILE, "r") as f:
            raw_value = f.read().strip()
    except FileNotFoundError:
        print("Frequency file not found")
        return None
    except (OSError, UnicodeDecodeError) as e:
        print(f"Error reading from file: {e}")
        return None

    # Remove the hand-off file even if its content turns out to be unusable,
    # so a corrupt file is not left behind. A failed removal must not discard
    # a value that was read successfully.
    try:
        os.remove(_MAX_FREQ_FILE)
    except OSError as e:
        print(f"Error removing frequency file: {e}")

    try:
        max_freq = float(raw_value)
    except ValueError as e:
        print(f"Error reading from file: {e}")
        return None

    # "nan", "inf" and non-positive numbers parse fine but cannot be used as a
    # frequency; report them the same way as an unreadable file.
    if not math.isfinite(max_freq) or max_freq <= 0:
        print(f"Error reading from file: invalid frequency {raw_value!r}")
        return None

    _cached_max_freq = max_freq
    return max_freq
################################################################################ ################################################################################ ### diff --git a/tensilelite/Tensile/Source/client/source/HardwareMonitor.cpp b/tensilelite/Tensile/Source/client/source/HardwareMonitor.cpp index c334312c64..6ebbb749bd 100644 --- a/tensilelite/Tensile/Source/client/source/HardwareMonitor.cpp +++ b/tensilelite/Tensile/Source/client/source/HardwareMonitor.cpp @@ -37,19 +37,20 @@ #include "ResultReporter.hpp" -#define RSMI_CHECK_EXC(expr) \ - do \ - { \ - rsmi_status_t e = (expr); \ - if(e) \ - { \ - const char* errName = nullptr; \ - rsmi_status_string(e, &errName); \ - std::ostringstream msg; \ - msg << "Error " << e << "(" << errName << ") " << __FILE__ << ":" << __LINE__ << ": " << std::endl \ - << #expr << std::endl; \ - throw std::runtime_error(msg.str()); \ - } \ +#define RSMI_CHECK_EXC(expr) \ + do \ + { \ + rsmi_status_t e = (expr); \ + if(e) \ + { \ + const char* errName = nullptr; \ + rsmi_status_string(e, &errName); \ + std::ostringstream msg; \ + msg << "Error " << e << "(" << errName << ") " << __FILE__ << ":" << __LINE__ << ": " \ + << std::endl \ + << #expr << std::endl; \ + throw std::runtime_error(msg.str()); \ + } \ } while(0) namespace TensileLite @@ -68,8 +69,9 @@ namespace TensileLite HIP_CHECK_EXC(hipRuntimeGetVersion(&hip_version)); if(hip_version >= 50220730) { - HIP_CHECK_EXC(hipDeviceGetAttribute( - &props.multiProcessorCount, hipDeviceAttributePhysicalMultiProcessorCount, hipDeviceIndex)); + HIP_CHECK_EXC(hipDeviceGetAttribute(&props.multiProcessorCount, + hipDeviceAttributePhysicalMultiProcessorCount, + hipDeviceIndex)); } #endif @@ -102,7 +104,8 @@ namespace TensileLite } msg << "]" << std::endl; - std::time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); + std::time_t now + = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now()); msg << std::put_time(gmtime(&now), "%F %T %z"); 
throw std::runtime_error(concatenate("RSMI Can't find a device with PCI ID ", @@ -179,7 +182,8 @@ namespace TensileLite m_thread = std::thread([this]() { this->runLoop(); }); } - void HardwareMonitor::addTempMonitor(rsmi_temperature_type_t sensorType, rsmi_temperature_metric_t metric) + void HardwareMonitor::addTempMonitor(rsmi_temperature_type_t sensorType, + rsmi_temperature_metric_t metric) { assertNotActive(); @@ -203,7 +207,8 @@ namespace TensileLite m_fanValues.resize(m_fanMetrics.size()); } - double HardwareMonitor::getAverageTemp(rsmi_temperature_type_t sensorType, rsmi_temperature_metric_t metric) + double HardwareMonitor::getAverageTemp(rsmi_temperature_type_t sensorType, + rsmi_temperature_metric_t metric) { assertNotActive(); @@ -222,8 +227,8 @@ namespace TensileLite } } - throw std::runtime_error( - concatenate("Can't read temp value that wasn't requested: ", sensorType, " - ", metric)); + throw std::runtime_error(concatenate( + "Can't read temp value that wasn't requested: ", sensorType, " - ", metric)); } double HardwareMonitor::getAverageClock(rsmi_clk_type_t clockType) @@ -252,7 +257,8 @@ namespace TensileLite } } - throw std::runtime_error(concatenate("Can't read clock value that wasn't requested: ", clockType)); + throw std::runtime_error( + concatenate("Can't read clock value that wasn't requested: ", clockType)); } double HardwareMonitor::getAverageFanSpeed(uint32_t sensorIndex) @@ -274,7 +280,8 @@ namespace TensileLite } } - throw std::runtime_error(concatenate("Can't read fan value that wasn't requested: ", sensorIndex)); + throw std::runtime_error( + concatenate("Can't read fan value that wasn't requested: ", sensorIndex)); } void HardwareMonitor::start() @@ -303,7 +310,8 @@ namespace TensileLite m_hasStopEvent = stopEvent != nullptr; - m_task = std::move(Task([this, startEvent, stopEvent]() { this->collect(startEvent, stopEvent); })); + m_task = std::move(Task( + [this, startEvent, stopEvent]() { this->collect(startEvent, stopEvent); })); 
m_future = m_task.get_future(); m_stop = false; @@ -326,8 +334,9 @@ namespace TensileLite m_lastCollection = clock::time_point(); m_nextCollection = clock::time_point(); - m_SYSCLK_sum = std::vector(m_XCDCount, 0); - m_SYSCLK_array = std::vector>(m_XCDCount, std::vector{}); + m_SYSCLK_sum = std::vector(m_XCDCount, 0); + m_SYSCLK_array + = std::vector>(m_XCDCount, std::vector{}); } void HardwareMonitor::collectOnce() @@ -345,7 +354,8 @@ namespace TensileLite std::tie(sensorType, metric) = m_tempMetrics[i]; int64_t newValue = 0; - auto status = rsmi_dev_temp_metric_get(m_smiDeviceIndex, sensorType, metric, &newValue); + auto status + = rsmi_dev_temp_metric_get(m_smiDeviceIndex, sensorType, metric, &newValue); if(status != RSMI_STATUS_SUCCESS) m_tempValues[i] = std::numeric_limits::max(); else @@ -372,14 +382,16 @@ namespace TensileLite for(uint32_t xcd = 0; xcd < m_XCDCount; xcd++) { m_SYSCLK_sum[xcd] += gpuMetrics.current_gfxclks[xcd] * cMhzToHz; - m_SYSCLK_array[xcd].push_back(gpuMetrics.current_gfxclks[xcd] * cMhzToHz); + m_SYSCLK_array[xcd].push_back(gpuMetrics.current_gfxclks[xcd] + * cMhzToHz); sysclkSum += gpuMetrics.current_gfxclks[xcd] * cMhzToHz; } m_clockValues[i] += sysclkSum; } #else // XCD0 - auto status = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq); + auto status + = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq); if(status != RSMI_STATUS_SUCCESS) { m_clockValues[i] = std::numeric_limits::max(); @@ -392,7 +404,8 @@ namespace TensileLite } else { - auto status = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq); + auto status + = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq); if(status != RSMI_STATUS_SUCCESS) { m_clockValues[i] = std::numeric_limits::max(); @@ -413,7 +426,7 @@ namespace TensileLite rsmi_frequencies_t freq; int64_t newValue = 0; - auto status = rsmi_dev_fan_rpms_get(m_smiDeviceIndex, m_fanMetrics[i], &newValue); + auto status = 
rsmi_dev_fan_rpms_get(m_smiDeviceIndex, m_fanMetrics[i], &newValue); if(status != RSMI_STATUS_SUCCESS) m_fanValues[i] = std::numeric_limits::max(); else @@ -422,23 +435,42 @@ namespace TensileLite // Retrieves the maximum hardware supported frequency. rsmi_frequencies_t freqs; - auto status = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, RSMI_CLK_TYPE_SYS, &freqs); - if(status != RSMI_STATUS_SUCCESS) - { - m_hasInvalidGpuFreqStatus = true; - } - else + const int MAX_RETRY = 10; + const int SLEEP_TIME = 100; // sleep time in milliseconds + bool success = false; + + if(!has_maxFreqValues && !m_hasInvalidGpuFreqStatus) { - if(!m_hasInvalidGpuFreqStatus && !has_maxFreqValues) + for(int retry = 0; retry < MAX_RETRY; ++retry) { - m_maxFreqValues = 0; - for(auto freq : freqs.frequency) + auto status + = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, RSMI_CLK_TYPE_SYS, &freqs); + + if(status == RSMI_STATUS_SUCCESS) { - m_maxFreqValues = std::max(m_maxFreqValues, freq); + success = true; + break; } + // Sleep before next retry + std::this_thread::sleep_for(std::chrono::milliseconds(SLEEP_TIME)); + } + + if(!success) + { + m_hasInvalidGpuFreqStatus = true; + } + else if(freqs.num_supported > 0) + { + m_maxFreqValues + = *std::max_element(freqs.frequency, freqs.frequency + freqs.num_supported); + has_maxFreqValues = true; m_maxFreqValues /= cMhzToHz; // Convert to MHz } + else + { + m_hasInvalidGpuFreqStatus = true; + } } m_dataPoints++; } diff --git a/tensilelite/Tensile/Source/client/source/ResultFileReporter.cpp b/tensilelite/Tensile/Source/client/source/ResultFileReporter.cpp index e41063e317..ff31359170 100644 --- a/tensilelite/Tensile/Source/client/source/ResultFileReporter.cpp +++ b/tensilelite/Tensile/Source/client/source/ResultFileReporter.cpp @@ -159,7 +159,6 @@ namespace TensileLite m_output.setHeaderForKey(ResultKey::LDA, "LDA"); m_output.setHeaderForKey(ResultKey::LDB, "LDB"); m_output.setHeaderForKey(ResultKey::TotalFlops, "TotalFlops"); - 
def check_config_values(config):
    """Extract the target device index and whether library logic is requested.

    Args:
        config: Mapping loaded from the Tensile config file.

    Returns:
        tuple: (device_value, library_logic_exists). device_value is the
        'Device' entry of 'GlobalParameters', or 0 when the section, the key,
        or its value is missing. library_logic_exists is True when the config
        has a 'LibraryLogic' key.
    """
    global_params = config['GlobalParameters'] if 'GlobalParameters' in config else None
    # An empty "GlobalParameters:" section or a missing "Device" key must fall
    # back to the default device instead of yielding None.
    device_value = global_params.get('Device') if global_params else None
    if device_value is None:
        device_value = 0

    library_logic_exists = 'LibraryLogic' in config
    return device_value, library_logic_exists


def _parse_sclk_frequencies(smi_output, device_id):
    """Collect the frequencies listed in the sclk section of one GPU."""
    import re

    freq_pattern = re.compile(r"(\d+(?:\.\d+)?)\s*Mhz", re.IGNORECASE)
    device_tag = f"GPU{device_id}"
    frequencies = []
    in_section = False

    for line in smi_output.splitlines():
        tokens = line.split()
        if not in_section:
            # The section header names both the clock type and the device.
            in_section = 'sclk' in tokens and device_tag in tokens
            continue
        # Any following "... frequencies ..." header (or the socclk one the
        # previous implementation looked for) ends the sclk section, so values
        # of other clocks or other GPUs are never mixed in.
        if 'frequencies' in tokens or 'socclk' in tokens:
            break
        match = freq_pattern.search(line)
        if match:
            value = float(match.group(1))
            frequencies.append(int(value) if value.is_integer() else value)

    return frequencies


def get_gpu_max_frequency(device_id, smi_output=None):
    """Get the maximum sclk frequency of the specified GPU device.

    Args:
        device_id: Device index as it appears in the "GPU<id>" tag of the
            rocm-smi output.
        smi_output: Optional text of a `rocm-smi -s` run. When omitted, the
            command is executed to obtain it.

    Returns:
        The largest frequency found in the device's sclk section (int for
        integral values), or None when rocm-smi cannot be run, reports an
        error, times out, or lists no frequency for the device.
    """
    import subprocess

    if smi_output is None:
        try:
            # Bounded wait so a stuck rocm-smi cannot block the whole run.
            result = subprocess.run(['rocm-smi', '-s'], capture_output=True,
                                    text=True, timeout=60)
        except (OSError, subprocess.SubprocessError) as e:
            print(f"Error: {e}")
            return None

        if result.returncode != 0:
            print(f"Error running rocm-smi: {result.stderr}")
            return None
        smi_output = result.stdout

    frequencies = _parse_sclk_frequencies(smi_output, device_id)
    return max(frequencies) if frequencies else None


def get_user_max_frequency():
    """Prompt until the user enters a usable maximum frequency.

    Used when the GPU frequency cannot be determined automatically.

    Returns:
        float: A finite value greater than zero.

    Raises:
        EOFError: Propagated from input() when stdin is closed; retrying could
            never succeed in that case.
    """
    import math

    while True:
        user_input = input("Please enter the maximum frequency (MHz): ")
        try:
            frequency = float(user_input)
        except ValueError:
            print("Error: Please enter a valid number")
            continue

        # float() accepts "nan" and "inf"; neither is a usable frequency.
        if not math.isfinite(frequency):
            print("Error: Please enter a valid number")
            continue

        if frequency <= 0:
            print("Error: Frequency must be greater than 0 MHz")
            continue

        return frequency


def store_max_frequency(max_frequency):
    """Write the frequency to the temporary hand-off file.

    The file is created relative to the current working directory.

    Args:
        max_frequency: Value to store; written as str(max_frequency).

    Returns:
        bool: True when the file was written, False on an I/O error.
    """
    try:
        with open("tmp_max_frequency.txt", "w") as f:
            f.write(str(max_frequency))
    except OSError as e:
        print(f"Error writing to file: {e}")
        return False
    return True