Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move frequency retrieval to the beginning and prompt for manual input on error #1483

Open
wants to merge 1 commit into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
78 changes: 46 additions & 32 deletions tensilelite/Tensile/LibraryLogic.py
Original file line number Diff line number Diff line change
Expand Up @@ -451,6 +451,7 @@ def addFromCSV(self, dataFileName, numSolutions, solutionMap):

# iterate over rows
rowIdx = 0
deviceMaxFreq = None
for row in csvFile:
rowIdx+=1
if rowIdx == 1:
Expand All @@ -472,13 +473,6 @@ def addFromCSV(self, dataFileName, numSolutions, solutionMap):
except ValueError as e:
csvHasWinnerColumn = False
print1(f"Error: Could not find WinnerGFlops or WinnerIdx column in CSV file: {e}")

# get the column index of Frequency(MHz)
try:
columnOfFreqIdx = row.index(" DeviceMaxFreq")
except ValueError as e:
columnOfFreqIdx = None
print1(f"Error: Could not find DeviceMaxFreq column in the CSV file: {e}")

# get the length of each row, and derive the first column of the solution instead of using wrong "solutionStartIdx = totalSizeIdx + 1"
rowLength = len(row)
Expand Down Expand Up @@ -516,20 +510,24 @@ def addFromCSV(self, dataFileName, numSolutions, solutionMap):
solutionIdx += 1

if globalParameters["UseEffLike"]:
if not deviceMaxFreq:
deviceMaxFreq = read_max_freq()
if not deviceMaxFreq or deviceMaxFreq <= 0 or math.isnan(deviceMaxFreq):
deviceMaxFreq = handle_frequency_issue("Warning: Error when reading frequency.")

# calculate effLike
# effLike = winnerGFlops / Frequency(MHz)
try:
frequency = float(row[columnOfFreqIdx])
if frequency != 0 and not math.isnan(frequency):
# calculate effLike
# effLike = winnerGFlops / Frequency(MHz)
performance_metric = round(float(winnerGFlops) / frequency, 2)
else:
handle_frequency_issue("Warning: Frequency is NaN or 0.")
performance_metric = float(winnerGFlops)
except(ValueError, TypeError):
handle_frequency_issue("Warning: Error when reading frequency.")
performance_metric = float(winnerGFlops)
performance_metric = round(float(winnerGFlops) / deviceMaxFreq, 2)
except:
print1("Error: Could not convert winnerGFlops to float.")
performance_metric = float('nan')
else:
performance_metric = float(winnerGFlops)
try:
performance_metric = float(winnerGFlops)
except:
print1("Error: Could not convert winnerGFlops to float.")
performance_metric = float('nan')

if winnerIdx != -1:
if problemSize in self.exactWinners:
Expand Down Expand Up @@ -1528,21 +1526,37 @@ def generateLogic(config, benchmarkDataPath, libraryLogicPath, cxxCompiler: str)
##############################################################################
def handle_frequency_issue(message):
print1(message)
print1(" - Type 'yes(y)' to abort the operation.")
print1(" - Type 'no(n)' to continue and use GFlops as the efficiency metric.")
print1("Input the frequency manually to proceed.")

while True:
user_choice = input("Do you want to abort (yes(y)/no(n))? ").strip().lower()
if user_choice in ['yes', 'no', 'y', 'n']:
break
else:
print1("Invalid input. Please type 'yes(y)' or 'no(n)'.")
if user_choice == "yes" or user_choice == 'y':
print1("Operation aborted by the user.")
raise Exception("User chose to abort due to frequency issue.")
else:
globalParameters["UseEffLike"] = False
print1("Proceeding with GFlops as the efficiency metric.")
frequency_input = input("Frequency: ").strip()
if frequency_input == "":
print1("Frequency cannot be empty")
continue
try:
frequency = float(frequency_input)
if frequency > 0:
return frequency
else:
print1("Frequency cannot be negative or zero.")
except ValueError:
print1("Invalid frequency.Please input a valid frequency.")

def read_max_freq():
    """Read the maximum device frequency from ``tmp_max_frequency.txt``.

    The file is opened relative to the current working directory and must
    contain a single number (surrounding whitespace is ignored). After a
    successful parse the file is deleted so that it is consumed only once.

    NOTE(review): the value is presumably in MHz, since the caller divides
    GFlops by it to obtain the "effLike" metric -- confirm against whatever
    writes this file.

    Returns:
        float | None: the parsed frequency, or ``None`` when the file is
        missing or its content cannot be read or converted to ``float``.
        No range validation is done here; that is left to the caller.
    """
    freq_file = "tmp_max_frequency.txt"

    try:
        with open(freq_file, "r") as f:
            max_freq = float(f.read().strip())
    except FileNotFoundError:
        print("Frequency file not found")
        return None
    except Exception as e:
        print(f"Error reading from file: {e}")
        return None

    # Cleanup is deliberately kept out of the try-block above: failing to
    # delete the temporary file must not throw away a frequency that was
    # read successfully (previously this path returned None).
    try:
        os.remove(freq_file)
    except OSError as e:
        print(f"Warning: could not remove {freq_file}: {e}")

    return max_freq

################################################################################
################################################################################
###
Expand Down
112 changes: 72 additions & 40 deletions tensilelite/Tensile/Source/client/source/HardwareMonitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,19 +37,20 @@

#include "ResultReporter.hpp"

#define RSMI_CHECK_EXC(expr) \
do \
{ \
rsmi_status_t e = (expr); \
if(e) \
{ \
const char* errName = nullptr; \
rsmi_status_string(e, &errName); \
std::ostringstream msg; \
msg << "Error " << e << "(" << errName << ") " << __FILE__ << ":" << __LINE__ << ": " << std::endl \
<< #expr << std::endl; \
throw std::runtime_error(msg.str()); \
} \
#define RSMI_CHECK_EXC(expr) \
do \
{ \
rsmi_status_t e = (expr); \
if(e) \
{ \
const char* errName = nullptr; \
rsmi_status_string(e, &errName); \
std::ostringstream msg; \
msg << "Error " << e << "(" << errName << ") " << __FILE__ << ":" << __LINE__ << ": " \
<< std::endl \
<< #expr << std::endl; \
throw std::runtime_error(msg.str()); \
} \
} while(0)

namespace TensileLite
Expand All @@ -68,8 +69,9 @@ namespace TensileLite
HIP_CHECK_EXC(hipRuntimeGetVersion(&hip_version));
if(hip_version >= 50220730)
{
HIP_CHECK_EXC(hipDeviceGetAttribute(
&props.multiProcessorCount, hipDeviceAttributePhysicalMultiProcessorCount, hipDeviceIndex));
HIP_CHECK_EXC(hipDeviceGetAttribute(&props.multiProcessorCount,
hipDeviceAttributePhysicalMultiProcessorCount,
hipDeviceIndex));
}
#endif

Expand Down Expand Up @@ -102,7 +104,8 @@ namespace TensileLite
}

msg << "]" << std::endl;
std::time_t now = std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
std::time_t now
= std::chrono::system_clock::to_time_t(std::chrono::system_clock::now());
msg << std::put_time(gmtime(&now), "%F %T %z");

throw std::runtime_error(concatenate("RSMI Can't find a device with PCI ID ",
Expand Down Expand Up @@ -179,7 +182,8 @@ namespace TensileLite
m_thread = std::thread([this]() { this->runLoop(); });
}

void HardwareMonitor::addTempMonitor(rsmi_temperature_type_t sensorType, rsmi_temperature_metric_t metric)
void HardwareMonitor::addTempMonitor(rsmi_temperature_type_t sensorType,
rsmi_temperature_metric_t metric)
{
assertNotActive();

Expand All @@ -203,7 +207,8 @@ namespace TensileLite
m_fanValues.resize(m_fanMetrics.size());
}

double HardwareMonitor::getAverageTemp(rsmi_temperature_type_t sensorType, rsmi_temperature_metric_t metric)
double HardwareMonitor::getAverageTemp(rsmi_temperature_type_t sensorType,
rsmi_temperature_metric_t metric)
{
assertNotActive();

Expand All @@ -222,8 +227,8 @@ namespace TensileLite
}
}

throw std::runtime_error(
concatenate("Can't read temp value that wasn't requested: ", sensorType, " - ", metric));
throw std::runtime_error(concatenate(
"Can't read temp value that wasn't requested: ", sensorType, " - ", metric));
}

double HardwareMonitor::getAverageClock(rsmi_clk_type_t clockType)
Expand Down Expand Up @@ -252,7 +257,8 @@ namespace TensileLite
}
}

throw std::runtime_error(concatenate("Can't read clock value that wasn't requested: ", clockType));
throw std::runtime_error(
concatenate("Can't read clock value that wasn't requested: ", clockType));
}

double HardwareMonitor::getAverageFanSpeed(uint32_t sensorIndex)
Expand All @@ -274,7 +280,8 @@ namespace TensileLite
}
}

throw std::runtime_error(concatenate("Can't read fan value that wasn't requested: ", sensorIndex));
throw std::runtime_error(
concatenate("Can't read fan value that wasn't requested: ", sensorIndex));
}

void HardwareMonitor::start()
Expand Down Expand Up @@ -303,7 +310,8 @@ namespace TensileLite

m_hasStopEvent = stopEvent != nullptr;

m_task = std::move(Task([this, startEvent, stopEvent]() { this->collect(startEvent, stopEvent); }));
m_task = std::move(Task(
[this, startEvent, stopEvent]() { this->collect(startEvent, stopEvent); }));
m_future = m_task.get_future();

m_stop = false;
Expand All @@ -326,8 +334,9 @@ namespace TensileLite
m_lastCollection = clock::time_point();
m_nextCollection = clock::time_point();

m_SYSCLK_sum = std::vector<uint64_t>(m_XCDCount, 0);
m_SYSCLK_array = std::vector<std::vector<uint64_t>>(m_XCDCount, std::vector<uint64_t>{});
m_SYSCLK_sum = std::vector<uint64_t>(m_XCDCount, 0);
m_SYSCLK_array
= std::vector<std::vector<uint64_t>>(m_XCDCount, std::vector<uint64_t>{});
}

void HardwareMonitor::collectOnce()
Expand All @@ -345,7 +354,8 @@ namespace TensileLite
std::tie(sensorType, metric) = m_tempMetrics[i];

int64_t newValue = 0;
auto status = rsmi_dev_temp_metric_get(m_smiDeviceIndex, sensorType, metric, &newValue);
auto status
= rsmi_dev_temp_metric_get(m_smiDeviceIndex, sensorType, metric, &newValue);
if(status != RSMI_STATUS_SUCCESS)
m_tempValues[i] = std::numeric_limits<int64_t>::max();
else
Expand All @@ -372,14 +382,16 @@ namespace TensileLite
for(uint32_t xcd = 0; xcd < m_XCDCount; xcd++)
{
m_SYSCLK_sum[xcd] += gpuMetrics.current_gfxclks[xcd] * cMhzToHz;
m_SYSCLK_array[xcd].push_back(gpuMetrics.current_gfxclks[xcd] * cMhzToHz);
m_SYSCLK_array[xcd].push_back(gpuMetrics.current_gfxclks[xcd]
* cMhzToHz);
sysclkSum += gpuMetrics.current_gfxclks[xcd] * cMhzToHz;
}
m_clockValues[i] += sysclkSum;
}
#else
// XCD0
auto status = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq);
auto status
= rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq);
if(status != RSMI_STATUS_SUCCESS)
{
m_clockValues[i] = std::numeric_limits<uint64_t>::max();
Expand All @@ -392,7 +404,8 @@ namespace TensileLite
}
else
{
auto status = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq);
auto status
= rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, m_clockMetrics[i], &freq);
if(status != RSMI_STATUS_SUCCESS)
{
m_clockValues[i] = std::numeric_limits<uint64_t>::max();
Expand All @@ -413,7 +426,7 @@ namespace TensileLite
rsmi_frequencies_t freq;

int64_t newValue = 0;
auto status = rsmi_dev_fan_rpms_get(m_smiDeviceIndex, m_fanMetrics[i], &newValue);
auto status = rsmi_dev_fan_rpms_get(m_smiDeviceIndex, m_fanMetrics[i], &newValue);
if(status != RSMI_STATUS_SUCCESS)
m_fanValues[i] = std::numeric_limits<int64_t>::max();
else
Expand All @@ -422,23 +435,42 @@ namespace TensileLite

// Retrieves the maximum hardware supported frequency.
rsmi_frequencies_t freqs;
auto status = rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, RSMI_CLK_TYPE_SYS, &freqs);
if(status != RSMI_STATUS_SUCCESS)
{
m_hasInvalidGpuFreqStatus = true;
}
else
const int MAX_RETRY = 10;
const int SLEEP_TIME = 100; // sleep time in milliseconds
bool success = false;

if(!has_maxFreqValues && !m_hasInvalidGpuFreqStatus)
{
if(!m_hasInvalidGpuFreqStatus && !has_maxFreqValues)
for(int retry = 0; retry < MAX_RETRY; ++retry)
{
m_maxFreqValues = 0;
for(auto freq : freqs.frequency)
auto status
= rsmi_dev_gpu_clk_freq_get(m_smiDeviceIndex, RSMI_CLK_TYPE_SYS, &freqs);

if(status == RSMI_STATUS_SUCCESS)
{
m_maxFreqValues = std::max(m_maxFreqValues, freq);
success = true;
break;
}
// Sleep before next retry
std::this_thread::sleep_for(std::chrono::milliseconds(SLEEP_TIME));
}

if(!success)
{
m_hasInvalidGpuFreqStatus = true;
}
else if(freqs.num_supported > 0)
{
m_maxFreqValues
= *std::max_element(freqs.frequency, freqs.frequency + freqs.num_supported);

has_maxFreqValues = true;
m_maxFreqValues /= cMhzToHz; // Convert to MHz
}
else
{
m_hasInvalidGpuFreqStatus = true;
}
}
m_dataPoints++;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -159,7 +159,6 @@ namespace TensileLite
m_output.setHeaderForKey(ResultKey::LDA, "LDA");
m_output.setHeaderForKey(ResultKey::LDB, "LDB");
m_output.setHeaderForKey(ResultKey::TotalFlops, "TotalFlops");
m_output.setHeaderForKey(ResultKey::GfxFrequency, "DeviceMaxFreq");
if(m_extraCol)
{
m_output.setHeaderForKey(ResultKey::TilesPerCu, "TilesPerCu");
Expand Down
Loading
Loading