diff --git a/src/runtime_src/core/pcie/tools/cloud-daemon/azure/azure.cpp b/src/runtime_src/core/pcie/tools/cloud-daemon/azure/azure.cpp index 52c8a68f94a..45230584912 100644 --- a/src/runtime_src/core/pcie/tools/cloud-daemon/azure/azure.cpp +++ b/src/runtime_src/core/pcie/tools/cloud-daemon/azure/azure.cpp @@ -178,10 +178,23 @@ static void azureHotResetAsync(size_t index) */ int azureHotReset(size_t index, int *resp) { - //tell xocl don't try to restore anything since we are going - //to do hotplug in wireserver - *resp = -ESHUTDOWN; - nouse = std::async(std::launch::async, &azureHotResetAsync, index); + /* + * Tell xocl not to try to restore anything since we are going + * to do hotplug in wireserver + * If we can't get S/N of the card, we are not even going to issue the reset + * to wireserver since this makes no sense and even hangs the instance. + * Empty S/N may happen in this scenario: + * 1. vm boots and is ready before the mgmt side is ready + * 2. 'xbutil reset' tries to reset the card immediately after mgmt is ready + * In this case, there is no chance for mpd to get S/N info. 
so we just fail + * the reset + */ + if (fpga_serial_number.at(index).empty()) { + *resp = -E_EMPTY_SN; + } else { + *resp = -ESHUTDOWN; + nouse = std::async(std::launch::async, &azureHotResetAsync, index); + } return 0; } @@ -213,9 +226,12 @@ int AzureDev::azureLoadXclBin(const xclBin *buffer) { char *xclbininmemory = reinterpret_cast (const_cast (buffer)); if (memcmp(xclbininmemory, "xclbin2", 8) != 0) - return -1; + return -1; std::string fpgaSerialNumber; get_fpga_serialNo(fpgaSerialNumber); + + if (fpgaSerialNumber.empty()) + return -E_EMPTY_SN; std::cout << "LoadXclBin FPGA serial No: " << fpgaSerialNumber << std::endl; int index = 0; std::string imageSHA; @@ -236,6 +252,8 @@ int AzureDev::azureLoadXclBin(const xclBin *buffer) std::cout << "Start upload segment (" << fpgaSerialNumber << ")" << std::endl; gettimeofday(&tvStartUpload, NULL); for (auto &chunk: chunks) { + if (goingTimeout()) + return -E_REST_TIMEOUT; //upload each segment individually std::cout << "upload segment (" << fpgaSerialNumber << "): " << index << " size: " << chunk.size() << std::endl; if (UploadToWireServer( @@ -265,6 +283,8 @@ int AzureDev::azureLoadXclBin(const xclBin *buffer) std::cout << "Start reimage process (" << fpgaSerialNumber << ")" << std::endl; gettimeofday(&tvStartReimage, NULL); do { + if (goingTimeout()) + return -E_REST_TIMEOUT; ret = REST_Get( restip_endpoint, "machine/plugins/?comp=FpgaController&type=StartReimaging", @@ -299,6 +319,8 @@ int AzureDev::azureLoadXclBin(const xclBin *buffer) gettimeofday(&tvStartStatus, NULL); int wait = 0; do { + if (goingTimeout()) + return -E_REST_TIMEOUT; ret = REST_Get( restip_endpoint, "machine/plugins/?comp=FpgaController&type=GetReimagingStatus", @@ -391,6 +413,7 @@ AzureDev::~AzureDev() AzureDev::AzureDev(size_t index) : index(index) { dev = pcidev::get_dev(index, true); + gettimeofday(&start, NULL); } //private methods @@ -583,8 +606,11 @@ void AzureDev::get_fpga_serialNo(std::string &fpgaSerialNo) //fpgaSerialNo = 
"1281002AT024"; if (fpgaSerialNo.empty()) fpgaSerialNo = fpga_serial_number.at(index); + else if (fpga_serial_number.at(index).empty()) + //save the serial in case the already saved is empty + fpga_serial_number.at(index) = fpgaSerialNo; if (!errmsg.empty() || fpgaSerialNo.empty()) { - std::cerr << "azure warning(" << dev->sysfs_name << ")"; + std::cerr << "get_fpga_serialNo warning(" << dev->sysfs_name << ")"; std::cerr << " sysfs errmsg: " << errmsg; std::cerr << " serialNumber: " << fpga_serial_number.at(index); std::cerr << std::endl; @@ -600,3 +626,14 @@ void AzureDev::msleep(long msecs) nanosleep(&ts, NULL); } + +int AzureDev::goingTimeout() +{ + struct timeval now; + gettimeofday(&now, NULL); + if (now.tv_sec - start.tv_sec > timeout_threshold) + return 1; + else + return 0; +} + diff --git a/src/runtime_src/core/pcie/tools/cloud-daemon/azure/azure.h b/src/runtime_src/core/pcie/tools/cloud-daemon/azure/azure.h index 32f54b02d41..fd603e5fff4 100644 --- a/src/runtime_src/core/pcie/tools/cloud-daemon/azure/azure.h +++ b/src/runtime_src/core/pcie/tools/cloud-daemon/azure/azure.h @@ -35,6 +35,8 @@ enum azure_rest_err { E_GET_REIMAGE_STATUS = 2021, E_RESET = 2030, E_GET_RESET_STATUS = 2031, + E_EMPTY_SN = 2040, + E_REST_TIMEOUT = 2050, }; /* * This class is for azure xclbin download handling. @@ -109,8 +111,10 @@ class AzureDev static const int rest_timeout { 30 }; //in second static const int upload_retry { 15 }; static const int reset_retry { 3 }; + static const int timeout_threshold { 49 }; //mailbox timeout set as 50s std::shared_ptr dev; size_t index; + struct timeval start; int UploadToWireServer( const std::string &ip, const std::string &endpoint, @@ -129,6 +133,7 @@ class AzureDev std::string &sha); void get_fpga_serialNo(std::string &fpgaSerialNo); void msleep(long msecs); + int goingTimeout(); };