VitisAccelerator Host code refactor:

- Multiple devices support - Selection of device by BDF - OpenCL error checking - Automatic memory bank association - Inferences validation - Improved command line parameters - Improved debug output - Dummy buffer copy to avoid benchmarking buffer allocation time - Removal of mutexes preventing buffer copies overlap with kernel executions on the same CU with multiple workers - Documentation
fastmachinelearning · Jan 12, 2025 · 18f7fc7 · 18f7fc7
1 parent a8e8466
commit 18f7fc7
Show file tree

Hide file tree

Showing 8 changed files with 414 additions and 288 deletions.
diff --git a/docs/backend/accelerator.rst b/docs/backend/accelerator.rst
@@ -132,11 +132,34 @@ Once the project is generated, it possible to run manually the build steps by us
 
 It is also possible to run the full build process by calling ``make`` without any target. Modifications to the ``accelerator_card.cfg`` file can be done manually before running the build process (e.g., to change the clock period, or add addition ``.xo`` kernel to the build).
 
-The generated host code application and the xclbin file can be executed as such:
+Host code
+=========
+
+Once built, the host program can be run to load the board and perform inferences:
+
+.. code-block:: Bash
+
+    ./host
+
+By defaut, all Computing Unit (CU) on all compatible devices will be used, with 3 worker thread per CU.
+
+The generated host code application support the following options to tweak the execution:
+
+ * ``-d``: device BDF to use (can be specified multiple times)
+ * ``-x``: XCLBIN path
+ * ``-i``: input feature file
+ * ``-o``: output feature file
+ * ``-c``: maximum computing units count to use
+ * ``-n``: number of worker threads to use
+ * ``-r``: number of repeatition of the input feature file (For artificially increasing the data size for benchmarking purpose)
+ * ``-v``: enable verbose output
+ * ``-h``: print help
+
+The following example shows how to limit on only one device, one CU, and on worker thread:
 
 .. code-block:: Bash
 
-    ./host <build_directory>/<myproject>.xclbin
+    ./host -d 0000:c1:00.1 -c 1 -n 1
 
 Example
 =======

diff --git a/hls4ml/templates/vitis_accelerator/libs/DataBatcher.hpp b/hls4ml/templates/vitis_accelerator/libs/DataBatcher.hpp
@@ -5,8 +5,8 @@
 #include <fstream>
 #include <iostream>
 #include <list>
-#include <stdexcept>
 #include <sstream>
+#include <stdexcept>
 #include <string>
 #include <vector>
 
@@ -25,26 +25,25 @@ template <class T, class U> class DataBatcher {
      * \param profilingDataRepeat Only used if profiling is set to True. Additional number of
      * times the given data is iterated over.
      */
-    DataBatcher(int batchsize, int sampleInputSize, int sampleOutputSize, int numWorkers,
-                bool profiling, int profilingDataRepeat)
+    DataBatcher(int batchsize, int sampleInputSize, int sampleOutputSize, int numWorkers, int profilingDataRepeat)
         : _batchsize(batchsize), _sampleInputSize(sampleInputSize), _sampleOutputSize(sampleOutputSize),
-          _numWorkers(numWorkers), _profiling(profiling), _profilingDataRepeat(profilingDataRepeat) {}
+          _numWorkers(numWorkers), _profilingDataRepeat(profilingDataRepeat) {}
 
     /**
      * \brief Read in data to a buffer. Allocate space for results.
      * \param filename Filename.
      * \param s Type of input, currently supports text files used by VitisAccelerator backend, and
      * binary files produced by NumPy's toFile() function
      */
-    void read(const std::string& filename) {
-        std::cout << "\nReading data from text file " << filename << std::endl;
+    void read(const std::string &filename) {
 
-        // Read in text file
         std::ifstream fin(filename);
         if (!fin.is_open()) {
             throw std::runtime_error("Error opening file " + filename);
         }
 
+        std::cout << "Reading data from: " << filename << std::endl;
+
         std::string line;
         while (std::getline(fin, line)) {
             originalSampleCount++;
@@ -57,13 +56,70 @@ template <class T, class U> class DataBatcher {
                 throw std::runtime_error("Failed to parse value on line " + std::to_string(originalSampleCount));
             }
         }
-        std::cout << "Read in " << originalSampleCount << " lines" << std::endl;
+
+        std::cout << "Read in " << originalSampleCount << " samples (" << inputData.size() << " elements)" << std::endl;
         fin.close();
 
         // Zero-pad
         numBatches = std::ceil(static_cast<double>(originalSampleCount) / _batchsize);
-        if (numBatches * _batchsize > originalSampleCount) {
-            inputData.resize(numBatches * _batchsize * _sampleInputSize, (T)0);
+        size_t finalSampleCount = numBatches * _batchsize;
+        if (finalSampleCount > originalSampleCount) {
+            std::cout << "Padding with " << (finalSampleCount - originalSampleCount) << " empty samples for a total of "
+                      << numBatches << " batches of " << _batchsize << " samples" << std::endl;
+            inputData.resize(finalSampleCount * _sampleInputSize, (T)0);
+        }
+    }
+
+    bool readReference(const std::string &filename) {
+
+        std::ifstream fref(filename);
+        if (!fref.is_open()) {
+            return false;
+        }
+
+        std::cout << "Reading data from: " << filename << std::endl;
+        size_t refSampleCount = 0;
+        std::string line;
+        while (std::getline(fref, line)) {
+            refSampleCount++;
+            std::istringstream parser(line);
+            T val;
+            while (parser >> val) {
+                refData.push_back(val);
+            }
+            if (!parser.eof()) {
+                throw std::runtime_error("Failed to parse value on line " + std::to_string(refSampleCount));
+            }
+        }
+
+        std::cout << "Read in " << refSampleCount << " reference samples (" << refData.size() << " elements)" << std::endl;
+        fref.close();
+        return true;
+    }
+
+    void checkResults() {
+        if (storedEvalResults.size() == 0 || refData.size() == 0) {
+            throw std::runtime_error("No data to check");
+        }
+
+        if (storedEvalResults.size() != refData.size()) {
+            throw std::runtime_error("Stored results and reference data are not the same size");
+        }
+        size_t error_count = 0;
+        for (uint64_t i = 0; i < storedEvalResults.size(); i++) {
+            if (storedEvalResults[i] != refData[i]) {
+                error_count++;
+                std::cout << "Mismatch at index " + std::to_string(i) + ": " + std::to_string((float)storedEvalResults[i]) +
+                                 " != " + std::to_string((float)refData[i])
+                          << ", error = " << ((float)storedEvalResults[i] - (float)refData[i]) << std::endl;
+            }
+        }
+
+        if (error_count > 0) {
+            std::cout << "Mismatch count: " << error_count << std::endl;
+            throw std::runtime_error("Results do not match reference data");
+        } else {
+            std::cout << "Results match reference data" << std::endl;
         }
     }
 
@@ -74,7 +130,7 @@ template <class T, class U> class DataBatcher {
         storedEvalResults.resize(numBatches * _batchsize * _sampleOutputSize, (U)0);
 
         // Allocate space to dump the extra arbitrary data used during profiling
-        if (_profiling) {
+        if (isProfilingMode()) {
             profilingResultsDump.resize(_numWorkers * _batchsize * _sampleOutputSize, (U)0);
         }
     }
@@ -84,43 +140,47 @@ template <class T, class U> class DataBatcher {
      * \param batchedData A vector of containers for each Worker's batches/workload.
      * Size must be equal to _numWorkers.
      */
-    void batch(std::vector<std::list<Batch<T, U>>>& batchedData) {
+    void batch(std::vector<std::list<Batch<T, U>>> &batchedData) {
         if (inputData.size() == 0 || originalSampleCount == 0) {
             throw std::runtime_error("No data to batch");
         }
+        std::cout << "Original sample count: " << originalSampleCount << std::endl;
+        std::cout << "Input sample element count: " << _sampleInputSize << std::endl;
+        std::cout << "Output sample element count: " << _sampleOutputSize << std::endl;
         if (storedEvalResults.size() == 0) {
             throw std::runtime_error("Create result buffers first");
         }
 
-        batchedData.reserve(_numWorkers);
-        for (int i = 0; i < _numWorkers; i++) {
-            batchedData.emplace_back();
-        }
+        batchedData.resize(_numWorkers);
 
         uint64_t batchIndex = 0;
         while (batchIndex < numBatches) {
             int worker = batchIndex % _numWorkers;
             uint64_t inputLocation = batchIndex * _batchsize * _sampleInputSize;
             uint64_t outputLocation = batchIndex * _batchsize * _sampleOutputSize;
 
-            const T* in = &inputData[inputLocation];
-            U* out = &storedEvalResults[outputLocation];
+            const T *in = &inputData[inputLocation];
+            U *out = &storedEvalResults[outputLocation];
             Batch<T, U> newBatch = {in, out};
 
             batchedData[worker].push_back(newBatch);
             batchIndex++;
         }
 
-        if (_profiling) {
+        if (isProfilingMode()) {
             std::cout << "Creating profiling batches" << std::endl;
             profilingBatchCount = numBatches * (_profilingDataRepeat + 1);
+            std::cout << "Batches: " << numBatches << std::endl;
+            std::cout << "Profiling batch count: " << profilingBatchCount << std::endl;
+            std::cout << "Profiling data repeat: " << _profilingDataRepeat << std::endl;
+            std::cout << "Profiling total data count: " << profilingBatchCount * _batchsize << std::endl;
             while (batchIndex < profilingBatchCount) {
                 int worker = batchIndex % _numWorkers;
                 uint64_t inputLocation = (batchIndex % numBatches) * _batchsize * _sampleInputSize;
                 uint64_t outputLocation = worker * _batchsize * _sampleOutputSize;
 
-                const T* in = &inputData[inputLocation];
-                U* out = &profilingResultsDump[outputLocation];
+                const T *in = &inputData[inputLocation];
+                U *out = &profilingResultsDump[outputLocation];
                 Batch<T, U> newBatch = {in, out};
 
                 batchedData[worker].push_back(newBatch);
@@ -141,8 +201,8 @@ template <class T, class U> class DataBatcher {
         profilingBatchCount = 0;
     }
 
-    void write(const std::string& filename) {
-        std::cout << "\nWriting HW results to file " << filename << std::endl;
+    void write(const std::string &filename) {
+        std::cout << "Writing HW results to: " << filename << std::endl;
         std::ofstream fout;
         fout.open(filename, std::ios::trunc);
 
@@ -163,28 +223,19 @@ template <class T, class U> class DataBatcher {
         profilingResultsDump.clear();
     }
 
-    uint64_t getSampleCount() {
-        return originalSampleCount;
-    }
+    uint64_t getSampleCount() { return originalSampleCount; }
 
-    uint64_t getPaddedSampleCount() {
-        return numBatches * _batchsize;
-    }
+    uint64_t getPaddedSampleCount() { return numBatches * _batchsize; }
 
-    uint64_t getProfilingSampleCount() {
-        return profilingBatchCount * _batchsize;
-    }
+    uint64_t getProfilingSampleCount() { return profilingBatchCount * _batchsize; }
 
-    bool isProfilingMode() {
-        return _profiling;
-    }
+    bool isProfilingMode() { return _profilingDataRepeat > 0; }
 
   private:
     int _batchsize;
     int _sampleInputSize;
     int _sampleOutputSize;
     int _numWorkers;
-    bool _profiling;
     int _profilingDataRepeat;
 
     /// @brief Number of floats read in. (Not including padding).
@@ -195,6 +246,8 @@ template <class T, class U> class DataBatcher {
     uint64_t profilingBatchCount = 0;
     /// @brief Vector with values.
     std::vector<T> inputData;
+    /// @brief Vector with reference values.
+    std::vector<T> refData;
     /// @brief Vector to store evaluation results.
     std::vector<U> storedEvalResults;
     /// @brief Vector for dumping results from extra arbitrary data used during profiling.