From 18f7fc7f10e58bfab225dc96dab28756e4394937 Mon Sep 17 00:00:00 2001 From: Quentin Berthet Date: Sun, 12 Jan 2025 22:39:19 +0100 Subject: [PATCH] VitisAccelerator Host code refactor: - Multiple devices support - Selection of device by BDF - OpenCL error checking - Automatic memory bank association - Inferences validation - Improved command line parameters - Improved debug output - Dummy buffer copy to avoid benchmarking buffer allocation time - Removal of mutexes preventing buffer copies overlap with kernel executions on the same CU with multiple workers - Documentation --- docs/backend/accelerator.rst | 27 ++- .../vitis_accelerator/libs/DataBatcher.hpp | 125 +++++++---- .../vitis_accelerator/libs/FpgaObj.hpp | 191 +++++++++-------- .../vitis_accelerator/libs/Params.hpp | 107 ++++++++++ .../vitis_accelerator/libs/Types.hpp | 12 +- .../vitis_accelerator/libs/Worker.hpp | 194 ++++++------------ .../templates/vitis_accelerator/libs/xcl2.cpp | 6 +- .../vitis_accelerator/myproject_host_cl.cpp | 40 ++-- 8 files changed, 414 insertions(+), 288 deletions(-) create mode 100644 hls4ml/templates/vitis_accelerator/libs/Params.hpp diff --git a/docs/backend/accelerator.rst b/docs/backend/accelerator.rst index 1c1d06a7a6..d9cb4e31b1 100644 --- a/docs/backend/accelerator.rst +++ b/docs/backend/accelerator.rst @@ -132,11 +132,34 @@ Once the project is generated, it possible to run manually the build steps by us It is also possible to run the full build process by calling ``make`` without any target. Modifications to the ``accelerator_card.cfg`` file can be done manually before running the build process (e.g., to change the clock period, or add addition ``.xo`` kernel to the build). -The generated host code application and the xclbin file can be executed as such: +Host code +========= + +Once built, the host program can be run to load the board and perform inferences: + +.. 
code-block:: Bash + + ./host + +By default, all Computing Units (CUs) on all compatible devices will be used, with 3 worker threads per CU. + +The generated host code application supports the following options to tweak the execution: + + * ``-d``: device BDF to use (can be specified multiple times) + * ``-x``: XCLBIN path + * ``-i``: input feature file + * ``-o``: output feature file + * ``-c``: maximum computing units count to use + * ``-n``: number of worker threads to use + * ``-r``: number of repetitions of the input feature file (for artificially increasing the data size for benchmarking purposes) + * ``-v``: enable verbose output + * ``-h``: print help + +The following example shows how to limit execution to only one device, one CU, and one worker thread: .. code-block:: Bash - ./host /.xclbin + ./host -d 0000:c1:00.1 -c 1 -n 1 Example ======= diff --git a/hls4ml/templates/vitis_accelerator/libs/DataBatcher.hpp b/hls4ml/templates/vitis_accelerator/libs/DataBatcher.hpp index 6353d59c61..96f15d4a1a 100644 --- a/hls4ml/templates/vitis_accelerator/libs/DataBatcher.hpp +++ b/hls4ml/templates/vitis_accelerator/libs/DataBatcher.hpp @@ -5,8 +5,8 @@ #include #include #include -#include #include +#include #include #include @@ -25,10 +25,9 @@ template class DataBatcher { * \param profilingDataRepeat Only used if profiling is set to True. Additional number of * times the given data is iterated over. */ - DataBatcher(int batchsize, int sampleInputSize, int sampleOutputSize, int numWorkers, - bool profiling, int profilingDataRepeat) + DataBatcher(int batchsize, int sampleInputSize, int sampleOutputSize, int numWorkers, int profilingDataRepeat) : _batchsize(batchsize), _sampleInputSize(sampleInputSize), _sampleOutputSize(sampleOutputSize), - _numWorkers(numWorkers), _profiling(profiling), _profilingDataRepeat(profilingDataRepeat) {} + _numWorkers(numWorkers), _profilingDataRepeat(profilingDataRepeat) {} /** * \brief Read in data to a buffer. Allocate space for results. 
@@ -36,15 +35,15 @@ template class DataBatcher { * \param s Type of input, currently supports text files used by VitisAccelerator backend, and * binary files produced by NumPy's toFile() function */ - void read(const std::string& filename) { - std::cout << "\nReading data from text file " << filename << std::endl; + void read(const std::string &filename) { - // Read in text file std::ifstream fin(filename); if (!fin.is_open()) { throw std::runtime_error("Error opening file " + filename); } + std::cout << "Reading data from: " << filename << std::endl; + std::string line; while (std::getline(fin, line)) { originalSampleCount++; @@ -57,13 +56,70 @@ template class DataBatcher { throw std::runtime_error("Failed to parse value on line " + std::to_string(originalSampleCount)); } } - std::cout << "Read in " << originalSampleCount << " lines" << std::endl; + + std::cout << "Read in " << originalSampleCount << " samples (" << inputData.size() << " elements)" << std::endl; fin.close(); // Zero-pad numBatches = std::ceil(static_cast(originalSampleCount) / _batchsize); - if (numBatches * _batchsize > originalSampleCount) { - inputData.resize(numBatches * _batchsize * _sampleInputSize, (T)0); + size_t finalSampleCount = numBatches * _batchsize; + if (finalSampleCount > originalSampleCount) { + std::cout << "Padding with " << (finalSampleCount - originalSampleCount) << " empty samples for a total of " + << numBatches << " batches of " << _batchsize << " samples" << std::endl; + inputData.resize(finalSampleCount * _sampleInputSize, (T)0); + } + } + + bool readReference(const std::string &filename) { + + std::ifstream fref(filename); + if (!fref.is_open()) { + return false; + } + + std::cout << "Reading data from: " << filename << std::endl; + size_t refSampleCount = 0; + std::string line; + while (std::getline(fref, line)) { + refSampleCount++; + std::istringstream parser(line); + T val; + while (parser >> val) { + refData.push_back(val); + } + if (!parser.eof()) { + throw 
std::runtime_error("Failed to parse value on line " + std::to_string(refSampleCount)); + } + } + + std::cout << "Read in " << refSampleCount << " reference samples (" << refData.size() << " elements)" << std::endl; + fref.close(); + return true; + } + + void checkResults() { + if (storedEvalResults.size() == 0 || refData.size() == 0) { + throw std::runtime_error("No data to check"); + } + + if (storedEvalResults.size() != refData.size()) { + throw std::runtime_error("Stored results and reference data are not the same size"); + } + size_t error_count = 0; + for (uint64_t i = 0; i < storedEvalResults.size(); i++) { + if (storedEvalResults[i] != refData[i]) { + error_count++; + std::cout << "Mismatch at index " + std::to_string(i) + ": " + std::to_string((float)storedEvalResults[i]) + + " != " + std::to_string((float)refData[i]) + << ", error = " << ((float)storedEvalResults[i] - (float)refData[i]) << std::endl; + } + } + + if (error_count > 0) { + std::cout << "Mismatch count: " << error_count << std::endl; + throw std::runtime_error("Results do not match reference data"); + } else { + std::cout << "Results match reference data" << std::endl; } } @@ -74,7 +130,7 @@ template class DataBatcher { storedEvalResults.resize(numBatches * _batchsize * _sampleOutputSize, (U)0); // Allocate space to dump the extra arbitrary data used during profiling - if (_profiling) { + if (isProfilingMode()) { profilingResultsDump.resize(_numWorkers * _batchsize * _sampleOutputSize, (U)0); } } @@ -84,18 +140,18 @@ template class DataBatcher { * \param batchedData A vector of containers for each Worker's batches/workload. * Size must be equal to _numWorkers. 
*/ - void batch(std::vector>>& batchedData) { + void batch(std::vector>> &batchedData) { if (inputData.size() == 0 || originalSampleCount == 0) { throw std::runtime_error("No data to batch"); } + std::cout << "Original sample count: " << originalSampleCount << std::endl; + std::cout << "Input sample element count: " << _sampleInputSize << std::endl; + std::cout << "Output sample element count: " << _sampleOutputSize << std::endl; if (storedEvalResults.size() == 0) { throw std::runtime_error("Create result buffers first"); } - batchedData.reserve(_numWorkers); - for (int i = 0; i < _numWorkers; i++) { - batchedData.emplace_back(); - } + batchedData.resize(_numWorkers); uint64_t batchIndex = 0; while (batchIndex < numBatches) { @@ -103,24 +159,28 @@ template class DataBatcher { uint64_t inputLocation = batchIndex * _batchsize * _sampleInputSize; uint64_t outputLocation = batchIndex * _batchsize * _sampleOutputSize; - const T* in = &inputData[inputLocation]; - U* out = &storedEvalResults[outputLocation]; + const T *in = &inputData[inputLocation]; + U *out = &storedEvalResults[outputLocation]; Batch newBatch = {in, out}; batchedData[worker].push_back(newBatch); batchIndex++; } - if (_profiling) { + if (isProfilingMode()) { std::cout << "Creating profiling batches" << std::endl; profilingBatchCount = numBatches * (_profilingDataRepeat + 1); + std::cout << "Batches: " << numBatches << std::endl; + std::cout << "Profiling batch count: " << profilingBatchCount << std::endl; + std::cout << "Profiling data repeat: " << _profilingDataRepeat << std::endl; + std::cout << "Profiling total data count: " << profilingBatchCount * _batchsize << std::endl; while (batchIndex < profilingBatchCount) { int worker = batchIndex % _numWorkers; uint64_t inputLocation = (batchIndex % numBatches) * _batchsize * _sampleInputSize; uint64_t outputLocation = worker * _batchsize * _sampleOutputSize; - const T* in = &inputData[inputLocation]; - U* out = &profilingResultsDump[outputLocation]; + const 
T *in = &inputData[inputLocation]; + U *out = &profilingResultsDump[outputLocation]; Batch newBatch = {in, out}; batchedData[worker].push_back(newBatch); @@ -141,8 +201,8 @@ template class DataBatcher { profilingBatchCount = 0; } - void write(const std::string& filename) { - std::cout << "\nWriting HW results to file " << filename << std::endl; + void write(const std::string &filename) { + std::cout << "Writing HW results to: " << filename << std::endl; std::ofstream fout; fout.open(filename, std::ios::trunc); @@ -163,28 +223,19 @@ template class DataBatcher { profilingResultsDump.clear(); } - uint64_t getSampleCount() { - return originalSampleCount; - } + uint64_t getSampleCount() { return originalSampleCount; } - uint64_t getPaddedSampleCount() { - return numBatches * _batchsize; - } + uint64_t getPaddedSampleCount() { return numBatches * _batchsize; } - uint64_t getProfilingSampleCount() { - return profilingBatchCount * _batchsize; - } + uint64_t getProfilingSampleCount() { return profilingBatchCount * _batchsize; } - bool isProfilingMode() { - return _profiling; - } + bool isProfilingMode() { return _profilingDataRepeat > 0; } private: int _batchsize; int _sampleInputSize; int _sampleOutputSize; int _numWorkers; - bool _profiling; int _profilingDataRepeat; /// @brief Number of floats read in. (Not including padding). @@ -195,6 +246,8 @@ template class DataBatcher { uint64_t profilingBatchCount = 0; /// @brief Vector with values. std::vector inputData; + /// @brief Vector with reference values. + std::vector refData; /// @brief Vector to store evaluation results. std::vector storedEvalResults; /// @brief Vector for dumping results from extra arbitrary data used during profiling. 
diff --git a/hls4ml/templates/vitis_accelerator/libs/FpgaObj.hpp b/hls4ml/templates/vitis_accelerator/libs/FpgaObj.hpp index a046c28644..df3e1d9de9 100644 --- a/hls4ml/templates/vitis_accelerator/libs/FpgaObj.hpp +++ b/hls4ml/templates/vitis_accelerator/libs/FpgaObj.hpp @@ -10,6 +10,7 @@ #include #include "DataBatcher.hpp" +#include "Params.hpp" #include "Types.hpp" #include "Worker.hpp" #include "xcl2.hpp" @@ -24,75 +25,97 @@ template class FpgaObj { * \param numCU Number of compute units synthesized on the FPGA * \param xclbinFilename String containing path of synthesized xclbin */ - FpgaObj(int batchsize, int sampleInputSize, int sampleOutputSize, int numCU, - std::string xclbinFilename) - : _batchsize(batchsize), _sampleInputSize(sampleInputSize), _sampleOutputSize(sampleOutputSize), - _numCU(numCU), _xclbinFilename(xclbinFilename) { + FpgaObj(const Params ¶ms) + : _batchsize(params.batchSize), _sampleInputSize(params.sampleInputSize), _sampleOutputSize(params.sampleOutputSize), + _numCU(params.numCU), _xclbinFilename(params.xclbinFilename) { + + if (params.deviceBDFs.size() == 0) { + // Finds all AMD/Xilinx devices present in system + devices = xcl::get_xil_devices(); + if (devices.size() == 0) { + throw std::runtime_error("No AMD/Xilinx FPGA devices found"); + } + for (auto &device : devices) { + std::string device_bdf; + OCL_CHECK(err, err = device.getInfo(CL_DEVICE_PCIE_BDF, &device_bdf)); + std::cout << "Found device: " << device.getInfo() << " (" << device_bdf << ")" << std::endl; + } + + } else { + // Find devices by BDF + devices.reserve(params.deviceBDFs.size()); + for (auto &bdf : params.deviceBDFs) { + devices.push_back(xcl::find_device_bdf(xcl::get_xil_devices(), bdf)); + std::cout << "Found device: " << devices.back().getInfo() << " (" << bdf << ")" << std::endl; + } + } - // Finds Xilinx device - devices = xcl::get_xil_devices(); - device = devices[0]; - std::string deviceName = device.getInfo(); - std::cout << "Found Device: " << deviceName << 
std::endl; + // Ensure that all devices are of the same type + for (auto &device : devices) { + std::string device_name = device.getInfo(); + if (_deviceName.empty()) { + _deviceName = device_name; + } else if (_deviceName != device_name) { + throw std::runtime_error( + "All devices must be of the same type, use -d to specify the BDFs of the devices you want to use"); + } + } + + _numDevice = devices.size(); // Load xclbin - fileBuf = xcl::read_binary_file(_xclbinFilename); - bins = cl::Program::Binaries({{fileBuf.data(), fileBuf.size()}}); + std::cout << "Loading: " << _xclbinFilename << std::endl; + std::vector fileBuf = xcl::read_binary_file(_xclbinFilename); + cl::Program::Binaries bins; + for (int i = 0; i < _numDevice; i++) { + bins.push_back({fileBuf.data(), fileBuf.size()}); + } // Create OpenCL context - context = cl::Context(device); + OCL_CHECK(err, context = cl::Context(devices, nullptr, nullptr, nullptr, &err)); // Create OpenCL program from binary file - program = cl::Program(context, devices, bins); - - // Create OpenCL command queues - comQueues.reserve(_numCU); - for (int i = 0; i < _numCU; i++) { - comQueues.emplace_back(context, - device, - CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); + OCL_CHECK(err, program = cl::Program(context, devices, bins, nullptr, &err)); + + std::cout << "Device programmed successfully" << std::endl; + + // Create OpenCL program, and command queues for each device + comQueues.resize(_numDevice); + for (int i = 0; i < _numDevice; i++) { + comQueues[i].resize(_numCU); + // Create OpenCL out-of-order command queues (One per compute unit) + for (int j = 0; j < _numCU; j++) { + comQueues[i][j] = cl::CommandQueue(context, devices[i], + CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); + } } - - // Create mutexes for each command queue - std::vector temp(_numCU); - comQueueMtxi.swap(temp); } /** - * \brief Creates worker objects for each compute unit + * \brief Creates worker objects 
for each compute unit on each device * \param workersPerCU Number of worker objects that will drive each compute unit - * \param fpga Type of memory resource used by the FPGA. - * \param numHBMChannels Number of channels per port each Worker uses. Only for HBM. */ - void createWorkers(int workersPerCU, FPGAType fpga, int numHBMChannels = 0) { + void createWorkers(int workersPerCU) { _workersPerCU = workersPerCU; // Construct workers workers.reserve(_numCU * _workersPerCU); - for (int i = 0; i < _numCU; i++) { - for (int j = 0; j < workersPerCU; j++) { - workers.emplace_back(_batchsize, - _sampleInputSize, - _sampleOutputSize, - comQueues[i], - comQueueMtxi[i]); + for (int d = 0; d < _numDevice; d++) { + for (int cu = 0; cu < _numCU; cu++) { + for (int w = 0; w < _workersPerCU; w++) { + workers.emplace_back(d, d * (_numCU * _workersPerCU) + cu * _workersPerCU + w, _batchsize, + _sampleInputSize, _sampleOutputSize, comQueues[d][cu]); + } } } // Initialize workers - int currHBMChannel = 0; // Only used if FPGAType is HBM - for (int i = 0; i < _numCU; i++) { - for (int j = 0; j < _workersPerCU; j++) { - workers[i * _workersPerCU + j].initialize(context, - program, - i + 1, - i * _workersPerCU + j, - fpga, - currHBMChannel, - numHBMChannels); - + for (int d = 0; d < _numDevice; d++) { + for (int cu = 0; cu < _numCU; cu++) { + for (int w = 0; w < _workersPerCU; w++) { + workers[d * (_numCU * _workersPerCU) + cu * _workersPerCU + w].initialize(context, program, cu + 1); + } } - currHBMChannel += 2 * numHBMChannels; } } @@ -101,24 +124,18 @@ template class FpgaObj { * \param fin Filename * \param s Input type. VitisAccelerator Backend currently uses text input. However, * the code also supports binary input in the format produced by NumPy's toFile(). - * \param profiling If true, the given data will be iterated over multiple times, - * for more accurate throughput testing. - * \param profilingDataRepeat Only used if profiling is set to True. 
Additional number of - * times the given data is iterated over. + * \param profilingDataRepeat Additional number of times the given data is iterated + * over. Profiling is enabled if this is greater than 0. */ - void loadData(const std::string& fin, bool profiling = false, int profilingDataRepeat = 0) { + void loadData(const std::string &fin, int profilingDataRepeat = 0) { // Set-up containers for each Worker's batches/workload - batchedData.reserve(_numCU * _workersPerCU); - for (int i = 0; i < _numCU * _workersPerCU; i++) { + batchedData.reserve(_numCU * _workersPerCU * _numDevice); + for (int i = 0; i < _numCU * _workersPerCU * _numDevice; i++) { batchedData.emplace_back(); } // Batch and distribute data - db = new DataBatcher(_batchsize, - _sampleInputSize, - _sampleOutputSize, - _numCU * _workersPerCU, - profiling, + db = new DataBatcher(_batchsize, _sampleInputSize, _sampleOutputSize, _numCU * _workersPerCU * _numDevice, profilingDataRepeat); db->read(fin); db->createResultBuffers(); @@ -134,41 +151,57 @@ template class FpgaObj { throw std::runtime_error("No data loaded"); } - std::cout << "\nStarting FPGA run" << std::endl; + std::cout << "Starting FPGA run" << std::endl; auto ts_start = std::chrono::system_clock::now(); + std::vector accelThreads; - accelThreads.reserve(_numCU * _workersPerCU); - for (int i = 0; i < _numCU * _workersPerCU; i++) { - accelThreads.emplace_back([this, i]() { - this->workers[i].evalLoop(this->batchedData[i]); - }); + accelThreads.reserve(_numCU * _workersPerCU * _numDevice); + for (int i = 0; i < _numCU * _workersPerCU * _numDevice; i++) { + accelThreads.emplace_back([this, i]() { this->workers[i].evalLoop(this->batchedData[i]); }); } - for (int i = 0; i < _numCU * _workersPerCU; i++) { + for (int i = 0; i < _numCU * _workersPerCU * _numDevice; i++) { accelThreads[i].join(); } - for (int i = 0; i < _numCU; i++) { - OCL_CHECK(err, err = comQueues[i].finish()); + + for (auto deviceQueue : comQueues) { + for (auto queue : 
deviceQueue) { + OCL_CHECK(err, err = queue.finish()); + } } + auto ts_end = std::chrono::system_clock::now(); uint64_t ns_elapsed = std::chrono::duration_cast(ts_end - ts_start).count(); if (db->isProfilingMode()) { double profilingThroughput = 1.0e9 * static_cast(db->getProfilingSampleCount()) / ns_elapsed; - std::cout << "\nProfiling throughput: " << profilingThroughput << " predictions/second" << std::endl; + std::cout << "Processed " << db->getProfilingSampleCount() << " samples in " << ns_elapsed / 1000000 << " ms" + << std::endl; + std::cout << "Profiling throughput: " << profilingThroughput << " predictions/second" << std::endl; } else { double throughput = 1.0e9 * static_cast(db->getSampleCount()) / ns_elapsed; double maxThroughput = 1.0e9 * static_cast(db->getPaddedSampleCount()) / ns_elapsed; - std::cout << "\nUtilized throughput: " << throughput << " predictions/second" << std::endl; + std::cout << "Utilized throughput: " << throughput << " predictions/second" << std::endl; std::cout << "Max possible throughput: " << maxThroughput << " predictions/second" << std::endl; } } + void checkResults(const std::string &ref) { + if (db == nullptr) { + throw std::runtime_error("No data loaded"); + } + if (db->readReference(ref)) { + db->checkResults(); + } else { + std::cout << "No reference file provided, skipping results check" << std::endl; + } + } + /** * \brief Writes results, in text format, to provided file. Releases resources * \param fout Filename. If file already exists, it will be overwritten with current results. 
*/ - void saveResults(const std::string& fout) { + void saveResults(const std::string &fout) { if (db == nullptr) { throw std::runtime_error("No data loaded"); } @@ -181,24 +214,18 @@ template class FpgaObj { int _sampleInputSize; int _sampleOutputSize; int _numCU; + int _numDevice; std::string _xclbinFilename; + std::string _deviceName; - /// @brief A list of connected Xilinx devices + /// @brief A list of connected AMD/Xilinx devices std::vector devices; - /// @brief The identified FPGA - cl::Device device; - /// @brief Container that xclbin file is read into - std::vector fileBuf; - /// @brief OpenCL object constructed from xclbin - cl::Program::Binaries bins; /// @brief OpenCL Program that each compute unit executes cl::Program program; /// @brief OpenCL Device Context cl::Context context; /// @brief OpenCL Command Queues for each compute unit - std::vector comQueues; - /// @brief Mutexes for each Command Queue - mutable std::vector comQueueMtxi; + std::vector> comQueues; /// @brief Error code storage cl_int err; @@ -206,7 +233,7 @@ template class FpgaObj { /// @brief Workers, indexed by (i_CU * _workersPerCU + i_worker) std::vector> workers; /// @brief Data Batcher - DataBatcher* db = nullptr; + DataBatcher *db = nullptr; /// @brief A vector containing each Worker's batches/workload std::vector>> batchedData; }; diff --git a/hls4ml/templates/vitis_accelerator/libs/Params.hpp b/hls4ml/templates/vitis_accelerator/libs/Params.hpp new file mode 100644 index 0000000000..b2ddf66c96 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/libs/Params.hpp @@ -0,0 +1,107 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "../kernel_wrapper.h" +#include "FpgaObj.hpp" + +class Params { + + public: + Params(int argc, char **argv) { + int opt, temp; + while ((opt = getopt(argc, argv, "x:vhr:n:i:o:d:c:")) != EOF) + switch (opt) { + case 'd': + deviceBDFs.push_back(optarg); + break; + case 'x': + xclbinFilename = optarg; + break; + case 'i': + 
inputFilename = optarg; + break; + case 'o': + outputFilename = optarg; + break; + case 'c': + temp = atoi(optarg); + // Clamp to the number of synthesized CUs; reject non-positive values + if (temp > 0 && temp <= NUM_CU) + numCU = temp; + break; + case 'n': + temp = atoi(optarg); + if (temp > 0) + numWorker = temp; + break; + case 'r': + dataRepeatCount = atoi(optarg); + break; + case 'v': + verbose++; + break; + case 'h': + help(); + exit(0); + default: + std::cout << std::endl; + abort(); + } + + if (verbose > 0) + print(); + } + + void help(void) { + std::cout << "Options:" << std::endl; + std::cout << " -d: device BDF (can be specified multiple times)" << std::endl; + std::cout << " -x: XCLBIN path" << std::endl; + std::cout << " -i: input file" << std::endl; + std::cout << " -o: output file" << std::endl; + std::cout << " -c: maximum computing units count" << std::endl; + std::cout << " -n: maximum workers count" << std::endl; + std::cout << " -r: input data repeat count" << std::endl; + std::cout << " -v: enable verbose output" << std::endl; + std::cout << " -h: this help message" << std::endl; + } + + void print(void) { + std::cout << "Run parameters:" << std::endl; + std::cout << " xclbinFilename: " << xclbinFilename << std::endl; + std::cout << " batchSize: " << batchSize << std::endl; + std::cout << " sampleInputSize: " << sampleInputSize << std::endl; + std::cout << " sampleOutputSize: " << sampleOutputSize << std::endl; + std::cout << " numCU: " << numCU << std::endl; + std::cout << " inputFilename: " << inputFilename << std::endl; + std::cout << " outputFilename: " << outputFilename << std::endl; + std::cout << " numWorker: " << numWorker << std::endl; + std::cout << " dataRepeatCount: " << dataRepeatCount << std::endl; + } + + // Device + std::vector deviceBDFs; + + // Bitstream + std::string xclbinFilename = "./build_hw_rel/kernel_wrapper.xclbin"; + size_t batchSize = BATCHSIZE; + const size_t sampleInputSize = INSTREAMSIZE; + const size_t sampleOutputSize = OUTSTREAMSIZE; + size_t numCU = NUM_CU; + + // Data paths 
+ std::string inputFilename = "./tb_data/tb_input_features.dat"; + std::string referenceFilename = "tb_data/tb_output_predictions.dat"; + std::string outputFilename = "./tb_data/hw_results.dat"; + + // Workers + int numWorker = NUM_WORKER; + + // Benchmark + int dataRepeatCount = -1; + int verbose = 0; +}; diff --git a/hls4ml/templates/vitis_accelerator/libs/Types.hpp b/hls4ml/templates/vitis_accelerator/libs/Types.hpp index 1e8bf0b55e..0ff3bed610 100644 --- a/hls4ml/templates/vitis_accelerator/libs/Types.hpp +++ b/hls4ml/templates/vitis_accelerator/libs/Types.hpp @@ -2,13 +2,7 @@ #include -template -struct Batch { - const T* dataIn; - U* dataOut; -}; - -enum class FPGAType : uint8_t { - DDR = 0, - HBM = 1 +template struct Batch { + const T *dataIn; + U *dataOut; }; diff --git a/hls4ml/templates/vitis_accelerator/libs/Worker.hpp b/hls4ml/templates/vitis_accelerator/libs/Worker.hpp index ff1677d073..5174936f24 100644 --- a/hls4ml/templates/vitis_accelerator/libs/Worker.hpp +++ b/hls4ml/templates/vitis_accelerator/libs/Worker.hpp @@ -10,15 +10,6 @@ #include "Types.hpp" #include "xcl2.hpp" -// HBM Pseudo-channel(PC) requirements -#define MAX_HBM_PC_COUNT 32 -#define PC_NAME(n) n | XCL_MEM_TOPOLOGY -const int pc[MAX_HBM_PC_COUNT] = {PC_NAME(0), PC_NAME(1), PC_NAME(2), PC_NAME(3), PC_NAME(4), PC_NAME(5), PC_NAME(6), - PC_NAME(7), PC_NAME(8), PC_NAME(9), PC_NAME(10), PC_NAME(11), PC_NAME(12), PC_NAME(13), - PC_NAME(14), PC_NAME(15), PC_NAME(16), PC_NAME(17), PC_NAME(18), PC_NAME(19), PC_NAME(20), - PC_NAME(21), PC_NAME(22), PC_NAME(23), PC_NAME(24), PC_NAME(25), PC_NAME(26), PC_NAME(27), - PC_NAME(28), PC_NAME(29), PC_NAME(30), PC_NAME(31)}; - template class Worker { public: /** @@ -29,14 +20,9 @@ template class Worker { * \param commandQueue cl::CommandQueue that the worker will enqueue operations to * \param queueMutex Mutex protecting the CommandQueue (potentially shared with other workers) */ - Worker(int batchsize, int sampleInputSize, int sampleOutputSize, 
cl::CommandQueue& queue, - std::mutex& queueMutex) : - _batchsize(batchsize), - _sampleInputSize(sampleInputSize), - _sampleOutputSize(sampleOutputSize), - _queue(queue), - _queueMutex(queueMutex) - { + Worker(int deviceId, int workerId, int batchsize, int sampleInputSize, int sampleOutputSize, cl::CommandQueue &queue) + : _deviceId(deviceId), _workerId(workerId), _batchsize(batchsize), _sampleInputSize(sampleInputSize), + _sampleOutputSize(sampleOutputSize), _queue(queue), writeEvents(1), executionEvents(1) { memmap_in.resize(_batchsize * _sampleInputSize, T(0.0f)); memmap_out.resize(_batchsize * _sampleOutputSize, U(0.0f)); } @@ -46,97 +32,103 @@ template class Worker { * \param context cl::Context of the FPGA. * \param program cl:Program of the FPGA. * \param computeUnit The number of the physical compute unit this worker will use. - * \param workerId Worker's ID number. - * \param fpga Type of memory resource used by the FPGA. - * \param firstHBMChannel Start index of this Worker's memory channels this worker uses. Only for HBM. - * \param numHBMChannels Number of channels per port this worker uses. Only for HBM. 
*/ - void initialize( - cl::Context& context, - cl::Program& program, - int computeUnit, - int workerId, - FPGAType fpga, - int firstHBMChannel = 0, // Only used for if fpga == FPGAType::HBM - int numHBMChannels = 0 // Only used for if fpga == FPGAType::HBM - ) { - if (fpga == FPGAType::HBM) { - allocateHBMMemory(context, firstHBMChannel, numHBMChannels); - } else if (fpga == FPGAType::DDR) { - allocateDDRMemory(context); - } + void initialize(cl::Context &context, cl::Program &program, int computeUnit) { + cl_int err; + + // This is AMD's format for specifying the Compute Unit a kernel object uses + std::string krnl_name = "kernel_wrapper:{kernel_wrapper_" + std::to_string(computeUnit) + "}"; // Creating Kernel object - std::string krnl_name = - "kernel_wrapper:{kernel_wrapper_" + std::to_string(computeUnit) + - "}"; // This is Xilinx's format for specifying the Compute Unit a kernel object uses - krnl = cl::Kernel(program, krnl_name.c_str(), &err); - krnl.setArg(0, input_buffer); - krnl.setArg(1, output_buffer); - - std::cout << "Initialized Worker " << workerId - << ", which will use Compute Unit " << computeUnit << std::endl; + OCL_CHECK(err, krnl = cl::Kernel(program, krnl_name.c_str(), &err)); + + // Per AMD documentation we can leave XRT infer the bank location for the buffer: + // " The XRT can obtain the bank location for the buffer if the buffer + // is used for setting the kernel arguments right after the buffer + // creation, i.e. before any enqueue operation on the buffer." 
+ + const size_t vector_in_size_bytes = sizeof(T) * _batchsize * _sampleInputSize; + const size_t vector_out_size_bytes = sizeof(U) * _batchsize * _sampleOutputSize; + + OCL_CHECK(err, input_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, vector_in_size_bytes, + memmap_in.data(), &err)); + + OCL_CHECK(err, output_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, vector_out_size_bytes, + memmap_out.data(), &err)); + + // Set kernel arguments will effectively affect the memory bank location + OCL_CHECK(err, err = krnl.setArg(0, input_buffer)); + OCL_CHECK(err, err = krnl.setArg(1, output_buffer)); + + // Perform a dummy transfer input batch to FPGA to ensure that allocation time is not counted + // in the evaluation time. Also allows us to query the memory bank location. + int mem_bank_index = -1; + OCL_CHECK(err, err = _queue.enqueueMigrateMemObjects({input_buffer}, + 0, // 0 means from host + NULL, // No dependencies + &writeEvents[0])); + OCL_CHECK(err, err = writeEvents[0].wait()); + OCL_CHECK(err, err = clGetMemObjectInfo(input_buffer.get(), CL_MEM_BANK, sizeof(int), &mem_bank_index, nullptr)); + + std::cout << "Initialized Worker " << _workerId << ", using CU " << computeUnit << " and memory bank " + << mem_bank_index << " on device " << _deviceId << std::endl; } /** * \brief Evaluates the single batch currently in input_buffer and writes to output_buffer. 
*/ void evaluate() { - std::lock_guard lock(_queueMutex); - // Transfer inputs + + cl_int err; + + // Transfer input batch to FPGA OCL_CHECK(err, err = _queue.enqueueMigrateMemObjects({input_buffer}, - 0, // 0 means from host + 0, // 0 means from host NULL, // No dependencies - &write_event)); - // Execute program - writeCompleteEvents.push_back(write_event); - OCL_CHECK(err, err = _queue.enqueueNDRangeKernel(krnl, - 0, - 1, - 1, - &writeCompleteEvents, - &kernExe_event)); - writeCompleteEvents.pop_back(); - // Transfer outputs - kernExeCompleteEvents.push_back(kernExe_event); - OCL_CHECK(err, err = _queue.enqueueMigrateMemObjects({output_buffer}, - CL_MIGRATE_MEM_OBJECT_HOST, - &kernExeCompleteEvents, - &read_event)); - kernExeCompleteEvents.pop_back(); - OCL_CHECK(err, err = read_event.wait()); + &writeEvents[0])); + + // Run kernel on the batch + OCL_CHECK(err, err = _queue.enqueueNDRangeKernel(krnl, 0, 1, 1, &writeEvents, &executionEvents[0])); + + // Transfer output batch from FPGA + OCL_CHECK(err, err = _queue.enqueueMigrateMemObjects({output_buffer}, CL_MIGRATE_MEM_OBJECT_HOST, &executionEvents, + &readEvent)); + + // Wait for all operations to complete + OCL_CHECK(err, err = readEvent.wait()); } /** * \brief Evaluates each batch of data provided via dataTracker. Uses float datatype * \param dataTracker Vector of input locations to read from and output locations to write to */ - void evalLoop(std::list>& dataTracker) { + void evalLoop(std::list> &dataTracker) { + while (!dataTracker.empty()) { // Copy inputs into memory-mapped buffer - const T* dataLoc = dataTracker.front().dataIn; + // FIXME: It there a way to avoid this copy? Could the orignal batch be used directly if aligned? 
+ const T *dataLoc = dataTracker.front().dataIn; memcpy(&memmap_in[0], dataLoc, _batchsize * _sampleInputSize * sizeof(T)); // Evaluate evaluate(); // Copy outputs into persistent results vector - U* resLoc = dataTracker.front().dataOut; + U *resLoc = dataTracker.front().dataOut; memcpy(resLoc, &memmap_out[0], _batchsize * _sampleOutputSize * sizeof(U)); - dataTracker.pop_front(); } } private: + int _deviceId; + int _workerId; int _batchsize; int _sampleInputSize; int _sampleOutputSize; /// @brief Reference to the OpenCL command queue - const cl::CommandQueue& _queue; - /// @brief Mutex for thread-safe access to the command queue - std::mutex& _queueMutex; + const cl::CommandQueue &_queue; /// @brief Vector mapped to FPGA input buffer std::vector> memmap_in; @@ -150,60 +142,10 @@ template class Worker { /// @brief OpenCL kernel object cl::Kernel krnl; - /// @brief Pointer mapping host input buffer to FPGA memory pseudo-channels (HBM only) - cl_mem_ext_ptr_t hbm_in_ptr; - /// @brief Pointer mapping host output buffer to FPGA memory pseudo-channels (HBM only) - cl_mem_ext_ptr_t hbm_out_ptr; - - /// @brief Event for signaling input transfer completion - cl::Event write_event; - /// @brief Event for signaling kernel execution completion - cl::Event kernExe_event; - /// @brief Event for signaling output transfer completion - cl::Event read_event; /// @brief Vector tracking write events. Required by OpenCL queue functions. - std::vector writeCompleteEvents; + std::vector writeEvents; /// @brief Vector tracking kernel execution events. Required by OpenCL queue functions. 
- std::vector kernExeCompleteEvents; - - /// @brief Error code storage - cl_int err; - bool firstLoop = true; - - void allocateHBMMemory(cl::Context& context, int firstHBMChannel, int numHBMChannels) { - // Create Pointer objects for the in/out ports for each worker - // Assigning Pointers to specific HBM PC's using cl_mem_ext_ptr_t type and corresponding PC flags - hbm_in_ptr.obj = memmap_in.data(); - hbm_in_ptr.param = 0; - int in_flags = 0; - for (int i = 0; i < numHBMChannels; i++) { - in_flags |= pc[firstHBMChannel + i]; - } - hbm_in_ptr.flags = in_flags; - - hbm_out_ptr.obj = memmap_out.data(); - hbm_out_ptr.param = 0; - int out_flags = 0; - for (int i = 0; i < numHBMChannels; i++) { - out_flags |= pc[firstHBMChannel + numHBMChannels + i]; - } - hbm_out_ptr.flags = out_flags; - - // Creating Buffer objects in Host memory - uint64_t vector_in_size_bytes = sizeof(T) * _batchsize * _sampleInputSize; - uint64_t vector_out_size_bytes = sizeof(U) * _batchsize * _sampleOutputSize; - input_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_EXT_PTR_XILINX | CL_MEM_READ_ONLY, - vector_in_size_bytes, &hbm_in_ptr); - output_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_EXT_PTR_XILINX | CL_MEM_WRITE_ONLY, - vector_out_size_bytes, &hbm_out_ptr); - } - - void allocateDDRMemory(cl::Context& context) { - uint64_t vector_in_size_bytes = sizeof(T) * _batchsize * _sampleInputSize; - uint64_t vector_out_size_bytes = sizeof(U) * _batchsize * _sampleOutputSize; - input_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, vector_in_size_bytes, - memmap_in.data()); - output_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, vector_out_size_bytes, - memmap_out.data()); - } + std::vector executionEvents; + /// @brief Event for signaling output transfer completion + cl::Event readEvent; }; diff --git a/hls4ml/templates/vitis_accelerator/libs/xcl2.cpp b/hls4ml/templates/vitis_accelerator/libs/xcl2.cpp index f2480567ae..6e03deb793 
100644 --- a/hls4ml/templates/vitis_accelerator/libs/xcl2.cpp +++ b/hls4ml/templates/vitis_accelerator/libs/xcl2.cpp @@ -37,8 +37,6 @@ std::vector get_devices(const std::string &vendor_name) { platform = platforms[i]; OCL_CHECK(err, std::string platformName = platform.getInfo(&err)); if (!(platformName.compare(vendor_name))) { - std::cout << "Found Platform" << std::endl; - std::cout << "Platform Name: " << platformName.c_str() << std::endl; break; } } @@ -74,7 +72,7 @@ cl::Device find_device_bdf(const std::vector &devices, const std::st } } if (cnt == 0) { - std::cout << "Invalid device bdf. Please check and provide valid bdf\n"; + std::cout << "Invalid device bdf: " << bdf << ". Please check and provide valid bdf\n"; exit(EXIT_FAILURE); } return device; @@ -103,14 +101,12 @@ cl_device_id find_device_bdf_c(cl_device_id *devices, const std::string &bdf, cl return device; } std::vector read_binary_file(const std::string &xclbin_file_name) { - std::cout << "INFO: Reading " << xclbin_file_name << std::endl; FILE *fp; if ((fp = fopen(xclbin_file_name.c_str(), "r")) == nullptr) { printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str()); exit(EXIT_FAILURE); } // Loading XCL Bin into char buffer - std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n"; std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary); bin_file.seekg(0, bin_file.end); auto nb = bin_file.tellg(); diff --git a/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp b/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp index 0fd48a4bec..daad501fa9 100644 --- a/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp +++ b/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp @@ -1,42 +1,26 @@ #include #include "FpgaObj.hpp" +#include "Params.hpp" #include "Types.hpp" #include "kernel_wrapper.h" #include "xcl2.hpp" int main(int argc, char **argv) { - if (argc < 2) { - std::cout << "Usage: " << argv[0] - << " " - << " [Profiling: Data repeat 
count]" << std::endl; - return EXIT_FAILURE; - } - std::string xclbinFilename = argv[1]; - int dataRepeatCount = -1; - if (argc == 3) { - dataRepeatCount = std::stoi(argv[2]); - } - - FpgaObj fpga(BATCHSIZE, - INSTREAMSIZE, - OUTSTREAMSIZE, - NUM_CU, - xclbinFilename); - - fpga.createWorkers(NUM_WORKER, - FPGAType::/*FPGA_Type*/, - NUM_CHANNEL); - - if (dataRepeatCount == -1) { - fpga.loadData("../tb_data/tb_input_features.dat"); - } else { - fpga.loadData("../tb_data/tb_input_features.dat", true, dataRepeatCount); - } + + Params params(argc, argv); + + FpgaObj fpga(params); + + fpga.createWorkers(params.numWorker); + + fpga.loadData(params.inputFilename, params.dataRepeatCount); fpga.evaluateAll(); - fpga.saveResults("../tb_data/hw_results.dat"); + fpga.checkResults(params.referenceFilename); + + fpga.saveResults(params.outputFilename); return EXIT_SUCCESS; }