From 18f7fc7f10e58bfab225dc96dab28756e4394937 Mon Sep 17 00:00:00 2001 From: Quentin Berthet Date: Sun, 12 Jan 2025 22:39:19 +0100 Subject: [PATCH] VitisAccelerator Host code refactor: - Multiple devices support - Selection of device by BDF - OpenCL error checking - Automatic memory bank association - Inferences validation - Improved command line parameters - Improved debug output - Dummy buffer copy to avoid benchmarking buffer allocation time - Removal of mutexes preventing buffer copies overlap with kernel executions on the same CU with multiple workers - Documentation --- docs/backend/accelerator.rst | 27 ++- .../vitis_accelerator/libs/DataBatcher.hpp | 125 +++++++---- .../vitis_accelerator/libs/FpgaObj.hpp | 191 +++++++++-------- .../vitis_accelerator/libs/Params.hpp | 107 ++++++++++ .../vitis_accelerator/libs/Types.hpp | 12 +- .../vitis_accelerator/libs/Worker.hpp | 194 ++++++------------ .../templates/vitis_accelerator/libs/xcl2.cpp | 6 +- .../vitis_accelerator/myproject_host_cl.cpp | 40 ++-- 8 files changed, 414 insertions(+), 288 deletions(-) create mode 100644 hls4ml/templates/vitis_accelerator/libs/Params.hpp diff --git a/docs/backend/accelerator.rst b/docs/backend/accelerator.rst index 1c1d06a7a6..d9cb4e31b1 100644 --- a/docs/backend/accelerator.rst +++ b/docs/backend/accelerator.rst @@ -132,11 +132,34 @@ Once the project is generated, it possible to run manually the build steps by us It is also possible to run the full build process by calling ``make`` without any target. Modifications to the ``accelerator_card.cfg`` file can be done manually before running the build process (e.g., to change the clock period, or add addition ``.xo`` kernel to the build). -The generated host code application and the xclbin file can be executed as such: +Host code +========= + +Once built, the host program can be run to load the board and perform inferences: + +.. 
code-block:: Bash + + ./host + +By default, all Computing Units (CUs) on all compatible devices will be used, with 3 worker threads per CU. + +The generated host code application supports the following options to tweak the execution: + + * ``-d``: device BDF to use (can be specified multiple times) + * ``-x``: XCLBIN path + * ``-i``: input feature file + * ``-o``: output feature file + * ``-c``: maximum computing units count to use + * ``-n``: number of worker threads to use + * ``-r``: number of repetitions of the input feature file (for artificially increasing the data size for benchmarking purposes) + * ``-v``: enable verbose output + * ``-h``: print help + +The following example shows how to limit execution to only one device, one CU, and one worker thread: .. code-block:: Bash - ./host /.xclbin + ./host -d 0000:c1:00.1 -c 1 -n 1 Example ======= diff --git a/hls4ml/templates/vitis_accelerator/libs/DataBatcher.hpp b/hls4ml/templates/vitis_accelerator/libs/DataBatcher.hpp index 6353d59c61..96f15d4a1a 100644 --- a/hls4ml/templates/vitis_accelerator/libs/DataBatcher.hpp +++ b/hls4ml/templates/vitis_accelerator/libs/DataBatcher.hpp @@ -5,8 +5,8 @@ #include #include #include -#include #include +#include #include #include @@ -25,10 +25,9 @@ template class DataBatcher { * \param profilingDataRepeat Only used if profiling is set to True. Additional number of * times the given data is iterated over. */ - DataBatcher(int batchsize, int sampleInputSize, int sampleOutputSize, int numWorkers, - bool profiling, int profilingDataRepeat) + DataBatcher(int batchsize, int sampleInputSize, int sampleOutputSize, int numWorkers, int profilingDataRepeat) : _batchsize(batchsize), _sampleInputSize(sampleInputSize), _sampleOutputSize(sampleOutputSize), - _numWorkers(numWorkers), _profiling(profiling), _profilingDataRepeat(profilingDataRepeat) {} + _numWorkers(numWorkers), _profilingDataRepeat(profilingDataRepeat) {} /** * \brief Read in data to a buffer. Allocate space for results. 
@@ -36,15 +35,15 @@ template class DataBatcher { * \param s Type of input, currently supports text files used by VitisAccelerator backend, and * binary files produced by NumPy's toFile() function */ - void read(const std::string& filename) { - std::cout << "\nReading data from text file " << filename << std::endl; + void read(const std::string &filename) { - // Read in text file std::ifstream fin(filename); if (!fin.is_open()) { throw std::runtime_error("Error opening file " + filename); } + std::cout << "Reading data from: " << filename << std::endl; + std::string line; while (std::getline(fin, line)) { originalSampleCount++; @@ -57,13 +56,70 @@ template class DataBatcher { throw std::runtime_error("Failed to parse value on line " + std::to_string(originalSampleCount)); } } - std::cout << "Read in " << originalSampleCount << " lines" << std::endl; + + std::cout << "Read in " << originalSampleCount << " samples (" << inputData.size() << " elements)" << std::endl; fin.close(); // Zero-pad numBatches = std::ceil(static_cast(originalSampleCount) / _batchsize); - if (numBatches * _batchsize > originalSampleCount) { - inputData.resize(numBatches * _batchsize * _sampleInputSize, (T)0); + size_t finalSampleCount = numBatches * _batchsize; + if (finalSampleCount > originalSampleCount) { + std::cout << "Padding with " << (finalSampleCount - originalSampleCount) << " empty samples for a total of " + << numBatches << " batches of " << _batchsize << " samples" << std::endl; + inputData.resize(finalSampleCount * _sampleInputSize, (T)0); + } + } + + bool readReference(const std::string &filename) { + + std::ifstream fref(filename); + if (!fref.is_open()) { + return false; + } + + std::cout << "Reading data from: " << filename << std::endl; + size_t refSampleCount = 0; + std::string line; + while (std::getline(fref, line)) { + refSampleCount++; + std::istringstream parser(line); + T val; + while (parser >> val) { + refData.push_back(val); + } + if (!parser.eof()) { + throw 
std::runtime_error("Failed to parse value on line " + std::to_string(refSampleCount)); + } + } + + std::cout << "Read in " << refSampleCount << " reference samples (" << refData.size() << " elements)" << std::endl; + fref.close(); + return true; + } + + void checkResults() { + if (storedEvalResults.size() == 0 || refData.size() == 0) { + throw std::runtime_error("No data to check"); + } + + if (storedEvalResults.size() != refData.size()) { + throw std::runtime_error("Stored results and reference data are not the same size"); + } + size_t error_count = 0; + for (uint64_t i = 0; i < storedEvalResults.size(); i++) { + if (storedEvalResults[i] != refData[i]) { + error_count++; + std::cout << "Mismatch at index " + std::to_string(i) + ": " + std::to_string((float)storedEvalResults[i]) + + " != " + std::to_string((float)refData[i]) + << ", error = " << ((float)storedEvalResults[i] - (float)refData[i]) << std::endl; + } + } + + if (error_count > 0) { + std::cout << "Mismatch count: " << error_count << std::endl; + throw std::runtime_error("Results do not match reference data"); + } else { + std::cout << "Results match reference data" << std::endl; } } @@ -74,7 +130,7 @@ template class DataBatcher { storedEvalResults.resize(numBatches * _batchsize * _sampleOutputSize, (U)0); // Allocate space to dump the extra arbitrary data used during profiling - if (_profiling) { + if (isProfilingMode()) { profilingResultsDump.resize(_numWorkers * _batchsize * _sampleOutputSize, (U)0); } } @@ -84,18 +140,18 @@ template class DataBatcher { * \param batchedData A vector of containers for each Worker's batches/workload. * Size must be equal to _numWorkers. 
*/ - void batch(std::vector>>& batchedData) { + void batch(std::vector>> &batchedData) { if (inputData.size() == 0 || originalSampleCount == 0) { throw std::runtime_error("No data to batch"); } + std::cout << "Original sample count: " << originalSampleCount << std::endl; + std::cout << "Input sample element count: " << _sampleInputSize << std::endl; + std::cout << "Output sample element count: " << _sampleOutputSize << std::endl; if (storedEvalResults.size() == 0) { throw std::runtime_error("Create result buffers first"); } - batchedData.reserve(_numWorkers); - for (int i = 0; i < _numWorkers; i++) { - batchedData.emplace_back(); - } + batchedData.resize(_numWorkers); uint64_t batchIndex = 0; while (batchIndex < numBatches) { @@ -103,24 +159,28 @@ template class DataBatcher { uint64_t inputLocation = batchIndex * _batchsize * _sampleInputSize; uint64_t outputLocation = batchIndex * _batchsize * _sampleOutputSize; - const T* in = &inputData[inputLocation]; - U* out = &storedEvalResults[outputLocation]; + const T *in = &inputData[inputLocation]; + U *out = &storedEvalResults[outputLocation]; Batch newBatch = {in, out}; batchedData[worker].push_back(newBatch); batchIndex++; } - if (_profiling) { + if (isProfilingMode()) { std::cout << "Creating profiling batches" << std::endl; profilingBatchCount = numBatches * (_profilingDataRepeat + 1); + std::cout << "Batches: " << numBatches << std::endl; + std::cout << "Profiling batch count: " << profilingBatchCount << std::endl; + std::cout << "Profiling data repeat: " << _profilingDataRepeat << std::endl; + std::cout << "Profiling total data count: " << profilingBatchCount * _batchsize << std::endl; while (batchIndex < profilingBatchCount) { int worker = batchIndex % _numWorkers; uint64_t inputLocation = (batchIndex % numBatches) * _batchsize * _sampleInputSize; uint64_t outputLocation = worker * _batchsize * _sampleOutputSize; - const T* in = &inputData[inputLocation]; - U* out = &profilingResultsDump[outputLocation]; + const 
T *in = &inputData[inputLocation]; + U *out = &profilingResultsDump[outputLocation]; Batch newBatch = {in, out}; batchedData[worker].push_back(newBatch); @@ -141,8 +201,8 @@ template class DataBatcher { profilingBatchCount = 0; } - void write(const std::string& filename) { - std::cout << "\nWriting HW results to file " << filename << std::endl; + void write(const std::string &filename) { + std::cout << "Writing HW results to: " << filename << std::endl; std::ofstream fout; fout.open(filename, std::ios::trunc); @@ -163,28 +223,19 @@ template class DataBatcher { profilingResultsDump.clear(); } - uint64_t getSampleCount() { - return originalSampleCount; - } + uint64_t getSampleCount() { return originalSampleCount; } - uint64_t getPaddedSampleCount() { - return numBatches * _batchsize; - } + uint64_t getPaddedSampleCount() { return numBatches * _batchsize; } - uint64_t getProfilingSampleCount() { - return profilingBatchCount * _batchsize; - } + uint64_t getProfilingSampleCount() { return profilingBatchCount * _batchsize; } - bool isProfilingMode() { - return _profiling; - } + bool isProfilingMode() { return _profilingDataRepeat > 0; } private: int _batchsize; int _sampleInputSize; int _sampleOutputSize; int _numWorkers; - bool _profiling; int _profilingDataRepeat; /// @brief Number of floats read in. (Not including padding). @@ -195,6 +246,8 @@ template class DataBatcher { uint64_t profilingBatchCount = 0; /// @brief Vector with values. std::vector inputData; + /// @brief Vector with reference values. + std::vector refData; /// @brief Vector to store evaluation results. std::vector storedEvalResults; /// @brief Vector for dumping results from extra arbitrary data used during profiling. 
diff --git a/hls4ml/templates/vitis_accelerator/libs/FpgaObj.hpp b/hls4ml/templates/vitis_accelerator/libs/FpgaObj.hpp index a046c28644..df3e1d9de9 100644 --- a/hls4ml/templates/vitis_accelerator/libs/FpgaObj.hpp +++ b/hls4ml/templates/vitis_accelerator/libs/FpgaObj.hpp @@ -10,6 +10,7 @@ #include #include "DataBatcher.hpp" +#include "Params.hpp" #include "Types.hpp" #include "Worker.hpp" #include "xcl2.hpp" @@ -24,75 +25,97 @@ template class FpgaObj { * \param numCU Number of compute units synthesized on the FPGA * \param xclbinFilename String containing path of synthesized xclbin */ - FpgaObj(int batchsize, int sampleInputSize, int sampleOutputSize, int numCU, - std::string xclbinFilename) - : _batchsize(batchsize), _sampleInputSize(sampleInputSize), _sampleOutputSize(sampleOutputSize), - _numCU(numCU), _xclbinFilename(xclbinFilename) { + FpgaObj(const Params ¶ms) + : _batchsize(params.batchSize), _sampleInputSize(params.sampleInputSize), _sampleOutputSize(params.sampleOutputSize), + _numCU(params.numCU), _xclbinFilename(params.xclbinFilename) { + + if (params.deviceBDFs.size() == 0) { + // Finds all AMD/Xilinx devices present in system + devices = xcl::get_xil_devices(); + if (devices.size() == 0) { + throw std::runtime_error("No AMD/Xilinx FPGA devices found"); + } + for (auto &device : devices) { + std::string device_bdf; + OCL_CHECK(err, err = device.getInfo(CL_DEVICE_PCIE_BDF, &device_bdf)); + std::cout << "Found device: " << device.getInfo() << " (" << device_bdf << ")" << std::endl; + } + + } else { + // Find devices by BDF + devices.reserve(params.deviceBDFs.size()); + for (auto &bdf : params.deviceBDFs) { + devices.push_back(xcl::find_device_bdf(xcl::get_xil_devices(), bdf)); + std::cout << "Found device: " << devices.back().getInfo() << " (" << bdf << ")" << std::endl; + } + } - // Finds Xilinx device - devices = xcl::get_xil_devices(); - device = devices[0]; - std::string deviceName = device.getInfo(); - std::cout << "Found Device: " << deviceName << 
std::endl; + // Ensure that all devices are of the same type + for (auto &device : devices) { + std::string device_name = device.getInfo(); + if (_deviceName.empty()) { + _deviceName = device_name; + } else if (_deviceName != device_name) { + throw std::runtime_error( + "All devices must be of the same type, use -d to specify the BDFs of the devices you want to use"); + } + } + + _numDevice = devices.size(); // Load xclbin - fileBuf = xcl::read_binary_file(_xclbinFilename); - bins = cl::Program::Binaries({{fileBuf.data(), fileBuf.size()}}); + std::cout << "Loading: " << _xclbinFilename << std::endl; + std::vector fileBuf = xcl::read_binary_file(_xclbinFilename); + cl::Program::Binaries bins; + for (int i = 0; i < _numDevice; i++) { + bins.push_back({fileBuf.data(), fileBuf.size()}); + } // Create OpenCL context - context = cl::Context(device); + OCL_CHECK(err, context = cl::Context(devices, nullptr, nullptr, nullptr, &err)); // Create OpenCL program from binary file - program = cl::Program(context, devices, bins); - - // Create OpenCL command queues - comQueues.reserve(_numCU); - for (int i = 0; i < _numCU; i++) { - comQueues.emplace_back(context, - device, - CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); + OCL_CHECK(err, program = cl::Program(context, devices, bins, nullptr, &err)); + + std::cout << "Device programmed successfully" << std::endl; + + // Create OpenCL program, and command queues for each device + comQueues.resize(_numDevice); + for (int i = 0; i < _numDevice; i++) { + comQueues[i].resize(_numCU); + // Create OpenCL out-of-order command queues (One per compute unit) + for (int j = 0; j < _numCU; j++) { + comQueues[i][j] = cl::CommandQueue(context, devices[i], + CL_QUEUE_PROFILING_ENABLE | CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE); + } } - - // Create mutexes for each command queue - std::vector temp(_numCU); - comQueueMtxi.swap(temp); } /** - * \brief Creates worker objects for each compute unit + * \brief Creates worker objects 
for each compute unit on each device * \param workersPerCU Number of worker objects that will drive each compute unit - * \param fpga Type of memory resource used by the FPGA. - * \param numHBMChannels Number of channels per port each Worker uses. Only for HBM. */ - void createWorkers(int workersPerCU, FPGAType fpga, int numHBMChannels = 0) { + void createWorkers(int workersPerCU) { _workersPerCU = workersPerCU; // Construct workers workers.reserve(_numCU * _workersPerCU); - for (int i = 0; i < _numCU; i++) { - for (int j = 0; j < workersPerCU; j++) { - workers.emplace_back(_batchsize, - _sampleInputSize, - _sampleOutputSize, - comQueues[i], - comQueueMtxi[i]); + for (int d = 0; d < _numDevice; d++) { + for (int cu = 0; cu < _numCU; cu++) { + for (int w = 0; w < _workersPerCU; w++) { + workers.emplace_back(d, d * (_numCU * _workersPerCU) + cu * _workersPerCU + w, _batchsize, + _sampleInputSize, _sampleOutputSize, comQueues[d][cu]); + } } } // Initialize workers - int currHBMChannel = 0; // Only used if FPGAType is HBM - for (int i = 0; i < _numCU; i++) { - for (int j = 0; j < _workersPerCU; j++) { - workers[i * _workersPerCU + j].initialize(context, - program, - i + 1, - i * _workersPerCU + j, - fpga, - currHBMChannel, - numHBMChannels); - + for (int d = 0; d < _numDevice; d++) { + for (int cu = 0; cu < _numCU; cu++) { + for (int w = 0; w < _workersPerCU; w++) { + workers[d * (_numCU * _workersPerCU) + cu * _workersPerCU + w].initialize(context, program, cu + 1); + } } - currHBMChannel += 2 * numHBMChannels; } } @@ -101,24 +124,18 @@ template class FpgaObj { * \param fin Filename * \param s Input type. VitisAccelerator Backend currently uses text input. However, * the code also supports binary input in the format produced by NumPy's toFile(). - * \param profiling If true, the given data will be iterated over multiple times, - * for more accurate throughput testing. - * \param profilingDataRepeat Only used if profiling is set to True. 
Additional number of - * times the given data is iterated over. + * \param profilingDataRepeat Additional number of times the given data is iterated + * over. Profiling is enabled if this is greater than 0. */ - void loadData(const std::string& fin, bool profiling = false, int profilingDataRepeat = 0) { + void loadData(const std::string &fin, int profilingDataRepeat = 0) { // Set-up containers for each Worker's batches/workload - batchedData.reserve(_numCU * _workersPerCU); - for (int i = 0; i < _numCU * _workersPerCU; i++) { + batchedData.reserve(_numCU * _workersPerCU * _numDevice); + for (int i = 0; i < _numCU * _workersPerCU * _numDevice; i++) { batchedData.emplace_back(); } // Batch and distribute data - db = new DataBatcher(_batchsize, - _sampleInputSize, - _sampleOutputSize, - _numCU * _workersPerCU, - profiling, + db = new DataBatcher(_batchsize, _sampleInputSize, _sampleOutputSize, _numCU * _workersPerCU * _numDevice, profilingDataRepeat); db->read(fin); db->createResultBuffers(); @@ -134,41 +151,57 @@ template class FpgaObj { throw std::runtime_error("No data loaded"); } - std::cout << "\nStarting FPGA run" << std::endl; + std::cout << "Starting FPGA run" << std::endl; auto ts_start = std::chrono::system_clock::now(); + std::vector accelThreads; - accelThreads.reserve(_numCU * _workersPerCU); - for (int i = 0; i < _numCU * _workersPerCU; i++) { - accelThreads.emplace_back([this, i]() { - this->workers[i].evalLoop(this->batchedData[i]); - }); + accelThreads.reserve(_numCU * _workersPerCU * _numDevice); + for (int i = 0; i < _numCU * _workersPerCU * _numDevice; i++) { + accelThreads.emplace_back([this, i]() { this->workers[i].evalLoop(this->batchedData[i]); }); } - for (int i = 0; i < _numCU * _workersPerCU; i++) { + for (int i = 0; i < _numCU * _workersPerCU * _numDevice; i++) { accelThreads[i].join(); } - for (int i = 0; i < _numCU; i++) { - OCL_CHECK(err, err = comQueues[i].finish()); + + for (auto deviceQueue : comQueues) { + for (auto queue : 
deviceQueue) { + OCL_CHECK(err, err = queue.finish()); + } } + auto ts_end = std::chrono::system_clock::now(); uint64_t ns_elapsed = std::chrono::duration_cast(ts_end - ts_start).count(); if (db->isProfilingMode()) { double profilingThroughput = 1.0e9 * static_cast(db->getProfilingSampleCount()) / ns_elapsed; - std::cout << "\nProfiling throughput: " << profilingThroughput << " predictions/second" << std::endl; + std::cout << "Processed " << db->getProfilingSampleCount() << " samples in " << ns_elapsed / 1000000 << " ms" + << std::endl; + std::cout << "Profiling throughput: " << profilingThroughput << " predictions/second" << std::endl; } else { double throughput = 1.0e9 * static_cast(db->getSampleCount()) / ns_elapsed; double maxThroughput = 1.0e9 * static_cast(db->getPaddedSampleCount()) / ns_elapsed; - std::cout << "\nUtilized throughput: " << throughput << " predictions/second" << std::endl; + std::cout << "Utilized throughput: " << throughput << " predictions/second" << std::endl; std::cout << "Max possible throughput: " << maxThroughput << " predictions/second" << std::endl; } } + void checkResults(const std::string &ref) { + if (db == nullptr) { + throw std::runtime_error("No data loaded"); + } + if (db->readReference(ref)) { + db->checkResults(); + } else { + std::cout << "No reference file provided, skipping results check" << std::endl; + } + } + /** * \brief Writes results, in text format, to provided file. Releases resources * \param fout Filename. If file already exists, it will be overwritten with current results. 
*/ - void saveResults(const std::string& fout) { + void saveResults(const std::string &fout) { if (db == nullptr) { throw std::runtime_error("No data loaded"); } @@ -181,24 +214,18 @@ template class FpgaObj { int _sampleInputSize; int _sampleOutputSize; int _numCU; + int _numDevice; std::string _xclbinFilename; + std::string _deviceName; - /// @brief A list of connected Xilinx devices + /// @brief A list of connected AMD/Xilinx devices std::vector devices; - /// @brief The identified FPGA - cl::Device device; - /// @brief Container that xclbin file is read into - std::vector fileBuf; - /// @brief OpenCL object constructed from xclbin - cl::Program::Binaries bins; /// @brief OpenCL Program that each compute unit executes cl::Program program; /// @brief OpenCL Device Context cl::Context context; /// @brief OpenCL Command Queues for each compute unit - std::vector comQueues; - /// @brief Mutexes for each Command Queue - mutable std::vector comQueueMtxi; + std::vector> comQueues; /// @brief Error code storage cl_int err; @@ -206,7 +233,7 @@ template class FpgaObj { /// @brief Workers, indexed by (i_CU * _workersPerCU + i_worker) std::vector> workers; /// @brief Data Batcher - DataBatcher* db = nullptr; + DataBatcher *db = nullptr; /// @brief A vector containing each Worker's batches/workload std::vector>> batchedData; }; diff --git a/hls4ml/templates/vitis_accelerator/libs/Params.hpp b/hls4ml/templates/vitis_accelerator/libs/Params.hpp new file mode 100644 index 0000000000..b2ddf66c96 --- /dev/null +++ b/hls4ml/templates/vitis_accelerator/libs/Params.hpp @@ -0,0 +1,107 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include "../kernel_wrapper.h" +#include "FpgaObj.hpp" + +class Params { + + public: + Params(int argc, char **argv) { + int opt, temp; + while ((opt = getopt(argc, argv, "x:vhr:n:i:o:d:c:")) != EOF) + switch (opt) { + case 'd': + deviceBDFs.push_back(optarg); + break; + case 'x': + xclbinFilename = optarg; + break; + case 'i': + 
inputFilename = optarg; + break; + case 'o': + outputFilename = optarg; + break; + case 'c': + temp = atoi(optarg); + // Clamp to the number of synthesized CUs; reject non-positive values + if (temp > 0 && temp <= NUM_CU) + numCU = temp; + break; + case 'n': + temp = atoi(optarg); + if (temp > 0) + numWorker = temp; + break; + case 'r': + dataRepeatCount = atoi(optarg); + break; + case 'v': + verbose++; + break; + case 'h': + help(); + exit(0); + default: + std::cout << std::endl; + abort(); + } + + if (verbose > 0) + print(); + } + + void help(void) { + std::cout << "Options:" << std::endl; + std::cout << " -d: device BDF (can be specified multiple times)" << std::endl; + std::cout << " -x: XCLBIN path" << std::endl; + std::cout << " -i: input file" << std::endl; + std::cout << " -o: output file" << std::endl; + std::cout << " -c: maximum computing units count" << std::endl; + std::cout << " -n: maximum workers count" << std::endl; + std::cout << " -r: input data repeat count" << std::endl; + std::cout << " -v: enable verbose output" << std::endl; + std::cout << " -h: this help message" << std::endl; + } + + void print(void) { + std::cout << "Run parameters:" << std::endl; + std::cout << " xclbinFilename: " << xclbinFilename << std::endl; + std::cout << " batchSize: " << batchSize << std::endl; + std::cout << " sampleInputSize: " << sampleInputSize << std::endl; + std::cout << " sampleOutputSize: " << sampleOutputSize << std::endl; + std::cout << " numCU: " << numCU << std::endl; + std::cout << " inputFilename: " << inputFilename << std::endl; + std::cout << " outputFilename: " << outputFilename << std::endl; + std::cout << " numWorker: " << numWorker << std::endl; + std::cout << " dataRepeatCount: " << dataRepeatCount << std::endl; + } + + // Device + std::vector deviceBDFs; + + // Bitstream + std::string xclbinFilename = "./build_hw_rel/kernel_wrapper.xclbin"; + size_t batchSize = BATCHSIZE; + const size_t sampleInputSize = INSTREAMSIZE; + const size_t sampleOutputSize = OUTSTREAMSIZE; + size_t numCU = NUM_CU; + + // Data paths 
+ std::string inputFilename = "./tb_data/tb_input_features.dat"; + std::string referenceFilename = "tb_data/tb_output_predictions.dat"; + std::string outputFilename = "./tb_data/hw_results.dat"; + + // Workers + int numWorker = NUM_WORKER; + + // Benchmark + int dataRepeatCount = -1; + int verbose = 0; +}; diff --git a/hls4ml/templates/vitis_accelerator/libs/Types.hpp b/hls4ml/templates/vitis_accelerator/libs/Types.hpp index 1e8bf0b55e..0ff3bed610 100644 --- a/hls4ml/templates/vitis_accelerator/libs/Types.hpp +++ b/hls4ml/templates/vitis_accelerator/libs/Types.hpp @@ -2,13 +2,7 @@ #include -template -struct Batch { - const T* dataIn; - U* dataOut; -}; - -enum class FPGAType : uint8_t { - DDR = 0, - HBM = 1 +template struct Batch { + const T *dataIn; + U *dataOut; }; diff --git a/hls4ml/templates/vitis_accelerator/libs/Worker.hpp b/hls4ml/templates/vitis_accelerator/libs/Worker.hpp index ff1677d073..5174936f24 100644 --- a/hls4ml/templates/vitis_accelerator/libs/Worker.hpp +++ b/hls4ml/templates/vitis_accelerator/libs/Worker.hpp @@ -10,15 +10,6 @@ #include "Types.hpp" #include "xcl2.hpp" -// HBM Pseudo-channel(PC) requirements -#define MAX_HBM_PC_COUNT 32 -#define PC_NAME(n) n | XCL_MEM_TOPOLOGY -const int pc[MAX_HBM_PC_COUNT] = {PC_NAME(0), PC_NAME(1), PC_NAME(2), PC_NAME(3), PC_NAME(4), PC_NAME(5), PC_NAME(6), - PC_NAME(7), PC_NAME(8), PC_NAME(9), PC_NAME(10), PC_NAME(11), PC_NAME(12), PC_NAME(13), - PC_NAME(14), PC_NAME(15), PC_NAME(16), PC_NAME(17), PC_NAME(18), PC_NAME(19), PC_NAME(20), - PC_NAME(21), PC_NAME(22), PC_NAME(23), PC_NAME(24), PC_NAME(25), PC_NAME(26), PC_NAME(27), - PC_NAME(28), PC_NAME(29), PC_NAME(30), PC_NAME(31)}; - template class Worker { public: /** @@ -29,14 +20,9 @@ template class Worker { * \param commandQueue cl::CommandQueue that the worker will enqueue operations to * \param queueMutex Mutex protecting the CommandQueue (potentially shared with other workers) */ - Worker(int batchsize, int sampleInputSize, int sampleOutputSize, 
cl::CommandQueue& queue, - std::mutex& queueMutex) : - _batchsize(batchsize), - _sampleInputSize(sampleInputSize), - _sampleOutputSize(sampleOutputSize), - _queue(queue), - _queueMutex(queueMutex) - { + Worker(int deviceId, int workerId, int batchsize, int sampleInputSize, int sampleOutputSize, cl::CommandQueue &queue) + : _deviceId(deviceId), _workerId(workerId), _batchsize(batchsize), _sampleInputSize(sampleInputSize), + _sampleOutputSize(sampleOutputSize), _queue(queue), writeEvents(1), executionEvents(1) { memmap_in.resize(_batchsize * _sampleInputSize, T(0.0f)); memmap_out.resize(_batchsize * _sampleOutputSize, U(0.0f)); } @@ -46,97 +32,103 @@ template class Worker { * \param context cl::Context of the FPGA. * \param program cl:Program of the FPGA. * \param computeUnit The number of the physical compute unit this worker will use. - * \param workerId Worker's ID number. - * \param fpga Type of memory resource used by the FPGA. - * \param firstHBMChannel Start index of this Worker's memory channels this worker uses. Only for HBM. - * \param numHBMChannels Number of channels per port this worker uses. Only for HBM. 
*/ - void initialize( - cl::Context& context, - cl::Program& program, - int computeUnit, - int workerId, - FPGAType fpga, - int firstHBMChannel = 0, // Only used for if fpga == FPGAType::HBM - int numHBMChannels = 0 // Only used for if fpga == FPGAType::HBM - ) { - if (fpga == FPGAType::HBM) { - allocateHBMMemory(context, firstHBMChannel, numHBMChannels); - } else if (fpga == FPGAType::DDR) { - allocateDDRMemory(context); - } + void initialize(cl::Context &context, cl::Program &program, int computeUnit) { + cl_int err; + + // This is AMD's format for specifying the Compute Unit a kernel object uses + std::string krnl_name = "kernel_wrapper:{kernel_wrapper_" + std::to_string(computeUnit) + "}"; // Creating Kernel object - std::string krnl_name = - "kernel_wrapper:{kernel_wrapper_" + std::to_string(computeUnit) + - "}"; // This is Xilinx's format for specifying the Compute Unit a kernel object uses - krnl = cl::Kernel(program, krnl_name.c_str(), &err); - krnl.setArg(0, input_buffer); - krnl.setArg(1, output_buffer); - - std::cout << "Initialized Worker " << workerId - << ", which will use Compute Unit " << computeUnit << std::endl; + OCL_CHECK(err, krnl = cl::Kernel(program, krnl_name.c_str(), &err)); + + // Per AMD documentation we can leave XRT infer the bank location for the buffer: + // " The XRT can obtain the bank location for the buffer if the buffer + // is used for setting the kernel arguments right after the buffer + // creation, i.e. before any enqueue operation on the buffer." 
+ + const size_t vector_in_size_bytes = sizeof(T) * _batchsize * _sampleInputSize; + const size_t vector_out_size_bytes = sizeof(U) * _batchsize * _sampleOutputSize; + + OCL_CHECK(err, input_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, vector_in_size_bytes, + memmap_in.data(), &err)); + + OCL_CHECK(err, output_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, vector_out_size_bytes, + memmap_out.data(), &err)); + + // Set kernel arguments will effectively affect the memory bank location + OCL_CHECK(err, err = krnl.setArg(0, input_buffer)); + OCL_CHECK(err, err = krnl.setArg(1, output_buffer)); + + // Perform a dummy transfer input batch to FPGA to ensure that allocation time is not counted + // in the evaluation time. Also allows us to query the memory bank location. + int mem_bank_index = -1; + OCL_CHECK(err, err = _queue.enqueueMigrateMemObjects({input_buffer}, + 0, // 0 means from host + NULL, // No dependencies + &writeEvents[0])); + OCL_CHECK(err, err = writeEvents[0].wait()); + OCL_CHECK(err, err = clGetMemObjectInfo(input_buffer.get(), CL_MEM_BANK, sizeof(int), &mem_bank_index, nullptr)); + + std::cout << "Initialized Worker " << _workerId << ", using CU " << computeUnit << " and memory bank " + << mem_bank_index << " on device " << _deviceId << std::endl; } /** * \brief Evaluates the single batch currently in input_buffer and writes to output_buffer. 
*/ void evaluate() { - std::lock_guard lock(_queueMutex); - // Transfer inputs + + cl_int err; + + // Transfer input batch to FPGA OCL_CHECK(err, err = _queue.enqueueMigrateMemObjects({input_buffer}, - 0, // 0 means from host + 0, // 0 means from host NULL, // No dependencies - &write_event)); - // Execute program - writeCompleteEvents.push_back(write_event); - OCL_CHECK(err, err = _queue.enqueueNDRangeKernel(krnl, - 0, - 1, - 1, - &writeCompleteEvents, - &kernExe_event)); - writeCompleteEvents.pop_back(); - // Transfer outputs - kernExeCompleteEvents.push_back(kernExe_event); - OCL_CHECK(err, err = _queue.enqueueMigrateMemObjects({output_buffer}, - CL_MIGRATE_MEM_OBJECT_HOST, - &kernExeCompleteEvents, - &read_event)); - kernExeCompleteEvents.pop_back(); - OCL_CHECK(err, err = read_event.wait()); + &writeEvents[0])); + + // Run kernel on the batch + OCL_CHECK(err, err = _queue.enqueueNDRangeKernel(krnl, 0, 1, 1, &writeEvents, &executionEvents[0])); + + // Transfer output batch from FPGA + OCL_CHECK(err, err = _queue.enqueueMigrateMemObjects({output_buffer}, CL_MIGRATE_MEM_OBJECT_HOST, &executionEvents, + &readEvent)); + + // Wait for all operations to complete + OCL_CHECK(err, err = readEvent.wait()); } /** * \brief Evaluates each batch of data provided via dataTracker. Uses float datatype * \param dataTracker Vector of input locations to read from and output locations to write to */ - void evalLoop(std::list>& dataTracker) { + void evalLoop(std::list> &dataTracker) { + while (!dataTracker.empty()) { // Copy inputs into memory-mapped buffer - const T* dataLoc = dataTracker.front().dataIn; + // FIXME: It there a way to avoid this copy? Could the orignal batch be used directly if aligned? 
+ const T *dataLoc = dataTracker.front().dataIn; memcpy(&memmap_in[0], dataLoc, _batchsize * _sampleInputSize * sizeof(T)); // Evaluate evaluate(); // Copy outputs into persistent results vector - U* resLoc = dataTracker.front().dataOut; + U *resLoc = dataTracker.front().dataOut; memcpy(resLoc, &memmap_out[0], _batchsize * _sampleOutputSize * sizeof(U)); - dataTracker.pop_front(); } } private: + int _deviceId; + int _workerId; int _batchsize; int _sampleInputSize; int _sampleOutputSize; /// @brief Reference to the OpenCL command queue - const cl::CommandQueue& _queue; - /// @brief Mutex for thread-safe access to the command queue - std::mutex& _queueMutex; + const cl::CommandQueue &_queue; /// @brief Vector mapped to FPGA input buffer std::vector> memmap_in; @@ -150,60 +142,10 @@ template class Worker { /// @brief OpenCL kernel object cl::Kernel krnl; - /// @brief Pointer mapping host input buffer to FPGA memory pseudo-channels (HBM only) - cl_mem_ext_ptr_t hbm_in_ptr; - /// @brief Pointer mapping host output buffer to FPGA memory pseudo-channels (HBM only) - cl_mem_ext_ptr_t hbm_out_ptr; - - /// @brief Event for signaling input transfer completion - cl::Event write_event; - /// @brief Event for signaling kernel execution completion - cl::Event kernExe_event; - /// @brief Event for signaling output transfer completion - cl::Event read_event; /// @brief Vector tracking write events. Required by OpenCL queue functions. - std::vector writeCompleteEvents; + std::vector writeEvents; /// @brief Vector tracking kernel execution events. Required by OpenCL queue functions. 
- std::vector kernExeCompleteEvents; - - /// @brief Error code storage - cl_int err; - bool firstLoop = true; - - void allocateHBMMemory(cl::Context& context, int firstHBMChannel, int numHBMChannels) { - // Create Pointer objects for the in/out ports for each worker - // Assigning Pointers to specific HBM PC's using cl_mem_ext_ptr_t type and corresponding PC flags - hbm_in_ptr.obj = memmap_in.data(); - hbm_in_ptr.param = 0; - int in_flags = 0; - for (int i = 0; i < numHBMChannels; i++) { - in_flags |= pc[firstHBMChannel + i]; - } - hbm_in_ptr.flags = in_flags; - - hbm_out_ptr.obj = memmap_out.data(); - hbm_out_ptr.param = 0; - int out_flags = 0; - for (int i = 0; i < numHBMChannels; i++) { - out_flags |= pc[firstHBMChannel + numHBMChannels + i]; - } - hbm_out_ptr.flags = out_flags; - - // Creating Buffer objects in Host memory - uint64_t vector_in_size_bytes = sizeof(T) * _batchsize * _sampleInputSize; - uint64_t vector_out_size_bytes = sizeof(U) * _batchsize * _sampleOutputSize; - input_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_EXT_PTR_XILINX | CL_MEM_READ_ONLY, - vector_in_size_bytes, &hbm_in_ptr); - output_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_EXT_PTR_XILINX | CL_MEM_WRITE_ONLY, - vector_out_size_bytes, &hbm_out_ptr); - } - - void allocateDDRMemory(cl::Context& context) { - uint64_t vector_in_size_bytes = sizeof(T) * _batchsize * _sampleInputSize; - uint64_t vector_out_size_bytes = sizeof(U) * _batchsize * _sampleOutputSize; - input_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_READ_ONLY, vector_in_size_bytes, - memmap_in.data()); - output_buffer = cl::Buffer(context, CL_MEM_USE_HOST_PTR | CL_MEM_WRITE_ONLY, vector_out_size_bytes, - memmap_out.data()); - } + std::vector executionEvents; + /// @brief Event for signaling output transfer completion + cl::Event readEvent; }; diff --git a/hls4ml/templates/vitis_accelerator/libs/xcl2.cpp b/hls4ml/templates/vitis_accelerator/libs/xcl2.cpp index f2480567ae..6e03deb793 
100644 --- a/hls4ml/templates/vitis_accelerator/libs/xcl2.cpp +++ b/hls4ml/templates/vitis_accelerator/libs/xcl2.cpp @@ -37,8 +37,6 @@ std::vector get_devices(const std::string &vendor_name) { platform = platforms[i]; OCL_CHECK(err, std::string platformName = platform.getInfo(&err)); if (!(platformName.compare(vendor_name))) { - std::cout << "Found Platform" << std::endl; - std::cout << "Platform Name: " << platformName.c_str() << std::endl; break; } } @@ -74,7 +72,7 @@ cl::Device find_device_bdf(const std::vector &devices, const std::st } } if (cnt == 0) { - std::cout << "Invalid device bdf. Please check and provide valid bdf\n"; + std::cout << "Invalid device bdf: " << bdf << ". Please check and provide valid bdf\n"; exit(EXIT_FAILURE); } return device; @@ -103,14 +101,12 @@ cl_device_id find_device_bdf_c(cl_device_id *devices, const std::string &bdf, cl return device; } std::vector read_binary_file(const std::string &xclbin_file_name) { - std::cout << "INFO: Reading " << xclbin_file_name << std::endl; FILE *fp; if ((fp = fopen(xclbin_file_name.c_str(), "r")) == nullptr) { printf("ERROR: %s xclbin not available please build\n", xclbin_file_name.c_str()); exit(EXIT_FAILURE); } // Loading XCL Bin into char buffer - std::cout << "Loading: '" << xclbin_file_name.c_str() << "'\n"; std::ifstream bin_file(xclbin_file_name.c_str(), std::ifstream::binary); bin_file.seekg(0, bin_file.end); auto nb = bin_file.tellg(); diff --git a/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp b/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp index 0fd48a4bec..daad501fa9 100644 --- a/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp +++ b/hls4ml/templates/vitis_accelerator/myproject_host_cl.cpp @@ -1,42 +1,26 @@ #include #include "FpgaObj.hpp" +#include "Params.hpp" #include "Types.hpp" #include "kernel_wrapper.h" #include "xcl2.hpp" int main(int argc, char **argv) { - if (argc < 2) { - std::cout << "Usage: " << argv[0] - << " " - << " [Profiling: Data repeat 
count]" << std::endl; - return EXIT_FAILURE; - } - std::string xclbinFilename = argv[1]; - int dataRepeatCount = -1; - if (argc == 3) { - dataRepeatCount = std::stoi(argv[2]); - } - - FpgaObj fpga(BATCHSIZE, - INSTREAMSIZE, - OUTSTREAMSIZE, - NUM_CU, - xclbinFilename); - - fpga.createWorkers(NUM_WORKER, - FPGAType::/*FPGA_Type*/, - NUM_CHANNEL); - - if (dataRepeatCount == -1) { - fpga.loadData("../tb_data/tb_input_features.dat"); - } else { - fpga.loadData("../tb_data/tb_input_features.dat", true, dataRepeatCount); - } + + Params params(argc, argv); + + FpgaObj fpga(params); + + fpga.createWorkers(params.numWorker); + + fpga.loadData(params.inputFilename, params.dataRepeatCount); fpga.evaluateAll(); - fpga.saveResults("../tb_data/hw_results.dat"); + fpga.checkResults(params.referenceFilename); + + fpga.saveResults(params.outputFilename); return EXIT_SUCCESS; }