From 9a4449b43c44887f10736da63d2ef0c3f9bf0bcd Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 15 Mar 2021 13:19:01 +0100 Subject: [PATCH 1/5] Add b_eff build config --- b_eff/configs/Bittware_520N.cmake | 17 +++++++++++++++++ b_eff/scripts/build_520n.sh | 28 ++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 b_eff/configs/Bittware_520N.cmake create mode 100644 b_eff/scripts/build_520n.sh diff --git a/b_eff/configs/Bittware_520N.cmake b/b_eff/configs/Bittware_520N.cmake new file mode 100644 index 00000000..1769c4d5 --- /dev/null +++ b/b_eff/configs/Bittware_520N.cmake @@ -0,0 +1,17 @@ +# This file contains the default configuration for the Nallatech 520N board +# for the use with single precision floating point values. +# To use this configuration file, call cmake with the parameter +# +# cmake [...] -DHPCC_FPGA_CONFIG="path to this file" +# + + +set(USE_MPI Yes CACHE BOOL "" FORCE) +set(USE_SVM No CACHE BOOL "" FORCE) +set(USE_HBM No CACHE BOOL "" FORCE) +set(FPGA_BOARD_NAME "p520_max_sg280l" CACHE STRING "" FORCE) +set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE) + +# GEMM specific options +set(CHANNEL_WIDTH 32 CACHE STRING "Width of a single external channel in Byte" FORCE) +set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications" FORCE) \ No newline at end of file diff --git a/b_eff/scripts/build_520n.sh b/b_eff/scripts/build_520n.sh new file mode 100644 index 00000000..6c629445 --- /dev/null +++ b/b_eff/scripts/build_520n.sh @@ -0,0 +1,28 @@ +#!/bin/bash +# +# Synthesize the b_eff kernel for the Nallaatech 520N board. +# This is an example script, how the synthesis can be started on Noctua using a HPCC FPGA configuration file. +# Submit this script to sbatch in this folder! +# +#SBATCH -p fpgasyn +#SBATCH -J b_eff + +module load intelFPGA_pro/20.4.0 +module load nalla_pcie/19.4.0_hpc +module load intel +module load devel/CMake/3.15.3-GCCcore-8.3.0 + +SCRIPT_PATH=${SLURM_SUBMIT_DIR} + +BENCHMARK_DIR=${SCRIPT_PATH}/../ + +SYNTH_DIR=${PFS_SCRATCH}/synth/520n/multi_fpga/b_eff + + +mkdir -p ${SYNTH_DIR} +cd ${SYNTH_DIR} + +cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/Bittware_520N.cmake + +make communication_bw520n_intel Network_intel + From 0a05d09455aec1cb13fd0452f37c103aeb604463 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Mon, 29 Mar 2021 12:04:35 +0200 Subject: [PATCH 2/5] Use NUM_REPLICATIONS in host --- b_eff/CMakeLists.txt | 6 +++--- b_eff/src/common/parameters.h.in | 1 + b_eff/src/host/execution_default.cpp | 27 +++++++++++++++++++++++---- 3 files changed, 27 insertions(+), 7 deletions(-) diff --git a/b_eff/CMakeLists.txt b/b_eff/CMakeLists.txt index 4e514252..ecdd99ac 100755 --- a/b_eff/CMakeLists.txt +++ b/b_eff/CMakeLists.txt @@ -9,6 +9,7 @@ set(DEFAULT_MAX_LOOP_LENGTH 65536 CACHE STRING "Maximum number of repetitions do set(DEFAULT_MIN_LOOP_LENGTH 256 CACHE STRING "Minimum number of repetitions done for a single message size")# set(DEFAULT_LOOP_LENGTH_OFFSET 11 CACHE STRING "Offset that is used for the message sizes before reducing the number of repetitions") set(DEFAULT_LOOP_LENGTH_DECREASE 7 CACHE STRING "Number of steps that are used to decrease the number of repetitions to its minimum") +set(NUM_REPLICATIONS 2 CACHE STRING "") set(USE_MPI Yes) set(USE_DEPRECATED_HPP_HEADER No) @@ -16,9 +17,8 @@ set(USE_DEPRECATED_HPP_HEADER No) set(DATA_TYPE char) include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake) unset(DATA_TYPE CACHE) -if (NOT MPI_FOUND) - message(ERROR "Benchmarks host code requires MPI, but MPI could not be found.") -endif() +find_package(MPI REQUIRED) + if (NOT INTELFPGAOPENCL_FOUND) message(ERROR "Benchmark does only support the Intel OpenCL SDK") endif() diff --git a/b_eff/src/common/parameters.h.in b/b_eff/src/common/parameters.h.in index 7bd133cf..b2114e38 100644 --- a/b_eff/src/common/parameters.h.in +++ b/b_eff/src/common/parameters.h.in @@ -17,6 +17,7 @@ * Kernel Parameters */ #define CHANNEL_WIDTH @CHANNEL_WIDTH@ +#define NUM_REPLICATIONS @NUM_REPLICATIONS@ #define HOST_DATA_TYPE @HOST_DATA_TYPE@ #define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@ diff --git a/b_eff/src/host/execution_default.cpp b/b_eff/src/host/execution_default.cpp index 3e115e04..1c5cd908 100644 --- a/b_eff/src/host/execution_default.cpp +++ b/b_eff/src/host/execution_default.cpp @@ -52,7 +52,7 @@ namespace bm_execution { std::vector validationBuffers; // Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels - for (int r = 0; r < 2; r++) { + for (int r = 0; r < config.programSettings->kernelReplications; r++) { validationBuffers.push_back(cl::Buffer(*config.context, CL_MEM_WRITE_ONLY, sizeof(HOST_DATA_TYPE) * validationData.size(),0,&err)); ASSERT_CL(err) @@ -90,23 +90,42 @@ namespace bm_execution { for (uint r =0; r < config.programSettings->numRepetitions; r++) { MPI_Barrier(MPI_COMM_WORLD); auto startCalculation = std::chrono::high_resolution_clock::now(); - for (int i = 0; i < 2; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { sendQueues[i].enqueueNDRangeKernel(sendKernels[i], cl::NullRange, cl::NDRange(1)); recvQueues[i].enqueueNDRangeKernel(recvKernels[i], cl::NullRange, cl::NDRange(1)); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl; + #endif } - for (int i = 0; i < 2; i++) { + for (int i = 0; i < config.programSettings->kernelReplications; i++) { sendQueues[i].finish(); + #ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Send done " << r << "," << i << std::endl; + #endif recvQueues[i].finish(); + #ifndef NDEBUG + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Recv done " << r << "," << i << std::endl; + #endif } auto endCalculation = std::chrono::high_resolution_clock::now(); std::chrono::duration calculationTime = std::chrono::duration_cast> (endCalculation - startCalculation); calculationTimings.push_back(calculationTime.count()); +#ifndef NDEBUG + int current_rank; + MPI_Comm_rank(MPI_COMM_WORLD, & current_rank); + std::cout << "Rank " << current_rank << ": Done " << r << std::endl; +#endif } // Read validation data from FPGA will be placed sequentially in buffer for all replications // The data order should not matter, because every byte should have the same value! - for (int r = 0; r < 2; r++) { + for (int r = 0; r < config.programSettings->kernelReplications; r++) { err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / 2, &validationData.data()[r * validationData.size() / 2]); ASSERT_CL(err); } From 986ff3e1729cc34f39ee883602f9efee44880818 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 8 Apr 2021 12:47:57 +0200 Subject: [PATCH 3/5] Change parsing script for fixed PTRANS output --- scripts/evaluation/parse_raw_to_csv.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/evaluation/parse_raw_to_csv.py b/scripts/evaluation/parse_raw_to_csv.py index 1a5ef7d2..5b57fd57 100755 --- a/scripts/evaluation/parse_raw_to_csv.py +++ b/scripts/evaluation/parse_raw_to_csv.py @@ -12,7 +12,7 @@ fft_regex = "Version:\\s+(?P.+)\n(.*\n)+Batch\\sSize\\s+(?P\d+)\nFFT\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" gemm_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P.+)\\s+(?P.+)\\s+(?P.+)" ra_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+Error:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" -trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+calc\\s+calc\\s+FLOPS\\s+Net\\s+\\[GB/s\\]\\s+Mem\\s+\\[GB/s\\]\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" +trans_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+calc\\s+calc\\s+FLOPS\\s+Net\\s+\\[B/s\\]\\s+Mem\\s+\\[B/s\\]\n\\s*avg:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" stream_regex = "Version:\\s+(?P.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P.+)\n(.*\n)+Kernel\\sReplications\\s+(?P\d+)(.*\n)+Kernel\\sType\\s+(?P.+)\n(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" linpack_regex = "Version:\\s+(?P.+)\n(.*\n)+Matrix\\sSize\\s+(?P\d+)(.*\n)+Device\\s+(?P.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P((\d|\.|\+|-|e)+|nan))\\s+(?P(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GEFA\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)(\\s*\n)\\s+GESL\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)\\s+(?P(\d|\.|\+|-|e)+)" From 336539a752cec42313288276094a38b7609aa12e Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 8 Apr 2021 14:38:27 +0200 Subject: [PATCH 4/5] Remove debug messages to pass unit tests --- PTRANS/src/host/transpose_handlers.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/PTRANS/src/host/transpose_handlers.cpp b/PTRANS/src/host/transpose_handlers.cpp index 3ab76dcf..e3b18c39 100644 --- a/PTRANS/src/host/transpose_handlers.cpp +++ b/PTRANS/src/host/transpose_handlers.cpp @@ -110,7 +110,7 @@ std::unique_ptr transpose::DistributedDiagonalTranspos void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose::TransposeData& data) { #ifndef NDEBUG - std::cout << "Start data exchange " << mpi_comm_rank << std::endl; + // std::cout << "Start data exchange " << mpi_comm_rank << std::endl; #endif // Only need to exchange data, if rank has a partner if (mpi_comm_rank < mpi_comm_size - num_diagonal_ranks) { @@ -132,7 +132,7 @@ void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose: while (remaining_data_size > 0) { int next_chunk = (remaining_data_size > std::numeric_limits::max()) ? std::numeric_limits::max(): remaining_data_size; #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << " " << next_chunk << " to " << pair_rank << std::endl; + // std::cout << "Rank " << mpi_comm_rank << " " << next_chunk << " to " << pair_rank << std::endl; #endif if (pair_rank > mpi_comm_rank) { MPI_Send(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD); @@ -148,14 +148,14 @@ void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose: } // MPI_Sendrecv_replace(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, pair_rank, 0, MPI_COMM_WORLD, &status); #ifndef NDEBUG - std::cout << "Rank " << mpi_comm_rank << " Done!"<< std::endl; + // std::cout << "Rank " << mpi_comm_rank << " Done!"<< std::endl; #endif remaining_data_size -= next_chunk; offset += next_chunk; } } #ifndef NDEBUG - std::cout << "End data exchange " << mpi_comm_rank << std::endl; + // std::cout << "End data exchange " << mpi_comm_rank << std::endl; #endif } From 5c206e77f994b4b588e1b658aba634bf14b08fa8 Mon Sep 17 00:00:00 2001 From: Marius Meyer Date: Thu, 8 Apr 2021 16:31:39 +0200 Subject: [PATCH 5/5] Use NUM_REPLICATIONS if available for default --- shared/include/hpcc_benchmark.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/shared/include/hpcc_benchmark.hpp b/shared/include/hpcc_benchmark.hpp index c1912130..1d3bd92b 100644 --- a/shared/include/hpcc_benchmark.hpp +++ b/shared/include/hpcc_benchmark.hpp @@ -130,7 +130,11 @@ class BaseSettings { defaultPlatform(results["platform"].as()), defaultDevice(results["device"].as()), kernelFileName(results["f"].as()), +#ifdef NUM_REPLICATIONS + kernelReplications(results.count("r") > 0 ? results["r"].as() : NUM_REPLICATIONS), +#else kernelReplications(results.count("r") > 0 ? results["r"].as() : 1), +#endif testOnly(static_cast(results.count("test"))) {} /**