Skip to content

Commit

Permalink
Merge branch 'beff_synth' into 'master'
Browse files Browse the repository at this point in the history
Beff synth

See merge request pc2/HPCC_FPGA!44
  • Loading branch information
Mellich committed Apr 9, 2021
2 parents af5b916 + 5c206e7 commit 60651eb
Show file tree
Hide file tree
Showing 8 changed files with 81 additions and 12 deletions.
8 changes: 4 additions & 4 deletions PTRANS/src/host/transpose_handlers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ std::unique_ptr<transpose::TransposeData> transpose::DistributedDiagonalTranspos

void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose::TransposeData& data) {
#ifndef NDEBUG
std::cout << "Start data exchange " << mpi_comm_rank << std::endl;
// std::cout << "Start data exchange " << mpi_comm_rank << std::endl;
#endif
// Only need to exchange data, if rank has a partner
if (mpi_comm_rank < mpi_comm_size - num_diagonal_ranks) {
Expand All @@ -132,7 +132,7 @@ void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose:
while (remaining_data_size > 0) {
int next_chunk = (remaining_data_size > std::numeric_limits<int>::max()) ? std::numeric_limits<int>::max(): remaining_data_size;
#ifndef NDEBUG
std::cout << "Rank " << mpi_comm_rank << " " << next_chunk << " to " << pair_rank << std::endl;
// std::cout << "Rank " << mpi_comm_rank << " " << next_chunk << " to " << pair_rank << std::endl;
#endif
if (pair_rank > mpi_comm_rank) {
MPI_Send(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, MPI_COMM_WORLD);
Expand All @@ -148,14 +148,14 @@ void transpose::DistributedDiagonalTransposeDataHandler::exchangeData(transpose:
}
// MPI_Sendrecv_replace(&data.A[offset], next_chunk, MPI_FLOAT, pair_rank, 0, pair_rank, 0, MPI_COMM_WORLD, &status);
#ifndef NDEBUG
std::cout << "Rank " << mpi_comm_rank << " Done!"<< std::endl;
// std::cout << "Rank " << mpi_comm_rank << " Done!"<< std::endl;
#endif
remaining_data_size -= next_chunk;
offset += next_chunk;
}
}
#ifndef NDEBUG
std::cout << "End data exchange " << mpi_comm_rank << std::endl;
// std::cout << "End data exchange " << mpi_comm_rank << std::endl;
#endif
}

Expand Down
6 changes: 3 additions & 3 deletions b_eff/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,16 @@ set(DEFAULT_MAX_LOOP_LENGTH 65536 CACHE STRING "Maximum number of repetitions do
set(DEFAULT_MIN_LOOP_LENGTH 256 CACHE STRING "Minimum number of repetitions done for a single message size")#
set(DEFAULT_LOOP_LENGTH_OFFSET 11 CACHE STRING "Offset that is used for the message sizes before reducing the number of repetitions")
set(DEFAULT_LOOP_LENGTH_DECREASE 7 CACHE STRING "Number of steps that are used to decrease the number of repetitions to its minimum")
set(NUM_REPLICATIONS 2 CACHE STRING "")

set(USE_MPI Yes)
set(USE_DEPRECATED_HPP_HEADER No)

set(DATA_TYPE char)
include(${CMAKE_SOURCE_DIR}/../cmake/general_benchmark_build_setup.cmake)
unset(DATA_TYPE CACHE)
if (NOT MPI_FOUND)
message(ERROR "Benchmarks host code requires MPI, but MPI could not be found.")
endif()
find_package(MPI REQUIRED)

if (NOT INTELFPGAOPENCL_FOUND)
message(ERROR "Benchmark does only support the Intel OpenCL SDK")
endif()
17 changes: 17 additions & 0 deletions b_eff/configs/Bittware_520N.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# This file contains the default configuration for the Nallatech 520N board
# for the use with single precision floating point values.
# To use this configuration file, call cmake with the parameter
#
# cmake [...] -DHPCC_FPGA_CONFIG="path to this file"
#


set(USE_MPI Yes CACHE BOOL "" FORCE)
set(USE_SVM No CACHE BOOL "" FORCE)
set(USE_HBM No CACHE BOOL "" FORCE)
set(FPGA_BOARD_NAME "p520_max_sg280l" CACHE STRING "" FORCE)
set(AOC_FLAGS "-fpc -fp-relaxed -seed=7" CACHE STRING "" FORCE)

# GEMM specific options
set(CHANNEL_WIDTH 32 CACHE STRING "Width of a single external channel in Byte" FORCE)
set(NUM_REPLICATIONS 2 CACHE STRING "Number of kernel replications" FORCE)
28 changes: 28 additions & 0 deletions b_eff/scripts/build_520n.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
#!/bin/bash
#
# Synthesize the b_eff kernel for the Nallaatech 520N board.
# This is an example script, how the synthesis can be started on Noctua using a HPCC FPGA configuration file.
# Submit this script to sbatch in this folder!
#
#SBATCH -p fpgasyn
#SBATCH -J b_eff

module load intelFPGA_pro/20.4.0
module load nalla_pcie/19.4.0_hpc
module load intel
module load devel/CMake/3.15.3-GCCcore-8.3.0

SCRIPT_PATH=${SLURM_SUBMIT_DIR}

BENCHMARK_DIR=${SCRIPT_PATH}/../

SYNTH_DIR=${PFS_SCRATCH}/synth/520n/multi_fpga/b_eff


mkdir -p ${SYNTH_DIR}
cd ${SYNTH_DIR}

cmake ${BENCHMARK_DIR} -DCMAKE_BUILD_TYPE=Release -DHPCC_FPGA_CONFIG=${BENCHMARK_DIR}/configs/Bittware_520N.cmake

make communication_bw520n_intel Network_intel

1 change: 1 addition & 0 deletions b_eff/src/common/parameters.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
* Kernel Parameters
*/
#define CHANNEL_WIDTH @CHANNEL_WIDTH@
#define NUM_REPLICATIONS @NUM_REPLICATIONS@

#define HOST_DATA_TYPE @HOST_DATA_TYPE@
#define DEVICE_DATA_TYPE @DEVICE_DATA_TYPE@
Expand Down
27 changes: 23 additions & 4 deletions b_eff/src/host/execution_default.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ namespace bm_execution {
std::vector<cl::Buffer> validationBuffers;

// Create all kernels and buffers. The kernel pairs are generated twice to utilize all channels
for (int r = 0; r < 2; r++) {
for (int r = 0; r < config.programSettings->kernelReplications; r++) {

validationBuffers.push_back(cl::Buffer(*config.context, CL_MEM_WRITE_ONLY, sizeof(HOST_DATA_TYPE) * validationData.size(),0,&err));
ASSERT_CL(err)
Expand Down Expand Up @@ -90,23 +90,42 @@ namespace bm_execution {
for (uint r =0; r < config.programSettings->numRepetitions; r++) {
MPI_Barrier(MPI_COMM_WORLD);
auto startCalculation = std::chrono::high_resolution_clock::now();
for (int i = 0; i < 2; i++) {
for (int i = 0; i < config.programSettings->kernelReplications; i++) {
sendQueues[i].enqueueNDRangeKernel(sendKernels[i], cl::NullRange, cl::NDRange(1));
recvQueues[i].enqueueNDRangeKernel(recvKernels[i], cl::NullRange, cl::NDRange(1));
#ifndef NDEBUG
int current_rank;
MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
std::cout << "Rank " << current_rank << ": Enqueued " << r << "," << i << std::endl;
#endif
}
for (int i = 0; i < 2; i++) {
for (int i = 0; i < config.programSettings->kernelReplications; i++) {
sendQueues[i].finish();
#ifndef NDEBUG
int current_rank;
MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
std::cout << "Rank " << current_rank << ": Send done " << r << "," << i << std::endl;
#endif
recvQueues[i].finish();
#ifndef NDEBUG
MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
std::cout << "Rank " << current_rank << ": Recv done " << r << "," << i << std::endl;
#endif
}
auto endCalculation = std::chrono::high_resolution_clock::now();
std::chrono::duration<double> calculationTime =
std::chrono::duration_cast<std::chrono::duration<double>>
(endCalculation - startCalculation);
calculationTimings.push_back(calculationTime.count());
#ifndef NDEBUG
int current_rank;
MPI_Comm_rank(MPI_COMM_WORLD, & current_rank);
std::cout << "Rank " << current_rank << ": Done " << r << std::endl;
#endif
}
// Read validation data from FPGA will be placed sequentially in buffer for all replications
// The data order should not matter, because every byte should have the same value!
for (int r = 0; r < 2; r++) {
for (int r = 0; r < config.programSettings->kernelReplications; r++) {
err = recvQueues[r].enqueueReadBuffer(validationBuffers[r], CL_TRUE, 0, sizeof(HOST_DATA_TYPE) * validationData.size() / 2, &validationData.data()[r * validationData.size() / 2]);
ASSERT_CL(err);
}
Expand Down
2 changes: 1 addition & 1 deletion scripts/evaluation/parse_raw_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
fft_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Batch\\sSize\\s+(?P<batch_size>\d+)\nFFT\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+res\.\\serror\\s+mach\.\\seps\n\\s+(?P<error>(\d|\.|\+|-|e)+)\\s+(?P<epsilon>(\d|\.|\+|-|e)+)(.*\n)+\\s+avg\\s+best\n\\s+Time\\s+in\\s+s:\\s+(?P<avg_time>(\d|\.|\+|-|e)+)\\s+(?P<best_time>(\d|\.|\+|-|e)+)\n\\s+GFLOPS:\\s+(?P<avg_flops>(\d|\.|\+|-|e)+)\\s+(?P<best_flops>(\d|\.|\+|-|e)+)"
gemm_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep\n\\s+(?P<error>(\d|\.|\+|-|e)+)\\s+(?P<resid>(\d|\.|\+|-|e)+)\\s+(?P<epsilon>(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GFLOPS\n\\s+(?P<best_time>.+)\\s+(?P<avg_time>.+)\\s+(?P<gflops>.+)"
ra_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Array\\sSize\\s+(?P<size>(\d|\.|\+|-|e)+)(.*\n)+Kernel\\sReplications\\s+(?P<replications>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+Error:\\s+(?P<error>(\d|\.|\+|-|e)+)(.*\n)+\\s+best\\s+mean\\s+GUOPS\n\\s+(?P<best_time>(\d|\.|\+|-|e)+)\\s+(?P<avg_time>(\d|\.|\+|-|e)+)\\s+(?P<gops>(\d|\.|\+|-|e)+)"
trans_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P<error>(\d|\.|\+|-|e)+)(.*\n)+\\s+calc\\s+calc\\s+FLOPS\\s+Net\\s+\\[GB/s\\]\\s+Mem\\s+\\[GB/s\\]\n\\s*avg:\\s+(?P<avg_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<avg_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<avg_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<avg_mem_bw>(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P<best_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<best_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<best_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<best_mem_bw>(\d|\.|\+|-|e)+)"
trans_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s*Maximum\\serror:\\s+(?P<error>(\d|\.|\+|-|e)+)(.*\n)+\\s+calc\\s+calc\\s+FLOPS\\s+Net\\s+\\[B/s\\]\\s+Mem\\s+\\[B/s\\]\n\\s*avg:\\s+(?P<avg_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<avg_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<avg_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<avg_mem_bw>(\d|\.|\+|-|e)+)\n\\s*best:\\s+(?P<best_calc_time>(\d|\.|\+|-|e)+)\\s+(?P<best_calc_flops>(\d|\.|\+|-|e)+)\\s+(?P<best_net_bw>(\d|\.|\+|-|e)+)\\s+(?P<best_mem_bw>(\d|\.|\+|-|e)+)"
stream_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Array\\sSize\\s+\\d+\\s+\\((?P<size>(\d|\.|\+|-|e)+)(.*\n)+Data\\sType\\s+(?P<data_type>.+)\n(.*\n)+Kernel\\sReplications\\s+(?P<replications>\d+)(.*\n)+Kernel\\sType\\s+(?P<type>.+)\n(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+Function\\s+Best\\sRate\\sMB/s\\s+Avg\\stime\\ss\\s+Min\\stime\\s+Max\\stime\n\\s+Add\\s+(?P<add_rate>(\d|\.|\+|-|e)+)\\s+(?P<add_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<add_min_time>(\d|\.|\+|-|e)+)\\s+(?P<add_max_time>(\d|\.|\+|-|e)+)\n\\s+Copy\\s+(?P<copy_rate>(\d|\.|\+|-|e)+)\\s+(?P<copy_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<copy_min_time>(\d|\.|\+|-|e)+)\\s+(?P<copy_max_time>(\d|\.|\+|-|e)+)\n\\s+PCI\\sread\\s+(?P<pcir_rate>(\d|\.|\+|-|e)+)\\s+(?P<pcir_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<pcir_min_time>(\d|\.|\+|-|e)+)\\s+(?P<pcir_max_time>(\d|\.|\+|-|e)+)\n\\s+PCI\\swrite\\s+(?P<pciw_rate>(\d|\.|\+|-|e)+)\\s+(?P<pciw_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<pciw_min_time>(\d|\.|\+|-|e)+)\\s+(?P<pciw_max_time>(\d|\.|\+|-|e)+)\n\\s+Scale\\s+(?P<scale_rate>(\d|\.|\+|-|e)+)\\s+(?P<scale_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<scale_min_time>(\d|\.|\+|-|e)+)\\s+(?P<scale_max_time>(\d|\.|\+|-|e)+)\n\\s+Triad\\s+(?P<triad_rate>(\d|\.|\+|-|e)+)\\s+(?P<triad_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<triad_min_time>(\d|\.|\+|-|e)+)\\s+(?P<triad_max_time>(\d|\.|\+|-|e)+)"
linpack_regex = "Version:\\s+(?P<version>.+)\n(.*\n)+Matrix\\sSize\\s+(?P<size>\d+)(.*\n)+Device\\s+(?P<device>.+)\n(.*\n)+\\s+norm\.\\sresid\\s+resid\\s+machep.+\n\\s+(?P<error>((\d|\.|\+|-|e)+|nan))\\s+(?P<resid>((\d|\.|\+|-|e)+|nan))\\s+(?P<epsilon>(\d|\.|\+|-|e)+)(.*\n)+\\s+Method\\s+\\s+best\\s+mean\\s+GFLOPS(\\s*\n)\\s+total\\s+(?P<total_best_time>(\d|\.|\+|-|e)+)\\s+(?P<total_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<total_gflops>(\d|\.|\+|-|e)+)(\\s*\n)\\s+GEFA\\s+(?P<lu_best_time>(\d|\.|\+|-|e)+)\\s+(?P<lu_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<lu_gflops>(\d|\.|\+|-|e)+)(\\s*\n)\\s+GESL\\s+(?P<sl_best_time>(\d|\.|\+|-|e)+)\\s+(?P<sl_avg_time>(\d|\.|\+|-|e)+)\\s+(?P<sl_gflops>(\d|\.|\+|-|e)+)"

Expand Down
4 changes: 4 additions & 0 deletions shared/include/hpcc_benchmark.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,11 @@ class BaseSettings {
defaultPlatform(results["platform"].as<int>()),
defaultDevice(results["device"].as<int>()),
kernelFileName(results["f"].as<std::string>()),
#ifdef NUM_REPLICATIONS
kernelReplications(results.count("r") > 0 ? results["r"].as<uint>() : NUM_REPLICATIONS),
#else
kernelReplications(results.count("r") > 0 ? results["r"].as<uint>() : 1),
#endif
testOnly(static_cast<bool>(results.count("test"))) {}

/**
Expand Down

0 comments on commit 60651eb

Please sign in to comment.