diff --git a/Jamroot b/Jamroot index 92096842..85103e8e 100644 --- a/Jamroot +++ b/Jamroot @@ -19,6 +19,7 @@ project # tests, regression tests and examples build-project libs/numeric/odeint/test ; build-project libs/numeric/odeint/examples ; +build-project libs/numeric/odeint/performance/openmp ; # additional tests with external libraries : diff --git a/libs/numeric/odeint/performance/openmp/Jamfile.v2 b/libs/numeric/odeint/performance/openmp/Jamfile.v2 new file mode 100644 index 00000000..389c190c --- /dev/null +++ b/libs/numeric/odeint/performance/openmp/Jamfile.v2 @@ -0,0 +1,19 @@ +# Copyright 2009 Karsten Ahnert and Mario Mulansky. +# Distributed under the Boost Software License, Version 1.0. (See +# accompanying file LICENSE_1_0.txt or copy at +# http://www.boost.org/LICENSE_1_0.txt) + +project + : requirements + <include>../../../../.. + <include>.. + <define>BOOST_ALL_NO_LIB=1 + <library>/boost//timer + <cxxflags>--std=c++11 + <toolset>gcc:<cxxflags>-fopenmp + <toolset>gcc:<linkflags>-fopenmp + <toolset>intel:<cxxflags>-openmp + <toolset>intel:<linkflags>-openmp + ; + +exe osc_chain_1d : osc_chain_1d.cpp ; diff --git a/libs/numeric/odeint/performance/openmp/osc_chain_1d.cpp b/libs/numeric/odeint/performance/openmp/osc_chain_1d.cpp new file mode 100644 index 00000000..ae5feb47 --- /dev/null +++ b/libs/numeric/odeint/performance/openmp/osc_chain_1d.cpp @@ -0,0 +1,128 @@ +/* Boost libs/numeric/odeint/performance/openmp/osc_chain_1d.cpp + + Copyright 2009-2012 Karsten Ahnert + Copyright 2009-2012 Mario Mulansky + + strongly nonlinear hamiltonian lattice in 1d + + Distributed under the Boost Software License, Version 1.0. 
+(See accompanying file LICENSE_1_0.txt or
+ copy at http://www.boost.org/LICENSE_1_0.txt)
+ */
+
+#include <algorithm>
+#include <cstddef>
+#include <functional>
+#include <iostream>
+#include <random>
+#include <utility>
+#include <vector>
+
+#include <omp.h>
+
+#include <boost/numeric/odeint.hpp>
+#include <boost/numeric/odeint/external/openmp/openmp.hpp>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/timer/timer.hpp>
+#include <boost/accumulators/accumulators.hpp>
+#include <boost/accumulators/statistics/stats.hpp>
+#include <boost/accumulators/statistics/mean.hpp>
+#include <boost/accumulators/statistics/median.hpp>
+
+#include "osc_chain_1d_system.hpp"
+
+using namespace std;
+using namespace boost::numeric::odeint;
+using namespace boost::accumulators;
+
+using boost::timer::cpu_timer;
+
+// Parameters handed to the osc_chain system functor.
+const double p_kappa = 3.3;
+const double p_lambda = 4.7;
+const double p_beta = 1.0;
+
+namespace {
+
+// Number of timed integrations performed per repetition.
+const size_t runs_per_repeat = 5;
+
+/*
+ * Integrates (q, p) for `steps` steps of size 0.01 with a default-constructed
+ * Stepper, runs_per_repeat times in a row, and feeds the wall-clock time of
+ * every run (in seconds) into acc_time. One line per run is logged to clog,
+ * tagged with the repetition index n_rep.
+ *
+ * q and p are NOT reset between runs; each run continues from the state the
+ * previous one left behind.
+ */
+template< class Stepper , class State , class Accumulator >
+void run_timed( const osc_chain &system , State &q , State &p ,
+                size_t steps , size_t n_rep , Accumulator &acc_time )
+{
+    for( size_t n_run = 0 ; n_run != runs_per_repeat ; n_run++ )
+    {
+        cpu_timer timer;
+        integrate_n_steps( Stepper() , system ,
+                           make_pair( ref(q) , ref(p) ) ,
+                           0.0 , 0.01 , steps );
+        // cpu_timer reports nanoseconds; convert to seconds
+        const double run_time = static_cast< double >( timer.elapsed().wall ) * 1.0e-9;
+        acc_time( run_time );
+        clog << "run " << n_rep << "-" << n_run << " wall[s]: " << run_time << endl;
+    }
+}
+
+} // namespace
+
+/*
+ * Benchmark driver for the 1d oscillator chain.
+ *
+ * Optional positional arguments (defaults in parentheses):
+ *   argv[1]  N            number of lattice sites (1024)
+ *   argv[2]  blocks       number of chunks / OpenMP threads (omp_get_max_threads())
+ *   argv[3]  steps        integration steps per timed run (100)
+ *   argv[4]  repeat       number of repetitions (5)
+ *   argv[5]  split_range  1: state split into `blocks` chunks (openmp_state),
+ *                         0: plain vector with openmp_range_algebra (1)
+ *
+ * The last line written to cout carries mean and median of all measured
+ * wall-clock times. boost::lexical_cast throws on malformed arguments.
+ */
+int main( int argc , char* argv[] )
+{
+    size_t N = 1024;
+    size_t blocks = omp_get_max_threads();
+    size_t steps = 100;
+    size_t repeat = 5;
+    bool split_range = true;
+    if( argc > 1 ) N = boost::lexical_cast< size_t >( argv[1] );
+    if( argc > 2 ) blocks = boost::lexical_cast< size_t >( argv[2] );
+    if( argc > 3 ) steps = boost::lexical_cast< size_t >( argv[3] );
+    if( argc > 4 ) repeat = boost::lexical_cast< size_t >( argv[4] );
+    if( argc > 5 ) split_range = boost::lexical_cast< bool >( argv[5] );
+
+    cout << "Size: " << N << " with " << blocks << " blocks and " << steps << " steps." << endl;
+
+    accumulator_set< double , stats< tag::mean , tag::median > > acc_time;
+
+    for( size_t n_rep = 0 ; n_rep != repeat ; n_rep++ )
+    {
+        osc_chain system( p_kappa , p_lambda , p_beta );
+
+        // random momenta in [0, 1), zero displacements; the engine is
+        // re-seeded with 0 so every repetition starts from the same data
+        vector< double > p_init( N ) , q_init( N , 0 );
+        uniform_real_distribution< double > distribution( 0.0 );
+        mt19937 engine( 0 );
+        auto generator = bind( distribution , engine );
+        generate( p_init.begin() , p_init.end() , generator );
+
+        if( split_range )
+        {
+            typedef openmp_state< double > state_type;
+            typedef symplectic_rkn_sb3a_mclachlan<
+                state_type , state_type , double
+            > stepper_type;
+
+            // split into blocks
+            state_type p( blocks );
+            boost::numeric::odeint::copy( p_init , p );
+
+            state_type q( blocks );
+            boost::numeric::odeint::copy( q_init , q );
+
+            clog << "split " << N << " into";
+            for( size_t i = 0 ; i != p.size() ; i++ )
+                clog << ' ' << p[i].size();
+            clog << endl;
+
+            run_timed< stepper_type >( system , q , p , steps , n_rep , acc_time );
+        }
+        else
+        {
+            typedef vector< double > state_type;
+            typedef symplectic_rkn_sb3a_mclachlan<
+                state_type , state_type , double ,
+                state_type , state_type , double ,
+                openmp_range_algebra
+            > stepper_type;
+
+            // omp_set_num_threads takes an int
+            omp_set_num_threads( static_cast< int >( blocks ) );
+
+            state_type p( p_init ) , q( q_init );
+
+            run_timed< stepper_type >( system , q , p , steps , n_rep , acc_time );
+        }
+    }
+
+    cout << " mean[s]: " << mean(acc_time)
+         << " median[s]: " << median(acc_time) << endl;
+
+    return 0;
+}
+
diff --git a/libs/numeric/odeint/performance/openmp/osc_chain_1d_system.hpp
b/libs/numeric/odeint/performance/openmp/osc_chain_1d_system.hpp
new file mode 100644
index 00000000..b42c6ec4
--- /dev/null
+++ b/libs/numeric/odeint/performance/openmp/osc_chain_1d_system.hpp
@@ -0,0 +1,99 @@
+/* Boost libs/numeric/odeint/performance/openmp/osc_chain_1d_system.hpp
+
+ Copyright 2009-2012 Karsten Ahnert
+ Copyright 2009-2012 Mario Mulansky
+
+ strongly nonlinear hamiltonian lattice
+
+ Distributed under the Boost Software License, Version 1.0.
+(See accompanying file LICENSE_1_0.txt or
+ copy at http://www.boost.org/LICENSE_1_0.txt)
+ */
+
+#ifndef SYSTEM_HPP
+#define SYSTEM_HPP
+
+#include <cmath>
+#include <cstddef>
+#include <vector>
+
+#include <omp.h>
+
+#include <boost/math/special_functions/sign.hpp>
+#include <boost/numeric/odeint/external/openmp/openmp.hpp>
+
+typedef std::vector< double > dvec;
+
+namespace checked_math {
+    // Returns |x|^y, except that x == 0 always yields 0 without calling pow.
+    inline double pow( double x , double y )
+    {
+        if( x==0.0 )
+            // 0**y = 0, don't care for y = 0 or NaN
+            return 0.0;
+        using std::pow;
+        using std::abs;
+        return pow( abs(x) , y );
+    }
+}
+
+// Returns sign(x) * |x|^k.
+// inline: this header defines the function, so it must be safe to include
+// from more than one translation unit.
+inline double signed_pow( double x , double k )
+{
+    using boost::math::sign;
+    return checked_math::pow( x , k ) * sign(x);
+}
+
+/*
+ * Momentum derivative of a 1d chain of nonlinearly coupled oscillators with
+ * both ends coupled to a fixed zero displacement:
+ *
+ *   dpdt[i] =   signed_pow( q[i-1] - q[i] , lam-1 )
+ *             - signed_pow( q[i]          , kap-1 )
+ *             - signed_pow( q[i] - q[i+1] , lam-1 )     with q[-1] = q[N] = 0
+ *
+ * m_beta is stored but not used by either call operator.
+ */
+struct osc_chain {
+
+    const double m_kap, m_lam, m_beta;
+
+    osc_chain( const double kap , const double lam , const double beta )
+        : m_kap( kap ) , m_lam( lam ) , m_beta( beta )
+    { }
+
+    // Simple case with openmp_range_algebra: one flat vector, one parallel
+    // loop over all sites. dpdt must have at least q.size() elements.
+    void operator()( const std::vector< double > &q ,
+                     std::vector< double > &dpdt ) const
+    {
+        const size_t N = q.size();
+#       pragma omp parallel for schedule(runtime)
+        for(size_t i = 0 ; i < N ; ++i)
+        {
+            // can't store things between iterations
+            const double q_prev = i == 0 ? 0 : q[i - 1];
+            const double q_next = i + 1 == N ? 0 : q[i + 1];
+            const double coupling_l = signed_pow( q_prev - q[i] , m_lam-1 );
+            const double coupling_r = signed_pow( q[i] - q_next , m_lam-1 );
+            dpdt[i] = coupling_l - signed_pow( q[i] , m_kap-1 ) - coupling_r;
+        }
+    }
+
+    // Split case with openmp_algebra: the chain is stored as consecutive
+    // chunks and the parallel loop runs over the chunks. dpdt must be split
+    // the same way as q (same number of chunks, same chunk sizes).
+    void operator()( const boost::numeric::odeint::openmp_state< double > &q ,
+                     boost::numeric::odeint::openmp_state< double > &dpdt ) const
+    {
+        const size_t n_chunks = q.size();
+#       pragma omp parallel for schedule(runtime)
+        for(size_t i = 0 ; i < n_chunks ; ++i)
+        {
+            const std::vector< double > &chunk_q = q[i];
+            std::vector< double > &chunk_dpdt = dpdt[i];
+
+            // M is the length of THIS chunk; it is unrelated to n_chunks
+            const size_t M = chunk_q.size();
+            if( M == 0 )
+                // more chunks than sites: nothing to compute here
+                continue;
+
+            // neighbouring sites live in the nearest non-empty chunk on
+            // either side; beyond the chain ends the displacement is 0
+            double q_left = 0.0;
+            for(size_t j = i ; j-- > 0 ; )
+            {
+                if( !q[j].empty() ) { q_left = q[j].back(); break; }
+            }
+            double q_right = 0.0;
+            for(size_t j = i + 1 ; j < n_chunks ; ++j)
+            {
+                if( !q[j].empty() ) { q_right = q[j].front(); break; }
+            }
+
+            // the coupling to the right of site k is the coupling to the
+            // left of site k+1, so it is carried along instead of recomputed
+            double coupling_lr = signed_pow( q_left - chunk_q[0] , m_lam-1 );
+            for(size_t k = 0 ; k + 1 < M ; ++k)
+            {
+                chunk_dpdt[k] = -signed_pow( chunk_q[k] , m_kap-1 ) + coupling_lr;
+                coupling_lr = signed_pow( chunk_q[k] - chunk_q[k+1] , m_lam-1 );
+                chunk_dpdt[k] -= coupling_lr;
+            }
+            // last site of the chunk couples to the first site of the next one
+            chunk_dpdt[M-1] = -signed_pow( chunk_q[M-1] , m_kap-1 ) + coupling_lr;
+            chunk_dpdt[M-1] -= signed_pow( chunk_q[M-1] - q_right , m_lam-1 );
+        }
+    }
+
+};
+
+#endif
diff --git a/libs/numeric/odeint/performance/openmp/osc_chain_speedup.gnu b/libs/numeric/odeint/performance/openmp/osc_chain_speedup.gnu
new file mode 100755
index 00000000..870731c1
--- /dev/null
+++ b/libs/numeric/odeint/performance/openmp/osc_chain_speedup.gnu
@@ -0,0 +1,50 @@
+#!/usr/bin/env gnuplot
+
+set terminal pngcairo size 1000,1000
+set output "osc_chain_speedup.png"
+
+set multiplot layout 2,2
+
+set key left
+
+set xrange [1:16]
+set x2range [1:16]
+set x2tics 8 format ""
+set grid x2tics
+set yrange [0:8]
+
+set title "short: speedup"
+plot \
+    "osc_chain_speedup-short.dat" i 0 u "block":"gcc-s-mul" w lp t "gcc (split)" , \
+    "osc_chain_speedup-short.dat" i 0 u "block":"gcc-t-mul" w lp t "gcc (simple)", \
+    "osc_chain_speedup-short.dat" i 0 u "block":"icc-s-mul" w lp t "icc (split)" , \
+    "osc_chain_speedup-short.dat" i 0 u "block":"icc-t-mul" w lp t "icc (simple)", \
+    4 lc 0 lt 0 t "target"
+
+# the remaining panels are drawn without a legend
+unset key
+
+set title "long: speedup"
+plot \
+    "osc_chain_speedup-long.dat" i 0 u "block":"gcc-s-mul" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"gcc-t-mul" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"icc-s-mul" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"icc-t-mul" w lp, \
+    4 lc 0 lt 0
+
+# timing panels: let gnuplot pick the upper y limit
+set yrange [0:*]
+
+set title "short: time[s]"
+plot \
+    "osc_chain_speedup-short.dat" i 0 u "block":"gcc-s-med" w lp, \
+    "osc_chain_speedup-short.dat" i 0 u "block":"gcc-t-med" w lp, \
+    "osc_chain_speedup-short.dat" i 0 u "block":"icc-s-med" w lp, \
+    "osc_chain_speedup-short.dat" i 0 u "block":"icc-t-med" w lp
+
+set title "long: time[s]"
+plot \
+    "osc_chain_speedup-long.dat" i 0 u "block":"gcc-s-med" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"gcc-t-med" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"icc-s-med" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"icc-t-med" w lp
+
+unset multiplot
diff --git a/libs/numeric/odeint/performance/openmp/osc_chain_speedup.sh b/libs/numeric/odeint/performance/openmp/osc_chain_speedup.sh
new file mode 100755
index 00000000..12a531ad
--- /dev/null
+++ b/libs/numeric/odeint/performance/openmp/osc_chain_speedup.sh
@@ -0,0 +1,37 @@
+#!/bin/zsh
+
+# Runs the osc_chain_1d benchmark for 1..16 blocks with two builds and both
+# state layouts, and writes one tab-separated table per problem size.
+
+# presumably forces '.' as decimal separator for printf %f -- confirm
+export LC_NUMERIC=en_US.UTF-8
+# median time per "<build>-<split>-<block>" key; global, shared by all runs
+declare -A times
+
+# consumed by the "schedule(runtime)" OpenMP loops of the benchmark
+export OMP_SCHEDULE=static
+repeat=2
+
+# run N STEPS
+# Prints a comment line, a header row of quoted column names and one row per
+# block count. Each row holds, for every build and layout, the median time
+# and the speedup relative to the block=1 run of the same build and layout.
+# Note: n, steps and the loop variables are not declared local.
+function run {
+    n=$1
+    steps=$2
+    printf "# n=$n steps=$steps repeat=$repeat\n"
+    printf '"block"'
+    # header order must match the order of the printf in the loops below:
+    # build (gcc, icc) x layout (s = split, t = simple) x (med, mul)
+    for b in gcc icc ; do
+        for s in s t ; do
+            for t in med mul ; do
+                printf "\t\"$b-$s-$t\""
+            done
+        done
+    done
+    for block in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ; do
+        printf '\n%d' $block
+        for build in gcc-4.7 intel-linux ; do
+            bench="bin/$build/release/osc_chain_1d"
+            # split=1: chunked state, split=0: flat vector
+            for split in 1 0 ; do
+                # the benchmark's last output line is
+                # " mean[s]: <mean> median[s]: <median>"; field 4 is the median
+                med=$($bench $n $block $steps $repeat $split | tail -1 | awk '{print $4}')
+                times[$build-$split-$block]=$med
+                # block=1 is measured first, so the baseline entry always exists
+                speedup=$((${times[$build-$split-1]}/$med))
+                printf '\t%f\t%f' $med $speedup
+            done
+        done
+    done
+    printf '\n\n\n'
+}
+
+# "short": small system, many steps; "long": large system, a single step
+run 4096 1024 | tee osc_chain_speedup-short.dat
+run 4194304 1 | tee osc_chain_speedup-long.dat