1D-chain performance test. #3

Based on <https://github.com/mariomulansky/hpx_odeint/tree/9792ca4f330bf0cffde4f000e900fb4c1c254891/osc_chain_1d/openmp2>. Use osc_chain_speedup.{sh,gnu} to compute and plot the speedup. "split" uses openmp_state/openmp_algebra; "simple" uses vector/openmp_range_algebra.
BoostGSoC13 · Jul 19, 2013 · fd5a419 · fd5a419
1 parent daa5497
commit fd5a419
Show file tree

Hide file tree

Showing 6 changed files with 334 additions and 0 deletions.
diff --git a/Jamroot b/Jamroot
@@ -19,6 +19,7 @@ project
 # tests, regression tests and examples
 build-project libs/numeric/odeint/test ;
 build-project libs/numeric/odeint/examples ;
+build-project libs/numeric/odeint/performance/openmp ;
 
 
 # additional tests with external libraries :

diff --git a/libs/numeric/odeint/performance/openmp/Jamfile.v2 b/libs/numeric/odeint/performance/openmp/Jamfile.v2
@@ -0,0 +1,19 @@
+# Copyright 2009 Karsten Ahnert and Mario Mulansky.
+# Distributed under the Boost Software License, Version 1.0. (See
+# accompanying file LICENSE_1_0.txt or copy at
+# http://www.boost.org/LICENSE_1_0.txt)
+
+# Build settings shared by the targets in this directory: C++11, linked
+# against Boost.Timer, with OpenMP switched on through the toolset-specific
+# compile and link flags (-fopenmp for gcc, -openmp for intel).
+project
+    : requirements
+      <include>../../../../..
+      <include>..
+      <define>BOOST_ALL_NO_LIB=1
+      <library>/boost//timer
+      <cxxflags>--std=c++11
+      <toolset>gcc:<cxxflags>-fopenmp
+      <toolset>gcc:<linkflags>-fopenmp
+      <toolset>intel:<cxxflags>-openmp
+      <toolset>intel:<linkflags>-openmp
+    ;
+
+# Benchmark executable; osc_chain_speedup.sh runs it from bin/<build>/release/.
+exe osc_chain_1d : osc_chain_1d.cpp ;
diff --git a/libs/numeric/odeint/performance/openmp/osc_chain_1d.cpp b/libs/numeric/odeint/performance/openmp/osc_chain_1d.cpp
@@ -0,0 +1,128 @@
+/* Boost libs/numeric/odeint/performance/openmp/osc_chain_1d.cpp
+
+ Copyright 2009-2012 Karsten Ahnert
+ Copyright 2009-2012 Mario Mulansky
+
+ strongly nonlinear Hamiltonian lattice in 1d
+
+ Distributed under the Boost Software License, Version 1.0.
+(See accompanying file LICENSE_1_0.txt or
+ copy at http://www.boost.org/LICENSE_1_0.txt)
+ */
+
+#include <algorithm>
+#include <functional>
+#include <iostream>
+#include <random>
+#include <utility>
+#include <vector>
+
+#include <omp.h>
+
+#include <boost/numeric/odeint.hpp>
+#include <boost/numeric/odeint/external/openmp/openmp.hpp>
+
+#include <boost/lexical_cast.hpp>
+#include <boost/timer/timer.hpp>
+#include <boost/foreach.hpp>
+#include <boost/accumulators/accumulators.hpp>
+#include <boost/accumulators/statistics/stats.hpp>
+#include <boost/accumulators/statistics/mean.hpp>
+#include <boost/accumulators/statistics/median.hpp>
+#include <boost/accumulators/statistics/min.hpp>
+
+#include "osc_chain_1d_system.hpp"
+
+using namespace std;
+using namespace boost::numeric::odeint;
+using namespace boost::accumulators;
+
+using boost::timer::cpu_timer;
+
+// Model parameters; main() passes them to the osc_chain constructor in the
+// order (kappa, lambda, beta).
+const double p_kappa = 3.3;
+const double p_lambda = 4.7;
+const double p_beta = 1.0;
+
+namespace {
+
+// Number of timed integrations per repetition (previously a literal 5
+// repeated in both branches of main).
+const size_t runs_per_repetition = 5;
+
+/* Times runs_per_repetition consecutive integrations of (q, p).
+ *
+ * Every run default-constructs a Stepper and integrates `steps` steps of
+ * size 0.01 starting at t = 0.  q and p are handed over by reference wrapper
+ * and are not re-initialised between runs.  The wall time of each run,
+ * converted to seconds, is pushed into acc_time and logged to clog, tagged
+ * with the repetition index n_rep and the run index.
+ */
+template< class Stepper , class System , class State , class Accumulator >
+void time_runs( System &system , State &q , State &p ,
+                size_t steps , size_t n_rep , Accumulator &acc_time )
+{
+    for(size_t n_run = 0 ; n_run != runs_per_repetition ; n_run++) {
+        cpu_timer timer;
+        integrate_n_steps( Stepper() , system ,
+                           std::make_pair( std::ref(q) , std::ref(p) ) ,
+                           0.0 , 0.01 , steps );
+        const double run_time = static_cast<double>(timer.elapsed().wall) * 1.0e-9;
+        acc_time(run_time);
+        clog << "run " << n_rep << "-" << n_run << " wall[s]: " << run_time << endl;
+    }
+}
+
+} // namespace
+
+/* Benchmark driver.
+ *
+ * Usage: osc_chain_1d [N [blocks [steps [repeat [split_range]]]]]
+ *   N            chain length                         (default 1024)
+ *   blocks       number of blocks / OpenMP threads    (default omp_get_max_threads())
+ *   steps        integration steps per timed run      (default 100)
+ *   repeat       number of repetitions                (default 5)
+ *   split_range  parsed as bool; true selects the openmp_state ("split")
+ *                variant, false the vector + openmp_range_algebra ("simple")
+ *                variant                              (default true)
+ *
+ * Per-run wall times go to clog.  The last line on cout carries the mean and
+ * median over all runs; osc_chain_speedup.sh parses that line.
+ */
+int main( int argc , char* argv[] )
+{
+    size_t N = 1024;
+    size_t blocks = omp_get_max_threads();
+    size_t steps = 100;
+    size_t repeat = 5;
+    bool split_range = true;
+    if( argc > 1 ) N = boost::lexical_cast<size_t>( argv[1] );
+    if( argc > 2 ) blocks = boost::lexical_cast<size_t>( argv[2] );
+    if( argc > 3 ) steps = boost::lexical_cast<size_t>( argv[3] );
+    if( argc > 4 ) repeat = boost::lexical_cast<size_t>( argv[4] );
+    if( argc > 5 ) split_range = boost::lexical_cast<bool>( argv[5] );
+
+    cout << "Size: " << N << " with " << blocks << " blocks and " << steps << " steps." << endl;
+
+    accumulator_set< double, stats<tag::mean, tag::median> > acc_time;
+
+    for(size_t n_rep = 0 ; n_rep != repeat ; n_rep++)
+    {
+        osc_chain system( p_kappa , p_lambda , p_beta );
+
+        // Initial condition: momenta from a fixed-seed (0) mt19937, so every
+        // repetition starts from the same data; positions start at zero.
+        vector<double> p_init( N ), q_init( N, 0 );
+        uniform_real_distribution<double> distribution( 0.0 );
+        mt19937 engine( 0 );
+        auto generator = bind( distribution , engine );
+        generate( p_init.begin() , p_init.end() , generator );
+
+        if(split_range) {
+            typedef openmp_state<double> state_type;
+            typedef symplectic_rkn_sb3a_mclachlan<
+                    state_type , state_type , double
+                > stepper_type;
+
+            // split into blocks
+            state_type p( blocks );
+            boost::numeric::odeint::copy(p_init, p);
+
+            state_type q( blocks );
+            boost::numeric::odeint::copy(q_init, q);
+
+            // Log the resulting block sizes.
+            clog << "split " << N << " into";
+            for(size_t i = 0 ; i != p.size() ; i++)
+                clog << ' ' << p[i].size();
+            clog << endl;
+
+            time_runs<stepper_type>( system , q , p , steps , n_rep , acc_time );
+
+        } else {
+            typedef vector<double> state_type;
+            typedef symplectic_rkn_sb3a_mclachlan<
+                    state_type , state_type , double ,
+                    state_type , state_type , double ,
+                    openmp_range_algebra
+                > stepper_type;
+
+            // Only the simple variant sets the OpenMP thread count explicitly.
+            omp_set_num_threads(blocks);
+
+            state_type p(p_init), q(q_init);
+
+            time_runs<stepper_type>( system , q , p , steps , n_rep , acc_time );
+
+        }
+    }
+
+    cout << " mean[s]: " << mean(acc_time)
+         << " median[s]: " << median(acc_time) << endl;
+
+    return 0;
+}
+
diff --git a/libs/numeric/odeint/performance/openmp/osc_chain_1d_system.hpp b/libs/numeric/odeint/performance/openmp/osc_chain_1d_system.hpp
@@ -0,0 +1,99 @@
+/* Boost libs/numeric/odeint/performance/openmp/osc_chain_1d_system.hpp
+
+ Copyright 2009-2012 Karsten Ahnert
+ Copyright 2009-2012 Mario Mulansky
+
+ strongly nonlinear Hamiltonian lattice
+
+ Distributed under the Boost Software License, Version 1.0.
+(See accompanying file LICENSE_1_0.txt or
+ copy at http://www.boost.org/LICENSE_1_0.txt)
+ */
+
+#ifndef SYSTEM_HPP
+#define SYSTEM_HPP
+
+#include <vector>
+#include <cmath>
+#include <iostream>
+
+#include <omp.h>
+
+#include <boost/math/special_functions/sign.hpp>
+#include <boost/numeric/odeint/external/openmp/openmp.hpp>
+
+typedef std::vector< double > dvec;
+
+namespace checked_math {
+    /* Returns |x|^y, except that x == 0 always yields 0.
+     *
+     * The zero case is answered without calling std::pow, so the result for
+     * y == 0 or y == NaN is deliberately 0 as well.
+     */
+    inline double pow( double x , double y )
+    {
+        return ( x == 0.0 ) ? 0.0 : std::pow( std::abs(x) , y );
+    }
+}
+
+/* Sign-preserving power: sign(x) * |x|^k, and 0 for x == 0.
+ *
+ * Declared inline because the definition lives in a header; without it,
+ * including this header from more than one translation unit produces
+ * multiple-definition link errors.
+ */
+inline double signed_pow( double x , double k )
+{
+    using boost::math::sign;
+    return checked_math::pow( x , k ) * sign(x);
+}
+
+/* Force term dp/dt = f(q) of the oscillator chain.
+ *
+ * For site i with neighbours q_prev and q_next (taken as 0 beyond either end
+ * of the chain):
+ *
+ *   dpdt[i] =   signed_pow( q_prev - q[i] , lam-1 )
+ *             - signed_pow( q[i]          , kap-1 )
+ *             - signed_pow( q[i] - q_next , lam-1 )
+ *
+ * Two overloads evaluate the same expression for the two state layouts used
+ * by the benchmark.  m_beta is stored but not read by either overload.
+ */
+struct osc_chain {
+
+    const double m_kap, m_lam, m_beta;
+
+    osc_chain( const double kap , const double lam , const double beta )
+        : m_kap( kap ) , m_lam( lam ) , m_beta( beta )
+    { }
+
+    // Simple case with openmp_range_algebra: one flat vector, parallelised
+    // over the individual sites.
+    void operator()( const std::vector<double> &q ,
+                           std::vector<double> &dpdt ) const
+    {
+        const size_t N = q.size();
+#       pragma omp parallel for schedule(runtime)
+        for(size_t i = 0 ; i < N ; ++i)
+        {
+            // can't store things between iterations
+            const double q_prev = i == 0 ? 0 : q[i - 1];
+            const double q_next = i + 1 == N ? 0 : q[i + 1];
+            const double coupling_l = signed_pow( q_prev - q[i] , m_lam-1 );
+            const double coupling_r = signed_pow( q[i] - q_next , m_lam-1 );
+            dpdt[i] = coupling_l - signed_pow( q[i] , m_kap-1 ) - coupling_r;
+        }
+    }
+
+    // Split case with openmp_algebra: the state is a sequence of blocks,
+    // parallelised over the blocks.  Inside a block the coupling to the right
+    // neighbour is carried over to the next site as its left coupling.
+    void operator()( const boost::numeric::odeint::openmp_state<double> &q ,
+                           boost::numeric::odeint::openmp_state<double> &dpdt ) const
+    {
+        const size_t N = q.size();      // number of blocks
+#       pragma omp parallel for schedule(runtime)
+        for(size_t i = 0 ; i < N ; ++i)
+        {
+            const std::vector<double> &q_blk = q[i];
+            std::vector<double> &dpdt_blk = dpdt[i];
+
+            const size_t M = q_blk.size();  // number of sites in this block
+            // An empty block has nothing to compute; skipping it also keeps
+            // M-1 below from wrapping around.
+            if( M == 0 ) continue;
+
+            // Boundary values come from the adjacent blocks, or 0 at the
+            // ends of the chain.  Adjacent blocks are assumed non-empty.
+            const double q_left =      i == 0 ? 0 : q[i-1].back();
+            const double q_right = i + 1 == N ? 0 : q[i+1].front();
+
+            double coupling_lr = signed_pow( q_left - q_blk[0] , m_lam-1 );
+            for(size_t j = 0 ; j + 1 < M ; ++j)
+            {
+                dpdt_blk[j] = -signed_pow( q_blk[j] , m_kap-1 ) + coupling_lr;
+                coupling_lr = signed_pow( q_blk[j] - q_blk[j+1] , m_lam-1 );
+                dpdt_blk[j] -= coupling_lr;
+            }
+            // Last site of the block: indexed by the block length M, not by
+            // the number of blocks N.
+            dpdt_blk[M-1] = -signed_pow( q_blk[M-1] , m_kap-1 ) + coupling_lr;
+            dpdt_blk[M-1] -= signed_pow( q_blk[M-1] - q_right , m_lam-1 );
+        }
+    }
+
+};
+
+#endif
diff --git a/libs/numeric/odeint/performance/openmp/osc_chain_speedup.gnu b/libs/numeric/odeint/performance/openmp/osc_chain_speedup.gnu
@@ -0,0 +1,50 @@
+#!/usr/bin/env gnuplot
+
+# Plots the tables written by osc_chain_speedup.sh as a 2x2 grid in
+# osc_chain_speedup.png: speedup (top row) and time in seconds (bottom row)
+# against the block count, for the "short" and "long" runs.
+# Columns are selected by the header names the script writes:
+#   <compiler>-<s|t>-<med|mul>, where the plot titles below label s as
+#   "split" and t as "simple"; the script fills "med" with the median time
+#   and "mul" with the speedup.
+
+set terminal pngcairo size 1000,1000
+set output "osc_chain_speedup.png"
+
+set multiplot layout 2,2
+
+set key left
+
+# Block counts 1..16 on x; x2 only provides grid lines every 8 blocks.
+set xrange [1:16]
+set x2range [1:16]
+set x2tics 8 format ""
+set grid x2tics
+set yrange [0:8]
+
+# Top left: speedup for the short run. The constant 4 is drawn as a
+# reference line labelled "target".
+set title "short: speedup"
+plot \
+    "osc_chain_speedup-short.dat" i 0 u "block":"gcc-s-mul" w lp t "gcc (split)" , \
+    "osc_chain_speedup-short.dat" i 0 u "block":"gcc-t-mul" w lp t "gcc (simple)", \
+    "osc_chain_speedup-short.dat" i 0 u "block":"icc-s-mul" w lp t "icc (split)" , \
+    "osc_chain_speedup-short.dat" i 0 u "block":"icc-t-mul" w lp t "icc (simple)", \
+    4 lc 0 lt 0 t "target"
+
+# The legend is shown in the first panel only.
+unset key
+
+# Top right: speedup for the long run.
+set title "long: speedup"
+plot \
+    "osc_chain_speedup-long.dat" i 0 u "block":"gcc-s-mul" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"gcc-t-mul" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"icc-s-mul" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"icc-t-mul" w lp, \
+    4 lc 0 lt 0
+
+# The time panels autoscale the upper end of the y axis.
+set yrange [0:*]
+
+# Bottom left: time for the short run.
+set title "short: time[s]"
+plot \
+    "osc_chain_speedup-short.dat" i 0 u "block":"gcc-s-med" w lp, \
+    "osc_chain_speedup-short.dat" i 0 u "block":"gcc-t-med" w lp, \
+    "osc_chain_speedup-short.dat" i 0 u "block":"icc-s-med" w lp, \
+    "osc_chain_speedup-short.dat" i 0 u "block":"icc-t-med" w lp
+
+# Bottom right: time for the long run.
+set title "long: time[s]"
+plot \
+    "osc_chain_speedup-long.dat" i 0 u "block":"gcc-s-med" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"gcc-t-med" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"icc-s-med" w lp, \
+    "osc_chain_speedup-long.dat" i 0 u "block":"icc-t-med" w lp
+
+unset multiplot
diff --git a/libs/numeric/odeint/performance/openmp/osc_chain_speedup.sh b/libs/numeric/odeint/performance/openmp/osc_chain_speedup.sh
@@ -0,0 +1,37 @@
+#!/bin/zsh
+
+# Runs the osc_chain_1d benchmark for 1..16 blocks with the gcc and intel
+# builds, in split (1) and simple (0) mode, and writes one tab-separated
+# table per problem size for osc_chain_speedup.gnu to plot.
+
+# NOTE(review): presumably pins the locale so printf '%f' reads and writes
+# '.' as the decimal separator -- confirm.
+export LC_NUMERIC=en_US.UTF-8
+# times[<build>-<split>-<block>] holds the measured time of each configuration.
+declare -A times
+
+# The system's parallel loops use schedule(runtime); this selects static.
+export OMP_SCHEDULE=static
+# Passed to the benchmark as its "repeat" argument.
+repeat=2
+
+# run <n> <steps>
+# Prints a comment line, a header row of quoted column names, then one row
+# per block count with (time, speedup) for each build and mode.
+function run {
+    n=$1
+    steps=$2
+    printf "# n=$n steps=$steps repeat=$repeat\n"
+    printf '"block"'
+    # Header columns "<b>-<s>-<t>", in the same order as the values printed
+    # below: builds gcc-4.7/intel-linux, split 1/0, then time and speedup.
+    for b in gcc icc ; do
+        for s in s t ; do
+            for t in med mul ; do
+                printf "\t\"$b-$s-$t\""
+            done
+        done
+    done
+    for block in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 ; do
+        printf '\n%d' $block
+        for build in gcc-4.7 intel-linux ; do
+            bench="bin/$build/release/osc_chain_1d"
+            for split in 1 0 ; do
+                # The benchmark's last stdout line reads
+                # " mean[s]: <mean> median[s]: <median>"; field 4 is the median.
+                med=$($bench $n $block $steps $repeat $split | tail -1 | awk '{print $4}')
+                times[$build-$split-$block]=$med
+                # Speedup relative to the 1-block run of the same build and
+                # mode, which is measured first because block starts at 1.
+                speedup=$((${times[$build-$split-1]}/$med))
+                printf '\t%f\t%f' $med $speedup
+            done
+        done
+    done
+    printf '\n\n\n'
+}
+
+# "short": small chain, many steps. "long": large chain, a single step.
+run 4096 1024 | tee osc_chain_speedup-short.dat
+run 4194304 1 | tee osc_chain_speedup-long.dat