From 504eed38b15c4bbe3e50aeac5fcdff57e12da3ce Mon Sep 17 00:00:00 2001
From: Zentrik
Date: Sat, 30 Dec 2023 22:55:58 +0000
Subject: [PATCH] Only run perf once instead of per sample

Running perf on every sample would probably slow down all benchmarks
significantly for no gain, as we only store the last recorded perf
result.
---
 src/execution.jl    | 113 ++++++++++++++++++++++----------------------
 src/parameters.jl   |   2 +-
 src/trials.jl       |  30 ++----------
 test/TrialsTests.jl |  22 ++++-----
 4 files changed, 71 insertions(+), 96 deletions(-)

diff --git a/src/execution.jl b/src/execution.jl
index 7df2aa02..cbaaa750 100644
--- a/src/execution.jl
+++ b/src/execution.jl
@@ -9,6 +9,7 @@ gcscrub() = (GC.gc(); GC.gc(); GC.gc(); GC.gc())
 
 mutable struct Benchmark
     samplefunc
+    linux_perf_func
     quote_vals
     params::Parameters
 end
@@ -106,15 +107,21 @@ function _run(b::Benchmark, p::Parameters; verbose=false, pad="", kwargs...)
     start_time = Base.time()
     trial = Trial(params)
     params.gcsample && gcscrub()
-    trial_contents = b.samplefunc(b.quote_vals, params)
-    push!(trial, trial_contents)
-    return_val = trial_contents.return_val
+    s = b.samplefunc(b.quote_vals, params)
+    push!(trial, s[1:(end - 1)]...)
+    return_val = s[end]
     iters = 2
     while (Base.time() - start_time) < params.seconds && iters ≤ params.samples
         params.gcsample && gcscrub()
-        push!(trial, b.samplefunc(b.quote_vals, params))
+        push!(trial, b.samplefunc(b.quote_vals, params)[1:(end - 1)]...)
         iters += 1
     end
+
+    if p.experimental_enable_linux_perf
+        params.gcsample && gcscrub()
+        trial.linux_perf_stats = b.linux_perf_func(b.quote_vals, params)
+    end
+
     return trial, return_val
 end
 
@@ -178,7 +185,7 @@ function _lineartrial(b::Benchmark, p::Parameters=b.params; maxevals=RESOLUTION,
     for evals in eachindex(estimates)
         params.gcsample && gcscrub()
         params.evals = evals
-        estimates[evals] = b.samplefunc(b.quote_vals, params).time
+        estimates[evals] = first(b.samplefunc(b.quote_vals, params))
         completed += 1
         ((time() - start_time) > params.seconds) && break
     end
@@ -506,6 +513,7 @@ function generate_benchmark_definition(
     @nospecialize
     corefunc = gensym("core")
     samplefunc = gensym("sample")
+    linux_perf_func = gensym("perf")
     type_vars = [gensym() for i in 1:(length(quote_vars) + length(setup_vars))]
     signature = Expr(:call, corefunc, quote_vars..., setup_vars...)
     signature_def = Expr(
@@ -572,64 +580,57 @@ function generate_benchmark_definition(
                     __evals,
                 ),
             )
-            if $(params.experimental_enable_linux_perf)
-                # Based on https://github.com/JuliaPerf/LinuxPerf.jl/blob/a7fee0ff261a5b5ce7a903af7b38d1b5c27dd931/src/LinuxPerf.jl#L1043-L1061
-                __linux_perf_groups = BenchmarkTools.LinuxPerf.set_default_spaces(
-                    $(params.linux_perf_options.events),
-                    $(params.linux_perf_options.spaces),
+            return __time, __gctime, __memory, __allocs, __return_val
+        end
+        @noinline function $(linux_perf_func)(
+            $(Expr(:tuple, quote_vars...)), __params::$BenchmarkTools.Parameters
+        )
+            # Based on https://github.com/JuliaPerf/LinuxPerf.jl/blob/a7fee0ff261a5b5ce7a903af7b38d1b5c27dd931/src/LinuxPerf.jl#L1043-L1061
+            __linux_perf_groups = $LinuxPerf.set_default_spaces(
+                eval(__params.linux_perf_options.events),
+                eval(__params.linux_perf_options.spaces),
+            )
+            __linux_perf_bench = nothing
+            try
+                __linux_perf_bench = $LinuxPerf.make_bench_threaded(
+                    __linux_perf_groups;
+                    threads=eval(__params.linux_perf_options.threads),
                 )
-                __linux_perf_bench = nothing
-                try
-                    __linux_perf_bench = BenchmarkTools.LinuxPerf.make_bench_threaded(
-                        __linux_perf_groups;
-                        threads=$(params.linux_perf_options.threads),
-                    )
-                catch e
-                    if e isa ErrorException &&
-                        startswith(e.msg, "perf_event_open error : ")
-                        @warn "Perf is disabled"
-                    else
-                        rethrow()
-                    end
+            catch e
+                if e isa ErrorException &&
+                    startswith(e.msg, "perf_event_open error : ")
+                    @warn "Perf is disabled" # Really we only want to do this if we defaulted to running with perf; otherwise we should just throw.
+                    # Given that we now determine more accurately whether perf is available, can we do away with this hack?
+                else
+                    rethrow()
                 end
+            end
 
-                if !isnothing(__linux_perf_bench)
-                    try
-                        $(setup)
-                        BenchmarkTools.LinuxPerf.enable!(__linux_perf_bench)
-                        # We'll just run it one time.
-                        __return_val_2 = $(invocation)
-                        BenchmarkTools.LinuxPerf.disable!(__linux_perf_bench)
-                        # trick the compiler not to eliminate the code
-                        if rand() < 0
-                            __linux_perf_stats = __return_val_2
-                        else
-                            __linux_perf_stats = BenchmarkTools.LinuxPerf.Stats(
-                                __linux_perf_bench
-                            )
-                        end
-                    catch
-                        rethrow()
-                    finally
-                        close(__linux_perf_bench)
-                        $(teardown)
+            if !isnothing(__linux_perf_bench)
+                $(setup)
+                try
+                    $LinuxPerf.enable!(__linux_perf_bench)
+                    # We'll just run it one time.
+ __return_val_2 = $(invocation) + $LinuxPerf.disable!(__linux_perf_bench) + # trick the compiler not to eliminate the code + if rand() < 0 + __linux_perf_stats = __return_val_2 + else + __linux_perf_stats = $LinuxPerf.Stats(__linux_perf_bench) end + return __linux_perf_stats + catch + rethrow() + finally + close(__linux_perf_bench) + $(teardown) end - else - __return_val_2 = nothing - __linux_perf_stats = nothing end - return BenchmarkTools.TrialContents( - __time, - __gctime, - __memory, - __allocs, - __return_val, - __return_val_2, - __linux_perf_stats, - ) end - $BenchmarkTools.Benchmark($(samplefunc), $(quote_vals), $(params)) + $BenchmarkTools.Benchmark( + $(samplefunc), $(linux_perf_func), $(quote_vals), $(params) + ) end, ) end diff --git a/src/parameters.jl b/src/parameters.jl index 0c46c6bd..3b65a5de 100644 --- a/src/parameters.jl +++ b/src/parameters.jl @@ -30,7 +30,7 @@ function perf_available() try opts = DEFAULT_LINUX_PERF_OPTIONS groups = LinuxPerf.set_default_spaces(eval(opts.events), eval(opts.spaces)) - bench = LinuxPerf.make_bench_threaded(groups, threads = eval(opts.threads)) + bench = LinuxPerf.make_bench_threaded(groups; threads=eval(opts.threads)) return true catch return false diff --git a/src/trials.jl b/src/trials.jl index 9280bb9b..18a931de 100644 --- a/src/trials.jl +++ b/src/trials.jl @@ -11,16 +11,6 @@ mutable struct Trial linux_perf_stats::Union{LinuxPerf.Stats,Nothing} end -struct TrialContents{A,B} - time::Float64 - gctime::Float64 - memory::Int - allocs::Int - return_val::A - return_val_2::B - linux_perf_stats::Union{LinuxPerf.Stats,Nothing} -end - function Trial(params::Parameters) return Trial(params, Float64[], Float64[], typemax(Int), typemax(Int), nothing) end @@ -44,16 +34,11 @@ function Base.copy(t::Trial) ) end -function Base.push!(t::Trial, trial_contents::TrialContents) - time = trial_contents.time - gctime = trial_contents.gctime - memory = trial_contents.memory - allocs = trial_contents.allocs +function Base.push!(t::Trial, time, gctime, memory, allocs) push!(t.times, time) push!(t.gctimes, gctime) memory < t.memory && (t.memory = memory) allocs < t.allocs && (t.allocs = allocs) - t.linux_perf_stats = trial_contents.linux_perf_stats return t end @@ -65,17 +50,8 @@ end Base.length(t::Trial) = length(t.times) function Base.getindex(t::Trial, i::Number) - return push!( - Trial(t.params), - TrialContents( - t.times[i], - t.gctimes[i], - t.memory, - t.allocs, - nothing, - nothing, - t.linux_perf_stats, - ), + return Trial( + t.params, [t.times[i]], [t.gctimes[i]], t.memory, t.allocs, t.linux_perf_stats ) end function Base.getindex(t::Trial, i) diff --git a/test/TrialsTests.jl b/test/TrialsTests.jl index c666bf8e..3cd960d9 100644 --- a/test/TrialsTests.jl +++ b/test/TrialsTests.jl @@ -1,21 +1,21 @@ module TrialsTests using BenchmarkTools -using BenchmarkTools: TrialContents using Test ######### # Trial # ######### + trial1 = BenchmarkTools.Trial(BenchmarkTools.Parameters(; evals=2)) -push!(trial1, TrialContents(2.0, 1.0, 4, 5, nothing, nothing, nothing)) -push!(trial1, TrialContents(21.0, 0.0, 41, 51, nothing, nothing, nothing)) +push!(trial1, 2, 1, 4, 5) +push!(trial1, 21, 0, 41, 51) trial2 = BenchmarkTools.Trial(BenchmarkTools.Parameters(; time_tolerance=0.15)) -push!(trial2, TrialContents(21.0, 0.0, 41, 51, nothing, nothing, nothing)) -push!(trial2, TrialContents(2.0, 1.0, 4, 5, nothing, nothing, nothing)) +push!(trial2, 21, 0, 41, 51) +push!(trial2, 2, 1, 4, 5) -push!(trial2, TrialContents(21.0, 0.0, 41, 51, nothing, nothing, nothing)) 
+push!(trial2, 21, 0, 41, 51) @test length(trial2) == 3 deleteat!(trial2, 3) @test length(trial1) == length(trial2) == 2 @@ -33,10 +33,8 @@ trial2.params = trial1.params @test trial1 == trial2 -@test trial1[2] == push!( - BenchmarkTools.Trial(BenchmarkTools.Parameters(; evals=2)), - TrialContents(21.0, 0.0, 4, 5, nothing, nothing, nothing), -) +@test trial1[2] == + push!(BenchmarkTools.Trial(BenchmarkTools.Parameters(; evals=2)), 21, 0, 4, 5) @test trial1[1:end] == trial1 @test time(trial1) == time(trial2) == 2.0 @@ -63,11 +61,11 @@ rmskew!(trial3) randtrial = BenchmarkTools.Trial(BenchmarkTools.Parameters()) for _ in 1:40 - push!(randtrial, TrialContents(rand(1.0:20.0), 1.0, 1, 1, nothing, nothing, nothing)) + push!(randtrial, rand(1:20), 1, 1, 1) end while mean(randtrial) <= median(randtrial) - push!(randtrial, TrialContents(rand(10.0:20.0), 1.0, 1, 1, nothing, nothing, nothing)) + push!(randtrial, rand(10:20), 1, 1, 1) end rmskew!(randtrial)
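
A minimal usage sketch of the post-patch behaviour follows (illustrative only,
not part of the diff). It assumes `Parameters` carries the
`experimental_enable_linux_perf` flag that `_run` consults above, that the
public `run` wrapper still returns the `Trial` (internally `_run` now also
carries the benchmark's return value), and it uses `sum(rand(1000))` as a
stand-in workload.

    using BenchmarkTools

    b = @benchmarkable sum(rand(1000))
    tune!(b)

    # Enable the experimental perf integration on this benchmark's parameters;
    # _run reads this flag from the Parameters it is handed.
    b.params.experimental_enable_linux_perf = true

    # Timed samples are collected exactly as before; perf now wraps one extra
    # invocation after sampling instead of instrumenting every sample.
    trial = run(b)

    # Stats from that single perf-wrapped call, or `nothing` if perf could not
    # be started on this machine.
    trial.linux_perf_stats

    # The internal sample API changed to match: a Trial takes the four scalars
    # directly instead of a TrialContents wrapper (cf. test/TrialsTests.jl).
    t = BenchmarkTools.Trial(BenchmarkTools.Parameters())
    push!(t, 2.0, 1.0, 4, 5)  # time, gctime, memory, allocs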