diff --git a/src/util.jl b/src/util.jl
index 3d116837d..a9f927035 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -69,77 +69,62 @@ macro trixi_timeit(timer_output, label, expr)
 end
 
 """
-    @threaded for ... end
+    @threaded system for ... end
+
+Either run a threaded CPU loop or launch a kernel on the GPU, depending on the type of `system`.
 Semantically the same as `Threads.@threads` when iterating over a `AbstractUnitRange`
 but without guarantee that the underlying implementation uses `Threads.@threads`
 or works for more general `for` loops.
-In particular, there may be an additional check whether only one thread is used
-to reduce the overhead of serial execution or the underlying threading capabilities
-might be provided by other packages such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl).
+The first argument must either be a particle system or an array from which it can be derived
+whether the loop should be run threaded on the CPU or launched as a kernel on the GPU.
+
+In particular, the underlying threading capabilities might be provided by other packages
+such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl).
 
 !!! warn
     This macro does not necessarily work for general `for` loops. For example,
    it does not necessarily support general iterables such as `eachline(filename)`.
+"""
+macro threaded(system, expr)
+    # Reverse-engineer the for loop.
+    # `expr.args[1]` is the head of the for loop, like `i = eachindex(x)`.
+    # So, `expr.args[1].args[2]` is the iterator `eachindex(x)`
+    # and `expr.args[1].args[1]` is the loop variable `i`.
+    iterator = expr.args[1].args[2]
+    i = expr.args[1].args[1]
+    inner_loop = expr.args[2]
 
-Some discussion can be found at
-[https://discourse.julialang.org/t/overhead-of-threads-threads/53964](https://discourse.julialang.org/t/overhead-of-threads-threads/53964)
-and
-[https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435](https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435).
+    # Assemble the for loop again as a call to `parallel_foreach`, using `$i` to use the
+    # same loop variable as used in the for loop.
+    return esc(quote
+        TrixiParticles.parallel_foreach($iterator, $system) do $i
+            $inner_loop
+        end
+    end)
+end
 
-Copied from [Trixi.jl](https://github.com/trixi-framework/Trixi.jl).
-"""
-# macro threaded(expr)
-#     # Use `esc(quote ... end)` for nested macro calls as suggested in
-#     # https://github.com/JuliaLang/julia/issues/23221
-#     #
-#     # The following code is a simple version using only `Threads.@threads` from the
-#     # standard library with an additional check whether only a single thread is used
-#     # to reduce some overhead (and allocations) for serial execution.
-#     #
-#     # return esc(quote
-#     #     let
-#     #         if Threads.nthreads() == 1
-#     #             $(expr)
-#     #         else
-#     #             Threads.@threads $(expr)
-#     #         end
-#     #     end
-#     # end)
-#     #
-#     # However, the code below using `@batch` from Polyester.jl is more efficient,
-#     # since this packages provides threads with less overhead. Since it is written
-#     # by Chris Elrod, the author of LoopVectorization.jl, we expect this package
-#     # to provide the most efficient and useful implementation of threads (as we use
-#     # them) available in Julia.
-#     # !!! danger "Heisenbug"
-#     #     Look at the comments for `wrap_array` when considering to change this macro.
-
-#     return esc(quote
-#         TrixiParticles.@batch $(expr)
-#     end)
-# end
-
-@inline function parallel_for(f, system, iterator)
+# Use `Polyester.@batch` for low-overhead threading
+@inline function parallel_foreach(f, iterator, system)
     Polyester.@batch for i in iterator
         @inline f(i)
     end
 end
 
-@kernel function generic_kernel(f)
-    i = @index(Global)
-    @inline f(i)
-end
-
-@inline function parallel_for(f, system::Union{GPUSystem, AbstractGPUArray}, iterator)
+# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl
+@inline function parallel_foreach(f, iterator, system::Union{GPUSystem, AbstractGPUArray})
+    # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
+    # and index with `iterator[eachindex(iterator)[i]]`.
+    # Note that this only works with vector-like iterators that support arbitrary indexing.
     indices = eachindex(iterator)
     ndrange = length(indices)
 
-    # Skip empty iterations
+    # Skip empty loops
     ndrange == 0 && return
 
     backend = KernelAbstractions.get_backend(system)
 
+    # Call the generic kernel that is defined below, which only calls a function with
+    # the global GPU index.
     generic_kernel(backend)(ndrange=ndrange) do i
         @inline f(iterator[indices[i]])
     end
@@ -147,16 +132,9 @@ end
     KernelAbstractions.synchronize(backend)
 end
 
-macro threaded(system, expr)
-    iterator = expr.args[1].args[2]
-    i = expr.args[1].args[1]
-    inner_loop = expr.args[2]
-
-    return esc(quote
-        TrixiParticles.parallel_for($system, $iterator) do $i
-            $inner_loop
-        end
-    end)
+@kernel function generic_kernel(f)
+    i = @index(Global)
+    @inline f(i)
 end
 
 @doc raw"""
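For reference, a minimal usage sketch of the new macro (not part of the diff; `system`, `du` and `eachparticle` are assumed placeholder names here): the same loop body is either threaded on the CPU via `Polyester.@batch` or launched as a KernelAbstractions.jl kernel, depending on whether `system` is a `GPUSystem` or an `AbstractGPUArray`.

# Hypothetical usage sketch, assuming `system` is a particle system and `du` an array.
@threaded system for particle in eachparticle(system)
    du[particle] = 2 * du[particle]
end

# The macro roughly expands to the following call, with the loop body becoming `f`:
TrixiParticles.parallel_foreach(eachparticle(system), system) do particle
    du[particle] = 2 * du[particle]
end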