Skip to content

Commit

Permalink
Add proper docs for @threaded
Browse files Browse the repository at this point in the history
  • Loading branch information
efaulhaber committed Jun 12, 2024
1 parent e967c5b commit 4a54721
Showing 1 changed file with 37 additions and 59 deletions.
96 changes: 37 additions & 59 deletions src/util.jl
Original file line number Diff line number Diff line change
Expand Up @@ -69,94 +69,72 @@ macro trixi_timeit(timer_output, label, expr)
end

"""
    @threaded system for ... end

Run either a threaded CPU loop or launch a kernel on the GPU, depending on the type
of `system`.

Semantically the same as `Threads.@threads` when iterating over an `AbstractUnitRange`,
but without guarantee that the underlying implementation uses `Threads.@threads`
or works for more general `for` loops.

The `system` argument must either be a particle system or an array, from which it can
be derived whether the loop has to be run threaded on the CPU or launched as a kernel
on the GPU.

In particular, the underlying threading capabilities might be provided by other packages
such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl).

!!! warning
    This macro does not necessarily work for general `for` loops. For example,
    it does not necessarily support general iterables such as `eachline(filename)`.
"""
macro threaded(system, expr)
    # Reverse-engineer the for loop.
    # `expr.args[1]` is the head of the for loop, like `i = eachindex(x)`.
    # So, `expr.args[1].args[2]` is the iterator `eachindex(x)`
    # and `expr.args[1].args[1]` is the loop variable `i`.
    iterator = expr.args[1].args[2]
    i = expr.args[1].args[1]
    inner_loop = expr.args[2]

    # Assemble the for loop again as a call to `parallel_foreach`, using `$i` to use the
    # same loop variable as used in the for loop.
    # Use `esc(quote ... end)` for nested macro calls as suggested in
    # https://github.com/JuliaLang/julia/issues/23221.
    return esc(quote
        TrixiParticles.parallel_foreach($iterator, $system) do $i
            $inner_loop
        end
    end)
end
# macro threaded(expr)
# # Use `esc(quote ... end)` for nested macro calls as suggested in
# # https://github.com/JuliaLang/julia/issues/23221
# #
# # The following code is a simple version using only `Threads.@threads` from the
# # standard library with an additional check whether only a single thread is used
# # to reduce some overhead (and allocations) for serial execution.
# #
# # return esc(quote
# # let
# # if Threads.nthreads() == 1
# # $(expr)
# # else
# # Threads.@threads $(expr)
# # end
# # end
# # end)
# #
# # However, the code below using `@batch` from Polyester.jl is more efficient,
# # since this package provides threads with less overhead. Since it is written
# # by Chris Elrod, the author of LoopVectorization.jl, we expect this package
# # to provide the most efficient and useful implementation of threads (as we use
# # them) available in Julia.
# # !!! danger "Heisenbug"
# # Look at the comments for `wrap_array` when considering to change this macro.

# return esc(quote
# TrixiParticles.@batch $(expr)
# end)
# end

# Use `Polyester.@batch` for low-overhead threading.
# This is the CPU fallback method: `system` is only used for dispatch, so that
# GPU systems/arrays can select the KernelAbstractions.jl method instead.
@inline function parallel_foreach(f, iterator, system)
    Polyester.@batch for i in iterator
        # `@inline` to avoid allocations of the closure `f`
        @inline f(i)
    end
end

# Generic KernelAbstractions.jl kernel: each GPU thread calls `f` with its own
# global index. Used by the GPU method of `parallel_foreach`.
@kernel function generic_kernel(f)
    i = @index(Global)
    @inline f(i)
end

# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl
@inline function parallel_foreach(f, iterator, system::Union{GPUSystem, AbstractGPUArray})
    # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
    # and index with `iterator[eachindex(iterator)[i]]`.
    # Note that this only works with vector-like iterators that support arbitrary indexing.
    indices = eachindex(iterator)
    ndrange = length(indices)

    # Skip empty loops (launching a kernel with an empty `ndrange` would error)
    ndrange == 0 && return

    # Launch the kernel on the backend that `system`'s data lives on
    backend = KernelAbstractions.get_backend(system)

    # Call the generic kernel, which only calls a function with the global GPU index.
    generic_kernel(backend)(ndrange=ndrange) do i
        @inline f(iterator[indices[i]])
    end

    # Kernel launches are asynchronous; wait for completion before returning
    KernelAbstractions.synchronize(backend)
end

# Dispatch the loop to `parallel_foreach`, which runs it either threaded on the CPU
# or as a GPU kernel, depending on the type of `system`.
macro threaded(system, expr)
    # Reverse-engineer the for loop:
    # `expr.args[1]` is the loop head (e.g. `i = eachindex(x)`), so
    # `expr.args[1].args[2]` is the iterator and `expr.args[1].args[1]` the loop variable.
    iterator = expr.args[1].args[2]
    i = expr.args[1].args[1]
    inner_loop = expr.args[2]

    # Call `parallel_foreach(f, iterator, system)` — note the argument order of the
    # implementation above; the previous `parallel_for($system, $iterator)` call was stale.
    return esc(quote
        TrixiParticles.parallel_foreach($iterator, $system) do $i
            $inner_loop
        end
    end)
end
# Generic KernelAbstractions.jl kernel: each GPU thread calls `f` with its own
# global index. NOTE(review): this duplicates the `generic_kernel` defined earlier
# in this file (diff residue) — only one definition should remain.
@kernel function generic_kernel(f)
    i = @index(Global)
    @inline f(i)
end

@doc raw"""
Expand Down

0 comments on commit 4a54721

Please sign in to comment.