diff --git a/src/util.jl b/src/util.jl
index 3d116837d..a9f927035 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -69,77 +69,62 @@ macro trixi_timeit(timer_output, label, expr)
 end
 
 """
-    @threaded for ... end
+    @threaded system for ... end
+
+Either run a threaded CPU loop or launch a kernel on the GPU, depending on the type of `system`.
 Semantically the same as `Threads.@threads` when iterating over a `AbstractUnitRange`
 but without guarantee that the underlying implementation uses `Threads.@threads`
 or works for more general `for` loops.
-In particular, there may be an additional check whether only one thread is used
-to reduce the overhead of serial execution or the underlying threading capabilities
-might be provided by other packages such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl).
+The first argument must either be a particle system or an array from which it can be derived
+whether the loop should be run threaded on the CPU or launched as a kernel on the GPU.
+
+In particular, the underlying threading capabilities might be provided by other packages
+such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl).
 
 !!! warn
     This macro does not necessarily work for general `for` loops. For example,
    it does not necessarily support general iterables such as `eachline(filename)`.
+"""
+macro threaded(system, expr)
+    # Reverse-engineer the for loop.
+    # `expr.args[1]` is the head of the for loop, like `i = eachindex(x)`.
+    # So, `expr.args[1].args[2]` is the iterator `eachindex(x)`
+    # and `expr.args[1].args[1]` is the loop variable `i`.
+    iterator = expr.args[1].args[2]
+    i = expr.args[1].args[1]
+    inner_loop = expr.args[2]
 
-Some discussion can be found at
-[https://discourse.julialang.org/t/overhead-of-threads-threads/53964](https://discourse.julialang.org/t/overhead-of-threads-threads/53964)
-and
-[https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435](https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435).
+    # Assemble the for loop again as a call to `parallel_foreach`, using `$i` to use the
+    # same loop variable as used in the for loop.
+    return esc(quote
+        TrixiParticles.parallel_foreach($iterator, $system) do $i
+            $inner_loop
+        end
+    end)
+end
 
-Copied from [Trixi.jl](https://github.com/trixi-framework/Trixi.jl).
-"""
-# macro threaded(expr)
-#     # Use `esc(quote ... end)` for nested macro calls as suggested in
-#     # https://github.com/JuliaLang/julia/issues/23221
-#     #
-#     # The following code is a simple version using only `Threads.@threads` from the
-#     # standard library with an additional check whether only a single thread is used
-#     # to reduce some overhead (and allocations) for serial execution.
-#     #
-#     # return esc(quote
-#     #     let
-#     #         if Threads.nthreads() == 1
-#     #             $(expr)
-#     #         else
-#     #             Threads.@threads $(expr)
-#     #         end
-#     #     end
-#     # end)
-#     #
-#     # However, the code below using `@batch` from Polyester.jl is more efficient,
-#     # since this packages provides threads with less overhead. Since it is written
-#     # by Chris Elrod, the author of LoopVectorization.jl, we expect this package
-#     # to provide the most efficient and useful implementation of threads (as we use
-#     # them) available in Julia.
-#     # !!! danger "Heisenbug"
-#     #     Look at the comments for `wrap_array` when considering to change this macro.
-
-#     return esc(quote
-#         TrixiParticles.@batch $(expr)
-#     end)
-# end
-
-@inline function parallel_for(f, system, iterator)
+# Use `Polyester.@batch` for low-overhead threading
+@inline function parallel_foreach(f, iterator, system)
     Polyester.@batch for i in iterator
         @inline f(i)
     end
 end
 
-@kernel function generic_kernel(f)
-    i = @index(Global)
-    @inline f(i)
-end
-
-@inline function parallel_for(f, system::Union{GPUSystem, AbstractGPUArray}, iterator)
+# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl
+@inline function parallel_foreach(f, iterator, system::Union{GPUSystem, AbstractGPUArray})
+    # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
+    # and index with `iterator[eachindex(iterator)[i]]`.
+    # Note that this only works with vector-like iterators that support arbitrary indexing.
     indices = eachindex(iterator)
     ndrange = length(indices)
 
-    # Skip empty iterations
+    # Skip empty loops
     ndrange == 0 && return
 
     backend = KernelAbstractions.get_backend(system)
 
+    # Call the generic kernel that is defined below, which only calls a function with
+    # the global GPU index.
     generic_kernel(backend)(ndrange=ndrange) do i
         @inline f(iterator[indices[i]])
     end
@@ -147,16 +132,9 @@ end
     KernelAbstractions.synchronize(backend)
 end
 
-macro threaded(system, expr)
-    iterator = expr.args[1].args[2]
-    i = expr.args[1].args[1]
-    inner_loop = expr.args[2]
-
-    return esc(quote
-        TrixiParticles.parallel_for($system, $iterator) do $i
-            $inner_loop
-        end
-    end)
+@kernel function generic_kernel(f)
+    i = @index(Global)
+    @inline f(i)
 end
 
 @doc raw"""
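For reference, a minimal usage sketch of the new macro (not part of the diff; `system`, `du` and `eachparticle` are assumed placeholder names here): the same loop body is either threaded on the CPU via `Polyester.@batch` or launched as a KernelAbstractions.jl kernel, depending on whether `system` is a `GPUSystem` or an `AbstractGPUArray`.

# Hypothetical usage sketch, assuming `system` is a particle system and `du` an array.
@threaded system for particle in eachparticle(system)
    du[particle] = 2 * du[particle]
end

# The macro roughly expands to the following call, with the loop body becoming `f`:
TrixiParticles.parallel_foreach(eachparticle(system), system) do particle
    du[particle] = 2 * du[particle]
end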