From 301059ecc464ab13b629097eaaa677453f6d9618 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 22 May 2024 16:13:32 +0200 Subject: [PATCH 01/22] Fix wrapping for GPU arrays --- src/general/semidiscretization.jl | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 6a1d703b2..28dffc9ce 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -350,7 +350,7 @@ end # We have to pass `system` here for type stability, # since the type of `system` determines the return type. -@inline function wrap_u(u_ode, system, semi) +@inline function wrap_u(u_ode::Array, system, semi) (; ranges_u) = semi range = ranges_u[system_indices(system, semi)] @@ -364,7 +364,7 @@ end (StaticInt(u_nvariables(system)), n_moving_particles(system))) end -@inline function wrap_v(v_ode, system, semi) +@inline function wrap_v(v_ode::Array, system, semi) (; ranges_v) = semi range = ranges_v[system_indices(system, semi)] @@ -375,6 +375,29 @@ end (StaticInt(v_nvariables(system)), n_moving_particles(system))) end +# For non-`Array`s (typically GPU arrays), just reshape. Calling the `PtrArray` code above +# for a `CuArray` yields another `CuArray` (instead of a `PtrArray`) and is 8 times slower +# with double the allocations. +@inline function wrap_u(u_ode, system, semi) + (; ranges_u) = semi + + range = ranges_u[system_indices(system, semi)] + + @boundscheck @assert length(range) == u_nvariables(system) * n_moving_particles(system) + + return reshape(view(u_ode, range), (u_nvariables(system), n_moving_particles(system))) +end + +@inline function wrap_v(v_ode, system, semi) + (; ranges_v) = semi + + range = ranges_v[system_indices(system, semi)] + + @boundscheck @assert length(range) == v_nvariables(system) * n_moving_particles(system) + + return reshape(view(v_ode, range), (v_nvariables(system), n_moving_particles(system))) +end + function calculate_dt(v_ode, u_ode, cfl_number, semi::Semidiscretization) (; systems) = semi From 917e0e59ff4ecdded83e2df49e3f197555e3809e Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 22 May 2024 16:36:55 +0200 Subject: [PATCH 02/22] Adapt `@threaded` macro to work with GPUs --- Project.toml | 4 +- src/TrixiParticles.jl | 6 ++- src/general/general.jl | 27 +----------- src/general/gpu.jl | 4 ++ src/general/system.jl | 22 ++++++++++ src/util.jl | 93 ++++++++++++++++++++++++++++++------------ 6 files changed, 102 insertions(+), 54 deletions(-) diff --git a/Project.toml b/Project.toml index ea1bc9aa5..6636a7a23 100644 --- a/Project.toml +++ b/Project.toml @@ -11,7 +11,9 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" DiffEqCallbacks = "459566f4-90b8-5000-8ac3-15dfb0a30def" FastPow = "c0e83750-1142-43a8-81cf-6c956b72b4d1" ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" +GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" MuladdMacro = "46d2c3a1-f734-5fdb-9937-b9b9aeba4221" PointNeighbors = "1c4d5385-0a27-49de-8e2c-43b175c8985c" @@ -35,6 +37,7 @@ FastPow = "0.1" ForwardDiff = "0.10" JSON = "0.21" MuladdMacro = "0.2" +PointNeighbors = "0.2" Polyester = "0.7.5" RecipesBase = "1" Reexport = "1" @@ -43,6 +46,5 @@ StaticArrays = "1" StrideArrays = "0.1" TimerOutputs = "0.5" TrixiBase = "0.1" -PointNeighbors = "0.2" WriteVTK = "1" julia = "1.9" diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl index f170e08d8..b3f01e103 100644 --- a/src/TrixiParticles.jl +++ b/src/TrixiParticles.jl @@ -9,7 +9,9 @@ using DataFrames: DataFrame using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect, PresetTimeCallback using FastPow: @fastpow using ForwardDiff: ForwardDiff +using GPUArrays: AbstractGPUArray using JSON: JSON +using KernelAbstractions: KernelAbstractions, @kernel using LinearAlgebra: norm, dot, I, tr, inv, pinv, det using MuladdMacro: @muladd using Polyester: Polyester, @batch @@ -26,7 +28,9 @@ using TrixiBase: trixi_include using PointNeighbors: PointNeighbors, for_particle_neighbor using WriteVTK: vtk_grid, MeshCell, VTKCellTypes, paraview_collection, vtk_save -# util needs to be first because of macro @trixi_timeit +# `util.jl` depends on the `GPUSystem` type defined in `system.jl` +include("general/system.jl") +# `util.jl` needs to be next because of the macros `@trixi_timeit` and `@threaded` include("util.jl") include("callbacks/callbacks.jl") include("general/general.jl") diff --git a/src/general/general.jl b/src/general/general.jl index dfe52e06a..cdc7de7da 100644 --- a/src/general/general.jl +++ b/src/general/general.jl @@ -1,33 +1,10 @@ -# Abstract supertype for all system types. We additionally store the type of the system's -# initial condition, which is `Nothing` when using KernelAbstractions.jl. -abstract type System{NDIMS, IC} end - -# When using KernelAbstractions.jl, the initial condition has been replaced by `nothing` -GPUSystem = System{NDIMS, Nothing} where {NDIMS} - -abstract type FluidSystem{NDIMS, IC} <: System{NDIMS, IC} end -timer_name(::FluidSystem) = "fluid" - -abstract type SolidSystem{NDIMS, IC} <: System{NDIMS, IC} end -timer_name(::SolidSystem) = "solid" - -abstract type BoundarySystem{NDIMS, IC} <: System{NDIMS, IC} end -timer_name(::BoundarySystem) = "boundary" - -@inline function set_zero!(du) - du .= zero(eltype(du)) - - return du -end - -# Note that `semidiscretization.jl` depends on the system types and has to be -# included later. +# Note that `system.jl` has already been included. +# `semidiscretization.jl` depends on the system types and has to be included later. # `density_calculators.jl` needs to be included before `corrections.jl`. include("density_calculators.jl") include("corrections.jl") include("smoothing_kernels.jl") include("initial_condition.jl") -include("system.jl") include("interpolation.jl") include("file_system.jl") include("custom_quantities.jl") diff --git a/src/general/gpu.jl b/src/general/gpu.jl index 377ea8b72..91b5fe32e 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -32,3 +32,7 @@ end function Adapt.adapt_structure(to::typeof(Array), range::UnitRange) return range end + +KernelAbstractions.get_backend(::PtrArray) = KernelAbstractions.CPU() +KernelAbstractions.get_backend(system::System) = KernelAbstractions.get_backend(system.mass) +KernelAbstractions.get_backend(system::BoundarySPHSystem) = KernelAbstractions.get_backend(system.coordinates) diff --git a/src/general/system.jl b/src/general/system.jl index a4c7754c4..4cd59c6ff 100644 --- a/src/general/system.jl +++ b/src/general/system.jl @@ -1,3 +1,25 @@ +# Abstract supertype for all system types. We additionally store the type of the system's +# initial condition, which is `Nothing` when using KernelAbstractions.jl. +abstract type System{NDIMS, IC} end + +# When using KernelAbstractions.jl, the initial condition has been replaced by `nothing` +GPUSystem = System{NDIMS, Nothing} where {NDIMS} + +abstract type FluidSystem{NDIMS, IC} <: System{NDIMS, IC} end +timer_name(::FluidSystem) = "fluid" + +abstract type SolidSystem{NDIMS, IC} <: System{NDIMS, IC} end +timer_name(::SolidSystem) = "solid" + +abstract type BoundarySystem{NDIMS, IC} <: System{NDIMS, IC} end +timer_name(::BoundarySystem) = "boundary" + +@inline function set_zero!(du) + du .= zero(eltype(du)) + + return du +end + initialize!(system, neighborhood_search) = system @inline Base.ndims(::System{NDIMS}) where {NDIMS} = NDIMS diff --git a/src/util.jl b/src/util.jl index 2dc86a267..81b9354a6 100644 --- a/src/util.jl +++ b/src/util.jl @@ -89,35 +89,74 @@ and Copied from [Trixi.jl](https://github.com/trixi-framework/Trixi.jl). """ -macro threaded(expr) - # Use `esc(quote ... end)` for nested macro calls as suggested in - # https://github.com/JuliaLang/julia/issues/23221 - # - # The following code is a simple version using only `Threads.@threads` from the - # standard library with an additional check whether only a single thread is used - # to reduce some overhead (and allocations) for serial execution. - # - # return esc(quote - # let - # if Threads.nthreads() == 1 - # $(expr) - # else - # Threads.@threads $(expr) - # end - # end - # end) - # - # However, the code below using `@batch` from Polyester.jl is more efficient, - # since this packages provides threads with less overhead. Since it is written - # by Chris Elrod, the author of LoopVectorization.jl, we expect this package - # to provide the most efficient and useful implementation of threads (as we use - # them) available in Julia. - # !!! danger "Heisenbug" - # Look at the comments for `wrap_array` when considering to change this macro. +# macro threaded(expr) +# # Use `esc(quote ... end)` for nested macro calls as suggested in +# # https://github.com/JuliaLang/julia/issues/23221 +# # +# # The following code is a simple version using only `Threads.@threads` from the +# # standard library with an additional check whether only a single thread is used +# # to reduce some overhead (and allocations) for serial execution. +# # +# # return esc(quote +# # let +# # if Threads.nthreads() == 1 +# # $(expr) +# # else +# # Threads.@threads $(expr) +# # end +# # end +# # end) +# # +# # However, the code below using `@batch` from Polyester.jl is more efficient, +# # since this packages provides threads with less overhead. Since it is written +# # by Chris Elrod, the author of LoopVectorization.jl, we expect this package +# # to provide the most efficient and useful implementation of threads (as we use +# # them) available in Julia. +# # !!! danger "Heisenbug" +# # Look at the comments for `wrap_array` when considering to change this macro. + +# return esc(quote +# TrixiParticles.@batch $(expr) +# end) +# end + +@inline function parallel_for(f, system, iterator) + Polyester.@batch for i in iterator + @inline f(i) + end +end + +@kernel function generic_kernel(f) + i = KernelAbstractions.@index(Global) + @inline f(i) +end + +@inline function parallel_for(f, system::Union{GPUSystem, AbstractGPUArray}, iterator) + indices = eachindex(iterator) + ndrange = length(indices) + + # Skip empty iterations + ndrange == 0 && return + + backend = KernelAbstractions.get_backend(system) + + generic_kernel(backend)(ndrange=ndrange) do i + @inline f(iterator[indices[i]]) + end + + KernelAbstractions.synchronize(backend) +end + +macro threaded(system, expr) + iterator = expr.args[1].args[2] + i = expr.args[1].args[1] + inner_loop = expr.args[2] return esc(quote - TrixiParticles.@batch $(expr) - end) + TrixiParticles.parallel_for($system, $iterator) do $i + $inner_loop + end + end) end @doc raw""" From 19b7806bc4d0ef7122d2616896c2a55a33bacee7 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 22 May 2024 16:39:52 +0200 Subject: [PATCH 03/22] Use new `@threaded` macro --- src/general/corrections.jl | 2 +- src/general/neighborhood_search.jl | 20 +++++++++++++++++++ src/general/semidiscretization.jl | 4 ++-- .../dummy_particles/dummy_particles.jl | 7 ++++--- src/schemes/boundary/system.jl | 2 +- .../fluid/weakly_compressible_sph/system.jl | 2 +- .../solid/total_lagrangian_sph/system.jl | 6 +++--- 7 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/general/corrections.jl b/src/general/corrections.jl index cf64f6d16..ee894fec4 100644 --- a/src/general/corrections.jl +++ b/src/general/corrections.jl @@ -440,7 +440,7 @@ function compute_gradient_correction_matrix!(corr_matrix::AbstractArray, system, end function correction_matrix_inversion_step!(corr_matrix, system) - @threaded for particle in eachparticle(system) + @threaded system for particle in eachparticle(system) L = extract_smatrix(corr_matrix, system, particle) # The matrix `L` only becomes singular when the particle and all neighbors diff --git a/src/general/neighborhood_search.jl b/src/general/neighborhood_search.jl index 0713f153d..1fcb4ca93 100644 --- a/src/general/neighborhood_search.jl +++ b/src/general/neighborhood_search.jl @@ -9,3 +9,23 @@ function PointNeighbors.for_particle_neighbor(f, system, neighbor_system, for_particle_neighbor(f, system_coords, neighbor_coords, neighborhood_search, particles=particles, parallel=parallel) end + +@inline function PointNeighbors.for_particle_neighbor(f, system_coords, neighbor_coords, + neighborhood_search, particles, parallel::Val{true}) + @threaded system_coords for particle in particles + PointNeighbors.for_particle_neighbor_inner(f, system_coords, neighbor_coords, neighborhood_search, + particle) + end + + return nothing +end + +@inline function PointNeighbors.for_particle_neighbor(f, system_coords, neighbor_coords, + neighborhood_search, particles, parallel::Val{false}) + for particle in particles + PointNeighbors.for_particle_neighbor_inner(f, system_coords, neighbor_coords, neighborhood_search, + particle) + end + + return nothing +end diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 28dffc9ce..daff2080e 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -414,7 +414,7 @@ function drift!(du_ode, v_ode, u_ode, semi, t) du = wrap_u(du_ode, system, semi) v = wrap_v(v_ode, system, semi) - @threaded for particle in each_moving_particle(system) + @threaded system for particle in each_moving_particle(system) # This can be dispatched per system add_velocity!(du, v, particle, system) end @@ -513,7 +513,7 @@ function add_source_terms!(dv_ode, v_ode, u_ode, semi) v = wrap_v(v_ode, system, semi) u = wrap_u(u_ode, system, semi) - @threaded for particle in each_moving_particle(system) + @threaded system for particle in each_moving_particle(system) # Dispatch by system type to exclude boundary systems add_acceleration!(dv, particle, system) add_source_terms_inner!(dv, v, u, particle, system, source_terms(system)) diff --git a/src/schemes/boundary/dummy_particles/dummy_particles.jl b/src/schemes/boundary/dummy_particles/dummy_particles.jl index 0f7b42696..3e3abcbd9 100644 --- a/src/schemes/boundary/dummy_particles/dummy_particles.jl +++ b/src/schemes/boundary/dummy_particles/dummy_particles.jl @@ -291,7 +291,7 @@ function compute_pressure!(boundary_model, ::Union{SummationDensity, ContinuityD # Limit pressure to be non-negative to avoid attractive forces between fluid and # boundary particles at free surfaces (sticking artifacts). - @threaded for particle in eachparticle(system) + @threaded system for particle in eachparticle(system) apply_state_equation!(boundary_model, particle_density(v, boundary_model, particle), particle) end @@ -346,14 +346,15 @@ function compute_pressure!(boundary_model, ::AdamiPressureExtrapolation, system_coords, neighbor_coords, v_neighbor_system, nhs) end - for particle in eachparticle(system) + + @threaded system for particle in eachparticle(system) # Limit pressure to be non-negative to avoid attractive forces between fluid and # boundary particles at free surfaces (sticking artifacts). pressure[particle] = max(pressure[particle], 0.0) end end - @trixi_timeit timer() "inverse state equation" @threaded for particle in eachparticle(system) + @trixi_timeit timer() "inverse state equation" @threaded system for particle in eachparticle(system) compute_adami_density!(boundary_model, system, system_coords, particle) end end diff --git a/src/schemes/boundary/system.jl b/src/schemes/boundary/system.jl index cade5fbe9..36699b0b5 100644 --- a/src/schemes/boundary/system.jl +++ b/src/schemes/boundary/system.jl @@ -208,7 +208,7 @@ function (movement::BoundaryMovement)(system, t) is_moving(t) || return system - @threaded for particle in moving_particles + @threaded system for particle in moving_particles pos_new = initial_coords(system, particle) + movement_function(t) vel = ForwardDiff.derivative(movement_function, t) acc = ForwardDiff.derivative(t_ -> ForwardDiff.derivative(movement_function, t_), t) diff --git a/src/schemes/fluid/weakly_compressible_sph/system.jl b/src/schemes/fluid/weakly_compressible_sph/system.jl index cb9a1da87..c838b174f 100644 --- a/src/schemes/fluid/weakly_compressible_sph/system.jl +++ b/src/schemes/fluid/weakly_compressible_sph/system.jl @@ -295,7 +295,7 @@ function reinit_density!(system, v, u, v_ode, u_ode, semi) end function compute_pressure!(system, v) - @threaded for particle in eachparticle(system) + @threaded system for particle in eachparticle(system) apply_state_equation!(system, particle_density(v, system, particle), particle) end end diff --git a/src/schemes/solid/total_lagrangian_sph/system.jl b/src/schemes/solid/total_lagrangian_sph/system.jl index 5b3021b1c..63d70c262 100644 --- a/src/schemes/solid/total_lagrangian_sph/system.jl +++ b/src/schemes/solid/total_lagrangian_sph/system.jl @@ -258,7 +258,7 @@ end calc_deformation_grad!(deformation_grad, neighborhood_search, system) - @threaded for particle in eachparticle(system) + @threaded system for particle in eachparticle(system) F_particle = deformation_gradient(system, particle) pk1_particle = pk1_stress_tensor(F_particle, system) pk1_particle_corrected = pk1_particle * correction_matrix(system, particle) @@ -395,7 +395,7 @@ end function von_mises_stress(system::TotalLagrangianSPHSystem) von_mises_stress_vector = zeros(eltype(system.pk1_corrected), nparticles(system)) - @threaded for particle in each_moving_particle(system) + @threaded system for particle in each_moving_particle(system) von_mises_stress_vector[particle] = von_mises_stress(system, particle) end @@ -428,7 +428,7 @@ function cauchy_stress(system::TotalLagrangianSPHSystem) cauchy_stress_tensors = zeros(eltype(system.pk1_corrected), NDIMS, NDIMS, nparticles(system)) - @threaded for particle in each_moving_particle(system) + @threaded system for particle in each_moving_particle(system) F = deformation_gradient(system, particle) J = det(F) P = pk1_corrected(system, particle) From 96440217b13360653ad61411988ae8500495e038 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 22 May 2024 16:49:09 +0200 Subject: [PATCH 04/22] Fix NHS to run GPU code with CPU arrays --- src/general/neighborhood_search.jl | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/src/general/neighborhood_search.jl b/src/general/neighborhood_search.jl index 1fcb4ca93..03f2523e8 100644 --- a/src/general/neighborhood_search.jl +++ b/src/general/neighborhood_search.jl @@ -10,22 +10,13 @@ function PointNeighbors.for_particle_neighbor(f, system, neighbor_system, particles=particles, parallel=parallel) end -@inline function PointNeighbors.for_particle_neighbor(f, system_coords, neighbor_coords, - neighborhood_search, particles, parallel::Val{true}) - @threaded system_coords for particle in particles - PointNeighbors.for_particle_neighbor_inner(f, system_coords, neighbor_coords, neighborhood_search, - particle) - end - - return nothing -end - -@inline function PointNeighbors.for_particle_neighbor(f, system_coords, neighbor_coords, - neighborhood_search, particles, parallel::Val{false}) - for particle in particles - PointNeighbors.for_particle_neighbor_inner(f, system_coords, neighbor_coords, neighborhood_search, - particle) +function PointNeighbors.for_particle_neighbor(f, system::GPUSystem, neighbor_system, + system_coords, neighbor_coords, + neighborhood_search; + particles=each_moving_particle(system), + parallel=true) + @threaded system for particle in particles + PointNeighbors.for_particle_neighbor_inner(f, system_coords, neighbor_coords, + neighborhood_search, particle) end - - return nothing end From 1ff91d3de8e64adb1255f4444cbfee807a36b05c Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 23 May 2024 17:23:24 +0200 Subject: [PATCH 05/22] Import `@index` from KernelAbstractions.jl For some reason it doesn't run on the CPU otherwise. --- src/TrixiParticles.jl | 2 +- src/util.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl index b3f01e103..ee5c15b00 100644 --- a/src/TrixiParticles.jl +++ b/src/TrixiParticles.jl @@ -11,7 +11,7 @@ using FastPow: @fastpow using ForwardDiff: ForwardDiff using GPUArrays: AbstractGPUArray using JSON: JSON -using KernelAbstractions: KernelAbstractions, @kernel +using KernelAbstractions: KernelAbstractions, @kernel, @index using LinearAlgebra: norm, dot, I, tr, inv, pinv, det using MuladdMacro: @muladd using Polyester: Polyester, @batch diff --git a/src/util.jl b/src/util.jl index 81b9354a6..3d116837d 100644 --- a/src/util.jl +++ b/src/util.jl @@ -127,7 +127,7 @@ Copied from [Trixi.jl](https://github.com/trixi-framework/Trixi.jl). end @kernel function generic_kernel(f) - i = KernelAbstractions.@index(Global) + i = @index(Global) @inline f(i) end From 7514ac5d47193c6c94c2d5114f96ee171ffda354 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 12 Jun 2024 11:07:57 +0200 Subject: [PATCH 06/22] Add proper docs for `@threaded` --- src/util.jl | 96 +++++++++++++++++++++-------------------------------- 1 file changed, 37 insertions(+), 59 deletions(-) diff --git a/src/util.jl b/src/util.jl index 3d116837d..a9f927035 100644 --- a/src/util.jl +++ b/src/util.jl @@ -69,77 +69,62 @@ macro trixi_timeit(timer_output, label, expr) end """ - @threaded for ... end + @threaded system for ... end +Run either a threaded CPU loop or launch a kernel on the GPU, depending on the type of `system`. Semantically the same as `Threads.@threads` when iterating over a `AbstractUnitRange` but without guarantee that the underlying implementation uses `Threads.@threads` or works for more general `for` loops. -In particular, there may be an additional check whether only one thread is used -to reduce the overhead of serial execution or the underlying threading capabilities -might be provided by other packages such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl). +The second argument must either be a particle system or an array from which can be derived +if the loop has to be run threaded on the CPU or launched as a kernel on the GPU. + +In particular, the underlying threading capabilities might be provided by other packages +such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl). !!! warn This macro does not necessarily work for general `for` loops. For example, it does not necessarily support general iterables such as `eachline(filename)`. +""" +macro threaded(system, expr) + # Reverse-engineer the for loop. + # `expr.args[1]` is the head of the for loop, like `i = eachindex(x)`. + # So, `expr.args[1].args[2]` is the iterator `eachindex(x)` + # and `expr.args[1].args[1]` is the loop variable `i`. + iterator = expr.args[1].args[2] + i = expr.args[1].args[1] + inner_loop = expr.args[2] -Some discussion can be found at -[https://discourse.julialang.org/t/overhead-of-threads-threads/53964](https://discourse.julialang.org/t/overhead-of-threads-threads/53964) -and -[https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435](https://discourse.julialang.org/t/threads-threads-with-one-thread-how-to-remove-the-overhead/58435). + # Assemble the for loop again as a call to `parallel_foreach`, using `$i` to use the + # same loop variable as used in the for loop. + return esc(quote + TrixiParticles.parallel_foreach($iterator, $system) do $i + $inner_loop + end + end) +end -Copied from [Trixi.jl](https://github.com/trixi-framework/Trixi.jl). -""" -# macro threaded(expr) -# # Use `esc(quote ... end)` for nested macro calls as suggested in -# # https://github.com/JuliaLang/julia/issues/23221 -# # -# # The following code is a simple version using only `Threads.@threads` from the -# # standard library with an additional check whether only a single thread is used -# # to reduce some overhead (and allocations) for serial execution. -# # -# # return esc(quote -# # let -# # if Threads.nthreads() == 1 -# # $(expr) -# # else -# # Threads.@threads $(expr) -# # end -# # end -# # end) -# # -# # However, the code below using `@batch` from Polyester.jl is more efficient, -# # since this packages provides threads with less overhead. Since it is written -# # by Chris Elrod, the author of LoopVectorization.jl, we expect this package -# # to provide the most efficient and useful implementation of threads (as we use -# # them) available in Julia. -# # !!! danger "Heisenbug" -# # Look at the comments for `wrap_array` when considering to change this macro. - -# return esc(quote -# TrixiParticles.@batch $(expr) -# end) -# end - -@inline function parallel_for(f, system, iterator) +# Use `Polyester.@batch` for low-overhead threading +@inline function parallel_foreach(f, iterator, system) Polyester.@batch for i in iterator @inline f(i) end end -@kernel function generic_kernel(f) - i = @index(Global) - @inline f(i) -end - -@inline function parallel_for(f, system::Union{GPUSystem, AbstractGPUArray}, iterator) +# On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl +@inline function parallel_foreach(f, iterator, system::Union{GPUSystem, AbstractGPUArray}) + # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)` + # and index with `iterator[eachindex(iterator)[i]]`. + # Note that this only works with vector-like iterators that support arbitrary indexing. indices = eachindex(iterator) ndrange = length(indices) - # Skip empty iterations + # Skip empty loops ndrange == 0 && return backend = KernelAbstractions.get_backend(system) + # Call the generic kernel that is defined below, which only calls a function with + # the global GPU index. generic_kernel(backend)(ndrange=ndrange) do i @inline f(iterator[indices[i]]) end @@ -147,16 +132,9 @@ end KernelAbstractions.synchronize(backend) end -macro threaded(system, expr) - iterator = expr.args[1].args[2] - i = expr.args[1].args[1] - inner_loop = expr.args[2] - - return esc(quote - TrixiParticles.parallel_for($system, $iterator) do $i - $inner_loop - end - end) +@kernel function generic_kernel(f) + i = @index(Global) + @inline f(i) end @doc raw""" From eb4903b6d8b90515897d1aeb95638b3fa93471e9 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 12 Jun 2024 11:18:04 +0200 Subject: [PATCH 07/22] Fix `foreach_neighbor` --- Project.toml | 2 +- src/general/neighborhood_search.jl | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Project.toml b/Project.toml index 6636a7a23..6db9fefe3 100644 --- a/Project.toml +++ b/Project.toml @@ -37,7 +37,7 @@ FastPow = "0.1" ForwardDiff = "0.10" JSON = "0.21" MuladdMacro = "0.2" -PointNeighbors = "0.2" +PointNeighbors = "0.2.3" Polyester = "0.7.5" RecipesBase = "1" Reexport = "1" diff --git a/src/general/neighborhood_search.jl b/src/general/neighborhood_search.jl index 03f2523e8..afcd2c9b3 100644 --- a/src/general/neighborhood_search.jl +++ b/src/general/neighborhood_search.jl @@ -16,7 +16,7 @@ function PointNeighbors.for_particle_neighbor(f, system::GPUSystem, neighbor_sys particles=each_moving_particle(system), parallel=true) @threaded system for particle in particles - PointNeighbors.for_particle_neighbor_inner(f, system_coords, neighbor_coords, - neighborhood_search, particle) + PointNeighbors.foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search, particle) end end From bafccabdbaa241945c0069c1e4b96ec1288900df Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 12 Jun 2024 11:18:54 +0200 Subject: [PATCH 08/22] Reformat --- src/general/gpu.jl | 5 ++++- src/util.jl | 8 ++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/general/gpu.jl b/src/general/gpu.jl index 91b5fe32e..6e7a8186f 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -35,4 +35,7 @@ end KernelAbstractions.get_backend(::PtrArray) = KernelAbstractions.CPU() KernelAbstractions.get_backend(system::System) = KernelAbstractions.get_backend(system.mass) -KernelAbstractions.get_backend(system::BoundarySPHSystem) = KernelAbstractions.get_backend(system.coordinates) + +function KernelAbstractions.get_backend(system::BoundarySPHSystem) + KernelAbstractions.get_backend(system.coordinates) +end diff --git a/src/util.jl b/src/util.jl index a9f927035..6ea055c89 100644 --- a/src/util.jl +++ b/src/util.jl @@ -97,10 +97,10 @@ macro threaded(system, expr) # Assemble the for loop again as a call to `parallel_foreach`, using `$i` to use the # same loop variable as used in the for loop. return esc(quote - TrixiParticles.parallel_foreach($iterator, $system) do $i - $inner_loop - end - end) + TrixiParticles.parallel_foreach($iterator, $system) do $i + $inner_loop + end + end) end # Use `Polyester.@batch` for low-overhead threading From e7b6b6ca065eff1efbe2a8f5f46e699a8639f440 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 12 Jun 2024 11:33:35 +0200 Subject: [PATCH 09/22] Add test for GPU code on the CPU --- examples/fluid/dam_break_2d.jl | 2 +- test/examples/examples.jl | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/examples/fluid/dam_break_2d.jl b/examples/fluid/dam_break_2d.jl index e58a7cc06..22911cced 100644 --- a/examples/fluid/dam_break_2d.jl +++ b/examples/fluid/dam_break_2d.jl @@ -74,7 +74,7 @@ boundary_system = BoundarySPHSystem(tank.boundary, boundary_model, adhesion_coef # ========================================================================================== # ==== Simulation semi = Semidiscretization(fluid_system, boundary_system, threaded_nhs_update=true) -ode = semidiscretize(semi, tspan) +ode = semidiscretize(semi, tspan, data_type=nothing) info_callback = InfoCallback(interval=100) diff --git a/test/examples/examples.jl b/test/examples/examples.jl index 0f0032607..f9506a248 100644 --- a/test/examples/examples.jl +++ b/test/examples/examples.jl @@ -87,6 +87,19 @@ @test count_rhs_allocations(sol, semi) == 0 end + @trixi_testset "fluid/dam_break_2d.jl with KernelAbstractions.jl" begin + # Emulate the GPU code on the CPU by passing `data_type = Array` + @test_nowarn_mod trixi_include(@__MODULE__, + joinpath(examples_dir(), "fluid", + "dam_break_2d.jl"), tspan=(0.0, 0.1), + data_type=Array) [ + r"┌ Info: The desired tank length in y-direction .*\n", + r"└ New tank length in y-direction.*\n", + ] + @test sol.retcode == ReturnCode.Success + @test count_rhs_allocations(sol, semi) == 0 + end + @trixi_testset "fluid/dam_break_3d.jl" begin @test_nowarn_mod trixi_include(@__MODULE__, joinpath(examples_dir(), "fluid", From f10246f36ddae4d41c3da786c64a6a126a831762 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 12 Jun 2024 11:40:49 +0200 Subject: [PATCH 10/22] Fix unit tests --- test/general/semidiscretization.jl | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/general/semidiscretization.jl b/test/general/semidiscretization.jl index 633a0e240..953efb590 100644 --- a/test/general/semidiscretization.jl +++ b/test/general/semidiscretization.jl @@ -1,9 +1,15 @@ # Use `@trixi_testset` to isolate the mock functions in a separate namespace @trixi_testset "Semidiscretization" begin - # Mock systems - struct System1 <: TrixiParticles.System{3, Nothing} end + # Mock systems. `System1` will use the CPU backend, `System2` is a `GPUSystem`, using + # the GPU backend (emulated on the CPU). + struct System1 <: TrixiParticles.System{3, String} end struct System2 <: TrixiParticles.System{3, Nothing} end + # `System2` has no field `mass`, so we have to manually define the backend + function TrixiParticles.KernelAbstractions.get_backend(::System2) + return TrixiParticles.KernelAbstractions.CPU() + end + system1 = System1() system2 = System2() From c6103d9f07301a37f8a1988c5d361b6cc90db46a Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 12 Jun 2024 12:10:45 +0200 Subject: [PATCH 11/22] Add unit tests for GPU interaction --- .../schemes/solid/total_lagrangian_sph/rhs.jl | 57 +++++++++++-------- 1 file changed, 34 insertions(+), 23 deletions(-) diff --git a/test/schemes/solid/total_lagrangian_sph/rhs.jl b/test/schemes/solid/total_lagrangian_sph/rhs.jl index dfc9fc234..ac4c71cec 100644 --- a/test/schemes/solid/total_lagrangian_sph/rhs.jl +++ b/test/schemes/solid/total_lagrangian_sph/rhs.jl @@ -32,7 +32,7 @@ [0.0, 0.0], ] - @testset "Test $i" for i in 1:4 + @testset verbose=true "Test $i" for i in 1:4 #### Setup each_moving_particle = [particle[i]] # Only calculate dv for this one particle eachparticle = [particle[i], neighbor[i]] @@ -53,18 +53,27 @@ kernel_deriv = 1.0 #### Mocking - # Mock the system - system = Val{:mock_system_interact}() - TrixiParticles.ndims(::Val{:mock_system_interact}) = 2 - Base.ntuple(f, ::Symbol) = ntuple(f, 2) # Make `extract_svector` work + # Mock a CPU system to test CPU code + struct MockSystemInteractCPU <: TrixiParticles.System{2, String} end + system = MockSystemInteractCPU() - function TrixiParticles.initial_coordinates(::Val{:mock_system_interact}) + # Mock a GPU system to emulate GPU code on the CPU + struct MockSystemInteractGPU <: TrixiParticles.System{2, Nothing} end + system_gpu = MockSystemInteractGPU() + + function TrixiParticles.KernelAbstractions.get_backend(::MockSystemInteractGPU) + return TrixiParticles.KernelAbstractions.CPU() + end + + MockSystemType = Union{MockSystemInteractCPU, MockSystemInteractGPU} + + function TrixiParticles.initial_coordinates(::MockSystemType) return initial_coordinates end # Unpack calls should return predefined values or # another mock object of the type Val{:mock_property_name}. - function Base.getproperty(::Val{:mock_system_interact}, f::Symbol) + function Base.getproperty(::MockSystemType, f::Symbol) if f === :current_coordinates return current_coordinates elseif f === :material_density @@ -81,10 +90,13 @@ return Val(Symbol("mock_" * string(f))) end - function TrixiParticles.each_moving_particle(::Val{:mock_system_interact}) + function TrixiParticles.each_moving_particle(::MockSystemType) each_moving_particle end - TrixiParticles.eachparticle(::Val{:mock_system_interact}) = eachparticle + TrixiParticles.eachparticle(::MockSystemType) = eachparticle + + # Mock the neighborhood search + nhs = Val{:nhs}() TrixiParticles.PointNeighbors.eachneighbor(_, ::Val{:nhs}) = eachneighbor function Base.getproperty(::Val{:nhs}, f::Symbol) @@ -99,26 +111,25 @@ end TrixiParticles.ndims(::Val{:nhs}) = 2 - function TrixiParticles.pk1_corrected(::Val{:mock_system_dv}, particle_) - if particle_ == particle[i] - return pk1_particle_corrected[i] - end - return pk1_neighbor_corrected[i] - end - - function TrixiParticles.add_acceleration!(_, _, ::Val{:mock_system_interact}) + function TrixiParticles.add_acceleration!(_, _, ::MockSystemType) nothing end TrixiParticles.kernel_deriv(::Val{:mock_smoothing_kernel}, _, _) = kernel_deriv #### Verification - dv = zeros(ndims(system), 10) - dv_expected = copy(dv) - dv_expected[:, particle[i]] = dv_particle_expected[i] + systems = [system, system_gpu] + names = ["CPU code", "Emulate GPU"] + @testset "$(names[j])" for j in eachindex(names) + system_ = systems[j] - TrixiParticles.interact_solid_solid!(dv, Val(:nhs), system, system) + dv = zeros(ndims(system_), 10) + dv_expected = copy(dv) + dv_expected[:, particle[i]] = dv_particle_expected[i] - @test dv ≈ dv_expected + TrixiParticles.interact_solid_solid!(dv, nhs, system_, system_) + + @test dv ≈ dv_expected + end end end @@ -194,4 +205,4 @@ rtol=sqrt(eps()), atol=sqrt(eps())) end end -end +end; From b1167311d05b9cb083729a24c5f8d0eed4a3af67 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 12 Jun 2024 13:11:37 +0200 Subject: [PATCH 12/22] Also add tests for GPU versions of `wrap_v` and `wrap_u` --- src/general/gpu.jl | 1 + .../solid/total_lagrangian_sph/system.jl | 85 +++++++++---------- .../schemes/solid/total_lagrangian_sph/rhs.jl | 52 +++++++++--- 3 files changed, 81 insertions(+), 57 deletions(-) diff --git a/src/general/gpu.jl b/src/general/gpu.jl index 6e7a8186f..8aa481cff 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -13,6 +13,7 @@ Adapt.@adapt_structure DensityDiffusionAntuono Adapt.@adapt_structure BoundarySPHSystem Adapt.@adapt_structure BoundaryModelDummyParticles Adapt.@adapt_structure BoundaryModelMonaghanKajtar +Adapt.@adapt_structure TotalLagrangianSPHSystem # The initial conditions are only used for initialization, which happens before `adapt`ing # the semidiscretization, so we don't need to store `InitialCondition`s on the GPU. diff --git a/src/schemes/solid/total_lagrangian_sph/system.jl b/src/schemes/solid/total_lagrangian_sph/system.jl index 63d70c262..5a1c04724 100644 --- a/src/schemes/solid/total_lagrangian_sph/system.jl +++ b/src/schemes/solid/total_lagrangian_sph/system.jl @@ -69,55 +69,50 @@ struct TotalLagrangianSPHSystem{BM, NDIMS, ELTYPE <: Real, IC, ARRAY1D, ARRAY2D, boundary_model :: BM penalty_force :: PF source_terms :: ST +end - function TotalLagrangianSPHSystem(initial_condition, - smoothing_kernel, smoothing_length, - young_modulus, poisson_ratio; - n_fixed_particles=0, boundary_model=nothing, - acceleration=ntuple(_ -> 0.0, - ndims(smoothing_kernel)), - penalty_force=nothing, source_terms=nothing) - NDIMS = ndims(initial_condition) - ELTYPE = eltype(initial_condition) - n_particles = nparticles(initial_condition) - - if ndims(smoothing_kernel) != NDIMS - throw(ArgumentError("smoothing kernel dimensionality must be $NDIMS for a $(NDIMS)D problem")) - end +function TotalLagrangianSPHSystem(initial_condition, + smoothing_kernel, smoothing_length, + young_modulus, poisson_ratio; + n_fixed_particles=0, boundary_model=nothing, + acceleration=ntuple(_ -> 0.0, + ndims(smoothing_kernel)), + penalty_force=nothing, source_terms=nothing) + NDIMS = ndims(initial_condition) + ELTYPE = eltype(initial_condition) + n_particles = nparticles(initial_condition) - # Make acceleration an SVector - acceleration_ = SVector(acceleration...) - if length(acceleration_) != NDIMS - throw(ArgumentError("`acceleration` must be of length $NDIMS for a $(NDIMS)D problem")) - end + if ndims(smoothing_kernel) != NDIMS + throw(ArgumentError("smoothing kernel dimensionality must be $NDIMS for a $(NDIMS)D problem")) + end - initial_coordinates = copy(initial_condition.coordinates) - current_coordinates = copy(initial_condition.coordinates) - mass = copy(initial_condition.mass) - material_density = copy(initial_condition.density) - correction_matrix = Array{ELTYPE, 3}(undef, NDIMS, NDIMS, n_particles) - pk1_corrected = Array{ELTYPE, 3}(undef, NDIMS, NDIMS, n_particles) - deformation_grad = Array{ELTYPE, 3}(undef, NDIMS, NDIMS, n_particles) - - n_moving_particles = n_particles - n_fixed_particles - - lame_lambda = young_modulus * poisson_ratio / - ((1 + poisson_ratio) * (1 - 2 * poisson_ratio)) - lame_mu = 0.5 * young_modulus / (1 + poisson_ratio) - - return new{typeof(boundary_model), NDIMS, ELTYPE, - typeof(initial_condition), - typeof(mass), typeof(initial_coordinates), - typeof(deformation_grad), typeof(smoothing_kernel), - typeof(penalty_force), - typeof(source_terms)}(initial_condition, initial_coordinates, - current_coordinates, mass, correction_matrix, - pk1_corrected, deformation_grad, material_density, - n_moving_particles, young_modulus, poisson_ratio, - lame_lambda, lame_mu, smoothing_kernel, - smoothing_length, acceleration_, boundary_model, - penalty_force, source_terms) + # Make acceleration an SVector + acceleration_ = SVector(acceleration...) + if length(acceleration_) != NDIMS + throw(ArgumentError("`acceleration` must be of length $NDIMS for a $(NDIMS)D problem")) end + + initial_coordinates = copy(initial_condition.coordinates) + current_coordinates = copy(initial_condition.coordinates) + mass = copy(initial_condition.mass) + material_density = copy(initial_condition.density) + correction_matrix = Array{ELTYPE, 3}(undef, NDIMS, NDIMS, n_particles) + pk1_corrected = Array{ELTYPE, 3}(undef, NDIMS, NDIMS, n_particles) + deformation_grad = Array{ELTYPE, 3}(undef, NDIMS, NDIMS, n_particles) + + n_moving_particles = n_particles - n_fixed_particles + + lame_lambda = young_modulus * poisson_ratio / + ((1 + poisson_ratio) * (1 - 2 * poisson_ratio)) + lame_mu = 0.5 * young_modulus / (1 + poisson_ratio) + + return TotalLagrangianSPHSystem(initial_condition, initial_coordinates, + current_coordinates, mass, correction_matrix, + pk1_corrected, deformation_grad, material_density, + n_moving_particles, young_modulus, poisson_ratio, + lame_lambda, lame_mu, smoothing_kernel, + smoothing_length, acceleration_, boundary_model, + penalty_force, source_terms) end function Base.show(io::IO, system::TotalLagrangianSPHSystem) diff --git a/test/schemes/solid/total_lagrangian_sph/rhs.jl b/test/schemes/solid/total_lagrangian_sph/rhs.jl index ac4c71cec..b64fb6c97 100644 --- a/test/schemes/solid/total_lagrangian_sph/rhs.jl +++ b/test/schemes/solid/total_lagrangian_sph/rhs.jl @@ -151,7 +151,7 @@ 10 / 1000^2 * 1.5400218087591082 * 324.67072684047224 * 1.224, 0.0, ]) - @testset "Deformation Function: $deformation" for deformation in keys(deformations) + @testset verbose=true "Deformation Function: $deformation" for deformation in keys(deformations) J = deformations[deformation] u = zeros(2, 81) v = zeros(2, 81) @@ -187,22 +187,50 @@ semi = Semidiscretization(system) tspan = (0.0, 1.0) - semidiscretize(semi, tspan) - # Apply the deformation matrix - for particle in axes(u, 2) - # Apply deformation - u[1:2, particle] = deformations[deformation](coordinates[:, particle]) + # To make the code below work + function TrixiParticles.PtrArray{Float64}(::UndefInitializer, length) + TrixiParticles.PtrArray(zeros(length)) end - #### Verification for the particle in the middle - particle = 41 + # We can pass the data type `Array` to convert all systems to `GPUSystem`s + # and emulate the GPU kernels on the GPU. + # But this doesn't test `wrap_v` and `wrap_u` for non-`Array` types. + # In order to test this as well, we need a different data type, so we also + # pass `PtrArray`. + names = ["CPU code", "GPU code with CPU wrapping", "GPU code with GPU wrapping"] + data_types = [nothing, Array, TrixiParticles.PtrArray] + @testset "$(names[i])" for i in eachindex(names) + data_type = data_types[i] + ode = semidiscretize(semi, tspan, data_type=data_type) + + # Apply the deformation matrix + for particle in axes(u, 2) + # Apply deformation + u[1:2, particle] = deformations[deformation](coordinates[:, particle]) + end + + v_ode = ode.u0.x[1] + if isnothing(data_type) + u_ode = vec(u) + else + u_ode = data_type(vec(u)) + end + + @test typeof(v_ode) == typeof(u_ode) + @test length(v_ode) == length(u_ode) - dv = zeros(ndims(system), 81) - TrixiParticles.kick!(dv, v, u, semi, 0.0) + #### Verification for the particle in the middle + particle = 41 - @test isapprox(dv[:, particle], dv_expected_41[deformation], - rtol=sqrt(eps()), atol=sqrt(eps())) + dv_ode = zero(v_ode) + TrixiParticles.kick!(dv_ode, v_ode, u_ode, ode.p, 0.0) + + dv = TrixiParticles.wrap_v(dv_ode, system, semi) + + @test isapprox(dv[:, particle], dv_expected_41[deformation], + rtol=sqrt(eps()), atol=sqrt(eps())) + end end end end; From 6c1417712eb170ebbe82748db27c0a399b7bc195 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 12 Jun 2024 13:35:05 +0200 Subject: [PATCH 13/22] Add kwarg `neighborhood_search` to dam break example --- examples/fluid/dam_break_2d.jl | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/fluid/dam_break_2d.jl b/examples/fluid/dam_break_2d.jl index 22911cced..a1eeb2f38 100644 --- a/examples/fluid/dam_break_2d.jl +++ b/examples/fluid/dam_break_2d.jl @@ -73,7 +73,9 @@ boundary_system = BoundarySPHSystem(tank.boundary, boundary_model, adhesion_coef # ========================================================================================== # ==== Simulation -semi = Semidiscretization(fluid_system, boundary_system, threaded_nhs_update=true) +semi = Semidiscretization(fluid_system, boundary_system, + neighborhood_search=GridNeighborhoodSearch, + threaded_nhs_update=true) ode = semidiscretize(semi, tspan, data_type=nothing) info_callback = InfoCallback(interval=100) From 4dab0bdb3f82c56c65469a1c318fc7fa86fdd900 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 12 Jun 2024 14:47:10 +0200 Subject: [PATCH 14/22] Fix tests --- examples/n_body/n_body_system.jl | 4 +++- test/count_allocations.jl | 16 ++++++++-------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/examples/n_body/n_body_system.jl b/examples/n_body/n_body_system.jl index caf40a62e..9dc84e00e 100644 --- a/examples/n_body/n_body_system.jl +++ b/examples/n_body/n_body_system.jl @@ -1,7 +1,9 @@ using TrixiParticles using LinearAlgebra -struct NBodySystem{NDIMS, ELTYPE <: Real} <: TrixiParticles.System{NDIMS, Nothing} +# The second type parameter of `System` can't be `Nothing`, or TrixiParticles will launch +# GPU kernel for `for_particle_neighbor` loops. +struct NBodySystem{NDIMS, ELTYPE <: Real} <: TrixiParticles.System{NDIMS, 0} initial_condition :: InitialCondition{ELTYPE} mass :: Array{ELTYPE, 1} # [particle] G :: ELTYPE diff --git a/test/count_allocations.jl b/test/count_allocations.jl index 3b5110ba1..ce923ed1b 100644 --- a/test/count_allocations.jl +++ b/test/count_allocations.jl @@ -16,14 +16,14 @@ function copy_semi_with_no_update_nhs(semi) neighborhood_searches) end -# Forward `for_particle_neighbor` to wrapped neighborhood search -@inline function TrixiParticles.for_particle_neighbor(f, system_coords, neighbor_coords, - neighborhood_search::NoUpdateNeighborhoodSearch; - particles=axes(system_coords, 2), - parallel=true) - TrixiParticles.for_particle_neighbor(f, system_coords, neighbor_coords, - neighborhood_search.nhs, - particles=particles, parallel=parallel) +# Forward `foreach_neighbor` to wrapped neighborhood search +@inline function TrixiParticles.PointNeighbors.foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search::NoUpdateNeighborhoodSearch, + particle; + search_radius=neighborhood_search.nhs.search_radius) + TrixiParticles.PointNeighbors.foreach_neighbor(f, system_coords, neighbor_coords, + neighborhood_search.nhs, particle, + search_radius=search_radius) end # No update From a4d8aef1afa47eb53cb91f2bca52c1417ec1f780 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 12 Jun 2024 14:48:56 +0200 Subject: [PATCH 15/22] Reformat --- test/count_allocations.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/count_allocations.jl b/test/count_allocations.jl index ce923ed1b..9d976ef65 100644 --- a/test/count_allocations.jl +++ b/test/count_allocations.jl @@ -17,7 +17,8 @@ function copy_semi_with_no_update_nhs(semi) end # Forward `foreach_neighbor` to wrapped neighborhood search -@inline function TrixiParticles.PointNeighbors.foreach_neighbor(f, system_coords, neighbor_coords, +@inline function TrixiParticles.PointNeighbors.foreach_neighbor(f, system_coords, + neighbor_coords, neighborhood_search::NoUpdateNeighborhoodSearch, particle; search_radius=neighborhood_search.nhs.search_radius) From 8e8efa87f2d4698dfe6bf837424947dd0fc46cbb Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 19 Jun 2024 10:55:45 +0200 Subject: [PATCH 16/22] Add function `wrap_array` --- src/general/semidiscretization.jl | 45 ++++++++++++------------------- 1 file changed, 17 insertions(+), 28 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index daff2080e..9c7d91472 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -350,20 +350,6 @@ end # We have to pass `system` here for type stability, # since the type of `system` determines the return type. -@inline function wrap_u(u_ode::Array, system, semi) - (; ranges_u) = semi - - range = ranges_u[system_indices(system, semi)] - - @boundscheck @assert length(range) == u_nvariables(system) * n_moving_particles(system) - - # This is a non-allocating version of: - # return unsafe_wrap(Array{eltype(u_ode), 2}, pointer(view(u_ode, range)), - # (u_nvariables(system), n_moving_particles(system))) - return PtrArray(pointer(view(u_ode, range)), - (StaticInt(u_nvariables(system)), n_moving_particles(system))) -end - @inline function wrap_v(v_ode::Array, system, semi) (; ranges_v) = semi @@ -371,31 +357,34 @@ end @boundscheck @assert length(range) == v_nvariables(system) * n_moving_particles(system) - return PtrArray(pointer(view(v_ode, range)), - (StaticInt(v_nvariables(system)), n_moving_particles(system))) + return wrap_array(v_ode, range, + (StaticInt(v_nvariables(system)), n_moving_particles(system))) end -# For non-`Array`s (typically GPU arrays), just reshape. Calling the `PtrArray` code above -# for a `CuArray` yields another `CuArray` (instead of a `PtrArray`) and is 8 times slower -# with double the allocations. -@inline function wrap_u(u_ode, system, semi) +@inline function wrap_u(u_ode::Array, system, semi) (; ranges_u) = semi range = ranges_u[system_indices(system, semi)] @boundscheck @assert length(range) == u_nvariables(system) * n_moving_particles(system) - return reshape(view(u_ode, range), (u_nvariables(system), n_moving_particles(system))) + return wrap_array(u_ode, range, + (StaticInt(u_nvariables(system)), n_moving_particles(system))) end -@inline function wrap_v(v_ode, system, semi) - (; ranges_v) = semi - - range = ranges_v[system_indices(system, semi)] - - @boundscheck @assert length(range) == v_nvariables(system) * n_moving_particles(system) +@inline function wrap_array(array::Array, range, size) + # This is a non-allocating version of: + # return unsafe_wrap(Array{eltype(array), 2}, pointer(view(array, range)), size) + return PtrArray(pointer(view(array, range)), size) +end - return reshape(view(v_ode, range), (v_nvariables(system), n_moving_particles(system))) +@inline function wrap_array(array, range, size) + # For non-`Array`s (typically GPU arrays), just reshape. Calling the `PtrArray` code + # above for a `CuArray` yields another `CuArray` (instead of a `PtrArray`) + # and is 8 times slower with double the allocations. + # + # Note that `size` might contain `StaticInt`s, so convert to `Int` first. + return reshape(view(array, range), Int.(size)) end function calculate_dt(v_ode, u_ode, cfl_number, semi::Semidiscretization) From 969cad2ed81b6a52ee706ebd3ba39b1c0c851168 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 20 Jun 2024 09:38:05 +0200 Subject: [PATCH 17/22] Fix merge --- src/general/system.jl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/general/system.jl b/src/general/system.jl index 4cd59c6ff..f43777720 100644 --- a/src/general/system.jl +++ b/src/general/system.jl @@ -7,12 +7,15 @@ GPUSystem = System{NDIMS, Nothing} where {NDIMS} abstract type FluidSystem{NDIMS, IC} <: System{NDIMS, IC} end timer_name(::FluidSystem) = "fluid" +vtkname(system::FluidSystem) = "fluid" abstract type SolidSystem{NDIMS, IC} <: System{NDIMS, IC} end timer_name(::SolidSystem) = "solid" +vtkname(system::SolidSystem) = "solid" abstract type BoundarySystem{NDIMS, IC} <: System{NDIMS, IC} end timer_name(::BoundarySystem) = "boundary" +vtkname(system::BoundarySystem) = "boundary" @inline function set_zero!(du) du .= zero(eltype(du)) From b05f045d47cb3056fc23014e4c368dfda3813ba4 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 20 Jun 2024 09:40:33 +0200 Subject: [PATCH 18/22] Fix open boundary code --- src/schemes/boundary/open_boundary/system.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/schemes/boundary/open_boundary/system.jl b/src/schemes/boundary/open_boundary/system.jl index 43c0a6ff1..b5950856e 100644 --- a/src/schemes/boundary/open_boundary/system.jl +++ b/src/schemes/boundary/open_boundary/system.jl @@ -171,7 +171,7 @@ end reference_velocity, reference_pressure, reference_density) = system # Update quantities based on the characteristic variables - @threaded for particle in each_moving_particle(system) + @threaded system for particle in each_moving_particle(system) particle_position = current_coords(u, system, particle) J1 = characteristics[1, particle] @@ -230,7 +230,7 @@ function evaluate_characteristics!(system, v, u, v_ode, u_ode, semi, t) # Thus, we compute the characteristics for the particles that are outside the influence # of fluid particles by using the average of the values of the previous time step. # See eq. 27 in Negi (2020) https://doi.org/10.1016/j.cma.2020.113119 - @threaded for particle in each_moving_particle(system) + @threaded system for particle in each_moving_particle(system) # Particle is outside of the influence of fluid particles if isapprox(volume[particle], 0.0) From 613ec7e924fb11cf31750ae88052877e0baffb77 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 20 Jun 2024 09:40:58 +0200 Subject: [PATCH 19/22] Implement suggestions --- src/util.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util.jl b/src/util.jl index d4bdc78dc..172f0a800 100644 --- a/src/util.jl +++ b/src/util.jl @@ -33,7 +33,7 @@ Run either a threaded CPU loop or launch a kernel on the GPU, depending on the t Semantically the same as `Threads.@threads` when iterating over a `AbstractUnitRange` but without guarantee that the underlying implementation uses `Threads.@threads` or works for more general `for` loops. -The second argument must either be a particle system or an array from which can be derived +The first argument must either be a particle system or an array from which can be derived if the loop has to be run threaded on the CPU or launched as a kernel on the GPU. In particular, the underlying threading capabilities might be provided by other packages From 97955364a5f5a7289d36b38a0b2aab0e178795e5 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 20 Jun 2024 09:42:49 +0200 Subject: [PATCH 20/22] Fix syntax error from merge --- src/schemes/solid/total_lagrangian_sph/system.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/src/schemes/solid/total_lagrangian_sph/system.jl b/src/schemes/solid/total_lagrangian_sph/system.jl index c0a912498..125bb5507 100644 --- a/src/schemes/solid/total_lagrangian_sph/system.jl +++ b/src/schemes/solid/total_lagrangian_sph/system.jl @@ -70,6 +70,7 @@ struct TotalLagrangianSPHSystem{BM, NDIMS, ELTYPE <: Real, IC, ARRAY1D, ARRAY2D, penalty_force :: PF source_terms :: ST buffer :: Nothing +end function TotalLagrangianSPHSystem(initial_condition, smoothing_kernel, smoothing_length, From 9313efce4312bc9bf94c26d3a4c8b8e47206a073 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 20 Jun 2024 09:51:18 +0200 Subject: [PATCH 21/22] Fix `wrap_u` and `wrap_v` for GPU array types --- src/general/semidiscretization.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index b9bb92d82..7f6c791db 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -368,7 +368,7 @@ end # We have to pass `system` here for type stability, # since the type of `system` determines the return type. -@inline function wrap_v(v_ode::Array, system, semi) +@inline function wrap_v(v_ode, system, semi) (; ranges_v) = semi range = ranges_v[system_indices(system, semi)] @@ -379,7 +379,7 @@ end (StaticInt(v_nvariables(system)), n_moving_particles(system))) end -@inline function wrap_u(u_ode::Array, system, semi) +@inline function wrap_u(u_ode, system, semi) (; ranges_u) = semi range = ranges_u[system_indices(system, semi)] From 2140a673b32d944f7861fd3b98be21d27946d8c2 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 20 Jun 2024 16:01:03 +0200 Subject: [PATCH 22/22] Fix constructor of `BoundarySPHSystem` --- src/schemes/boundary/system.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/schemes/boundary/system.jl b/src/schemes/boundary/system.jl index 75a544da5..f36d515ac 100644 --- a/src/schemes/boundary/system.jl +++ b/src/schemes/boundary/system.jl @@ -27,13 +27,13 @@ struct BoundarySPHSystem{BM, NDIMS, ELTYPE <: Real, IC, CO, M, IM, # This constructor is necessary for Adapt.jl to work with this struct. # See the comments in general/gpu.jl for more details. function BoundarySPHSystem(initial_condition, coordinates, boundary_model, movement, - ismoving, adhesion_coefficient, cache) + ismoving, adhesion_coefficient, cache, buffer) ELTYPE = eltype(coordinates) new{typeof(boundary_model), size(coordinates, 1), ELTYPE, typeof(initial_condition), typeof(coordinates), typeof(movement), typeof(ismoving), typeof(cache)}(initial_condition, coordinates, boundary_model, movement, - ismoving, adhesion_coefficient, cache, nothing) + ismoving, adhesion_coefficient, cache, buffer) end end @@ -54,7 +54,7 @@ function BoundarySPHSystem(initial_condition, model; movement=nothing, # Because of dispatches boundary model needs to be first! return BoundarySPHSystem(initial_condition, coordinates, model, movement, - ismoving, adhesion_coefficient, cache) + ismoving, adhesion_coefficient, cache, nothing) end """