trixi-framework · svchb · Jun 21, 2024 · May 22, 2024 · May 22, 2024 · May 22, 2024
diff --git a/Project.toml b/Project.toml
@@ -11,7 +11,9 @@ Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
 DiffEqCallbacks = "459566f4-90b8-5000-8ac3-15dfb0a30def"
 FastPow = "c0e83750-1142-43a8-81cf-6c956b72b4d1"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
+GPUArrays = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
+KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 MuladdMacro = "46d2c3a1-f734-5fdb-9937-b9b9aeba4221"
 PointNeighbors = "1c4d5385-0a27-49de-8e2c-43b175c8985c"
@@ -35,6 +37,7 @@ FastPow = "0.1"
 ForwardDiff = "0.10"
 JSON = "0.21"
 MuladdMacro = "0.2"
+PointNeighbors = "0.2.3"
 Polyester = "0.7.5"
 RecipesBase = "1"
 Reexport = "1"
@@ -43,6 +46,5 @@ StaticArrays = "1"
 StrideArrays = "0.1"
 TimerOutputs = "0.5"
 TrixiBase = "0.1"
-PointNeighbors = "0.2"
 WriteVTK = "1"
 julia = "1.9"
diff --git a/examples/fluid/dam_break_2d.jl b/examples/fluid/dam_break_2d.jl
@@ -73,8 +73,10 @@ boundary_system = BoundarySPHSystem(tank.boundary, boundary_model, adhesion_coef
 
 # ==========================================================================================
 # ==== Simulation
-semi = Semidiscretization(fluid_system, boundary_system, threaded_nhs_update=true)
-ode = semidiscretize(semi, tspan)
+semi = Semidiscretization(fluid_system, boundary_system,
+                          neighborhood_search=GridNeighborhoodSearch,
+                          threaded_nhs_update=true)
+ode = semidiscretize(semi, tspan, data_type=nothing)
 
 info_callback = InfoCallback(interval=100)
 

diff --git a/examples/n_body/n_body_system.jl b/examples/n_body/n_body_system.jl
@@ -1,7 +1,9 @@
 using TrixiParticles
 using LinearAlgebra
 
-struct NBodySystem{NDIMS, ELTYPE <: Real} <: TrixiParticles.System{NDIMS, Nothing}
+# The second type parameter of `System` must not be `Nothing`, or TrixiParticles will
+# launch GPU kernels for the `for_particle_neighbor` loops.
+struct NBodySystem{NDIMS, ELTYPE <: Real} <: TrixiParticles.System{NDIMS, 0}
     initial_condition :: InitialCondition{ELTYPE}
     mass              :: Array{ELTYPE, 1} # [particle]
     G                 :: ELTYPE

diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl
@@ -9,7 +9,9 @@ using DataFrames: DataFrame
 using DiffEqCallbacks: PeriodicCallback, PeriodicCallbackAffect, PresetTimeCallback
 using FastPow: @fastpow
 using ForwardDiff: ForwardDiff
+using GPUArrays: AbstractGPUArray
 using JSON: JSON
+using KernelAbstractions: KernelAbstractions, @kernel, @index
 using LinearAlgebra: norm, dot, I, tr, inv, pinv, det
 using MuladdMacro: @muladd
 using Polyester: Polyester, @batch
@@ -26,7 +28,9 @@ using TrixiBase: trixi_include
 using PointNeighbors: PointNeighbors, for_particle_neighbor
 using WriteVTK: vtk_grid, MeshCell, VTKCellTypes, paraview_collection, vtk_save
 
-# util needs to be first because of macro @trixi_timeit
+# `util.jl` depends on the `GPUSystem` type defined in `system.jl`
+include("general/system.jl")
+# `util.jl` needs to be next because of the macros `@trixi_timeit` and `@threaded`
 include("util.jl")
 include("callbacks/callbacks.jl")
 include("general/general.jl")

diff --git a/src/general/corrections.jl b/src/general/corrections.jl
@@ -440,7 +440,7 @@ function compute_gradient_correction_matrix!(corr_matrix::AbstractArray, system,
 end
 
 function correction_matrix_inversion_step!(corr_matrix, system)
-    @threaded for particle in eachparticle(system)
+    @threaded system for particle in eachparticle(system)
         L = extract_smatrix(corr_matrix, system, particle)
 
         # The matrix `L` only becomes singular when the particle and all neighbors

diff --git a/src/general/general.jl b/src/general/general.jl
@@ -1,33 +1,10 @@
-# Abstract supertype for all system types. We additionally store the type of the system's
-# initial condition, which is `Nothing` when using KernelAbstractions.jl.
-abstract type System{NDIMS, IC} end
-
-# When using KernelAbstractions.jl, the initial condition has been replaced by `nothing`
-GPUSystem = System{NDIMS, Nothing} where {NDIMS}
-
-abstract type FluidSystem{NDIMS, IC} <: System{NDIMS, IC} end
-timer_name(::FluidSystem) = "fluid"
-
-abstract type SolidSystem{NDIMS, IC} <: System{NDIMS, IC} end
-timer_name(::SolidSystem) = "solid"
-
-abstract type BoundarySystem{NDIMS, IC} <: System{NDIMS, IC} end
-timer_name(::BoundarySystem) = "boundary"
-
-@inline function set_zero!(du)
-    du .= zero(eltype(du))
-
-    return du
-end
-
-# Note that `semidiscretization.jl` depends on the system types and has to be
-# included later.
+# Note that `system.jl` has already been included.
+# `semidiscretization.jl` depends on the system types and has to be included later.
 # `density_calculators.jl` needs to be included before `corrections.jl`.
 include("density_calculators.jl")
 include("corrections.jl")
 include("smoothing_kernels.jl")
 include("initial_condition.jl")
-include("system.jl")
 include("interpolation.jl")
 include("file_system.jl")
 include("custom_quantities.jl")

diff --git a/src/general/gpu.jl b/src/general/gpu.jl
@@ -13,6 +13,7 @@ Adapt.@adapt_structure DensityDiffusionAntuono
 Adapt.@adapt_structure BoundarySPHSystem
 Adapt.@adapt_structure BoundaryModelDummyParticles
 Adapt.@adapt_structure BoundaryModelMonaghanKajtar
+Adapt.@adapt_structure TotalLagrangianSPHSystem
 
 # The initial conditions are only used for initialization, which happens before `adapt`ing
 # the semidiscretization, so we don't need to store `InitialCondition`s on the GPU.
@@ -32,3 +33,10 @@ end
 function Adapt.adapt_structure(to::typeof(Array), range::UnitRange)
     return range
 end
+
+# `PtrArray`s are reported as living on the KernelAbstractions.jl CPU backend.
+# NOTE(review): neither `get_backend` nor `PtrArray` appears to be owned by this package,
+# which would make this method type piracy — consider moving it upstream.
+KernelAbstractions.get_backend(::PtrArray) = KernelAbstractions.CPU()
+# By default, the backend of a system is the backend of its `mass` array.
+KernelAbstractions.get_backend(system::System) = KernelAbstractions.get_backend(system.mass)
+
+# Boundary systems are queried through their `coordinates` array rather than `mass`.
+function KernelAbstractions.get_backend(system::BoundarySPHSystem)
+    coordinates = system.coordinates
+
+    return KernelAbstractions.get_backend(coordinates)
+end
diff --git a/src/general/neighborhood_search.jl b/src/general/neighborhood_search.jl
@@ -9,3 +9,14 @@ function PointNeighbors.for_particle_neighbor(f, system, neighbor_system,
     for_particle_neighbor(f, system_coords, neighbor_coords, neighborhood_search,
                           particles=particles, parallel=parallel)
 end
+
+# Method for systems that have been adapted for KernelAbstractions.jl (`GPUSystem`).
+# Instead of forwarding to the coordinate-based `for_particle_neighbor`, loop over
+# `particles` with `@threaded system` and visit the neighbors of each particle with
+# `PointNeighbors.foreach_neighbor`.
+# NOTE(review): the `parallel` keyword is accepted but not used in this method — confirm
+# that callers passing `parallel=false` do not rely on serial execution here.
+function PointNeighbors.for_particle_neighbor(f, system::GPUSystem, neighbor_system,
+                                              system_coords, neighbor_coords,
+                                              neighborhood_search;
+                                              particles=each_moving_particle(system),
+                                              parallel=true)
+    @threaded system for particle in particles
+        PointNeighbors.foreach_neighbor(f, system_coords, neighbor_coords,
+                                        neighborhood_search, particle)
+    end
+end
diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl
@@ -350,7 +350,7 @@ end
 
 # We have to pass `system` here for type stability,
 # since the type of `system` determines the return type.
-@inline function wrap_u(u_ode, system, semi)
+@inline function wrap_u(u_ode::Array, system, semi)
     (; ranges_u) = semi
 
     range = ranges_u[system_indices(system, semi)]
@@ -364,7 +364,7 @@ end
                     (StaticInt(u_nvariables(system)), n_moving_particles(system)))
 end
 
-@inline function wrap_v(v_ode, system, semi)
+@inline function wrap_v(v_ode::Array, system, semi)
     (; ranges_v) = semi
 
     range = ranges_v[system_indices(system, semi)]
@@ -375,6 +375,29 @@ end
                     (StaticInt(v_nvariables(system)), n_moving_particles(system)))
 end
 
+# For non-`Array`s (typically GPU arrays), just reshape. Calling the `PtrArray` code above
+# for a `CuArray` yields another `CuArray` (instead of a `PtrArray`) and is 8 times slower
+# with double the allocations.
+@inline function wrap_u(u_ode, system, semi)
+    # Index range of this system's entries inside the flat `u_ode` vector
+    range = semi.ranges_u[system_indices(system, semi)]
+    dims = (u_nvariables(system), n_moving_particles(system))
+
+    @boundscheck @assert length(range) == prod(dims)
+
+    # Reshaped view sharing memory with `u_ode`: one column per moving particle
+    return reshape(view(u_ode, range), dims)
+end
+
+@inline function wrap_v(v_ode, system, semi)
+    # Index range of this system's entries inside the flat `v_ode` vector
+    range = semi.ranges_v[system_indices(system, semi)]
+    dims = (v_nvariables(system), n_moving_particles(system))
+
+    @boundscheck @assert length(range) == prod(dims)
+
+    # Reshaped view sharing memory with `v_ode`: one column per moving particle
+    return reshape(view(v_ode, range), dims)
+end
+
 function calculate_dt(v_ode, u_ode, cfl_number, semi::Semidiscretization)
     (; systems) = semi
 
@@ -391,7 +414,7 @@ function drift!(du_ode, v_ode, u_ode, semi, t)
                 du = wrap_u(du_ode, system, semi)
                 v = wrap_v(v_ode, system, semi)
 
-                @threaded for particle in each_moving_particle(system)
+                @threaded system for particle in each_moving_particle(system)
                     # This can be dispatched per system
                     add_velocity!(du, v, particle, system)
                 end
@@ -490,7 +513,7 @@ function add_source_terms!(dv_ode, v_ode, u_ode, semi)
         v = wrap_v(v_ode, system, semi)
         u = wrap_u(u_ode, system, semi)
 
-        @threaded for particle in each_moving_particle(system)
+        @threaded system for particle in each_moving_particle(system)
             # Dispatch by system type to exclude boundary systems
             add_acceleration!(dv, particle, system)
             add_source_terms_inner!(dv, v, u, particle, system, source_terms(system))

diff --git a/src/general/system.jl b/src/general/system.jl
@@ -1,3 +1,25 @@
+# Abstract supertype for all system types. We additionally store the type of the system's
+# initial condition, which is `Nothing` when using KernelAbstractions.jl.
+abstract type System{NDIMS, IC} end
+
+# When using KernelAbstractions.jl, the initial condition has been replaced by `nothing`.
+# The alias is declared `const` so that it is a constant binding like any other type name
+# instead of an untyped, non-constant global.
+const GPUSystem = System{NDIMS, Nothing} where {NDIMS}
+
+# Abstract subtypes of `System` for the three system categories. Each category has a
+# `timer_name` method returning a short label string (presumably used to label timer
+# sections — confirm at the call sites).
+abstract type FluidSystem{NDIMS, IC} <: System{NDIMS, IC} end
+timer_name(::FluidSystem) = "fluid"
+
+abstract type SolidSystem{NDIMS, IC} <: System{NDIMS, IC} end
+timer_name(::SolidSystem) = "solid"
+
+abstract type BoundarySystem{NDIMS, IC} <: System{NDIMS, IC} end
+timer_name(::BoundarySystem) = "boundary"
+
+# Overwrite every entry of `du` with the zero of its element type and return `du`.
+@inline function set_zero!(du)
+    zero_value = zero(eltype(du))
+    du .= zero_value
+
+    return du
+end
+
 initialize!(system, neighborhood_search) = system
 
 @inline Base.ndims(::System{NDIMS}) where {NDIMS} = NDIMS

diff --git a/src/schemes/boundary/dummy_particles/dummy_particles.jl b/src/schemes/boundary/dummy_particles/dummy_particles.jl
@@ -291,7 +291,7 @@ function compute_pressure!(boundary_model, ::Union{SummationDensity, ContinuityD
 
     # Limit pressure to be non-negative to avoid attractive forces between fluid and
     # boundary particles at free surfaces (sticking artifacts).
-    @threaded for particle in eachparticle(system)
+    @threaded system for particle in eachparticle(system)
         apply_state_equation!(boundary_model, particle_density(v, boundary_model,
                                                                particle), particle)
     end
@@ -346,14 +346,15 @@ function compute_pressure!(boundary_model, ::AdamiPressureExtrapolation,
                                           system_coords, neighbor_coords,
                                           v_neighbor_system, nhs)
         end
-        for particle in eachparticle(system)
+
+        @threaded system for particle in eachparticle(system)
             # Limit pressure to be non-negative to avoid attractive forces between fluid and
             # boundary particles at free surfaces (sticking artifacts).
             pressure[particle] = max(pressure[particle], 0.0)
         end
     end
 
-    @trixi_timeit timer() "inverse state equation" @threaded for particle in eachparticle(system)
+    @trixi_timeit timer() "inverse state equation" @threaded system for particle in eachparticle(system)
         compute_adami_density!(boundary_model, system, system_coords, particle)
     end
 end

diff --git a/src/schemes/boundary/system.jl b/src/schemes/boundary/system.jl
@@ -208,7 +208,7 @@ function (movement::BoundaryMovement)(system, t)
 
     is_moving(t) || return system
 
-    @threaded for particle in moving_particles
+    @threaded system for particle in moving_particles
         pos_new = initial_coords(system, particle) + movement_function(t)
         vel = ForwardDiff.derivative(movement_function, t)
         acc = ForwardDiff.derivative(t_ -> ForwardDiff.derivative(movement_function, t_), t)

diff --git a/src/schemes/fluid/weakly_compressible_sph/system.jl b/src/schemes/fluid/weakly_compressible_sph/system.jl
@@ -295,7 +295,7 @@ function reinit_density!(system, v, u, v_ode, u_ode, semi)
 end
 
 function compute_pressure!(system, v)
-    @threaded for particle in eachparticle(system)
+    @threaded system for particle in eachparticle(system)
         apply_state_equation!(system, particle_density(v, system, particle), particle)
     end
 end