Skip to content

Commit

Permalink
factorization
Browse files Browse the repository at this point in the history
  • Loading branch information
yhmtsai committed Oct 29, 2024
1 parent c1fcc4a commit b4efba5
Show file tree
Hide file tree
Showing 69 changed files with 359 additions and 238 deletions.
12 changes: 7 additions & 5 deletions common/cuda_hip/factorization/cholesky_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ void symbolic_factorize(
postorder, postorder_parent, out_row_ptrs, out_cols);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_CHOLESKY_SYMBOLIC_FACTORIZE);


Expand Down Expand Up @@ -312,7 +312,7 @@ void forest_from_factor(
build_children_from_parents(exec, forest);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_CHOLESKY_FOREST_FROM_FACTOR);


Expand Down Expand Up @@ -346,7 +346,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
transpose_idxs);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_INITIALIZE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_CHOLESKY_INITIALIZE);


template <typename ValueType, typename IndexType>
Expand All @@ -372,7 +373,8 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_CHOLESKY_FACTORIZE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_CHOLESKY_FACTORIZE);


template <typename ValueType, typename IndexType>
Expand Down Expand Up @@ -428,7 +430,7 @@ void symbolic_count(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_CHOLESKY_SYMBOLIC_COUNT);


Expand Down
10 changes: 5 additions & 5 deletions common/cuda_hip/factorization/factorization_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -441,7 +441,7 @@ void add_diagonal_elements(std::shared_ptr<const DefaultExecutor> exec,
mtx_builder.get_col_idx_array() = std::move(new_col_idx_array);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_FACTORIZATION_ADD_DIAGONAL_ELEMENTS_KERNEL);


Expand Down Expand Up @@ -471,7 +471,7 @@ void initialize_row_ptrs_l_u(
components::prefix_sum_nonnegative(exec, u_row_ptrs, num_rows + 1);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_U_KERNEL);


Expand All @@ -497,7 +497,7 @@ void initialize_l_u(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_FACTORIZATION_INITIALIZE_L_U_KERNEL);


Expand Down Expand Up @@ -525,7 +525,7 @@ void initialize_row_ptrs_l(
components::prefix_sum_nonnegative(exec, l_row_ptrs, num_rows + 1);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_FACTORIZATION_INITIALIZE_ROW_PTRS_L_KERNEL);


Expand All @@ -549,7 +549,7 @@ void initialize_l(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_FACTORIZATION_INITIALIZE_L_KERNEL);


Expand Down
3 changes: 2 additions & 1 deletion common/cuda_hip/factorization/ic_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,8 @@ void compute(std::shared_ptr<const DefaultExecutor> exec,
sparselib::destroy(desc);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_IC_COMPUTE_KERNEL);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_IC_COMPUTE_KERNEL);


} // namespace ic_factorization
Expand Down
2 changes: 1 addition & 1 deletion common/cuda_hip/factorization/ilu_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ void compute_lu(std::shared_ptr<const DefaultExecutor> exec,
sparselib::destroy(desc);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_ILU_COMPUTE_LU_KERNEL);


Expand Down
6 changes: 4 additions & 2 deletions common/cuda_hip/factorization/lu_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,8 @@ void initialize(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_INITIALIZE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_LU_INITIALIZE);


template <typename ValueType, typename IndexType>
Expand All @@ -268,7 +269,8 @@ void factorize(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(GKO_DECLARE_LU_FACTORIZE);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_LU_FACTORIZE);


template <typename IndexType>
Expand Down
29 changes: 19 additions & 10 deletions common/cuda_hip/factorization/par_ic_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -110,7 +110,7 @@ void init_factor(std::shared_ptr<const DefaultExecutor> exec,
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_PAR_IC_INIT_FACTOR_KERNEL);


Expand All @@ -123,19 +123,28 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
auto nnz = l->get_num_stored_elements();
auto num_blocks = ceildiv(nnz, default_block_size);
if (num_blocks > 0) {
for (size_type i = 0; i < iterations; ++i) {
kernel::ic_sweep<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
a_lower->get_const_row_idxs(), a_lower->get_const_col_idxs(),
as_device_type(a_lower->get_const_values()),
l->get_const_row_ptrs(), l->get_const_col_idxs(),
as_device_type(l->get_values()),
static_cast<IndexType>(l->get_num_stored_elements()));
#ifdef GKO_COMPILING_HIP
if constexpr (std::is_same<remove_complex<ValueType>, half>::value) {
// HIP does not support 16bit atomic operation
GKO_NOT_SUPPORTED(a_lower);
} else
#endif
{
for (size_type i = 0; i < iterations; ++i) {
kernel::ic_sweep<<<num_blocks, default_block_size, 0,
exec->get_stream()>>>(
a_lower->get_const_row_idxs(),
a_lower->get_const_col_idxs(),
as_device_type(a_lower->get_const_values()),
l->get_const_row_ptrs(), l->get_const_col_idxs(),
as_device_type(l->get_values()),
static_cast<IndexType>(l->get_num_stored_elements()));
}
}
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_PAR_IC_COMPUTE_FACTOR_KERNEL);


Expand Down
26 changes: 17 additions & 9 deletions common/cuda_hip/factorization/par_ict_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -390,13 +390,21 @@ void compute_factor(syn::value_list<int, subwarp_size>,
auto block_size = default_block_size / subwarp_size;
auto num_blocks = ceildiv(total_nnz, block_size);
if (num_blocks > 0) {
kernel::ict_sweep<subwarp_size>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
a->get_const_row_ptrs(), a->get_const_col_idxs(),
as_device_type(a->get_const_values()), l->get_const_row_ptrs(),
l_coo->get_const_row_idxs(), l->get_const_col_idxs(),
as_device_type(l->get_values()),
static_cast<IndexType>(l->get_num_stored_elements()));
#ifdef GKO_COMPILING_HIP
if constexpr (std::is_same<remove_complex<ValueType>, half>::value) {
// HIP does not support 16bit atomic operation
GKO_NOT_SUPPORTED(l);
} else
#endif
{
kernel::ict_sweep<subwarp_size>
<<<num_blocks, default_block_size, 0, exec->get_stream()>>>(
a->get_const_row_ptrs(), a->get_const_col_idxs(),
as_device_type(a->get_const_values()),
l->get_const_row_ptrs(), l_coo->get_const_row_idxs(),
l->get_const_col_idxs(), as_device_type(l->get_values()),
static_cast<IndexType>(l->get_num_stored_elements()));
}
}
}

Expand Down Expand Up @@ -427,7 +435,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
syn::value_list<int>(), syn::type_list<>(), exec, llh, a, l, l_new);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_PAR_ICT_ADD_CANDIDATES_KERNEL);


Expand All @@ -449,7 +457,7 @@ void compute_factor(std::shared_ptr<const DefaultExecutor> exec,
syn::value_list<int>(), syn::type_list<>(), exec, a, l, l_coo);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_PAR_ICT_COMPUTE_FACTOR_KERNEL);


Expand Down
32 changes: 21 additions & 11 deletions common/cuda_hip/factorization/par_ilu_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -94,21 +94,31 @@ void compute_l_u_factors(std::shared_ptr<const DefaultExecutor> exec,
const auto grid_dim = static_cast<uint32>(
ceildiv(num_elements, static_cast<size_type>(block_size)));
if (grid_dim > 0) {
for (size_type i = 0; i < iterations; ++i) {
kernel::compute_l_u_factors<<<grid_dim, block_size, 0,
exec->get_stream()>>>(
num_elements, system_matrix->get_const_row_idxs(),
system_matrix->get_const_col_idxs(),
as_device_type(system_matrix->get_const_values()),
l_factor->get_const_row_ptrs(), l_factor->get_const_col_idxs(),
as_device_type(l_factor->get_values()),
u_factor->get_const_row_ptrs(), u_factor->get_const_col_idxs(),
as_device_type(u_factor->get_values()));
#ifdef GKO_COMPILING_HIP
if constexpr (std::is_same<remove_complex<ValueType>, half>::value) {
// HIP does not support 16bit atomic operation
GKO_NOT_SUPPORTED(system_matrix);
} else
#endif
{
for (size_type i = 0; i < iterations; ++i) {
kernel::compute_l_u_factors<<<grid_dim, block_size, 0,
exec->get_stream()>>>(
num_elements, system_matrix->get_const_row_idxs(),
system_matrix->get_const_col_idxs(),
as_device_type(system_matrix->get_const_values()),
l_factor->get_const_row_ptrs(),
l_factor->get_const_col_idxs(),
as_device_type(l_factor->get_values()),
u_factor->get_const_row_ptrs(),
u_factor->get_const_col_idxs(),
as_device_type(u_factor->get_values()));
}
}
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_PAR_ILU_COMPUTE_L_U_FACTORS_KERNEL);


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ void threshold_filter_approx(std::shared_ptr<const DefaultExecutor> exec,
&threshold, m_out, m_out_coo);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_APPROX_KERNEL);


Expand Down
2 changes: 1 addition & 1 deletion common/cuda_hip/factorization/par_ilut_filter_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ void threshold_filter(std::shared_ptr<const DefaultExecutor> exec,
m_out_coo, lower);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_PAR_ILUT_THRESHOLD_FILTER_KERNEL);


Expand Down
16 changes: 12 additions & 4 deletions common/cuda_hip/factorization/par_ilut_select_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,17 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
auto num_threads_total = ceildiv(size, items_per_thread);
auto num_blocks =
static_cast<IndexType>(ceildiv(num_threads_total, default_block_size));
// pick sample, build searchtree
kernel::build_searchtree<<<1, bucket_count, 0, exec->get_stream()>>>(
as_device_type(values), size, as_device_type(tree));
#ifdef GKO_COMPILING_HIP
if constexpr (std::is_same<remove_complex<ValueType>, half>::value) {
// HIP does not support 16bit atomic operation
GKO_NOT_SUPPORTED(values);
} else
#endif
{
// pick sample, build searchtree
kernel::build_searchtree<<<1, bucket_count, 0, exec->get_stream()>>>(
as_device_type(values), size, as_device_type(tree));
}
// determine bucket sizes
if (num_blocks > 0) {
kernel::count_buckets<<<num_blocks, default_block_size, 0,
Expand All @@ -69,7 +77,7 @@ void sampleselect_count(std::shared_ptr<const DefaultExecutor> exec,
unsigned char* oracles, IndexType* partial_counts, \
IndexType* total_counts)

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(DECLARE_SSSS_COUNT);
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(DECLARE_SSSS_COUNT);


template <typename IndexType>
Expand Down
19 changes: 14 additions & 5 deletions common/cuda_hip/factorization/par_ilut_select_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,13 +141,22 @@ void threshold_select(std::shared_ptr<const DefaultExecutor> exec,

// base case
auto out_ptr = reinterpret_cast<AbsType*>(tmp1.get_data());
kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
exec->get_stream()>>>(
as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
threshold = exec->copy_val_to_host(out_ptr);

#ifdef GKO_COMPILING_HIP
if constexpr (std::is_same<remove_complex<ValueType>, half>::value) {
// HIP does not support 16bit atomic operation
GKO_NOT_SUPPORTED(m);
} else
#endif
{
kernel::basecase_select<<<1, kernel::basecase_block_size, 0,
exec->get_stream()>>>(
as_device_type(tmp22), bucket.size, rank, as_device_type(out_ptr));
threshold = exec->copy_val_to_host(out_ptr);
}
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_PAR_ILUT_THRESHOLD_SELECT_KERNEL);


Expand Down
4 changes: 2 additions & 2 deletions common/cuda_hip/factorization/par_ilut_select_kernels.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -254,12 +254,12 @@ __global__ __launch_bounds__(basecase_block_size) void basecase_select(
const ValueType* __restrict__ input, IndexType size, IndexType rank,
ValueType* __restrict__ out)
{
constexpr auto sentinel = device_numeric_limits<ValueType>::inf();
const auto sentinel = device_numeric_limits<ValueType>::inf();
ValueType local[basecase_local_size];
__shared__ ValueType sh_local[basecase_size];
for (int i = 0; i < basecase_local_size; ++i) {
auto idx = threadIdx.x + i * basecase_block_size;
local[i] = idx < size ? input[idx] : sentinel;
local[i] = idx < size ? input[idx] : static_cast<ValueType>(sentinel);
}
bitonic_sort<basecase_size, basecase_local_size>(local, sh_local);
if (threadIdx.x == rank / basecase_local_size) {
Expand Down
2 changes: 1 addition & 1 deletion common/cuda_hip/factorization/par_ilut_spgeam_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -389,7 +389,7 @@ void add_candidates(std::shared_ptr<const DefaultExecutor> exec,
u_new);
}

GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE(
GKO_INSTANTIATE_FOR_EACH_VALUE_AND_INDEX_TYPE_WITH_HALF(
GKO_DECLARE_PAR_ILUT_ADD_CANDIDATES_KERNEL);


Expand Down
Loading

0 comments on commit b4efba5

Please sign in to comment.