diff --git a/composable_kernel/include/utility/tensor_space_filling_curve.hpp b/composable_kernel/include/utility/tensor_space_filling_curve.hpp
new file mode 100644
index 0000000000..a8f12cd8e1
--- /dev/null
+++ b/composable_kernel/include/utility/tensor_space_filling_curve.hpp
@@ -0,0 +1,131 @@
+#include "math.hpp"
+#include "sequence.hpp"
+#include "tensor_adaptor.hpp"
+#include "statically_indexed_array_multi_index.hpp"
+#include "tuple_helper.hpp"
+
+namespace ck {
+
+template <typename TensorLengths,
+          typename DimAccessOrder,
+          typename ScalarsPerAccess> // # of scalars per access in each dimension
+struct SpaceFillingCurve
+{
+    static constexpr index_t nDim = TensorLengths::Size();
+
+    using Index = MultiIndex<nDim>;
+
+    static constexpr index_t ScalarPerVector =
+        reduce_on_sequence(ScalarsPerAccess{}, math::multiplies{}, Number<1>{});
+
+    static constexpr auto access_lengths   = TensorLengths{} / ScalarsPerAccess{};
+    static constexpr auto dim_access_order = DimAccessOrder{};
+    static constexpr auto ordered_access_lengths =
+        container_reorder_given_new2old(access_lengths, dim_access_order);
+
+    static constexpr auto to_index_adaptor = make_single_stage_tensor_adaptor(
+        make_tuple(make_merge_transform(ordered_access_lengths)),
+        make_tuple(typename arithmetic_sequence_gen<0, nDim, 1>::type{}),
+        make_tuple(Sequence<0>{}));
+
+    static constexpr auto I0 = Number<0>{};
+    static constexpr auto I1 = Number<1>{};
+
+    __host__ __device__ static constexpr index_t GetNumOfAccess()
+    {
+        return reduce_on_sequence(TensorLengths{}, math::multiplies{}, Number<1>{}) /
+               ScalarPerVector;
+    }
+
+    template <index_t AccessIdx1d>
+    static __device__ __host__ constexpr auto GetForwardStep(Number<AccessIdx1d>)
+    {
+
+        constexpr auto idx_curr = GetIndex(Number<AccessIdx1d>{});
+        constexpr auto idx_next = GetIndex(Number<AccessIdx1d + 1>{});
+        return idx_next - idx_curr;
+    }
+
+    template <index_t AccessIdx1d>
+    static __device__ __host__ constexpr auto GetBackwardStep(Number<AccessIdx1d>)
+    {
+        static_assert(AccessIdx1d > 0, "1D index should be larger than 0");
+
+        constexpr auto idx_curr = GetIndex(Number<AccessIdx1d>{});
+        constexpr auto idx_prev = GetIndex(Number<AccessIdx1d - 1>{});
+        return idx_prev - idx_curr;
+    }
+
+    template <index_t AccessIdx1d>
+    static __device__ __host__ constexpr Index GetIndex(Number<AccessIdx1d>)
+    {
+#if 0
+        /*
+         * \todo: TensorAdaptor::CalculateBottomIndex does NOT return constexpr as expected.
+         */
+        constexpr auto ordered_access_idx = to_index_adaptor.CalculateBottomIndex(make_multi_index(Number<AccessIdx1d>{}));
+#else
+
+        constexpr auto access_strides = container_reverse_exclusive_scan(
+            ordered_access_lengths, math::multiplies{}, Number<1>{});
+
+        constexpr auto idx_1d = Number<AccessIdx1d>{};
+        // Given tensor strides \p access_lengths, and 1D index of space-filling-curve, compute the
+        // idim-th element of multidimensional index.
+        // All constexpr variables have to be captured by VALUE.
+        constexpr auto compute_index = [ idx_1d, access_strides ](auto idim) constexpr
+        {
+            constexpr auto compute_index_impl = [ idx_1d, access_strides ](auto jdim) constexpr
+            {
+                auto res = idx_1d.value;
+                auto id  = 0;
+
+                static_for<0, jdim.value + 1, 1>{}([&](auto kdim) {
+                    id = res / access_strides[kdim].value;
+                    res -= id * access_strides[kdim].value;
+                });
+
+                return id;
+            };
+
+            constexpr auto id = compute_index_impl(idim);
+            return Number<id>{};
+        };
+
+        constexpr auto ordered_access_idx = generate_tuple(compute_index, Number<nDim>{});
+#endif
+        constexpr auto forward_sweep = [&]() {
+            StaticallyIndexedArray<bool, nDim> forward_sweep_;
+
+            forward_sweep_(I0) = true;
+
+            static_for<1, nDim, 1>{}([&](auto idim) {
+                index_t tmp = ordered_access_idx[I0];
+
+                static_for<1, idim, 1>{}(
+                    [&](auto j) { tmp = tmp * ordered_access_lengths[j] + ordered_access_idx[j]; });
+
+                forward_sweep_(idim) = tmp % 2 == 0;
+            });
+
+            return forward_sweep_;
+        }();
+
+        // calculate multi-dim tensor index
+        auto idx_md = [&]() {
+            Index ordered_idx;
+
+            static_for<0, nDim, 1>{}([&](auto idim) {
+                ordered_idx(idim) = forward_sweep[idim] ? ordered_access_idx[idim]
+                                                        : ordered_access_lengths[idim] - 1 -
+                                                              ordered_access_idx[idim];
+            });
+
+            return container_reorder_given_old2new(ordered_idx, dim_access_order) *
+                   ScalarsPerAccess{};
+        }();
+        return idx_md;
+    }
+};
+
+} // namespace ck
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index ff483b8117..45748640dc 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -45,3 +45,8 @@ target_link_libraries(test_reference_conv_fwd PRIVATE host_tensor)
 set(CONVND_FWD_XDL_SOURCE convnd_fwd_xdl/main.cpp)
 add_executable(test_convnd_fwd_xdl ${CONVND_FWD_XDL_SOURCE})
 target_link_libraries(test_convnd_fwd_xdl PRIVATE host_tensor)
+
+# test space_filling_curve_
+set(SPACE_FILLING_CURVE_SOURCE space_filling_curve/space_filling_curve.cpp)
+add_executable(space_filling_curve ${SPACE_FILLING_CURVE_SOURCE})
+target_link_libraries(space_filling_curve PRIVATE host_tensor)
diff --git a/test/space_filling_curve/space_filling_curve.cpp b/test/space_filling_curve/space_filling_curve.cpp
new file mode 100644
index 0000000000..64e8044608
--- /dev/null
+++ b/test/space_filling_curve/space_filling_curve.cpp
@@ -0,0 +1,131 @@
+#include <vector>
+#include <iostream>
+#include <numeric>
+#include <cassert>
+
+#include "tensor_space_filling_curve.hpp"
+
+using namespace ck;
+
+void traverse_using_space_filling_curve();
+
+int main(int argc, char** argv)
+{
+    (void)argc;
+    (void)argv;
+
+    {
+        traverse_using_space_filling_curve();
+        auto err = hipDeviceSynchronize();
+        (void)err;
+        assert(err == hipSuccess);
+    }
+    return 0;
+}
+
+void traverse_using_space_filling_curve()
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+    constexpr auto I2 = Number<2>{};
+
+    using TensorLengths     = Sequence<4, 10, 9>;
+    using DimAccessOrder    = Sequence<2, 0, 1>;
+    using ScalarsPerAccess  = Sequence<1, 2, 3>;
+    using SpaceFillingCurve = SpaceFillingCurve<TensorLengths, DimAccessOrder, ScalarsPerAccess>;
+
+    constexpr auto expected = make_tuple(make_tuple(0, 0, 0),
+                                         make_tuple(0, 2, 0),
+                                         make_tuple(0, 4, 0),
+                                         make_tuple(0, 6, 0),
+                                         make_tuple(0, 8, 0),
+                                         make_tuple(1, 8, 0),
+                                         make_tuple(1, 6, 0),
+                                         make_tuple(1, 4, 0),
+                                         make_tuple(1, 2, 0),
+                                         make_tuple(1, 0, 0),
+                                         make_tuple(2, 0, 0),
+                                         make_tuple(2, 2, 0),
+                                         make_tuple(2, 4, 0),
+                                         make_tuple(2, 6, 0),
+                                         make_tuple(2, 8, 0),
+                                         make_tuple(3, 8, 0),
+                                         make_tuple(3, 6, 0),
+                                         make_tuple(3, 4, 0),
+                                         make_tuple(3, 2, 0),
+                                         make_tuple(3, 0, 0),
+                                         make_tuple(3, 0, 3),
+                                         make_tuple(3, 2, 3),
+                                         make_tuple(3, 4, 3),
+                                         make_tuple(3, 6, 3),
+                                         make_tuple(3, 8, 3),
+                                         make_tuple(2, 8, 3),
+                                         make_tuple(2, 6, 3),
+                                         make_tuple(2, 4, 3),
+                                         make_tuple(2, 2, 3),
+                                         make_tuple(2, 0, 3),
+                                         make_tuple(1, 0, 3),
+                                         make_tuple(1, 2, 3),
+                                         make_tuple(1, 4, 3),
+                                         make_tuple(1, 6, 3),
+                                         make_tuple(1, 8, 3),
+                                         make_tuple(0, 8, 3),
+                                         make_tuple(0, 6, 3),
+                                         make_tuple(0, 4, 3),
+                                         make_tuple(0, 2, 3),
+                                         make_tuple(0, 0, 3),
+                                         make_tuple(0, 0, 6),
+                                         make_tuple(0, 2, 6),
+                                         make_tuple(0, 4, 6),
+                                         make_tuple(0, 6, 6),
+                                         make_tuple(0, 8, 6),
+                                         make_tuple(1, 8, 6),
+                                         make_tuple(1, 6, 6),
+                                         make_tuple(1, 4, 6),
+                                         make_tuple(1, 2, 6),
+                                         make_tuple(1, 0, 6),
+                                         make_tuple(2, 0, 6),
+                                         make_tuple(2, 2, 6),
+                                         make_tuple(2, 4, 6),
+                                         make_tuple(2, 6, 6),
+                                         make_tuple(2, 8, 6),
+                                         make_tuple(3, 8, 6),
+                                         make_tuple(3, 6, 6),
+                                         make_tuple(3, 4, 6),
+                                         make_tuple(3, 2, 6),
+                                         make_tuple(3, 0, 6));
+
+    constexpr index_t num_accesses = SpaceFillingCurve::GetNumOfAccess();
+
+    static_assert(num_accesses == reduce_on_sequence(TensorLengths{} / ScalarsPerAccess{},
+                                                     math::multiplies{},
+                                                     Number<1>{}));
+
+    static_for<1, num_accesses, 1>{}([&](auto i) {
+        constexpr auto idx_curr = SpaceFillingCurve::GetIndex(i);
+
+        static_assert(idx_curr[I0] == expected[i][I0]);
+        static_assert(idx_curr[I1] == expected[i][I1]);
+        static_assert(idx_curr[I2] == expected[i][I2]);
+
+        constexpr auto backward_step = SpaceFillingCurve::GetBackwardStep(i);
+        constexpr auto expected_step = expected[i - I1] - expected[i];
+        static_assert(backward_step[I0] == expected_step[I0]);
+        static_assert(backward_step[I1] == expected_step[I1]);
+        static_assert(backward_step[I2] == expected_step[I2]);
+    });
+
+    static_for<0, num_accesses - 1, 1>{}([&](auto i) {
+        constexpr auto idx_curr = SpaceFillingCurve::GetIndex(i);
+
+        static_assert(idx_curr[I0] == expected[i][I0]);
+        static_assert(idx_curr[I1] == expected[i][I1]);
+        static_assert(idx_curr[I2] == expected[i][I2]);
+
+        constexpr auto forward_step  = SpaceFillingCurve::GetForwardStep(i);
+        constexpr auto expected_step = expected[i + I1] - expected[i];
+        static_assert(forward_step[I0] == expected_step[I0]);
+        static_assert(forward_step[I1] == expected_step[I1]);
+        static_assert(forward_step[I2] == expected_step[I2]);
+    });
+}