45 #ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
46 #define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
48 #include <initializer_list>
52 #include <impl/KokkosExp_Host_IterateTile.hpp>
53 #include <Kokkos_ExecPolicy.hpp>
56 #if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA)
57 #include <Cuda/KokkosExp_Cuda_IterateTile.hpp>
58 #include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
61 #if defined(__HCC__) && defined(KOKKOS_ENABLE_ROCM)
63 #include <ROCm/KokkosExp_ROCm_IterateTile_Refactor.hpp>
66 #if defined(__HIPCC__) && defined(KOKKOS_ENABLE_HIP)
67 #include <HIP/KokkosExp_HIP_IterateTile.hpp>
// Compile-time default for the outer iteration direction, used by
// MDRangePolicy when the iteration pattern asks for Iterate::Default.
// `value` is Iterate::Left when any of the CUDA, ROCm or HIP backends is
// enabled in the build; the second definition (Iterate::Right) is presumably
// the #else branch.
// NOTE(review): in the lines visible here the choice depends only on the
// KOKKOS_ENABLE_* macros; the ExecSpace parameter is not consulted.
// NOTE(review): the #else/#endif between the two `value` definitions and the
// closing brace are missing from this listing.
83 template <
typename ExecSpace>
84 struct default_outer_direction {
86 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_ROCM) || \
87 defined(KOKKOS_ENABLE_HIP)
// At least one GPU backend is enabled.
88 static constexpr Iterate value = Iterate::Left;
// Presumably the fallback when no GPU backend is enabled.
90 static constexpr Iterate value = Iterate::Right;
// Compile-time default for the inner iteration direction, used by
// MDRangePolicy when the iteration pattern asks for Iterate::Default.
// Mirrors default_outer_direction: Iterate::Left when CUDA, ROCm or HIP is
// enabled, with Iterate::Right presumably in the #else branch.
// NOTE(review): the ExecSpace parameter is not consulted in the visible
// lines, and the #else/#endif and closing brace are missing from this
// listing.
94 template <
typename ExecSpace>
95 struct default_inner_direction {
97 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_ROCM) || \
98 defined(KOKKOS_ENABLE_HIP)
// At least one GPU backend is enabled.
99 static constexpr Iterate value = Iterate::Left;
// Presumably the fallback when no GPU backend is enabled.
101 static constexpr Iterate value = Iterate::Right;
// Iteration-pattern tag carrying the rank N and the requested outer/inner
// iteration directions. Both directions default to Iterate::Default.
// NOTE(review): the `struct` line itself is missing from this listing; the
// name Rank is taken from the iteration_pattern alias below.
106 template <
unsigned N, Iterate OuterDir = Iterate::Default,
107 Iterate InnerDir = Iterate::Default>
// Together these three assertions restrict N to the range 2..6.
109 static_assert(N != 0u,
"Kokkos Error: rank 0 undefined");
110 static_assert(N != 1u,
111 "Kokkos Error: rank 1 is not a multi-dimensional range");
112 static_assert(N < 7u,
"Kokkos Error: Unsupported rank...");
// Self-referencing alias so the tag can be queried for its own pattern type.
114 using iteration_pattern = Rank<N, OuterDir, InnerDir>;
// The template arguments re-exported as compile-time constants.
116 static constexpr
int rank = N;
117 static constexpr Iterate outer_direction = OuterDir;
118 static constexpr Iterate inner_direction = InnerDir;
// Multi-dimensional range execution policy. The policy's compile-time
// configuration comes from Kokkos::Impl::PolicyTraits<Properties...>, which
// it also inherits from.
// NOTE(review): several declarations of this class (for example point_type,
// tile_type, m_lower, m_upper and m_tile, all used further down) are missing
// from this listing.
122 template <
typename... Properties>
123 struct MDRangePolicy :
public Kokkos::Impl::PolicyTraits<Properties...> {
124 using traits = Kokkos::Impl::PolicyTraits<Properties...>;
125 using range_policy = RangePolicy<Properties...>;
// Execution space instance; set from the `work_space` constructor argument
// by the constructors that take one.
127 typename traits::execution_space m_space;
// One-dimensional RangePolicy built from the execution space, schedule type
// and index type of this policy's traits.
129 using impl_range_policy =
130 RangePolicy<
typename traits::execution_space,
131 typename traits::schedule_type,
typename traits::index_type>;
// NOTE(review): the remainder of this typedef is missing from this listing.
133 typedef MDRangePolicy
// Every other MDRangePolicy instantiation is a friend of this one.
136 template <
class... OtherProperties>
137 friend struct MDRangePolicy;
// An iteration pattern must be present among Properties.
139 static_assert(!std::is_same<typename traits::iteration_pattern, void>::value,
140 "Kokkos Error: MD iteration pattern not defined");
// Types forwarded from the traits; member_type comes from the
// RangePolicy<Properties...> alias above.
142 using iteration_pattern =
typename traits::iteration_pattern;
143 using work_tag =
typename traits::work_tag;
144 using launch_bounds =
typename traits::launch_bounds;
145 using member_type =
typename range_policy::member_type;
// Number of dimensions, taken from the iteration pattern.
147 enum {
rank =
static_cast<int>(iteration_pattern::rank) };
149 using index_type =
typename traits::index_type;
// Element type the initializer-list init() casts bounds and tile sizes to.
150 using array_index_type = long;
// Per-dimension tile counts, the running product of those counts, and the
// running product of the tile extents; all three are filled in by the
// tile-setup code.
167 point_type m_tile_end;
168 index_type m_num_tiles;
169 index_type m_prod_tile_dims;
// Effective outer direction as an int: the pattern's own direction unless it
// is Iterate::Default, in which case the execution-space default is used.
188 static constexpr
int outer_direction =
static_cast<int>(
189 (iteration_pattern::outer_direction != Iterate::Default)
190 ? iteration_pattern::outer_direction
191 : default_outer_direction<typename traits::execution_space>::value);
// Effective inner direction, resolved the same way.
193 static constexpr
int inner_direction =
static_cast<int>(
194 iteration_pattern::inner_direction != Iterate::Default
195 ? iteration_pattern::inner_direction
196 : default_inner_direction<typename traits::execution_space>::value);
// Integer forms of Iterate::Right / Iterate::Left, for comparison against
// the int-valued direction constants above.
199 static constexpr
int Right =
static_cast<int>(Iterate::Right);
200 static constexpr
int Left =
static_cast<int>(Iterate::Left);
// Accessor returning a const reference to an execution space instance.
// NOTE(review): the body is missing from this listing; presumably it returns
// m_space - confirm against the full file.
202 KOKKOS_INLINE_FUNCTION
const typename traits::execution_space& space()
const {
// Constructs the policy from brace-enclosed lower bounds, upper bounds and
// (optionally) tile sizes, delegating all of the work to
// init(lower, upper, tile).
// The three lists may have different element types. TT has a default because
// it cannot be deduced when `tile` is left at its `{}` default.
205 template <
typename LT,
typename UT,
typename TT = array_index_type>
206 MDRangePolicy(std::initializer_list<LT>
const& lower,
207 std::initializer_list<UT>
const& upper,
208 std::initializer_list<TT>
const& tile = {})
210 init(lower, upper, tile);
// Same as the initializer-list constructor without an execution space, but
// first stores the supplied `work_space` instance in m_space.
213 template <
typename LT,
typename UT,
typename TT = array_index_type>
214 MDRangePolicy(
const typename traits::execution_space& work_space,
215 std::initializer_list<LT>
const& lower,
216 std::initializer_list<UT>
const& upper,
217 std::initializer_list<TT>
const& tile = {})
218 : m_space(work_space) {
219 init(lower, upper, tile);
// Constructs the policy from already-typed bounds (point_type) and an
// optional tile_type; `tile` defaults to a value-initialized tile_type.
// NOTE(review): most of the member-initializer list and the body are missing
// from this listing; only the initialization of m_prod_tile_dims to 1 is
// visible.
222 MDRangePolicy(point_type
const& lower, point_type
const& upper,
223 tile_type
const& tile = tile_type{})
229 m_prod_tile_dims(1) {
// Variant of the point_type constructor that also stores the supplied
// `work_space` instance in m_space.
// NOTE(review): the initializers between m_space and m_prod_tile_dims, and
// the body, are missing from this listing.
233 MDRangePolicy(
const typename traits::execution_space& work_space,
234 point_type
const& lower, point_type
const& upper,
235 tile_type
const& tile = tile_type{})
236 : m_space(work_space),
241 m_prod_tile_dims(1) {
// Converting constructor: builds this policy from an MDRangePolicy that was
// instantiated with a different property list, copying its members one by
// one (some of the initializers are missing from this listing).
// NOTE(review): `p` is taken by const value, so the source policy is copied
// once for the argument before its members are copied again; a const
// reference would avoid the extra copy.
245 template <
class... OtherProperties>
246 MDRangePolicy(
const MDRangePolicy<OtherProperties...> p)
247 : m_space(p.m_space),
251 m_tile_end(p.m_tile_end),
252 m_num_tiles(p.m_num_tiles),
253 m_prod_tile_dims(p.m_prod_tile_dims) {}
// Tile-setup logic that derives m_tile, m_tile_end, m_num_tiles and
// m_prod_tile_dims from m_lower/m_upper.
// NOTE(review): the enclosing function header is missing from this listing
// (presumably the argument-less init() - confirm against the full file), as
// are many statement heads, #endif lines and closing braces.
//
// The lines below are the tail of a condition that excludes the Cuda, ROCm
// and HIP execution spaces, i.e. the first loop serves the remaining spaces.
259 #
if defined(KOKKOS_ENABLE_CUDA)
260 && !std::is_same<typename traits::execution_space, Kokkos::Cuda>::value
262 #
if defined(KOKKOS_ENABLE_ROCM)
263 && !std::is_same<
typename traits::execution_space,
264 Kokkos::Experimental::ROCm>::value
266 #
if defined(KOKKOS_ENABLE_HIP)
267 && !std::is_same<
typename traits::execution_space,
268 Kokkos::Experimental::HIP>::value
// Walk the dimensions in index order.
272 for (
int i = 0; i <
rank; ++i) {
273 span = m_upper[i] - m_lower[i];
// A non-positive tile size means "not chosen yet"; pick a default.
274 if (m_tile[i] <= 0) {
// True for every dimension except the last one (inner direction Right) or
// except the first one (inner direction Left). The value assigned in this
// branch is missing from this listing.
275 if (((
int)inner_direction == (
int)Right && (i < rank - 1)) ||
276 ((
int)inner_direction == (
int)Left && (i > 0))) {
// Tile covers the whole extent, with a minimum of 1 for an empty extent.
279 m_tile[i] = (span == 0 ? 1 : span);
// Number of tiles along this dimension, rounded up; the assignment target is
// missing from this listing (presumably m_tile_end[i]).
283 static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
284 m_num_tiles *= m_tile_end[i];
285 m_prod_tile_dims *= m_tile[i];
// Cuda / HIP section.
288 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
// With inner direction Right the dimensions are visited starting from the
// last one (rank_start = rank - 1); rank_end and increment are set on lines
// missing from this listing.
295 if ((
int)inner_direction == (int)Right) {
297 rank_start = rank - 1;
// Runtime flag telling the Cuda execution space apart; it selects the
// default tile size and the error message below.
300 bool is_cuda_exec_space =
301 #if defined(KOKKOS_ENABLE_CUDA)
302 std::is_same<typename traits::execution_space, Kokkos::Cuda>::value;
306 for (
int i = rank_start; i != rank_end; i += increment) {
307 span = m_upper[i] - m_lower[i];
308 if (m_tile[i] <= 0) {
311 if (((
int)inner_direction == (int)Right && (i < rank - 1)) ||
312 ((
int)inner_direction == (int)Left && (i > 0))) {
// While the accumulated tile volume is below 256, default this dimension's
// tile size to 2 on Cuda and 4 otherwise.
313 if (m_prod_tile_dims < 256) {
314 m_tile[i] = (is_cuda_exec_space) ? 2 : 4;
323 static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
324 m_num_tiles *= m_tile_end[i];
325 m_prod_tile_dims *= m_tile[i];
// Reject tile volumes above a limit (the limit expression is missing from
// this listing); the diagnostic depends on is_cuda_exec_space. The call that
// consumes the second string of each pair is also missing.
327 if (m_prod_tile_dims >
330 if (is_cuda_exec_space) {
331 printf(
" Tile dimensions exceed Cuda limits\n");
333 " Cuda ExecSpace Error: MDRange tile dims exceed maximum number "
335 "threads per block - choose smaller tile dims");
337 printf(
" Tile dimensions exceed HIP limits\n");
339 "HIP ExecSpace Error: MDRange tile dims exceed maximum number of "
340 "threads per block - choose smaller tile dims");
// ROCm section: same traversal as above, with a tile-volume limit of 1024.
345 #if defined(KOKKOS_ENABLE_ROCM)
352 if ((
int)inner_direction == (
int)Right) {
354 rank_start = rank - 1;
357 for (
int i = rank_start; i != rank_end; i += increment) {
358 span = m_upper[i] - m_lower[i];
359 if (m_tile[i] <= 0) {
362 if (((
int)inner_direction == (
int)Right && (i < rank - 1)) ||
363 ((
int)inner_direction == (
int)Left && (i > 0))) {
// The default assigned under this condition is missing from this listing.
364 if (m_prod_tile_dims < 256) {
374 static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
375 m_num_tiles *= m_tile_end[i];
376 m_prod_tile_dims *= m_tile[i];
378 if (m_prod_tile_dims > 1024) {
379 printf(
" Tile dimensions exceed ROCm limits\n");
381 " ROCm ExecSpace Error: MDRange tile dims exceed maximum number of "
382 "threads per block - choose smaller tile dims");
// Initializes the policy from initializer lists: copies the bounds (and the
// tile sizes, when exactly `rank` of them are given) into m_lower, m_upper
// and m_tile, resets the tile bookkeeping, and then runs the same
// per-execution-space tile setup as the argument-less variant.
// NOTE(review): many statement heads, #endif lines and closing braces of
// this function are missing from this listing.
391 template <
typename LT,
typename UT,
typename TT = array_index_type>
392 void init(std::initializer_list<LT>
const& lower,
393 std::initializer_list<UT>
const& upper,
394 std::initializer_list<TT>
const& tile = {}) {
// NOTE(review): the message below talks about the initializer lists, but the
// condition inspects the members m_lower/m_upper instead of the `lower` and
// `upper` arguments. If those members are fixed-size containers of length
// `rank` (their declaration is not visible here - confirm), this check can
// never fire, and a too-short list would be read past its end by
// lower.begin()[i] / upper.begin()[i] in the loop below. The check should
// presumably use lower.size() and upper.size().
395 if (static_cast<int>(m_lower.size()) != rank ||
396 static_cast<int>(m_upper.size()) != rank)
398 "MDRangePolicy: Constructor initializer lists have wrong size");
// Copy the bounds, converting every element to array_index_type.
400 for (
auto i = 0; i <
rank; ++i) {
401 m_lower[i] =
static_cast<array_index_type
>(lower.begin()[i]);
402 m_upper[i] =
static_cast<array_index_type
>(upper.begin()[i]);
// Tile sizes are taken from `tile` only when it has exactly `rank` entries;
// the alternative branch is missing from this listing.
403 if (static_cast<int>(tile.size()) == rank)
404 m_tile[i] =
static_cast<array_index_type
>(tile.begin()[i]);
// Reset the running product before the setup loops multiply into it.
410 m_prod_tile_dims = 1;
// Tail of a condition that excludes the Cuda, ROCm and HIP execution spaces;
// the first loop serves the remaining spaces.
414 #
if defined(KOKKOS_ENABLE_CUDA)
415 && !std::is_same<typename traits::execution_space, Kokkos::Cuda>::value
417 #if defined(KOKKOS_ENABLE_ROCM)
418 && !std::is_same<
typename traits::execution_space,
419 Kokkos::Experimental::ROCm>::value
421 #if defined(KOKKOS_ENABLE_HIP)
422 && !std::is_same<
typename traits::execution_space,
423 Kokkos::Experimental::HIP>::value
427 for (
int i = 0; i <
rank; ++i) {
428 span = m_upper[i] - m_lower[i];
// A non-positive tile size means "not chosen yet"; pick a default.
429 if (m_tile[i] <= 0) {
// True for every dimension except the last one (inner direction Right) or
// except the first one (inner direction Left). The value assigned in this
// branch is missing from this listing.
430 if (((
int)inner_direction == (
int)Right && (i < rank - 1)) ||
431 ((int)inner_direction == (
int)Left && (i > 0))) {
// Tile covers the whole extent, with a minimum of 1 for an empty extent.
434 m_tile[i] = (span == 0 ? 1 : span);
// Number of tiles along this dimension, rounded up; the assignment target is
// missing from this listing (presumably m_tile_end[i]).
438 static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
439 m_num_tiles *= m_tile_end[i];
440 m_prod_tile_dims *= m_tile[i];
// Cuda / HIP section. With inner direction Right the traversal starts from
// the last dimension.
443 #if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP)
450 if ((
int)inner_direction == (int)Right) {
452 rank_start = rank - 1;
455 for (
int i = rank_start; i != rank_end; i += increment) {
456 span = m_upper[i] - m_lower[i];
457 if (m_tile[i] <= 0) {
460 if (((
int)inner_direction == (int)Right && (i < rank - 1)) ||
461 ((
int)inner_direction == (int)Left && (i > 0))) {
// The default assigned under this condition is missing from this listing.
462 if (m_prod_tile_dims < 256) {
472 static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
473 m_num_tiles *= m_tile_end[i];
474 m_prod_tile_dims *= m_tile[i];
// Reject tile volumes above a limit (the limit expression is missing from
// this listing). Here the Cuda diagnostic is selected by the preprocessor
// (KOKKOS_ENABLE_CUDA); the HIP diagnostic is presumably the #else branch.
476 if (m_prod_tile_dims >
479 #if defined(KOKKOS_ENABLE_CUDA)
480 printf(
" Tile dimensions exceed Cuda limits\n");
482 " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of "
483 "threads per block - choose smaller tile dims");
485 printf(
" Tile dimensions exceed HIP limits\n");
487 " HIP ExecSpace Error: MDRange tile dims exceed maximum number of "
488 "threads per block - choose smaller tile dims");
// ROCm section: same traversal as the Cuda / HIP section.
493 #if defined(KOKKOS_ENABLE_ROCM)
500 if ((
int)inner_direction == (
int)Right) {
502 rank_start = rank - 1;
505 for (
int i = rank_start; i != rank_end; i += increment) {
506 span = m_upper[i] - m_lower[i];
507 if (m_tile[i] <= 0) {
510 if (((
int)inner_direction == (
int)Right && (i < rank - 1)) ||
511 ((
int)inner_direction == (
int)Left && (i > 0))) {
512 if (m_prod_tile_dims < 256) {
522 static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
523 m_num_tiles *= m_tile_end[i];
524 m_prod_tile_dims *= m_tile[i];
// The limit expression of this check is missing from this listing.
526 if (m_prod_tile_dims >
529 printf(
" Tile dimensions exceed ROCm limits\n");
531 " ROCm ExecSpace Error: MDRange tile dims exceed maximum number of "
532 "threads per block - choose smaller tile dims");
// Re-exports Kokkos::Iterate and Kokkos::MDRangePolicy inside a namespace
// named Experimental, so both names can also be reached through it.
// NOTE(review): the enclosing namespace and the closing brace are missing
// from this listing.
546 namespace Experimental {
547 using Kokkos::Iterate;
548 using Kokkos::MDRangePolicy;
// Everything from here on is compiled only when
// KOKKOS_ENABLE_DEPRECATED_CODE is defined.
554 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE
560 namespace Experimental {
// md_parallel_for(range, f, str): overload for execution spaces other than
// Cuda and ROCm (see the negated std::is_same tests in the enable_if
// condition). It wraps the functor in an MDFunctor and refers to the range's
// impl_range_policy.
// NOTE(review): the head of the enable_if condition and the final dispatch
// call are missing from this listing.
562 template <
typename MDRange,
typename Functor,
typename Enable =
void>
563 void md_parallel_for(
564 MDRange
const& range, Functor
const& f,
const std::string& str =
"",
565 typename std::enable_if<
567 #
if defined(KOKKOS_ENABLE_CUDA)
568 && !std::is_same<
typename MDRange::range_policy::execution_space,
571 #
if defined(KOKKOS_ENABLE_ROCM)
572 && !std::is_same<
typename MDRange::range_policy::execution_space,
573 Kokkos::Experimental::ROCm>::value
// Adapter object built from the MD range and the user functor.
576 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
578 using range_policy =
typename MDRange::impl_range_policy;
// md_parallel_for(str, range, f): label-first overload for execution spaces
// other than Cuda and ROCm; otherwise the visible lines match the
// (range, f, str) overload.
// NOTE(review): the head of the enable_if condition and the final dispatch
// call are missing from this listing.
584 template <
typename MDRange,
typename Functor>
585 void md_parallel_for(
586 const std::string& str, MDRange
const& range, Functor
const& f,
587 typename std::enable_if<
589 #
if defined(KOKKOS_ENABLE_CUDA)
590 && !std::is_same<
typename MDRange::range_policy::execution_space,
593 #
if defined(KOKKOS_ENABLE_ROCM)
594 && !std::is_same<
typename MDRange::range_policy::execution_space,
595 Kokkos::Experimental::ROCm>::value
// Adapter object built from the MD range and the user functor.
598 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
600 using range_policy =
typename MDRange::impl_range_policy;
// Cuda-only overloads, compiled when building with a CUDA compiler and with
// KOKKOS_ENABLE_CUDA defined.
// md_parallel_for(str, range, f): selected when the execution space matches
// the type named in the (non-negated) std::is_same test; that type is on a
// line missing from this listing, presumably Kokkos::Cuda. It works with a
// DeviceIterateTile built from the range, the functor and the range's
// work_tag.
// NOTE(review): the rest of the DeviceIterateTile statement and the launch
// are missing from this listing.
607 #if defined(__CUDACC__) && defined(KOKKOS_ENABLE_CUDA)
608 template <
typename MDRange,
typename Functor>
609 void md_parallel_for(
610 const std::string& str, MDRange
const& range, Functor
const& f,
611 typename std::enable_if<
613 #
if defined(KOKKOS_ENABLE_CUDA)
614 && std::is_same<
typename MDRange::range_policy::execution_space,
618 Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag>
// md_parallel_for(range, f, str): overload with the label last, enabled by a
// non-negated std::is_same test on the execution space (the type it is
// compared with is missing from this listing, presumably Kokkos::Cuda). Like
// its label-first sibling it works with a DeviceIterateTile.
// NOTE(review): the rest of the DeviceIterateTile statement and the launch
// are missing from this listing.
623 template <
typename MDRange,
typename Functor>
624 void md_parallel_for(
625 MDRange
const& range, Functor
const& f,
const std::string& str =
"",
626 typename std::enable_if<
628 #
if defined(KOKKOS_ENABLE_CUDA)
629 && std::is_same<
typename MDRange::range_policy::execution_space,
633 Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag>
// md_parallel_reduce(range, f, v, str): reduction overload for execution
// spaces other than Cuda and ROCm. The functor is wrapped in an MDFunctor
// and the work is described by a one-dimensional impl_range_policy over
// [0, range.m_num_tiles) with chunk size 1; `v` is passed along as the last
// argument of the dispatch.
// NOTE(review): the head of the enable_if condition, the end of the
// MDFunctor statement and the name of the function being called are missing
// from this listing.
643 template <
typename MDRange,
typename Functor,
typename ValueType>
644 void md_parallel_reduce(
645 MDRange
const& range, Functor
const& f, ValueType& v,
646 const std::string& str =
"",
647 typename std::enable_if<
649 #
if defined(KOKKOS_ENABLE_CUDA)
650 && !std::is_same<
typename MDRange::range_policy::execution_space,
653 #
if defined(KOKKOS_ENABLE_ROCM)
654 && !std::is_same<
typename MDRange::range_policy::execution_space,
655 Kokkos::Experimental::ROCm>::value
658 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range,
661 using range_policy =
typename MDRange::impl_range_policy;
// Arguments of the dispatch: label, 1-D policy over the tiles, adapter, and
// the reduction result.
663 str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v);
// md_parallel_reduce(str, range, f, ...): label-first reduction overload for
// execution spaces other than Cuda and ROCm; the visible body matches the
// (range, f, v, str) overload.
// NOTE(review): the parameter line declaring `v`, the head of the enable_if
// condition, the end of the MDFunctor statement and the name of the function
// being called are missing from this listing.
666 template <
typename MDRange,
typename Functor,
typename ValueType>
667 void md_parallel_reduce(
668 const std::string& str, MDRange
const& range, Functor
const& f,
670 typename std::enable_if<
672 #
if defined(KOKKOS_ENABLE_CUDA)
673 && !std::is_same<
typename MDRange::range_policy::execution_space,
676 #
if defined(KOKKOS_ENABLE_ROCM)
677 && !std::is_same<
typename MDRange::range_policy::execution_space,
678 Kokkos::Experimental::ROCm>::value
681 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range,
684 using range_policy =
typename MDRange::impl_range_policy;
// Arguments of the dispatch: label, 1-D policy over the tiles, adapter, and
// the reduction result.
687 str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v);
697 namespace Experimental {
// Specialization of PolicyPropertyAdaptor for the combination of a work-item
// property and an MDRangePolicy. policy_in_t names the incoming policy type;
// the second typedef spells out an MDRangePolicy with the incoming policy's
// execution space, schedule type, work tag, index type, iteration pattern
// and launch bounds, followed by the work-item property.
// NOTE(review): the name of the second typedef and the closing braces are
// missing from this listing.
700 template <
unsigned long P,
class... Properties>
701 struct PolicyPropertyAdaptor<WorkItemProperty::ImplWorkItemProperty<P>,
702 MDRangePolicy<Properties...>> {
703 typedef MDRangePolicy<Properties...> policy_in_t;
704 typedef MDRangePolicy<
typename policy_in_t::traits::execution_space,
705 typename policy_in_t::traits::schedule_type,
706 typename policy_in_t::traits::work_tag,
707 typename policy_in_t::traits::index_type,
708 typename policy_in_t::traits::iteration_pattern,
709 typename policy_in_t::traits::launch_bounds,
710 WorkItemProperty::ImplWorkItemProperty<P>>
718 #endif // KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
Declaration of various MemoryLayout options.
Declaration of parallel operators.
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename std::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=nullptr)
Parallel reduction.
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename std::enable_if< Kokkos::Impl::is_execution_policy< ExecPolicy >::value >::type *=nullptr)
Execute functor in parallel according to the execution policy.
KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View< D, P...> &V)
Temporary free function rank() until rank() is implemented in the View.