44 #ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
45 #define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
47 #include <initializer_list>
51 #include<impl/KokkosExp_Host_IterateTile.hpp>
52 #include <Kokkos_ExecPolicy.hpp>
55 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
56 #include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
57 #include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
60 #if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM )
62 #include <ROCm/KokkosExp_ROCm_IterateTile_Refactor.hpp>
// Trait supplying the outer iteration direction that MDRangePolicy uses when
// its iteration pattern leaves the outer direction as Iterate::Default.
// NOTE(review): as far as this listing shows, the choice depends only on the
// build macros below, not on ExecSpace.
78 template <
typename ExecSpace>
79 struct default_outer_direction
82 #if defined( KOKKOS_ENABLE_CUDA)||defined( KOKKOS_ENABLE_ROCM)
83 static constexpr Iterate value = Iterate::Left;
// Presumably the #else branch (the #else/#endif lines are not shown in this
// listing) -- TODO confirm.
85 static constexpr Iterate value = Iterate::Right;
// Trait supplying the inner iteration direction that MDRangePolicy uses when
// its iteration pattern leaves the inner direction as Iterate::Default.
// NOTE(review): as far as this listing shows, the choice depends only on the
// build macros below, not on ExecSpace.
89 template <
typename ExecSpace>
90 struct default_inner_direction
93 #if defined( KOKKOS_ENABLE_CUDA)||defined( KOKKOS_ENABLE_ROCM)
94 static constexpr Iterate value = Iterate::Left;
// Presumably the #else branch (the #else/#endif lines are not shown in this
// listing) -- TODO confirm.
96 static constexpr Iterate value = Iterate::Right;
// Compile-time description of a multi-dimensional iteration pattern.
//   N        - number of dimensions; the static_asserts below accept 2 through 6
//   OuterDir - outer iteration direction (Iterate::Default unless specified)
//   InnerDir - inner iteration direction (Iterate::Default unless specified)
// NOTE(review): the struct declaration line itself is missing from this
// listing; the name Rank is taken from the iteration_pattern alias below.
102 template <
unsigned N
103 , Iterate OuterDir = Iterate::Default
104 , Iterate InnerDir = Iterate::Default
108 static_assert( N != 0u,
"Kokkos Error: rank 0 undefined");
109 static_assert( N != 1u,
"Kokkos Error: rank 1 is not a multi-dimensional range");
110 static_assert( N < 7u,
"Kokkos Error: Unsupported rank...");
// Self-referential alias so the type can be queried for its own pattern.
112 using iteration_pattern = Rank<N, OuterDir, InnerDir>;
// The template arguments re-exposed as static members.
114 static constexpr
int rank = N;
115 static constexpr Iterate outer_direction = OuterDir;
116 static constexpr Iterate inner_direction = InnerDir;
// Multi-dimensional range execution policy. Derives from
// Kokkos::Impl::PolicyTraits<Properties...> and requires (static_assert below)
// that the properties define a non-void iteration_pattern.
// NOTE(review): the struct declaration line and several member declarations
// are missing from this listing.
121 template <
typename... Properties>
123 :
public Kokkos::Impl::PolicyTraits<Properties ...>
125 using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
126 using range_policy = RangePolicy<Properties...>;
// Execution space instance; returned by space().
128 typename traits::execution_space m_space;
// RangePolicy assembled from this policy's execution space, schedule type and
// index type (any further template arguments are not visible in this listing).
130 using impl_range_policy = RangePolicy<
typename traits::execution_space
131 ,
typename traits::schedule_type
132 ,
typename traits::index_type
135 typedef MDRangePolicy execution_policy;
// Every MDRangePolicy instantiation is a friend of every other one; the
// converting constructor reads another instantiation's m_* members.
137 template<
class ... OtherProperties>
138 friend struct MDRangePolicy;
140 static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
141 ,
"Kokkos Error: MD iteration pattern not defined" );
143 using iteration_pattern =
typename traits::iteration_pattern;
144 using work_tag =
typename traits::work_tag;
145 using launch_bounds =
typename traits::launch_bounds;
146 using member_type =
typename range_policy::member_type;
// Number of dimensions, taken from the iteration pattern.
148 enum {
rank =
static_cast<int>(iteration_pattern::rank) };
150 using index_type =
typename traits::index_type;
// Type the initializer-list bounds and tile sizes are cast to in init().
151 using array_index_type = long;
// Filled in by the tile-setup code: tile count per dimension, product of the
// per-dimension tile counts, and product of the per-dimension tile extents.
165 point_type m_tile_end;
166 index_type m_num_tiles;
167 index_type m_prod_tile_dims;
// Effective outer direction as an int: the pattern's own direction when it is
// not Iterate::Default, otherwise default_outer_direction for the execution space.
186 static constexpr
int outer_direction =
static_cast<int> (
187 (iteration_pattern::outer_direction != Iterate::Default)
188 ? iteration_pattern::outer_direction
189 : default_outer_direction< typename traits::execution_space>::value );
// Effective inner direction as an int, resolved the same way via
// default_inner_direction.
191 static constexpr
int inner_direction =
static_cast<int> (
192 iteration_pattern::inner_direction != Iterate::Default
193 ? iteration_pattern::inner_direction
194 : default_inner_direction< typename traits::execution_space>::value ) ;
// Integer forms of Iterate::Right / Iterate::Left, used for the comparisons
// against inner_direction in the tile-setup code.
197 static constexpr
int Right =
static_cast<int>( Iterate::Right );
198 static constexpr
int Left =
static_cast<int>( Iterate::Left );
// Returns a const reference to the execution space instance held in m_space.
200 KOKKOS_INLINE_FUNCTION
const typename traits::execution_space & space()
const {
return m_space ; }
// Constructs the policy from brace-enclosed bounds.
//   lower, upper - per-dimension bounds
//   tile         - optional per-dimension tile extents (defaults to an empty list)
// All work is forwarded to init(lower, upper, tile). The member-initializer
// list and opening brace (if any) are not visible in this listing.
201 template <
typename LT ,
typename UT ,
typename TT = array_index_type >
202 MDRangePolicy(std::initializer_list<LT>
const& lower, std::initializer_list<UT>
const& upper, std::initializer_list<TT>
const& tile = {} )
204 init(lower, upper, tile);
// Constructs the policy from brace-enclosed bounds on an explicit execution
// space instance.
//   work_space   - execution space instance, stored in m_space
//   lower, upper - per-dimension bounds
//   tile         - optional per-dimension tile extents (defaults to an empty list)
// Bounds and tiling are set up by init(lower, upper, tile).
207 template <
typename LT ,
typename UT ,
typename TT = array_index_type >
208 MDRangePolicy(
const typename traits::execution_space & work_space,
209 std::initializer_list<LT>
const& lower, std::initializer_list<UT>
const& upper, std::initializer_list<TT>
const& tile = {} )
210 : m_space( work_space ) {
211 init(lower, upper, tile);
// Constructs the policy from point_type bounds and an optional tile_type
// (value-initialized by default). m_prod_tile_dims starts at 1.
// NOTE(review): the rest of the member-initializer list and the constructor
// body are not visible in this listing.
214 MDRangePolicy( point_type
const& lower, point_type
const& upper, tile_type
const& tile = tile_type{} )
220 , m_prod_tile_dims(1) {
// Same as the point_type constructor, but on an explicit execution space
// instance, which is stored in m_space. m_prod_tile_dims starts at 1.
// NOTE(review): the rest of the member-initializer list and the constructor
// body are not visible in this listing.
224 MDRangePolicy(
const typename traits::execution_space & work_space,
225 point_type
const& lower, point_type
const& upper, tile_type
const& tile = tile_type{} )
226 : m_space( work_space )
231 , m_prod_tile_dims(1) {
// Converting constructor from an MDRangePolicy with a different property list.
// Copies m_tile_end, m_num_tiles and m_prod_tile_dims from p; the initializers
// for the remaining members are not visible in this listing.
// NOTE(review): p is taken by value, so the source policy is copied once
// before its members are copied again; a const reference would avoid that.
235 template<
class ... OtherProperties>
236 MDRangePolicy(
const MDRangePolicy<OtherProperties...> p ):
241 m_tile_end(p.m_tile_end),
242 m_num_tiles(p.m_num_tiles),
243 m_prod_tile_dims(p.m_prod_tile_dims) {}
// Tile-setup logic operating on the already-populated m_lower/m_upper/m_tile.
// NOTE(review): the enclosing function's signature is not visible in this
// listing (presumably the no-argument init() used by the point_type
// constructors -- TODO confirm).
// For each dimension it picks a default tile extent when m_tile[i] <= 0,
// computes the number of tiles along that dimension (rounded up) and
// accumulates m_num_tiles and m_prod_tile_dims.
// First section: guarded by "execution space is neither Kokkos::Cuda nor ROCm".
250 #
if defined(KOKKOS_ENABLE_CUDA)
251 && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
253 #
if defined(KOKKOS_ENABLE_ROCM)
254 && !std::is_same< typename traits::execution_space, Kokkos::Experimental::ROCm >::value
259 for (
int i=0; i<
rank; ++i) {
260 span = m_upper[i] - m_lower[i];
261 if ( m_tile[i] <= 0 ) {
262 if ( ((
int)inner_direction == (
int)Right && (i < rank-1))
263 || ((
int)inner_direction == (
int)Left && (i > 0)) )
// Presumably the else branch of the test above (the lines in between are not
// shown): the tile spans the whole extent, or 1 when the extent is 0.
268 m_tile[i] = (span == 0 ? 1 : span);
// Number of tiles along dimension i: span divided by the tile extent, rounded up.
271 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
272 m_num_tiles *= m_tile_end[i];
273 m_prod_tile_dims *= m_tile[i];
// CUDA-only section of the same computation. Dimensions are visited from
// rank_start to rank_end in steps of increment (their values are set in lines
// not shown here).
276 #if defined(KOKKOS_ENABLE_CUDA)
283 if((
int)inner_direction == (int)Right) {
288 for (
int i=rank_start; i!=rank_end; i+=increment) {
289 span = m_upper[i] - m_lower[i];
290 if ( m_tile[i] <= 0 ) {
293 if ( ((
int)inner_direction == (int)Right && (i < rank-1))
294 || ((
int)inner_direction == (int)Left && (i > 0)) )
296 if ( m_prod_tile_dims < 256 ) {
306 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
307 m_num_tiles *= m_tile_end[i];
308 m_prod_tile_dims *= m_tile[i];
// Abort when the product of the tile extents exceeds 1024; the message below
// reports this as the maximum number of threads per block.
310 if ( m_prod_tile_dims > 1024 ) {
311 printf(
" Tile dimensions exceed Cuda limits\n");
312 Kokkos::abort(
" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// ROCm-only section; mirrors the CUDA section above.
317 #if defined(KOKKOS_ENABLE_ROCM)
324 if((
int)inner_direction == (
int)Right) {
329 for (
int i=rank_start; i!=rank_end; i+=increment) {
330 span = m_upper[i] - m_lower[i];
331 if ( m_tile[i] <= 0 ) {
334 if ( ((
int)inner_direction == (
int)Right && (i < rank-1))
335 || ((
int)inner_direction == (
int)Left && (i > 0)) )
337 if ( m_prod_tile_dims < 256 ) {
347 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
348 m_num_tiles *= m_tile_end[i];
349 m_prod_tile_dims *= m_tile[i];
// Same 1024 limit on the product of tile extents as in the CUDA section.
351 if ( m_prod_tile_dims > 1024 ) {
352 printf(
" Tile dimensions exceed ROCm limits\n");
353 Kokkos::abort(
" ROCm ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// Initializes bounds and tiling from initializer lists.
//   lower, upper - per-dimension bounds; each list must hold exactly `rank`
//                  values, otherwise Kokkos::abort is called
//   tile         - optional per-dimension tile extents; copied into m_tile only
//                  when the list holds exactly `rank` values
// Fix: the size check now inspects the `lower` / `upper` arguments. It used to
// test m_lower.size() / m_upper.size(), i.e. the members rather than the
// caller's lists, so the lists indexed in the loop below were never validated.
360 template <
typename LT ,
typename UT ,
typename TT = array_index_type >
361 void init( std::initializer_list<LT>
const& lower, std::initializer_list<UT>
const& upper, std::initializer_list<TT>
const& tile = {} )
363 if(static_cast<int>(lower.size()) != rank || static_cast<int>(upper.size()) != rank)
364 Kokkos::abort(
"MDRangePolicy: Constructor initializer lists have wrong size");
// Copy the bounds (and tile extents, when supplied for every dimension) into
// the members, converting each value to array_index_type.
366 for (
auto i = 0; i <
rank; ++i ) {
367 m_lower[i] =
static_cast<array_index_type
>(lower.begin()[i]);
368 m_upper[i] =
static_cast<array_index_type
>(upper.begin()[i]);
369 if(static_cast<int>(tile.size())==rank)
370 m_tile[i] =
static_cast<array_index_type
>(tile.begin()[i]);
// Reset the accumulator before the per-dimension products are formed below.
376 m_prod_tile_dims = 1;
// First section: guarded by "execution space is neither Kokkos::Cuda nor ROCm".
380 #
if defined(KOKKOS_ENABLE_CUDA)
381 && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
383 #if defined(KOKKOS_ENABLE_ROCM)
384 && !std::is_same< typename traits::execution_space, Kokkos::Experimental::ROCm >::value
389 for (
int i=0; i<
rank; ++i) {
390 span = m_upper[i] - m_lower[i];
391 if ( m_tile[i] <= 0 ) {
392 if ( ((
int)inner_direction == (
int)Right && (i < rank-1))
393 || ((int)inner_direction == (
int)Left && (i > 0)) )
// Presumably the else branch of the test above (the lines in between are not
// shown): the tile spans the whole extent, or 1 when the extent is 0.
398 m_tile[i] = (span == 0 ? 1 : span);
// Number of tiles along dimension i: span divided by the tile extent, rounded up.
401 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
402 m_num_tiles *= m_tile_end[i];
403 m_prod_tile_dims *= m_tile[i];
// CUDA-only section of the same computation. Dimensions are visited from
// rank_start to rank_end in steps of increment (their values are set in lines
// not shown here).
406 #if defined(KOKKOS_ENABLE_CUDA)
413 if((
int)inner_direction == (int)Right) {
418 for (
int i=rank_start; i!=rank_end; i+=increment) {
419 span = m_upper[i] - m_lower[i];
420 if ( m_tile[i] <= 0 ) {
423 if ( ((
int)inner_direction == (int)Right && (i < rank-1))
424 || ((
int)inner_direction == (int)Left && (i > 0)) )
426 if ( m_prod_tile_dims < 256 ) {
436 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
437 m_num_tiles *= m_tile_end[i];
438 m_prod_tile_dims *= m_tile[i];
// Abort when the product of the tile extents exceeds 1024; the message below
// reports this as the maximum number of threads per block.
440 if ( m_prod_tile_dims > 1024 ) {
441 printf(
" Tile dimensions exceed Cuda limits\n");
442 Kokkos::abort(
" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// ROCm-only section; mirrors the CUDA section above.
447 #if defined(KOKKOS_ENABLE_ROCM)
454 if((
int)inner_direction == (
int)Right) {
459 for (
int i=rank_start; i!=rank_end; i+=increment) {
460 span = m_upper[i] - m_lower[i];
461 if ( m_tile[i] <= 0 ) {
464 if ( ((
int)inner_direction == (
int)Right && (i < rank-1))
465 || ((
int)inner_direction == (
int)Left && (i > 0)) )
467 if ( m_prod_tile_dims < 256 ) {
477 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
478 m_num_tiles *= m_tile_end[i];
479 m_prod_tile_dims *= m_tile[i];
// Same 1024 limit on the product of tile extents as in the CUDA section.
481 if ( m_prod_tile_dims > 1024 ) {
482 printf(
" Tile dimensions exceed ROCm limits\n");
483 Kokkos::abort(
" ROCm ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// Makes the Kokkos-level names below also reachable as Kokkos::Experimental::*
// (presumably kept for source compatibility with older code -- TODO confirm).
495 namespace Kokkos {
namespace Experimental {
496 using Kokkos::MDRangePolicy;
498 using Kokkos::Iterate;
502 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE
507 namespace Kokkos {
namespace Experimental {
// md_parallel_for overload taking the range first and an optional label.
// The enable_if argument keeps this overload only when MDRange's execution
// space is neither Kokkos::Cuda nor Kokkos::Experimental::ROCm (each test is
// compiled in only when the corresponding backend macro is defined).
// NOTE(review): the functor parameter `f` and the actual dispatch are in lines
// not shown in this listing.
509 template <
typename MDRange,
typename Functor,
typename Enable =
void>
510 void md_parallel_for( MDRange
const& range
512 ,
const std::string& str =
""
513 ,
typename std::enable_if<(
true
514 #
if defined( KOKKOS_ENABLE_CUDA)
515 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
517 #
if defined( KOKKOS_ENABLE_ROCM)
518 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Wrap the user functor together with the range in an MDFunctor (no value type).
523 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
525 using range_policy =
typename MDRange::impl_range_policy;
// md_parallel_for overload taking the label first, then the range.
// The enable_if argument keeps this overload only when MDRange's execution
// space is neither Kokkos::Cuda nor Kokkos::Experimental::ROCm (each test is
// compiled in only when the corresponding backend macro is defined).
// NOTE(review): the functor parameter `f` and the actual dispatch are in lines
// not shown in this listing.
530 template <
typename MDRange,
typename Functor>
531 void md_parallel_for(
const std::string& str
532 , MDRange
const& range
534 ,
typename std::enable_if<(
true
535 #
if defined( KOKKOS_ENABLE_CUDA)
536 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
538 #
if defined( KOKKOS_ENABLE_ROCM)
539 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Wrap the user functor together with the range in an MDFunctor (no value type).
544 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
546 using range_policy =
typename MDRange::impl_range_policy;
552 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
// md_parallel_for overload (label first) selected when MDRange's execution
// space is Kokkos::Cuda; it sits inside the __CUDACC__ / KOKKOS_ENABLE_CUDA
// guard that precedes it.
// NOTE(review): the functor parameter `f` and the launch of the closure are in
// lines not shown in this listing.
553 template <
typename MDRange,
typename Functor>
554 void md_parallel_for(
const std::string& str
555 , MDRange
const& range
557 ,
typename std::enable_if<(
true
558 #
if defined( KOKKOS_ENABLE_CUDA)
559 && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
// Build the device tile-iteration closure from the range and the user functor.
564 Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
// md_parallel_for overload (range first, optional label) selected when
// MDRange's execution space is Kokkos::Cuda.
// NOTE(review): the functor parameter `f` and the launch of the closure are in
// lines not shown in this listing.
568 template <
typename MDRange,
typename Functor>
569 void md_parallel_for( MDRange
const& range
571 ,
const std::string& str =
""
572 ,
typename std::enable_if<(
true
573 #
if defined( KOKKOS_ENABLE_CUDA)
574 && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
// Build the device tile-iteration closure from the range and the user functor.
579 Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
// md_parallel_reduce overload taking the range first and an optional label.
// The enable_if argument keeps this overload only when MDRange's execution
// space is neither Kokkos::Cuda nor Kokkos::Experimental::ROCm (each test is
// compiled in only when the corresponding backend macro is defined).
// NOTE(review): the functor and result parameters and the actual dispatch are
// in lines not shown in this listing.
588 template <
typename MDRange,
typename Functor,
typename ValueType>
589 void md_parallel_reduce( MDRange
const& range
592 ,
const std::string& str =
""
593 ,
typename std::enable_if<(
true
594 #
if defined( KOKKOS_ENABLE_CUDA)
595 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
597 #
if defined( KOKKOS_ENABLE_ROCM)
598 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Wrap the user functor together with the range in an MDFunctor typed on ValueType.
603 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
605 using range_policy =
typename MDRange::impl_range_policy;
// md_parallel_reduce overload taking the label first, then the range.
// The enable_if argument keeps this overload only when MDRange's execution
// space is neither Kokkos::Cuda nor Kokkos::Experimental::ROCm (each test is
// compiled in only when the corresponding backend macro is defined).
// NOTE(review): the functor and result parameters and the actual dispatch are
// in lines not shown in this listing.
609 template <
typename MDRange,
typename Functor,
typename ValueType>
610 void md_parallel_reduce(
const std::string& str
611 , MDRange
const& range
614 ,
typename std::enable_if<(
true
615 #
if defined( KOKKOS_ENABLE_CUDA)
616 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
618 #
if defined( KOKKOS_ENABLE_ROCM)
619 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Wrap the user functor together with the range in an MDFunctor typed on ValueType.
624 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
626 using range_policy =
typename MDRange::impl_range_policy;
637 namespace Experimental {
// PolicyPropertyAdaptor specialization for MDRangePolicy.
//   policy_in_t  - the incoming MDRangePolicy<Properties...>
//   policy_out_t - an MDRangePolicy rebuilt from policy_in_t's traits
//                  (execution space, schedule, work tag, index type, iteration
//                  pattern, launch bounds) with
//                  WorkItemProperty::ImplWorkItemProperty<P> appended
640 template<
unsigned long P,
class ... Properties>
641 struct PolicyPropertyAdaptor<WorkItemProperty::ImplWorkItemProperty<P>,MDRangePolicy<Properties...>> {
642 typedef MDRangePolicy<Properties...> policy_in_t;
643 typedef MDRangePolicy<
typename policy_in_t::traits::execution_space,
644 typename policy_in_t::traits::schedule_type,
645 typename policy_in_t::traits::work_tag,
646 typename policy_in_t::traits::index_type,
647 typename policy_in_t::traits::iteration_pattern,
648 typename policy_in_t::traits::launch_bounds,
649 WorkItemProperty::ImplWorkItemProperty<P>> policy_out_t;
657 #endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< Kokkos::Impl::is_execution_policy< ExecPolicy >::value >::type *=0)
Execute functor in parallel according to the execution policy.
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename Impl::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=0)
Parallel reduction.
Declaration of various MemoryLayout options.
Declaration of parallel operators.
KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View< D, P...> &V)
Temporary free function rank() until rank() is implemented in the View.