44 #ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
45 #define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
47 #include <initializer_list>
51 #include<impl/KokkosExp_Host_IterateTile.hpp>
52 #include <Kokkos_ExecPolicy.hpp>
55 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
56 #include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
57 #include <Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp>
60 #if defined( __HCC__ ) && defined( KOKKOS_ENABLE_ROCM )
62 #include <ROCm/KokkosExp_ROCm_IterateTile_Refactor.hpp>
// Compile-time default for the outer iteration direction. MDRangePolicy falls
// back to default_outer_direction<execution_space>::value when its iteration
// pattern specifies Iterate::Default for the outer direction.
78 template <
typename ExecSpace>
79 struct default_outer_direction
// NOTE: in the lines visible here the choice is keyed on build macros only;
// the ExecSpace template parameter is not consulted.
82 #if defined( KOKKOS_ENABLE_CUDA)||defined( KOKKOS_ENABLE_ROCM)
83 static constexpr Iterate value = Iterate::Left;
// Presumably the #else branch (the directive itself is missing from this listing).
85 static constexpr Iterate value = Iterate::Right;
// Compile-time default for the inner iteration direction. MDRangePolicy falls
// back to default_inner_direction<execution_space>::value when its iteration
// pattern specifies Iterate::Default for the inner direction.
89 template <
typename ExecSpace>
90 struct default_inner_direction
// NOTE: in the lines visible here the choice is keyed on build macros only;
// the ExecSpace template parameter is not consulted.
93 #if defined( KOKKOS_ENABLE_CUDA)||defined( KOKKOS_ENABLE_ROCM)
94 static constexpr Iterate value = Iterate::Left;
// Presumably the #else branch (the directive itself is missing from this listing).
96 static constexpr Iterate value = Iterate::Right;
// Compile-time descriptor of a multi-dimensional iteration pattern: the number
// of dimensions N plus the requested outer and inner iteration directions.
// Both directions default to Iterate::Default.
// (The struct header line itself is missing from this listing; the name Rank
// is taken from the iteration_pattern alias below.)
102 template <
unsigned N
103 , Iterate OuterDir = Iterate::Default
104 , Iterate InnerDir = Iterate::Default
// Only ranks 2 through 6 are accepted; any other N fails to compile with the
// corresponding message.
108 static_assert( N != 0u,
"Kokkos Error: rank 0 undefined");
109 static_assert( N != 1u,
"Kokkos Error: rank 1 is not a multi-dimensional range");
110 static_assert( N < 7u,
"Kokkos Error: Unsupported rank...");
// Alias for this exact specialization.
112 using iteration_pattern = Rank<N, OuterDir, InnerDir>;
// The template arguments re-exported as static constants (rank as a signed int).
114 static constexpr
int rank = N;
115 static constexpr Iterate outer_direction = OuterDir;
116 static constexpr Iterate inner_direction = InnerDir;
// Multi-dimensional range policy, parameterized by a pack of policy properties
// that are analyzed by Kokkos::Impl::PolicyTraits (the public base class).
// (The struct header line itself is missing from this listing.)
121 template <
typename... Properties>
123 :
public Kokkos::Impl::PolicyTraits<Properties ...>
125 using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
// RangePolicy built from the same property pack.
126 using range_policy = RangePolicy<Properties...>;
// RangePolicy built only from selected traits (execution space, schedule type,
// index type); any further template arguments are on lines not visible here.
128 using impl_range_policy = RangePolicy<
typename traits::execution_space
129 ,
typename traits::schedule_type
130 ,
typename traits::index_type
133 typedef MDRangePolicy execution_policy;
// An iteration pattern must have been supplied among the properties;
// otherwise traits::iteration_pattern is void and compilation stops here.
135 static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
136 ,
"Kokkos Error: MD iteration pattern not defined" );
138 using iteration_pattern =
typename traits::iteration_pattern;
139 using work_tag =
typename traits::work_tag;
140 using launch_bounds =
typename traits::launch_bounds;
141 using member_type =
typename range_policy::member_type;
// Number of dimensions, taken from the iteration pattern and exposed as an
// int-valued enumerator.
143 enum {
rank =
static_cast<int>(iteration_pattern::rank) };
145 using index_type =
typename traits::index_type;
// Element type used when the constructors cast bound and tile values.
146 using array_index_type = long;
// Per-dimension tile count; the constructors set it to ceil(span / tile size).
160 point_type m_tile_end;
// Product of m_tile_end over all dimensions (accumulated by the constructors).
161 index_type m_num_tiles;
// Product of the tile sizes over all dimensions (accumulated by the constructors).
162 index_type m_prod_tile_dims;
// Effective outer direction as an int: the pattern's own value unless it is
// Iterate::Default, in which case the default for the execution space is used.
181 static constexpr
int outer_direction =
static_cast<int> (
182 (iteration_pattern::outer_direction != Iterate::Default)
183 ? iteration_pattern::outer_direction
184 : default_outer_direction< typename traits::execution_space>::value );
// Effective inner direction as an int, resolved the same way.
186 static constexpr
int inner_direction =
static_cast<int> (
187 iteration_pattern::inner_direction != Iterate::Default
188 ? iteration_pattern::inner_direction
189 : default_inner_direction< typename traits::execution_space>::value ) ;
// Integer constants for comparing against outer_direction / inner_direction.
192 static constexpr
int Right =
static_cast<int>( Iterate::Right );
193 static constexpr
int Left =
static_cast<int>( Iterate::Left );
// Constructs the policy from lower and upper bound points and an optional
// per-dimension tile size (defaults to a value-initialized tile_type).
// For every dimension it computes the tile count m_tile_end[i] and accumulates
// m_num_tiles and m_prod_tile_dims. Three variants of that setup are visible:
// a generic loop whose guarding condition excludes the Cuda and ROCm execution
// spaces, a section under KOKKOS_ENABLE_CUDA, and one under KOKKOS_ENABLE_ROCM.
// NOTE(review): many lines of this constructor (remaining member initializers,
// braces, else branches, #endif directives) are missing from this listing.
195 MDRangePolicy( point_type
const& lower, point_type
const& upper, tile_type
const& tile = tile_type{} )
200 , m_prod_tile_dims(1)
204 #
if defined(KOKKOS_ENABLE_CUDA)
205 && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
207 #
if defined(KOKKOS_ENABLE_ROCM)
208 && !std::is_same< typename traits::execution_space, Kokkos::Experimental::ROCm >::value
// Generic path: visit the dimensions in index order.
213 for (
int i=0; i<
rank; ++i) {
214 span = upper[i] - lower[i];
// A non-positive tile size means "not specified": pick a default.
215 if ( m_tile[i] <= 0 ) {
// True for every dimension except the last (inner direction Right) or the
// first (inner direction Left); the assignment taken in that case is on a
// line not visible here.
216 if ( ((
int)inner_direction == (
int)Right && (i < rank-1))
217 || ((
int)inner_direction == (
int)Left && (i > 0)) )
// Tile spans the whole extent of this dimension, but is never zero.
// Presumably the else branch of the test above - TODO confirm.
222 m_tile[i] = (span == 0 ? 1 : span);
// Number of tiles along dimension i: ceil(span / m_tile[i]).
225 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
226 m_num_tiles *= m_tile_end[i];
227 m_prod_tile_dims *= m_tile[i];
// Cuda section: same bookkeeping, with the traversal order chosen from
// inner_direction via rank_start / rank_end / increment (their assignments are
// on lines not visible here).
230 #if defined(KOKKOS_ENABLE_CUDA)
237 if((
int)inner_direction == (int)Right) {
242 for (
int i=rank_start; i!=rank_end; i+=increment) {
243 span = m_upper[i] - m_lower[i];
244 if ( m_tile[i] <= 0 ) {
247 if ( ((
int)inner_direction == (int)Right && (i < rank-1))
248 || ((
int)inner_direction == (int)Left && (i > 0)) )
// The default tile size chosen here depends on whether the running product of
// tile sizes is still below 256; the assignments are not visible.
250 if ( m_prod_tile_dims < 256 ) {
260 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
261 m_num_tiles *= m_tile_end[i];
262 m_prod_tile_dims *= m_tile[i];
// Reject tilings whose total tile volume exceeds 1024 (reported below as the
// maximum number of threads per block).
264 if ( m_prod_tile_dims > 1024 ) {
265 printf(
" Tile dimensions exceed Cuda limits\n");
266 Kokkos::abort(
" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// ROCm section: mirrors the Cuda section above with ROCm-specific messages.
271 #if defined(KOKKOS_ENABLE_ROCM)
278 if((
int)inner_direction == (
int)Right) {
283 for (
int i=rank_start; i!=rank_end; i+=increment) {
284 span = m_upper[i] - m_lower[i];
285 if ( m_tile[i] <= 0 ) {
288 if ( ((
int)inner_direction == (
int)Right && (i < rank-1))
289 || ((
int)inner_direction == (
int)Left && (i > 0)) )
291 if ( m_prod_tile_dims < 256 ) {
301 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
302 m_num_tiles *= m_tile_end[i];
303 m_prod_tile_dims *= m_tile[i];
305 if ( m_prod_tile_dims > 1024 ) {
306 printf(
" Tile dimensions exceed ROCm limits\n");
307 Kokkos::abort(
" ROCm ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// Constructs the policy from brace-enclosed lists of lower bounds, upper bounds
// and (optionally) tile sizes. Element types LT / UT / TT may differ from
// array_index_type; every value is cast to array_index_type when stored.
//   lower, upper : must each contain exactly `rank` entries, otherwise
//                  Kokkos::abort is called.
//   tile         : copied only when it contains exactly `rank` entries (the
//                  handling of any other size is on lines not visible here).
// After the bounds are stored, the per-dimension tile counts and the running
// products m_num_tiles / m_prod_tile_dims are computed in three variants: a
// generic loop whose guarding condition excludes the Cuda and ROCm execution
// spaces, a section under KOKKOS_ENABLE_CUDA, and one under KOKKOS_ENABLE_ROCM.
// NOTE(review): many lines of this constructor (braces, else branches, #endif
// directives) are missing from this listing and are left untouched.
315 template <
typename LT ,
typename UT ,
typename TT = array_index_type >
316 MDRangePolicy( std::initializer_list<LT>
const& lower, std::initializer_list<UT>
const& upper, std::initializer_list<TT>
const& tile = {} )
// Validate the ARGUMENT lists, not the members: the loop below indexes
// lower.begin()[i] and upper.begin()[i] for every i < rank, so a short list
// would otherwise be read out of bounds.
319 if(static_cast<int>(lower.size()) != rank || static_cast<int>(upper.size()) != rank)
320 Kokkos::abort(
"MDRangePolicy: Constructor initializer lists have wrong size");
// Copy the bounds (and the tile sizes, when a full list was supplied).
322 for (
auto i = 0; i <
rank; ++i ) {
323 m_lower[i] =
static_cast<array_index_type
>(lower.begin()[i]);
324 m_upper[i] =
static_cast<array_index_type
>(upper.begin()[i]);
325 if(static_cast<int>(tile.size())==rank)
326 m_tile[i] =
static_cast<array_index_type
>(tile.begin()[i]);
332 m_prod_tile_dims = 1;
336 #
if defined(KOKKOS_ENABLE_CUDA)
337 && !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
339 #if defined(KOKKOS_ENABLE_ROCM)
340 && !std::is_same< typename traits::execution_space, Kokkos::Experimental::ROCm >::value
// Generic path: visit the dimensions in index order.
345 for (
int i=0; i<
rank; ++i) {
346 span = m_upper[i] - m_lower[i];
// A non-positive tile size means "not specified": pick a default.
347 if ( m_tile[i] <= 0 ) {
// True for every dimension except the last (inner direction Right) or the
// first (inner direction Left); the assignment taken in that case is on a
// line not visible here.
348 if ( ((
int)inner_direction == (
int)Right && (i < rank-1))
349 || ((int)inner_direction == (
int)Left && (i > 0)) )
// Tile spans the whole extent of this dimension, but is never zero.
// Presumably the else branch of the test above - TODO confirm.
354 m_tile[i] = (span == 0 ? 1 : span);
// Number of tiles along dimension i: ceil(span / m_tile[i]).
357 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
358 m_num_tiles *= m_tile_end[i];
359 m_prod_tile_dims *= m_tile[i];
// Cuda section: same bookkeeping, with the traversal order chosen from
// inner_direction via rank_start / rank_end / increment (their assignments are
// on lines not visible here).
362 #if defined(KOKKOS_ENABLE_CUDA)
369 if((
int)inner_direction == (int)Right) {
374 for (
int i=rank_start; i!=rank_end; i+=increment) {
375 span = m_upper[i] - m_lower[i];
376 if ( m_tile[i] <= 0 ) {
379 if ( ((
int)inner_direction == (int)Right && (i < rank-1))
380 || ((
int)inner_direction == (int)Left && (i > 0)) )
// The default tile size chosen here depends on whether the running product of
// tile sizes is still below 256; the assignments are not visible.
382 if ( m_prod_tile_dims < 256 ) {
392 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
393 m_num_tiles *= m_tile_end[i];
394 m_prod_tile_dims *= m_tile[i];
// Reject tilings whose total tile volume exceeds 1024 (reported below as the
// maximum number of threads per block).
396 if ( m_prod_tile_dims > 1024 ) {
397 printf(
" Tile dimensions exceed Cuda limits\n");
398 Kokkos::abort(
" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// ROCm section: mirrors the Cuda section above with ROCm-specific messages.
403 #if defined(KOKKOS_ENABLE_ROCM)
410 if((
int)inner_direction == (
int)Right) {
415 for (
int i=rank_start; i!=rank_end; i+=increment) {
416 span = m_upper[i] - m_lower[i];
417 if ( m_tile[i] <= 0 ) {
420 if ( ((
int)inner_direction == (
int)Right && (i < rank-1))
421 || ((
int)inner_direction == (
int)Left && (i > 0)) )
423 if ( m_prod_tile_dims < 256 ) {
433 m_tile_end[i] =
static_cast<index_type
>((span + m_tile[i] - 1) / m_tile[i]);
434 m_num_tiles *= m_tile_end[i];
435 m_prod_tile_dims *= m_tile[i];
437 if ( m_prod_tile_dims > 1024 ) {
438 printf(
" Tile dimensions exceed ROCm limits\n");
439 Kokkos::abort(
" ROCm ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
// Re-export the names below into Kokkos::Experimental so that code spelling
// them as Kokkos::Experimental::MDRangePolicy / Kokkos::Experimental::Iterate
// resolves to the Kokkos:: definitions.
451 namespace Kokkos {
namespace Experimental {
452 using Kokkos::MDRangePolicy;
454 using Kokkos::Iterate;
// The code from here on is compiled only when KOKKOS_ENABLE_DEPRECATED_CODE is
// defined (the matching #endif is not visible in this listing).
458 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE
463 namespace Kokkos {
namespace Experimental {
// md_parallel_for, (range, functor, label) argument order, non-device variant.
// The enable_if condition is true unless MDRange's execution space is
// Kokkos::Cuda or Kokkos::Experimental::ROCm; each of those tests is compiled
// in only when the corresponding backend macro is defined.
// The functor parameter `f` is declared on a line missing from this listing.
465 template <
typename MDRange,
typename Functor,
typename Enable =
void>
466 void md_parallel_for( MDRange
const& range
468 ,
const std::string& str =
""
469 ,
typename std::enable_if<(
true
470 #
if defined( KOKKOS_ENABLE_CUDA)
471 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
473 #
if defined( KOKKOS_ENABLE_ROCM)
474 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Bundle the range and the user functor into an MDFunctor (value type void).
479 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
// Presumably the policy type used for the dispatch, which is on lines not
// visible here - TODO confirm.
481 using range_policy =
typename MDRange::impl_range_policy;
// md_parallel_for, (label, range, functor) argument order, non-device variant.
// The enable_if condition is true unless MDRange's execution space is
// Kokkos::Cuda or Kokkos::Experimental::ROCm; each of those tests is compiled
// in only when the corresponding backend macro is defined.
// The functor parameter `f` is declared on a line missing from this listing.
486 template <
typename MDRange,
typename Functor>
487 void md_parallel_for(
const std::string& str
488 , MDRange
const& range
490 ,
typename std::enable_if<(
true
491 #
if defined( KOKKOS_ENABLE_CUDA)
492 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
494 #
if defined( KOKKOS_ENABLE_ROCM)
495 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Bundle the range and the user functor into an MDFunctor (value type void).
500 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, void> g(range, f);
// Presumably the policy type used for the dispatch, which is on lines not
// visible here - TODO confirm.
502 using range_policy =
typename MDRange::impl_range_policy;
// Cuda variants: compiled only when __CUDACC__ and KOKKOS_ENABLE_CUDA are both
// defined.
508 #if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
// md_parallel_for, (label, range, functor) argument order. The enable_if
// condition requires MDRange's execution space to be Kokkos::Cuda.
// The functor parameter `f` is declared on a line missing from this listing.
509 template <
typename MDRange,
typename Functor>
510 void md_parallel_for(
const std::string& str
511 , MDRange
const& range
513 ,
typename std::enable_if<(
true
514 #
if defined( KOKKOS_ENABLE_CUDA)
515 && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
// Build a DeviceIterateTile from the range and functor, tagged with the
// policy's work_tag; how the closure is launched is on lines not visible here.
520 Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
// md_parallel_for, (range, functor, label) argument order, Cuda variant.
// The enable_if condition requires MDRange's execution space to be
// Kokkos::Cuda.
// The functor parameter `f` is declared on a line missing from this listing.
524 template <
typename MDRange,
typename Functor>
525 void md_parallel_for( MDRange
const& range
527 ,
const std::string& str =
""
528 ,
typename std::enable_if<(
true
529 #
if defined( KOKKOS_ENABLE_CUDA)
530 && std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
// Build a DeviceIterateTile from the range and functor, tagged with the
// policy's work_tag; how the closure is launched is on lines not visible here.
535 Kokkos::Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
// md_parallel_reduce, range-first argument order, non-device variant.
// The enable_if condition is true unless MDRange's execution space is
// Kokkos::Cuda or Kokkos::Experimental::ROCm; each of those tests is compiled
// in only when the corresponding backend macro is defined.
// The functor parameter `f` (and presumably a ValueType result parameter) are
// declared on lines missing from this listing.
544 template <
typename MDRange,
typename Functor,
typename ValueType>
545 void md_parallel_reduce( MDRange
const& range
548 ,
const std::string& str =
""
549 ,
typename std::enable_if<(
true
550 #
if defined( KOKKOS_ENABLE_CUDA)
551 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
553 #
if defined( KOKKOS_ENABLE_ROCM)
554 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Bundle the range and the user functor into an MDFunctor carrying ValueType.
559 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
// Presumably the policy type used for the dispatch, which is on lines not
// visible here - TODO confirm.
561 using range_policy =
typename MDRange::impl_range_policy;
// md_parallel_reduce, label-first argument order, non-device variant.
// The enable_if condition is true unless MDRange's execution space is
// Kokkos::Cuda or Kokkos::Experimental::ROCm; each of those tests is compiled
// in only when the corresponding backend macro is defined.
// The functor parameter `f` (and presumably a ValueType result parameter) are
// declared on lines missing from this listing.
565 template <
typename MDRange,
typename Functor,
typename ValueType>
566 void md_parallel_reduce(
const std::string& str
567 , MDRange
const& range
570 ,
typename std::enable_if<(
true
571 #
if defined( KOKKOS_ENABLE_CUDA)
572 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
574 #
if defined( KOKKOS_ENABLE_ROCM)
575 && !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Experimental::ROCm>::value
// Bundle the range and the user functor into an MDFunctor carrying ValueType.
580 Kokkos::Impl::Experimental::MDFunctor<MDRange, Functor, ValueType> g(range, f);
// Presumably the policy type used for the dispatch, which is on lines not
// visible here - TODO confirm.
582 using range_policy =
typename MDRange::impl_range_policy;
592 #endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< Kokkos::Impl::is_execution_policy< ExecPolicy >::value >::type *=0)
Execute functor in parallel according to the execution policy.
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename Impl::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=0)
Parallel reduction.
Declaration of various MemoryLayout options.
Declaration of parallel operators.
KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View< D, P...> &V)
Temporary free function rank() until rank() is implemented in the View.