Kokkos Core Kernels Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Kokkos_ExecPolicy.hpp
1 //@HEADER
2 // ************************************************************************
3 //
4 // Kokkos v. 4.0
5 // Copyright (2022) National Technology & Engineering
6 // Solutions of Sandia, LLC (NTESS).
7 //
8 // Under the terms of Contract DE-NA0003525 with NTESS,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
12 // See https://kokkos.org/LICENSE for license information.
13 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
14 //
15 //@HEADER
16 
17 #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
18 #include <Kokkos_Macros.hpp>
19 static_assert(false,
20  "Including non-public Kokkos header files is not allowed.");
21 #endif
22 #ifndef KOKKOS_EXECPOLICY_HPP
23 #define KOKKOS_EXECPOLICY_HPP
24 
25 #include <Kokkos_Core_fwd.hpp>
26 #include <impl/Kokkos_Traits.hpp>
27 #include <impl/Kokkos_Error.hpp>
28 #include <impl/Kokkos_AnalyzePolicy.hpp>
29 #include <Kokkos_Concepts.hpp>
30 #include <typeinfo>
31 #include <limits>
32 
33 //----------------------------------------------------------------------------
34 
35 namespace Kokkos {
36 
// Empty tag types, one per parallel pattern named in the identifier (for,
// scan, reduce). They hold no state; presumably they are only used to select
// overloads/specializations at compile time -- confirm at the use sites.
37 struct ParallelForTag {};
38 struct ParallelScanTag {};
39 struct ParallelReduceTag {};
40 
/// Thin wrapper around an int so that a chunk-size argument has its own type
/// in the RangePolicy constructor signatures.
struct ChunkSize {
  int value;  ///< The wrapped chunk size.

  /// Not explicit, so a plain int converts implicitly to ChunkSize.
  ChunkSize(int chunk) : value{chunk} {}
};
45 
67 template <class... Properties>
68 class RangePolicy : public Impl::PolicyTraits<Properties...> {
69  public:
70  using traits = Impl::PolicyTraits<Properties...>;
71 
72  private:
73  typename traits::execution_space m_space;
74  typename traits::index_type m_begin;
75  typename traits::index_type m_end;
76  typename traits::index_type m_granularity;
77  typename traits::index_type m_granularity_mask;
78 
79  template <class... OtherProperties>
80  friend class RangePolicy;
81 
82  public:
84  using execution_policy = RangePolicy<Properties...>;
85  using member_type = typename traits::index_type;
86  using index_type = typename traits::index_type;
87 
88  KOKKOS_INLINE_FUNCTION const typename traits::execution_space& space() const {
89  return m_space;
90  }
91  KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin; }
92  KOKKOS_INLINE_FUNCTION member_type end() const { return m_end; }
93 
94  // TODO: find a better workaround for Clangs weird instantiation order
95  // This thing is here because of an instantiation error, where the RangePolicy
96  // is inserted into FunctorValue Traits, which tries decltype on the operator.
97  // It tries to do this even though the first argument of parallel for clearly
98  // doesn't match.
99  void operator()(const int&) const {}
100 
101  template <class... OtherProperties>
102  RangePolicy(const RangePolicy<OtherProperties...>& p)
103  : traits(p), // base class may contain data such as desired occupancy
104  m_space(p.m_space),
105  m_begin(p.m_begin),
106  m_end(p.m_end),
107  m_granularity(p.m_granularity),
108  m_granularity_mask(p.m_granularity_mask) {}
109 
110  inline RangePolicy()
111  : m_space(),
112  m_begin(0),
113  m_end(0),
114  m_granularity(0),
115  m_granularity_mask(0) {}
116 
118  template <typename IndexType1, typename IndexType2,
119  std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
120  std::is_convertible_v<IndexType2, member_type>),
121  bool> = false>
122  inline RangePolicy(const IndexType1 work_begin, const IndexType2 work_end)
123  : RangePolicy(typename traits::execution_space(), work_begin, work_end) {}
124 
126  template <typename IndexType1, typename IndexType2,
127  std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
128  std::is_convertible_v<IndexType2, member_type>),
129  bool> = false>
130  inline RangePolicy(const typename traits::execution_space& work_space,
131  const IndexType1 work_begin, const IndexType2 work_end)
132  : m_space(work_space),
133  m_begin(work_begin),
134  m_end(work_end),
135  m_granularity(0),
136  m_granularity_mask(0) {
137  check_conversion_safety(work_begin);
138  check_conversion_safety(work_end);
139  check_bounds_validity();
140  set_auto_chunk_size();
141  }
142 
143  template <typename IndexType1, typename IndexType2,
144  std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
145  std::is_convertible_v<IndexType2, member_type>),
146  bool> = false>
147  RangePolicy(const typename traits::execution_space& work_space,
148  const IndexType1 work_begin, const IndexType2 work_end,
149  const ChunkSize chunk_size)
150  : m_space(work_space),
151  m_begin(work_begin),
152  m_end(work_end),
153  m_granularity(0),
154  m_granularity_mask(0) {
155  check_conversion_safety(work_begin);
156  check_conversion_safety(work_end);
157  check_bounds_validity();
158  set_chunk_size(chunk_size.value);
159  }
160 
162  template <typename IndexType1, typename IndexType2, typename... Args,
163  std::enable_if_t<(std::is_convertible_v<IndexType1, member_type> &&
164  std::is_convertible_v<IndexType2, member_type>),
165  bool> = false>
166  RangePolicy(const IndexType1 work_begin, const IndexType2 work_end,
167  const ChunkSize chunk_size)
168  : RangePolicy(typename traits::execution_space(), work_begin, work_end,
169  chunk_size) {}
170 
171  public:
172 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
173  KOKKOS_DEPRECATED_WITH_COMMENT("Use set_chunk_size instead")
174  inline void set(ChunkSize chunksize) {
175  m_granularity = chunksize.value;
176  m_granularity_mask = m_granularity - 1;
177  }
178 #endif
179 
180  public:
182  inline member_type chunk_size() const { return m_granularity; }
183 
185  inline RangePolicy& set_chunk_size(int chunk_size) {
186  m_granularity = chunk_size;
187  m_granularity_mask = m_granularity - 1;
188  return *this;
189  }
190 
191  private:
193  inline void set_auto_chunk_size() {
194 #ifdef KOKKOS_ENABLE_SYCL
195  if (std::is_same_v<typename traits::execution_space,
196  Kokkos::Experimental::SYCL>) {
197  // chunk_size <=1 lets the compiler choose the workgroup size when
198  // launching kernels
199  m_granularity = 1;
200  m_granularity_mask = 0;
201  return;
202  }
203 #endif
204  auto concurrency = static_cast<int64_t>(m_space.concurrency());
205  if (concurrency == 0) concurrency = 1;
206 
207  if (m_granularity > 0) {
208  if (!Impl::is_integral_power_of_two(m_granularity))
209  Kokkos::abort("RangePolicy blocking granularity must be power of two");
210  }
211 
212  int64_t new_chunk_size = 1;
213  while (new_chunk_size * 100 * concurrency <
214  static_cast<int64_t>(m_end - m_begin))
215  new_chunk_size *= 2;
216  if (new_chunk_size < 128) {
217  new_chunk_size = 1;
218  while ((new_chunk_size * 40 * concurrency <
219  static_cast<int64_t>(m_end - m_begin)) &&
220  (new_chunk_size < 128))
221  new_chunk_size *= 2;
222  }
223  m_granularity = new_chunk_size;
224  m_granularity_mask = m_granularity - 1;
225  }
226 
227  void check_bounds_validity() {
228  if (m_end < m_begin) {
229  std::string msg = "Kokkos::RangePolicy bounds error: The lower bound (" +
230  std::to_string(m_begin) +
231  ") is greater than the upper bound (" +
232  std::to_string(m_end) + ").\n";
233 #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4
234  Kokkos::abort(msg.c_str());
235 #endif
236  m_begin = 0;
237  m_end = 0;
238 #ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
239  Kokkos::Impl::log_warning(msg);
240 #endif
241  }
242  }
243 
244  // To be replaced with std::in_range (c++20)
245  template <typename IndexType>
246  static void check_conversion_safety(const IndexType bound) {
247 #if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \
248  defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS)
249 
250  std::string msg =
251  "Kokkos::RangePolicy bound type error: an unsafe implicit conversion "
252  "is performed on a bound (" +
253  std::to_string(bound) +
254  "), which may "
255  "not preserve its original value.\n";
256  bool warn = false;
257 
258  if constexpr (std::is_signed_v<IndexType> !=
259  std::is_signed_v<member_type>) {
260  // check signed to unsigned
261  if constexpr (std::is_signed_v<IndexType>)
262  warn |= (bound < static_cast<IndexType>(
263  std::numeric_limits<member_type>::min()));
264 
265  // check unsigned to signed
266  if constexpr (std::is_signed_v<member_type>)
267  warn |= (bound > static_cast<IndexType>(
268  std::numeric_limits<member_type>::max()));
269  }
270 
271  // check narrowing
272  warn |= (static_cast<IndexType>(static_cast<member_type>(bound)) != bound);
273 
274  if (warn) {
275 #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4
276  Kokkos::abort(msg.c_str());
277 #endif
278 
279 #ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS
280  Kokkos::Impl::log_warning(msg);
281 #endif
282  }
283 #else
284  (void)bound;
285 #endif
286  }
287 
288  public:
293  struct WorkRange {
294  using work_tag = typename RangePolicy<Properties...>::work_tag;
295  using member_type = typename RangePolicy<Properties...>::member_type;
296 
297  KOKKOS_INLINE_FUNCTION member_type begin() const { return m_begin; }
298  KOKKOS_INLINE_FUNCTION member_type end() const { return m_end; }
299 
304  KOKKOS_INLINE_FUNCTION
305  WorkRange(const RangePolicy& range, const int part_rank,
306  const int part_size)
307  : m_begin(0), m_end(0) {
308  if (part_size) {
309  // Split evenly among partitions, then round up to the granularity.
310  const member_type work_part =
311  ((((range.end() - range.begin()) + (part_size - 1)) / part_size) +
312  range.m_granularity_mask) &
313  ~member_type(range.m_granularity_mask);
314 
315  m_begin = range.begin() + work_part * part_rank;
316  m_end = m_begin + work_part;
317 
318  if (range.end() < m_begin) m_begin = range.end();
319  if (range.end() < m_end) m_end = range.end();
320  }
321  }
322 
323  private:
324  member_type m_begin;
325  member_type m_end;
326  WorkRange();
327  WorkRange& operator=(const WorkRange&);
328  };
329 };
330 
331 RangePolicy()->RangePolicy<>;
332 
333 RangePolicy(int64_t, int64_t)->RangePolicy<>;
334 RangePolicy(int64_t, int64_t, ChunkSize const&)->RangePolicy<>;
335 
336 RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t)->RangePolicy<>;
337 RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&)
338  ->RangePolicy<>;
339 
340 template <typename ES, typename = std::enable_if_t<is_execution_space_v<ES>>>
341 RangePolicy(ES const&, int64_t, int64_t)->RangePolicy<ES>;
342 
343 template <typename ES, typename = std::enable_if_t<is_execution_space_v<ES>>>
344 RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&)->RangePolicy<ES>;
345 
346 } // namespace Kokkos
347 
348 //----------------------------------------------------------------------------
349 //----------------------------------------------------------------------------
350 
351 namespace Kokkos {
352 
353 namespace Impl {
354 
// Primary TeamPolicyInternal template. Every member function below is only
// declared; no definition is visible in this part of the file. TeamPolicy
// (further down) derives from this template with ExecSpace set to
// PolicyTraits<Properties...>::execution_space.
// NOTE(review): presumably each backend specializes this template and this
// primary form only documents the expected interface -- confirm.
355 template <class ExecSpace, class... Properties>
356 class TeamPolicyInternal : public Impl::PolicyTraits<Properties...> {
357  private:
358  using traits = Impl::PolicyTraits<Properties...>;
359 
360  public:
361  using index_type = typename traits::index_type;
362 
363  //----------------------------------------
 // Static query taking a functor. Judging by the name it reports the largest
 // usable team size for that functor -- semantics not visible here.
374  template <class FunctorType>
375  static int team_size_max(const FunctorType&);
376 
 // Static query taking a functor. Judging by the name it reports a suggested
 // team size -- semantics not visible here.
387  template <class FunctorType>
388  static int team_size_recommended(const FunctorType&);
389 
 // Static variant with an additional int argument (unnamed here).
390  template <class FunctorType>
391  static int team_size_recommended(const FunctorType&, const int&);
392 
 // Non-static variant taking the functor and a vector length.
 // NOTE(review): apart from `const int&` vs `const int`, this has the same
 // parameter list as the static overload above; a call with (functor, int)
 // looks ambiguous between the two -- confirm this primary template is never
 // called directly.
393  template <class FunctorType>
394  int team_size_recommended(const FunctorType& functor,
395  const int vector_length);
396 
397  //----------------------------------------
 // Constructors taking an execution space instance, a league size, a team
 // size (an int, or Kokkos::AUTO_t in the second form) and an optional vector
 // length that defaults to 1.
399  TeamPolicyInternal(const typename traits::execution_space&,
400  int league_size_request, int team_size_request,
401  int vector_length_request = 1);
402 
403  TeamPolicyInternal(const typename traits::execution_space&,
404  int league_size_request, const Kokkos::AUTO_t&,
405  int vector_length_request = 1);
406 
 // Same two forms without an execution space argument.
409  TeamPolicyInternal(int league_size_request, int team_size_request,
410  int vector_length_request = 1);
411 
412  TeamPolicyInternal(int league_size_request, const Kokkos::AUTO_t&,
413  int vector_length_request = 1);
414 
 // Commented-out two-argument constructor declarations follow; they are
 // covered by the defaulted vector_length_request in the forms above.
415  /* TeamPolicyInternal( int league_size_request , int team_size_request );
416 
417  TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/
418 
 // Accessors for league size and team size.
424  KOKKOS_INLINE_FUNCTION int league_size() const;
425 
431  KOKKOS_INLINE_FUNCTION int team_size() const;
432 
 // Presumably report whether team size / vector length were requested as
 // Kokkos::AUTO -- confirm against a backend implementation.
435  inline bool impl_auto_team_size() const;
438  inline bool impl_auto_vector_length() const;
439 
440  static int vector_length_max();
441 
442  KOKKOS_INLINE_FUNCTION int impl_vector_length() const;
443 
 // Chunk size accessor and setter; the setter returns a reference to the
 // policy, which TeamPolicy::set_chunk_size checks with a static_assert.
444  inline typename traits::index_type chunk_size() const;
445 
446  inline TeamPolicyInternal& set_chunk_size(int chunk_size);
447 
 // Handle type for a team member. All members are declarations only: scratch
 // memory access, league/team rank and size queries, a barrier, and
 // team-level reduce and scan operations.
451  struct member_type {
453  KOKKOS_INLINE_FUNCTION
454  typename traits::execution_space::scratch_memory_space team_shmem() const;
455 
457  KOKKOS_INLINE_FUNCTION int league_rank() const;
458 
460  KOKKOS_INLINE_FUNCTION int league_size() const;
461 
463  KOKKOS_INLINE_FUNCTION int team_rank() const;
464 
466  KOKKOS_INLINE_FUNCTION int team_size() const;
467 
469  KOKKOS_INLINE_FUNCTION void team_barrier() const;
470 
 // Takes a value and a join operation and returns JoinOp::value_type.
473  template <class JoinOp>
474  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type team_reduce(
475  const typename JoinOp::value_type, const JoinOp&) const;
476 
 // Scan over the team; whether it is inclusive or exclusive is not visible
 // here.
482  template <typename Type>
483  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value) const;
484 
 // Scan variant with a pointer to a global accumulator; how the accumulator
 // is updated is not visible here.
494  template <typename Type>
495  KOKKOS_INLINE_FUNCTION Type team_scan(const Type& value,
496  Type* const global_accum) const;
497  };
498 };
499 
// Wrapper around a single size_t. The constructor is only declared here; its
// definition is not visible in this part of the file. ScratchRequest reads
// `value` as its per-team amount.
500 struct PerTeamValue {
501  size_t value;
502  PerTeamValue(size_t arg);
503 };
504 
// Wrapper around a single size_t. The constructor is only declared here; its
// definition is not visible in this part of the file. ScratchRequest reads
// `value` as its per-thread amount.
505 struct PerThreadValue {
506  size_t value;
507  PerThreadValue(size_t arg);
508 };
509 
/// Maps a leading argument to a vector length: an integral argument is
/// returned unchanged (as iType); an argument of any other type yields 1 (as
/// int). Trailing arguments are ignored.
///
/// A single function with `if constexpr` is used on purpose. iType is a
/// parameter of the class, not of the member function, so std::enable_if_t on
/// iType inside a member signature is not SFINAE: with two enable_if-guarded
/// overloads, one declaration is always ill-formed and instantiating the class
/// fails for every iType.
template <class iType, class... Args>
struct ExtractVectorLength {
  static inline std::conditional_t<std::is_integral<iType>::value, iType, int>
  value([[maybe_unused]] iType val, Args...) {
    if constexpr (std::is_integral<iType>::value) {
      return val;
    } else {
      return 1;
    }
  }
};
521 
/// Integral overload: the first argument is the vector length and is returned
/// unchanged; any further arguments are ignored. Participates in overload
/// resolution only when iType is an integral type.
template <class iType, class... Args>
inline auto extract_vector_length(iType val, Args...)
    -> std::enable_if_t<std::is_integral_v<iType>, iType> {
  return val;
}
527 
/// Non-integral overload: all arguments are ignored and the result is 1.
/// Participates in overload resolution only when iType is not an integral
/// type.
template <class iType, class... Args>
inline auto extract_vector_length(iType, Args...)
    -> std::enable_if_t<!std::is_integral_v<iType>, int> {
  return 1;
}
533 
534 } // namespace Impl
535 
// Declarations of two functions returning Impl::PerTeamValue and
// Impl::PerThreadValue; the definitions are not in this part of the file.
// Presumably each stores `arg` in the returned struct's `value` -- confirm
// against the definitions.
536 Impl::PerTeamValue PerTeam(const size_t& arg);
537 Impl::PerThreadValue PerThread(const size_t& arg);
538 
539 struct ScratchRequest {
540  int level;
541 
542  size_t per_team;
543  size_t per_thread;
544 
545  inline ScratchRequest(const int& level_,
546  const Impl::PerTeamValue& team_value) {
547  level = level_;
548  per_team = team_value.value;
549  per_thread = 0;
550  }
551 
552  inline ScratchRequest(const int& level_,
553  const Impl::PerThreadValue& thread_value) {
554  level = level_;
555  per_team = 0;
556  per_thread = thread_value.value;
557  }
558 
559  inline ScratchRequest(const int& level_, const Impl::PerTeamValue& team_value,
560  const Impl::PerThreadValue& thread_value) {
561  level = level_;
562  per_team = team_value.value;
563  per_thread = thread_value.value;
564  }
565 
566  inline ScratchRequest(const int& level_,
567  const Impl::PerThreadValue& thread_value,
568  const Impl::PerTeamValue& team_value) {
569  level = level_;
570  per_team = team_value.value;
571  per_thread = thread_value.value;
572  }
573 };
574 
575 // Throws a runtime exception if level is not `0` or `1`
576 void team_policy_check_valid_storage_level_argument(int level);
577 
604 template <class... Properties>
606  : public Impl::TeamPolicyInternal<
607  typename Impl::PolicyTraits<Properties...>::execution_space,
608  Properties...> {
609  using internal_policy = Impl::TeamPolicyInternal<
610  typename Impl::PolicyTraits<Properties...>::execution_space,
611  Properties...>;
612 
613  template <class... OtherProperties>
614  friend class TeamPolicy;
615 
616  public:
617  using traits = Impl::PolicyTraits<Properties...>;
618 
619  using execution_policy = TeamPolicy<Properties...>;
620 
621  TeamPolicy() : internal_policy(0, AUTO) {}
622 
624  TeamPolicy(const typename traits::execution_space& space_,
625  int league_size_request, int team_size_request,
626  int vector_length_request = 1)
627  : internal_policy(space_, league_size_request, team_size_request,
628  vector_length_request) {}
629 
630  TeamPolicy(const typename traits::execution_space& space_,
631  int league_size_request, const Kokkos::AUTO_t&,
632  int vector_length_request = 1)
633  : internal_policy(space_, league_size_request, Kokkos::AUTO(),
634  vector_length_request) {}
635 
636  TeamPolicy(const typename traits::execution_space& space_,
637  int league_size_request, const Kokkos::AUTO_t&,
638  const Kokkos::AUTO_t&)
639  : internal_policy(space_, league_size_request, Kokkos::AUTO(),
640  Kokkos::AUTO()) {}
641  TeamPolicy(const typename traits::execution_space& space_,
642  int league_size_request, const int team_size_request,
643  const Kokkos::AUTO_t&)
644  : internal_policy(space_, league_size_request, team_size_request,
645  Kokkos::AUTO()) {}
648  TeamPolicy(int league_size_request, int team_size_request,
649  int vector_length_request = 1)
650  : internal_policy(league_size_request, team_size_request,
651  vector_length_request) {}
652 
653  TeamPolicy(int league_size_request, const Kokkos::AUTO_t&,
654  int vector_length_request = 1)
655  : internal_policy(league_size_request, Kokkos::AUTO(),
656  vector_length_request) {}
657 
658  TeamPolicy(int league_size_request, const Kokkos::AUTO_t&,
659  const Kokkos::AUTO_t&)
660  : internal_policy(league_size_request, Kokkos::AUTO(), Kokkos::AUTO()) {}
661  TeamPolicy(int league_size_request, const int team_size_request,
662  const Kokkos::AUTO_t&)
663  : internal_policy(league_size_request, team_size_request,
664  Kokkos::AUTO()) {}
665 
666  template <class... OtherProperties>
667  TeamPolicy(const TeamPolicy<OtherProperties...> p) : internal_policy(p) {
668  // Cannot call converting constructor in the member initializer list because
669  // it is not a direct base.
670  internal_policy::traits::operator=(p);
671  }
672 
673  private:
674  TeamPolicy(const internal_policy& p) : internal_policy(p) {}
675 
676  public:
677  inline TeamPolicy& set_chunk_size(int chunk) {
678  static_assert(std::is_same<decltype(internal_policy::set_chunk_size(chunk)),
679  internal_policy&>::value,
680  "internal set_chunk_size should return a reference");
681  return static_cast<TeamPolicy&>(internal_policy::set_chunk_size(chunk));
682  }
683 
684  inline TeamPolicy& set_scratch_size(const int& level,
685  const Impl::PerTeamValue& per_team) {
686  static_assert(std::is_same<decltype(internal_policy::set_scratch_size(
687  level, per_team)),
688  internal_policy&>::value,
689  "internal set_chunk_size should return a reference");
690 
691  team_policy_check_valid_storage_level_argument(level);
692  return static_cast<TeamPolicy&>(
693  internal_policy::set_scratch_size(level, per_team));
694  }
695  inline TeamPolicy& set_scratch_size(const int& level,
696  const Impl::PerThreadValue& per_thread) {
697  team_policy_check_valid_storage_level_argument(level);
698  return static_cast<TeamPolicy&>(
699  internal_policy::set_scratch_size(level, per_thread));
700  }
701  inline TeamPolicy& set_scratch_size(const int& level,
702  const Impl::PerTeamValue& per_team,
703  const Impl::PerThreadValue& per_thread) {
704  team_policy_check_valid_storage_level_argument(level);
705  return static_cast<TeamPolicy&>(
706  internal_policy::set_scratch_size(level, per_team, per_thread));
707  }
708  inline TeamPolicy& set_scratch_size(const int& level,
709  const Impl::PerThreadValue& per_thread,
710  const Impl::PerTeamValue& per_team) {
711  team_policy_check_valid_storage_level_argument(level);
712  return static_cast<TeamPolicy&>(
713  internal_policy::set_scratch_size(level, per_team, per_thread));
714  }
715 };
716 
717 namespace Impl {
718 
719 template <typename iType, class TeamMemberType>
720 struct TeamThreadRangeBoundariesStruct {
721  private:
722  KOKKOS_INLINE_FUNCTION static iType ibegin(const iType& arg_begin,
723  const iType& arg_end,
724  const iType& arg_rank,
725  const iType& arg_size) {
726  return arg_begin +
727  ((arg_end - arg_begin + arg_size - 1) / arg_size) * arg_rank;
728  }
729 
730  KOKKOS_INLINE_FUNCTION static iType iend(const iType& arg_begin,
731  const iType& arg_end,
732  const iType& arg_rank,
733  const iType& arg_size) {
734  const iType end_ =
735  arg_begin +
736  ((arg_end - arg_begin + arg_size - 1) / arg_size) * (arg_rank + 1);
737  return end_ < arg_end ? end_ : arg_end;
738  }
739 
740  public:
741  using index_type = iType;
742  const iType start;
743  const iType end;
744  enum { increment = 1 };
745  const TeamMemberType& thread;
746 
747  KOKKOS_INLINE_FUNCTION
748  TeamThreadRangeBoundariesStruct(const TeamMemberType& arg_thread,
749  const iType& arg_end)
750  : start(
751  ibegin(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
752  end(iend(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
753  thread(arg_thread) {}
754 
755  KOKKOS_INLINE_FUNCTION
756  TeamThreadRangeBoundariesStruct(const TeamMemberType& arg_thread,
757  const iType& arg_begin, const iType& arg_end)
758  : start(ibegin(arg_begin, arg_end, arg_thread.team_rank(),
759  arg_thread.team_size())),
760  end(iend(arg_begin, arg_end, arg_thread.team_rank(),
761  arg_thread.team_size())),
762  thread(arg_thread) {}
763 };
764 
765 template <typename iType, class TeamMemberType>
766 struct TeamVectorRangeBoundariesStruct {
767  private:
768  KOKKOS_INLINE_FUNCTION static iType ibegin(const iType& arg_begin,
769  const iType& arg_end,
770  const iType& arg_rank,
771  const iType& arg_size) {
772  return arg_begin +
773  ((arg_end - arg_begin + arg_size - 1) / arg_size) * arg_rank;
774  }
775 
776  KOKKOS_INLINE_FUNCTION static iType iend(const iType& arg_begin,
777  const iType& arg_end,
778  const iType& arg_rank,
779  const iType& arg_size) {
780  const iType end_ =
781  arg_begin +
782  ((arg_end - arg_begin + arg_size - 1) / arg_size) * (arg_rank + 1);
783  return end_ < arg_end ? end_ : arg_end;
784  }
785 
786  public:
787  using index_type = iType;
788  const iType start;
789  const iType end;
790  enum { increment = 1 };
791  const TeamMemberType& thread;
792 
793  KOKKOS_INLINE_FUNCTION
794  TeamVectorRangeBoundariesStruct(const TeamMemberType& arg_thread,
795  const iType& arg_end)
796  : start(
797  ibegin(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
798  end(iend(0, arg_end, arg_thread.team_rank(), arg_thread.team_size())),
799  thread(arg_thread) {}
800 
801  KOKKOS_INLINE_FUNCTION
802  TeamVectorRangeBoundariesStruct(const TeamMemberType& arg_thread,
803  const iType& arg_begin, const iType& arg_end)
804  : start(ibegin(arg_begin, arg_end, arg_thread.team_rank(),
805  arg_thread.team_size())),
806  end(iend(arg_begin, arg_end, arg_thread.team_rank(),
807  arg_thread.team_size())),
808  thread(arg_thread) {}
809 };
810 
811 template <typename iType, class TeamMemberType>
812 struct ThreadVectorRangeBoundariesStruct {
813  using index_type = iType;
814  const index_type start;
815  const index_type end;
816  enum { increment = 1 };
817 
818  KOKKOS_INLINE_FUNCTION
819  constexpr ThreadVectorRangeBoundariesStruct(const TeamMemberType,
820  const index_type& count) noexcept
821  : start(static_cast<index_type>(0)), end(count) {}
822 
823  KOKKOS_INLINE_FUNCTION
824  constexpr ThreadVectorRangeBoundariesStruct(
825  const TeamMemberType, const index_type& arg_begin,
826  const index_type& arg_end) noexcept
827  : start(static_cast<index_type>(arg_begin)), end(arg_end) {}
828 };
829 
830 template <class TeamMemberType>
831 struct ThreadSingleStruct {
832  const TeamMemberType& team_member;
833  KOKKOS_INLINE_FUNCTION
834  ThreadSingleStruct(const TeamMemberType& team_member_)
835  : team_member(team_member_) {}
836 };
837 
838 template <class TeamMemberType>
839 struct VectorSingleStruct {
840  const TeamMemberType& team_member;
841  KOKKOS_INLINE_FUNCTION
842  VectorSingleStruct(const TeamMemberType& team_member_)
843  : team_member(team_member_) {}
844 };
845 
846 } // namespace Impl
847 
// Deleted placeholder overload of TeamThreadRange taking a team member and a
// single `count`. The third template parameter `_never_use_this_overload`
// does not appear in the function parameters, so it cannot be deduced and an
// ordinary call never selects this template.
// Presumably it exists to document the signature that backend-specific
// overloads implement -- confirm.
855 template <typename iType, class TeamMemberType, class _never_use_this_overload>
856 KOKKOS_INLINE_FUNCTION_DELETED
857  Impl::TeamThreadRangeBoundariesStruct<iType, TeamMemberType>
858  TeamThreadRange(const TeamMemberType&, const iType& count) = delete;
859 
// Deleted placeholder overload of TeamThreadRange taking a team member and a
// `begin`/`end` pair whose types may differ; the declared return type uses
// their std::common_type_t as index type. `_never_use_this_overload` cannot
// be deduced from the arguments, so an ordinary call never selects this
// template.
867 template <typename iType1, typename iType2, class TeamMemberType,
868  class _never_use_this_overload>
869 KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct<
870  std::common_type_t<iType1, iType2>, TeamMemberType>
871 TeamThreadRange(const TeamMemberType&, const iType1& begin,
872  const iType2& end) = delete;
873 
// Deleted placeholder overload of TeamVectorRange taking a team member and a
// single `count`. `_never_use_this_overload` cannot be deduced from the
// arguments, so an ordinary call never selects this template.
// NOTE(review): the declared return type is TeamThreadRangeBoundariesStruct
// although a TeamVectorRangeBoundariesStruct with identical contents is
// defined in this file -- confirm this is intentional.
881 template <typename iType, class TeamMemberType, class _never_use_this_overload>
882 KOKKOS_INLINE_FUNCTION_DELETED
883  Impl::TeamThreadRangeBoundariesStruct<iType, TeamMemberType>
884  TeamVectorRange(const TeamMemberType&, const iType& count) = delete;
885 
// Deleted placeholder overload of TeamVectorRange taking a team member and a
// `begin`/`end` pair whose types may differ; the declared return type uses
// their std::common_type_t as index type. `_never_use_this_overload` cannot
// be deduced from the arguments, so an ordinary call never selects this
// template.
// NOTE(review): the declared return type is TeamThreadRangeBoundariesStruct
// although a TeamVectorRangeBoundariesStruct with identical contents is
// defined in this file -- confirm this is intentional.
893 template <typename iType1, typename iType2, class TeamMemberType,
894  class _never_use_this_overload>
895 KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct<
896  std::common_type_t<iType1, iType2>, TeamMemberType>
897 TeamVectorRange(const TeamMemberType&, const iType1& begin,
898  const iType2& end) = delete;
899 
// Deleted placeholder overload of ThreadVectorRange taking a team member and
// a single `count`; the declared return type is
// ThreadVectorRangeBoundariesStruct. `_never_use_this_overload` cannot be
// deduced from the arguments, so an ordinary call never selects this
// template.
907 template <typename iType, class TeamMemberType, class _never_use_this_overload>
908 KOKKOS_INLINE_FUNCTION_DELETED
909  Impl::ThreadVectorRangeBoundariesStruct<iType, TeamMemberType>
910  ThreadVectorRange(const TeamMemberType&, const iType& count) = delete;
911 
// Deleted placeholder overload of ThreadVectorRange taking a team member and
// an `arg_begin`/`arg_end` pair whose types may differ; the declared return
// type uses their std::common_type_t as index type.
// `_never_use_this_overload` cannot be deduced from the arguments, so an
// ordinary call never selects this template.
912 template <typename iType1, typename iType2, class TeamMemberType,
913  class _never_use_this_overload>
914 KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct<
915  std::common_type_t<iType1, iType2>, TeamMemberType>
916 ThreadVectorRange(const TeamMemberType&, const iType1& arg_begin,
917  const iType2& arg_end) = delete;
918 
919 namespace Impl {
920 
// Two-valued scoped enums with `bool` as underlying type; in each, the first
// enumerator has the value false and the second true. The MDRange policy
// structs later in this file expose ParThread/NotParThread and
// ParVector/NotParVector values as their `par_thread` / `par_vector`
// constants.
921 enum class TeamMDRangeLastNestLevel : bool { NotLastNestLevel, LastNestLevel };
922 enum class TeamMDRangeParThread : bool { NotParThread, ParThread };
923 enum class TeamMDRangeParVector : bool { NotParVector, ParVector };
924 enum class TeamMDRangeThreadAndVector : bool { NotBoth, Both };
925 
926 template <typename Rank, TeamMDRangeThreadAndVector ThreadAndVector>
927 struct HostBasedNestLevel;
928 
929 template <typename Rank, TeamMDRangeThreadAndVector ThreadAndVector>
930 struct AcceleratorBasedNestLevel;
931 
932 // ThreadAndVectorNestLevel determines on which nested level parallelization
933 // happens.
934 // - Rank is Kokkos::Rank<TotalNestLevel, Iter>
935 // - TotalNestLevel is the total number of loop nests
936 // - Iter is whether to go forward or backward through ranks (i.e. the
937 // iteration order for MDRangePolicy)
938 // - ThreadAndVector determines whether both vector and thread parallelism is
939 // in use
940 template <typename Rank, typename ExecSpace,
941  TeamMDRangeThreadAndVector ThreadAndVector>
942 struct ThreadAndVectorNestLevel;
943 
// Empty tag type. parallel_for over a TeamThreadMDRange passes an instance as
// the third argument of md_parallel_impl, in the position where
// parallel_reduce passes the reduction value.
944 struct NoReductionTag {};
945 
946 template <typename Rank, typename TeamMDPolicy, typename Lambda,
947  typename ReductionValueType>
948 KOKKOS_INLINE_FUNCTION void md_parallel_impl(TeamMDPolicy const& policy,
949  Lambda const& lambda,
950  ReductionValueType&& val);
951 } // namespace Impl
952 
953 template <typename Rank, typename TeamHandle>
954 struct TeamThreadMDRange;
955 
956 template <unsigned N, Iterate OuterDir, Iterate InnerDir, typename TeamHandle>
957 struct TeamThreadMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> {
958  using NestLevelType = int;
959  using BoundaryType = int;
960  using TeamHandleType = TeamHandle;
961  using ExecutionSpace = typename TeamHandleType::execution_space;
962  using ArrayLayout = typename ExecutionSpace::array_layout;
963 
964  static constexpr NestLevelType total_nest_level =
965  Rank<N, OuterDir, InnerDir>::rank;
966  static constexpr Iterate iter = OuterDir;
967  static constexpr auto par_thread = Impl::TeamMDRangeParThread::ParThread;
968  static constexpr auto par_vector = Impl::TeamMDRangeParVector::NotParVector;
969 
970  static constexpr Iterate direction =
971  OuterDir == Iterate::Default
972  ? layout_iterate_type_selector<ArrayLayout>::outer_iteration_pattern
973  : iter;
974 
975  template <class... Args>
976  KOKKOS_FUNCTION TeamThreadMDRange(TeamHandleType const& team_, Args&&... args)
977  : team(team_), boundaries{static_cast<BoundaryType>(args)...} {
978  static_assert(sizeof...(Args) == total_nest_level);
979  }
980 
981  TeamHandleType const& team;
982  BoundaryType boundaries[total_nest_level];
983 };
984 
985 template <typename TeamHandle, typename... Args>
986 TeamThreadMDRange(TeamHandle const&, Args&&...)
987  ->TeamThreadMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>;
988 
989 template <typename Rank, typename TeamHandle>
990 struct ThreadVectorMDRange;
991 
992 template <unsigned N, Iterate OuterDir, Iterate InnerDir, typename TeamHandle>
993 struct ThreadVectorMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> {
994  using NestLevelType = int;
995  using BoundaryType = int;
996  using TeamHandleType = TeamHandle;
997  using ExecutionSpace = typename TeamHandleType::execution_space;
998  using ArrayLayout = typename ExecutionSpace::array_layout;
999 
1000  static constexpr NestLevelType total_nest_level =
1001  Rank<N, OuterDir, InnerDir>::rank;
1002  static constexpr Iterate iter = OuterDir;
1003  static constexpr auto par_thread = Impl::TeamMDRangeParThread::NotParThread;
1004  static constexpr auto par_vector = Impl::TeamMDRangeParVector::ParVector;
1005 
1006  static constexpr Iterate direction =
1007  OuterDir == Iterate::Default
1008  ? layout_iterate_type_selector<ArrayLayout>::outer_iteration_pattern
1009  : iter;
1010 
1011  template <class... Args>
1012  KOKKOS_INLINE_FUNCTION ThreadVectorMDRange(TeamHandleType const& team_,
1013  Args&&... args)
1014  : team(team_), boundaries{static_cast<BoundaryType>(args)...} {
1015  static_assert(sizeof...(Args) == total_nest_level);
1016  }
1017 
1018  TeamHandleType const& team;
1019  BoundaryType boundaries[total_nest_level];
1020 };
1021 
1022 template <typename TeamHandle, typename... Args>
1023 ThreadVectorMDRange(TeamHandle const&, Args&&...)
1024  ->ThreadVectorMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>;
1025 
1026 template <typename Rank, typename TeamHandle>
1027 struct TeamVectorMDRange;
1028 
1029 template <unsigned N, Iterate OuterDir, Iterate InnerDir, typename TeamHandle>
1030 struct TeamVectorMDRange<Rank<N, OuterDir, InnerDir>, TeamHandle> {
1031  using NestLevelType = int;
1032  using BoundaryType = int;
1033  using TeamHandleType = TeamHandle;
1034  using ExecutionSpace = typename TeamHandleType::execution_space;
1035  using ArrayLayout = typename ExecutionSpace::array_layout;
1036 
1037  static constexpr NestLevelType total_nest_level =
1038  Rank<N, OuterDir, InnerDir>::rank;
1039  static constexpr Iterate iter = OuterDir;
1040  static constexpr auto par_thread = Impl::TeamMDRangeParThread::ParThread;
1041  static constexpr auto par_vector = Impl::TeamMDRangeParVector::ParVector;
1042 
1043  static constexpr Iterate direction =
1044  iter == Iterate::Default
1045  ? layout_iterate_type_selector<ArrayLayout>::outer_iteration_pattern
1046  : iter;
1047 
1048  template <class... Args>
1049  KOKKOS_INLINE_FUNCTION TeamVectorMDRange(TeamHandleType const& team_,
1050  Args&&... args)
1051  : team(team_), boundaries{static_cast<BoundaryType>(args)...} {
1052  static_assert(sizeof...(Args) == total_nest_level);
1053  }
1054 
1055  TeamHandleType const& team;
1056  BoundaryType boundaries[total_nest_level];
1057 };
1058 
1059 template <typename TeamHandle, typename... Args>
1060 TeamVectorMDRange(TeamHandle const&, Args&&...)
1061  ->TeamVectorMDRange<Rank<sizeof...(Args), Iterate::Default>, TeamHandle>;
1062 
1063 template <typename Rank, typename TeamHandle, typename Lambda,
1064  typename ReducerValueType>
1065 KOKKOS_INLINE_FUNCTION void parallel_reduce(
1066  TeamThreadMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda,
1067  ReducerValueType& val) {
1068  static_assert(
1069  !std::is_array_v<ReducerValueType> &&
1070  !std::is_pointer_v<ReducerValueType> &&
1071  !Kokkos::is_reducer_v<ReducerValueType>,
1072  "Only scalar return types are allowed!");
1073 
1074  val = ReducerValueType{};
1075  Impl::md_parallel_impl<Rank>(policy, lambda, val);
1076  policy.team.team_reduce(
1077  Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{val});
1078 }
1079 
1080 template <typename Rank, typename TeamHandle, typename Lambda>
1081 KOKKOS_INLINE_FUNCTION void parallel_for(
1082  TeamThreadMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda) {
1083  Impl::md_parallel_impl<Rank>(policy, lambda, Impl::NoReductionTag());
1084 }
1085 
1086 template <typename Rank, typename TeamHandle, typename Lambda,
1087  typename ReducerValueType>
1088 KOKKOS_INLINE_FUNCTION void parallel_reduce(
1089  ThreadVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda,
1090  ReducerValueType& val) {
1091  static_assert(
1092  !std::is_array_v<ReducerValueType> &&
1093  !std::is_pointer_v<ReducerValueType> &&
1094  !Kokkos::is_reducer_v<ReducerValueType>,
1095  "Only a scalar return types are allowed!");
1096 
1097  val = ReducerValueType{};
1098  Impl::md_parallel_impl<Rank>(policy, lambda, val);
1099  if constexpr (false
1100 #ifdef KOKKOS_ENABLE_CUDA
1101  || std::is_same_v<typename TeamHandle::execution_space,
1102  Kokkos::Cuda>
1103 #elif defined(KOKKOS_ENABLE_HIP)
1104  || std::is_same_v<typename TeamHandle::execution_space,
1105  Kokkos::HIP>
1106 #elif defined(KOKKOS_ENABLE_SYCL)
1107  || std::is_same_v<typename TeamHandle::execution_space,
1108  Kokkos::Experimental::SYCL>
1109 #endif
1110  )
1111  policy.team.vector_reduce(
1112  Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{
1113  val});
1114 }
1115 
1116 template <typename Rank, typename TeamHandle, typename Lambda>
1117 KOKKOS_INLINE_FUNCTION void parallel_for(
1118  ThreadVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda) {
1119  Impl::md_parallel_impl<Rank>(policy, lambda, Impl::NoReductionTag());
1120 }
1121 
1122 template <typename Rank, typename TeamHandle, typename Lambda,
1123  typename ReducerValueType>
1124 KOKKOS_INLINE_FUNCTION void parallel_reduce(
1125  TeamVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda,
1126  ReducerValueType& val) {
1127  static_assert(
1128  !std::is_array_v<ReducerValueType> &&
1129  !std::is_pointer_v<ReducerValueType> &&
1130  !Kokkos::is_reducer_v<ReducerValueType>,
1131  "Only a scalar return types are allowed!");
1132 
1133  val = ReducerValueType{};
1134  Impl::md_parallel_impl<Rank>(policy, lambda, val);
1135  if constexpr (false
1136 #ifdef KOKKOS_ENABLE_CUDA
1137  || std::is_same_v<typename TeamHandle::execution_space,
1138  Kokkos::Cuda>
1139 #elif defined(KOKKOS_ENABLE_HIP)
1140  || std::is_same_v<typename TeamHandle::execution_space,
1141  Kokkos::HIP>
1142 #elif defined(KOKKOS_ENABLE_SYCL)
1143  || std::is_same_v<typename TeamHandle::execution_space,
1144  Kokkos::Experimental::SYCL>
1145 #endif
1146  )
1147  policy.team.vector_reduce(
1148  Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{
1149  val});
1150  policy.team.team_reduce(
1151  Kokkos::Sum<ReducerValueType, typename TeamHandle::execution_space>{val});
1152 }
1153 
1154 template <typename Rank, typename TeamHandle, typename Lambda>
1155 KOKKOS_INLINE_FUNCTION void parallel_for(
1156  TeamVectorMDRange<Rank, TeamHandle> const& policy, Lambda const& lambda) {
1157  Impl::md_parallel_impl<Rank>(policy, lambda, Impl::NoReductionTag());
1158 }
1159 
namespace Impl {

// Chooses the name reported for a parallel construct.
//
// If the label passed to the constructor is non-empty it is used as-is.
// Otherwise a default name is built from typeid(FunctorType).name(), with
// "/" + typeid(TagType).name() appended when HasTag is true.
//
// HasTag defaults to "TagType is not void".
//
// Lifetime: only a reference to the label is stored, so the string passed to
// the constructor must outlive this object.
template <typename FunctorType, typename TagType,
          bool HasTag = !std::is_void<TagType>::value>
struct ParallelConstructName;

// Tagged variant: default name is "<functor type name>/<tag type name>".
template <typename FunctorType, typename TagType>
struct ParallelConstructName<FunctorType, TagType, true> {
  ParallelConstructName(std::string const& label) : label_ref(label) {
    // The default name is built only when it will actually be needed.
    if (label.empty()) {
      default_name = std::string(typeid(FunctorType).name()) + "/" +
                     typeid(TagType).name();
    }
  }
  // Returns the user label if non-empty, otherwise the generated default.
  // Const-qualified so it can be called on const instances.
  std::string const& get() const {
    return (label_ref.empty()) ? default_name : label_ref;
  }
  std::string const& label_ref;  // refers to the caller's label; not owned
  std::string default_name;      // stays empty when a label was supplied
};

// Untagged variant: default name is "<functor type name>".
template <typename FunctorType, typename TagType>
struct ParallelConstructName<FunctorType, TagType, false> {
  ParallelConstructName(std::string const& label) : label_ref(label) {
    // The default name is built only when it will actually be needed.
    if (label.empty()) {
      default_name = std::string(typeid(FunctorType).name());
    }
  }
  // Returns the user label if non-empty, otherwise the generated default.
  // Const-qualified so it can be called on const instances.
  std::string const& get() const {
    return (label_ref.empty()) ? default_name : label_ref;
  }
  std::string const& label_ref;  // refers to the caller's label; not owned
  std::string default_name;      // stays empty when a label was supplied
};

}  // namespace Impl
1196 
1197 } // namespace Kokkos
1198 
1199 namespace Kokkos {
1200 
1201 namespace Impl {
1202 
1203 template <class PatternTag, class... Args>
1204 struct PatternImplSpecializationFromTag;
1205 
1206 template <class... Args>
1207 struct PatternImplSpecializationFromTag<Kokkos::ParallelForTag, Args...>
1208  : type_identity<ParallelFor<Args...>> {};
1209 
1210 template <class... Args>
1211 struct PatternImplSpecializationFromTag<Kokkos::ParallelReduceTag, Args...>
1212  : type_identity<ParallelReduce<Args...>> {};
1213 
1214 template <class... Args>
1215 struct PatternImplSpecializationFromTag<Kokkos::ParallelScanTag, Args...>
1216  : type_identity<ParallelScan<Args...>> {};
1217 
1218 template <class PatternImpl>
1219 struct PatternTagFromImplSpecialization;
1220 
1221 template <class... Args>
1222 struct PatternTagFromImplSpecialization<ParallelFor<Args...>>
1223  : type_identity<ParallelForTag> {};
1224 
1225 template <class... Args>
1226 struct PatternTagFromImplSpecialization<ParallelReduce<Args...>>
1227  : type_identity<ParallelReduceTag> {};
1228 
1229 template <class... Args>
1230 struct PatternTagFromImplSpecialization<ParallelScan<Args...>>
1231  : type_identity<ParallelScanTag> {};
1232 
1233 } // end namespace Impl
1234 
1235 } // namespace Kokkos
1236 #endif /* #define KOKKOS_EXECPOLICY_HPP */
TeamPolicy(int league_size_request, int team_size_request, int vector_length_request=1)
Construct policy with the default instance of the execution space.
RangePolicy(const typename traits::execution_space &work_space, const IndexType1 work_begin, const IndexType2 work_end)
Total range.
member_type chunk_size() const
Returns the chunk size.
TeamPolicy(const typename traits::execution_space &space_, int league_size_request, int team_size_request, int vector_length_request=1)
Construct policy with the given instance of the execution space.
RangePolicy(const IndexType1 work_begin, const IndexType2 work_end, const ChunkSize chunk_size)
Total range.
KOKKOS_INLINE_FUNCTION WorkRange(const RangePolicy &range, const int part_rank, const int part_size)
Subrange for a partition's rank and size.
RangePolicy & set_chunk_size(int chunk_size)
Set chunk_size to a discrete value.
Execution policy for work over a range of an integral type.
Subrange for a partition's rank and size.
Execution policy for parallel work over a league of teams of threads.
RangePolicy(const IndexType1 work_begin, const IndexType2 work_end)
Total range.
Parallel execution of a functor calls the functor once with each member of the execution policy...