Kokkos Core Kernels Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Kokkos_HPX.hpp
1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #ifndef KOKKOS_HPX_HPP
46 #define KOKKOS_HPX_HPP
47 
48 #include <Kokkos_Macros.hpp>
49 #if defined(KOKKOS_ENABLE_HPX)
50 
51 #include <Kokkos_Core_fwd.hpp>
52 
53 #include <Kokkos_HostSpace.hpp>
54 #include <cstddef>
55 #include <iosfwd>
56 
57 #ifdef KOKKOS_ENABLE_HBWSPACE
58 #include <Kokkos_HBWSpace.hpp>
59 #endif
60 
61 #include <HPX/Kokkos_HPX_ChunkedRoundRobinExecutor.hpp>
62 #include <Kokkos_HostSpace.hpp>
63 #include <Kokkos_Layout.hpp>
64 #include <Kokkos_MemoryTraits.hpp>
65 #include <Kokkos_Parallel.hpp>
66 #include <Kokkos_ScratchSpace.hpp>
67 #include <Kokkos_TaskScheduler.hpp>
68 #include <impl/Kokkos_FunctorAdapter.hpp>
69 #include <impl/Kokkos_FunctorAnalysis.hpp>
70 #include <impl/Kokkos_Profiling_Interface.hpp>
71 #include <impl/Kokkos_Tags.hpp>
72 #include <impl/Kokkos_TaskQueue.hpp>
73 
74 #include <KokkosExp_MDRangePolicy.hpp>
75 
76 #include <hpx/apply.hpp>
77 #include <hpx/hpx_start.hpp>
78 #include <hpx/lcos/local/barrier.hpp>
79 #include <hpx/lcos/local/latch.hpp>
80 #include <hpx/parallel/algorithms/for_loop.hpp>
81 #include <hpx/parallel/algorithms/reduce.hpp>
82 #include <hpx/parallel/executors/static_chunk_size.hpp>
83 #include <hpx/runtime.hpp>
84 #include <hpx/runtime/threads/run_as_hpx_thread.hpp>
85 #include <hpx/runtime/threads/threadmanager.hpp>
86 #include <hpx/runtime/thread_pool_helpers.hpp>
87 
88 #include <iostream>
89 #include <memory>
90 #include <sstream>
91 #include <stdexcept>
92 #include <type_traits>
93 #include <vector>
94 
95 // There are currently three different implementations for the parallel dispatch
96 // functions:
97 //
98 // - 0: The HPX way. Unfortunately, this comes with unnecessary
99 // overheads at the moment, so there is
100 // - 1: The manual way. This way is more verbose and does not take advantage of
101 // e.g. parallel::for_loop in HPX but it is significantly faster in many
102 // benchmarks.
103 // - 2: Like 1, but spawn tasks using for_loop and a custom executor.
104 //
105 // In the long run 0 should be the preferred implementation, but until HPX is
106 // improved 1 will be the default.
107 #ifndef KOKKOS_HPX_IMPLEMENTATION
108 #define KOKKOS_HPX_IMPLEMENTATION 1
109 #endif
110 
111 #if (KOKKOS_HPX_IMPLEMENTATION < 0) || (KOKKOS_HPX_IMPLEMENTATION > 2)
112 #error "You have chosen an invalid value for KOKKOS_HPX_IMPLEMENTATION"
113 #endif
114 
115 namespace Kokkos {
116 namespace Impl {
/// Byte buffer divided into equally sized per-thread slices.
///
/// The requested per-thread size is rounded up to a multiple of
/// m_cache_line_size (64 bytes). The allocation only grows: resize()
/// reallocates when the new total exceeds the current capacity and otherwise
/// keeps the existing storage. Contents are not preserved across a
/// reallocation.
class thread_buffer {
  static constexpr std::size_t m_cache_line_size = 64;

  // In-class initializers give every constructor a well-defined empty state.
  // This matters for the sizing constructor, which calls resize(), and
  // resize() reads m_size_total and deletes m_data.
  std::size_t m_num_threads     = 0;
  std::size_t m_size_per_thread = 0;
  std::size_t m_size_total      = 0;
  char *m_data                  = nullptr;

  /// Rounds size up, in place, to the next multiple of m_cache_line_size.
  static void pad_to_cache_line(std::size_t &size) {
    size = ((size + m_cache_line_size - 1) / m_cache_line_size) *
           m_cache_line_size;
  }

 public:
  /// Creates an empty buffer (no allocation).
  thread_buffer() = default;

  /// Creates a buffer with num_threads slices of at least size_per_thread
  /// bytes each.
  thread_buffer(const std::size_t num_threads,
                const std::size_t size_per_thread) {
    resize(num_threads, size_per_thread);
  }

  ~thread_buffer() { delete[] m_data; }

  // The buffer owns a raw allocation; copying and moving are disabled.
  thread_buffer(const thread_buffer &) = delete;
  thread_buffer(thread_buffer &&)      = delete;
  thread_buffer &operator=(const thread_buffer &) = delete;
  thread_buffer &operator=(thread_buffer &&) = delete;

  /// Sets the slice count and the (padded) slice size, growing the
  /// allocation if the new total does not fit into the current one.
  void resize(const std::size_t num_threads,
              const std::size_t size_per_thread) {
    m_num_threads     = num_threads;
    m_size_per_thread = size_per_thread;

    pad_to_cache_line(m_size_per_thread);

    const std::size_t size_total_new = m_num_threads * m_size_per_thread;

    if (m_size_total < size_total_new) {
      // Drop the old storage and reset the bookkeeping first, so that a
      // throwing new[] leaves an empty buffer rather than a dangling pointer
      // for the destructor to delete a second time.
      delete[] m_data;
      m_data       = nullptr;
      m_size_total = 0;

      m_data       = new char[size_total_new];
      m_size_total = size_total_new;
    }
  }

  /// Returns the start of the slice belonging to thread_num, or nullptr if
  /// nothing has been allocated yet.
  char *get(std::size_t thread_num) {
    assert(thread_num < m_num_threads);
    if (m_data == nullptr) {
      return nullptr;
    }
    return &m_data[thread_num * m_size_per_thread];
  }

  /// Padded size of one slice in bytes.
  std::size_t size_per_thread() const noexcept { return m_size_per_thread; }
  /// Capacity of the underlying allocation in bytes.
  std::size_t size_total() const noexcept { return m_size_total; }
};
174 } // namespace Impl
175 
176 namespace Experimental {
177 class HPX {
178  private:
179  static bool m_hpx_initialized;
180  static Kokkos::Impl::thread_buffer m_buffer;
181 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
182  static hpx::future<void> m_future;
183 #endif
184 
185  public:
186  using execution_space = HPX;
187  using memory_space = HostSpace;
188  using device_type = Kokkos::Device<execution_space, memory_space>;
189  using array_layout = LayoutRight;
190  using size_type = memory_space::size_type;
191  using scratch_memory_space = ScratchMemorySpace<HPX>;
192 
193  HPX() noexcept {}
194  static void print_configuration(std::ostream &,
195  const bool /* verbose */ = false) {
196  std::cout << "HPX backend" << std::endl;
197  }
198  uint32_t impl_instance_id() const noexcept { return 0; }
199 
200  static bool in_parallel(HPX const & = HPX()) noexcept { return false; }
201  static void impl_static_fence(HPX const & = HPX())
202 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
203  {
204  if (hpx::threads::get_self_ptr() == nullptr) {
205  hpx::threads::run_as_hpx_thread([]() { impl_get_future().wait(); });
206  } else {
207  impl_get_future().wait();
208  }
209  }
210 #else
211  noexcept {
212  }
213 #endif
214 
215  void fence() const { impl_static_fence(); }
216 
217  static bool is_asynchronous(HPX const & = HPX()) noexcept {
218 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
219  return true;
220 #else
221  return false;
222 #endif
223  }
224 
225  static std::vector<HPX> partition(...) {
226  Kokkos::abort(
227  "Kokkos::Experimental::HPX::partition_master: can't partition an HPX "
228  "instance\n");
229  return std::vector<HPX>();
230  }
231 
232  template <typename F>
233  static void partition_master(F const &, int requested_num_partitions = 0,
234  int = 0) {
235  if (requested_num_partitions > 1) {
236  Kokkos::abort(
237  "Kokkos::Experimental::HPX::partition_master: can't partition an "
238  "HPX instance\n");
239  }
240  }
241 
242  static int concurrency();
243  static void impl_initialize(int thread_count);
244  static void impl_initialize();
245  static bool impl_is_initialized() noexcept;
246  static void impl_finalize();
247 
248  static int impl_thread_pool_size() noexcept {
249  hpx::runtime *rt = hpx::get_runtime_ptr();
250  if (rt == nullptr) {
251  return 0;
252  } else {
253  if (hpx::threads::get_self_ptr() == nullptr) {
254  return hpx::resource::get_thread_pool(0).get_os_thread_count();
255  } else {
256  return hpx::this_thread::get_pool()->get_os_thread_count();
257  }
258  }
259  }
260 
261  static int impl_thread_pool_rank() noexcept {
262  hpx::runtime *rt = hpx::get_runtime_ptr();
263  if (rt == nullptr) {
264  return 0;
265  } else {
266  if (hpx::threads::get_self_ptr() == nullptr) {
267  return 0;
268  } else {
269  return hpx::this_thread::get_pool()->get_pool_index();
270  }
271  }
272  }
273 
274  static int impl_thread_pool_size(int depth) {
275  if (depth == 0) {
276  return impl_thread_pool_size();
277  } else {
278  return 1;
279  }
280  }
281 
282  static int impl_max_hardware_threads() noexcept {
283  return hpx::threads::hardware_concurrency();
284  }
285 
286  static int impl_hardware_thread_id() noexcept {
287  return hpx::get_worker_thread_num();
288  }
289 
290  static Kokkos::Impl::thread_buffer &impl_get_buffer() noexcept {
291  return m_buffer;
292  }
293 
294 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
295  static hpx::future<void> &impl_get_future() noexcept { return m_future; }
296 #endif
297 
298  static constexpr const char *name() noexcept { return "HPX"; }
299 };
300 } // namespace Experimental
301 
302 namespace Profiling {
303 namespace Experimental {
304 template <>
305 struct DeviceTypeTraits<Kokkos::Experimental::HPX> {
306  constexpr static DeviceType id = DeviceType::HPX;
307 };
308 } // namespace Experimental
309 } // namespace Profiling
310 
311 namespace Impl {
312 template <typename Closure>
313 inline void dispatch_execute_task(Closure *closure) {
314 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
315  if (hpx::threads::get_self_ptr() == nullptr) {
316  hpx::threads::run_as_hpx_thread([closure]() {
317  hpx::future<void> &fut = Kokkos::Experimental::HPX::impl_get_future();
318  Closure closure_copy = *closure;
319  fut = fut.then([closure_copy](hpx::future<void> &&) {
320  closure_copy.execute_task();
321  });
322  });
323  } else {
324  hpx::future<void> &fut = Kokkos::Experimental::HPX::impl_get_future();
325  Closure closure_copy = *closure;
326  fut = fut.then(
327  [closure_copy](hpx::future<void> &&) { closure_copy.execute_task(); });
328  }
329 #else
330  if (hpx::threads::get_self_ptr() == nullptr) {
331  hpx::threads::run_as_hpx_thread([closure]() { closure->execute_task(); });
332  } else {
333  closure->execute_task();
334  }
335 #endif
336 }
337 } // namespace Impl
338 } // namespace Kokkos
339 
340 namespace Kokkos {
341 namespace Impl {
342 template <>
343 struct MemorySpaceAccess<Kokkos::Experimental::HPX::memory_space,
344  Kokkos::Experimental::HPX::scratch_memory_space> {
345  enum { assignable = false };
346  enum { accessible = true };
347  enum { deepcopy = false };
348 };
349 
350 template <>
351 struct VerifyExecutionCanAccessMemorySpace<
352  Kokkos::Experimental::HPX::memory_space,
353  Kokkos::Experimental::HPX::scratch_memory_space> {
354  enum { value = true };
355  inline static void verify(void) {}
356  inline static void verify(const void *) {}
357 };
358 } // namespace Impl
359 } // namespace Kokkos
360 
361 namespace Kokkos {
362 namespace Experimental {
363 template <>
364 class UniqueToken<HPX, UniqueTokenScope::Instance> {
365  public:
366  using execution_space = HPX;
367  using size_type = int;
368  UniqueToken(execution_space const & = execution_space()) noexcept {}
369 
370  // NOTE: Currently this assumes that there is no oversubscription.
371  // hpx::get_num_worker_threads can't be used directly because it may yield
372  // it's task (problematic if called after hpx::get_worker_thread_num).
373  int size() const noexcept { return HPX::impl_max_hardware_threads(); }
374  int acquire() const noexcept { return HPX::impl_hardware_thread_id(); }
375  void release(int) const noexcept {}
376 };
377 
378 template <>
379 class UniqueToken<HPX, UniqueTokenScope::Global> {
380  public:
381  using execution_space = HPX;
382  using size_type = int;
383  UniqueToken(execution_space const & = execution_space()) noexcept {}
384 
385  // NOTE: Currently this assumes that there is no oversubscription.
386  // hpx::get_num_worker_threads can't be used directly because it may yield
387  // it's task (problematic if called after hpx::get_worker_thread_num).
388  int size() const noexcept { return HPX::impl_max_hardware_threads(); }
389  int acquire() const noexcept { return HPX::impl_hardware_thread_id(); }
390  void release(int) const noexcept {}
391 };
392 } // namespace Experimental
393 } // namespace Kokkos
394 
395 namespace Kokkos {
396 namespace Impl {
397 
398 struct HPXTeamMember {
399  public:
400  using execution_space = Kokkos::Experimental::HPX;
401  using scratch_memory_space =
403 
404  private:
405  scratch_memory_space m_team_shared;
406 
407  int m_league_size;
408  int m_league_rank;
409  int m_team_size;
410  int m_team_rank;
411 
412  public:
413  KOKKOS_INLINE_FUNCTION
414  const scratch_memory_space &team_shmem() const {
415  return m_team_shared.set_team_thread_mode(0, 1, 0);
416  }
417 
418  KOKKOS_INLINE_FUNCTION
419  const execution_space::scratch_memory_space &team_scratch(const int) const {
420  return m_team_shared.set_team_thread_mode(0, 1, 0);
421  }
422 
423  KOKKOS_INLINE_FUNCTION
424  const execution_space::scratch_memory_space &thread_scratch(const int) const {
425  return m_team_shared.set_team_thread_mode(0, team_size(), team_rank());
426  }
427 
428  KOKKOS_INLINE_FUNCTION int league_rank() const noexcept {
429  return m_league_rank;
430  }
431 
432  KOKKOS_INLINE_FUNCTION int league_size() const noexcept {
433  return m_league_size;
434  }
435 
436  KOKKOS_INLINE_FUNCTION int team_rank() const noexcept { return m_team_rank; }
437  KOKKOS_INLINE_FUNCTION int team_size() const noexcept { return m_team_size; }
438 
439  template <class... Properties>
440  constexpr KOKKOS_INLINE_FUNCTION HPXTeamMember(
441  const TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
442  &policy,
443  const int team_rank, const int league_rank, void *scratch,
444  int scratch_size) noexcept
445  : m_team_shared(scratch, scratch_size, scratch, scratch_size),
446  m_league_size(policy.league_size()),
447  m_league_rank(league_rank),
448  m_team_size(policy.team_size()),
449  m_team_rank(team_rank) {}
450 
451  KOKKOS_INLINE_FUNCTION
452  void team_barrier() const {}
453 
454  template <class ValueType>
455  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType &, const int &) const {
456  static_assert(std::is_trivially_default_constructible<ValueType>(),
457  "Only trivial constructible types can be broadcasted");
458  }
459 
460  template <class Closure, class ValueType>
461  KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure &, ValueType &,
462  const int &) const {
463  static_assert(std::is_trivially_default_constructible<ValueType>(),
464  "Only trivial constructible types can be broadcasted");
465  }
466 
467  template <class ValueType, class JoinOp>
468  KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType &value,
469  const JoinOp &) const {
470  return value;
471  }
472 
473  template <class ReducerType>
474  KOKKOS_INLINE_FUNCTION
475  typename std::enable_if<is_reducer<ReducerType>::value>::type
476  team_reduce(const ReducerType &) const {}
477 
478  template <typename Type>
479  KOKKOS_INLINE_FUNCTION Type
480  team_scan(const Type &value, Type *const global_accum = nullptr) const {
481  if (global_accum) {
482  Kokkos::atomic_fetch_add(global_accum, value);
483  }
484 
485  return 0;
486  }
487 };
488 
489 template <class... Properties>
490 class TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
491  : public PolicyTraits<Properties...> {
492  using traits = PolicyTraits<Properties...>;
493 
494  int m_league_size;
495  int m_team_size;
496  std::size_t m_team_scratch_size[2];
497  std::size_t m_thread_scratch_size[2];
498  int m_chunk_size;
499 
500  public:
501  using member_type = HPXTeamMember;
502 
503  // NOTE: Max size is 1 for simplicity. In most cases more than 1 is not
504  // necessary on CPU. Implement later if there is a need.
505  template <class FunctorType>
506  inline static int team_size_max(const FunctorType &) {
507  return 1;
508  }
509 
510  template <class FunctorType>
511  inline static int team_size_recommended(const FunctorType &) {
512  return 1;
513  }
514 
515  template <class FunctorType>
516  inline static int team_size_recommended(const FunctorType &, const int &) {
517  return 1;
518  }
519 
520  template <class FunctorType>
521  int team_size_max(const FunctorType &, const ParallelForTag &) const {
522  return 1;
523  }
524 
525  template <class FunctorType>
526  int team_size_max(const FunctorType &, const ParallelReduceTag &) const {
527  return 1;
528  }
529 
530  template <class FunctorType, class ReducerType>
531  int team_size_max(const FunctorType &, const ReducerType &,
532  const ParallelReduceTag &) const {
533  return 1;
534  }
535 
536  template <class FunctorType>
537  int team_size_recommended(const FunctorType &, const ParallelForTag &) const {
538  return 1;
539  }
540 
541  template <class FunctorType>
542  int team_size_recommended(const FunctorType &,
543  const ParallelReduceTag &) const {
544  return 1;
545  }
546 
547  template <class FunctorType, class ReducerType>
548  int team_size_recommended(const FunctorType &, const ReducerType &,
549  const ParallelReduceTag &) const {
550  return 1;
551  }
552 
553  private:
554  inline void init(const int league_size_request, const int team_size_request) {
555  m_league_size = league_size_request;
556  const int max_team_size = 1; // TODO: Can't use team_size_max(...) because
557  // it requires a functor as argument.
558  m_team_size =
559  team_size_request > max_team_size ? max_team_size : team_size_request;
560 
561  if (m_chunk_size > 0) {
562  if (!Impl::is_integral_power_of_two(m_chunk_size))
563  Kokkos::abort("TeamPolicy blocking granularity must be power of two");
564  } else {
565  int new_chunk_size = 1;
566  while (new_chunk_size * 4 * Kokkos::Experimental::HPX::concurrency() <
567  m_league_size) {
568  new_chunk_size *= 2;
569  }
570 
571  if (new_chunk_size < 128) {
572  new_chunk_size = 1;
573  while ((new_chunk_size * Kokkos::Experimental::HPX::concurrency() <
574  m_league_size) &&
575  (new_chunk_size < 128))
576  new_chunk_size *= 2;
577  }
578 
579  m_chunk_size = new_chunk_size;
580  }
581  }
582 
583  public:
584  inline int team_size() const { return m_team_size; }
585  inline int league_size() const { return m_league_size; }
586 
587  inline size_t scratch_size(const int &level, int team_size_ = -1) const {
588  if (team_size_ < 0) {
589  team_size_ = m_team_size;
590  }
591  return m_team_scratch_size[level] +
592  team_size_ * m_thread_scratch_size[level];
593  }
594 
595  inline static int scratch_size_max(int level) {
596  return (level == 0 ? 1024 * 32 : // Roughly L1 size
597  20 * 1024 * 1024); // Limit to keep compatibility with CUDA
598  }
599 
600  public:
601  template <class ExecSpace, class... OtherProperties>
602  friend class TeamPolicyInternal;
603 
604  const typename traits::execution_space &space() const {
605  static typename traits::execution_space m_space;
606  return m_space;
607  }
608 
609  template <class... OtherProperties>
610  TeamPolicyInternal(const TeamPolicyInternal<Kokkos::Experimental::HPX,
611  OtherProperties...> &p) {
612  m_league_size = p.m_league_size;
613  m_team_size = p.m_team_size;
614  m_team_scratch_size[0] = p.m_team_scratch_size[0];
615  m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
616  m_team_scratch_size[1] = p.m_team_scratch_size[1];
617  m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
618  m_chunk_size = p.m_chunk_size;
619  }
620 
621  TeamPolicyInternal(const typename traits::execution_space &,
622  int league_size_request, int team_size_request,
623  int /* vector_length_request */ = 1)
624  : m_team_scratch_size{0, 0},
625  m_thread_scratch_size{0, 0},
626  m_chunk_size(0) {
627  init(league_size_request, team_size_request);
628  }
629 
630  TeamPolicyInternal(const typename traits::execution_space &,
631  int league_size_request, const Kokkos::AUTO_t &,
632  int /* vector_length_request */ = 1)
633  : m_team_scratch_size{0, 0},
634  m_thread_scratch_size{0, 0},
635  m_chunk_size(0) {
636  init(league_size_request, 1);
637  }
638 
639  TeamPolicyInternal(int league_size_request, int team_size_request,
640  int /* vector_length_request */ = 1)
641  : m_team_scratch_size{0, 0},
642  m_thread_scratch_size{0, 0},
643  m_chunk_size(0) {
644  init(league_size_request, team_size_request);
645  }
646 
647  TeamPolicyInternal(int league_size_request, const Kokkos::AUTO_t &,
648  int /* vector_length_request */ = 1)
649  : m_team_scratch_size{0, 0},
650  m_thread_scratch_size{0, 0},
651  m_chunk_size(0) {
652  init(league_size_request, 1);
653  }
654 
655  inline int chunk_size() const { return m_chunk_size; }
656 
657  inline TeamPolicyInternal &set_chunk_size(
658  typename traits::index_type chunk_size_) {
659  m_chunk_size = chunk_size_;
660  return *this;
661  }
662 
663  inline TeamPolicyInternal &set_scratch_size(const int &level,
664  const PerTeamValue &per_team) {
665  m_team_scratch_size[level] = per_team.value;
666  return *this;
667  }
668 
669  inline TeamPolicyInternal &set_scratch_size(
670  const int &level, const PerThreadValue &per_thread) {
671  m_thread_scratch_size[level] = per_thread.value;
672  return *this;
673  }
674 
675  inline TeamPolicyInternal &set_scratch_size(
676  const int &level, const PerTeamValue &per_team,
677  const PerThreadValue &per_thread) {
678  m_team_scratch_size[level] = per_team.value;
679  m_thread_scratch_size[level] = per_thread.value;
680  return *this;
681  }
682 };
683 } // namespace Impl
684 } // namespace Kokkos
685 
686 namespace Kokkos {
687 namespace Impl {
688 
689 template <class FunctorType, class... Traits>
690 class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
691  Kokkos::Experimental::HPX> {
692  private:
693  using Policy = Kokkos::RangePolicy<Traits...>;
694  using WorkTag = typename Policy::work_tag;
695  using WorkRange = typename Policy::WorkRange;
696  using Member = typename Policy::member_type;
697 
698  const FunctorType m_functor;
699  const Policy m_policy;
700 
701  template <class TagType>
702  static typename std::enable_if<std::is_same<TagType, void>::value>::type
703  execute_functor(const FunctorType &functor, const Member i) {
704  functor(i);
705  }
706 
707  template <class TagType>
708  static typename std::enable_if<!std::is_same<TagType, void>::value>::type
709  execute_functor(const FunctorType &functor, const Member i) {
710  const TagType t{};
711  functor(t, i);
712  }
713 
714  template <class TagType>
715  static typename std::enable_if<std::is_same<TagType, void>::value>::type
716  execute_functor_range(const FunctorType &functor, const Member i_begin,
717  const Member i_end) {
718  for (Member i = i_begin; i < i_end; ++i) {
719  functor(i);
720  }
721  }
722 
723  template <class TagType>
724  static typename std::enable_if<!std::is_same<TagType, void>::value>::type
725  execute_functor_range(const FunctorType &functor, const Member i_begin,
726  const Member i_end) {
727  const TagType t{};
728  for (Member i = i_begin; i < i_end; ++i) {
729  functor(t, i);
730  }
731  }
732 
733  public:
734  void execute() const { Kokkos::Impl::dispatch_execute_task(this); }
735 
736  void execute_task() const {
737 #if KOKKOS_HPX_IMPLEMENTATION == 0
738  using hpx::parallel::for_loop;
739  using hpx::parallel::execution::par;
740  using hpx::parallel::execution::static_chunk_size;
741 
742  for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
743  m_policy.begin(), m_policy.end(), [this](const Member i) {
744  execute_functor<WorkTag>(m_functor, i);
745  });
746 
747 #elif KOKKOS_HPX_IMPLEMENTATION == 1
748  using hpx::apply;
749  using hpx::lcos::local::latch;
750 
751  const int num_tasks =
752  (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
753  m_policy.chunk_size();
754  latch num_tasks_remaining(num_tasks);
755  ChunkedRoundRobinExecutor exec(num_tasks);
756 
757  for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
758  i_begin += m_policy.chunk_size()) {
759  apply(exec, [this, &num_tasks_remaining, i_begin]() {
760  const Member i_end =
761  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
762  execute_functor_range<WorkTag>(m_functor, i_begin, i_end);
763 
764  num_tasks_remaining.count_down(1);
765  });
766  }
767 
768  num_tasks_remaining.wait();
769 
770 #elif KOKKOS_HPX_IMPLEMENTATION == 2
771  using hpx::parallel::for_loop_strided;
772  using hpx::parallel::execution::par;
773  using hpx::parallel::execution::static_chunk_size;
774 
775  const int num_tasks =
776  (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
777  m_policy.chunk_size();
778  ChunkedRoundRobinExecutor exec(num_tasks);
779 
780  for_loop_strided(
781  par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
782  m_policy.end(), m_policy.chunk_size(), [this](const Member i_begin) {
783  const Member i_end =
784  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
785  execute_functor_range<WorkTag>(m_functor, i_begin, i_end);
786  });
787 #endif
788  }
789 
790  inline ParallelFor(const FunctorType &arg_functor, Policy arg_policy)
791  : m_functor(arg_functor), m_policy(arg_policy) {}
792 };
793 
794 template <class FunctorType, class... Traits>
795 class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
796  Kokkos::Experimental::HPX> {
797  private:
798  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
799  using Policy = typename MDRangePolicy::impl_range_policy;
800  using WorkTag = typename MDRangePolicy::work_tag;
801  using WorkRange = typename Policy::WorkRange;
802  using Member = typename Policy::member_type;
803  using iterate_type =
804  typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
805  WorkTag, void>;
806 
807  const FunctorType m_functor;
808  const MDRangePolicy m_mdr_policy;
809  const Policy m_policy;
810 
811  public:
812  void execute() const { dispatch_execute_task(this); }
813 
814  inline void execute_task() const {
815 #if KOKKOS_HPX_IMPLEMENTATION == 0
816  using hpx::parallel::for_loop;
817  using hpx::parallel::execution::par;
818  using hpx::parallel::execution::static_chunk_size;
819 
820  for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
821  m_policy.begin(), m_policy.end(), [this](const Member i) {
822  iterate_type(m_mdr_policy, m_functor)(i);
823  });
824 
825 #elif KOKKOS_HPX_IMPLEMENTATION == 1
826  using hpx::apply;
827  using hpx::lcos::local::latch;
828 
829  const int num_tasks =
830  (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
831  m_policy.chunk_size();
832  latch num_tasks_remaining(num_tasks);
833  ChunkedRoundRobinExecutor exec(num_tasks);
834 
835  for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
836  i_begin += m_policy.chunk_size()) {
837  apply(exec, [this, &num_tasks_remaining, i_begin]() {
838  const Member i_end =
839  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
840  for (Member i = i_begin; i < i_end; ++i) {
841  iterate_type(m_mdr_policy, m_functor)(i);
842  }
843 
844  num_tasks_remaining.count_down(1);
845  });
846  }
847 
848  num_tasks_remaining.wait();
849 
850 #elif KOKKOS_HPX_IMPLEMENTATION == 2
851  using hpx::parallel::for_loop_strided;
852  using hpx::parallel::execution::par;
853  using hpx::parallel::execution::static_chunk_size;
854 
855  const int num_tasks =
856  (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
857  m_policy.chunk_size();
858  ChunkedRoundRobinExecutor exec(num_tasks);
859 
860  for_loop_strided(
861  par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
862  m_policy.end(), m_policy.chunk_size(), [this](const Member i_begin) {
863  const Member i_end =
864  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
865  for (Member i = i_begin; i < i_end; ++i) {
866  iterate_type(m_mdr_policy, m_functor)(i);
867  }
868  });
869 #endif
870  }
871 
872  inline ParallelFor(const FunctorType &arg_functor, MDRangePolicy arg_policy)
873  : m_functor(arg_functor),
874  m_mdr_policy(arg_policy),
875  m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
876 };
877 } // namespace Impl
878 } // namespace Kokkos
879 
880 namespace Kokkos {
881 namespace Impl {
882 template <class FunctorType, class ReducerType, class... Traits>
883 class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
884  Kokkos::Experimental::HPX> {
885  private:
886  using Policy = Kokkos::RangePolicy<Traits...>;
887  using WorkTag = typename Policy::work_tag;
888  using WorkRange = typename Policy::WorkRange;
889  using Member = typename Policy::member_type;
890  using Analysis =
891  FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
892  using ReducerConditional =
893  Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
894  FunctorType, ReducerType>;
895  using ReducerTypeFwd = typename ReducerConditional::type;
896  using WorkTagFwd =
897  typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
898  WorkTag, void>::type;
899  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
900  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
901  using ValueOps = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
902  using value_type = typename Analysis::value_type;
903  using pointer_type = typename Analysis::pointer_type;
904  using reference_type = typename Analysis::reference_type;
905 
906  const FunctorType m_functor;
907  const Policy m_policy;
908  const ReducerType m_reducer;
909  const pointer_type m_result_ptr;
910 
911  bool m_force_synchronous;
912 
913  template <class TagType>
914  inline static
915  typename std::enable_if<std::is_same<TagType, void>::value>::type
916  execute_functor(const FunctorType &functor, const Member i,
917  reference_type update) {
918  functor(i, update);
919  }
920 
921  template <class TagType>
922  inline static
923  typename std::enable_if<!std::is_same<TagType, void>::value>::type
924  execute_functor(const FunctorType &functor, const Member i,
925  reference_type update) {
926  const TagType t{};
927  functor(t, i, update);
928  }
929 
930  template <class TagType>
931  inline typename std::enable_if<std::is_same<TagType, void>::value>::type
932  execute_functor_range(reference_type update, const Member i_begin,
933  const Member i_end) const {
934  for (Member i = i_begin; i < i_end; ++i) {
935  m_functor(i, update);
936  }
937  }
938 
939  template <class TagType>
940  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
941  execute_functor_range(reference_type update, const Member i_begin,
942  const Member i_end) const {
943  const TagType t{};
944 
945  for (Member i = i_begin; i < i_end; ++i) {
946  m_functor(t, i, update);
947  }
948  }
949 
950  class value_type_wrapper {
951  private:
952  std::size_t m_value_size;
953  char *m_value_buffer;
954 
955  public:
956  value_type_wrapper() : m_value_size(0), m_value_buffer(nullptr) {}
957 
958  value_type_wrapper(const std::size_t value_size)
959  : m_value_size(value_size), m_value_buffer(new char[m_value_size]) {}
960 
961  value_type_wrapper(const value_type_wrapper &other)
962  : m_value_size(0), m_value_buffer(nullptr) {
963  if (this != &other) {
964  m_value_buffer = new char[other.m_value_size];
965  m_value_size = other.m_value_size;
966 
967  std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
968  m_value_buffer);
969  }
970  }
971 
972  ~value_type_wrapper() { delete[] m_value_buffer; }
973 
974  value_type_wrapper(value_type_wrapper &&other)
975  : m_value_size(0), m_value_buffer(nullptr) {
976  if (this != &other) {
977  m_value_buffer = other.m_value_buffer;
978  m_value_size = other.m_value_size;
979 
980  other.m_value_buffer = nullptr;
981  other.m_value_size = 0;
982  }
983  }
984 
985  value_type_wrapper &operator=(const value_type_wrapper &other) {
986  if (this != &other) {
987  delete[] m_value_buffer;
988  m_value_buffer = new char[other.m_value_size];
989  m_value_size = other.m_value_size;
990 
991  std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
992  m_value_buffer);
993  }
994 
995  return *this;
996  }
997 
998  value_type_wrapper &operator=(value_type_wrapper &&other) {
999  if (this != &other) {
1000  delete[] m_value_buffer;
1001  m_value_buffer = other.m_value_buffer;
1002  m_value_size = other.m_value_size;
1003 
1004  other.m_value_buffer = nullptr;
1005  other.m_value_size = 0;
1006  }
1007 
1008  return *this;
1009  }
1010 
1011  pointer_type pointer() const {
1012  return reinterpret_cast<pointer_type>(m_value_buffer);
1013  }
1014 
1015  reference_type reference() const {
1016  return ValueOps::reference(
1017  reinterpret_cast<pointer_type>(m_value_buffer));
1018  }
1019  };
1020 
1021  public:
1022  void execute() const { dispatch_execute_task(this); }
1023 
// Runs the reduction over [m_policy.begin(), m_policy.end()), applies the
// FunctorFinal step to the combined value, and copies the result to
// m_result_ptr when that pointer is non-null. KOKKOS_HPX_IMPLEMENTATION
// selects one of three strategies for the parallel part.
1024  inline void execute_task() const {
// Size of one reduction value; used below as a byte count for the buffers.
1025  const std::size_t value_size =
1026  Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
1027 
1028 #if KOKKOS_HPX_IMPLEMENTATION == 0
1029  // NOTE: This version makes the most use of HPX functionality, but
1030  // requires the struct value_type_wrapper to handle different
1031  // reference_types. It is also significantly slower than the version
1032  // below due to not reusing the buffer used by other functions.
1033  using hpx::parallel::for_loop;
1034  using hpx::parallel::reduction;
1035  using hpx::parallel::execution::par;
1036  using hpx::parallel::execution::static_chunk_size;
1037 
1038  value_type_wrapper final_value(value_size);
1039  value_type_wrapper identity(value_size);
1040 
// Both the accumulator and the identity start from the value produced by
// ValueInit::init.
1041  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1042  final_value.pointer());
1043  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1044  identity.pointer());
1045 
// The first lambda combines two partial values with ValueJoin::join (into
// `a`); the second lambda is the loop body and calls the functor for one
// index.
1046  for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
1047  m_policy.begin(), m_policy.end(),
1048  reduction(final_value, identity,
1049  [this](value_type_wrapper &a,
1050  value_type_wrapper &b) -> value_type_wrapper & {
1051  ValueJoin::join(
1052  ReducerConditional::select(m_functor, m_reducer),
1053  a.pointer(), b.pointer());
1054  return a;
1055  }),
1056  [this](Member i, value_type_wrapper &update) {
1057  execute_functor<WorkTag>(m_functor, i, update.reference());
1058  });
1059 
1060  pointer_type final_value_ptr = final_value.pointer();
1061 
1062 #elif KOKKOS_HPX_IMPLEMENTATION == 1
// Variant 1: explicit tasks launched with hpx::apply; completion is tracked
// with latches.
1063  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1064 
// One buffer entry of value_size bytes per worker thread.
1065  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1066  buffer.resize(num_worker_threads, value_size);
1067 
1068  using hpx::apply;
1069  using hpx::lcos::local::latch;
1070 
// Phase 1: initialize every per-thread partial value, one task per entry,
// and wait for all of them before starting the actual work.
1071  {
1072  latch num_tasks_remaining(num_worker_threads);
1073  ChunkedRoundRobinExecutor exec(num_worker_threads);
1074 
1075  for (int t = 0; t < num_worker_threads; ++t) {
1076  apply(exec, [this, &num_tasks_remaining, &buffer, t]() {
1077  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1078  reinterpret_cast<pointer_type>(buffer.get(t)));
1079 
1080  num_tasks_remaining.count_down(1);
1081  });
1082  }
1083 
1084  num_tasks_remaining.wait();
1085  }
1086 
// Phase 2: one task per chunk. num_tasks is the range length divided by the
// chunk size, rounded up.
1087  const int num_tasks =
1088  (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
1089  m_policy.chunk_size();
1090  latch num_tasks_remaining(num_tasks);
1091  ChunkedRoundRobinExecutor exec(num_tasks);
1092 
1093  for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
1094  i_begin += m_policy.chunk_size()) {
1095  apply(exec, [this, &num_tasks_remaining, &buffer, i_begin]() {
// Accumulate into the buffer entry selected by impl_hardware_thread_id()
// (presumably the entry of the thread running this task -- TODO confirm).
1096  reference_type update =
1097  ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
1098  Kokkos::Experimental::HPX::impl_hardware_thread_id())));
// The last chunk is clamped to the end of the range.
1099  const Member i_end =
1100  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
1101  execute_functor_range<WorkTag>(update, i_begin, i_end);
1102 
1103  num_tasks_remaining.count_down(1);
1104  });
1105  }
1106 
1107  num_tasks_remaining.wait();
1108 
// Phase 3: serially join every per-thread partial value into entry 0.
1109  for (int i = 1; i < num_worker_threads; ++i) {
1110  ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
1111  reinterpret_cast<pointer_type>(buffer.get(0)),
1112  reinterpret_cast<pointer_type>(buffer.get(i)));
1113  }
1114 
1115  pointer_type final_value_ptr =
1116  reinterpret_cast<pointer_type>(buffer.get(0));
1117 
1118 #elif KOKKOS_HPX_IMPLEMENTATION == 2
// Variant 2: same three phases as variant 1, but expressed with
// for_loop / for_loop_strided on a ChunkedRoundRobinExecutor instead of
// explicit apply() calls and latches.
1119  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1120 
1121  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1122  buffer.resize(num_worker_threads, value_size);
1123 
1124  using hpx::parallel::for_loop;
1125  using hpx::parallel::for_loop_strided;
1126  using hpx::parallel::execution::par;
1127  using hpx::parallel::execution::static_chunk_size;
1128 
// Phase 1: initialize every per-thread partial value.
1129  {
1130  ChunkedRoundRobinExecutor exec(num_worker_threads);
1131 
1132  for_loop(par.on(exec).with(static_chunk_size(1)), std::size_t(0),
1133  num_worker_threads, [this, &buffer](const std::size_t t) {
1134  ValueInit::init(
1135  ReducerConditional::select(m_functor, m_reducer),
1136  reinterpret_cast<pointer_type>(buffer.get(t)));
1137  });
1138  }
1139 
// Phase 2: the strided loop visits the first index of every chunk; each
// iteration processes one chunk of the range.
1140  const int num_tasks =
1141  (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
1142  m_policy.chunk_size();
1143  ChunkedRoundRobinExecutor exec(num_tasks);
1144 
1145  for_loop_strided(
1146  par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
1147  m_policy.end(), m_policy.chunk_size(),
1148  [this, &buffer](const Member i_begin) {
1149  reference_type update =
1150  ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
1151  Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1152  const Member i_end =
1153  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
1154  execute_functor_range<WorkTag>(update, i_begin, i_end);
1155  });
1156 
// Phase 3: serially join every per-thread partial value into entry 0.
1157  for (int i = 1; i < num_worker_threads; ++i) {
1158  ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
1159  reinterpret_cast<pointer_type>(buffer.get(0)),
1160  reinterpret_cast<pointer_type>(buffer.get(i)));
1161  }
1162 
1163  pointer_type final_value_ptr =
1164  reinterpret_cast<pointer_type>(buffer.get(0));
1165 #endif
1166 
// Common tail for all variants: apply the final step to the combined value.
1167  Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1168  ReducerConditional::select(m_functor, m_reducer), final_value_ptr);
1169 
// Copy all value_count elements of the result to the destination, if any.
1170  if (m_result_ptr != nullptr) {
1171  const int n = Analysis::value_count(
1172  ReducerConditional::select(m_functor, m_reducer));
1173 
1174  for (int j = 0; j < n; ++j) {
1175  m_result_ptr[j] = final_value_ptr[j];
1176  }
1177  }
1178  }
1179 
// Constructor for a reduction whose result goes to a view. It is enabled only
// when ViewType is a Kokkos view and ReducerType is not a reducer type.
// The reducer member is initialized from InvalidType(), the result pointer is
// arg_view.data(), and m_force_synchronous is true when the view has no
// tracking record.
1180  template <class ViewType>
1181  inline ParallelReduce(
1182  const FunctorType &arg_functor, Policy arg_policy,
1183  const ViewType &arg_view,
1184  typename std::enable_if<Kokkos::is_view<ViewType>::value &&
1185  !Kokkos::is_reducer_type<ReducerType>::value,
1186  void *>::type = nullptr)
1187  : m_functor(arg_functor),
1188  m_policy(arg_policy),
1189  m_reducer(InvalidType()),
1190  m_result_ptr(arg_view.data()),
1191  m_force_synchronous(!arg_view.impl_track().has_record()) {}
1192 
// Constructor for a reduction driven by an explicit reducer object. The
// result pointer is reducer.view().data(), and m_force_synchronous is true
// when the reducer's view has no tracking record.
1193  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
1194  const ReducerType &reducer)
1195  : m_functor(arg_functor),
1196  m_policy(arg_policy),
1197  m_reducer(reducer),
1198  m_result_ptr(reducer.view().data()),
1199  m_force_synchronous(!reducer.view().impl_track().has_record()) {}
1200 };
1201 
// ParallelReduce specialization for Kokkos::MDRangePolicy on the HPX
// execution space. The constructors build a one-dimensional range policy
// over [0, m_mdr_policy.m_num_tiles) with chunk size 1; execute_task() hands
// each index of that range to HostIterateTile together with a per-thread
// partial result.
1202 template <class FunctorType, class ReducerType, class... Traits>
1203 class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
1204  Kokkos::Experimental::HPX> {
1205  private:
1206  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
1207  using Policy = typename MDRangePolicy::impl_range_policy;
1208  using WorkTag = typename MDRangePolicy::work_tag;
1209  using WorkRange = typename Policy::WorkRange;
1210  using Member = typename Policy::member_type;
1211  using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
1212  MDRangePolicy, FunctorType>;
// Chooses between the functor and the reducer depending on whether
// ReducerType is InvalidType; ReducerConditional::select(m_functor,
// m_reducer) is used below wherever the reduction operations are needed.
1213  using ReducerConditional =
1214  Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1215  FunctorType, ReducerType>;
1216  using ReducerTypeFwd = typename ReducerConditional::type;
1217  using WorkTagFwd =
1218  typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1219  WorkTag, void>::type;
1220  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
1221  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
1222  using ValueOps = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
1223  using pointer_type = typename Analysis::pointer_type;
1224  using value_type = typename Analysis::value_type;
1225  using reference_type = typename Analysis::reference_type;
// Callable built from (policy, functor, accumulator) and invoked with one
// index of the flattened range.
1226  using iterate_type =
1227  typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
1228  WorkTag, reference_type>;
1229 
1230  const FunctorType m_functor;
// The user's multi-dimensional policy and the 1-D policy derived from it.
1231  const MDRangePolicy m_mdr_policy;
1232  const Policy m_policy;
1233  const ReducerType m_reducer;
// Destination of the final result; the copy-out is skipped when null.
1234  const pointer_type m_result_ptr;
1235 
// True when the result view has no tracking record (set by constructors).
1236  bool m_force_synchronous;
1237 
1238  public:
// Passes this object to dispatch_execute_task (defined elsewhere).
1239  void execute() const { dispatch_execute_task(this); }
1240 
// Performs the reduction: initializes one partial value per worker thread,
// processes the flattened range in parallel using the strategy selected by
// KOKKOS_HPX_IMPLEMENTATION, joins all partial values into entry 0, applies
// the final step, and copies the result to m_result_ptr if it is non-null.
1241  inline void execute_task() const {
1242  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1243  const std::size_t value_size =
1244  Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
1245 
// One buffer entry of value_size per worker thread.
1246  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1247  buffer.resize(num_worker_threads, value_size);
1248 
1249 #if KOKKOS_HPX_IMPLEMENTATION == 0
// Variant 0: plain hpx for_loop for both initialization and the work loop.
1250  using hpx::parallel::for_loop;
1251  using hpx::parallel::execution::par;
1252  using hpx::parallel::execution::static_chunk_size;
1253 
1254  for_loop(par, 0, num_worker_threads, [this, &buffer](std::size_t t) {
1255  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1256  reinterpret_cast<pointer_type>(buffer.get(t)));
1257  });
1258 
1259  for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
1260  m_policy.begin(), m_policy.end(), [this, &buffer](const Member i) {
1261  reference_type update = ValueOps::reference(
1262  reinterpret_cast<pointer_type>(buffer.get(
1263  Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1264  iterate_type(m_mdr_policy, m_functor, update)(i);
1265  });
1266 
1267 #elif KOKKOS_HPX_IMPLEMENTATION == 1
// Variant 1: explicit hpx::apply tasks, with latches to wait for completion.
1268  using hpx::apply;
1269  using hpx::lcos::local::latch;
1270 
// Initialize every per-thread partial value before starting the work.
1271  {
1272  latch num_tasks_remaining(num_worker_threads);
1273  ChunkedRoundRobinExecutor exec(num_worker_threads);
1274 
1275  for (int t = 0; t < num_worker_threads; ++t) {
1276  apply(exec, [this, &buffer, &num_tasks_remaining, t]() {
1277  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1278  reinterpret_cast<pointer_type>(buffer.get(t)));
1279 
1280  num_tasks_remaining.count_down(1);
1281  });
1282  }
1283 
1284  num_tasks_remaining.wait();
1285  }
1286 
// One task per chunk; num_tasks is the range length divided by the chunk
// size, rounded up.
1287  const int num_tasks =
1288  (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
1289  m_policy.chunk_size();
1290  latch num_tasks_remaining(num_tasks);
1291  ChunkedRoundRobinExecutor exec(num_tasks);
1292 
1293  for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
1294  i_begin += m_policy.chunk_size()) {
1295  apply(exec, [this, &num_tasks_remaining, &buffer, i_begin]() {
1296  reference_type update =
1297  ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
1298  Kokkos::Experimental::HPX::impl_hardware_thread_id())));
// The last chunk is clamped to the end of the range.
1299  const Member i_end =
1300  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
1301 
1302  for (Member i = i_begin; i < i_end; ++i) {
1303  iterate_type(m_mdr_policy, m_functor, update)(i);
1304  }
1305 
1306  num_tasks_remaining.count_down(1);
1307  });
1308  }
1309 
1310  num_tasks_remaining.wait();
1311 
1312 #elif KOKKOS_HPX_IMPLEMENTATION == 2
// Variant 2: for_loop / for_loop_strided on a ChunkedRoundRobinExecutor.
1313  using hpx::parallel::for_loop;
1314  using hpx::parallel::for_loop_strided;
1315  using hpx::parallel::execution::par;
1316  using hpx::parallel::execution::static_chunk_size;
1317 
1318  {
1319  ChunkedRoundRobinExecutor exec(num_worker_threads);
1320 
1321  for_loop(par.on(exec).with(static_chunk_size(1)), std::size_t(0),
1322  num_worker_threads, [this, &buffer](const std::size_t t) {
1323  ValueInit::init(
1324  ReducerConditional::select(m_functor, m_reducer),
1325  reinterpret_cast<pointer_type>(buffer.get(t)));
1326  });
1327  }
1328 
1329  const int num_tasks =
1330  (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
1331  m_policy.chunk_size();
1332  ChunkedRoundRobinExecutor exec(num_tasks);
1333 
// The strided loop visits the first index of each chunk.
1334  for_loop_strided(
1335  par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
1336  m_policy.end(), m_policy.chunk_size(),
1337  [this, &buffer](const Member i_begin) {
1338  reference_type update =
1339  ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
1340  Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1341  const Member i_end =
1342  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
1343 
1344  for (Member i = i_begin; i < i_end; ++i) {
1345  iterate_type(m_mdr_policy, m_functor, update)(i);
1346  }
1347  });
1348 #endif
1349 
// Common tail: serially join every per-thread partial value into entry 0.
1350  for (int i = 1; i < num_worker_threads; ++i) {
1351  ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
1352  reinterpret_cast<pointer_type>(buffer.get(0)),
1353  reinterpret_cast<pointer_type>(buffer.get(i)));
1354  }
1355 
1356  Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1357  ReducerConditional::select(m_functor, m_reducer),
1358  reinterpret_cast<pointer_type>(buffer.get(0)));
1359 
// Copy all value_count elements of the result to the destination, if any.
1360  if (m_result_ptr != nullptr) {
1361  const int n = Analysis::value_count(
1362  ReducerConditional::select(m_functor, m_reducer));
1363 
1364  for (int j = 0; j < n; ++j) {
1365  m_result_ptr[j] = reinterpret_cast<pointer_type>(buffer.get(0))[j];
1366  }
1367  }
1368  }
1369 
// Constructor for a reduction whose result goes to a view; enabled only when
// ViewType is a Kokkos view and ReducerType is not a reducer type.
// m_policy is built from m_mdr_policy, which is declared (and therefore
// initialized) before it.
1370  template <class ViewType>
1371  inline ParallelReduce(
1372  const FunctorType &arg_functor, MDRangePolicy arg_policy,
1373  const ViewType &arg_view,
1374  typename std::enable_if<Kokkos::is_view<ViewType>::value &&
1375  !Kokkos::is_reducer_type<ReducerType>::value,
1376  void *>::type = nullptr)
1377  : m_functor(arg_functor),
1378  m_mdr_policy(arg_policy),
1379  m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
1380  m_reducer(InvalidType()),
1381  m_result_ptr(arg_view.data()),
1382  m_force_synchronous(!arg_view.impl_track().has_record()) {}
1383 
// Constructor for a reduction driven by an explicit reducer object; the
// result pointer comes from reducer.view().data().
1384  inline ParallelReduce(const FunctorType &arg_functor,
1385  MDRangePolicy arg_policy, const ReducerType &reducer)
1386  : m_functor(arg_functor),
1387  m_mdr_policy(arg_policy),
1388  m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
1389  m_reducer(reducer),
1390  m_result_ptr(reducer.view().data()),
1391  m_force_synchronous(!reducer.view().impl_track().has_record()) {}
1392 };
1393 } // namespace Impl
1394 } // namespace Kokkos
1395 
1396 namespace Kokkos {
1397 namespace Impl {
1398 
// ParallelScan specialization for Kokkos::RangePolicy on the HPX execution
// space. The scan is done in two passes by num_worker_threads tasks: a first
// pass with final == false that produces one total per task, a serial step
// on task 0 that turns those totals into a starting value for every task,
// and a second pass with final == true.
1399 template <class FunctorType, class... Traits>
1400 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
1401  Kokkos::Experimental::HPX> {
1402  private:
1403  using Policy = Kokkos::RangePolicy<Traits...>;
1404  using WorkTag = typename Policy::work_tag;
1405  using WorkRange = typename Policy::WorkRange;
1406  using Member = typename Policy::member_type;
1407  using Analysis =
1408  FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
1409  using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
1410  using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
1411  using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
1412  using pointer_type = typename Analysis::pointer_type;
1413  using reference_type = typename Analysis::reference_type;
1414  using value_type = typename Analysis::value_type;
1415 
1416  const FunctorType m_functor;
1417  const Policy m_policy;
1418 
// Untagged overload (TagType is void): calls functor(i, update, final) for
// every i in [i_begin, i_end).
1419  template <class TagType>
1420  inline static
1421  typename std::enable_if<std::is_same<TagType, void>::value>::type
1422  execute_functor_range(const FunctorType &functor, const Member i_begin,
1423  const Member i_end, reference_type update,
1424  const bool final) {
1425  for (Member i = i_begin; i < i_end; ++i) {
1426  functor(i, update, final);
1427  }
1428  }
1429 
// Tagged overload (TagType is not void): same loop, with a
// value-initialized tag passed as the functor's first argument.
1430  template <class TagType>
1431  inline static
1432  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1433  execute_functor_range(const FunctorType &functor, const Member i_begin,
1434  const Member i_end, reference_type update,
1435  const bool final) {
1436  const TagType t{};
1437  for (Member i = i_begin; i < i_end; ++i) {
1438  functor(t, i, update, final);
1439  }
1440  }
1441 
1442  public:
// Passes this object to dispatch_execute_task (defined elsewhere).
1443  void execute() const { dispatch_execute_task(this); }
1444 
// Runs the two-pass scan described in the class comment.
1445  inline void execute_task() const {
1446  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1447  const int value_count = Analysis::value_count(m_functor);
1448  const std::size_t value_size = Analysis::value_size(m_functor);
1449 
// Each task's buffer entry holds two values: the first-pass total at offset
// 0 and the second-pass starting value at offset value_size.
1450  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1451  buffer.resize(num_worker_threads, 2 * value_size);
1452 
1453  using hpx::apply;
1454  using hpx::lcos::local::barrier;
1455  using hpx::lcos::local::latch;
1456 
// The barrier synchronizes the tasks between phases; the latch lets the
// caller wait until every task has finished.
1457  barrier bar(num_worker_threads);
1458  latch num_tasks_remaining(num_worker_threads);
1459  ChunkedRoundRobinExecutor exec(num_worker_threads);
1460 
1461  for (int t = 0; t < num_worker_threads; ++t) {
1462  apply(exec, [this, &bar, &buffer, &num_tasks_remaining,
1463  num_worker_threads, value_count, value_size, t]() {
// Pass 1: accumulate this task's sub-range into its own first value.
1464  reference_type update_sum = ValueInit::init(
1465  m_functor, reinterpret_cast<pointer_type>(buffer.get(t)));
1466 
1467  const WorkRange range(m_policy, t, num_worker_threads);
1468  execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
1469  update_sum, false);
1470 
// Wait until every task has finished pass 1.
1471  bar.wait();
1472 
// Task 0 alone computes the starting values: entry 0 starts from the
// initialized value, and entry i starts from entry i-1's starting value
// joined with entry i-1's first-pass total.
1473  if (t == 0) {
1474  ValueInit::init(m_functor, reinterpret_cast<pointer_type>(
1475  buffer.get(0) + value_size));
1476 
1477  for (int i = 1; i < num_worker_threads; ++i) {
1478  pointer_type ptr_1_prev =
1479  reinterpret_cast<pointer_type>(buffer.get(i - 1));
1480  pointer_type ptr_2_prev =
1481  reinterpret_cast<pointer_type>(buffer.get(i - 1) + value_size);
1482  pointer_type ptr_2 =
1483  reinterpret_cast<pointer_type>(buffer.get(i) + value_size);
1484 
1485  for (int j = 0; j < value_count; ++j) {
1486  ptr_2[j] = ptr_2_prev[j];
1487  }
1488 
1489  ValueJoin::join(m_functor, ptr_2, ptr_1_prev);
1490  }
1491  }
1492 
// Wait until task 0 has published all starting values.
1493  bar.wait();
1494 
// Pass 2: rerun the same sub-range with final == true, starting from this
// task's starting value.
1495  reference_type update_base = ValueOps::reference(
1496  reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
1497 
1498  execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
1499  update_base, true);
1500 
1501  num_tasks_remaining.count_down(1);
1502  });
1503  }
1504 
1505  num_tasks_remaining.wait();
1506  }
1507 
// Stores copies of the functor and the policy.
1508  inline ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy)
1509  : m_functor(arg_functor), m_policy(arg_policy) {}
1510 };
1511 
// ParallelScanWithTotal specialization for Kokkos::RangePolicy on the HPX
// execution space. It performs the same two-pass scan as ParallelScan and
// additionally assigns the last task's final accumulator value to the
// caller-provided return value.
1512 template <class FunctorType, class ReturnType, class... Traits>
1513 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
1514  ReturnType, Kokkos::Experimental::HPX> {
1515  private:
1516  using Policy = Kokkos::RangePolicy<Traits...>;
1517  using WorkTag = typename Policy::work_tag;
1518  using WorkRange = typename Policy::WorkRange;
1519  using Member = typename Policy::member_type;
1520  using Analysis =
1521  FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
1522  using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
1523  using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
1524  using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
1525  using pointer_type = typename Analysis::pointer_type;
1526  using reference_type = typename Analysis::reference_type;
1527  using value_type = typename Analysis::value_type;
1528 
1529  const FunctorType m_functor;
1530  const Policy m_policy;
// Reference to the caller's variable that receives the scan total; the
// referenced object must stay alive until execute_task() has finished.
1531  ReturnType &m_returnvalue;
1532 
// Untagged overload (TagType is void): calls functor(i, update, final) for
// every i in [i_begin, i_end).
1533  template <class TagType>
1534  inline static
1535  typename std::enable_if<std::is_same<TagType, void>::value>::type
1536  execute_functor_range(const FunctorType &functor, const Member i_begin,
1537  const Member i_end, reference_type update,
1538  const bool final) {
1539  for (Member i = i_begin; i < i_end; ++i) {
1540  functor(i, update, final);
1541  }
1542  }
1543 
// Tagged overload (TagType is not void): same loop, with a
// value-initialized tag passed as the functor's first argument.
1544  template <class TagType>
1545  inline static
1546  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1547  execute_functor_range(const FunctorType &functor, const Member i_begin,
1548  const Member i_end, reference_type update,
1549  const bool final) {
1550  const TagType t{};
1551  for (Member i = i_begin; i < i_end; ++i) {
1552  functor(t, i, update, final);
1553  }
1554  }
1555 
1556  public:
// Passes this object to dispatch_execute_task (defined elsewhere).
1557  void execute() const { dispatch_execute_task(this); }
1558 
// Runs the two-pass scan and stores the total in m_returnvalue.
1559  inline void execute_task() const {
1560  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1561  const int value_count = Analysis::value_count(m_functor);
1562  const std::size_t value_size = Analysis::value_size(m_functor);
1563 
// Each task's buffer entry holds two values: the first-pass total at offset
// 0 and the second-pass starting value at offset value_size.
1564  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1565  buffer.resize(num_worker_threads, 2 * value_size);
1566 
1567  using hpx::apply;
1568  using hpx::lcos::local::barrier;
1569  using hpx::lcos::local::latch;
1570 
// The barrier synchronizes the tasks between phases; the latch lets the
// caller wait until every task has finished.
1571  barrier bar(num_worker_threads);
1572  latch num_tasks_remaining(num_worker_threads);
1573  ChunkedRoundRobinExecutor exec(num_worker_threads);
1574 
1575  for (int t = 0; t < num_worker_threads; ++t) {
1576  apply(exec, [this, &bar, &buffer, &num_tasks_remaining,
1577  num_worker_threads, value_count, value_size, t]() {
// Pass 1: accumulate this task's sub-range into its own first value.
1578  reference_type update_sum = ValueInit::init(
1579  m_functor, reinterpret_cast<pointer_type>(buffer.get(t)));
1580 
1581  const WorkRange range(m_policy, t, num_worker_threads);
1582  execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
1583  update_sum, false);
1584 
// Wait until every task has finished pass 1.
1585  bar.wait();
1586 
// Task 0 alone computes the starting values: entry 0 starts from the
// initialized value, and entry i starts from entry i-1's starting value
// joined with entry i-1's first-pass total.
1587  if (t == 0) {
1588  ValueInit::init(m_functor, reinterpret_cast<pointer_type>(
1589  buffer.get(0) + value_size));
1590 
1591  for (int i = 1; i < num_worker_threads; ++i) {
1592  pointer_type ptr_1_prev =
1593  reinterpret_cast<pointer_type>(buffer.get(i - 1));
1594  pointer_type ptr_2_prev =
1595  reinterpret_cast<pointer_type>(buffer.get(i - 1) + value_size);
1596  pointer_type ptr_2 =
1597  reinterpret_cast<pointer_type>(buffer.get(i) + value_size);
1598 
1599  for (int j = 0; j < value_count; ++j) {
1600  ptr_2[j] = ptr_2_prev[j];
1601  }
1602 
1603  ValueJoin::join(m_functor, ptr_2, ptr_1_prev);
1604  }
1605  }
1606 
// Wait until task 0 has published all starting values.
1607  bar.wait();
1608 
// Pass 2: rerun the same sub-range with final == true, starting from this
// task's starting value.
1609  reference_type update_base = ValueOps::reference(
1610  reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
1611 
1612  execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
1613  update_base, true);
1614 
// Only the last task writes the total, so there is a single writer.
1615  if (t == num_worker_threads - 1) {
1616  m_returnvalue = update_base;
1617  }
1618 
1619  num_tasks_remaining.count_down(1);
1620  });
1621  }
1622 
1623  num_tasks_remaining.wait();
1624  }
1625 
// Stores copies of the functor and policy and binds the return-value
// reference.
1626  inline ParallelScanWithTotal(const FunctorType &arg_functor,
1627  const Policy &arg_policy,
1628  ReturnType &arg_returnvalue)
1629  : m_functor(arg_functor),
1630  m_policy(arg_policy),
1631  m_returnvalue(arg_returnvalue) {}
1632 };
1633 } // namespace Impl
1634 } // namespace Kokkos
1635 
1636 namespace Kokkos {
1637 namespace Impl {
// ParallelFor specialization for Kokkos::TeamPolicy on the HPX execution
// space. The functor is invoked once per league rank in
// [0, m_policy.league_size()); every invocation receives a Member built from
// the policy, the league rank, and a scratch buffer of m_shared bytes.
1638 template <class FunctorType, class... Properties>
1639 class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
1640  Kokkos::Experimental::HPX> {
1641  private:
1642  using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
1643  using WorkTag = typename Policy::work_tag;
1644  using Member = typename Policy::member_type;
1645  using memory_space = Kokkos::HostSpace;
1646 
1647  const FunctorType m_functor;
1648  const Policy m_policy;
// League size captured at construction; not read anywhere in this class.
1649  const int m_league;
// Scratch size passed to every Member; computed in the constructor.
1650  const std::size_t m_shared;
1651 
// Untagged single-rank call (TagType is void). The second Member constructor
// argument is always 0 here (NOTE(review): presumably the team rank --
// confirm against the Member constructor).
1652  template <class TagType>
1653  inline static
1654  typename std::enable_if<std::is_same<TagType, void>::value>::type
1655  execute_functor(const FunctorType &functor, const Policy &policy,
1656  const int league_rank, char *local_buffer,
1657  const std::size_t local_buffer_size) {
1658  functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
1659  }
1660 
// Tagged single-rank call (TagType is not void): a value-initialized tag is
// passed as the functor's first argument.
1661  template <class TagType>
1662  inline static
1663  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1664  execute_functor(const FunctorType &functor, const Policy &policy,
1665  const int league_rank, char *local_buffer,
1666  const std::size_t local_buffer_size) {
1667  const TagType t{};
1668  functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size));
1669  }
1670 
// Untagged range call: invokes the functor for every league rank in
// [league_rank_begin, league_rank_end), reusing the same scratch buffer.
1671  template <class TagType>
1672  inline static
1673  typename std::enable_if<std::is_same<TagType, void>::value>::type
1674  execute_functor_range(const FunctorType &functor, const Policy &policy,
1675  const int league_rank_begin,
1676  const int league_rank_end, char *local_buffer,
1677  const std::size_t local_buffer_size) {
1678  for (int league_rank = league_rank_begin; league_rank < league_rank_end;
1679  ++league_rank) {
1680  functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
1681  }
1682  }
1683 
// Tagged range call: same loop, with a value-initialized tag passed as the
// functor's first argument.
1684  template <class TagType>
1685  inline static
1686  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1687  execute_functor_range(const FunctorType &functor, const Policy &policy,
1688  const int league_rank_begin,
1689  const int league_rank_end, char *local_buffer,
1690  const std::size_t local_buffer_size) {
1691  const TagType t{};
1692  for (int league_rank = league_rank_begin; league_rank < league_rank_end;
1693  ++league_rank) {
1694  functor(t,
1695  Member(policy, 0, league_rank, local_buffer, local_buffer_size));
1696  }
1697  }
1698 
1699  public:
// Passes this object to dispatch_execute_task (defined elsewhere).
1700  void execute() const { dispatch_execute_task(this); }
1701 
// Runs the functor for every league rank, using the strategy selected by
// KOKKOS_HPX_IMPLEMENTATION.
1702  inline void execute_task() const {
1703  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1704 
// One scratch entry of m_shared bytes per worker thread; each invocation
// uses the entry selected by impl_hardware_thread_id().
1705  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1706  buffer.resize(num_worker_threads, m_shared);
1707 
1708 #if KOKKOS_HPX_IMPLEMENTATION == 0
// Variant 0: hpx for_loop over the league ranks, one rank per iteration.
1709  using hpx::parallel::for_loop;
1710  using hpx::parallel::execution::par;
1711  using hpx::parallel::execution::static_chunk_size;
1712 
1713  for_loop(
1714  par.with(static_chunk_size(m_policy.chunk_size())), 0,
1715  m_policy.league_size(), [this, &buffer](const int league_rank) {
1716  execute_functor<WorkTag>(
1717  m_functor, m_policy, league_rank,
1718  buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()),
1719  m_shared);
1720  });
1721 
1722 #elif KOKKOS_HPX_IMPLEMENTATION == 1
// Variant 1: one hpx::apply task per chunk of league ranks; a latch waits
// for all of them. num_tasks is league_size / chunk_size, rounded up.
1723  using hpx::apply;
1724  using hpx::lcos::local::latch;
1725 
1726  const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) /
1727  m_policy.chunk_size();
1728  latch num_tasks_remaining(num_tasks);
1729  ChunkedRoundRobinExecutor exec(num_tasks);
1730 
1731  for (int league_rank_begin = 0; league_rank_begin < m_policy.league_size();
1732  league_rank_begin += m_policy.chunk_size()) {
1733  apply(exec, [this, &buffer, &num_tasks_remaining, league_rank_begin]() {
// The last chunk is clamped to the league size.
1734  const int league_rank_end = (std::min)(
1735  league_rank_begin + m_policy.chunk_size(), m_policy.league_size());
1736  execute_functor_range<WorkTag>(
1737  m_functor, m_policy, league_rank_begin, league_rank_end,
1738  buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()),
1739  m_shared);
1740 
1741  num_tasks_remaining.count_down(1);
1742  });
1743  }
1744 
1745  num_tasks_remaining.wait();
1746 
1747 #elif KOKKOS_HPX_IMPLEMENTATION == 2
// Variant 2: for_loop_strided visits the first league rank of each chunk on
// a ChunkedRoundRobinExecutor.
1748  using hpx::parallel::for_loop_strided;
1749  using hpx::parallel::execution::par;
1750  using hpx::parallel::execution::static_chunk_size;
1751 
1752  const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) /
1753  m_policy.chunk_size();
1754  ChunkedRoundRobinExecutor exec(num_tasks);
1755 
1756  for_loop_strided(
1757  par.on(exec).with(static_chunk_size(1)), 0, m_policy.league_size(),
1758  m_policy.chunk_size(), [this, &buffer](const int league_rank_begin) {
1759  const int league_rank_end =
1760  (std::min)(league_rank_begin + m_policy.chunk_size(),
1761  m_policy.league_size());
1762  execute_functor_range<WorkTag>(
1763  m_functor, m_policy, league_rank_begin, league_rank_end,
1764  buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()),
1765  m_shared);
1766  });
1767 #endif
1768  }
1769 
// Stores the functor and policy. m_shared is the sum of the policy's scratch
// sizes for levels 0 and 1 plus the amount reported by FunctorTeamShmemSize
// for the functor and the policy's team size.
1770  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
1771  : m_functor(arg_functor),
1772  m_policy(arg_policy),
1773  m_league(arg_policy.league_size()),
1774  m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
1775  FunctorTeamShmemSize<FunctorType>::value(
1776  arg_functor, arg_policy.team_size())) {}
1777 };
1778 
1779 template <class FunctorType, class ReducerType, class... Properties>
1780 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
1781  ReducerType, Kokkos::Experimental::HPX> {
1782  private:
// Policy, analysis, member, and tag types for the team-policy reduction.
1783  using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
1784  using Analysis =
1785  FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
1786  using Member = typename Policy::member_type;
1787  using WorkTag = typename Policy::work_tag;
// Chooses between FunctorType and ReducerType depending on whether
// ReducerType is InvalidType (assuming if_c behaves like std::conditional --
// TODO confirm).
1788  using ReducerConditional =
1789  Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1790  FunctorType, ReducerType>;
1791  using ReducerTypeFwd = typename ReducerConditional::type;
// Work tag forwarded to the value helpers, selected the same way between
// WorkTag and void.
1792  using WorkTagFwd =
1793  typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1794  WorkTag, void>::type;
1795  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
1796  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
1797  using ValueOps = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
1798  using pointer_type = typename Analysis::pointer_type;
1799  using reference_type = typename Analysis::reference_type;
1800  using value_type = typename Analysis::value_type;
1801 
// Data members. Note the declaration order (m_league precedes m_policy):
// members are initialized in this order regardless of how a constructor's
// initializer list is written.
1802  const FunctorType m_functor;
1803  const int m_league;
1804  const Policy m_policy;
1805  const ReducerType m_reducer;
1806  pointer_type m_result_ptr;
1807  const std::size_t m_shared;
1808 
1809  bool m_force_synchronous;
1810 
1811  template <class TagType>
1812  inline static
1813  typename std::enable_if<std::is_same<TagType, void>::value>::type
1814  execute_functor(const FunctorType &functor, const Policy &policy,
1815  const int league_rank, char *local_buffer,
1816  const std::size_t local_buffer_size,
1817  reference_type update) {
1818  functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
1819  update);
1820  }
1821 
1822  template <class TagType>
1823  inline static
1824  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1825  execute_functor(const FunctorType &functor, const Policy &policy,
1826  const int league_rank, char *local_buffer,
1827  const std::size_t local_buffer_size,
1828  reference_type update) {
1829  const TagType t{};
1830  functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size),
1831  update);
1832  }
1833 
1834  template <class TagType>
1835  inline static
1836  typename std::enable_if<std::is_same<TagType, void>::value>::type
1837  execute_functor_range(const FunctorType &functor, const Policy &policy,
1838  const int league_rank_begin,
1839  const int league_rank_end, char *local_buffer,
1840  const std::size_t local_buffer_size,
1841  reference_type update) {
1842  for (int league_rank = league_rank_begin; league_rank < league_rank_end;
1843  ++league_rank) {
1844  functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
1845  update);
1846  }
1847  }
1848 
1849  template <class TagType>
1850  inline static
1851  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1852  execute_functor_range(const FunctorType &functor, const Policy &policy,
1853  const int league_rank_begin,
1854  const int league_rank_end, char *local_buffer,
1855  const std::size_t local_buffer_size,
1856  reference_type update) {
1857  const TagType t{};
1858  for (int league_rank = league_rank_begin; league_rank < league_rank_end;
1859  ++league_rank) {
1860  functor(t,
1861  Member(policy, 0, league_rank, local_buffer, local_buffer_size),
1862  update);
1863  }
1864  }
1865 
1866  public:
1867  void execute() const { dispatch_execute_task(this); }
1868 
1869  inline void execute_task() const {
1870  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1871  const std::size_t value_size =
1872  Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
1873 
1874  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1875  buffer.resize(num_worker_threads, value_size + m_shared);
1876 
1877 #if KOKKOS_HPX_IMPLEMENTATION == 0
1878  using hpx::parallel::for_loop;
1879  using hpx::parallel::execution::par;
1880 
1881  for_loop(par, 0, num_worker_threads, [this, &buffer](const std::size_t t) {
1882  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1883  reinterpret_cast<pointer_type>(buffer.get(t)));
1884  });
1885 
1886  using hpx::parallel::execution::static_chunk_size;
1887 
1888  hpx::parallel::for_loop(
1889  par.with(static_chunk_size(m_policy.chunk_size())), 0,
1890  m_policy.league_size(),
1891  [this, &buffer, value_size](const int league_rank) {
1892  std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
1893  reference_type update = ValueOps::reference(
1894  reinterpret_cast<pointer_type>(buffer.get(t)));
1895 
1896  execute_functor<WorkTag>(m_functor, m_policy, league_rank,
1897  buffer.get(t) + value_size, m_shared,
1898  update);
1899  });
1900 
1901 #elif KOKKOS_HPX_IMPLEMENTATION == 1
1902  using hpx::apply;
1903  using hpx::lcos::local::latch;
1904 
1905  {
1906  latch num_tasks_remaining(num_worker_threads);
1907  ChunkedRoundRobinExecutor exec(num_worker_threads);
1908 
1909  for (int t = 0; t < num_worker_threads; ++t) {
1910  apply(exec, [this, &buffer, &num_tasks_remaining, t]() {
1911  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1912  reinterpret_cast<pointer_type>(buffer.get(t)));
1913 
1914  num_tasks_remaining.count_down(1);
1915  });
1916  }
1917 
1918  num_tasks_remaining.wait();
1919  }
1920 
1921  const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) /
1922  m_policy.chunk_size();
1923  latch num_tasks_remaining(num_tasks);
1924  ChunkedRoundRobinExecutor exec(num_tasks);
1925 
1926  for (int league_rank_begin = 0; league_rank_begin < m_policy.league_size();
1927  league_rank_begin += m_policy.chunk_size()) {
1928  apply(exec, [this, &buffer, &num_tasks_remaining, league_rank_begin,
1929  value_size]() {
1930  std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
1931  reference_type update =
1932  ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(t)));
1933  const int league_rank_end = (std::min)(
1934  league_rank_begin + m_policy.chunk_size(), m_policy.league_size());
1935  execute_functor_range<WorkTag>(
1936  m_functor, m_policy, league_rank_begin, league_rank_end,
1937  buffer.get(t) + value_size, m_shared, update);
1938 
1939  num_tasks_remaining.count_down(1);
1940  });
1941  }
1942 
1943  num_tasks_remaining.wait();
1944 
1945 #elif KOKKOS_HPX_IMPLEMENTATION == 2
1946  using hpx::parallel::for_loop;
1947  using hpx::parallel::for_loop_strided;
1948  using hpx::parallel::execution::par;
1949  using hpx::parallel::execution::static_chunk_size;
1950 
1951  {
1952  ChunkedRoundRobinExecutor exec(num_worker_threads);
1953 
1954  for_loop(par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
1955  [this, &buffer](std::size_t const t) {
1956  ValueInit::init(
1957  ReducerConditional::select(m_functor, m_reducer),
1958  reinterpret_cast<pointer_type>(buffer.get(t)));
1959  });
1960  }
1961 
1962  const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) /
1963  m_policy.chunk_size();
1964  ChunkedRoundRobinExecutor exec(num_tasks);
1965 
1966  for_loop_strided(
1967  par.on(exec).with(static_chunk_size(1)), 0, m_policy.league_size(),
1968  m_policy.chunk_size(),
1969  [this, &buffer, value_size](int const league_rank_begin) {
1970  std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
1971  reference_type update = ValueOps::reference(
1972  reinterpret_cast<pointer_type>(buffer.get(t)));
1973  const int league_rank_end =
1974  (std::min)(league_rank_begin + m_policy.chunk_size(),
1975  m_policy.league_size());
1976  execute_functor_range<WorkTag>(
1977  m_functor, m_policy, league_rank_begin, league_rank_end,
1978  buffer.get(t) + value_size, m_shared, update);
1979  });
1980 #endif
1981 
1982  const pointer_type ptr = reinterpret_cast<pointer_type>(buffer.get(0));
1983  for (int t = 1; t < num_worker_threads; ++t) {
1984  ValueJoin::join(ReducerConditional::select(m_functor, m_reducer), ptr,
1985  reinterpret_cast<pointer_type>(buffer.get(t)));
1986  }
1987 
1988  Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1989  ReducerConditional::select(m_functor, m_reducer), ptr);
1990 
1991  if (m_result_ptr) {
1992  const int n = Analysis::value_count(
1993  ReducerConditional::select(m_functor, m_reducer));
1994 
1995  for (int j = 0; j < n; ++j) {
1996  m_result_ptr[j] = ptr[j];
1997  }
1998  }
1999  }
2000 
2001  template <class ViewType>
2002  ParallelReduce(
2003  const FunctorType &arg_functor, const Policy &arg_policy,
2004  const ViewType &arg_result,
2005  typename std::enable_if<Kokkos::is_view<ViewType>::value &&
2006  !Kokkos::is_reducer_type<ReducerType>::value,
2007  void *>::type = nullptr)
2008  : m_functor(arg_functor),
2009  m_league(arg_policy.league_size()),
2010  m_policy(arg_policy),
2011  m_reducer(InvalidType()),
2012  m_result_ptr(arg_result.data()),
2013  m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
2014  FunctorTeamShmemSize<FunctorType>::value(
2015  m_functor, arg_policy.team_size())),
2016  m_force_synchronous(!arg_result.impl_track().has_record()) {}
2017 
2018  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
2019  const ReducerType &reducer)
2020  : m_functor(arg_functor),
2021  m_league(arg_policy.league_size()),
2022  m_policy(arg_policy),
2023  m_reducer(reducer),
2024  m_result_ptr(reducer.view().data()),
2025  m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
2026  FunctorTeamShmemSize<FunctorType>::value(
2027  arg_functor, arg_policy.team_size())),
2028  m_force_synchronous(!reducer.view().impl_track().has_record()) {}
2029 };
2030 } // namespace Impl
2031 } // namespace Kokkos
2032 
2033 namespace Kokkos {
2034 
2035 template <typename iType>
2036 KOKKOS_INLINE_FUNCTION
2037  Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2038  TeamThreadRange(const Impl::HPXTeamMember &thread, const iType &count) {
2039  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2040  thread, count);
2041 }
2042 
2043 template <typename iType1, typename iType2>
2044 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
2045  typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember>
2046 TeamThreadRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
2047  const iType2 &i_end) {
2048  using iType = typename std::common_type<iType1, iType2>::type;
2049  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2050  thread, iType(i_begin), iType(i_end));
2051 }
2052 
2053 template <typename iType>
2054 KOKKOS_INLINE_FUNCTION
2055  Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2056  TeamVectorRange(const Impl::HPXTeamMember &thread, const iType &count) {
2057  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2058  thread, count);
2059 }
2060 
2061 template <typename iType1, typename iType2>
2062 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
2063  typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember>
2064 TeamVectorRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
2065  const iType2 &i_end) {
2066  using iType = typename std::common_type<iType1, iType2>::type;
2067  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2068  thread, iType(i_begin), iType(i_end));
2069 }
2070 
2071 template <typename iType>
2072 KOKKOS_INLINE_FUNCTION
2073  Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2074  ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType &count) {
2075  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2076  thread, count);
2077 }
2078 
2079 template <typename iType>
2080 KOKKOS_INLINE_FUNCTION
2081  Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2082  ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType &i_begin,
2083  const iType &i_end) {
2084  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2085  thread, i_begin, i_end);
2086 }
2087 
2088 KOKKOS_INLINE_FUNCTION
2089 Impl::ThreadSingleStruct<Impl::HPXTeamMember> PerTeam(
2090  const Impl::HPXTeamMember &thread) {
2091  return Impl::ThreadSingleStruct<Impl::HPXTeamMember>(thread);
2092 }
2093 
2094 KOKKOS_INLINE_FUNCTION
2095 Impl::VectorSingleStruct<Impl::HPXTeamMember> PerThread(
2096  const Impl::HPXTeamMember &thread) {
2097  return Impl::VectorSingleStruct<Impl::HPXTeamMember>(thread);
2098 }
2099 
2105 template <typename iType, class Lambda>
2106 KOKKOS_INLINE_FUNCTION void parallel_for(
2107  const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2108  &loop_boundaries,
2109  const Lambda &lambda) {
2110  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2111  i += loop_boundaries.increment)
2112  lambda(i);
2113 }
2114 
2121 template <typename iType, class Lambda, typename ValueType>
2122 KOKKOS_INLINE_FUNCTION void parallel_reduce(
2123  const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2124  &loop_boundaries,
2125  const Lambda &lambda, ValueType &result) {
2126  result = ValueType();
2127  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2128  i += loop_boundaries.increment) {
2129  lambda(i, result);
2130  }
2131 }
2132 
2138 template <typename iType, class Lambda>
2139 KOKKOS_INLINE_FUNCTION void parallel_for(
2140  const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2141  &loop_boundaries,
2142  const Lambda &lambda) {
2143 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2144 #pragma ivdep
2145 #endif
2146  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2147  i += loop_boundaries.increment) {
2148  lambda(i);
2149  }
2150 }
2151 
2158 template <typename iType, class Lambda, typename ValueType>
2159 KOKKOS_INLINE_FUNCTION void parallel_reduce(
2160  const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2161  &loop_boundaries,
2162  const Lambda &lambda, ValueType &result) {
2163  result = ValueType();
2164 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2165 #pragma ivdep
2166 #endif
2167  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2168  i += loop_boundaries.increment) {
2169  lambda(i, result);
2170  }
2171 }
2172 
2173 template <typename iType, class Lambda, typename ReducerType>
2174 KOKKOS_INLINE_FUNCTION void parallel_reduce(
2175  const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2176  &loop_boundaries,
2177  const Lambda &lambda, const ReducerType &reducer) {
2178  reducer.init(reducer.reference());
2179  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2180  i += loop_boundaries.increment) {
2181  lambda(i, reducer.reference());
2182  }
2183 }
2184 
2185 template <typename iType, class Lambda, typename ReducerType>
2186 KOKKOS_INLINE_FUNCTION void parallel_reduce(
2187  const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2188  &loop_boundaries,
2189  const Lambda &lambda, const ReducerType &reducer) {
2190  reducer.init(reducer.reference());
2191 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2192 #pragma ivdep
2193 #endif
2194  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2195  i += loop_boundaries.increment) {
2196  lambda(i, reducer.reference());
2197  }
2198 }
2199 
2200 template <typename iType, class FunctorType>
2201 KOKKOS_INLINE_FUNCTION void parallel_scan(
2202  Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> const
2203  &loop_boundaries,
2204  const FunctorType &lambda) {
2205  using value_type = typename Kokkos::Impl::FunctorAnalysis<
2206  Kokkos::Impl::FunctorPatternInterface::SCAN, void,
2207  FunctorType>::value_type;
2208 
2209  value_type scan_val = value_type();
2210 
2211  // Intra-member scan
2212  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2213  i += loop_boundaries.increment) {
2214  lambda(i, scan_val, false);
2215  }
2216 
2217  // 'scan_val' output is the exclusive prefix sum
2218  scan_val = loop_boundaries.thread.team_scan(scan_val);
2219 
2220  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2221  i += loop_boundaries.increment) {
2222  lambda(i, scan_val, true);
2223  }
2224 }
2225 
2237 template <typename iType, class FunctorType>
2238 KOKKOS_INLINE_FUNCTION void parallel_scan(
2239  const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2240  &loop_boundaries,
2241  const FunctorType &lambda) {
2242  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
2243  using value_type = typename ValueTraits::value_type;
2244 
2245  value_type scan_val = value_type();
2246 
2247 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2248 #pragma ivdep
2249 #endif
2250  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2251  i += loop_boundaries.increment) {
2252  lambda(i, scan_val, true);
2253  }
2254 }
2255 
2256 template <class FunctorType>
2257 KOKKOS_INLINE_FUNCTION void single(
2258  const Impl::VectorSingleStruct<Impl::HPXTeamMember> &,
2259  const FunctorType &lambda) {
2260  lambda();
2261 }
2262 
2263 template <class FunctorType>
2264 KOKKOS_INLINE_FUNCTION void single(
2265  const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &,
2266  const FunctorType &lambda) {
2267  lambda();
2268 }
2269 
2270 template <class FunctorType, class ValueType>
2271 KOKKOS_INLINE_FUNCTION void single(
2272  const Impl::VectorSingleStruct<Impl::HPXTeamMember> &,
2273  const FunctorType &lambda, ValueType &val) {
2274  lambda(val);
2275 }
2276 
2277 template <class FunctorType, class ValueType>
2278 KOKKOS_INLINE_FUNCTION void single(
2279  const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &,
2280  const FunctorType &lambda, ValueType &val) {
2281  lambda(val);
2282 }
2283 
2284 } // namespace Kokkos
2285 
2286 #include <HPX/Kokkos_HPX_Task.hpp>
2287 
2288 #endif /* #if defined( KOKKOS_ENABLE_HPX ) */
2289 #endif /* #ifndef KOKKOS_HPX_HPP */
KOKKOS_INLINE_FUNCTION size_type acquire() const
Acquire a value such that 0 <= value < size().
KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct< iType, TeamMemberType > TeamThreadRange(const TeamMemberType &, const iType &count)=delete
Execution policy for parallel work over the threads within a team.
void print_configuration(std::ostream &, const bool detail=false)
Print "Bill of Materials".
Scratch memory space associated with an execution space.
Memory management for host memory.
Declaration of various MemoryLayout options.
UniqueToken(execution_space const &=execution_space())
Create an object sized for the concurrency of the given instance.
std::enable_if< std::is_same< typename Kokkos::View< T, P...>::array_layout, Kokkos::LayoutLeft >::value||std::is_same< typename Kokkos::View< T, P...>::array_layout, Kokkos::LayoutRight >::value >::type resize(Kokkos::View< T, P...> &v, const size_t n0=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n2=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n3=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n4=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n5=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7=KOKKOS_IMPL_CTOR_DEFAULT_ARG)
Resize a view with copying old data to new data at the corresponding indices.
Declaration of parallel operators.
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename std::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=nullptr)
Parallel reduction.
KOKKOS_INLINE_FUNCTION void release(size_type) const
Release a value previously obtained from acquire().
ReturnType
KOKKOS_INLINE_FUNCTION size_type size() const
Upper bound for acquired values, i.e. 0 <= value < size().
KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct< iType, TeamMemberType > TeamVectorRange(const TeamMemberType &, const iType &count)=delete
Execution policy for parallel work over the threads within a team.
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename std::enable_if< Kokkos::Impl::is_execution_policy< ExecPolicy >::value >::type *=nullptr)
Execute functor in parallel according to the execution policy.
Execution policy for work over a range of an integral type.
KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct< iType, TeamMemberType > ThreadVectorRange(const TeamMemberType &, const iType &count)=delete
Execution policy for a vector parallel loop.