Kokkos Core Kernels Package  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Kokkos_HPX.hpp
1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 2.0
6 // Copyright (2014) Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
39 //
40 // ************************************************************************
41 //@HEADER
42 */
43 
44 #ifndef KOKKOS_HPX_HPP
45 #define KOKKOS_HPX_HPP
46 
47 #include <Kokkos_Macros.hpp>
48 #if defined(KOKKOS_ENABLE_HPX)
49 
50 #include <Kokkos_Core_fwd.hpp>
51 
52 #include <Kokkos_HostSpace.hpp>
53 #include <cstddef>
54 #include <iosfwd>
55 
56 #ifdef KOKKOS_ENABLE_HBWSPACE
57 #include <Kokkos_HBWSpace.hpp>
58 #endif
59 
60 #include <Kokkos_HostSpace.hpp>
61 #include <Kokkos_Layout.hpp>
62 #include <Kokkos_MemoryTraits.hpp>
63 #include <Kokkos_Parallel.hpp>
64 #include <Kokkos_ScratchSpace.hpp>
65 #include <Kokkos_TaskScheduler.hpp>
66 #include <impl/Kokkos_FunctorAdapter.hpp>
67 #include <impl/Kokkos_FunctorAnalysis.hpp>
68 #include <impl/Kokkos_Profiling_Interface.hpp>
69 #include <impl/Kokkos_Tags.hpp>
70 #include <impl/Kokkos_TaskQueue.hpp>
71 
72 #include <KokkosExp_MDRangePolicy.hpp>
73 
74 #include <hpx/apply.hpp>
75 #include <hpx/hpx_start.hpp>
76 #include <hpx/lcos/local/barrier.hpp>
77 #include <hpx/lcos/local/counting_semaphore.hpp>
78 #include <hpx/parallel/algorithms/for_loop.hpp>
79 #include <hpx/parallel/algorithms/reduce.hpp>
80 #include <hpx/parallel/executors/static_chunk_size.hpp>
81 #include <hpx/runtime.hpp>
82 #include <hpx/runtime/threads/run_as_hpx_thread.hpp>
83 #include <hpx/runtime/threads/threadmanager.hpp>
84 
85 #include <iostream>
86 #include <memory>
87 #include <sstream>
88 #include <stdexcept>
89 #include <type_traits>
90 #include <vector>
91 
92 // There are currently two different implementations for the parallel dispatch
93 // functions:
94 //
95 // - 0: The HPX way. Unfortunately, this comes with unnecessary
96 // overheads at the moment, so there is
97 // - 1: The manual way. This way is more verbose and does not take advantage of
98 // e.g. parallel::for_loop in HPX but it is significantly faster in many
99 // benchmarks.
100 //
101 // In the long run 0 should be the preferred implementation, but until HPX is
102 // improved 1 will be the default.
103 #ifndef KOKKOS_HPX_IMPLEMENTATION
104 #define KOKKOS_HPX_IMPLEMENTATION 1
105 #endif
106 
107 #if (KOKKOS_HPX_IMPLEMENTATION < 0) || (KOKKOS_HPX_IMPLEMENTATION > 1)
108 #error "You have chosen an invalid value for KOKKOS_HPX_IMPLEMENTATION"
109 #endif
110 
111 namespace Kokkos {
112 namespace Impl {
/// Owns a single heap allocation that is divided into equally sized slices,
/// one per thread. Each slice is padded up to a multiple of the cache line
/// size (64 bytes).
///
/// The allocation only grows: resize() reallocates when the requested total
/// exceeds the current capacity and otherwise keeps the existing storage.
/// Contents are not preserved across a reallocation. The class is neither
/// copyable nor movable.
class thread_buffer {
  static constexpr std::size_t m_cache_line_size = 64;

  // Default member initializers guarantee a well-defined empty state for
  // every constructor, including the one that immediately calls resize().
  std::size_t m_num_threads = 0;
  std::size_t m_size_per_thread = 0;
  std::size_t m_size_total = 0;
  char *m_data = nullptr;

  /// Rounds `size` up in place to the next multiple of the cache line size.
  static void pad_to_cache_line(std::size_t &size) noexcept {
    size = ((size + m_cache_line_size - 1) / m_cache_line_size) *
           m_cache_line_size;
  }

public:
  /// Creates an empty buffer that owns no storage.
  thread_buffer() = default;

  /// Creates a buffer with room for `num_threads` slices of at least
  /// `size_per_thread` bytes each.
  thread_buffer(const std::size_t num_threads,
                const std::size_t size_per_thread) {
    resize(num_threads, size_per_thread);
  }

  ~thread_buffer() { delete[] m_data; }

  thread_buffer(const thread_buffer &) = delete;
  thread_buffer(thread_buffer &&) = delete;
  thread_buffer &operator=(const thread_buffer &) = delete;
  thread_buffer &operator=(thread_buffer &&) = delete;

  /// Reconfigures the buffer for `num_threads` slices of at least
  /// `size_per_thread` bytes each (padded to the cache line size).
  ///
  /// Storage is reallocated only if the new total exceeds the current
  /// capacity; the capacity reported by size_total() never shrinks.
  void resize(const std::size_t num_threads,
              const std::size_t size_per_thread) {
    m_num_threads = num_threads;
    m_size_per_thread = size_per_thread;

    pad_to_cache_line(m_size_per_thread);

    const std::size_t size_total_new = m_num_threads * m_size_per_thread;

    if (m_size_total < size_total_new) {
      // Drop the old storage and reset the bookkeeping first so that the
      // object stays consistent (and the destructor stays safe) if the
      // allocation below throws.
      delete[] m_data;
      m_data = nullptr;
      m_size_total = 0;

      m_data = new char[size_total_new];
      m_size_total = size_total_new;
    }
  }

  /// Returns a pointer to the first byte of the slice of thread
  /// `thread_num`, or nullptr if no storage has been allocated.
  /// `thread_num` must be smaller than the configured number of threads.
  char *get(std::size_t thread_num) {
    assert(thread_num < m_num_threads);
    if (m_data == nullptr) {
      return nullptr;
    }
    return &m_data[thread_num * m_size_per_thread];
  }

  /// Padded size in bytes of one thread's slice.
  std::size_t size_per_thread() const noexcept { return m_size_per_thread; }

  /// Capacity in bytes of the underlying allocation.
  std::size_t size_total() const noexcept { return m_size_total; }
};
168 } // namespace Impl
169 
170 namespace Experimental {
171 class HPX {
172 private:
173  static bool m_hpx_initialized;
174  static Kokkos::Impl::thread_buffer m_buffer;
175 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
176  static hpx::future<void> m_future;
177 #endif
178 
179 public:
180  using execution_space = HPX;
181  using memory_space = HostSpace;
182  using device_type = Kokkos::Device<execution_space, memory_space>;
183  using array_layout = LayoutRight;
184  using size_type = memory_space::size_type;
185  using scratch_memory_space = ScratchMemorySpace<HPX>;
186 
187  HPX() noexcept {}
188  static void print_configuration(std::ostream &,
189  const bool /* verbose */ = false) {
190  std::cout << "HPX backend" << std::endl;
191  }
192 
193  static bool in_parallel(HPX const & = HPX()) noexcept { return false; }
194  static void impl_static_fence(HPX const & = HPX())
195  #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
196  {
197  if (hpx::threads::get_self_ptr() == nullptr) {
198  hpx::threads::run_as_hpx_thread([]() { impl_get_future().wait(); });
199  } else {
200  impl_get_future().wait();
201  }
202  }
203  #else
204  noexcept {
205  }
206  #endif
207 
208  #ifdef KOKKOS_ENABLE_DEPRECATED_CODE
209  static void fence(HPX const & = HPX()) {
210  #else
211  void fence() const {
212  #endif
213  impl_static_fence();
214  }
215 
216  static bool is_asynchronous(HPX const & = HPX()) noexcept {
217 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
218  return true;
219 #else
220  return false;
221 #endif
222  }
223 
224  static std::vector<HPX> partition(...) {
225  Kokkos::abort("Kokkos::Experimental::HPX::partition_master: can't partition an HPX "
226  "instance\n");
227  return std::vector<HPX>();
228  }
229 
230  template <typename F>
231  static void partition_master(F const &f, int requested_num_partitions = 0,
232  int requested_partition_size = 0) {
233  if (requested_num_partitions > 1) {
234  Kokkos::abort("Kokkos::Experimental::HPX::partition_master: can't partition an "
235  "HPX instance\n");
236  }
237  }
238 
239  static int concurrency();
240  static void impl_initialize(int thread_count);
241  static void impl_initialize();
242  static bool impl_is_initialized() noexcept;
243  static void impl_finalize();
244 
245  static int impl_thread_pool_size() noexcept {
246  hpx::runtime *rt = hpx::get_runtime_ptr();
247  if (rt == nullptr) {
248  return 0;
249  } else {
250  if (hpx::threads::get_self_ptr() == nullptr) {
251  return hpx::resource::get_thread_pool(0).get_os_thread_count();
252  } else {
253  return hpx::this_thread::get_pool()->get_os_thread_count();
254  }
255  }
256  }
257 
258  static int impl_thread_pool_rank() noexcept {
259  hpx::runtime *rt = hpx::get_runtime_ptr();
260  if (rt == nullptr) {
261  return 0;
262  } else {
263  if (hpx::threads::get_self_ptr() == nullptr) {
264  return 0;
265  } else {
266  return hpx::this_thread::get_pool()->get_pool_index();
267  }
268  }
269  }
270 
271  static int impl_thread_pool_size(int depth) {
272  if (depth == 0) {
273  return impl_thread_pool_size();
274  } else {
275  return 1;
276  }
277  }
278 
279  static int impl_max_hardware_threads() noexcept {
280  return hpx::threads::hardware_concurrency();
281  }
282 
283  static int impl_hardware_thread_id() noexcept {
284  return hpx::get_worker_thread_num();
285  }
286 
287  static Kokkos::Impl::thread_buffer &impl_get_buffer() noexcept {
288  return m_buffer;
289  }
290 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
291  static hpx::future<void> &impl_get_future() noexcept { return m_future; }
292 #endif
293 
294  static constexpr const char *name() noexcept { return "HPX"; }
295 };
296 } // namespace Experimental
297 
298 namespace Impl {
299 template <typename Closure>
300 inline void dispatch_execute_task(Closure *closure) {
301 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
302  if (hpx::threads::get_self_ptr() == nullptr) {
303  hpx::threads::run_as_hpx_thread([closure]() {
304  hpx::future<void> &fut = Kokkos::Experimental::HPX::impl_get_future();
305  Closure closure_copy = *closure;
306  fut = fut.then([closure_copy](hpx::future<void> &&) {
307  closure_copy.execute_task();
308  });
309  });
310  } else {
311  hpx::future<void> &fut = Kokkos::Experimental::HPX::impl_get_future();
312  Closure closure_copy = *closure;
313  fut = fut.then(
314  [closure_copy](hpx::future<void> &&) { closure_copy.execute_task(); });
315  }
316 #else
317  if (hpx::threads::get_self_ptr() == nullptr) {
318  hpx::threads::run_as_hpx_thread([closure]() { closure->execute_task(); });
319  } else {
320  closure->execute_task();
321  }
322 #endif
323 }
324 } // namespace Impl
325 } // namespace Kokkos
326 
327 namespace Kokkos {
328 namespace Impl {
329 template <>
330 struct MemorySpaceAccess<Kokkos::Experimental::HPX::memory_space,
331  Kokkos::Experimental::HPX::scratch_memory_space> {
332  enum { assignable = false };
333  enum { accessible = true };
334  enum { deepcopy = false };
335 };
336 
337 template <>
338 struct VerifyExecutionCanAccessMemorySpace<
339  Kokkos::Experimental::HPX::memory_space,
340  Kokkos::Experimental::HPX::scratch_memory_space> {
341  enum { value = true };
342  inline static void verify(void) {}
343  inline static void verify(const void *) {}
344 };
345 } // namespace Impl
346 } // namespace Kokkos
347 
348 namespace Kokkos {
349 namespace Experimental {
350 template <> class UniqueToken<HPX, UniqueTokenScope::Instance> {
351 public:
352  using execution_space = HPX;
353  using size_type = int;
354  UniqueToken(execution_space const & = execution_space()) noexcept {}
355 
356  // NOTE: Currently this assumes that there is no oversubscription.
357  // hpx::get_num_worker_threads can't be used directly because it may yield
358  // it's task (problematic if called after hpx::get_worker_thread_num).
359  int size() const noexcept { return HPX::impl_max_hardware_threads(); }
360  int acquire() const noexcept { return HPX::impl_hardware_thread_id(); }
361  void release(int) const noexcept {}
362 };
363 
364 template <> class UniqueToken<HPX, UniqueTokenScope::Global> {
365 public:
366  using execution_space = HPX;
367  using size_type = int;
368  UniqueToken(execution_space const & = execution_space()) noexcept {}
369 
370  // NOTE: Currently this assumes that there is no oversubscription.
371  // hpx::get_num_worker_threads can't be used directly because it may yield
372  // it's task (problematic if called after hpx::get_worker_thread_num).
373  int size() const noexcept { return HPX::impl_max_hardware_threads(); }
374  int acquire() const noexcept { return HPX::impl_hardware_thread_id(); }
375  void release(int) const noexcept {}
376 };
377 } // namespace Experimental
378 } // namespace Kokkos
379 
380 namespace Kokkos {
381 namespace Impl {
382 
383 struct HPXTeamMember {
384 public:
385  using execution_space = Kokkos::Experimental::HPX;
386  using scratch_memory_space =
388 
389 private:
390  scratch_memory_space m_team_shared;
391  std::size_t m_team_shared_size;
392 
393  int m_league_size;
394  int m_league_rank;
395  int m_team_size;
396  int m_team_rank;
397 
398 public:
399  KOKKOS_INLINE_FUNCTION
400  const scratch_memory_space &team_shmem() const {
401  return m_team_shared.set_team_thread_mode(0, 1, 0);
402  }
403 
404  KOKKOS_INLINE_FUNCTION
405  const execution_space::scratch_memory_space &team_scratch(const int) const {
406  return m_team_shared.set_team_thread_mode(0, 1, 0);
407  }
408 
409  KOKKOS_INLINE_FUNCTION
410  const execution_space::scratch_memory_space &thread_scratch(const int) const {
411  return m_team_shared.set_team_thread_mode(0, team_size(), team_rank());
412  }
413 
414  KOKKOS_INLINE_FUNCTION int league_rank() const noexcept {
415  return m_league_rank;
416  }
417 
418  KOKKOS_INLINE_FUNCTION int league_size() const noexcept {
419  return m_league_size;
420  }
421 
422  KOKKOS_INLINE_FUNCTION int team_rank() const noexcept { return m_team_rank; }
423  KOKKOS_INLINE_FUNCTION int team_size() const noexcept { return m_team_size; }
424 
425  template <class... Properties>
426  constexpr KOKKOS_INLINE_FUNCTION
427  HPXTeamMember(const TeamPolicyInternal<Kokkos::Experimental::HPX,
428  Properties...> &policy,
429  const int team_rank, const int league_rank, void *scratch,
430  int scratch_size) noexcept
431  : m_team_shared(scratch, scratch_size, scratch, scratch_size),
432  m_team_shared_size(scratch_size), m_league_size(policy.league_size()),
433  m_league_rank(league_rank), m_team_size(policy.team_size()),
434  m_team_rank(team_rank) {}
435 
436  KOKKOS_INLINE_FUNCTION
437  void team_barrier() const {}
438 
439  template <class ValueType>
440  KOKKOS_INLINE_FUNCTION void team_broadcast(ValueType &, const int &) const {
441  static_assert(std::is_trivially_default_constructible<ValueType>(),
442  "Only trivial constructible types can be broadcasted");
443  }
444 
445  template <class Closure, class ValueType>
446  KOKKOS_INLINE_FUNCTION void team_broadcast(const Closure &, ValueType &,
447  const int &) const {
448  static_assert(std::is_trivially_default_constructible<ValueType>(),
449  "Only trivial constructible types can be broadcasted");
450  }
451 
452  template <class ValueType, class JoinOp>
453  KOKKOS_INLINE_FUNCTION ValueType team_reduce(const ValueType &value,
454  const JoinOp &) const {
455  return value;
456  }
457 
458  template <class ReducerType>
459  KOKKOS_INLINE_FUNCTION
460  typename std::enable_if<is_reducer<ReducerType>::value>::type
461  team_reduce(const ReducerType &reducer) const {}
462 
463  template <typename Type>
464  KOKKOS_INLINE_FUNCTION Type
465  team_scan(const Type &value, Type *const global_accum = nullptr) const {
466  if (global_accum) {
467  Kokkos::atomic_fetch_add(global_accum, value);
468  }
469 
470  return 0;
471  }
472 };
473 
474 template <class... Properties>
475 class TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
476  : public PolicyTraits<Properties...> {
477  using traits = PolicyTraits<Properties...>;
478 
479  int m_league_size;
480  int m_team_size;
481  std::size_t m_team_scratch_size[2];
482  std::size_t m_thread_scratch_size[2];
483  int m_chunk_size;
484 
485 public:
486  using member_type = HPXTeamMember;
487 
488  // NOTE: Max size is 1 for simplicity. In most cases more than 1 is not
489  // necessary on CPU. Implement later if there is a need.
490  template <class FunctorType>
491  inline static int team_size_max(const FunctorType &) {
492  return 1;
493  }
494 
495  template <class FunctorType>
496  inline static int team_size_recommended(const FunctorType &) {
497  return 1;
498  }
499 
500  template <class FunctorType>
501  inline static int team_size_recommended(const FunctorType &, const int &) {
502  return 1;
503  }
504 
505  template <class FunctorType>
506  int team_size_max(const FunctorType &, const ParallelForTag &) const {
507  return 1;
508  }
509 
510  template <class FunctorType>
511  int team_size_max(const FunctorType &, const ParallelReduceTag &) const {
512  return 1;
513  }
514  template <class FunctorType>
515  int team_size_recommended(const FunctorType &, const ParallelForTag &) const {
516  return 1;
517  }
518  template <class FunctorType>
519  int team_size_recommended(const FunctorType &,
520  const ParallelReduceTag &) const {
521  return 1;
522  }
523 
524 private:
525  inline void init(const int league_size_request, const int team_size_request) {
526  m_league_size = league_size_request;
527  const int max_team_size = 1; // TODO: Can't use team_size_max(...) because
528  // it requires a functor as argument.
529  m_team_size =
530  team_size_request > max_team_size ? max_team_size : team_size_request;
531 
532  if (m_chunk_size > 0) {
533  if (!Impl::is_integral_power_of_two(m_chunk_size))
534  Kokkos::abort("TeamPolicy blocking granularity must be power of two");
535  } else {
536  int new_chunk_size = 1;
537  while (new_chunk_size * 4 * Kokkos::Experimental::HPX::concurrency() <
538  m_league_size) {
539  new_chunk_size *= 2;
540  }
541 
542  if (new_chunk_size < 128) {
543  new_chunk_size = 1;
544  while ((new_chunk_size * Kokkos::Experimental::HPX::concurrency() <
545  m_league_size) &&
546  (new_chunk_size < 128))
547  new_chunk_size *= 2;
548  }
549 
550  m_chunk_size = new_chunk_size;
551  }
552  }
553 
554 public:
555  inline int team_size() const { return m_team_size; }
556  inline int league_size() const { return m_league_size; }
557 
558  inline size_t scratch_size(const int &level, int team_size_ = -1) const {
559  if (team_size_ < 0) {
560  team_size_ = m_team_size;
561  }
562  return m_team_scratch_size[level] +
563  team_size_ * m_thread_scratch_size[level];
564  }
565 
566 public:
567  template <class ExecSpace, class... OtherProperties>
568  friend class TeamPolicyInternal;
569 
570  template <class... OtherProperties>
571  TeamPolicyInternal(
572  const TeamPolicyInternal<Kokkos::Experimental::HPX, OtherProperties...> &p) {
573  m_league_size = p.m_league_size;
574  m_team_size = p.m_team_size;
575  m_team_scratch_size[0] = p.m_team_scratch_size[0];
576  m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
577  m_team_scratch_size[1] = p.m_team_scratch_size[1];
578  m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
579  m_chunk_size = p.m_chunk_size;
580  }
581 
582  TeamPolicyInternal(const typename traits::execution_space &,
583  int league_size_request, int team_size_request,
584  int /* vector_length_request */ = 1)
585  : m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0},
586  m_chunk_size(0) {
587  init(league_size_request, team_size_request);
588  }
589 
590  TeamPolicyInternal(const typename traits::execution_space &,
591  int league_size_request,
592  const Kokkos::AUTO_t &team_size_request,
593  int /* vector_length_request */ = 1)
594  : m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0},
595  m_chunk_size(0) {
596  init(league_size_request, 1);
597  }
598 
599  TeamPolicyInternal(int league_size_request, int team_size_request,
600  int /* vector_length_request */ = 1)
601  : m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0},
602  m_chunk_size(0) {
603  init(league_size_request, team_size_request);
604  }
605 
606  TeamPolicyInternal(int league_size_request,
607  const Kokkos::AUTO_t &team_size_request,
608  int /* vector_length_request */ = 1)
609  : m_team_scratch_size{0, 0}, m_thread_scratch_size{0, 0},
610  m_chunk_size(0) {
611  init(league_size_request, 1);
612  }
613 
614  inline int chunk_size() const { return m_chunk_size; }
615 
616  inline TeamPolicyInternal &
617  set_chunk_size(typename traits::index_type chunk_size_) {
618  m_chunk_size = chunk_size_;
619  return *this;
620  }
621 
622  inline TeamPolicyInternal &set_scratch_size(const int &level,
623  const PerTeamValue &per_team) {
624  m_team_scratch_size[level] = per_team.value;
625  return *this;
626  }
627 
628  inline TeamPolicyInternal &
629  set_scratch_size(const int &level, const PerThreadValue &per_thread) {
630  m_thread_scratch_size[level] = per_thread.value;
631  return *this;
632  }
633 
634  inline TeamPolicyInternal &
635  set_scratch_size(const int &level, const PerTeamValue &per_team,
636  const PerThreadValue &per_thread) {
637  m_team_scratch_size[level] = per_team.value;
638  m_thread_scratch_size[level] = per_thread.value;
639  return *this;
640  }
641 };
642 } // namespace Impl
643 } // namespace Kokkos
644 
645 namespace Kokkos {
646 namespace Impl {
647 
648 template <class FunctorType, class... Traits>
649 class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
650  Kokkos::Experimental::HPX> {
651 private:
652  using Policy = Kokkos::RangePolicy<Traits...>;
653  using WorkTag = typename Policy::work_tag;
654  using WorkRange = typename Policy::WorkRange;
655  using Member = typename Policy::member_type;
656 
657  const FunctorType m_functor;
658  const Policy m_policy;
659 
660  template <class TagType>
661  static typename std::enable_if<std::is_same<TagType, void>::value>::type
662  execute_functor(const FunctorType &functor, const Member i) {
663  functor(i);
664  }
665 
666  template <class TagType>
667  static typename std::enable_if<!std::is_same<TagType, void>::value>::type
668  execute_functor(const FunctorType &functor, const Member i) {
669  const TagType t{};
670  functor(t, i);
671  }
672 
673  template <class TagType>
674  static typename std::enable_if<std::is_same<TagType, void>::value>::type
675  execute_functor_range(const FunctorType &functor, const Member i_begin,
676  const Member i_end) {
677  for (Member i = i_begin; i < i_end; ++i) {
678  functor(i);
679  }
680  }
681 
682  template <class TagType>
683  static typename std::enable_if<!std::is_same<TagType, void>::value>::type
684  execute_functor_range(const FunctorType &functor, const Member i_begin,
685  const Member i_end) {
686  const TagType t{};
687  for (Member i = i_begin; i < i_end; ++i) {
688  functor(t, i);
689  }
690  }
691 
692 public:
693  void execute() const { Kokkos::Impl::dispatch_execute_task(this); }
694 
695  void execute_task() const {
696 #if KOKKOS_HPX_IMPLEMENTATION == 0
697  using hpx::parallel::for_loop;
698  using hpx::parallel::execution::par;
699  using hpx::parallel::execution::static_chunk_size;
700 
701  for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
702  m_policy.begin(), m_policy.end(), [this](const Member i) {
703  execute_functor<WorkTag>(m_functor, i);
704  });
705 
706 #elif KOKKOS_HPX_IMPLEMENTATION == 1
707  using hpx::apply;
708  using hpx::lcos::local::counting_semaphore;
709 
710  counting_semaphore sem(0);
711  std::size_t num_tasks = 0;
712 
713  for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
714  i_begin += m_policy.chunk_size()) {
715  apply([this, &sem, i_begin]() {
716  const Member i_end =
717  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
718  execute_functor_range<WorkTag>(m_functor, i_begin, i_end);
719 
720  sem.signal(1);
721  });
722 
723  ++num_tasks;
724  }
725 
726  sem.wait(num_tasks);
727 #endif
728  }
729 
730  inline ParallelFor(const FunctorType &arg_functor, Policy arg_policy)
731  : m_functor(arg_functor), m_policy(arg_policy) {}
732 };
733 
734 template <class FunctorType, class... Traits>
735 class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
736  Kokkos::Experimental::HPX> {
737 private:
738  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
739  using Policy = typename MDRangePolicy::impl_range_policy;
740  using WorkTag = typename MDRangePolicy::work_tag;
741  using WorkRange = typename Policy::WorkRange;
742  using Member = typename Policy::member_type;
743  using iterate_type =
744  typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
745  WorkTag, void>;
746 
747  const FunctorType m_functor;
748  const MDRangePolicy m_mdr_policy;
749  const Policy m_policy;
750 
751 public:
752  void execute() const { dispatch_execute_task(this); }
753 
754  inline void execute_task() const {
755 #if KOKKOS_HPX_IMPLEMENTATION == 0
756  using hpx::parallel::for_loop;
757  using hpx::parallel::execution::par;
758  using hpx::parallel::execution::static_chunk_size;
759 
760  for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
761  m_policy.begin(), m_policy.end(), [this](const Member i) {
762  iterate_type(m_mdr_policy, m_functor)(i);
763  });
764 
765 #elif KOKKOS_HPX_IMPLEMENTATION == 1
766  using hpx::apply;
767  using hpx::lcos::local::counting_semaphore;
768 
769  counting_semaphore sem(0);
770  std::size_t num_tasks = 0;
771 
772  for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
773  i_begin += m_policy.chunk_size()) {
774  apply([this, &sem, i_begin]() {
775  const Member i_end =
776  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
777  for (Member i = i_begin; i < i_end; ++i) {
778  iterate_type(m_mdr_policy, m_functor)(i);
779  }
780 
781  sem.signal(1);
782  });
783 
784  ++num_tasks;
785  }
786 
787  sem.wait(num_tasks);
788 #endif
789  }
790 
791  inline ParallelFor(const FunctorType &arg_functor, MDRangePolicy arg_policy)
792  : m_functor(arg_functor), m_mdr_policy(arg_policy),
793  m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
794 };
795 } // namespace Impl
796 } // namespace Kokkos
797 
798 namespace Kokkos {
799 namespace Impl {
800 template <class FunctorType, class ReducerType, class... Traits>
801 class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
802  Kokkos::Experimental::HPX> {
803 private:
804  using Policy = Kokkos::RangePolicy<Traits...>;
805  using WorkTag = typename Policy::work_tag;
806  using WorkRange = typename Policy::WorkRange;
807  using Member = typename Policy::member_type;
808  using Analysis =
809  FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
810  using ReducerConditional =
811  Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
812  FunctorType, ReducerType>;
813  using ReducerTypeFwd = typename ReducerConditional::type;
814  using WorkTagFwd =
815  typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
816  WorkTag, void>::type;
817  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
818  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
819  using ValueOps = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
820  using value_type = typename Analysis::value_type;
821  using pointer_type = typename Analysis::pointer_type;
822  using reference_type = typename Analysis::reference_type;
823 
824  const FunctorType m_functor;
825  const Policy m_policy;
826  const ReducerType m_reducer;
827  const pointer_type m_result_ptr;
828 
829  bool m_force_synchronous;
830 
831  template <class TagType>
832  inline static
833  typename std::enable_if<std::is_same<TagType, void>::value>::type
834  execute_functor(const FunctorType &functor, const Member i,
835  reference_type update) {
836  functor(i, update);
837  }
838 
839  template <class TagType>
840  inline static
841  typename std::enable_if<!std::is_same<TagType, void>::value>::type
842  execute_functor(const FunctorType &functor, const Member i,
843  reference_type update) {
844  const TagType t{};
845  functor(t, i, update);
846  }
847 
848  template <class TagType>
849  inline typename std::enable_if<std::is_same<TagType, void>::value>::type
850  execute_functor_range(reference_type update, const Member i_begin,
851  const Member i_end) const {
852  for (Member i = i_begin; i < i_end; ++i) {
853  m_functor(i, update);
854  }
855  }
856 
857  template <class TagType>
858  inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
859  execute_functor_range(reference_type update, const Member i_begin,
860  const Member i_end) const {
861  const TagType t{};
862 
863  for (Member i = i_begin; i < i_end; ++i) {
864  m_functor(t, i, update);
865  }
866  }
867 
868  class value_type_wrapper {
869  private:
870  std::size_t m_value_size;
871  char *m_value_buffer;
872 
873  public:
874  value_type_wrapper() : m_value_size(0), m_value_buffer(nullptr) {}
875 
876  value_type_wrapper(const std::size_t value_size)
877  : m_value_size(value_size), m_value_buffer(new char[m_value_size]) {}
878 
879  value_type_wrapper(const value_type_wrapper &other)
880  : m_value_size(0), m_value_buffer(nullptr) {
881  if (this != &other) {
882  m_value_buffer = new char[other.m_value_size];
883  m_value_size = other.m_value_size;
884 
885  std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
886  m_value_buffer);
887  }
888  }
889 
890  ~value_type_wrapper() { delete[] m_value_buffer; }
891 
892  value_type_wrapper(value_type_wrapper &&other)
893  : m_value_size(0), m_value_buffer(nullptr) {
894  if (this != &other) {
895  m_value_buffer = other.m_value_buffer;
896  m_value_size = other.m_value_size;
897 
898  other.m_value_buffer = nullptr;
899  other.m_value_size = 0;
900  }
901  }
902 
903  value_type_wrapper &operator=(const value_type_wrapper &other) {
904  if (this != &other) {
905  delete[] m_value_buffer;
906  m_value_buffer = new char[other.m_value_size];
907  m_value_size = other.m_value_size;
908 
909  std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
910  m_value_buffer);
911  }
912 
913  return *this;
914  }
915 
916  value_type_wrapper &operator=(value_type_wrapper &&other) {
917  if (this != &other) {
918  delete[] m_value_buffer;
919  m_value_buffer = other.m_value_buffer;
920  m_value_size = other.m_value_size;
921 
922  other.m_value_buffer = nullptr;
923  other.m_value_size = 0;
924  }
925 
926  return *this;
927  }
928 
929  pointer_type pointer() const {
930  return reinterpret_cast<pointer_type>(m_value_buffer);
931  }
932 
933  reference_type reference() const {
934  return ValueOps::reference(
935  reinterpret_cast<pointer_type>(m_value_buffer));
936  }
937  };
938 
939 public:
940  void execute() const {
941  dispatch_execute_task(this);
942  }
943 
944  inline void execute_task() const {
945  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
946 
947  std::size_t value_size =
948  Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
949 
950  using hpx::parallel::for_loop;
951  using hpx::parallel::execution::par;
952 
953 #if KOKKOS_HPX_IMPLEMENTATION == 0
954  // NOTE: This version makes the most use of HPX functionality, but
955  // requires the struct value_type_wrapper to handle different
956  // reference_types. It is also significantly slower than the version
957  // below due to not reusing the buffer used by other functions.
958  using hpx::parallel::reduction;
959  using hpx::parallel::execution::static_chunk_size;
960 
961  value_type_wrapper final_value(value_size);
962  value_type_wrapper identity(value_size);
963 
964  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
965  final_value.pointer());
966  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
967  identity.pointer());
968 
969  for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
970  m_policy.begin(), m_policy.end(),
971  reduction(final_value, identity,
972  [this](value_type_wrapper &a,
973  value_type_wrapper &b) -> value_type_wrapper & {
974  ValueJoin::join(
975  ReducerConditional::select(m_functor, m_reducer),
976  a.pointer(), b.pointer());
977  return a;
978  }),
979  [this](Member i, value_type_wrapper &update) {
980  execute_functor<WorkTag>(m_functor, i, update.reference());
981  });
982 
983  pointer_type final_value_ptr = final_value.pointer();
984 
985 #elif KOKKOS_HPX_IMPLEMENTATION == 1
986  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
987  buffer.resize(num_worker_threads, value_size);
988 
989  for_loop(par, 0, num_worker_threads, [this, &buffer](std::size_t t) {
990  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
991  reinterpret_cast<pointer_type>(buffer.get(t)));
992  });
993 
994  using hpx::apply;
995  using hpx::lcos::local::counting_semaphore;
996 
997  counting_semaphore sem(0);
998  std::size_t num_tasks = 0;
999 
1000  for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
1001  i_begin += m_policy.chunk_size()) {
1002  apply([this, &buffer, &sem, i_begin]() {
1003  reference_type update =
1004  ValueOps::reference(reinterpret_cast<pointer_type>(
1005  buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1006  const Member i_end =
1007  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
1008  execute_functor_range<WorkTag>(update, i_begin, i_end);
1009 
1010  sem.signal(1);
1011  });
1012 
1013  ++num_tasks;
1014  }
1015 
1016  sem.wait(num_tasks);
1017 
1018  for (int i = 1; i < num_worker_threads; ++i) {
1019  ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
1020  reinterpret_cast<pointer_type>(buffer.get(0)),
1021  reinterpret_cast<pointer_type>(buffer.get(i)));
1022  }
1023 
1024  pointer_type final_value_ptr =
1025  reinterpret_cast<pointer_type>(buffer.get(0));
1026 #endif
1027 
1028  Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1029  ReducerConditional::select(m_functor, m_reducer), final_value_ptr);
1030 
1031  if (m_result_ptr != nullptr) {
1032  const int n = Analysis::value_count(
1033  ReducerConditional::select(m_functor, m_reducer));
1034 
1035  for (int j = 0; j < n; ++j) {
1036  m_result_ptr[j] = final_value_ptr[j];
1037  }
1038  }
1039  }
1040 
1041  template <class ViewType>
1042  inline ParallelReduce(
1043  const FunctorType &arg_functor, Policy arg_policy,
1044  const ViewType &arg_view,
1045  typename std::enable_if<Kokkos::is_view<ViewType>::value &&
1046  !Kokkos::is_reducer_type<ReducerType>::value,
1047  void *>::type = NULL)
1048  : m_functor(arg_functor), m_policy(arg_policy), m_reducer(InvalidType()),
1049  m_result_ptr(arg_view.data()),
1050  m_force_synchronous(!arg_view.impl_track().has_record()) {}
1051 
1052  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
1053  const ReducerType &reducer)
1054  : m_functor(arg_functor), m_policy(arg_policy), m_reducer(reducer),
1055  m_result_ptr(reducer.view().data()),
1056  m_force_synchronous(!reducer.view().impl_track().has_record()) {}
1057 };
1058 
1059 template <class FunctorType, class ReducerType, class... Traits>
1060 class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
1061  Kokkos::Experimental::HPX> {
1062 private:
1063  using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
1064  using Policy = typename MDRangePolicy::impl_range_policy;
1065  using WorkTag = typename MDRangePolicy::work_tag;
1066  using WorkRange = typename Policy::WorkRange;
1067  using Member = typename Policy::member_type;
1068  using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
1069  MDRangePolicy, FunctorType>;
1070  using ReducerConditional =
1071  Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1072  FunctorType, ReducerType>;
1073  using ReducerTypeFwd = typename ReducerConditional::type;
1074  using WorkTagFwd =
1075  typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1076  WorkTag, void>::type;
1077  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
1078  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
1079  using ValueOps = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
1080  using pointer_type = typename Analysis::pointer_type;
1081  using value_type = typename Analysis::value_type;
1082  using reference_type = typename Analysis::reference_type;
1083  using iterate_type =
1084  typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
1085  WorkTag, reference_type>;
1086 
1087  const FunctorType m_functor;
1088  const MDRangePolicy m_mdr_policy;
1089  const Policy m_policy;
1090  const ReducerType m_reducer;
1091  const pointer_type m_result_ptr;
1092 
1093  bool m_force_synchronous;
1094 
1095 public:
1096  void execute() const {
1097  dispatch_execute_task(this);
1098  }
1099 
1100  inline void execute_task() const {
1101  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1102  const std::size_t value_size =
1103  Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
1104 
1105  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1106  buffer.resize(num_worker_threads, value_size);
1107 
1108  using hpx::parallel::for_loop;
1109  using hpx::parallel::execution::par;
1110 
1111  for_loop(par, 0, num_worker_threads, [this, &buffer](std::size_t t) {
1112  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1113  reinterpret_cast<pointer_type>(buffer.get(t)));
1114  });
1115 
1116 #if KOKKOS_HPX_IMPLEMENTATION == 0
1117  using hpx::parallel::execution::static_chunk_size;
1118 
1119  for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
1120  m_policy.begin(), m_policy.end(), [this, &buffer](const Member i) {
1121  reference_type update = ValueOps::reference(
1122  reinterpret_cast<pointer_type>(buffer.get(
1123  Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1124  iterate_type(m_mdr_policy, m_functor, update)(i);
1125  });
1126 
1127 #elif KOKKOS_HPX_IMPLEMENTATION == 1
1128  using hpx::apply;
1129  using hpx::lcos::local::counting_semaphore;
1130 
1131  counting_semaphore sem(0);
1132  std::size_t num_tasks = 0;
1133 
1134  for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
1135  i_begin += m_policy.chunk_size()) {
1136  apply([this, &buffer, &sem, i_begin]() {
1137  reference_type update =
1138  ValueOps::reference(reinterpret_cast<pointer_type>(
1139  buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1140  const Member i_end =
1141  (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
1142 
1143  for (Member i = i_begin; i < i_end; ++i) {
1144  iterate_type(m_mdr_policy, m_functor, update)(i);
1145  }
1146 
1147  sem.signal(1);
1148  });
1149 
1150  ++num_tasks;
1151  }
1152 
1153  sem.wait(num_tasks);
1154 #endif
1155 
1156  for (int i = 1; i < num_worker_threads; ++i) {
1157  ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
1158  reinterpret_cast<pointer_type>(buffer.get(0)),
1159  reinterpret_cast<pointer_type>(buffer.get(i)));
1160  }
1161 
1162  Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1163  ReducerConditional::select(m_functor, m_reducer),
1164  reinterpret_cast<pointer_type>(buffer.get(0)));
1165 
1166  if (m_result_ptr != nullptr) {
1167  const int n = Analysis::value_count(
1168  ReducerConditional::select(m_functor, m_reducer));
1169 
1170  for (int j = 0; j < n; ++j) {
1171  m_result_ptr[j] = reinterpret_cast<pointer_type>(buffer.get(0))[j];
1172  }
1173  }
1174  }
1175 
1176  template <class ViewType>
1177  inline ParallelReduce(
1178  const FunctorType &arg_functor, MDRangePolicy arg_policy,
1179  const ViewType &arg_view,
1180  typename std::enable_if<Kokkos::is_view<ViewType>::value &&
1181  !Kokkos::is_reducer_type<ReducerType>::value,
1182  void *>::type = NULL)
1183  : m_functor(arg_functor), m_mdr_policy(arg_policy),
1184  m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
1185  m_reducer(InvalidType()), m_result_ptr(arg_view.data()),
1186  m_force_synchronous(!arg_view.impl_track().has_record()) {}
1187 
1188  inline ParallelReduce(const FunctorType &arg_functor,
1189  MDRangePolicy arg_policy, const ReducerType &reducer)
1190  : m_functor(arg_functor), m_mdr_policy(arg_policy),
1191  m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
1192  m_reducer(reducer), m_result_ptr(reducer.view().data()),
1193  m_force_synchronous(!reducer.view().impl_track().has_record()) {}
1194 };
1195 } // namespace Impl
1196 } // namespace Kokkos
1197 
1198 namespace Kokkos {
1199 namespace Impl {
1200 
1201 template <class FunctorType, class... Traits>
1202 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
1203  Kokkos::Experimental::HPX> {
1204 private:
1205  using Policy = Kokkos::RangePolicy<Traits...>;
1206  using WorkTag = typename Policy::work_tag;
1207  using WorkRange = typename Policy::WorkRange;
1208  using Member = typename Policy::member_type;
1209  using Analysis =
1210  FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
1211  using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
1212  using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
1213  using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
1214  using pointer_type = typename Analysis::pointer_type;
1215  using reference_type = typename Analysis::reference_type;
1216  using value_type = typename Analysis::value_type;
1217 
1218  const FunctorType m_functor;
1219  const Policy m_policy;
1220 
1221  template <class TagType>
1222  inline static
1223  typename std::enable_if<std::is_same<TagType, void>::value>::type
1224  execute_functor_range(const FunctorType &functor, const Member i_begin,
1225  const Member i_end, reference_type update,
1226  const bool final) {
1227  for (Member i = i_begin; i < i_end; ++i) {
1228  functor(i, update, final);
1229  }
1230  }
1231 
1232  template <class TagType>
1233  inline static
1234  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1235  execute_functor_range(const FunctorType &functor, const Member i_begin,
1236  const Member i_end, reference_type update,
1237  const bool final) {
1238  const TagType t{};
1239  for (Member i = i_begin; i < i_end; ++i) {
1240  functor(t, i, update, final);
1241  }
1242  }
1243 
1244 public:
1245  void execute() const { dispatch_execute_task(this); }
1246 
1247  inline void execute_task() const {
1248  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1249  const int value_count = Analysis::value_count(m_functor);
1250  const std::size_t value_size = Analysis::value_size(m_functor);
1251 
1252  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1253  buffer.resize(num_worker_threads, 2 * value_size);
1254 
1255  using hpx::lcos::local::barrier;
1256  using hpx::parallel::for_loop;
1257  using hpx::parallel::execution::par;
1258  using hpx::parallel::execution::static_chunk_size;
1259 
1260  barrier bar(num_worker_threads);
1261 
1262  for_loop(par.with(static_chunk_size(1)), 0, num_worker_threads,
1263  [this, &buffer, &bar, num_worker_threads, value_count,
1264  value_size](std::size_t const t) {
1265  reference_type update_sum = ValueInit::init(
1266  m_functor, reinterpret_cast<pointer_type>(buffer.get(t)));
1267 
1268  const WorkRange range(m_policy, t, num_worker_threads);
1269  execute_functor_range<WorkTag>(m_functor, range.begin(),
1270  range.end(), update_sum, false);
1271 
1272  bar.wait();
1273 
1274  if (t == 0) {
1275  ValueInit::init(m_functor, reinterpret_cast<pointer_type>(
1276  buffer.get(0) + value_size));
1277 
1278  for (int i = 1; i < num_worker_threads; ++i) {
1279  pointer_type ptr_1_prev =
1280  reinterpret_cast<pointer_type>(buffer.get(i - 1));
1281  pointer_type ptr_2_prev = reinterpret_cast<pointer_type>(
1282  buffer.get(i - 1) + value_size);
1283  pointer_type ptr_2 = reinterpret_cast<pointer_type>(
1284  buffer.get(i) + value_size);
1285 
1286  for (int j = 0; j < value_count; ++j) {
1287  ptr_2[j] = ptr_2_prev[j];
1288  }
1289 
1290  ValueJoin::join(m_functor, ptr_2, ptr_1_prev);
1291  }
1292  }
1293 
1294  bar.wait();
1295 
1296  reference_type update_base = ValueOps::reference(
1297  reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
1298 
1299  execute_functor_range<WorkTag>(m_functor, range.begin(),
1300  range.end(), update_base, true);
1301  });
1302  }
1303 
1304  inline ParallelScan(const FunctorType &arg_functor, const Policy &arg_policy)
1305  : m_functor(arg_functor), m_policy(arg_policy) {}
1306 };
1307 
1308 template <class FunctorType, class ReturnType, class... Traits>
1309 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
1310  ReturnType, Kokkos::Experimental::HPX> {
1311 private:
1312  using Policy = Kokkos::RangePolicy<Traits...>;
1313  using WorkTag = typename Policy::work_tag;
1314  using WorkRange = typename Policy::WorkRange;
1315  using Member = typename Policy::member_type;
1316  using Analysis =
1317  FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
1318  using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
1319  using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
1320  using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
1321  using pointer_type = typename Analysis::pointer_type;
1322  using reference_type = typename Analysis::reference_type;
1323  using value_type = typename Analysis::value_type;
1324 
1325  const FunctorType m_functor;
1326  const Policy m_policy;
1327  ReturnType &m_returnvalue;
1328 
1329  template <class TagType>
1330  inline static
1331  typename std::enable_if<std::is_same<TagType, void>::value>::type
1332  execute_functor_range(const FunctorType &functor, const Member i_begin,
1333  const Member i_end, reference_type update,
1334  const bool final) {
1335  for (Member i = i_begin; i < i_end; ++i) {
1336  functor(i, update, final);
1337  }
1338  }
1339 
1340  template <class TagType>
1341  inline static
1342  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1343  execute_functor_range(const FunctorType &functor, const Member i_begin,
1344  const Member i_end, reference_type update,
1345  const bool final) {
1346  const TagType t{};
1347  for (Member i = i_begin; i < i_end; ++i) {
1348  functor(t, i, update, final);
1349  }
1350  }
1351 
1352 public:
1353  void execute() const { dispatch_execute_task(this); }
1354 
1355  inline void execute_task() const {
1356  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1357  const int value_count = Analysis::value_count(m_functor);
1358  const std::size_t value_size = Analysis::value_size(m_functor);
1359 
1360  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1361  buffer.resize(num_worker_threads, 2 * value_size);
1362 
1363  using hpx::lcos::local::barrier;
1364  using hpx::parallel::for_loop;
1365  using hpx::parallel::execution::par;
1366  using hpx::parallel::execution::static_chunk_size;
1367 
1368  barrier bar(num_worker_threads);
1369 
1370  for_loop(par.with(static_chunk_size(1)), 0, num_worker_threads,
1371  [this, &buffer, &bar, num_worker_threads, value_count,
1372  value_size](std::size_t const t) {
1373  reference_type update_sum = ValueInit::init(
1374  m_functor, reinterpret_cast<pointer_type>(buffer.get(t)));
1375 
1376  const WorkRange range(m_policy, t, num_worker_threads);
1377  execute_functor_range<WorkTag>(m_functor, range.begin(),
1378  range.end(), update_sum, false);
1379 
1380  bar.wait();
1381 
1382  if (t == 0) {
1383  ValueInit::init(m_functor, reinterpret_cast<pointer_type>(
1384  buffer.get(0) + value_size));
1385 
1386  for (int i = 1; i < num_worker_threads; ++i) {
1387  pointer_type ptr_1_prev =
1388  reinterpret_cast<pointer_type>(buffer.get(i - 1));
1389  pointer_type ptr_2_prev = reinterpret_cast<pointer_type>(
1390  buffer.get(i - 1) + value_size);
1391  pointer_type ptr_2 = reinterpret_cast<pointer_type>(
1392  buffer.get(i) + value_size);
1393 
1394  for (int j = 0; j < value_count; ++j) {
1395  ptr_2[j] = ptr_2_prev[j];
1396  }
1397 
1398  ValueJoin::join(m_functor, ptr_2, ptr_1_prev);
1399  }
1400  }
1401 
1402  bar.wait();
1403 
1404  reference_type update_base = ValueOps::reference(
1405  reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
1406 
1407  execute_functor_range<WorkTag>(m_functor, range.begin(),
1408  range.end(), update_base, true);
1409 
1410  if (t == std::size_t(num_worker_threads - 1)) {
1411  m_returnvalue = update_base;
1412  }
1413  });
1414  }
1415 
1416  inline ParallelScanWithTotal(const FunctorType &arg_functor,
1417  const Policy &arg_policy,
1418  ReturnType &arg_returnvalue)
1419  : m_functor(arg_functor), m_policy(arg_policy),
1420  m_returnvalue(arg_returnvalue) {}
1421 };
1422 } // namespace Impl
1423 } // namespace Kokkos
1424 
1425 namespace Kokkos {
1426 namespace Impl {
1427 template <class FunctorType, class... Properties>
1428 class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
1429  Kokkos::Experimental::HPX> {
1430 private:
1431  using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
1432  using WorkTag = typename Policy::work_tag;
1433  using Member = typename Policy::member_type;
1434  using memory_space = Kokkos::HostSpace;
1435 
1436  const FunctorType m_functor;
1437  const Policy m_policy;
1438  const int m_league;
1439  const std::size_t m_shared;
1440 
1441  template <class TagType>
1442  inline static
1443  typename std::enable_if<std::is_same<TagType, void>::value>::type
1444  execute_functor(const FunctorType &functor, const Policy &policy,
1445  const int league_rank, char *local_buffer,
1446  const std::size_t local_buffer_size) {
1447  functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
1448  }
1449 
1450  template <class TagType>
1451  inline static
1452  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1453  execute_functor(const FunctorType &functor, const Policy &policy,
1454  const int league_rank, char *local_buffer,
1455  const std::size_t local_buffer_size) {
1456  const TagType t{};
1457  functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size));
1458  }
1459 
1460  template <class TagType>
1461  inline static
1462  typename std::enable_if<std::is_same<TagType, void>::value>::type
1463  execute_functor_range(const FunctorType &functor, const Policy &policy,
1464  const int league_rank_begin,
1465  const int league_rank_end, char *local_buffer,
1466  const std::size_t local_buffer_size) {
1467  for (int league_rank = league_rank_begin; league_rank < league_rank_end;
1468  ++league_rank) {
1469  functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
1470  }
1471  }
1472 
1473  template <class TagType>
1474  inline static
1475  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1476  execute_functor_range(const FunctorType &functor, const Policy &policy,
1477  const int league_rank_begin,
1478  const int league_rank_end, char *local_buffer,
1479  const std::size_t local_buffer_size) {
1480  const TagType t{};
1481  for (int league_rank = league_rank_begin; league_rank < league_rank_end;
1482  ++league_rank) {
1483  functor(t,
1484  Member(policy, 0, league_rank, local_buffer, local_buffer_size));
1485  }
1486  }
1487 
1488 public:
1489  void execute() const { dispatch_execute_task(this); }
1490 
1491  inline void execute_task() const {
1492  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1493 
1494  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1495  buffer.resize(num_worker_threads, m_shared);
1496 
1497 #if KOKKOS_HPX_IMPLEMENTATION == 0
1498  using hpx::parallel::for_loop;
1499  using hpx::parallel::execution::par;
1500  using hpx::parallel::execution::static_chunk_size;
1501 
1502  for_loop(par.with(static_chunk_size(m_policy.chunk_size())), 0,
1503  m_policy.league_size(), [this, &buffer](const int league_rank) {
1504  execute_functor<WorkTag>(
1505  m_functor, m_policy, league_rank,
1506  buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()),
1507  m_shared);
1508  });
1509 
1510 #elif KOKKOS_HPX_IMPLEMENTATION == 1
1511  using hpx::apply;
1512  using hpx::lcos::local::counting_semaphore;
1513 
1514  counting_semaphore sem(0);
1515  std::size_t num_tasks = 0;
1516 
1517  for (int league_rank_begin = 0; league_rank_begin < m_policy.league_size();
1518  league_rank_begin += m_policy.chunk_size()) {
1519  apply([this, &buffer, &sem, league_rank_begin]() {
1520  const int league_rank_end = (std::min)(
1521  league_rank_begin + m_policy.chunk_size(), m_policy.league_size());
1522  execute_functor_range<WorkTag>(
1523  m_functor, m_policy, league_rank_begin, league_rank_end,
1524  buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()), m_shared);
1525 
1526  sem.signal(1);
1527  });
1528 
1529  ++num_tasks;
1530  }
1531 
1532  sem.wait(num_tasks);
1533 #endif
1534  }
1535 
1536  ParallelFor(const FunctorType &arg_functor, const Policy &arg_policy)
1537  : m_functor(arg_functor), m_policy(arg_policy),
1538  m_league(arg_policy.league_size()),
1539  m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
1540  FunctorTeamShmemSize<FunctorType>::value(
1541  arg_functor, arg_policy.team_size())) {}
1542 };
1543 
1544 template <class FunctorType, class ReducerType, class... Properties>
1545 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
1546  ReducerType, Kokkos::Experimental::HPX> {
1547 private:
1548  using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
1549  using Analysis =
1550  FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
1551  using Member = typename Policy::member_type;
1552  using WorkTag = typename Policy::work_tag;
1553  using ReducerConditional =
1554  Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1555  FunctorType, ReducerType>;
1556  using ReducerTypeFwd = typename ReducerConditional::type;
1557  using WorkTagFwd =
1558  typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1559  WorkTag, void>::type;
1560  using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
1561  using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
1562  using ValueOps = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
1563  using pointer_type = typename Analysis::pointer_type;
1564  using reference_type = typename Analysis::reference_type;
1565  using value_type = typename Analysis::value_type;
1566 
1567  const FunctorType m_functor;
1568  const int m_league;
1569  const Policy m_policy;
1570  const ReducerType m_reducer;
1571  pointer_type m_result_ptr;
1572  const std::size_t m_shared;
1573 
1574  bool m_force_synchronous;
1575 
1576  template <class TagType>
1577  inline static
1578  typename std::enable_if<std::is_same<TagType, void>::value>::type
1579  execute_functor(const FunctorType &functor, const Policy &policy,
1580  const int league_rank, char *local_buffer,
1581  const std::size_t local_buffer_size,
1582  reference_type update) {
1583  functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
1584  update);
1585  }
1586 
1587  template <class TagType>
1588  inline static
1589  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1590  execute_functor(const FunctorType &functor, const Policy &policy,
1591  const int league_rank, char *local_buffer,
1592  const std::size_t local_buffer_size,
1593  reference_type update) {
1594  const TagType t{};
1595  functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size),
1596  update);
1597  }
1598 
1599  template <class TagType>
1600  inline static
1601  typename std::enable_if<std::is_same<TagType, void>::value>::type
1602  execute_functor_range(const FunctorType &functor, const Policy &policy,
1603  const int league_rank_begin,
1604  const int league_rank_end, char *local_buffer,
1605  const std::size_t local_buffer_size,
1606  reference_type update) {
1607  for (int league_rank = league_rank_begin; league_rank < league_rank_end;
1608  ++league_rank) {
1609  functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
1610  update);
1611  }
1612  }
1613 
1614  template <class TagType>
1615  inline static
1616  typename std::enable_if<!std::is_same<TagType, void>::value>::type
1617  execute_functor_range(const FunctorType &functor, const Policy &policy,
1618  const int league_rank_begin,
1619  const int league_rank_end, char *local_buffer,
1620  const std::size_t local_buffer_size,
1621  reference_type update) {
1622  const TagType t{};
1623  for (int league_rank = league_rank_begin; league_rank < league_rank_end;
1624  ++league_rank) {
1625  functor(t,
1626  Member(policy, 0, league_rank, local_buffer, local_buffer_size),
1627  update);
1628  }
1629  }
1630 
1631 public:
1632  void execute() const {
1633  dispatch_execute_task(this);
1634  }
1635 
1636  inline void execute_task() const {
1637  const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1638  const std::size_t value_size =
1639  Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
1640 
1641  thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1642  buffer.resize(num_worker_threads, value_size + m_shared);
1643 
1644  using hpx::parallel::for_loop;
1645  using hpx::parallel::execution::par;
1646 
1647  for_loop(par, 0, num_worker_threads, [this, &buffer](std::size_t t) {
1648  ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1649  reinterpret_cast<pointer_type>(buffer.get(t)));
1650  });
1651 
1652 #if KOKKOS_HPX_IMPLEMENTATION == 0
1653  using hpx::parallel::execution::static_chunk_size;
1654 
1655  hpx::parallel::for_loop(
1656  par.with(static_chunk_size(m_policy.chunk_size())), 0,
1657  m_policy.league_size(),
1658  [this, &buffer, value_size](const int league_rank) {
1659  std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
1660  reference_type update = ValueOps::reference(
1661  reinterpret_cast<pointer_type>(buffer.get(t)));
1662 
1663  execute_functor<WorkTag>(m_functor, m_policy, league_rank,
1664  buffer.get(t) + value_size, m_shared,
1665  update);
1666  });
1667 
1668 #elif KOKKOS_HPX_IMPLEMENTATION == 1
1669  using hpx::apply;
1670  using hpx::lcos::local::counting_semaphore;
1671 
1672  counting_semaphore sem(0);
1673  std::size_t num_tasks = 0;
1674 
1675  for (int league_rank_begin = 0; league_rank_begin < m_policy.league_size();
1676  league_rank_begin += m_policy.chunk_size()) {
1677  apply([this, &buffer, &sem, league_rank_begin, value_size]() {
1678  std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
1679  reference_type update =
1680  ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(t)));
1681  const int league_rank_end = (std::min)(
1682  league_rank_begin + m_policy.chunk_size(), m_policy.league_size());
1683  execute_functor_range<WorkTag>(
1684  m_functor, m_policy, league_rank_begin, league_rank_end,
1685  buffer.get(t) + value_size, m_shared, update);
1686 
1687  sem.signal(1);
1688  });
1689 
1690  ++num_tasks;
1691  }
1692 
1693  sem.wait(num_tasks);
1694 #endif
1695 
1696  const pointer_type ptr = reinterpret_cast<pointer_type>(buffer.get(0));
1697  for (int t = 1; t < num_worker_threads; ++t) {
1698  ValueJoin::join(ReducerConditional::select(m_functor, m_reducer), ptr,
1699  reinterpret_cast<pointer_type>(buffer.get(t)));
1700  }
1701 
1702  Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1703  ReducerConditional::select(m_functor, m_reducer), ptr);
1704 
1705  if (m_result_ptr) {
1706  const int n = Analysis::value_count(
1707  ReducerConditional::select(m_functor, m_reducer));
1708 
1709  for (int j = 0; j < n; ++j) {
1710  m_result_ptr[j] = ptr[j];
1711  }
1712  }
1713  }
1714 
1715  template <class ViewType>
1716  ParallelReduce(
1717  const FunctorType &arg_functor, const Policy &arg_policy,
1718  const ViewType &arg_result,
1719  typename std::enable_if<Kokkos::is_view<ViewType>::value &&
1720  !Kokkos::is_reducer_type<ReducerType>::value,
1721  void *>::type = NULL)
1722  : m_functor(arg_functor), m_league(arg_policy.league_size()),
1723  m_policy(arg_policy), m_reducer(InvalidType()),
1724  m_result_ptr(arg_result.data()),
1725  m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
1726  FunctorTeamShmemSize<FunctorType>::value(
1727  m_functor, arg_policy.team_size())),
1728  m_force_synchronous(!arg_result.impl_track().has_record()) {}
1729 
1730  inline ParallelReduce(const FunctorType &arg_functor, Policy arg_policy,
1731  const ReducerType &reducer)
1732  : m_functor(arg_functor), m_league(arg_policy.league_size()),
1733  m_policy(arg_policy), m_reducer(reducer),
1734  m_result_ptr(reducer.view().data()),
1735  m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
1736  FunctorTeamShmemSize<FunctorType>::value(
1737  arg_functor, arg_policy.team_size())),
1738  m_force_synchronous(!reducer.view().impl_track().has_record()) {}
1739 };
1740 } // namespace Impl
1741 } // namespace Kokkos
1742 
1743 namespace Kokkos {
1744 
1745 template <typename iType>
1746 KOKKOS_INLINE_FUNCTION
1747  Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
1748  TeamThreadRange(const Impl::HPXTeamMember &thread, const iType &count) {
1749  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
1750  thread, count);
1751 }
1752 
1753 template <typename iType1, typename iType2>
1754 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
1755  typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember>
1756 TeamThreadRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
1757  const iType2 &i_end) {
1758  using iType = typename std::common_type<iType1, iType2>::type;
1759  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
1760  thread, iType(i_begin), iType(i_end));
1761 }
1762 
1763 template <typename iType>
1764 KOKKOS_INLINE_FUNCTION
1765  Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
1766  TeamVectorRange(const Impl::HPXTeamMember &thread, const iType &count) {
1767  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
1768  thread, count);
1769 }
1770 
1771 template <typename iType1, typename iType2>
1772 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
1773  typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember>
1774 TeamVectorRange(const Impl::HPXTeamMember &thread, const iType1 &i_begin,
1775  const iType2 &i_end) {
1776  using iType = typename std::common_type<iType1, iType2>::type;
1777  return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
1778  thread, iType(i_begin), iType(i_end));
1779 }
1780 
1781 template <typename iType>
1782 KOKKOS_INLINE_FUNCTION
1783  Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
1784  ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType &count) {
1785  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
1786  thread, count);
1787 }
1788 
1789 template <typename iType>
1790 KOKKOS_INLINE_FUNCTION
1791  Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
1792  ThreadVectorRange(const Impl::HPXTeamMember &thread, const iType &i_begin,
1793  const iType &i_end) {
1794  return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
1795  thread, i_begin, i_end);
1796 }
1797 
1798 KOKKOS_INLINE_FUNCTION
1799 Impl::ThreadSingleStruct<Impl::HPXTeamMember>
1800 PerTeam(const Impl::HPXTeamMember &thread) {
1801  return Impl::ThreadSingleStruct<Impl::HPXTeamMember>(thread);
1802 }
1803 
1804 KOKKOS_INLINE_FUNCTION
1805 Impl::VectorSingleStruct<Impl::HPXTeamMember>
1806 PerThread(const Impl::HPXTeamMember &thread) {
1807  return Impl::VectorSingleStruct<Impl::HPXTeamMember>(thread);
1808 }
1809 
1815 template <typename iType, class Lambda>
1816 KOKKOS_INLINE_FUNCTION void parallel_for(
1817  const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
1818  &loop_boundaries,
1819  const Lambda &lambda) {
1820  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1821  i += loop_boundaries.increment)
1822  lambda(i);
1823 }
1824 
1831 template <typename iType, class Lambda, typename ValueType>
1832 KOKKOS_INLINE_FUNCTION void parallel_reduce(
1833  const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
1834  &loop_boundaries,
1835  const Lambda &lambda, ValueType &result) {
1836  result = ValueType();
1837  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1838  i += loop_boundaries.increment) {
1839  lambda(i, result);
1840  }
1841 }
1842 
1848 template <typename iType, class Lambda>
1849 KOKKOS_INLINE_FUNCTION void parallel_for(
1850  const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
1851  &loop_boundaries,
1852  const Lambda &lambda) {
1853 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
1854 #pragma ivdep
1855 #endif
1856  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1857  i += loop_boundaries.increment) {
1858  lambda(i);
1859  }
1860 }
1861 
1868 template <typename iType, class Lambda, typename ValueType>
1869 KOKKOS_INLINE_FUNCTION void parallel_reduce(
1870  const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
1871  &loop_boundaries,
1872  const Lambda &lambda, ValueType &result) {
1873  result = ValueType();
1874 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
1875 #pragma ivdep
1876 #endif
1877  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1878  i += loop_boundaries.increment) {
1879  lambda(i, result);
1880  }
1881 }
1882 
1883 template <typename iType, class Lambda, typename ReducerType>
1884 KOKKOS_INLINE_FUNCTION void parallel_reduce(
1885  const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
1886  &loop_boundaries,
1887  const Lambda &lambda, const ReducerType &reducer) {
1888  reducer.init(reducer.reference());
1889  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1890  i += loop_boundaries.increment) {
1891  lambda(i, reducer.reference());
1892  }
1893 }
1894 
1895 template <typename iType, class Lambda, typename ReducerType>
1896 KOKKOS_INLINE_FUNCTION void parallel_reduce(
1897  const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
1898  &loop_boundaries,
1899  const Lambda &lambda, const ReducerType &reducer) {
1900  reducer.init(reducer.reference());
1901 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
1902 #pragma ivdep
1903 #endif
1904  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1905  i += loop_boundaries.increment) {
1906  lambda(i, reducer.reference());
1907  }
1908 }
1909 
1910 template <typename iType, class FunctorType>
1911 KOKKOS_INLINE_FUNCTION void parallel_scan(
1912  Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember> const
1913  &loop_boundaries,
1914  const FunctorType &lambda) {
1915  using value_type = typename Kokkos::Impl::FunctorAnalysis<
1916  Kokkos::Impl::FunctorPatternInterface::SCAN, void,
1917  FunctorType>::value_type;
1918 
1919  value_type scan_val = value_type();
1920 
1921  // Intra-member scan
1922  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1923  i += loop_boundaries.increment) {
1924  lambda(i, scan_val, false);
1925  }
1926 
1927  // 'scan_val' output is the exclusive prefix sum
1928  scan_val = loop_boundaries.thread.team_scan(scan_val);
1929 
1930  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1931  i += loop_boundaries.increment) {
1932  lambda(i, scan_val, true);
1933  }
1934 }
1935 
1947 template <typename iType, class FunctorType>
1948 KOKKOS_INLINE_FUNCTION void parallel_scan(
1949  const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
1950  &loop_boundaries,
1951  const FunctorType &lambda) {
1952  using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
1953  using value_type = typename ValueTraits::value_type;
1954 
1955  value_type scan_val = value_type();
1956 
1957 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
1958 #pragma ivdep
1959 #endif
1960  for (iType i = loop_boundaries.start; i < loop_boundaries.end;
1961  i += loop_boundaries.increment) {
1962  lambda(i, scan_val, true);
1963  }
1964 }
1965 
1966 template <class FunctorType>
1967 KOKKOS_INLINE_FUNCTION void
1968 single(const Impl::VectorSingleStruct<Impl::HPXTeamMember> &single_struct,
1969  const FunctorType &lambda) {
1970  lambda();
1971 }
1972 
1973 template <class FunctorType>
1974 KOKKOS_INLINE_FUNCTION void
1975 single(const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &single_struct,
1976  const FunctorType &lambda) {
1977  lambda();
1978 }
1979 
1980 template <class FunctorType, class ValueType>
1981 KOKKOS_INLINE_FUNCTION void
1982 single(const Impl::VectorSingleStruct<Impl::HPXTeamMember> &single_struct,
1983  const FunctorType &lambda, ValueType &val) {
1984  lambda(val);
1985 }
1986 
1987 template <class FunctorType, class ValueType>
1988 KOKKOS_INLINE_FUNCTION void
1989 single(const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &single_struct,
1990  const FunctorType &lambda, ValueType &val) {
1991  lambda(val);
1992 }
1993 
1994 } // namespace Kokkos
1995 
1996 #include <HPX/Kokkos_HPX_Task.hpp>
1997 
1998 #endif /* #if defined( KOKKOS_ENABLE_HPX ) */
1999 #endif /* #ifndef KOKKOS_HPX_HPP */
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename Impl::enable_if< Kokkos::Impl::is_execution_policy< ExecPolicy >::value >::type *=0)
Execute functor in parallel according to the execution policy.
void print_configuration(std::ostream &, const bool detail=false)
Print "Bill of Materials".
KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct< iType, TeamMemberType > TeamThreadRange(const TeamMemberType &, const iType &count)=delete
Execution policy for parallel work over the threads within a team.
Scratch memory space associated with an execution space.
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename Impl::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=0)
Parallel reduction.
std::enable_if< std::is_same< typename Kokkos::View< T, P...>::array_layout, Kokkos::LayoutLeft >::value||std::is_same< typename Kokkos::View< T, P...>::array_layout, Kokkos::LayoutRight >::value >::type resize(Kokkos::View< T, P...> &v, const size_t n0=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n2=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n3=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n4=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n5=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7=KOKKOS_IMPL_CTOR_DEFAULT_ARG)
Resize a view with copying old data to new data at the corresponding indices.
Memory management for host memory.
KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct< iType, TeamMemberType > ThreadVectorRange(const TeamMemberType &, const iType &count)=delete
Execution policy for a vector parallel loop.
Declaration of various MemoryLayout options.
Declaration of parallel operators.
ReturnType
Execution policy for work over a range of an integral type.
KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct< iType, TeamMemberType > TeamVectorRange(const TeamMemberType &, const iType &count)=delete
Execution policy for parallel work over the threads within a team.