45 #ifndef KOKKOS_HPX_HPP
46 #define KOKKOS_HPX_HPP
48 #include <Kokkos_Macros.hpp>
49 #if defined(KOKKOS_ENABLE_HPX)
51 #include <Kokkos_Core_fwd.hpp>
53 #include <Kokkos_HostSpace.hpp>
57 #ifdef KOKKOS_ENABLE_HBWSPACE
58 #include <Kokkos_HBWSpace.hpp>
61 #include <HPX/Kokkos_HPX_ChunkedRoundRobinExecutor.hpp>
62 #include <Kokkos_HostSpace.hpp>
64 #include <Kokkos_MemoryTraits.hpp>
66 #include <Kokkos_ScratchSpace.hpp>
67 #include <Kokkos_TaskScheduler.hpp>
68 #include <impl/Kokkos_FunctorAdapter.hpp>
69 #include <impl/Kokkos_FunctorAnalysis.hpp>
70 #include <impl/Kokkos_Profiling_Interface.hpp>
71 #include <impl/Kokkos_Tags.hpp>
72 #include <impl/Kokkos_TaskQueue.hpp>
74 #include <KokkosExp_MDRangePolicy.hpp>
76 #include <hpx/apply.hpp>
77 #include <hpx/hpx_start.hpp>
78 #include <hpx/lcos/local/barrier.hpp>
79 #include <hpx/lcos/local/latch.hpp>
80 #include <hpx/parallel/algorithms/for_loop.hpp>
81 #include <hpx/parallel/algorithms/reduce.hpp>
82 #include <hpx/parallel/executors/static_chunk_size.hpp>
83 #include <hpx/runtime.hpp>
84 #include <hpx/runtime/threads/run_as_hpx_thread.hpp>
85 #include <hpx/runtime/threads/threadmanager.hpp>
86 #include <hpx/runtime/thread_pool_helpers.hpp>
92 #include <type_traits>
107 #ifndef KOKKOS_HPX_IMPLEMENTATION
108 #define KOKKOS_HPX_IMPLEMENTATION 1
111 #if (KOKKOS_HPX_IMPLEMENTATION < 0) || (KOKKOS_HPX_IMPLEMENTATION > 2)
112 #error "You have chosen an invalid value for KOKKOS_HPX_IMPLEMENTATION"
/// Scratch storage split into one cache-line padded slice per thread.
///
/// The allocation only ever grows: resize() reallocates when the requested
/// total exceeds what is currently held and otherwise reuses the existing
/// storage. Buffer contents are not carried over when a reallocation happens.
/// The type is neither copyable nor movable.
class thread_buffer {
  static constexpr std::size_t m_cache_line_size = 64;

  // Default member initializers guarantee that every constructor, including
  // the one that delegates to resize(), starts from a well-defined empty
  // state.
  std::size_t m_num_threads     = 0;
  std::size_t m_size_per_thread = 0;
  std::size_t m_size_total      = 0;
  char *m_data                  = nullptr;

  /// Rounds `size` up in place to the next multiple of m_cache_line_size.
  void pad_to_cache_line(std::size_t &size) {
    size = ((size + m_cache_line_size - 1) / m_cache_line_size) *
           m_cache_line_size;
  }

 public:
  /// Creates an empty buffer that owns no storage.
  thread_buffer() = default;

  /// Creates a buffer sized for `num_threads` slices of at least
  /// `size_per_thread` bytes each (see resize()).
  thread_buffer(const std::size_t num_threads,
                const std::size_t size_per_thread) {
    resize(num_threads, size_per_thread);
  }

  ~thread_buffer() { delete[] m_data; }

  thread_buffer(const thread_buffer &) = delete;
  thread_buffer(thread_buffer &&)      = delete;
  thread_buffer &operator=(const thread_buffer &) = delete;
  thread_buffer &operator=(thread_buffer &&) = delete;

  /// Re-partitions the buffer into `num_threads` slices.
  ///
  /// `size_per_thread` is rounded up to a multiple of the cache line size.
  /// Storage is reallocated only if the new total is larger than the current
  /// allocation; size_total() therefore never decreases.
  void resize(const std::size_t num_threads,
              const std::size_t size_per_thread) {
    m_num_threads     = num_threads;
    m_size_per_thread = size_per_thread;

    pad_to_cache_line(m_size_per_thread);

    const std::size_t size_total_new = m_num_threads * m_size_per_thread;

    if (m_size_total < size_total_new) {
      // Allocate first so that a throwing `new` leaves the old storage (and
      // m_size_total) untouched, then release the previous allocation.
      char *const data_new = new char[size_total_new];
      delete[] m_data;
      m_data       = data_new;
      m_size_total = size_total_new;
    }
  }

  /// Returns the start of the slice belonging to `thread_num`, or nullptr if
  /// no storage has been allocated yet.
  char *get(std::size_t thread_num) {
    assert(thread_num < m_num_threads);
    if (m_data == nullptr) {
      return nullptr;
    }
    return &m_data[thread_num * m_size_per_thread];
  }

  /// Padded size in bytes of a single thread's slice.
  std::size_t size_per_thread() const noexcept { return m_size_per_thread; }

  /// Size in bytes of the current allocation.
  std::size_t size_total() const noexcept { return m_size_total; }
};
176 namespace Experimental {
179 static bool m_hpx_initialized;
180 static Kokkos::Impl::thread_buffer m_buffer;
181 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
182 static hpx::future<void> m_future;
186 using execution_space = HPX;
187 using memory_space = HostSpace;
188 using device_type = Kokkos::Device<execution_space, memory_space>;
189 using array_layout = LayoutRight;
190 using size_type = memory_space::size_type;
191 using scratch_memory_space = ScratchMemorySpace<HPX>;
195 const bool =
false) {
196 std::cout <<
"HPX backend" << std::endl;
198 uint32_t impl_instance_id() const noexcept {
return 0; }
200 static bool in_parallel(HPX
const & = HPX()) noexcept {
return false; }
201 static void impl_static_fence(HPX
const & = HPX())
202 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
204 if (hpx::threads::get_self_ptr() ==
nullptr) {
205 hpx::threads::run_as_hpx_thread([]() { impl_get_future().wait(); });
207 impl_get_future().wait();
215 void fence()
const { impl_static_fence(); }
217 static bool is_asynchronous(HPX
const & = HPX()) noexcept {
218 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
225 static std::vector<HPX> partition(...) {
227 "Kokkos::Experimental::HPX::partition_master: can't partition an HPX "
229 return std::vector<HPX>();
232 template <
typename F>
233 static void partition_master(F
const &,
int requested_num_partitions = 0,
235 if (requested_num_partitions > 1) {
237 "Kokkos::Experimental::HPX::partition_master: can't partition an "
242 static int concurrency();
243 static void impl_initialize(
int thread_count);
244 static void impl_initialize();
245 static bool impl_is_initialized() noexcept;
246 static
void impl_finalize();
248 static
int impl_thread_pool_size() noexcept {
249 hpx::runtime *rt = hpx::get_runtime_ptr();
253 if (hpx::threads::get_self_ptr() ==
nullptr) {
254 return hpx::resource::get_thread_pool(0).get_os_thread_count();
256 return hpx::this_thread::get_pool()->get_os_thread_count();
261 static int impl_thread_pool_rank() noexcept {
262 hpx::runtime *rt = hpx::get_runtime_ptr();
266 if (hpx::threads::get_self_ptr() ==
nullptr) {
269 return hpx::this_thread::get_pool()->get_pool_index();
274 static int impl_thread_pool_size(
int depth) {
276 return impl_thread_pool_size();
282 static int impl_max_hardware_threads() noexcept {
283 return hpx::threads::hardware_concurrency();
286 static int impl_hardware_thread_id() noexcept {
287 return hpx::get_worker_thread_num();
290 static Kokkos::Impl::thread_buffer &impl_get_buffer() noexcept {
294 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
295 static hpx::future<void> &impl_get_future() noexcept {
return m_future; }
298 static constexpr
const char *name() noexcept {
return "HPX"; }
302 namespace Profiling {
303 namespace Experimental {
305 struct DeviceTypeTraits<Kokkos::Experimental::HPX> {
306 constexpr
static DeviceType
id = DeviceType::HPX;
312 template <
typename Closure>
313 inline void dispatch_execute_task(Closure *closure) {
314 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
315 if (hpx::threads::get_self_ptr() ==
nullptr) {
316 hpx::threads::run_as_hpx_thread([closure]() {
317 hpx::future<void> &fut = Kokkos::Experimental::HPX::impl_get_future();
318 Closure closure_copy = *closure;
319 fut = fut.then([closure_copy](hpx::future<void> &&) {
320 closure_copy.execute_task();
324 hpx::future<void> &fut = Kokkos::Experimental::HPX::impl_get_future();
325 Closure closure_copy = *closure;
327 [closure_copy](hpx::future<void> &&) { closure_copy.execute_task(); });
330 if (hpx::threads::get_self_ptr() ==
nullptr) {
331 hpx::threads::run_as_hpx_thread([closure]() { closure->execute_task(); });
333 closure->execute_task();
343 struct MemorySpaceAccess<Kokkos::Experimental::HPX::memory_space,
344 Kokkos::Experimental::HPX::scratch_memory_space> {
345 enum { assignable =
false };
346 enum { accessible =
true };
347 enum { deepcopy =
false };
351 struct VerifyExecutionCanAccessMemorySpace<
352 Kokkos::Experimental::HPX::memory_space,
353 Kokkos::Experimental::HPX::scratch_memory_space> {
354 enum { value =
true };
355 inline static void verify(
void) {}
356 inline static void verify(
const void *) {}
362 namespace Experimental {
364 class UniqueToken<HPX, UniqueTokenScope::Instance> {
366 using execution_space = HPX;
367 using size_type = int;
368 UniqueToken(execution_space
const & = execution_space()) noexcept {}
373 int size() const noexcept {
return HPX::impl_max_hardware_threads(); }
374 int acquire() const noexcept {
return HPX::impl_hardware_thread_id(); }
375 void release(
int) const noexcept {}
379 class UniqueToken<HPX, UniqueTokenScope::Global> {
381 using execution_space = HPX;
382 using size_type = int;
383 UniqueToken(execution_space
const & = execution_space()) noexcept {}
388 int size() const noexcept {
return HPX::impl_max_hardware_threads(); }
389 int acquire() const noexcept {
return HPX::impl_hardware_thread_id(); }
390 void release(
int) const noexcept {}
398 struct HPXTeamMember {
400 using execution_space = Kokkos::Experimental::HPX;
401 using scratch_memory_space =
405 scratch_memory_space m_team_shared;
413 KOKKOS_INLINE_FUNCTION
414 const scratch_memory_space &team_shmem()
const {
415 return m_team_shared.set_team_thread_mode(0, 1, 0);
418 KOKKOS_INLINE_FUNCTION
419 const execution_space::scratch_memory_space &team_scratch(
const int)
const {
420 return m_team_shared.set_team_thread_mode(0, 1, 0);
423 KOKKOS_INLINE_FUNCTION
424 const execution_space::scratch_memory_space &thread_scratch(
const int)
const {
425 return m_team_shared.set_team_thread_mode(0, team_size(), team_rank());
428 KOKKOS_INLINE_FUNCTION
int league_rank() const noexcept {
429 return m_league_rank;
432 KOKKOS_INLINE_FUNCTION
int league_size() const noexcept {
433 return m_league_size;
436 KOKKOS_INLINE_FUNCTION
int team_rank() const noexcept {
return m_team_rank; }
437 KOKKOS_INLINE_FUNCTION
int team_size() const noexcept {
return m_team_size; }
439 template <
class... Properties>
440 constexpr KOKKOS_INLINE_FUNCTION HPXTeamMember(
441 const TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
443 const int team_rank,
const int league_rank,
void *scratch,
444 int scratch_size) noexcept
445 : m_team_shared(scratch, scratch_size, scratch, scratch_size),
446 m_league_size(policy.league_size()),
447 m_league_rank(league_rank),
448 m_team_size(policy.team_size()),
449 m_team_rank(team_rank) {}
451 KOKKOS_INLINE_FUNCTION
452 void team_barrier()
const {}
454 template <
class ValueType>
455 KOKKOS_INLINE_FUNCTION
void team_broadcast(ValueType &,
const int &)
const {
456 static_assert(std::is_trivially_default_constructible<ValueType>(),
457 "Only trivial constructible types can be broadcasted");
460 template <
class Closure,
class ValueType>
461 KOKKOS_INLINE_FUNCTION
void team_broadcast(
const Closure &, ValueType &,
463 static_assert(std::is_trivially_default_constructible<ValueType>(),
464 "Only trivial constructible types can be broadcasted");
467 template <
class ValueType,
class JoinOp>
468 KOKKOS_INLINE_FUNCTION ValueType team_reduce(
const ValueType &value,
469 const JoinOp &)
const {
473 template <
class ReducerType>
474 KOKKOS_INLINE_FUNCTION
475 typename std::enable_if<is_reducer<ReducerType>::value>::type
476 team_reduce(
const ReducerType &)
const {}
478 template <
typename Type>
479 KOKKOS_INLINE_FUNCTION Type
480 team_scan(
const Type &value, Type *
const global_accum =
nullptr)
const {
482 Kokkos::atomic_fetch_add(global_accum, value);
489 template <
class... Properties>
490 class TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
491 :
public PolicyTraits<Properties...> {
492 using traits = PolicyTraits<Properties...>;
496 std::size_t m_team_scratch_size[2];
497 std::size_t m_thread_scratch_size[2];
501 using member_type = HPXTeamMember;
505 template <
class FunctorType>
506 inline static int team_size_max(
const FunctorType &) {
510 template <
class FunctorType>
511 inline static int team_size_recommended(
const FunctorType &) {
515 template <
class FunctorType>
516 inline static int team_size_recommended(
const FunctorType &,
const int &) {
520 template <
class FunctorType>
521 int team_size_max(
const FunctorType &,
const ParallelForTag &)
const {
525 template <
class FunctorType>
526 int team_size_max(
const FunctorType &,
const ParallelReduceTag &)
const {
530 template <
class FunctorType,
class ReducerType>
531 int team_size_max(
const FunctorType &,
const ReducerType &,
532 const ParallelReduceTag &)
const {
536 template <
class FunctorType>
537 int team_size_recommended(
const FunctorType &,
const ParallelForTag &)
const {
541 template <
class FunctorType>
542 int team_size_recommended(
const FunctorType &,
543 const ParallelReduceTag &)
const {
547 template <
class FunctorType,
class ReducerType>
548 int team_size_recommended(
const FunctorType &,
const ReducerType &,
549 const ParallelReduceTag &)
const {
554 inline void init(
const int league_size_request,
const int team_size_request) {
555 m_league_size = league_size_request;
556 const int max_team_size = 1;
559 team_size_request > max_team_size ? max_team_size : team_size_request;
561 if (m_chunk_size > 0) {
562 if (!Impl::is_integral_power_of_two(m_chunk_size))
563 Kokkos::abort(
"TeamPolicy blocking granularity must be power of two");
565 int new_chunk_size = 1;
566 while (new_chunk_size * 4 * Kokkos::Experimental::HPX::concurrency() <
571 if (new_chunk_size < 128) {
573 while ((new_chunk_size * Kokkos::Experimental::HPX::concurrency() <
575 (new_chunk_size < 128))
579 m_chunk_size = new_chunk_size;
584 inline int team_size()
const {
return m_team_size; }
585 inline int league_size()
const {
return m_league_size; }
587 inline size_t scratch_size(
const int &level,
int team_size_ = -1)
const {
588 if (team_size_ < 0) {
589 team_size_ = m_team_size;
591 return m_team_scratch_size[level] +
592 team_size_ * m_thread_scratch_size[level];
595 inline static int scratch_size_max(
int level) {
596 return (level == 0 ? 1024 * 32 :
601 template <
class ExecSpace,
class... OtherProperties>
602 friend class TeamPolicyInternal;
604 const typename traits::execution_space &space()
const {
605 static typename traits::execution_space m_space;
609 template <
class... OtherProperties>
610 TeamPolicyInternal(
const TeamPolicyInternal<Kokkos::Experimental::HPX,
611 OtherProperties...> &p) {
612 m_league_size = p.m_league_size;
613 m_team_size = p.m_team_size;
614 m_team_scratch_size[0] = p.m_team_scratch_size[0];
615 m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
616 m_team_scratch_size[1] = p.m_team_scratch_size[1];
617 m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
618 m_chunk_size = p.m_chunk_size;
621 TeamPolicyInternal(
const typename traits::execution_space &,
622 int league_size_request,
int team_size_request,
624 : m_team_scratch_size{0, 0},
625 m_thread_scratch_size{0, 0},
627 init(league_size_request, team_size_request);
630 TeamPolicyInternal(
const typename traits::execution_space &,
631 int league_size_request,
const Kokkos::AUTO_t &,
633 : m_team_scratch_size{0, 0},
634 m_thread_scratch_size{0, 0},
636 init(league_size_request, 1);
639 TeamPolicyInternal(
int league_size_request,
int team_size_request,
641 : m_team_scratch_size{0, 0},
642 m_thread_scratch_size{0, 0},
644 init(league_size_request, team_size_request);
647 TeamPolicyInternal(
int league_size_request,
const Kokkos::AUTO_t &,
649 : m_team_scratch_size{0, 0},
650 m_thread_scratch_size{0, 0},
652 init(league_size_request, 1);
655 inline int chunk_size()
const {
return m_chunk_size; }
657 inline TeamPolicyInternal &set_chunk_size(
658 typename traits::index_type chunk_size_) {
659 m_chunk_size = chunk_size_;
663 inline TeamPolicyInternal &set_scratch_size(
const int &level,
664 const PerTeamValue &per_team) {
665 m_team_scratch_size[level] = per_team.value;
669 inline TeamPolicyInternal &set_scratch_size(
670 const int &level,
const PerThreadValue &per_thread) {
671 m_thread_scratch_size[level] = per_thread.value;
675 inline TeamPolicyInternal &set_scratch_size(
676 const int &level,
const PerTeamValue &per_team,
677 const PerThreadValue &per_thread) {
678 m_team_scratch_size[level] = per_team.value;
679 m_thread_scratch_size[level] = per_thread.value;
689 template <
class FunctorType,
class... Traits>
690 class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
691 Kokkos::Experimental::HPX> {
694 using WorkTag =
typename Policy::work_tag;
695 using WorkRange =
typename Policy::WorkRange;
696 using Member =
typename Policy::member_type;
698 const FunctorType m_functor;
699 const Policy m_policy;
701 template <
class TagType>
702 static typename std::enable_if<std::is_same<TagType, void>::value>::type
703 execute_functor(
const FunctorType &functor,
const Member i) {
707 template <
class TagType>
708 static typename std::enable_if<!std::is_same<TagType, void>::value>::type
709 execute_functor(
const FunctorType &functor,
const Member i) {
714 template <
class TagType>
715 static typename std::enable_if<std::is_same<TagType, void>::value>::type
716 execute_functor_range(
const FunctorType &functor,
const Member i_begin,
717 const Member i_end) {
718 for (Member i = i_begin; i < i_end; ++i) {
723 template <
class TagType>
724 static typename std::enable_if<!std::is_same<TagType, void>::value>::type
725 execute_functor_range(
const FunctorType &functor,
const Member i_begin,
726 const Member i_end) {
728 for (Member i = i_begin; i < i_end; ++i) {
734 void execute()
const { Kokkos::Impl::dispatch_execute_task(
this); }
736 void execute_task()
const {
737 #if KOKKOS_HPX_IMPLEMENTATION == 0
738 using hpx::parallel::for_loop;
739 using hpx::parallel::execution::par;
740 using hpx::parallel::execution::static_chunk_size;
742 for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
743 m_policy.begin(), m_policy.end(), [
this](
const Member i) {
744 execute_functor<WorkTag>(m_functor, i);
747 #elif KOKKOS_HPX_IMPLEMENTATION == 1
749 using hpx::lcos::local::latch;
751 const int num_tasks =
752 (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
753 m_policy.chunk_size();
754 latch num_tasks_remaining(num_tasks);
755 ChunkedRoundRobinExecutor exec(num_tasks);
757 for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
758 i_begin += m_policy.chunk_size()) {
759 apply(exec, [
this, &num_tasks_remaining, i_begin]() {
761 (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
762 execute_functor_range<WorkTag>(m_functor, i_begin, i_end);
764 num_tasks_remaining.count_down(1);
768 num_tasks_remaining.wait();
770 #elif KOKKOS_HPX_IMPLEMENTATION == 2
771 using hpx::parallel::for_loop_strided;
772 using hpx::parallel::execution::par;
773 using hpx::parallel::execution::static_chunk_size;
775 const int num_tasks =
776 (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
777 m_policy.chunk_size();
778 ChunkedRoundRobinExecutor exec(num_tasks);
781 par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
782 m_policy.end(), m_policy.chunk_size(), [
this](
const Member i_begin) {
784 (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
785 execute_functor_range<WorkTag>(m_functor, i_begin, i_end);
790 inline ParallelFor(
const FunctorType &arg_functor, Policy arg_policy)
791 : m_functor(arg_functor), m_policy(arg_policy) {}
794 template <
class FunctorType,
class... Traits>
795 class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
796 Kokkos::Experimental::HPX> {
798 using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
799 using Policy =
typename MDRangePolicy::impl_range_policy;
800 using WorkTag =
typename MDRangePolicy::work_tag;
801 using WorkRange =
typename Policy::WorkRange;
802 using Member =
typename Policy::member_type;
804 typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
807 const FunctorType m_functor;
808 const MDRangePolicy m_mdr_policy;
809 const Policy m_policy;
812 void execute()
const { dispatch_execute_task(
this); }
814 inline void execute_task()
const {
815 #if KOKKOS_HPX_IMPLEMENTATION == 0
816 using hpx::parallel::for_loop;
817 using hpx::parallel::execution::par;
818 using hpx::parallel::execution::static_chunk_size;
820 for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
821 m_policy.begin(), m_policy.end(), [
this](
const Member i) {
822 iterate_type(m_mdr_policy, m_functor)(i);
825 #elif KOKKOS_HPX_IMPLEMENTATION == 1
827 using hpx::lcos::local::latch;
829 const int num_tasks =
830 (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
831 m_policy.chunk_size();
832 latch num_tasks_remaining(num_tasks);
833 ChunkedRoundRobinExecutor exec(num_tasks);
835 for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
836 i_begin += m_policy.chunk_size()) {
837 apply(exec, [
this, &num_tasks_remaining, i_begin]() {
839 (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
840 for (Member i = i_begin; i < i_end; ++i) {
841 iterate_type(m_mdr_policy, m_functor)(i);
844 num_tasks_remaining.count_down(1);
848 num_tasks_remaining.wait();
850 #elif KOKKOS_HPX_IMPLEMENTATION == 2
851 using hpx::parallel::for_loop_strided;
852 using hpx::parallel::execution::par;
853 using hpx::parallel::execution::static_chunk_size;
855 const int num_tasks =
856 (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
857 m_policy.chunk_size();
858 ChunkedRoundRobinExecutor exec(num_tasks);
861 par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
862 m_policy.end(), m_policy.chunk_size(), [
this](
const Member i_begin) {
864 (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
865 for (Member i = i_begin; i < i_end; ++i) {
866 iterate_type(m_mdr_policy, m_functor)(i);
872 inline ParallelFor(
const FunctorType &arg_functor, MDRangePolicy arg_policy)
873 : m_functor(arg_functor),
874 m_mdr_policy(arg_policy),
875 m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)) {}
882 template <
class FunctorType,
class ReducerType,
class... Traits>
883 class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
884 Kokkos::Experimental::HPX> {
887 using WorkTag =
typename Policy::work_tag;
888 using WorkRange =
typename Policy::WorkRange;
889 using Member =
typename Policy::member_type;
891 FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
892 using ReducerConditional =
893 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
894 FunctorType, ReducerType>;
895 using ReducerTypeFwd =
typename ReducerConditional::type;
897 typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
898 WorkTag,
void>::type;
899 using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
900 using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
901 using ValueOps = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
902 using value_type =
typename Analysis::value_type;
903 using pointer_type =
typename Analysis::pointer_type;
904 using reference_type =
typename Analysis::reference_type;
906 const FunctorType m_functor;
907 const Policy m_policy;
908 const ReducerType m_reducer;
909 const pointer_type m_result_ptr;
911 bool m_force_synchronous;
913 template <
class TagType>
915 typename std::enable_if<std::is_same<TagType, void>::value>::type
916 execute_functor(
const FunctorType &functor,
const Member i,
917 reference_type update) {
921 template <
class TagType>
923 typename std::enable_if<!std::is_same<TagType, void>::value>::type
924 execute_functor(
const FunctorType &functor,
const Member i,
925 reference_type update) {
927 functor(t, i, update);
930 template <
class TagType>
931 inline typename std::enable_if<std::is_same<TagType, void>::value>::type
932 execute_functor_range(reference_type update,
const Member i_begin,
933 const Member i_end)
const {
934 for (Member i = i_begin; i < i_end; ++i) {
935 m_functor(i, update);
939 template <
class TagType>
940 inline typename std::enable_if<!std::is_same<TagType, void>::value>::type
941 execute_functor_range(reference_type update,
const Member i_begin,
942 const Member i_end)
const {
945 for (Member i = i_begin; i < i_end; ++i) {
946 m_functor(t, i, update);
950 class value_type_wrapper {
952 std::size_t m_value_size;
953 char *m_value_buffer;
956 value_type_wrapper() : m_value_size(0), m_value_buffer(nullptr) {}
958 value_type_wrapper(
const std::size_t value_size)
959 : m_value_size(value_size), m_value_buffer(new char[m_value_size]) {}
961 value_type_wrapper(
const value_type_wrapper &other)
962 : m_value_size(0), m_value_buffer(nullptr) {
963 if (
this != &other) {
964 m_value_buffer =
new char[other.m_value_size];
965 m_value_size = other.m_value_size;
967 std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
972 ~value_type_wrapper() {
delete[] m_value_buffer; }
974 value_type_wrapper(value_type_wrapper &&other)
975 : m_value_size(0), m_value_buffer(nullptr) {
976 if (
this != &other) {
977 m_value_buffer = other.m_value_buffer;
978 m_value_size = other.m_value_size;
980 other.m_value_buffer =
nullptr;
981 other.m_value_size = 0;
985 value_type_wrapper &operator=(
const value_type_wrapper &other) {
986 if (
this != &other) {
987 delete[] m_value_buffer;
988 m_value_buffer =
new char[other.m_value_size];
989 m_value_size = other.m_value_size;
991 std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
998 value_type_wrapper &operator=(value_type_wrapper &&other) {
999 if (
this != &other) {
1000 delete[] m_value_buffer;
1001 m_value_buffer = other.m_value_buffer;
1002 m_value_size = other.m_value_size;
1004 other.m_value_buffer =
nullptr;
1005 other.m_value_size = 0;
1011 pointer_type pointer()
const {
1012 return reinterpret_cast<pointer_type
>(m_value_buffer);
1015 reference_type reference()
const {
1016 return ValueOps::reference(
1017 reinterpret_cast<pointer_type>(m_value_buffer));
1022 void execute()
const { dispatch_execute_task(
this); }
1024 inline void execute_task()
const {
1025 const std::size_t value_size =
1026 Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
1028 #if KOKKOS_HPX_IMPLEMENTATION == 0
1033 using hpx::parallel::for_loop;
1034 using hpx::parallel::reduction;
1035 using hpx::parallel::execution::par;
1036 using hpx::parallel::execution::static_chunk_size;
1038 value_type_wrapper final_value(value_size);
1039 value_type_wrapper identity(value_size);
1041 ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1042 final_value.pointer());
1043 ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1044 identity.pointer());
1046 for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
1047 m_policy.begin(), m_policy.end(),
1048 reduction(final_value, identity,
1049 [
this](value_type_wrapper &a,
1050 value_type_wrapper &b) -> value_type_wrapper & {
1052 ReducerConditional::select(m_functor, m_reducer),
1053 a.pointer(), b.pointer());
1056 [
this](Member i, value_type_wrapper &update) {
1057 execute_functor<WorkTag>(m_functor, i, update.reference());
1060 pointer_type final_value_ptr = final_value.pointer();
1062 #elif KOKKOS_HPX_IMPLEMENTATION == 1
1063 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1065 thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1066 buffer.resize(num_worker_threads, value_size);
1069 using hpx::lcos::local::latch;
1072 latch num_tasks_remaining(num_worker_threads);
1073 ChunkedRoundRobinExecutor exec(num_worker_threads);
1075 for (
int t = 0; t < num_worker_threads; ++t) {
1076 apply(exec, [
this, &num_tasks_remaining, &buffer, t]() {
1077 ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1078 reinterpret_cast<pointer_type>(buffer.get(t)));
1080 num_tasks_remaining.count_down(1);
1084 num_tasks_remaining.wait();
1087 const int num_tasks =
1088 (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
1089 m_policy.chunk_size();
1090 latch num_tasks_remaining(num_tasks);
1091 ChunkedRoundRobinExecutor exec(num_tasks);
1093 for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
1094 i_begin += m_policy.chunk_size()) {
1095 apply(exec, [
this, &num_tasks_remaining, &buffer, i_begin]() {
1096 reference_type update =
1097 ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
1098 Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1099 const Member i_end =
1100 (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
1101 execute_functor_range<WorkTag>(update, i_begin, i_end);
1103 num_tasks_remaining.count_down(1);
1107 num_tasks_remaining.wait();
1109 for (
int i = 1; i < num_worker_threads; ++i) {
1110 ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
1111 reinterpret_cast<pointer_type>(buffer.get(0)),
1112 reinterpret_cast<pointer_type>(buffer.get(i)));
1115 pointer_type final_value_ptr =
1116 reinterpret_cast<pointer_type
>(buffer.get(0));
1118 #elif KOKKOS_HPX_IMPLEMENTATION == 2
1119 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1121 thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1122 buffer.resize(num_worker_threads, value_size);
1124 using hpx::parallel::for_loop;
1125 using hpx::parallel::for_loop_strided;
1126 using hpx::parallel::execution::par;
1127 using hpx::parallel::execution::static_chunk_size;
1130 ChunkedRoundRobinExecutor exec(num_worker_threads);
1132 for_loop(par.on(exec).with(static_chunk_size(1)), std::size_t(0),
1133 num_worker_threads, [
this, &buffer](
const std::size_t t) {
1135 ReducerConditional::select(m_functor, m_reducer),
1136 reinterpret_cast<pointer_type>(buffer.get(t)));
1140 const int num_tasks =
1141 (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
1142 m_policy.chunk_size();
1143 ChunkedRoundRobinExecutor exec(num_tasks);
1146 par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
1147 m_policy.end(), m_policy.chunk_size(),
1148 [
this, &buffer](
const Member i_begin) {
1149 reference_type update =
1150 ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
1151 Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1152 const Member i_end =
1153 (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
1154 execute_functor_range<WorkTag>(update, i_begin, i_end);
1157 for (
int i = 1; i < num_worker_threads; ++i) {
1158 ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
1159 reinterpret_cast<pointer_type>(buffer.get(0)),
1160 reinterpret_cast<pointer_type>(buffer.get(i)));
1163 pointer_type final_value_ptr =
1164 reinterpret_cast<pointer_type
>(buffer.get(0));
1167 Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1168 ReducerConditional::select(m_functor, m_reducer), final_value_ptr);
1170 if (m_result_ptr !=
nullptr) {
1171 const int n = Analysis::value_count(
1172 ReducerConditional::select(m_functor, m_reducer));
1174 for (
int j = 0; j < n; ++j) {
1175 m_result_ptr[j] = final_value_ptr[j];
1180 template <
class ViewType>
1181 inline ParallelReduce(
1182 const FunctorType &arg_functor, Policy arg_policy,
1183 const ViewType &arg_view,
1184 typename std::enable_if<Kokkos::is_view<ViewType>::value &&
1185 !Kokkos::is_reducer_type<ReducerType>::value,
1186 void *>::type =
nullptr)
1187 : m_functor(arg_functor),
1188 m_policy(arg_policy),
1189 m_reducer(InvalidType()),
1190 m_result_ptr(arg_view.data()),
1191 m_force_synchronous(!arg_view.impl_track().has_record()) {}
1193 inline ParallelReduce(
const FunctorType &arg_functor, Policy arg_policy,
1194 const ReducerType &reducer)
1195 : m_functor(arg_functor),
1196 m_policy(arg_policy),
1198 m_result_ptr(reducer.view().data()),
1199 m_force_synchronous(!reducer.view().impl_track().has_record()) {}
1202 template <
class FunctorType,
class ReducerType,
class... Traits>
1203 class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
1204 Kokkos::Experimental::HPX> {
1206 using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
1207 using Policy =
typename MDRangePolicy::impl_range_policy;
1208 using WorkTag =
typename MDRangePolicy::work_tag;
1209 using WorkRange =
typename Policy::WorkRange;
1210 using Member =
typename Policy::member_type;
1211 using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
1212 MDRangePolicy, FunctorType>;
1213 using ReducerConditional =
1214 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1215 FunctorType, ReducerType>;
1216 using ReducerTypeFwd =
typename ReducerConditional::type;
1218 typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1219 WorkTag,
void>::type;
1220 using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
1221 using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
1222 using ValueOps = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
1223 using pointer_type =
typename Analysis::pointer_type;
1224 using value_type =
typename Analysis::value_type;
1225 using reference_type =
typename Analysis::reference_type;
1226 using iterate_type =
1227 typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
1228 WorkTag, reference_type>;
1230 const FunctorType m_functor;
1231 const MDRangePolicy m_mdr_policy;
1232 const Policy m_policy;
1233 const ReducerType m_reducer;
1234 const pointer_type m_result_ptr;
1236 bool m_force_synchronous;
// Entry point: hands this object to dispatch_execute_task.
1239 void execute()
const { dispatch_execute_task(
this); }
// Performs the MDRange reduction.
//
// Steps visible in this function:
//   1. Resize the shared thread_buffer to one slot of value_size bytes per
//      worker thread.
//   2. Initialise every slot with ValueInit::init.
//   3. Visit the tile indices in [m_policy.begin(), m_policy.end()); each
//      visit accumulates into the slot of the current hardware thread.
//   4. Join slots 1..num_worker_threads-1 into slot 0, apply FunctorFinal to
//      slot 0 and, if m_result_ptr is non-null, copy slot 0 into it.
// Steps 2 and 3 have three variants selected by KOKKOS_HPX_IMPLEMENTATION.
1241 inline void execute_task()
const {
1242 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1243 const std::size_t value_size =
1244 Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
1246 thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1247 buffer.resize(num_worker_threads, value_size);
// Variant 0: HPX parallel for_loop for both initialisation and tile traversal.
1249 #if KOKKOS_HPX_IMPLEMENTATION == 0
1250 using hpx::parallel::for_loop;
1251 using hpx::parallel::execution::par;
1252 using hpx::parallel::execution::static_chunk_size;
1254 for_loop(par, 0, num_worker_threads, [
this, &buffer](std::size_t t) {
1255 ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1256 reinterpret_cast<pointer_type>(buffer.get(t)));
1259 for_loop(par.with(static_chunk_size(m_policy.chunk_size())),
1260 m_policy.begin(), m_policy.end(), [
this, &buffer](
const Member i) {
// Accumulate into the slot that belongs to the executing hardware thread.
1261 reference_type update = ValueOps::reference(
1262 reinterpret_cast<pointer_type>(buffer.get(
1263 Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1264 iterate_type(m_mdr_policy, m_functor, update)(i);
// Variant 1: explicit tasks launched with apply() on a
// ChunkedRoundRobinExecutor; a latch is counted down by each task and waited
// on by the caller.
1267 #elif KOKKOS_HPX_IMPLEMENTATION == 1
1269 using hpx::lcos::local::latch;
1272 latch num_tasks_remaining(num_worker_threads);
1273 ChunkedRoundRobinExecutor exec(num_worker_threads);
1275 for (
int t = 0; t < num_worker_threads; ++t) {
1276 apply(exec, [
this, &buffer, &num_tasks_remaining, t]() {
1277 ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1278 reinterpret_cast<pointer_type>(buffer.get(t)));
1280 num_tasks_remaining.count_down(1);
1284 num_tasks_remaining.wait();
// Number of chunks: ceil((end - begin) / chunk_size).
1287 const int num_tasks =
1288 (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
1289 m_policy.chunk_size();
1290 latch num_tasks_remaining(num_tasks);
1291 ChunkedRoundRobinExecutor exec(num_tasks);
1293 for (Member i_begin = m_policy.begin(); i_begin < m_policy.end();
1294 i_begin += m_policy.chunk_size()) {
1295 apply(exec, [
this, &num_tasks_remaining, &buffer, i_begin]() {
1296 reference_type update =
1297 ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
1298 Kokkos::Experimental::HPX::impl_hardware_thread_id())));
// Clamp the chunk so the last one does not run past the end of the range.
1299 const Member i_end =
1300 (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
1302 for (Member i = i_begin; i < i_end; ++i) {
1303 iterate_type(m_mdr_policy, m_functor, update)(i);
1306 num_tasks_remaining.count_down(1);
1310 num_tasks_remaining.wait();
// Variant 2: HPX parallel loops run on a ChunkedRoundRobinExecutor with a
// static chunk size of 1; chunking is done by striding over the range.
1312 #elif KOKKOS_HPX_IMPLEMENTATION == 2
1313 using hpx::parallel::for_loop;
1314 using hpx::parallel::for_loop_strided;
1315 using hpx::parallel::execution::par;
1316 using hpx::parallel::execution::static_chunk_size;
1319 ChunkedRoundRobinExecutor exec(num_worker_threads);
1321 for_loop(par.on(exec).with(static_chunk_size(1)), std::size_t(0),
1322 num_worker_threads, [
this, &buffer](
const std::size_t t) {
1324 ReducerConditional::select(m_functor, m_reducer),
1325 reinterpret_cast<pointer_type>(buffer.get(t)));
1329 const int num_tasks =
1330 (m_policy.end() - m_policy.begin() + m_policy.chunk_size() - 1) /
1331 m_policy.chunk_size();
1332 ChunkedRoundRobinExecutor exec(num_tasks);
1335 par.on(exec).with(static_chunk_size(1)), m_policy.begin(),
1336 m_policy.end(), m_policy.chunk_size(),
1337 [
this, &buffer](
const Member i_begin) {
1338 reference_type update =
1339 ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(
1340 Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1341 const Member i_end =
1342 (std::min)(i_begin + m_policy.chunk_size(), m_policy.end());
1344 for (Member i = i_begin; i < i_end; ++i) {
1345 iterate_type(m_mdr_policy, m_functor, update)(i);
// Serial combine: fold every other thread's partial result into slot 0.
1350 for (
int i = 1; i < num_worker_threads; ++i) {
1351 ValueJoin::join(ReducerConditional::select(m_functor, m_reducer),
1352 reinterpret_cast<pointer_type>(buffer.get(0)),
1353 reinterpret_cast<pointer_type>(buffer.get(i)));
1356 Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1357 ReducerConditional::select(m_functor, m_reducer),
1358 reinterpret_cast<pointer_type>(buffer.get(0)));
// Publish the result only when the caller supplied a destination.
1360 if (m_result_ptr !=
nullptr) {
1361 const int n = Analysis::value_count(
1362 ReducerConditional::select(m_functor, m_reducer));
1364 for (
int j = 0; j < n; ++j) {
1365 m_result_ptr[j] =
reinterpret_cast<pointer_type
>(buffer.get(0))[j];
// Constructor taking the result as a View. It is enabled only when ViewType is
// a Kokkos view and ReducerType is not a reducer type.
// The internal range policy spans [0, m_mdr_policy.m_num_tiles) with a chunk
// size of 1, m_reducer is set to InvalidType(), and m_force_synchronous is set
// when the view has no tracking record.
1370 template <
class ViewType>
1371 inline ParallelReduce(
1372 const FunctorType &arg_functor, MDRangePolicy arg_policy,
1373 const ViewType &arg_view,
1374 typename std::enable_if<Kokkos::is_view<ViewType>::value &&
1375 !Kokkos::is_reducer_type<ReducerType>::value,
1376 void *>::type =
nullptr)
1377 : m_functor(arg_functor),
1378 m_mdr_policy(arg_policy),
1379 m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
1380 m_reducer(InvalidType()),
1381 m_result_ptr(arg_view.data()),
1382 m_force_synchronous(!arg_view.impl_track().has_record()) {}
// Constructor taking a reducer. The internal range policy spans
// [0, m_mdr_policy.m_num_tiles) with a chunk size of 1; the result pointer and
// the m_force_synchronous flag are derived from the reducer's view.
1384 inline ParallelReduce(
const FunctorType &arg_functor,
1385 MDRangePolicy arg_policy,
const ReducerType &reducer)
1386 : m_functor(arg_functor),
1387 m_mdr_policy(arg_policy),
1388 m_policy(Policy(0, m_mdr_policy.m_num_tiles).set_chunk_size(1)),
1390 m_result_ptr(reducer.view().data()),
1391 m_force_synchronous(!reducer.view().impl_track().has_record()) {}
// Specialization of ParallelScan for a Kokkos::RangePolicy executed on the
// Kokkos::Experimental::HPX execution space.
1399 template <
class FunctorType,
class... Traits>
1400 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
1401 Kokkos::Experimental::HPX> {
1404 using WorkTag =
typename Policy::work_tag;
1405 using WorkRange =
typename Policy::WorkRange;
1406 using Member =
typename Policy::member_type;
// Functor analysis for the SCAN pattern; supplies the value/pointer/reference
// types aliased below.
1408 FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
1409 using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
1410 using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
1411 using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
1412 using pointer_type =
typename Analysis::pointer_type;
1413 using reference_type =
typename Analysis::reference_type;
1414 using value_type =
typename Analysis::value_type;
// State captured at construction time.
1416 const FunctorType m_functor;
1417 const Policy m_policy;
// Untagged overload (enabled when TagType is void): calls
// functor(i, update, final) for every i in [i_begin, i_end).
1419 template <
class TagType>
1421 typename std::enable_if<std::is_same<TagType, void>::value>::type
1422 execute_functor_range(
const FunctorType &functor,
const Member i_begin,
1423 const Member i_end, reference_type update,
1425 for (Member i = i_begin; i < i_end; ++i) {
1426 functor(i, update,
final);
// Tagged overload (enabled when TagType is not void): calls
// functor(t, i, update, final) for every i in [i_begin, i_end).
// NOTE(review): t is presumably an instance of TagType — confirm.
1430 template <
class TagType>
1432 typename std::enable_if<!std::is_same<TagType, void>::value>::type
1433 execute_functor_range(
const FunctorType &functor,
const Member i_begin,
1434 const Member i_end, reference_type update,
1437 for (Member i = i_begin; i < i_end; ++i) {
1438 functor(t, i, update,
final);
// Entry point: hands this object to dispatch_execute_task.
1443 void execute()
const { dispatch_execute_task(
this); }
// Performs the scan with one task per worker thread.
//
// Every thread owns a buffer slot of 2 * value_size bytes. The first half
// (buffer.get(t)) holds that thread's own sum over its WorkRange; the second
// half (buffer.get(t) + value_size) holds the base value used for the
// thread's second pass over the same range.
1445 inline void execute_task()
const {
1446 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1447 const int value_count = Analysis::value_count(m_functor);
1448 const std::size_t value_size = Analysis::value_size(m_functor);
1450 thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1451 buffer.resize(num_worker_threads, 2 * value_size);
1454 using hpx::lcos::local::barrier;
1455 using hpx::lcos::local::latch;
// The barrier is sized for all worker tasks; the latch lets the caller wait
// until every task has counted down.
1457 barrier bar(num_worker_threads);
1458 latch num_tasks_remaining(num_worker_threads);
1459 ChunkedRoundRobinExecutor exec(num_worker_threads);
1461 for (
int t = 0; t < num_worker_threads; ++t) {
1462 apply(exec, [
this, &bar, &buffer, &num_tasks_remaining,
1463 num_worker_threads, value_count, value_size, t]() {
// First pass: initialise this thread's sum and run the functor over its
// share of the policy range.
1464 reference_type update_sum = ValueInit::init(
1465 m_functor, reinterpret_cast<pointer_type>(buffer.get(t)));
1467 const WorkRange range(m_policy, t, num_worker_threads);
1468 execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
// Build the per-thread base values: slot 0's base is freshly initialised, and
// slot i's base is slot (i-1)'s base joined with slot (i-1)'s own sum.
1474 ValueInit::init(m_functor, reinterpret_cast<pointer_type>(
1475 buffer.get(0) + value_size));
1477 for (
int i = 1; i < num_worker_threads; ++i) {
1478 pointer_type ptr_1_prev =
1479 reinterpret_cast<pointer_type
>(buffer.get(i - 1));
1480 pointer_type ptr_2_prev =
1481 reinterpret_cast<pointer_type
>(buffer.get(i - 1) + value_size);
1482 pointer_type ptr_2 =
1483 reinterpret_cast<pointer_type
>(buffer.get(i) + value_size);
1485 for (
int j = 0; j < value_count; ++j) {
1486 ptr_2[j] = ptr_2_prev[j];
1489 ValueJoin::join(m_functor, ptr_2, ptr_1_prev);
// Second pass: rerun the functor over the same range starting from this
// thread's base value.
1495 reference_type update_base = ValueOps::reference(
1496 reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
1498 execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
1501 num_tasks_remaining.count_down(1);
1505 num_tasks_remaining.wait();
// Stores copies of the functor and the policy.
1508 inline ParallelScan(
const FunctorType &arg_functor,
const Policy &arg_policy)
1509 : m_functor(arg_functor), m_policy(arg_policy) {}
// Specialization of ParallelScanWithTotal for a Kokkos::RangePolicy. In
// addition to the scan it writes a total to m_returnvalue (see execute_task).
1512 template <
class FunctorType,
class ReturnType,
class... Traits>
1513 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
1517 using WorkTag =
typename Policy::work_tag;
1518 using WorkRange =
typename Policy::WorkRange;
1519 using Member =
typename Policy::member_type;
// Functor analysis for the SCAN pattern; supplies the value/pointer/reference
// types aliased below.
1521 FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
1522 using ValueInit = Kokkos::Impl::FunctorValueInit<FunctorType, WorkTag>;
1523 using ValueJoin = Kokkos::Impl::FunctorValueJoin<FunctorType, WorkTag>;
1524 using ValueOps = Kokkos::Impl::FunctorValueOps<FunctorType, WorkTag>;
1525 using pointer_type =
typename Analysis::pointer_type;
1526 using reference_type =
typename Analysis::reference_type;
1527 using value_type =
typename Analysis::value_type;
// State captured at construction time.
1529 const FunctorType m_functor;
1530 const Policy m_policy;
// Untagged overload (enabled when TagType is void): calls
// functor(i, update, final) for every i in [i_begin, i_end).
1533 template <
class TagType>
1535 typename std::enable_if<std::is_same<TagType, void>::value>::type
1536 execute_functor_range(
const FunctorType &functor,
const Member i_begin,
1537 const Member i_end, reference_type update,
1539 for (Member i = i_begin; i < i_end; ++i) {
1540 functor(i, update,
final);
// Tagged overload (enabled when TagType is not void): calls
// functor(t, i, update, final) for every i in [i_begin, i_end).
// NOTE(review): t is presumably an instance of TagType — confirm.
1544 template <
class TagType>
1546 typename std::enable_if<!std::is_same<TagType, void>::value>::type
1547 execute_functor_range(
const FunctorType &functor,
const Member i_begin,
1548 const Member i_end, reference_type update,
1551 for (Member i = i_begin; i < i_end; ++i) {
1552 functor(t, i, update,
final);
// Entry point: hands this object to dispatch_execute_task.
1557 void execute()
const { dispatch_execute_task(
this); }
// Performs the scan with one task per worker thread and records the total.
//
// Every thread owns a buffer slot of 2 * value_size bytes: the first half
// holds the thread's own sum over its WorkRange, the second half the base
// value for its second pass. The task with the highest index assigns its
// base value to m_returnvalue after the second pass.
1559 inline void execute_task()
const {
1560 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1561 const int value_count = Analysis::value_count(m_functor);
1562 const std::size_t value_size = Analysis::value_size(m_functor);
1564 thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1565 buffer.resize(num_worker_threads, 2 * value_size);
1568 using hpx::lcos::local::barrier;
1569 using hpx::lcos::local::latch;
// The barrier is sized for all worker tasks; the latch lets the caller wait
// until every task has counted down.
1571 barrier bar(num_worker_threads);
1572 latch num_tasks_remaining(num_worker_threads);
1573 ChunkedRoundRobinExecutor exec(num_worker_threads);
1575 for (
int t = 0; t < num_worker_threads; ++t) {
1576 apply(exec, [
this, &bar, &buffer, &num_tasks_remaining,
1577 num_worker_threads, value_count, value_size, t]() {
// First pass: initialise this thread's sum and run the functor over its
// share of the policy range.
1578 reference_type update_sum = ValueInit::init(
1579 m_functor, reinterpret_cast<pointer_type>(buffer.get(t)));
1581 const WorkRange range(m_policy, t, num_worker_threads);
1582 execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
// Build the per-thread base values: slot 0's base is freshly initialised, and
// slot i's base is slot (i-1)'s base joined with slot (i-1)'s own sum.
1588 ValueInit::init(m_functor, reinterpret_cast<pointer_type>(
1589 buffer.get(0) + value_size));
1591 for (
int i = 1; i < num_worker_threads; ++i) {
1592 pointer_type ptr_1_prev =
1593 reinterpret_cast<pointer_type
>(buffer.get(i - 1));
1594 pointer_type ptr_2_prev =
1595 reinterpret_cast<pointer_type
>(buffer.get(i - 1) + value_size);
1596 pointer_type ptr_2 =
1597 reinterpret_cast<pointer_type
>(buffer.get(i) + value_size);
1599 for (
int j = 0; j < value_count; ++j) {
1600 ptr_2[j] = ptr_2_prev[j];
1603 ValueJoin::join(m_functor, ptr_2, ptr_1_prev);
// Second pass: rerun the functor over the same range starting from this
// thread's base value.
1609 reference_type update_base = ValueOps::reference(
1610 reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
1612 execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
// Only the last task publishes its value as the overall total.
1615 if (t == num_worker_threads - 1) {
1616 m_returnvalue = update_base;
1619 num_tasks_remaining.count_down(1);
1623 num_tasks_remaining.wait();
// Stores the functor, the policy and the destination for the scan total
// (m_returnvalue is initialised from arg_returnvalue).
1626 inline ParallelScanWithTotal(
const FunctorType &arg_functor,
1627 const Policy &arg_policy,
1629 : m_functor(arg_functor),
1630 m_policy(arg_policy),
1631 m_returnvalue(arg_returnvalue) {}
// Specialization of ParallelFor for a Kokkos::TeamPolicy executed on the
// Kokkos::Experimental::HPX execution space.
1638 template <
class FunctorType,
class... Properties>
1639 class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
1640 Kokkos::Experimental::HPX> {
1642 using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
1643 using WorkTag =
typename Policy::work_tag;
1644 using Member =
typename Policy::member_type;
// State captured at construction time. m_shared is the per-thread buffer size
// requested in execute_task (scratch sizes plus functor team shared memory,
// see the constructor).
1647 const FunctorType m_functor;
1648 const Policy m_policy;
1650 const std::size_t m_shared;
// Untagged overload (enabled when TagType is void): invokes the functor once
// with a team Member built for league_rank and the given local buffer.
1652 template <
class TagType>
1654 typename std::enable_if<std::is_same<TagType, void>::value>::type
1655 execute_functor(
const FunctorType &functor,
const Policy &policy,
1656 const int league_rank,
char *local_buffer,
1657 const std::size_t local_buffer_size) {
1658 functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
// Tagged overload (enabled when TagType is not void): invokes the functor once
// with a tag argument t followed by a team Member built for league_rank.
// NOTE(review): t is presumably an instance of TagType — confirm.
1661 template <
class TagType>
1663 typename std::enable_if<!std::is_same<TagType, void>::value>::type
1664 execute_functor(
const FunctorType &functor,
const Policy &policy,
1665 const int league_rank,
char *local_buffer,
1666 const std::size_t local_buffer_size) {
1668 functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size));
// Untagged overload (enabled when TagType is void): invokes the functor for
// each league rank starting at league_rank_begin and below league_rank_end,
// building a fresh Member over the same local buffer for every rank.
1671 template <
class TagType>
1673 typename std::enable_if<std::is_same<TagType, void>::value>::type
1674 execute_functor_range(
const FunctorType &functor,
const Policy &policy,
1675 const int league_rank_begin,
1676 const int league_rank_end,
char *local_buffer,
1677 const std::size_t local_buffer_size) {
1678 for (
int league_rank = league_rank_begin; league_rank < league_rank_end;
1680 functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
// Tagged overload (enabled when TagType is not void): loops over league ranks
// starting at league_rank_begin and below league_rank_end, building a Member
// for each rank.
// NOTE(review): presumably passes a TagType instance ahead of the Member, as
// the tagged execute_functor does — confirm.
1684 template <
class TagType>
1686 typename std::enable_if<!std::is_same<TagType, void>::value>::type
1687 execute_functor_range(
const FunctorType &functor,
const Policy &policy,
1688 const int league_rank_begin,
1689 const int league_rank_end,
char *local_buffer,
1690 const std::size_t local_buffer_size) {
1692 for (
int league_rank = league_rank_begin; league_rank < league_rank_end;
1695 Member(policy, 0, league_rank, local_buffer, local_buffer_size));
// Entry point: hands this object to dispatch_execute_task.
1700 void execute()
const { dispatch_execute_task(
this); }
// Runs the team-policy parallel_for.
//
// The shared thread_buffer is resized to m_shared bytes per worker thread and
// each league rank (or chunk of league ranks) is executed with the buffer slot
// of the hardware thread that runs it. Three variants are selected by
// KOKKOS_HPX_IMPLEMENTATION.
1702 inline void execute_task()
const {
1703 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1705 thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1706 buffer.resize(num_worker_threads, m_shared);
// Variant 0: HPX parallel loop over individual league ranks, chunked by the
// policy's chunk size.
1708 #if KOKKOS_HPX_IMPLEMENTATION == 0
1709 using hpx::parallel::for_loop;
1710 using hpx::parallel::execution::par;
1711 using hpx::parallel::execution::static_chunk_size;
1714 par.with(static_chunk_size(m_policy.chunk_size())), 0,
1715 m_policy.league_size(), [
this, &buffer](
const int league_rank) {
1716 execute_functor<WorkTag>(
1717 m_functor, m_policy, league_rank,
1718 buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()),
// Variant 1: one explicit task per chunk of league ranks, launched with
// apply() and tracked with a latch.
1722 #elif KOKKOS_HPX_IMPLEMENTATION == 1
1724 using hpx::lcos::local::latch;
// Number of chunks: ceil(league_size / chunk_size).
1726 const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) /
1727 m_policy.chunk_size();
1728 latch num_tasks_remaining(num_tasks);
1729 ChunkedRoundRobinExecutor exec(num_tasks);
1731 for (
int league_rank_begin = 0; league_rank_begin < m_policy.league_size();
1732 league_rank_begin += m_policy.chunk_size()) {
1733 apply(exec, [
this, &buffer, &num_tasks_remaining, league_rank_begin]() {
// Clamp the chunk so the last one does not run past league_size.
1734 const int league_rank_end = (std::min)(
1735 league_rank_begin + m_policy.chunk_size(), m_policy.league_size());
1736 execute_functor_range<WorkTag>(
1737 m_functor, m_policy, league_rank_begin, league_rank_end,
1738 buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()),
1741 num_tasks_remaining.count_down(1);
1745 num_tasks_remaining.wait();
// Variant 2: strided HPX parallel loop on a ChunkedRoundRobinExecutor; each
// iteration handles one chunk of league ranks.
1747 #elif KOKKOS_HPX_IMPLEMENTATION == 2
1748 using hpx::parallel::for_loop_strided;
1749 using hpx::parallel::execution::par;
1750 using hpx::parallel::execution::static_chunk_size;
1752 const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) /
1753 m_policy.chunk_size();
1754 ChunkedRoundRobinExecutor exec(num_tasks);
1757 par.on(exec).with(static_chunk_size(1)), 0, m_policy.league_size(),
1758 m_policy.chunk_size(), [
this, &buffer](
const int league_rank_begin) {
1759 const int league_rank_end =
1760 (std::min)(league_rank_begin + m_policy.chunk_size(),
1761 m_policy.league_size());
1762 execute_functor_range<WorkTag>(
1763 m_functor, m_policy, league_rank_begin, league_rank_end,
1764 buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()),
// Stores the functor and policy, caches the league size, and computes the
// per-thread buffer size as the sum of both scratch sizes (levels 0 and 1)
// and the functor's team shared-memory size for the policy's team size.
1770 ParallelFor(
const FunctorType &arg_functor,
const Policy &arg_policy)
1771 : m_functor(arg_functor),
1772 m_policy(arg_policy),
1773 m_league(arg_policy.league_size()),
1774 m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
1775 FunctorTeamShmemSize<FunctorType>::value(
1776 arg_functor, arg_policy.team_size())) {}
// Specialization of ParallelReduce for a Kokkos::TeamPolicy executed on the
// Kokkos::Experimental::HPX execution space.
1779 template <
class FunctorType,
class ReducerType,
class... Properties>
1780 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
1781 ReducerType, Kokkos::Experimental::HPX> {
1783 using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
// Functor analysis for the REDUCE pattern; supplies the value/pointer/
// reference types aliased below.
1785 FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, FunctorType>;
1786 using Member =
typename Policy::member_type;
1787 using WorkTag =
typename Policy::work_tag;
// Chooses between FunctorType and ReducerType depending on whether
// ReducerType is InvalidType (presumably FunctorType when it is; confirm
// against the definition of Kokkos::Impl::if_c).
1788 using ReducerConditional =
1789 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1790 FunctorType, ReducerType>;
1791 using ReducerTypeFwd =
typename ReducerConditional::type;
// Same InvalidType test, selecting between WorkTag and void; this is the
// WorkTagFwd type used by the Value* helpers below.
1793 typename Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1794 WorkTag,
void>::type;
1795 using ValueInit = Kokkos::Impl::FunctorValueInit<ReducerTypeFwd, WorkTagFwd>;
1796 using ValueJoin = Kokkos::Impl::FunctorValueJoin<ReducerTypeFwd, WorkTagFwd>;
1797 using ValueOps = Kokkos::Impl::FunctorValueOps<ReducerTypeFwd, WorkTagFwd>;
1798 using pointer_type =
typename Analysis::pointer_type;
1799 using reference_type =
typename Analysis::reference_type;
1800 using value_type =
typename Analysis::value_type;
// State captured at construction time. m_shared is the team scratch size that
// execute_task adds to value_size for each thread's buffer slot.
1802 const FunctorType m_functor;
1804 const Policy m_policy;
1805 const ReducerType m_reducer;
1806 pointer_type m_result_ptr;
1807 const std::size_t m_shared;
1809 bool m_force_synchronous;
// Untagged overload (enabled when TagType is void): invokes the functor once
// with a team Member built for league_rank and the given local buffer.
// NOTE(review): update is presumably passed as the trailing argument —
// confirm.
1811 template <
class TagType>
1813 typename std::enable_if<std::is_same<TagType, void>::value>::type
1814 execute_functor(
const FunctorType &functor,
const Policy &policy,
1815 const int league_rank,
char *local_buffer,
1816 const std::size_t local_buffer_size,
1817 reference_type update) {
1818 functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
// Tagged overload (enabled when TagType is not void): invokes the functor once
// with a tag argument t followed by a team Member built for league_rank.
// NOTE(review): t is presumably an instance of TagType and update the
// trailing argument — confirm.
1822 template <
class TagType>
1824 typename std::enable_if<!std::is_same<TagType, void>::value>::type
1825 execute_functor(
const FunctorType &functor,
const Policy &policy,
1826 const int league_rank,
char *local_buffer,
1827 const std::size_t local_buffer_size,
1828 reference_type update) {
1830 functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size),
// Untagged overload (enabled when TagType is void): invokes the functor for
// each league rank starting at league_rank_begin and below league_rank_end,
// building a fresh Member over the same local buffer for every rank.
// NOTE(review): update is presumably passed as the trailing argument —
// confirm.
1834 template <
class TagType>
1836 typename std::enable_if<std::is_same<TagType, void>::value>::type
1837 execute_functor_range(
const FunctorType &functor,
const Policy &policy,
1838 const int league_rank_begin,
1839 const int league_rank_end,
char *local_buffer,
1840 const std::size_t local_buffer_size,
1841 reference_type update) {
1842 for (
int league_rank = league_rank_begin; league_rank < league_rank_end;
1844 functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
// Tagged overload (enabled when TagType is not void): loops over league ranks
// starting at league_rank_begin and below league_rank_end, building a Member
// for each rank.
// NOTE(review): presumably passes a TagType instance ahead of the Member and
// update after it, as the tagged execute_functor does — confirm.
1849 template <
class TagType>
1851 typename std::enable_if<!std::is_same<TagType, void>::value>::type
1852 execute_functor_range(
const FunctorType &functor,
const Policy &policy,
1853 const int league_rank_begin,
1854 const int league_rank_end,
char *local_buffer,
1855 const std::size_t local_buffer_size,
1856 reference_type update) {
1858 for (
int league_rank = league_rank_begin; league_rank < league_rank_end;
1861 Member(policy, 0, league_rank, local_buffer, local_buffer_size),
// Entry point: hands this object to dispatch_execute_task.
1867 void execute()
const { dispatch_execute_task(
this); }
// Performs the team-policy reduction.
//
// Each worker thread owns a buffer slot of value_size + m_shared bytes. The
// first value_size bytes hold that thread's partial reduction value; the
// bytes after it (buffer.get(t) + value_size) are passed to the functor as
// the team's local buffer of size m_shared.
//
// Steps visible in this function:
//   1. Initialise every slot's reduction value with ValueInit::init.
//   2. Run every league rank (or chunk of ranks), accumulating into the slot
//      of the hardware thread that executes it.
//   3. Join slots 1..num_worker_threads-1 into slot 0, apply FunctorFinal and
//      copy the result into m_result_ptr.
// Steps 1 and 2 have three variants selected by KOKKOS_HPX_IMPLEMENTATION.
1869 inline void execute_task()
const {
1870 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1871 const std::size_t value_size =
1872 Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
1874 thread_buffer &buffer = Kokkos::Experimental::HPX::impl_get_buffer();
1875 buffer.resize(num_worker_threads, value_size + m_shared);
// Variant 0: HPX parallel for_loop for both initialisation and league ranks.
1877 #if KOKKOS_HPX_IMPLEMENTATION == 0
1878 using hpx::parallel::for_loop;
1879 using hpx::parallel::execution::par;
1881 for_loop(par, 0, num_worker_threads, [
this, &buffer](
const std::size_t t) {
1882 ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1883 reinterpret_cast<pointer_type>(buffer.get(t)));
1886 using hpx::parallel::execution::static_chunk_size;
1888 hpx::parallel::for_loop(
1889 par.with(static_chunk_size(m_policy.chunk_size())), 0,
1890 m_policy.league_size(),
1891 [
this, &buffer, value_size](
const int league_rank) {
// t indexes the buffer slot of the executing hardware thread.
1892 std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
1893 reference_type update = ValueOps::reference(
1894 reinterpret_cast<pointer_type>(buffer.get(t)));
1896 execute_functor<WorkTag>(m_functor, m_policy, league_rank,
1897 buffer.get(t) + value_size, m_shared,
// Variant 1: explicit tasks launched with apply() on a
// ChunkedRoundRobinExecutor; a latch is counted down by each task and waited
// on by the caller.
1901 #elif KOKKOS_HPX_IMPLEMENTATION == 1
1903 using hpx::lcos::local::latch;
1906 latch num_tasks_remaining(num_worker_threads);
1907 ChunkedRoundRobinExecutor exec(num_worker_threads);
1909 for (
int t = 0; t < num_worker_threads; ++t) {
1910 apply(exec, [
this, &buffer, &num_tasks_remaining, t]() {
1911 ValueInit::init(ReducerConditional::select(m_functor, m_reducer),
1912 reinterpret_cast<pointer_type>(buffer.get(t)));
1914 num_tasks_remaining.count_down(1);
1918 num_tasks_remaining.wait();
// Number of chunks: ceil(league_size / chunk_size).
1921 const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) /
1922 m_policy.chunk_size();
1923 latch num_tasks_remaining(num_tasks);
1924 ChunkedRoundRobinExecutor exec(num_tasks);
1926 for (
int league_rank_begin = 0; league_rank_begin < m_policy.league_size();
1927 league_rank_begin += m_policy.chunk_size()) {
1928 apply(exec, [
this, &buffer, &num_tasks_remaining, league_rank_begin,
1930 std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
1931 reference_type update =
1932 ValueOps::reference(reinterpret_cast<pointer_type>(buffer.get(t)));
// Clamp the chunk so the last one does not run past league_size.
1933 const int league_rank_end = (std::min)(
1934 league_rank_begin + m_policy.chunk_size(), m_policy.league_size());
1935 execute_functor_range<WorkTag>(
1936 m_functor, m_policy, league_rank_begin, league_rank_end,
1937 buffer.get(t) + value_size, m_shared, update);
1939 num_tasks_remaining.count_down(1);
1943 num_tasks_remaining.wait();
// Variant 2: HPX parallel loops run on a ChunkedRoundRobinExecutor with a
// static chunk size of 1; chunking is done by striding over league ranks.
1945 #elif KOKKOS_HPX_IMPLEMENTATION == 2
1946 using hpx::parallel::for_loop;
1947 using hpx::parallel::for_loop_strided;
1948 using hpx::parallel::execution::par;
1949 using hpx::parallel::execution::static_chunk_size;
1952 ChunkedRoundRobinExecutor exec(num_worker_threads);
1954 for_loop(par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
1955 [
this, &buffer](std::size_t
const t) {
1957 ReducerConditional::select(m_functor, m_reducer),
1958 reinterpret_cast<pointer_type>(buffer.get(t)));
1962 const int num_tasks = (m_policy.league_size() + m_policy.chunk_size() - 1) /
1963 m_policy.chunk_size();
1964 ChunkedRoundRobinExecutor exec(num_tasks);
1967 par.on(exec).with(static_chunk_size(1)), 0, m_policy.league_size(),
1968 m_policy.chunk_size(),
1969 [
this, &buffer, value_size](
int const league_rank_begin) {
1970 std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
1971 reference_type update = ValueOps::reference(
1972 reinterpret_cast<pointer_type>(buffer.get(t)));
1973 const int league_rank_end =
1974 (std::min)(league_rank_begin + m_policy.chunk_size(),
1975 m_policy.league_size());
1976 execute_functor_range<WorkTag>(
1977 m_functor, m_policy, league_rank_begin, league_rank_end,
1978 buffer.get(t) + value_size, m_shared, update);
// Serial combine: fold every other thread's partial result into slot 0.
1982 const pointer_type ptr =
reinterpret_cast<pointer_type
>(buffer.get(0));
1983 for (
int t = 1; t < num_worker_threads; ++t) {
1984 ValueJoin::join(ReducerConditional::select(m_functor, m_reducer), ptr,
1985 reinterpret_cast<pointer_type>(buffer.get(t)));
1988 Kokkos::Impl::FunctorFinal<ReducerTypeFwd, WorkTagFwd>::final(
1989 ReducerConditional::select(m_functor, m_reducer), ptr);
// Copy the value_count elements of the final value to the result location.
1992 const int n = Analysis::value_count(
1993 ReducerConditional::select(m_functor, m_reducer));
1995 for (
int j = 0; j < n; ++j) {
1996 m_result_ptr[j] = ptr[j];
// Constructor taking the result as a View. It is enabled only when ViewType is
// a Kokkos view and ReducerType is not a reducer type.
// m_reducer is set to InvalidType(); m_shared is the sum of both scratch sizes
// (levels 0 and 1) and the functor's team shared-memory size for the policy's
// team size; m_force_synchronous is set when the view has no tracking record.
2001 template <
class ViewType>
2003 const FunctorType &arg_functor,
const Policy &arg_policy,
2004 const ViewType &arg_result,
2005 typename std::enable_if<Kokkos::is_view<ViewType>::value &&
2006 !Kokkos::is_reducer_type<ReducerType>::value,
2007 void *>::type =
nullptr)
2008 : m_functor(arg_functor),
2009 m_league(arg_policy.league_size()),
2010 m_policy(arg_policy),
2011 m_reducer(InvalidType()),
2012 m_result_ptr(arg_result.data()),
2013 m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
2014 FunctorTeamShmemSize<FunctorType>::value(
2015 m_functor, arg_policy.team_size())),
2016 m_force_synchronous(!arg_result.impl_track().has_record()) {}
// Constructor taking a reducer. The result pointer and the m_force_synchronous
// flag are derived from the reducer's view; m_shared is the sum of both
// scratch sizes (levels 0 and 1) and the functor's team shared-memory size
// for the policy's team size.
2018 inline ParallelReduce(
const FunctorType &arg_functor, Policy arg_policy,
2019 const ReducerType &reducer)
2020 : m_functor(arg_functor),
2021 m_league(arg_policy.league_size()),
2022 m_policy(arg_policy),
2024 m_result_ptr(reducer.view().data()),
2025 m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
2026 FunctorTeamShmemSize<FunctorType>::value(
2027 arg_functor, arg_policy.team_size())),
2028 m_force_synchronous(!reducer.view().impl_track().has_record()) {}
// Builds a TeamThreadRangeBoundariesStruct for an HPX team member from a
// single count argument.
// NOTE(review): presumably describes the range [0, count) — confirm.
2035 template <
typename iType>
2036 KOKKOS_INLINE_FUNCTION
2037 Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2038 TeamThreadRange(
const Impl::HPXTeamMember &thread,
const iType &count) {
2039 return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
// Builds a TeamThreadRangeBoundariesStruct for an HPX team member from
// explicit bounds. The two bound types may differ; both are converted to
// their std::common_type before the struct is constructed.
2043 template <
typename iType1,
typename iType2>
2044 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
2045 typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember>
2046 TeamThreadRange(
const Impl::HPXTeamMember &thread,
const iType1 &i_begin,
2047 const iType2 &i_end) {
2048 using iType =
typename std::common_type<iType1, iType2>::type;
2049 return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2050 thread, iType(i_begin), iType(i_end));
// Count-based TeamVectorRange for an HPX team member. It returns the same
// boundaries type as TeamThreadRange (TeamThreadRangeBoundariesStruct).
// NOTE(review): presumably describes the range [0, count) — confirm.
2053 template <
typename iType>
2054 KOKKOS_INLINE_FUNCTION
2055 Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2056 TeamVectorRange(
const Impl::HPXTeamMember &thread,
const iType &count) {
2057 return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
// TeamVectorRange with explicit bounds for an HPX team member. Both bounds
// are converted to their std::common_type, and the result uses the same
// boundaries type as TeamThreadRange (TeamThreadRangeBoundariesStruct).
2061 template <
typename iType1,
typename iType2>
2062 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
2063 typename std::common_type<iType1, iType2>::type, Impl::HPXTeamMember>
2064 TeamVectorRange(
const Impl::HPXTeamMember &thread,
const iType1 &i_begin,
2065 const iType2 &i_end) {
2066 using iType =
typename std::common_type<iType1, iType2>::type;
2067 return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2068 thread, iType(i_begin), iType(i_end));
// Factory returning a ThreadVectorRangeBoundariesStruct for an HPX team
// member.
// NOTE(review): presumably the count-based ThreadVectorRange overload —
// confirm name and arguments.
2071 template <
typename iType>
2072 KOKKOS_INLINE_FUNCTION
2073 Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2075 return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
// Factory returning a ThreadVectorRangeBoundariesStruct built from a team
// member and explicit begin/end bounds (thread, i_begin, i_end).
// NOTE(review): presumably the two-bound ThreadVectorRange overload — confirm
// name.
2079 template <
typename iType>
2080 KOKKOS_INLINE_FUNCTION
2081 Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2083 const iType &i_end) {
2084 return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2085 thread, i_begin, i_end);
// Wraps an HPX team member in a ThreadSingleStruct, the tag type accepted by
// the ThreadSingleStruct overloads of single().
2088 KOKKOS_INLINE_FUNCTION
2089 Impl::ThreadSingleStruct<Impl::HPXTeamMember> PerTeam(
2090 const Impl::HPXTeamMember &thread) {
2091 return Impl::ThreadSingleStruct<Impl::HPXTeamMember>(thread);
// Wraps an HPX team member in a VectorSingleStruct, the tag type accepted by
// the VectorSingleStruct overloads of single().
2094 KOKKOS_INLINE_FUNCTION
2095 Impl::VectorSingleStruct<Impl::HPXTeamMember> PerThread(
2096 const Impl::HPXTeamMember &thread) {
2097 return Impl::VectorSingleStruct<Impl::HPXTeamMember>(thread);
// Nested loop over a TeamThreadRange of an HPX team member: iterates from
// loop_boundaries.start up to (excluding) loop_boundaries.end in steps of
// loop_boundaries.increment.
// NOTE(review): presumably parallel_for, invoking lambda(i) per iteration —
// confirm.
2105 template <
typename iType,
class Lambda>
2107 const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2109 const Lambda &lambda) {
2110 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2111 i += loop_boundaries.increment)
// Nested reduction over a TeamThreadRange into a plain value. result is reset
// to a value-initialised ValueType before the loop, which runs from
// loop_boundaries.start up to (excluding) loop_boundaries.end in steps of
// loop_boundaries.increment.
// NOTE(review): presumably parallel_reduce, invoking lambda(i, result) per
// iteration — confirm.
2121 template <
typename iType,
class Lambda,
typename ValueType>
2123 const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2125 const Lambda &lambda, ValueType &result) {
2126 result = ValueType();
2127 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2128 i += loop_boundaries.increment) {
// Nested loop over a ThreadVectorRange of an HPX team member: iterates from
// loop_boundaries.start up to (excluding) loop_boundaries.end in steps of
// loop_boundaries.increment. A KOKKOS_ENABLE_PRAGMA_IVDEP-guarded section
// precedes the loop.
// NOTE(review): presumably parallel_for, invoking lambda(i) per iteration —
// confirm.
2138 template <
typename iType,
class Lambda>
2140 const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2142 const Lambda &lambda) {
2143 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2146 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2147 i += loop_boundaries.increment) {
// Nested reduction over a ThreadVectorRange into a plain value. result is
// reset to a value-initialised ValueType before the loop; a
// KOKKOS_ENABLE_PRAGMA_IVDEP-guarded section precedes the loop.
// NOTE(review): presumably parallel_reduce, invoking lambda(i, result) per
// iteration — confirm.
2158 template <
typename iType,
class Lambda,
typename ValueType>
2160 const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2162 const Lambda &lambda, ValueType &result) {
2163 result = ValueType();
2164 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2167 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2168 i += loop_boundaries.increment) {
// Nested reduction over a TeamThreadRange using a reducer object. The
// reducer's referenced value is initialised with reducer.init, then the
// lambda is called as lambda(i, reducer.reference()) for every i from
// loop_boundaries.start up to (excluding) loop_boundaries.end in steps of
// loop_boundaries.increment.
2173 template <
typename iType,
class Lambda,
typename ReducerType>
2175 const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2177 const Lambda &lambda,
const ReducerType &reducer) {
2178 reducer.init(reducer.reference());
2179 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2180 i += loop_boundaries.increment) {
2181 lambda(i, reducer.reference());
// Nested reduction over a ThreadVectorRange using a reducer object. The
// reducer's referenced value is initialised with reducer.init, then the
// lambda is called as lambda(i, reducer.reference()) for every i from
// loop_boundaries.start up to (excluding) loop_boundaries.end in steps of
// loop_boundaries.increment. A KOKKOS_ENABLE_PRAGMA_IVDEP-guarded section
// precedes the loop.
2185 template <
typename iType,
class Lambda,
typename ReducerType>
2187 const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2189 const Lambda &lambda,
const ReducerType &reducer) {
2190 reducer.init(reducer.reference());
2191 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2194 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2195 i += loop_boundaries.increment) {
2196 lambda(i, reducer.reference());
// Nested scan over a TeamThreadRange of an HPX team member.
//
// Runs in two passes over the same range. The first pass calls the lambda
// with final == false to accumulate into scan_val, which starts
// value-initialised. scan_val is then replaced by the result of the team
// member's team_scan(scan_val), and the second pass calls the lambda with
// final == true starting from that value.
2200 template <
typename iType,
class FunctorType>
2201 KOKKOS_INLINE_FUNCTION
void parallel_scan(
2202 Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
const
2204 const FunctorType &lambda) {
// The scanned value type is deduced from the lambda via FunctorAnalysis.
2205 using value_type =
typename Kokkos::Impl::FunctorAnalysis<
2206 Kokkos::Impl::FunctorPatternInterface::SCAN, void,
2207 FunctorType>::value_type;
2209 value_type scan_val = value_type();
// Pass 1: accumulate without finalising.
2212 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2213 i += loop_boundaries.increment) {
2214 lambda(i, scan_val,
false);
2218 scan_val = loop_boundaries.thread.team_scan(scan_val);
// Pass 2: final pass starting from the team_scan result.
2220 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2221 i += loop_boundaries.increment) {
2222 lambda(i, scan_val,
true);
// Nested scan over a ThreadVectorRange of an HPX team member.
//
// A single pass: scan_val starts value-initialised and the lambda is called
// with final == true for every i from loop_boundaries.start up to (excluding)
// loop_boundaries.end in steps of loop_boundaries.increment. A
// KOKKOS_ENABLE_PRAGMA_IVDEP-guarded section precedes the loop.
2237 template <
typename iType,
class FunctorType>
2238 KOKKOS_INLINE_FUNCTION
void parallel_scan(
2239 const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2241 const FunctorType &lambda) {
// The scanned value type is deduced from the lambda via FunctorValueTraits.
2242 using ValueTraits = Kokkos::Impl::FunctorValueTraits<FunctorType, void>;
2243 using value_type =
typename ValueTraits::value_type;
2245 value_type scan_val = value_type();
2247 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2250 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2251 i += loop_boundaries.increment) {
2252 lambda(i, scan_val,
true);
// single() overload for a VectorSingleStruct (the type returned by PerThread).
// The struct argument is unnamed and therefore unused.
// NOTE(review): presumably just invokes lambda() — confirm.
2256 template <
class FunctorType>
2257 KOKKOS_INLINE_FUNCTION
void single(
2258 const Impl::VectorSingleStruct<Impl::HPXTeamMember> &,
2259 const FunctorType &lambda) {
// single() overload for a ThreadSingleStruct (the type returned by PerTeam).
// The struct argument is unnamed and therefore unused.
// NOTE(review): presumably just invokes lambda() — confirm.
2263 template <
class FunctorType>
2264 KOKKOS_INLINE_FUNCTION
void single(
2265 const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &,
2266 const FunctorType &lambda) {
// single() overload selected by a VectorSingleStruct<HPXTeamMember> tag that
// additionally takes a value by non-const reference.
// The tag parameter is unnamed, so it can only serve to pick this overload.
// NOTE(review): the function body (line 2274 onward) is missing from this
// extract; presumably `val` is handed to lambda as an output - confirm.
2270 template <
class FunctorType,
class ValueType>
2271 KOKKOS_INLINE_FUNCTION
void single(
2272 const Impl::VectorSingleStruct<Impl::HPXTeamMember> &,
2273 const FunctorType &lambda, ValueType &val) {
// single() overload selected by a ThreadSingleStruct<HPXTeamMember> tag that
// additionally takes a value by non-const reference.
// The tag parameter is unnamed, so it can only serve to pick this overload.
// NOTE(review): the function body (line 2281 onward) is missing from this
// extract; presumably `val` is handed to lambda as an output - confirm.
2277 template <
class FunctorType,
class ValueType>
2278 KOKKOS_INLINE_FUNCTION
void single(
2279 const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &,
2280 const FunctorType &lambda, ValueType &val) {
2286 #include <HPX/Kokkos_HPX_Task.hpp>
KOKKOS_INLINE_FUNCTION size_type acquire() const
Acquire a value such that 0 <= value < size().
KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct< iType, TeamMemberType > TeamThreadRange(const TeamMemberType &, const iType &count)=delete
Execution policy for parallel work over the threads within a team.
void print_configuration(std::ostream &, const bool detail=false)
Print "Bill of Materials".
Scratch memory space associated with an execution space.
Memory management for host memory.
Declaration of various MemoryLayout options.
UniqueToken(execution_space const &=execution_space())
Create an object sized for the concurrency of the given execution space instance.
std::enable_if< std::is_same< typename Kokkos::View< T, P...>::array_layout, Kokkos::LayoutLeft >::value||std::is_same< typename Kokkos::View< T, P...>::array_layout, Kokkos::LayoutRight >::value >::type resize(Kokkos::View< T, P...> &v, const size_t n0=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n2=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n3=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n4=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n5=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6=KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7=KOKKOS_IMPL_CTOR_DEFAULT_ARG)
Resize a view, copying the old data to the new data at the corresponding indices.
Declaration of parallel operators.
void parallel_reduce(const std::string &label, const PolicyType &policy, const FunctorType &functor, ReturnType &return_value, typename std::enable_if< Kokkos::Impl::is_execution_policy< PolicyType >::value >::type *=nullptr)
Parallel reduction.
KOKKOS_INLINE_FUNCTION void release(size_type) const
Release a value previously obtained from acquire().
KOKKOS_INLINE_FUNCTION size_type size() const
upper bound for acquired values, i.e. 0 <= value < size()
KOKKOS_INLINE_FUNCTION_DELETED Impl::TeamThreadRangeBoundariesStruct< iType, TeamMemberType > TeamVectorRange(const TeamMemberType &, const iType &count)=delete
Execution policy for parallel work over the threads within a team.
void parallel_for(const ExecPolicy &policy, const FunctorType &functor, const std::string &str="", typename std::enable_if< Kokkos::Impl::is_execution_policy< ExecPolicy >::value >::type *=nullptr)
Execute functor in parallel according to the execution policy.
Execution policy for work over a range of an integral type.
KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct< iType, TeamMemberType > ThreadVectorRange(const TeamMemberType &, const iType &count)=delete
Execution policy for a vector parallel loop.