17 #ifndef KOKKOS_IMPL_PUBLIC_INCLUDE
18 #include <Kokkos_Macros.hpp>
20 "Including non-public Kokkos header files is not allowed.");
22 #ifndef KOKKOS_HPX_HPP
23 #define KOKKOS_HPX_HPP
25 #include <Kokkos_Macros.hpp>
26 #if defined(KOKKOS_ENABLE_HPX)
28 #include <Kokkos_Core_fwd.hpp>
30 #include <Kokkos_HostSpace.hpp>
34 #ifdef KOKKOS_ENABLE_HBWSPACE
35 #include <Kokkos_HBWSpace.hpp>
38 #include <Kokkos_HostSpace.hpp>
39 #include <Kokkos_Layout.hpp>
40 #include <Kokkos_MemoryTraits.hpp>
41 #include <Kokkos_Parallel.hpp>
42 #include <Kokkos_ScratchSpace.hpp>
43 #include <Kokkos_TaskScheduler.hpp>
44 #include <impl/Kokkos_ConcurrentBitset.hpp>
45 #include <impl/Kokkos_FunctorAnalysis.hpp>
46 #include <impl/Kokkos_Tools.hpp>
47 #include <impl/Kokkos_TaskQueue.hpp>
48 #include <impl/Kokkos_InitializationSettings.hpp>
50 #include <KokkosExp_MDRangePolicy.hpp>
52 #include <hpx/local/algorithm.hpp>
53 #include <hpx/local/barrier.hpp>
54 #include <hpx/local/condition_variable.hpp>
55 #include <hpx/local/execution.hpp>
56 #include <hpx/local/future.hpp>
57 #include <hpx/local/init.hpp>
58 #include <hpx/local/mutex.hpp>
59 #include <hpx/local/runtime.hpp>
60 #include <hpx/local/thread.hpp>
62 #include <Kokkos_UniqueToken.hpp>
68 #include <type_traits>
81 #ifndef KOKKOS_HPX_IMPLEMENTATION
82 #define KOKKOS_HPX_IMPLEMENTATION 1
85 #if (KOKKOS_HPX_IMPLEMENTATION < 0) || (KOKKOS_HPX_IMPLEMENTATION > 1)
86 #error "You have chosen an invalid value for KOKKOS_HPX_IMPLEMENTATION"
113 class thread_buffer {
114 static constexpr std::size_t m_cache_line_size = 64;
116 std::size_t m_num_threads;
117 std::size_t m_size_per_thread;
118 std::size_t m_size_total;
121 void pad_to_cache_line(std::size_t &size) {
122 size = ((size + m_cache_line_size - 1) / m_cache_line_size) *
129 m_size_per_thread(0),
132 thread_buffer(
const std::size_t num_threads,
133 const std::size_t size_per_thread) {
134 resize(num_threads, size_per_thread);
136 ~thread_buffer() {
delete[] m_data; }
138 thread_buffer(
const thread_buffer &) =
delete;
139 thread_buffer(thread_buffer &&) =
delete;
140 thread_buffer &operator=(
const thread_buffer &) =
delete;
141 thread_buffer &operator=(thread_buffer) =
delete;
143 void resize(
const std::size_t num_threads,
144 const std::size_t size_per_thread) {
145 m_num_threads = num_threads;
146 m_size_per_thread = size_per_thread;
148 pad_to_cache_line(m_size_per_thread);
150 std::size_t size_total_new = m_num_threads * m_size_per_thread;
152 if (m_size_total < size_total_new) {
154 m_data =
new char[size_total_new];
155 m_size_total = size_total_new;
159 char *
get(std::size_t thread_num) {
160 assert(thread_num < m_num_threads);
161 if (m_data ==
nullptr) {
164 return &m_data[thread_num * m_size_per_thread];
167 std::size_t size_per_thread() const noexcept {
return m_size_per_thread; }
168 std::size_t size_total() const noexcept {
return m_size_total; }
172 namespace Experimental {
// Identifier used for the default instance; m_instance_id is initialized
// from this value.
static constexpr uint32_t impl_default_instance_id() {
  return uint32_t{1};
}
178 static bool m_hpx_initialized;
179 uint32_t m_instance_id = impl_default_instance_id();
181 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
182 static std::atomic<uint32_t> m_next_instance_id;
185 enum class instance_mode { default_, independent };
188 static uint32_t m_active_parallel_region_count;
189 static hpx::spinlock m_active_parallel_region_count_mutex;
190 static hpx::condition_variable_any m_active_parallel_region_count_cond;
192 struct instance_data {
193 instance_data() =
default;
194 instance_data(hpx::shared_future<void> future) : m_future(future) {}
195 Kokkos::Impl::thread_buffer m_buffer;
196 hpx::shared_future<void> m_future = hpx::make_ready_future<void>();
197 hpx::spinlock m_future_mutex;
200 mutable std::shared_ptr<instance_data> m_independent_instance_data;
201 static instance_data m_default_instance_data;
203 std::reference_wrapper<Kokkos::Impl::thread_buffer> m_buffer;
204 std::reference_wrapper<hpx::shared_future<void>> m_future;
205 std::reference_wrapper<hpx::spinlock> m_future_mutex;
207 static Kokkos::Impl::thread_buffer m_default_buffer;
211 using execution_space = HPX;
212 using memory_space = HostSpace;
213 using device_type = Kokkos::Device<execution_space, memory_space>;
214 using array_layout = LayoutRight;
215 using size_type = memory_space::size_type;
216 using scratch_memory_space = ScratchMemorySpace<HPX>;
218 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
221 : m_instance_id(impl_default_instance_id()),
222 m_buffer(m_default_instance_data.m_buffer),
223 m_future(m_default_instance_data.m_future),
224 m_future_mutex(m_default_instance_data.m_future_mutex) {}
226 HPX(instance_mode mode)
227 : m_instance_id(mode == instance_mode::independent
228 ? m_next_instance_id++
229 : impl_default_instance_id()),
230 m_independent_instance_data(mode == instance_mode::independent
231 ? (new instance_data())
233 m_buffer(mode == instance_mode::independent
234 ? m_independent_instance_data->m_buffer
235 : m_default_instance_data.m_buffer),
236 m_future(mode == instance_mode::independent
237 ? m_independent_instance_data->m_future
238 : m_default_instance_data.m_future),
239 m_future_mutex(mode == instance_mode::independent
240 ? m_independent_instance_data->m_future_mutex
241 : m_default_instance_data.m_future_mutex) {}
243 HPX(hpx::shared_future<void> future)
244 : m_instance_id(m_next_instance_id++),
246 m_independent_instance_data(new instance_data(future)),
247 m_buffer(m_independent_instance_data->m_buffer),
248 m_future(m_independent_instance_data->m_future),
249 m_future_mutex(m_independent_instance_data->m_future_mutex) {}
251 HPX(HPX &&other) =
default;
252 HPX &operator=(HPX &&other) =
default;
253 HPX(
const HPX &other) =
default;
254 HPX &operator=(
const HPX &other) =
default;
259 void print_configuration(std::ostream &os,
bool =
false)
const {
260 os <<
"HPX backend\n";
261 os <<
"HPX Execution Space:\n";
262 os <<
" KOKKOS_ENABLE_HPX: yes\n";
263 os <<
"\nHPX Runtime Configuration:\n";
265 uint32_t impl_instance_id() const noexcept {
return m_instance_id; }
267 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
268 static bool in_parallel(HPX
const &instance = HPX()) noexcept {
269 return !instance.impl_get_future().is_ready();
272 static bool in_parallel(HPX
const & = HPX()) noexcept {
return false; }
275 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
276 static void impl_decrement_active_parallel_region_count() {
277 std::unique_lock<hpx::spinlock> l(m_active_parallel_region_count_mutex);
278 if (--m_active_parallel_region_count == 0) {
280 m_active_parallel_region_count_cond.notify_all();
284 static void impl_increment_active_parallel_region_count() {
285 std::unique_lock<hpx::spinlock> l(m_active_parallel_region_count_mutex);
286 ++m_active_parallel_region_count;
291 const std::string &name =
292 "Kokkos::Experimental::HPX::fence: Unnamed Instance Fence")
const {
293 Kokkos::Tools::Experimental::Impl::profile_fence_event<
294 Kokkos::Experimental::HPX>(
296 Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{
299 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
300 impl_get_future().wait();
303 impl_get_future() = hpx::make_ready_future<void>();
308 static void impl_static_fence(
const std::string &name) {
309 Kokkos::Tools::Experimental::Impl::profile_fence_event<
310 Kokkos::Experimental::HPX>(
312 Kokkos::Tools::Experimental::SpecialSynchronizationCases::
313 GlobalDeviceSynchronization,
315 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
316 std::unique_lock<hpx::spinlock> l(
317 m_active_parallel_region_count_mutex);
318 m_active_parallel_region_count_cond.wait(
319 l, [&]() {
return m_active_parallel_region_count == 0; });
324 HPX().impl_get_future() = hpx::make_ready_future<void>();
329 static hpx::execution::parallel_executor impl_get_executor() {
330 return hpx::execution::parallel_executor();
333 static bool is_asynchronous(HPX
const & = HPX()) noexcept {
334 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
341 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_3
342 template <
typename F>
343 KOKKOS_DEPRECATED
static void partition_master(
344 F
const &,
int requested_num_partitions = 0,
int = 0) {
345 if (requested_num_partitions > 1) {
347 "Kokkos::Experimental::HPX::partition_master: can't partition an "
353 static int concurrency();
354 static void impl_initialize(InitializationSettings
const &);
355 static bool impl_is_initialized() noexcept;
356 static
void impl_finalize();
358 static
int impl_thread_pool_size() noexcept {
359 hpx::runtime *rt = hpx::get_runtime_ptr();
363 if (hpx::threads::get_self_ptr() ==
nullptr) {
364 return hpx::resource::get_thread_pool(0).get_os_thread_count();
366 return hpx::this_thread::get_pool()->get_os_thread_count();
371 static int impl_thread_pool_rank() noexcept {
372 hpx::runtime *rt = hpx::get_runtime_ptr();
376 if (hpx::threads::get_self_ptr() ==
nullptr) {
379 return hpx::this_thread::get_pool()->get_pool_index();
384 static int impl_thread_pool_size(
int depth) {
386 return impl_thread_pool_size();
392 static int impl_max_hardware_threads() noexcept {
393 return hpx::threads::hardware_concurrency();
396 static int impl_hardware_thread_id() noexcept {
397 return hpx::get_worker_thread_num();
400 Kokkos::Impl::thread_buffer &impl_get_buffer() const noexcept {
401 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
402 return m_buffer.get();
404 return m_default_buffer;
408 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
409 hpx::shared_future<void> &impl_get_future() const noexcept {
413 hpx::spinlock &impl_get_future_mutex() const noexcept {
414 return m_future_mutex;
418 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
419 struct [[nodiscard]] reset_on_exit_parallel {
421 reset_on_exit_parallel(HPX
const &space) : m_space(space) {}
422 ~reset_on_exit_parallel() {
425 m_space.m_independent_instance_data.reset();
427 HPX::impl_decrement_active_parallel_region_count();
435 struct [[nodiscard]] reset_count_on_exit_parallel {
436 reset_count_on_exit_parallel() =
default;
437 ~reset_count_on_exit_parallel() {
438 HPX::impl_decrement_active_parallel_region_count();
442 struct [[nodiscard]] reset_on_exit_parallel {
443 reset_on_exit_parallel(HPX
const &) =
default;
444 ~reset_on_exit_parallel() =
default;
447 struct [[nodiscard]] reset_count_on_exit_parallel {
448 reset_count_on_exit_parallel() =
default;
449 ~reset_count_on_exit_parallel() =
default;
// Label identifying this execution space.
static constexpr const char *name() noexcept {
  constexpr const char *label = "HPX";
  return label;
}
456 friend bool operator==(HPX
const &lhs, HPX
const &rhs) {
457 return lhs.m_instance_id == rhs.m_instance_id;
459 friend bool operator!=(HPX
const &lhs, HPX
const &rhs) {
460 return !(lhs == rhs);
466 namespace Experimental {
468 struct DeviceTypeTraits<Kokkos::Experimental::HPX> {
469 static constexpr DeviceType
id = DeviceType::HPX;
470 static int device_id(
const Kokkos::Experimental::HPX &) {
return 0; }
477 #if defined(KOKKOS_ENABLE_HPX_ASYNC_DISPATCH)
478 template <
typename Closure>
479 inline void dispatch_execute_task(Closure *closure,
480 Kokkos::Experimental::HPX
const &instance,
481 bool force_synchronous =
false) {
482 Kokkos::Experimental::HPX::impl_increment_active_parallel_region_count();
484 Closure closure_copy = *closure;
487 std::unique_lock<hpx::spinlock> l(instance.impl_get_future_mutex());
488 hpx::util::ignore_lock(&instance.impl_get_future_mutex());
489 hpx::shared_future<void> &fut = instance.impl_get_future();
491 fut = fut.then(hpx::execution::parallel_executor(
492 hpx::threads::thread_schedule_hint(0)),
493 [closure_copy](hpx::shared_future<void> &&) {
494 return closure_copy.execute_task();
498 if (force_synchronous) {
500 "Kokkos::Experimental::Impl::HPX::dispatch_execute_task: fence due to "
501 "forced syncronizations");
505 template <
typename Closure>
506 inline void dispatch_execute_task(Closure *closure,
507 Kokkos::Experimental::HPX
const &,
509 closure->execute_task();
518 struct MemorySpaceAccess<Kokkos::Experimental::HPX::memory_space,
519 Kokkos::Experimental::HPX::scratch_memory_space> {
520 enum :
bool { assignable =
false };
521 enum :
bool { accessible =
true };
522 enum :
bool { deepcopy =
false };
529 namespace Experimental {
531 class UniqueToken<HPX, UniqueTokenScope::Instance> {
535 buffer_type m_buffer_view;
536 uint32_t
volatile *m_buffer;
539 using execution_space = HPX;
540 using size_type = int;
545 UniqueToken(execution_space
const & = execution_space()) noexcept
546 : m_count(execution_space::impl_max_hardware_threads()),
547 m_buffer_view(buffer_type()),
550 UniqueToken(size_type max_size, execution_space
const & = execution_space())
551 : m_count(max_size > execution_space::impl_max_hardware_threads()
552 ? execution_space::impl_max_hardware_threads()
555 max_size > execution_space::impl_max_hardware_threads()
557 : buffer_type(
"UniqueToken::m_buffer_view",
558 ::Kokkos::Impl::concurrent_bitset::buffer_bound(
560 m_buffer(m_buffer_view.data()) {}
563 KOKKOS_INLINE_FUNCTION
564 int size() const noexcept {
return m_count; }
567 KOKKOS_INLINE_FUNCTION
570 if (m_buffer ==
nullptr) {
571 return execution_space::impl_hardware_thread_id();
573 const ::Kokkos::pair<int, int> result =
574 ::Kokkos::Impl::concurrent_bitset::acquire_bounded(
575 m_buffer, m_count, ::Kokkos::Impl::clock_tic() % m_count);
577 if (result.first < 0) {
579 "UniqueToken<HPX> failure to acquire tokens, no tokens "
585 KOKKOS_IF_ON_DEVICE((return 0;))
589 KOKKOS_INLINE_FUNCTION
590 void release(
int i) const noexcept {
591 KOKKOS_IF_ON_HOST((
if (m_buffer !=
nullptr) {
592 ::Kokkos::Impl::concurrent_bitset::release(m_buffer, i);
595 KOKKOS_IF_ON_DEVICE(((
void)i;))
602 using execution_space = HPX;
603 using size_type = int;
604 UniqueToken(execution_space
const & = execution_space()) noexcept {}
609 int size() const noexcept {
return HPX::impl_max_hardware_threads(); }
610 int acquire() const noexcept {
return HPX::impl_hardware_thread_id(); }
611 void release(
int) const noexcept {}
619 struct HPXTeamMember {
621 using execution_space = Kokkos::Experimental::HPX;
622 using scratch_memory_space =
624 using team_handle = HPXTeamMember;
627 scratch_memory_space m_team_shared;
635 KOKKOS_INLINE_FUNCTION
636 const scratch_memory_space &team_shmem()
const {
637 return m_team_shared.set_team_thread_mode(0, 1, 0);
640 KOKKOS_INLINE_FUNCTION
641 const execution_space::scratch_memory_space &team_scratch(
const int)
const {
642 return m_team_shared.set_team_thread_mode(0, 1, 0);
645 KOKKOS_INLINE_FUNCTION
646 const execution_space::scratch_memory_space &thread_scratch(
const int)
const {
647 return m_team_shared.set_team_thread_mode(0, team_size(), team_rank());
650 KOKKOS_INLINE_FUNCTION
int league_rank() const noexcept {
651 return m_league_rank;
654 KOKKOS_INLINE_FUNCTION
int league_size() const noexcept {
655 return m_league_size;
658 KOKKOS_INLINE_FUNCTION
int team_rank() const noexcept {
return m_team_rank; }
659 KOKKOS_INLINE_FUNCTION
int team_size() const noexcept {
return m_team_size; }
661 template <
class... Properties>
662 constexpr KOKKOS_INLINE_FUNCTION HPXTeamMember(
663 const TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
665 const int team_rank,
const int league_rank,
void *scratch,
666 size_t scratch_size) noexcept
667 : m_team_shared(scratch, scratch_size, scratch, scratch_size),
668 m_league_size(policy.league_size()),
669 m_league_rank(league_rank),
670 m_team_size(policy.team_size()),
671 m_team_rank(team_rank) {}
673 KOKKOS_INLINE_FUNCTION
674 void team_barrier()
const {}
676 template <
class ValueType>
677 KOKKOS_INLINE_FUNCTION
void team_broadcast(ValueType &,
const int &)
const {}
679 template <
class Closure,
class ValueType>
680 KOKKOS_INLINE_FUNCTION
void team_broadcast(
const Closure &closure,
686 template <
class ValueType,
class JoinOp>
687 KOKKOS_INLINE_FUNCTION ValueType team_reduce(
const ValueType &value,
688 const JoinOp &)
const {
692 template <
class ReducerType>
693 KOKKOS_INLINE_FUNCTION std::enable_if_t<is_reducer<ReducerType>::value>
694 team_reduce(
const ReducerType &)
const {}
696 template <
typename Type>
697 KOKKOS_INLINE_FUNCTION Type
698 team_scan(
const Type &value, Type *
const global_accum =
nullptr)
const {
700 Kokkos::atomic_fetch_add(global_accum, value);
707 template <
class... Properties>
708 class TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>
709 :
public PolicyTraits<Properties...> {
712 std::size_t m_team_scratch_size[2];
713 std::size_t m_thread_scratch_size[2];
717 using traits = PolicyTraits<Properties...>;
720 using execution_policy = TeamPolicyInternal;
722 using member_type = HPXTeamMember;
725 using execution_space = Kokkos::Experimental::HPX;
729 template <
class FunctorType>
730 inline static int team_size_max(
const FunctorType &) {
734 template <
class FunctorType>
735 inline static int team_size_recommended(
const FunctorType &) {
739 template <
class FunctorType>
740 inline static int team_size_recommended(
const FunctorType &,
const int &) {
744 template <
class FunctorType>
745 int team_size_max(
const FunctorType &,
const ParallelForTag &)
const {
749 template <
class FunctorType>
750 int team_size_max(
const FunctorType &,
const ParallelReduceTag &)
const {
754 template <
class FunctorType,
class ReducerType>
755 int team_size_max(
const FunctorType &,
const ReducerType &,
756 const ParallelReduceTag &)
const {
760 template <
class FunctorType>
761 int team_size_recommended(
const FunctorType &,
const ParallelForTag &)
const {
765 template <
class FunctorType>
766 int team_size_recommended(
const FunctorType &,
767 const ParallelReduceTag &)
const {
771 template <
class FunctorType,
class ReducerType>
772 int team_size_recommended(
const FunctorType &,
const ReducerType &,
773 const ParallelReduceTag &)
const {
// Largest vector length reported by this policy: always 1.
static int vector_length_max() {
  constexpr int max_length = 1;
  return max_length;
}
// Vector length in use: always 1.
inline int impl_vector_length() noexcept {
  constexpr int length = 1;
  return length;
}
// Reports whether the team size is chosen automatically: always false.
inline bool impl_auto_team_size() noexcept {
  constexpr bool automatic = false;
  return automatic;
}
// Reports whether the vector length is chosen automatically: always false.
inline bool impl_auto_vector_length() noexcept {
  constexpr bool automatic = false;
  return automatic;
}
// Accepts a requested vector length and discards it; no state changes.
inline void impl_set_vector_length(int /*unused*/) noexcept {
}
// Accepts a requested team size and discards it; no state changes.
inline void impl_set_team_size(int /*unused*/) noexcept {
}
786 inline void init(
const int league_size_request,
const int team_size_request) {
787 m_league_size = league_size_request;
788 const int max_team_size = 1;
791 team_size_request > max_team_size ? max_team_size : team_size_request;
793 if (m_chunk_size > 0) {
794 if (!Impl::is_integral_power_of_two(m_chunk_size))
795 Kokkos::abort(
"TeamPolicy blocking granularity must be power of two");
797 int new_chunk_size = 1;
798 while (new_chunk_size * 4 * Kokkos::Experimental::HPX::concurrency() <
803 if (new_chunk_size < 128) {
805 while ((new_chunk_size * Kokkos::Experimental::HPX::concurrency() <
807 (new_chunk_size < 128))
811 m_chunk_size = new_chunk_size;
816 inline int team_size()
const {
return m_team_size; }
817 inline int league_size()
const {
return m_league_size; }
819 size_t scratch_size(
const int &level,
int team_size_ = -1)
const {
820 if (team_size_ < 0) {
821 team_size_ = m_team_size;
823 return m_team_scratch_size[level] +
824 team_size_ * m_thread_scratch_size[level];
827 inline static int scratch_size_max(
int level) {
828 return (level == 0 ? 1024 * 32 :
833 template <
class ExecSpace,
class... OtherProperties>
834 friend class TeamPolicyInternal;
836 const typename traits::execution_space &space()
const {
837 static typename traits::execution_space m_space;
841 template <
class... OtherProperties>
842 TeamPolicyInternal(
const TeamPolicyInternal<Kokkos::Experimental::HPX,
843 OtherProperties...> &p) {
844 m_league_size = p.m_league_size;
845 m_team_size = p.m_team_size;
846 m_team_scratch_size[0] = p.m_team_scratch_size[0];
847 m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
848 m_team_scratch_size[1] = p.m_team_scratch_size[1];
849 m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
850 m_chunk_size = p.m_chunk_size;
853 TeamPolicyInternal(
const typename traits::execution_space &,
854 int league_size_request,
int team_size_request,
856 : m_team_scratch_size{0, 0},
857 m_thread_scratch_size{0, 0},
859 init(league_size_request, team_size_request);
862 TeamPolicyInternal(
const typename traits::execution_space &,
863 int league_size_request,
const Kokkos::AUTO_t &,
865 : m_team_scratch_size{0, 0},
866 m_thread_scratch_size{0, 0},
868 init(league_size_request, 1);
871 TeamPolicyInternal(
const typename traits::execution_space &,
872 int league_size_request,
873 const Kokkos::AUTO_t &,
874 const Kokkos::AUTO_t & )
875 : m_team_scratch_size{0, 0},
876 m_thread_scratch_size{0, 0},
878 init(league_size_request, 1);
881 TeamPolicyInternal(
const typename traits::execution_space &,
882 int league_size_request,
int team_size_request,
883 const Kokkos::AUTO_t &
885 : m_team_scratch_size{0, 0},
886 m_thread_scratch_size{0, 0},
888 init(league_size_request, team_size_request);
891 TeamPolicyInternal(
int league_size_request,
892 const Kokkos::AUTO_t &,
893 const Kokkos::AUTO_t & )
894 : m_team_scratch_size{0, 0},
895 m_thread_scratch_size{0, 0},
897 init(league_size_request, 1);
900 TeamPolicyInternal(
int league_size_request,
int team_size_request,
901 const Kokkos::AUTO_t &
903 : m_team_scratch_size{0, 0},
904 m_thread_scratch_size{0, 0},
906 init(league_size_request, team_size_request);
909 TeamPolicyInternal(
int league_size_request,
int team_size_request,
911 : m_team_scratch_size{0, 0},
912 m_thread_scratch_size{0, 0},
914 init(league_size_request, team_size_request);
917 TeamPolicyInternal(
int league_size_request,
const Kokkos::AUTO_t &,
919 : m_team_scratch_size{0, 0},
920 m_thread_scratch_size{0, 0},
922 init(league_size_request, 1);
925 inline int chunk_size()
const {
return m_chunk_size; }
927 inline TeamPolicyInternal &set_chunk_size(
928 typename traits::index_type chunk_size_) {
929 m_chunk_size = chunk_size_;
933 inline TeamPolicyInternal &set_scratch_size(
const int &level,
934 const PerTeamValue &per_team) {
935 m_team_scratch_size[level] = per_team.value;
939 inline TeamPolicyInternal &set_scratch_size(
940 const int &level,
const PerThreadValue &per_thread) {
941 m_thread_scratch_size[level] = per_thread.value;
945 inline TeamPolicyInternal &set_scratch_size(
946 const int &level,
const PerTeamValue &per_team,
947 const PerThreadValue &per_thread) {
948 m_team_scratch_size[level] = per_team.value;
949 m_thread_scratch_size[level] = per_thread.value;
959 template <
typename Policy>
960 typename Policy::member_type get_hpx_adjusted_chunk_size(Policy
const &policy) {
961 const int concurrency = Kokkos::Experimental::HPX::concurrency();
962 const typename Policy::member_type n = policy.end() - policy.begin();
963 typename Policy::member_type new_chunk_size = policy.chunk_size();
965 while (n >= 4 * concurrency * new_chunk_size) {
969 return new_chunk_size;
972 template <
class FunctorType,
class... Traits>
973 class ParallelFor<FunctorType, Kokkos::RangePolicy<Traits...>,
974 Kokkos::Experimental::HPX> {
977 using WorkTag =
typename Policy::work_tag;
978 using WorkRange =
typename Policy::WorkRange;
979 using Member =
typename Policy::member_type;
981 const FunctorType m_functor;
982 const Policy m_policy;
984 template <
class TagType>
985 static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
986 const FunctorType &functor,
const Member i) {
990 template <
class TagType>
991 static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
992 const FunctorType &functor,
const Member i) {
997 template <
class TagType>
998 static std::enable_if_t<std::is_void<TagType>::value> execute_functor_range(
999 const FunctorType &functor,
const Member i_begin,
const Member i_end) {
1000 for (Member i = i_begin; i < i_end; ++i) {
1005 template <
class TagType>
1006 static std::enable_if_t<!std::is_void<TagType>::value> execute_functor_range(
1007 const FunctorType &functor,
const Member i_begin,
const Member i_end) {
1009 for (Member i = i_begin; i < i_end; ++i) {
1015 void execute()
const {
1016 Kokkos::Impl::dispatch_execute_task(
this, m_policy.space());
1019 void execute_task()
const {
1021 Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
1024 auto exec = Kokkos::Experimental::HPX::impl_get_executor();
1026 using hpx::execution::par;
1027 using hpx::execution::static_chunk_size;
1029 #if KOKKOS_HPX_IMPLEMENTATION == 0
1030 using hpx::for_loop;
1032 for_loop(par.on(exec).with(static_chunk_size(m_policy.chunk_size())),
1033 m_policy.begin(), m_policy.end(), [
this](
const Member i) {
1034 execute_functor<WorkTag>(m_functor, i);
1037 #elif KOKKOS_HPX_IMPLEMENTATION == 1
1038 using hpx::for_loop_strided;
1040 const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy);
1043 par.on(exec), m_policy.begin(), m_policy.end(), chunk_size,
1044 [
this, chunk_size](
const Member i_begin) {
1045 const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end());
1046 execute_functor_range<WorkTag>(m_functor, i_begin, i_end);
1051 inline ParallelFor(
const FunctorType &arg_functor, Policy arg_policy)
1052 : m_functor(arg_functor), m_policy(arg_policy) {}
1055 template <
class FunctorType,
class... Traits>
1056 class ParallelFor<FunctorType, Kokkos::MDRangePolicy<Traits...>,
1057 Kokkos::Experimental::HPX> {
1059 using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
1060 using Policy =
typename MDRangePolicy::impl_range_policy;
1061 using WorkTag =
typename MDRangePolicy::work_tag;
1062 using WorkRange =
typename Policy::WorkRange;
1063 using Member =
typename Policy::member_type;
1064 using iterate_type =
1065 typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
1068 const iterate_type m_iter;
1069 const Policy m_policy;
1072 void execute()
const { dispatch_execute_task(
this, m_iter.m_rp.space()); }
1074 inline void execute_task()
const {
1076 Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
1077 m_iter.m_rp.space());
1079 auto exec = Kokkos::Experimental::HPX::impl_get_executor();
1081 using hpx::execution::par;
1082 using hpx::execution::static_chunk_size;
1084 #if KOKKOS_HPX_IMPLEMENTATION == 0
1085 using hpx::for_loop;
1087 for_loop(par.on(exec).with(
1088 static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))),
1089 m_policy.begin(), m_policy.end(),
1090 [
this](
const Member i) { iterate_type(i); });
1092 #elif KOKKOS_HPX_IMPLEMENTATION == 1
1093 using hpx::for_loop_strided;
1095 const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy);
1097 for_loop_strided(par.on(exec), m_policy.begin(), m_policy.end(), chunk_size,
1098 [
this, chunk_size](
const Member i_begin) {
1099 const Member i_end =
1100 (std::min)(i_begin + chunk_size, m_policy.end());
1101 for (Member i = i_begin; i < i_end; ++i) {
1108 inline ParallelFor(
const FunctorType &arg_functor, MDRangePolicy arg_policy)
1109 : m_iter(arg_policy, arg_functor),
1110 m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)) {}
1111 template <
typename Policy,
typename Functor>
1112 static int max_tile_size_product(
const Policy &,
const Functor &) {
1126 template <
class FunctorType,
class ReducerType,
class... Traits>
1127 class ParallelReduce<FunctorType, Kokkos::RangePolicy<Traits...>, ReducerType,
1128 Kokkos::Experimental::HPX> {
1131 using WorkTag =
typename Policy::work_tag;
1132 using WorkRange =
typename Policy::WorkRange;
1133 using Member =
typename Policy::member_type;
1134 using ReducerConditional =
1135 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1136 FunctorType, ReducerType>;
1137 using ReducerTypeFwd =
typename ReducerConditional::type;
1139 FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
1140 using value_type =
typename Analysis::value_type;
1141 using pointer_type =
typename Analysis::pointer_type;
1142 using reference_type =
typename Analysis::reference_type;
1144 const FunctorType m_functor;
1145 const Policy m_policy;
1146 const ReducerType m_reducer;
1147 const pointer_type m_result_ptr;
1149 bool m_force_synchronous;
1151 template <
class TagType>
1152 inline static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
1153 const FunctorType &functor,
const Member i, reference_type update) {
1157 template <
class TagType>
1158 inline static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
1159 const FunctorType &functor,
const Member i, reference_type update) {
1161 functor(t, i, update);
1164 template <
class TagType>
1165 inline std::enable_if_t<std::is_void<TagType>::value> execute_functor_range(
1166 reference_type update,
const Member i_begin,
const Member i_end)
const {
1167 for (Member i = i_begin; i < i_end; ++i) {
1168 m_functor(i, update);
1172 template <
class TagType>
1173 inline std::enable_if_t<!std::is_void<TagType>::value> execute_functor_range(
1174 reference_type update,
const Member i_begin,
const Member i_end)
const {
1177 for (Member i = i_begin; i < i_end; ++i) {
1178 m_functor(t, i, update);
1182 class value_type_wrapper {
1184 std::size_t m_value_size;
1185 char *m_value_buffer;
1188 value_type_wrapper() : m_value_size(0), m_value_buffer(nullptr) {}
1190 value_type_wrapper(
const std::size_t value_size)
1191 : m_value_size(value_size), m_value_buffer(new char[m_value_size]) {}
1193 value_type_wrapper(
const value_type_wrapper &other)
1194 : m_value_size(0), m_value_buffer(nullptr) {
1195 if (
this != &other) {
1196 m_value_buffer =
new char[other.m_value_size];
1197 m_value_size = other.m_value_size;
1199 std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
1204 ~value_type_wrapper() {
delete[] m_value_buffer; }
1206 value_type_wrapper(value_type_wrapper &&other)
1207 : m_value_size(0), m_value_buffer(nullptr) {
1208 if (
this != &other) {
1209 m_value_buffer = other.m_value_buffer;
1210 m_value_size = other.m_value_size;
1212 other.m_value_buffer =
nullptr;
1213 other.m_value_size = 0;
1217 value_type_wrapper &operator=(
const value_type_wrapper &other) {
1218 if (
this != &other) {
1219 delete[] m_value_buffer;
1220 m_value_buffer =
new char[other.m_value_size];
1221 m_value_size = other.m_value_size;
1223 std::copy(other.m_value_buffer, other.m_value_buffer + m_value_size,
1230 value_type_wrapper &operator=(value_type_wrapper &&other) {
1231 if (
this != &other) {
1232 delete[] m_value_buffer;
1233 m_value_buffer = other.m_value_buffer;
1234 m_value_size = other.m_value_size;
1236 other.m_value_buffer =
nullptr;
1237 other.m_value_size = 0;
1243 pointer_type pointer()
const {
1244 return reinterpret_cast<pointer_type
>(m_value_buffer);
1247 reference_type reference()
const {
1248 return Analysis::Reducer::reference(
1249 reinterpret_cast<pointer_type>(m_value_buffer));
1254 void execute()
const {
1255 if (m_policy.end() <= m_policy.begin()) {
1257 typename Analysis::Reducer final_reducer(
1258 &ReducerConditional::select(m_functor, m_reducer));
1260 final_reducer.init(m_result_ptr);
1261 final_reducer.final(m_result_ptr);
1265 dispatch_execute_task(
this, m_policy.space(), m_force_synchronous);
// Body of the range-policy reduction. Two code paths are selected at
// preprocessing time by KOKKOS_HPX_IMPLEMENTATION (0 or 1); both end with
// final_value_ptr pointing at the combined value, which is then passed to
// final() and copied to m_result_ptr.
// NOTE(review): this listing drops some source lines (e.g. the call heads in
// front of the lambdas and closing braces); comments describe only the
// statements that are visible.
1268 inline void execute_task()
const {
1270 Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
1273 typename Analysis::Reducer final_reducer(
1274 &ReducerConditional::select(m_functor, m_reducer));
// Size of one reduction value as reported by Analysis::value_size().
1276 const std::size_t value_size =
1277 Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
1279 auto exec = Kokkos::Experimental::HPX::impl_get_executor();
1281 using hpx::for_loop;
1282 using hpx::execution::par;
1283 using hpx::execution::static_chunk_size;
1285 #if KOKKOS_HPX_IMPLEMENTATION == 0
// Variant 0: one for_loop over [begin, end) with an hpx::parallel::reduction
// clause. final_value and identity are both set up with the reducer's init();
// partial values are merged with final_reducer.join().
1290 using hpx::parallel::reduction;
1292 value_type_wrapper final_value(value_size);
1293 value_type_wrapper identity(value_size);
1295 final_reducer.init(final_value.pointer());
1296 final_reducer.init(identity.pointer());
1298 for_loop(par.on(exec).with(
1299 static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))),
1300 m_policy.begin(), m_policy.end(),
1301 reduction(final_value, identity,
1303 value_type_wrapper &a,
1304 value_type_wrapper &b) -> value_type_wrapper & {
1305 final_reducer.join(a.pointer(), b.pointer());
// Loop body: one functor call per index, accumulating into the reduction
// variable handed in by the reduction clause.
1308 [
this](Member i, value_type_wrapper &update) {
1309 execute_functor<WorkTag>(m_functor, i, update.reference());
1312 pointer_type final_value_ptr = final_value.pointer();
1314 #elif KOKKOS_HPX_IMPLEMENTATION == 1
// Variant 1: the execution space's thread_buffer is resized to one slot of
// value_size per worker thread; slots are initialised, filled chunk by chunk,
// and finally joined into slot 0.
1315 using hpx::for_loop_strided;
1317 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1319 thread_buffer &buffer = m_policy.space().impl_get_buffer();
1320 buffer.resize(num_worker_threads, value_size);
// Initialise every slot t in [0, num_worker_threads) with the reducer's init().
// NOTE(review): the call head owning these arguments is elided in this listing.
1323 par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
1324 [&buffer, final_reducer ](
const int t) noexcept {
1325 final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
1328 const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy);
// Each invocation handles [i_begin, min(i_begin + chunk_size, end)) and
// accumulates into the slot indexed by impl_hardware_thread_id().
1331 par.on(exec), m_policy.begin(), m_policy.end(), chunk_size,
1332 [
this, &buffer, chunk_size](
const Member i_begin) {
1333 reference_type update = Analysis::Reducer::reference(
1334 reinterpret_cast<pointer_type>(buffer.get(
1335 Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1336 const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end());
1337 execute_functor_range<WorkTag>(update, i_begin, i_end);
// Serial combine: slots 1..num_worker_threads-1 are joined into slot 0.
1340 for (
int i = 1; i < num_worker_threads; ++i) {
1341 final_reducer.join(reinterpret_cast<pointer_type>(buffer.get(0)),
1342 reinterpret_cast<pointer_type>(buffer.get(i)));
1345 pointer_type final_value_ptr =
1346 reinterpret_cast<pointer_type
>(buffer.get(0));
1349 final_reducer.final(final_value_ptr);
// Copy value_count entries to the caller's result storage, if one was given.
1351 if (m_result_ptr !=
nullptr) {
1352 const int n = Analysis::value_count(
1353 ReducerConditional::select(m_functor, m_reducer));
1355 for (
int j = 0; j < n; ++j) {
1356 m_result_ptr[j] = final_value_ptr[j];
// Constructor for a result delivered through a View when ReducerType is not a
// Kokkos reducer (both conditions are required by the enable_if_t parameter).
// m_reducer is set to InvalidType(), m_result_ptr to the view's data(), and
// m_force_synchronous is true when the view's tracker has no record.
// NOTE(review): the tail of the enable_if_t parameter (its type/default) is
// elided in this listing.
1361 template <
class ViewType>
1362 inline ParallelReduce(
1363 const FunctorType &arg_functor, Policy arg_policy,
1364 const ViewType &arg_view,
1365 std::enable_if_t<Kokkos::is_view<ViewType>::value &&
1366 !Kokkos::is_reducer<ReducerType>::value,
1368 : m_functor(arg_functor),
1369 m_policy(arg_policy),
1370 m_reducer(InvalidType()),
1371 m_result_ptr(arg_view.data()),
1372 m_force_synchronous(!arg_view.impl_track().has_record()) {}
// Constructor taking a reducer object. The result pointer and the
// force-synchronous flag are both derived from reducer.view(): data() for the
// pointer, and "tracker has no record" for the flag.
// NOTE(review): the m_reducer initializer is on a line elided from this
// listing.
1374 inline ParallelReduce(
const FunctorType &arg_functor, Policy arg_policy,
1375 const ReducerType &reducer)
1376 : m_functor(arg_functor),
1377 m_policy(arg_policy),
1379 m_result_ptr(reducer.view().data()),
1380 m_force_synchronous(!reducer.view().impl_track().has_record()) {}
// ParallelReduce specialization for Kokkos::MDRangePolicy on the
// Kokkos::Experimental::HPX execution space.
// NOTE(review): access specifiers and some blank/brace lines are elided in this
// listing.
1383 template <
class FunctorType,
class ReducerType,
class... Traits>
1384 class ParallelReduce<FunctorType, Kokkos::MDRangePolicy<Traits...>, ReducerType,
1385 Kokkos::Experimental::HPX> {
// Policy is the one-dimensional range policy type exposed by the MDRangePolicy;
// WorkTag comes from the MDRangePolicy, WorkRange/Member from Policy.
1387 using MDRangePolicy = Kokkos::MDRangePolicy<Traits...>;
1388 using Policy =
typename MDRangePolicy::impl_range_policy;
1389 using WorkTag =
typename MDRangePolicy::work_tag;
1390 using WorkRange =
typename Policy::WorkRange;
1391 using Member =
typename Policy::member_type;
// When ReducerType is InvalidType the functor itself supplies the reduction
// interface; otherwise the reducer does.
1392 using ReducerConditional =
1393 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1394 FunctorType, ReducerType>;
1395 using ReducerTypeFwd =
typename ReducerConditional::type;
1396 using Analysis = FunctorAnalysis<FunctorPatternInterface::REDUCE,
1397 MDRangePolicy, ReducerTypeFwd>;
1399 using pointer_type =
typename Analysis::pointer_type;
1400 using value_type =
typename Analysis::value_type;
1401 using reference_type =
typename Analysis::reference_type;
// Tile iterator type; it is constructed from (policy, functor) in the
// constructors and exposes them as m_rp / m_func in the methods of this class.
1402 using iterate_type =
1403 typename Kokkos::Impl::HostIterateTile<MDRangePolicy, FunctorType,
1404 WorkTag, reference_type>;
1406 const iterate_type m_iter;
1407 const Policy m_policy;
1408 const ReducerType m_reducer;
1409 const pointer_type m_result_ptr;
// Forwarded to dispatch_execute_task by execute().
1411 bool m_force_synchronous;
// Entry point: hands this object to dispatch_execute_task together with the
// execution space instance of the MDRange policy stored in m_iter and the
// m_force_synchronous flag.
1414 void execute()
const {
1415 dispatch_execute_task(
this, m_iter.m_rp.space(), m_force_synchronous);
// Body of the MDRange reduction. The execution space's thread_buffer is resized
// to one slot of value_size per worker thread; slots are initialised with the
// reducer's init(), filled while iterating over [m_policy.begin(),
// m_policy.end()), joined into slot 0, finalised, and copied to m_result_ptr.
// NOTE(review): this listing drops several lines (call heads in front of the
// lambdas, the statements that invoke m_iter, closing braces); comments cover
// only the visible statements.
1418 inline void execute_task()
const {
1420 Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
1421 m_iter.m_rp.space());
1423 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1424 const std::size_t value_size = Analysis::value_size(
1425 ReducerConditional::select(m_iter.m_func, m_reducer));
1427 thread_buffer &buffer = m_iter.m_rp.space().impl_get_buffer();
1428 buffer.resize(num_worker_threads, value_size);
1430 using hpx::for_loop;
1431 using hpx::execution::par;
1432 using hpx::execution::static_chunk_size;
1434 auto exec = Kokkos::Experimental::HPX::impl_get_executor();
1436 typename Analysis::Reducer final_reducer(
1437 &ReducerConditional::select(m_iter.m_func, m_reducer));
1439 #if KOKKOS_HPX_IMPLEMENTATION == 0
// Variant 0: initialise every slot, then run a for_loop over the 1-D range with
// the adjusted chunk size; each index uses the slot of the current hardware
// thread as its accumulator.
1442 par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
1443 [&buffer, final_reducer](std::size_t t) {
1444 final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
1447 for_loop(par.on(exec).with(
1448 static_chunk_size(get_hpx_adjusted_chunk_size(m_policy))),
1449 m_policy.begin(), m_policy.end(), [
this, &buffer](
const Member i) {
1450 reference_type update = Analysis::Reducer::reference(
1451 reinterpret_cast<pointer_type>(buffer.get(
1452 Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1456 #elif KOKKOS_HPX_IMPLEMENTATION == 1
// Variant 1: initialise every slot, then walk the 1-D range in steps of
// chunk_size; each invocation covers [i_begin, min(i_begin + chunk_size, end)).
1457 using hpx::for_loop_strided;
1460 par.on(exec).with(static_chunk_size(1)), std::size_t(0),
1461 num_worker_threads, [&buffer, final_reducer](
const std::size_t t) {
1462 final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
1465 const Member chunk_size = get_hpx_adjusted_chunk_size(m_policy);
1468 par.on(exec), m_policy.begin(), m_policy.end(), chunk_size,
1469 [
this, &buffer, chunk_size](
const Member i_begin) {
1470 reference_type update = Analysis::Reducer::reference(
1471 reinterpret_cast<pointer_type>(buffer.get(
1472 Kokkos::Experimental::HPX::impl_hardware_thread_id())));
1473 const Member i_end = (std::min)(i_begin + chunk_size, m_policy.end());
// NOTE(review): the loop body is elided in this listing.
1475 for (Member i = i_begin; i < i_end; ++i) {
// Serial combine: slots 1..num_worker_threads-1 are joined into slot 0.
1481 for (
int i = 1; i < num_worker_threads; ++i) {
1482 final_reducer.join(reinterpret_cast<pointer_type>(buffer.get(0)),
1483 reinterpret_cast<pointer_type>(buffer.get(i)));
1486 final_reducer.final(reinterpret_cast<pointer_type>(buffer.get(0)));
// Copy value_count entries from slot 0 to the caller's result storage, if any.
1488 if (m_result_ptr !=
nullptr) {
1489 const int n = Analysis::value_count(
1490 ReducerConditional::select(m_iter.m_func, m_reducer));
1492 for (
int j = 0; j < n; ++j) {
1493 m_result_ptr[j] =
reinterpret_cast<pointer_type
>(buffer.get(0))[j];
// Constructor for a result delivered through a View when ReducerType is not a
// Kokkos reducer (required by the enable_if_t parameter).
// m_policy is a one-dimensional Policy over [0, arg_policy.m_num_tiles) with
// chunk size 1; m_reducer is InvalidType(); m_force_synchronous is true when
// the view's tracker has no record.
// NOTE(review): the tail of the enable_if_t parameter is elided in this
// listing.
1498 template <
class ViewType>
1499 inline ParallelReduce(
1500 const FunctorType &arg_functor, MDRangePolicy arg_policy,
1501 const ViewType &arg_view,
1502 std::enable_if_t<Kokkos::is_view<ViewType>::value &&
1503 !Kokkos::is_reducer<ReducerType>::value,
1505 : m_iter(arg_policy, arg_functor),
1506 m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)),
1507 m_reducer(InvalidType()),
1508 m_result_ptr(arg_view.data()),
1509 m_force_synchronous(!arg_view.impl_track().has_record()) {}
// Constructor taking a reducer object. m_policy is a one-dimensional Policy
// over [0, arg_policy.m_num_tiles) with chunk size 1; the result pointer and
// the force-synchronous flag are derived from reducer.view().
// NOTE(review): the m_reducer initializer is on a line elided from this
// listing.
1511 inline ParallelReduce(
const FunctorType &arg_functor,
1512 MDRangePolicy arg_policy,
const ReducerType &reducer)
1513 : m_iter(arg_policy, arg_functor),
1514 m_policy(Policy(0, arg_policy.m_num_tiles).set_chunk_size(1)),
1516 m_result_ptr(reducer.view().data()),
1517 m_force_synchronous(!reducer.view().impl_track().has_record()) {}
// Static helper returning an int. Both parameters are unnamed, so the result
// cannot depend on their values.
// NOTE(review): the function body is elided in this listing; what value is
// returned cannot be stated from here.
1518 template <
typename Policy,
typename Functor>
1519 static int max_tile_size_product(
const Policy &,
const Functor &) {
// ParallelScan specialization for Kokkos::RangePolicy on the
// Kokkos::Experimental::HPX execution space.
// NOTE(review): the lines introducing the Policy and Analysis aliases, and the
// access specifiers, are elided in this listing.
1534 template <
class FunctorType,
class... Traits>
1535 class ParallelScan<FunctorType, Kokkos::RangePolicy<Traits...>,
1536 Kokkos::Experimental::HPX> {
1539 using WorkTag =
typename Policy::work_tag;
1540 using WorkRange =
typename Policy::WorkRange;
1541 using Member =
typename Policy::member_type;
// Right-hand side of an alias whose name is on an elided line; the value types
// below are taken from Analysis.
1543 FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
1544 using pointer_type =
typename Analysis::pointer_type;
1545 using reference_type =
typename Analysis::reference_type;
1546 using value_type =
typename Analysis::value_type;
1548 const FunctorType m_functor;
1549 const Policy m_policy;
// Overload selected when TagType is void: calls functor(i, update, final) for
// every i in [i_begin, i_end).
// NOTE(review): the declaration of the 'final' parameter is on a line elided
// from this listing.
1551 template <
class TagType>
1552 inline static std::enable_if_t<std::is_void<TagType>::value>
1553 execute_functor_range(
const FunctorType &functor,
const Member i_begin,
1554 const Member i_end, reference_type update,
1556 for (Member i = i_begin; i < i_end; ++i) {
1557 functor(i, update,
final);
// Overload selected when TagType is not void: calls functor(t, i, update,
// final) for every i in [i_begin, i_end).
// NOTE(review): the declarations of the 'final' parameter and of the tag
// object 't' are on lines elided from this listing.
1561 template <
class TagType>
1562 inline static std::enable_if_t<!std::is_void<TagType>::value>
1563 execute_functor_range(
const FunctorType &functor,
const Member i_begin,
1564 const Member i_end, reference_type update,
1567 for (Member i = i_begin; i < i_end; ++i) {
1568 functor(t, i, update,
final);
// Entry point: hands this object and the policy's execution space instance to
// dispatch_execute_task.
1573 void execute()
const { dispatch_execute_task(
this, m_policy.space()); }
// Body of the scan, run by num_worker_threads workers that synchronise on a
// barrier. Each worker's buffer slot holds 2 * value_size: the first half
// receives the worker's own partial value (update_sum), the second half
// (offset value_size) holds the base value used for the last pass
// (update_base).
// NOTE(review): this listing drops several lines, including the call head in
// front of the lambda, the guard selecting which worker runs the combine loop,
// the init call for slot 0's second half, and the 'final' arguments passed to
// execute_functor_range.
1575 inline void execute_task()
const {
1577 Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
1580 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1581 const int value_count = Analysis::value_count(m_functor);
1582 const std::size_t value_size = Analysis::value_size(m_functor);
1584 thread_buffer &buffer = m_policy.space().impl_get_buffer();
1585 buffer.resize(num_worker_threads, 2 * value_size);
1588 using hpx::for_loop;
1589 using hpx::execution::par;
1590 using hpx::execution::static_chunk_size;
1592 barrier<> bar(num_worker_threads);
1593 auto exec = Kokkos::Experimental::HPX::impl_get_executor();
1595 typename Analysis::Reducer final_reducer(&m_functor);
// One lambda invocation per worker index t in [0, num_worker_threads).
1598 par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
1599 [
this, &bar, &buffer, num_worker_threads, value_count, value_size,
1600 final_reducer](
int t) {
// First pass: initialise this worker's partial value and run the functor over
// the worker's share of the range.
1601 reference_type update_sum =
1602 final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
1604 const WorkRange range(m_policy, t, num_worker_threads);
1605 execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
1608 bar.arrive_and_wait();
1612 reinterpret_cast<pointer_type>(buffer.get(0) + value_size));
// Combine: for i = 1..n-1, slot i's second half becomes slot (i-1)'s second
// half joined with slot (i-1)'s first half.
1614 for (
int i = 1; i < num_worker_threads; ++i) {
1615 pointer_type ptr_1_prev =
1616 reinterpret_cast<pointer_type
>(buffer.get(i - 1));
1617 pointer_type ptr_2_prev =
reinterpret_cast<pointer_type
>(
1618 buffer.get(i - 1) + value_size);
1619 pointer_type ptr_2 =
1620 reinterpret_cast<pointer_type
>(buffer.get(i) + value_size);
1622 for (
int j = 0; j < value_count; ++j) {
1623 ptr_2[j] = ptr_2_prev[j];
1626 final_reducer.join(ptr_2, ptr_1_prev);
1630 bar.arrive_and_wait();
// Last pass: rerun the functor over the same range, starting from the base
// value stored in the second half of this worker's slot.
1632 reference_type update_base = Analysis::Reducer::reference(
1633 reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
1635 execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
// Stores copies of the functor and the policy; no other work is done here.
1640 inline ParallelScan(
const FunctorType &arg_functor,
const Policy &arg_policy)
1641 : m_functor(arg_functor), m_policy(arg_policy) {}
// ParallelScanWithTotal specialization for Kokkos::RangePolicy. In addition to
// the functor and policy it keeps m_result_ptr, through which execute_task
// writes a result value.
// NOTE(review): the remainder of the specialization's template argument list,
// the Policy and Analysis alias heads, and the access specifiers are elided in
// this listing.
1644 template <
class FunctorType,
class ReturnType,
class... Traits>
1645 class ParallelScanWithTotal<FunctorType, Kokkos::RangePolicy<Traits...>,
1649 using WorkTag =
typename Policy::work_tag;
1650 using WorkRange =
typename Policy::WorkRange;
1651 using Member =
typename Policy::member_type;
// Right-hand side of an alias whose name is on an elided line; the value types
// below are taken from Analysis.
1653 FunctorAnalysis<FunctorPatternInterface::SCAN, Policy, FunctorType>;
1654 using pointer_type =
typename Analysis::pointer_type;
1655 using reference_type =
typename Analysis::reference_type;
1656 using value_type =
typename Analysis::value_type;
1658 const FunctorType m_functor;
1659 const Policy m_policy;
1660 pointer_type m_result_ptr;
// Overload selected when TagType is void: calls functor(i, update, final) for
// every i in [i_begin, i_end).
// NOTE(review): the declaration of the 'final' parameter is on a line elided
// from this listing.
1662 template <
class TagType>
1663 inline static std::enable_if_t<std::is_void<TagType>::value>
1664 execute_functor_range(
const FunctorType &functor,
const Member i_begin,
1665 const Member i_end, reference_type update,
1667 for (Member i = i_begin; i < i_end; ++i) {
1668 functor(i, update,
final);
// Overload selected when TagType is not void: calls functor(t, i, update,
// final) for every i in [i_begin, i_end).
// NOTE(review): the declarations of the 'final' parameter and of the tag
// object 't' are on lines elided from this listing.
1672 template <
class TagType>
1673 inline static std::enable_if_t<!std::is_void<TagType>::value>
1674 execute_functor_range(
const FunctorType &functor,
const Member i_begin,
1675 const Member i_end, reference_type update,
1678 for (Member i = i_begin; i < i_end; ++i) {
1679 functor(t, i, update,
final);
// Entry point: hands this object and the policy's execution space instance to
// dispatch_execute_task.
1684 void execute()
const { dispatch_execute_task(
this, m_policy.space()); }
// Body of the scan-with-total, run by num_worker_threads workers that
// synchronise on a barrier. Each worker's buffer slot holds 2 * value_size: the
// first half receives the worker's own partial value (update_sum), the second
// half (offset value_size) holds the base value used for the last pass
// (update_base). After the last pass the worker with the highest index writes
// its update_base to *m_result_ptr.
// NOTE(review): this listing drops several lines, including the call head in
// front of the lambda, the guard selecting which worker runs the combine loop,
// the init call for slot 0's second half, and the 'final' arguments passed to
// execute_functor_range.
1686 inline void execute_task()
const {
1688 Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
1691 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1692 const int value_count = Analysis::value_count(m_functor);
1693 const std::size_t value_size = Analysis::value_size(m_functor);
1695 thread_buffer &buffer = m_policy.space().impl_get_buffer();
1696 buffer.resize(num_worker_threads, 2 * value_size);
1699 using hpx::for_loop;
1700 using hpx::execution::par;
1701 using hpx::execution::static_chunk_size;
1703 barrier<> bar(num_worker_threads);
1704 auto exec = Kokkos::Experimental::HPX::impl_get_executor();
1706 typename Analysis::Reducer final_reducer(&m_functor);
// One lambda invocation per worker index t in [0, num_worker_threads).
1709 par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
1710 [
this, &bar, &buffer, num_worker_threads, value_count, value_size,
1711 final_reducer](
int t) {
// First pass: initialise this worker's partial value and run the functor over
// the worker's share of the range.
1712 reference_type update_sum =
1713 final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
1715 const WorkRange range(m_policy, t, num_worker_threads);
1716 execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
1719 bar.arrive_and_wait();
1723 reinterpret_cast<pointer_type>(buffer.get(0) + value_size));
// Combine: for i = 1..n-1, slot i's second half becomes slot (i-1)'s second
// half joined with slot (i-1)'s first half.
1725 for (
int i = 1; i < num_worker_threads; ++i) {
1726 pointer_type ptr_1_prev =
1727 reinterpret_cast<pointer_type
>(buffer.get(i - 1));
1728 pointer_type ptr_2_prev =
reinterpret_cast<pointer_type
>(
1729 buffer.get(i - 1) + value_size);
1730 pointer_type ptr_2 =
1731 reinterpret_cast<pointer_type
>(buffer.get(i) + value_size);
1733 for (
int j = 0; j < value_count; ++j) {
1734 ptr_2[j] = ptr_2_prev[j];
1737 final_reducer.join(ptr_2, ptr_1_prev);
1741 bar.arrive_and_wait();
// Last pass: rerun the functor over the same range, starting from the base
// value stored in the second half of this worker's slot.
1743 reference_type update_base = Analysis::Reducer::reference(
1744 reinterpret_cast<pointer_type>(buffer.get(t) + value_size));
1746 execute_functor_range<WorkTag>(m_functor, range.begin(), range.end(),
// Only the last worker publishes its value through m_result_ptr.
1749 if (t == num_worker_threads - 1) {
1750 *m_result_ptr = update_base;
// Stores copies of the functor and the policy and takes the result pointer
// from arg_result_view.data().
// NOTE(review): the statement that owns the message string below is on lines
// elided from this listing; judging by the message it checks that the result
// view is host-accessible — confirm against the full source.
1755 template <
class ViewType>
1756 ParallelScanWithTotal(
const FunctorType &arg_functor,
1757 const Policy &arg_policy,
1758 const ViewType &arg_result_view)
1759 : m_functor(arg_functor),
1760 m_policy(arg_policy),
1761 m_result_ptr(arg_result_view.data()) {
1765 "Kokkos::HPX parallel_scan result must be host-accessible!");
// ParallelFor specialization for Kokkos::TeamPolicy on the
// Kokkos::Experimental::HPX execution space.
// NOTE(review): some member declarations (e.g. the m_league member initialised
// by the constructor) and the access specifiers are on lines elided from this
// listing.
1773 template <
class FunctorType,
class... Properties>
1774 class ParallelFor<FunctorType, Kokkos::TeamPolicy<Properties...>,
1775 Kokkos::Experimental::HPX> {
1777 using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
1778 using WorkTag =
typename Policy::work_tag;
1779 using Member =
typename Policy::member_type;
1782 const FunctorType m_functor;
1783 const Policy m_policy;
// Per-worker buffer size; the constructor sets it to the sum of both policy
// scratch sizes and the functor's team shared-memory size.
1785 const std::size_t m_shared;
// Overload selected when TagType is void: builds a team Member for
// league_rank (second constructor argument fixed to 0) on top of local_buffer
// and calls functor(member).
1787 template <
class TagType>
1788 inline static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
1789 const FunctorType &functor,
const Policy &policy,
const int league_rank,
1790 char *local_buffer,
const std::size_t local_buffer_size) {
1791 functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
// Overload selected when TagType is not void: builds a team Member for
// league_rank on top of local_buffer and calls functor(t, member).
// NOTE(review): the declaration of the tag object 't' is on a line elided from
// this listing.
1794 template <
class TagType>
1795 inline static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
1796 const FunctorType &functor,
const Policy &policy,
const int league_rank,
1797 char *local_buffer,
const std::size_t local_buffer_size) {
1799 functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size));
// Overload selected when TagType is void: for each league_rank starting at
// league_rank_begin and below league_rank_end, builds a team Member on top of
// local_buffer and calls functor(member).
// NOTE(review): the local_buffer parameter declaration and the loop increment
// are on lines elided from this listing.
1802 template <
class TagType>
1803 inline static std::enable_if_t<std::is_void<TagType>::value>
1804 execute_functor_range(
const FunctorType &functor,
const Policy &policy,
1805 const int league_rank_begin,
const int league_rank_end,
1807 const std::size_t local_buffer_size) {
1808 for (
int league_rank = league_rank_begin; league_rank < league_rank_end;
1810 functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size));
// Overload selected when TagType is not void: loops league_rank from
// league_rank_begin while below league_rank_end and builds a team Member on top
// of local_buffer for each rank.
// NOTE(review): the local_buffer parameter declaration, the tag object, the
// loop increment and the head of the functor call are on lines elided from
// this listing; presumably the call is functor(t, member) as in the
// single-rank overload — confirm against the full source.
1814 template <
class TagType>
1815 inline static std::enable_if_t<!std::is_void<TagType>::value>
1816 execute_functor_range(
const FunctorType &functor,
const Policy &policy,
1817 const int league_rank_begin,
const int league_rank_end,
1819 const std::size_t local_buffer_size) {
1821 for (
int league_rank = league_rank_begin; league_rank < league_rank_end;
1824 Member(policy, 0, league_rank, local_buffer, local_buffer_size));
// Entry point: hands this object and the policy's execution space instance to
// dispatch_execute_task.
1829 void execute()
const { dispatch_execute_task(
this, m_policy.space()); }
// Body of the team-policy parallel_for. The execution space's thread_buffer is
// resized to one slot of m_shared per worker thread, and each functor call
// receives the slot of the hardware thread it runs on as its local buffer.
// KOKKOS_HPX_IMPLEMENTATION selects between one call per league rank
// (variant 0) and one call per chunk of league ranks (variant 1).
// NOTE(review): the call heads in front of the lambdas and the trailing call
// arguments are on lines elided from this listing.
1831 inline void execute_task()
const {
1833 Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
1836 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1838 thread_buffer &buffer = m_policy.space().impl_get_buffer();
1839 buffer.resize(num_worker_threads, m_shared);
1841 auto exec = Kokkos::Experimental::HPX::impl_get_executor();
1843 using hpx::execution::par;
1844 using hpx::execution::static_chunk_size;
1846 #if KOKKOS_HPX_IMPLEMENTATION == 0
// Variant 0: iterate league ranks [0, league_size) with the policy's chunk size
// as the static chunk size.
1847 using hpx::for_loop;
1850 par.on(exec).with(static_chunk_size(m_policy.chunk_size())), 0,
1851 m_policy.league_size(), [
this, &buffer](
const int league_rank) {
1852 execute_functor<WorkTag>(
1853 m_functor, m_policy, league_rank,
1854 buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()),
1858 #elif KOKKOS_HPX_IMPLEMENTATION == 1
// Variant 1: step through [0, league_size) in strides of chunk_size; each
// invocation covers [league_rank_begin, min(begin + chunk_size, league_size)).
1859 using hpx::for_loop_strided;
1862 par.on(exec), 0, m_policy.league_size(), m_policy.chunk_size(),
1863 [
this, &buffer](
const int league_rank_begin) {
1864 const int league_rank_end =
1865 (std::min)(league_rank_begin + m_policy.chunk_size(),
1866 m_policy.league_size());
1867 execute_functor_range<WorkTag>(
1868 m_functor, m_policy, league_rank_begin, league_rank_end,
1869 buffer.get(Kokkos::Experimental::HPX::impl_hardware_thread_id()),
// Stores copies of the functor and policy, records the league size, and
// computes m_shared as scratch_size(0) + scratch_size(1) plus the functor's
// team shared-memory size for the policy's team size.
1875 ParallelFor(
const FunctorType &arg_functor,
const Policy &arg_policy)
1876 : m_functor(arg_functor),
1877 m_policy(arg_policy),
1878 m_league(arg_policy.league_size()),
1879 m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
1880 FunctorTeamShmemSize<FunctorType>::value(
1881 arg_functor, arg_policy.team_size())) {}
// ParallelReduce specialization for Kokkos::TeamPolicy on the
// Kokkos::Experimental::HPX execution space.
// NOTE(review): the Analysis alias head, some member declarations (e.g. the
// m_league member initialised by the constructors) and the access specifiers
// are on lines elided from this listing.
1884 template <
class FunctorType,
class ReducerType,
class... Properties>
1885 class ParallelReduce<FunctorType, Kokkos::TeamPolicy<Properties...>,
1886 ReducerType, Kokkos::Experimental::HPX> {
1888 using Policy = TeamPolicyInternal<Kokkos::Experimental::HPX, Properties...>;
1889 using Member =
typename Policy::member_type;
1890 using WorkTag =
typename Policy::work_tag;
// When ReducerType is InvalidType the functor itself supplies the reduction
// interface; otherwise the reducer does.
1891 using ReducerConditional =
1892 Kokkos::Impl::if_c<std::is_same<InvalidType, ReducerType>::value,
1893 FunctorType, ReducerType>;
1894 using ReducerTypeFwd =
typename ReducerConditional::type;
// Right-hand side of an alias whose name is on an elided line; the value types
// below are taken from Analysis.
1896 FunctorAnalysis<FunctorPatternInterface::REDUCE, Policy, ReducerTypeFwd>;
1897 using pointer_type =
typename Analysis::pointer_type;
1898 using reference_type =
typename Analysis::reference_type;
1899 using value_type =
typename Analysis::value_type;
1901 const FunctorType m_functor;
1903 const Policy m_policy;
1904 const ReducerType m_reducer;
1905 pointer_type m_result_ptr;
// Team scratch size per worker; computed in the constructors from the policy's
// scratch sizes and the functor's team shared-memory size.
1906 const std::size_t m_shared;
1908 bool m_force_synchronous;
// Overload selected when TagType is void: builds a team Member for league_rank
// on top of local_buffer and calls the functor with it.
// NOTE(review): the remaining call arguments are on a line elided from this
// listing; presumably 'update' is passed as the accumulator — confirm.
1910 template <
class TagType>
1911 inline static std::enable_if_t<std::is_void<TagType>::value> execute_functor(
1912 const FunctorType &functor,
const Policy &policy,
const int league_rank,
1913 char *local_buffer,
const std::size_t local_buffer_size,
1914 reference_type update) {
1915 functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
// Overload selected when TagType is not void: builds a team Member for
// league_rank on top of local_buffer and calls the functor with the tag object
// first.
// NOTE(review): the declaration of 't' and the remaining call arguments are on
// lines elided from this listing; presumably 'update' is passed as the
// accumulator — confirm.
1919 template <
class TagType>
1920 inline static std::enable_if_t<!std::is_void<TagType>::value> execute_functor(
1921 const FunctorType &functor,
const Policy &policy,
const int league_rank,
1922 char *local_buffer,
const std::size_t local_buffer_size,
1923 reference_type update) {
1925 functor(t, Member(policy, 0, league_rank, local_buffer, local_buffer_size),
// Overload selected when TagType is void: loops league_rank from
// league_rank_begin while below league_rank_end, building a team Member on top
// of local_buffer for each rank and calling the functor with it.
// NOTE(review): the loop increment and the remaining call arguments are on
// lines elided from this listing; presumably 'update' is passed as the
// accumulator — confirm.
1929 template <
class TagType>
1930 inline static std::enable_if_t<std::is_void<TagType>::value>
1931 execute_functor_range(
const FunctorType &functor,
const Policy &policy,
1932 const int league_rank_begin,
const int league_rank_end,
1933 char *local_buffer,
const std::size_t local_buffer_size,
1934 reference_type update) {
1935 for (
int league_rank = league_rank_begin; league_rank < league_rank_end;
1937 functor(Member(policy, 0, league_rank, local_buffer, local_buffer_size),
// Overload selected when TagType is not void: loops league_rank from
// league_rank_begin while below league_rank_end and builds a team Member on top
// of local_buffer for each rank.
// NOTE(review): the tag object, the loop increment, the head of the functor
// call and its trailing arguments are on lines elided from this listing;
// presumably the call is functor(t, member, update) — confirm against the full
// source.
1942 template <
class TagType>
1943 inline static std::enable_if_t<!std::is_void<TagType>::value>
1944 execute_functor_range(
const FunctorType &functor,
const Policy &policy,
1945 const int league_rank_begin,
const int league_rank_end,
1946 char *local_buffer,
const std::size_t local_buffer_size,
1947 reference_type update) {
1949 for (
int league_rank = league_rank_begin; league_rank < league_rank_end;
1952 Member(policy, 0, league_rank, local_buffer, local_buffer_size),
// Entry point of the team-policy reduction. When league_size() * team_size()
// is zero the visible statements apply the reducer's init() and then final()
// to m_result_ptr; otherwise the work is handed to dispatch_execute_task.
// NOTE(review): the lines between the statements shown here are elided in this
// listing, so any null check on m_result_ptr and the return after the empty
// case cannot be confirmed from here.
// NOTE(review): unlike the RangePolicy and MDRangePolicy
// ParallelReduce::execute() bodies in this file, m_force_synchronous is not
// passed to dispatch_execute_task here, although both constructors set it —
// confirm this is intended.
1958 void execute()
const {
1959 if (m_policy.league_size() * m_policy.team_size() == 0) {
1961 typename Analysis::Reducer final_reducer(
1962 &ReducerConditional::select(m_functor, m_reducer));
1963 final_reducer.init(m_result_ptr);
1964 final_reducer.final(m_result_ptr);
1968 dispatch_execute_task(
this, m_policy.space());
// Body of the team-policy reduction. The execution space's thread_buffer is
// resized to one slot of (value_size + m_shared) per worker thread: the start
// of a slot holds that worker's partial reduction value, and the address
// value_size past the slot start is handed to the functor as its local buffer
// of size m_shared. Slots are initialised with the reducer's init(), filled
// per league rank (variant 0) or per chunk of league ranks (variant 1), joined
// into slot 0, finalised, and copied to m_result_ptr.
// NOTE(review): this listing drops several lines (call heads in front of the
// lambdas, the declaration of 't' in variant 0, the guard around the final
// copy, closing braces); comments cover only the visible statements.
1971 inline void execute_task()
const {
1973 Kokkos::Experimental::HPX::reset_on_exit_parallel reset_on_exit(
1976 const int num_worker_threads = Kokkos::Experimental::HPX::concurrency();
1977 const std::size_t value_size =
1978 Analysis::value_size(ReducerConditional::select(m_functor, m_reducer));
1980 thread_buffer &buffer = m_policy.space().impl_get_buffer();
1981 buffer.resize(num_worker_threads, value_size + m_shared);
1983 auto exec = Kokkos::Experimental::HPX::impl_get_executor();
1985 using hpx::for_loop;
1986 using hpx::execution::par;
1987 using hpx::execution::static_chunk_size;
1989 typename Analysis::Reducer final_reducer(
1990 &ReducerConditional::select(m_functor, m_reducer));
1992 #if KOKKOS_HPX_IMPLEMENTATION == 0
// Variant 0: initialise every slot, then one functor call per league rank.
1995 par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
1996 [&buffer, final_reducer](
const std::size_t t) {
1997 final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
2000 for_loop(par.on(exec).with(static_chunk_size(m_policy.chunk_size())), 0,
2001 m_policy.league_size(),
2002 [
this, &buffer, value_size](
const int league_rank) {
2004 Kokkos::Experimental::HPX::impl_hardware_thread_id();
2005 reference_type update = Analysis::Reducer::reference(
2006 reinterpret_cast<pointer_type>(buffer.get(t)));
2008 execute_functor<WorkTag>(m_functor, m_policy, league_rank,
2009 buffer.get(t) + value_size, m_shared,
2013 #elif KOKKOS_HPX_IMPLEMENTATION == 1
// Variant 1: initialise every slot, then step through [0, league_size) in
// strides of chunk_size; each invocation covers
// [league_rank_begin, min(begin + chunk_size, league_size)).
2014 using hpx::for_loop_strided;
2017 par.on(exec).with(static_chunk_size(1)), 0, num_worker_threads,
2018 [&buffer, final_reducer](std::size_t
const t) {
2019 final_reducer.init(reinterpret_cast<pointer_type>(buffer.get(t)));
2023 par.on(exec), 0, m_policy.league_size(), m_policy.chunk_size(),
2024 [
this, &buffer, value_size](
int const league_rank_begin) {
// The slot is chosen by the hardware thread executing this chunk.
2025 std::size_t t = Kokkos::Experimental::HPX::impl_hardware_thread_id();
2026 reference_type update = Analysis::Reducer::reference(
2027 reinterpret_cast<pointer_type>(buffer.get(t)));
2028 const int league_rank_end =
2029 (std::min)(league_rank_begin + m_policy.chunk_size(),
2030 m_policy.league_size());
2031 execute_functor_range<WorkTag>(
2032 m_functor, m_policy, league_rank_begin, league_rank_end,
2033 buffer.get(t) + value_size, m_shared, update);
// Serial combine: slots 1..num_worker_threads-1 are joined into slot 0.
2037 const pointer_type ptr =
reinterpret_cast<pointer_type
>(buffer.get(0));
2038 for (
int t = 1; t < num_worker_threads; ++t) {
2039 final_reducer.join(ptr, reinterpret_cast<pointer_type>(buffer.get(t)));
2042 final_reducer.final(ptr);
// Copy value_count entries from slot 0 to m_result_ptr.
2045 const int n = Analysis::value_count(
2046 ReducerConditional::select(m_functor, m_reducer));
2048 for (
int j = 0; j < n; ++j) {
2049 m_result_ptr[j] = ptr[j];
// Constructor for a result delivered through a View when ReducerType is not a
// Kokkos reducer (required by the enable_if_t parameter).
// m_reducer is InvalidType(); m_result_ptr is the view's data(); m_shared is
// scratch_size(0) + scratch_size(1) plus the functor's team shared-memory size
// for the policy's team size; m_force_synchronous is true when the view's
// tracker has no record.
// NOTE(review): the tail of the enable_if_t parameter is elided in this
// listing.
2054 template <
class ViewType>
2055 ParallelReduce(
const FunctorType &arg_functor,
const Policy &arg_policy,
2056 const ViewType &arg_result,
2057 std::enable_if_t<Kokkos::is_view<ViewType>::value &&
2058 !Kokkos::is_reducer<ReducerType>::value,
2060 : m_functor(arg_functor),
2061 m_league(arg_policy.league_size()),
2062 m_policy(arg_policy),
2063 m_reducer(InvalidType()),
2064 m_result_ptr(arg_result.data()),
// Note: the shared-memory size is queried on the stored copy m_functor here.
2065 m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
2066 FunctorTeamShmemSize<FunctorType>::value(
2067 m_functor, arg_policy.team_size())),
2068 m_force_synchronous(!arg_result.impl_track().has_record()) {}
// Constructor taking a reducer object. The result pointer and the
// force-synchronous flag are derived from reducer.view(); m_shared is
// scratch_size(0) + scratch_size(1) plus the functor's team shared-memory size
// for the policy's team size.
// NOTE(review): the m_reducer initializer is on a line elided from this
// listing.
2070 inline ParallelReduce(
const FunctorType &arg_functor, Policy arg_policy,
2071 const ReducerType &reducer)
2072 : m_functor(arg_functor),
2073 m_league(arg_policy.league_size()),
2074 m_policy(arg_policy),
2076 m_result_ptr(reducer.view().data()),
2077 m_shared(arg_policy.scratch_size(0) + arg_policy.scratch_size(1) +
2078 FunctorTeamShmemSize<FunctorType>::value(
2079 arg_functor, arg_policy.team_size())),
2080 m_force_synchronous(!reducer.view().impl_track().has_record()) {}
// Builds a TeamThreadRangeBoundariesStruct<iType, HPXTeamMember> for the given
// team member and count.
// NOTE(review): the constructor arguments are on a line elided from this
// listing.
2087 template <
typename iType>
2088 KOKKOS_INLINE_FUNCTION
2089 Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2090 TeamThreadRange(
const Impl::HPXTeamMember &thread,
const iType &count) {
2091 return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
// Two-bound overload: both bounds are converted to their common type iType and
// passed, together with the team member, to the
// TeamThreadRangeBoundariesStruct constructor.
2095 template <
typename iType1,
typename iType2>
2096 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
2097 std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
2098 TeamThreadRange(
const Impl::HPXTeamMember &thread,
const iType1 &i_begin,
2099 const iType2 &i_end) {
2100 using iType = std::common_type_t<iType1, iType2>;
2101 return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2102 thread, iType(i_begin), iType(i_end));
// Builds a range object for the given team member and count. The return type
// is TeamThreadRangeBoundariesStruct, i.e. the same type TeamThreadRange
// returns.
// NOTE(review): the constructor arguments are on a line elided from this
// listing.
2105 template <
typename iType>
2106 KOKKOS_INLINE_FUNCTION
2107 Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2108 TeamVectorRange(
const Impl::HPXTeamMember &thread,
const iType &count) {
2109 return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
// Two-bound overload: both bounds are converted to their common type iType and
// passed, together with the team member, to the
// TeamThreadRangeBoundariesStruct constructor (the same type TeamThreadRange
// returns).
2113 template <
typename iType1,
typename iType2>
2114 KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct<
2115 std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
2116 TeamVectorRange(
const Impl::HPXTeamMember &thread,
const iType1 &i_begin,
2117 const iType2 &i_end) {
2118 using iType = std::common_type_t<iType1, iType2>;
2119 return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2120 thread, iType(i_begin), iType(i_end));
// Builds a ThreadVectorRangeBoundariesStruct<iType, HPXTeamMember> for the
// given team member and count.
// NOTE(review): the constructor arguments are on a line elided from this
// listing.
2123 template <
typename iType>
2124 KOKKOS_INLINE_FUNCTION
2125 Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2126 ThreadVectorRange(
const Impl::HPXTeamMember &thread,
const iType &count) {
2127 return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
// Two-bound overload: both bounds are converted to their common type iType and
// passed, together with the team member, to the
// ThreadVectorRangeBoundariesStruct constructor.
2131 template <
typename iType1,
typename iType2>
2132 KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct<
2133 std::common_type_t<iType1, iType2>, Impl::HPXTeamMember>
2134 ThreadVectorRange(
const Impl::HPXTeamMember &thread,
const iType1 &i_begin,
2135 const iType2 &i_end) {
2136 using iType = std::common_type_t<iType1, iType2>;
2137 return Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>(
2138 thread, iType(i_begin), iType(i_end));
// Wraps the team member in a ThreadSingleStruct, the tag type accepted by the
// ThreadSingleStruct overloads of single() in this file.
2141 KOKKOS_INLINE_FUNCTION
2142 Impl::ThreadSingleStruct<Impl::HPXTeamMember> PerTeam(
2143 const Impl::HPXTeamMember &thread) {
2144 return Impl::ThreadSingleStruct<Impl::HPXTeamMember>(thread);
// Wraps the team member in a VectorSingleStruct, the tag type accepted by the
// VectorSingleStruct overloads of single() in this file.
2147 KOKKOS_INLINE_FUNCTION
2148 Impl::VectorSingleStruct<Impl::HPXTeamMember> PerThread(
2149 const Impl::HPXTeamMember &thread) {
2150 return Impl::VectorSingleStruct<Impl::HPXTeamMember>(thread);
// Nested parallel_for over a TeamThreadRange: a plain loop from
// loop_boundaries.start up to (excluding) loop_boundaries.end in steps of
// loop_boundaries.increment.
// NOTE(review): the parameter name line and the loop body are elided in this
// listing; presumably the body calls lambda(i) — confirm.
2158 template <
typename iType,
class Lambda>
2159 KOKKOS_INLINE_FUNCTION
void parallel_for(
2160 const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2162 const Lambda &lambda) {
2163 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2164 i += loop_boundaries.increment)
// Nested parallel_reduce over a TeamThreadRange into a plain value: result is
// first reset to ValueType(), then a loop runs from loop_boundaries.start up to
// (excluding) loop_boundaries.end in steps of loop_boundaries.increment.
// NOTE(review): the parameter name line and the loop body are elided in this
// listing; presumably the body calls lambda(i, result) — confirm.
2174 template <
typename iType,
class Lambda,
typename ValueType>
2175 KOKKOS_INLINE_FUNCTION
void parallel_reduce(
2176 const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2178 const Lambda &lambda, ValueType &result) {
2179 result = ValueType();
2180 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2181 i += loop_boundaries.increment) {
// Nested parallel_for over a ThreadVectorRange: a plain loop from
// loop_boundaries.start up to (excluding) loop_boundaries.end in steps of
// loop_boundaries.increment.
// NOTE(review): the parameter name line, the contents of the
// KOKKOS_ENABLE_PRAGMA_IVDEP section and the loop body are elided in this
// listing; presumably the body calls lambda(i) — confirm.
2191 template <
typename iType,
class Lambda>
2192 KOKKOS_INLINE_FUNCTION
void parallel_for(
2193 const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2195 const Lambda &lambda) {
2196 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2199 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2200 i += loop_boundaries.increment) {
// Nested parallel_reduce over a ThreadVectorRange into a plain value: result
// is first reset to ValueType(), then a loop runs from loop_boundaries.start up
// to (excluding) loop_boundaries.end in steps of loop_boundaries.increment.
// NOTE(review): the parameter name line, the contents of the
// KOKKOS_ENABLE_PRAGMA_IVDEP section and the loop body are elided in this
// listing; presumably the body calls lambda(i, result) — confirm.
2211 template <
typename iType,
class Lambda,
typename ValueType>
2212 KOKKOS_INLINE_FUNCTION
void parallel_reduce(
2213 const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2215 const Lambda &lambda, ValueType &result) {
2216 result = ValueType();
2217 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2220 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2221 i += loop_boundaries.increment) {
// Nested parallel_reduce over a TeamThreadRange with a reducer object: the
// reducer's own storage (reducer.reference()) is initialised with
// reducer.init() and then passed to lambda(i, ...) for every i from
// loop_boundaries.start up to (excluding) loop_boundaries.end in steps of
// loop_boundaries.increment.
// NOTE(review): the parameter name line is elided in this listing.
2226 template <
typename iType,
class Lambda,
typename ReducerType>
2227 KOKKOS_INLINE_FUNCTION
void parallel_reduce(
2228 const Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2230 const Lambda &lambda,
const ReducerType &reducer) {
2231 reducer.init(reducer.reference());
2232 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2233 i += loop_boundaries.increment) {
2234 lambda(i, reducer.reference());
// Nested parallel_reduce over a ThreadVectorRange with a reducer object: the
// reducer's own storage (reducer.reference()) is initialised with
// reducer.init() and then passed to lambda(i, ...) for every i from
// loop_boundaries.start up to (excluding) loop_boundaries.end in steps of
// loop_boundaries.increment.
// NOTE(review): the parameter name line and the contents of the
// KOKKOS_ENABLE_PRAGMA_IVDEP section are elided in this listing.
2238 template <
typename iType,
class Lambda,
typename ReducerType>
2239 KOKKOS_INLINE_FUNCTION
void parallel_reduce(
2240 const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2242 const Lambda &lambda,
const ReducerType &reducer) {
2243 reducer.init(reducer.reference());
2244 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2247 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2248 i += loop_boundaries.increment) {
2249 lambda(i, reducer.reference());
// Nested parallel_scan over a TeamThreadRange, done in two passes over the same
// index range. The first pass calls lambda(i, scan_val, false) starting from a
// value-initialised scan_val; scan_val is then replaced by the result of the
// team member's team_scan(scan_val); the second pass calls
// lambda(i, scan_val, true).
// NOTE(review): the parameter name line is elided in this listing.
2253 template <
typename iType,
class FunctorType>
2254 KOKKOS_INLINE_FUNCTION
void parallel_scan(
2255 Impl::TeamThreadRangeBoundariesStruct<iType, Impl::HPXTeamMember>
const
2257 const FunctorType &lambda) {
// The scan value type is deduced from the functor via FunctorAnalysis (SCAN
// pattern, void policy).
2258 using value_type =
typename Kokkos::Impl::FunctorAnalysis<
2259 Kokkos::Impl::FunctorPatternInterface::SCAN, void,
2260 FunctorType>::value_type;
2262 value_type scan_val = value_type();
2265 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2266 i += loop_boundaries.increment) {
2267 lambda(i, scan_val,
false);
2271 scan_val = loop_boundaries.thread.team_scan(scan_val);
2273 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2274 i += loop_boundaries.increment) {
2275 lambda(i, scan_val,
true);
// Nested parallel_scan over a ThreadVectorRange: a single pass that calls
// lambda(i, scan_val, true) for every i from loop_boundaries.start up to
// (excluding) loop_boundaries.end in steps of loop_boundaries.increment,
// starting from a value-initialised scan_val.
// NOTE(review): the parameter name line, the head of the value_type alias and
// the contents of the KOKKOS_ENABLE_PRAGMA_IVDEP section are elided in this
// listing.
2290 template <
typename iType,
class FunctorType>
2291 KOKKOS_INLINE_FUNCTION
void parallel_scan(
2292 const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2294 const FunctorType &lambda) {
// The scan value type is deduced from the functor via FunctorAnalysis (SCAN
// pattern, TeamPolicy<Experimental::HPX>).
2296 typename Impl::FunctorAnalysis<Impl::FunctorPatternInterface::SCAN,
2297 TeamPolicy<Experimental::HPX>,
2298 FunctorType>::value_type;
2300 value_type scan_val = value_type();
2302 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2305 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2306 i += loop_boundaries.increment) {
2307 lambda(i, scan_val,
true);
// Reducer-based scan over a ThreadVectorRange, enabled only when ReducerType
// is a Kokkos reducer. scan_val has the reducer's value_type and is
// initialised with reducer.init(); a single pass then calls
// lambda(i, scan_val, true) for every i from loop_boundaries.start up to
// (excluding) loop_boundaries.end in steps of loop_boundaries.increment.
// NOTE(review): the function-name line, the parameter name line and the
// contents of the KOKKOS_ENABLE_PRAGMA_IVDEP section are elided in this
// listing; the name is presumably parallel_scan — confirm.
2314 template <
typename iType,
class FunctorType,
typename ReducerType>
2315 KOKKOS_INLINE_FUNCTION std::enable_if_t<Kokkos::is_reducer<ReducerType>::value>
2317 const Impl::ThreadVectorRangeBoundariesStruct<iType, Impl::HPXTeamMember>
2319 const FunctorType &lambda,
const ReducerType &reducer) {
2320 typename ReducerType::value_type scan_val;
2321 reducer.init(scan_val);
2323 #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
2326 for (iType i = loop_boundaries.start; i < loop_boundaries.end;
2327 i += loop_boundaries.increment) {
2328 lambda(i, scan_val,
true);
// single() overload for a VectorSingleStruct tag (the type PerThread returns).
// The tag argument is unnamed and therefore unused.
// NOTE(review): the body is elided in this listing; presumably it just calls
// lambda() — confirm.
2332 template <
class FunctorType>
2333 KOKKOS_INLINE_FUNCTION
void single(
2334 const Impl::VectorSingleStruct<Impl::HPXTeamMember> &,
2335 const FunctorType &lambda) {
// single() overload for a ThreadSingleStruct tag (the type PerTeam returns).
// The tag argument is unnamed and therefore unused.
// NOTE(review): the body is elided in this listing; presumably it just calls
// lambda() — confirm.
2339 template <
class FunctorType>
2340 KOKKOS_INLINE_FUNCTION
void single(
2341 const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &,
2342 const FunctorType &lambda) {
// single() overload for a VectorSingleStruct tag (the type PerThread returns)
// that additionally takes a value by non-const reference. The tag argument is
// unnamed and therefore unused.
// NOTE(review): the body is elided in this listing; presumably it calls
// lambda(val) — confirm.
2346 template <
class FunctorType,
class ValueType>
2347 KOKKOS_INLINE_FUNCTION
void single(
2348 const Impl::VectorSingleStruct<Impl::HPXTeamMember> &,
2349 const FunctorType &lambda, ValueType &val) {
// single() overload for a ThreadSingleStruct tag (the type PerTeam returns)
// that additionally takes a value by non-const reference. The tag argument is
// unnamed and therefore unused.
// NOTE(review): the body is elided in this listing; presumably it calls
// lambda(val) — confirm.
2353 template <
class FunctorType,
class ValueType>
2354 KOKKOS_INLINE_FUNCTION
void single(
2355 const Impl::ThreadSingleStruct<Impl::HPXTeamMember> &,
2356 const FunctorType &lambda, ValueType &val) {
2362 #include <HPX/Kokkos_HPX_Task.hpp>
KOKKOS_INLINE_FUNCTION size_type acquire() const
Scratch memory space associated with an execution space.
View to an array of data.
Memory management for host memory.
UniqueToken(execution_space const &=execution_space())
KOKKOS_INLINE_FUNCTION void release(size_type) const
KOKKOS_INLINE_FUNCTION size_type size() const
Execution policy for work over a range of an integral type.
Access relationship between DstMemorySpace and SrcMemorySpace.