9 #ifndef _COMPADRE_PARALLELMANAGER_HPP_
10 #define _COMPADRE_PARALLELMANAGER_HPP_
12 #include "Compadre_Config.h"
81 #ifdef COMPADRE_USE_CUDA
98 if (
const char* env_threads = std::getenv(
"THREADS")) {
101 if (
const char* env_vector_lanes = std::getenv(
"VECTORLANES")) {
104 #ifdef COMPADRE_EXTREME_DEBUG
124 Kokkos::TeamPolicy<device_execution_space>
126 const int vector_lanes_per_thread = -1)
const {
128 if (threads_per_team>0 && vector_lanes_per_thread>0) {
131 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
138 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
144 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
150 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
154 }
else if (threads_per_team>0) {
157 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team,
_default_vector_lanes)
164 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team,
_default_vector_lanes)
170 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team,
_default_vector_lanes)
176 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team,
_default_vector_lanes)
180 }
else if (vector_lanes_per_thread>0) {
183 return Kokkos::TeamPolicy<device_execution_space>(batch_size,
_default_threads, vector_lanes_per_thread)
190 return Kokkos::TeamPolicy<device_execution_space>(batch_size,
_default_threads, vector_lanes_per_thread)
196 return Kokkos::TeamPolicy<device_execution_space>(batch_size,
_default_threads, vector_lanes_per_thread)
202 return Kokkos::TeamPolicy<device_execution_space>(batch_size,
_default_threads, vector_lanes_per_thread)
237 template<
typename Tag,
class C>
239 const int vector_lanes_per_thread = -1)
const {
241 if (threads_per_team>0 && vector_lanes_per_thread>0) {
244 Kokkos::parallel_for(
246 Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
254 Kokkos::parallel_for(
256 Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
263 Kokkos::parallel_for(
265 Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
272 Kokkos::parallel_for(
274 Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
279 }
else if (threads_per_team>0) {
282 Kokkos::parallel_for(
292 Kokkos::parallel_for(
301 Kokkos::parallel_for(
310 Kokkos::parallel_for(
317 }
else if (vector_lanes_per_thread>0) {
320 Kokkos::parallel_for(
322 Kokkos::TeamPolicy<Tag>(batch_size,
_default_threads, vector_lanes_per_thread)
330 Kokkos::parallel_for(
332 Kokkos::TeamPolicy<Tag>(batch_size,
_default_threads, vector_lanes_per_thread)
339 Kokkos::parallel_for(
341 Kokkos::TeamPolicy<Tag>(batch_size,
_default_threads, vector_lanes_per_thread)
348 Kokkos::parallel_for(
350 Kokkos::TeamPolicy<Tag>(batch_size,
_default_threads, vector_lanes_per_thread)
358 Kokkos::parallel_for(
368 Kokkos::parallel_for(
377 Kokkos::parallel_for(
386 Kokkos::parallel_for(
400 const int vector_lanes_per_thread = -1, std::string functor_name =
typeid(C).name())
const {
402 if (threads_per_team>0 && vector_lanes_per_thread>0) {
405 Kokkos::parallel_for(
407 Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
415 Kokkos::parallel_for(
417 Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
424 Kokkos::parallel_for(
426 Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
433 Kokkos::parallel_for(
435 Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
440 }
else if (threads_per_team>0) {
443 Kokkos::parallel_for(
453 Kokkos::parallel_for(
462 Kokkos::parallel_for(
471 Kokkos::parallel_for(
478 }
else if (vector_lanes_per_thread>0) {
480 Kokkos::parallel_for(
490 Kokkos::parallel_for(
499 Kokkos::parallel_for(
508 Kokkos::parallel_for(
517 Kokkos::parallel_for(
527 Kokkos::parallel_for(
536 Kokkos::parallel_for(
545 Kokkos::parallel_for(
557 template<
typename Tag,
class C>
560 CallFunctorWithTeamThreadsAndVectors<Tag,C>(functor, batch_size,
_default_threads, 1);
568 CallFunctorWithTeamThreadsAndVectors<C>(functor, batch_size,
_default_threads, 1, functor_name);
571 KOKKOS_INLINE_FUNCTION
580 KOKKOS_INLINE_FUNCTION
589 KOKKOS_INLINE_FUNCTION
598 KOKKOS_INLINE_FUNCTION
Kokkos::TeamPolicy< device_execution_space > TeamPolicyThreadsAndVectors(const global_index_type batch_size, const int threads_per_team=-1, const int vector_lanes_per_thread=-1) const
Creates a team policy for a parallel_for. The parallel_for will break out over loops over teams with each ...
std::size_t global_index_type
KOKKOS_INLINE_FUNCTION int getTeamScratchLevel(const int level) const
int _scratch_thread_level_b
higher (slower) level memory for Kokkos::parallel_for for thread access memory
KOKKOS_INLINE_FUNCTION int getTeamScratchSize(const int level) const
void CallFunctorWithTeamThreads(C functor, const global_index_type batch_size, std::string functor_name=typeid(C).name()) const
Calls a parallel_for. The parallel_for will break out over loops over teams with each thread executing cod...
int _thread_scratch_size_b
int _default_vector_lanes
int _thread_scratch_size_a
KOKKOS_INLINE_FUNCTION int getThreadScratchSize(const int level) const
void setTeamScratchLevel(const int level, const int value)
void setThreadScratchLevel(const int level, const int value)
int _default_threads
largest team size
int _scratch_team_level_b
lowest level memory for Kokkos::parallel_for for thread access memory
void setTeamScratchSize(const int level, const int value)
void CallFunctorWithTeamThreadsAndVectors(C functor, const global_index_type batch_size, const int threads_per_team=-1, const int vector_lanes_per_thread=-1) const
Calls a parallel_for. The parallel_for will break out over loops over teams with each vector lane executin...
void setThreadScratchSize(const int level, const int value)
int _scratch_thread_level_a
higher (slower) level memory for Kokkos::parallel_for for team access memory
void CallFunctorWithTeamThreads(C functor, const global_index_type batch_size) const
Calls a parallel_for. The parallel_for will break out over loops over teams with each thread executing cod...
int _scratch_team_level_a
lowest level memory for Kokkos::parallel_for for team access memory
void CallFunctorWithTeamThreadsAndVectors(C functor, const global_index_type batch_size, const int threads_per_team=-1, const int vector_lanes_per_thread=-1, std::string functor_name=typeid(C).name()) const
Calls a parallel_for. The parallel_for will break out over loops over teams with each vector lane executin...
KOKKOS_INLINE_FUNCTION int getThreadScratchLevel(const int level) const