1 #ifndef _COMPADRE_PARALLELMANAGER_HPP_
2 #define _COMPADRE_PARALLELMANAGER_HPP_
4 #include "Compadre_Config.h"
73 #ifdef COMPADRE_USE_CUDA
90 if (
const char* env_threads = std::getenv(
"THREADS")) {
93 if (
const char* env_vector_lanes = std::getenv(
"VECTORLANES")) {
96 #ifdef COMPADRE_EXTREME_DEBUG
116 Kokkos::TeamPolicy<device_execution_space>
118 const int vector_lanes_per_thread = -1)
const {
120 if (threads_per_team>0 && vector_lanes_per_thread>0) {
123 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
130 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
136 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
142 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
146 }
else if (threads_per_team>0) {
149 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team,
_default_vector_lanes)
156 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team,
_default_vector_lanes)
162 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team,
_default_vector_lanes)
168 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team,
_default_vector_lanes)
172 }
else if (vector_lanes_per_thread>0) {
175 return Kokkos::TeamPolicy<device_execution_space>(batch_size,
_default_threads, vector_lanes_per_thread)
182 return Kokkos::TeamPolicy<device_execution_space>(batch_size,
_default_threads, vector_lanes_per_thread)
188 return Kokkos::TeamPolicy<device_execution_space>(batch_size,
_default_threads, vector_lanes_per_thread)
194 return Kokkos::TeamPolicy<device_execution_space>(batch_size,
_default_threads, vector_lanes_per_thread)
229 template<
typename Tag,
class C>
231 const int vector_lanes_per_thread = -1)
const {
233 if (threads_per_team>0 && vector_lanes_per_thread>0) {
236 Kokkos::parallel_for(
238 Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
246 Kokkos::parallel_for(
248 Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
255 Kokkos::parallel_for(
257 Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
264 Kokkos::parallel_for(
266 Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
271 }
else if (threads_per_team>0) {
274 Kokkos::parallel_for(
284 Kokkos::parallel_for(
293 Kokkos::parallel_for(
302 Kokkos::parallel_for(
309 }
else if (vector_lanes_per_thread>0) {
312 Kokkos::parallel_for(
314 Kokkos::TeamPolicy<Tag>(batch_size,
_default_threads, vector_lanes_per_thread)
322 Kokkos::parallel_for(
324 Kokkos::TeamPolicy<Tag>(batch_size,
_default_threads, vector_lanes_per_thread)
331 Kokkos::parallel_for(
333 Kokkos::TeamPolicy<Tag>(batch_size,
_default_threads, vector_lanes_per_thread)
340 Kokkos::parallel_for(
342 Kokkos::TeamPolicy<Tag>(batch_size,
_default_threads, vector_lanes_per_thread)
350 Kokkos::parallel_for(
360 Kokkos::parallel_for(
369 Kokkos::parallel_for(
378 Kokkos::parallel_for(
392 const int vector_lanes_per_thread = -1, std::string functor_name =
typeid(C).name())
const {
394 if (threads_per_team>0 && vector_lanes_per_thread>0) {
397 Kokkos::parallel_for(
399 Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
407 Kokkos::parallel_for(
409 Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
416 Kokkos::parallel_for(
418 Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
425 Kokkos::parallel_for(
427 Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
432 }
else if (threads_per_team>0) {
435 Kokkos::parallel_for(
445 Kokkos::parallel_for(
454 Kokkos::parallel_for(
463 Kokkos::parallel_for(
470 }
else if (vector_lanes_per_thread>0) {
472 Kokkos::parallel_for(
482 Kokkos::parallel_for(
491 Kokkos::parallel_for(
500 Kokkos::parallel_for(
509 Kokkos::parallel_for(
519 Kokkos::parallel_for(
528 Kokkos::parallel_for(
537 Kokkos::parallel_for(
549 template<
typename Tag,
class C>
552 CallFunctorWithTeamThreadsAndVectors<Tag,C>(functor, batch_size,
_default_threads, 1);
560 CallFunctorWithTeamThreadsAndVectors<C>(functor, batch_size,
_default_threads, 1, functor_name);
563 KOKKOS_INLINE_FUNCTION
572 KOKKOS_INLINE_FUNCTION
581 KOKKOS_INLINE_FUNCTION
590 KOKKOS_INLINE_FUNCTION
Kokkos::TeamPolicy< device_execution_space > TeamPolicyThreadsAndVectors(const global_index_type batch_size, const int threads_per_team=-1, const int vector_lanes_per_thread=-1) const
Creates a team policy for a parallel_for. The parallel_for will break out over loops over teams with each ...
std::size_t global_index_type
KOKKOS_INLINE_FUNCTION int getTeamScratchLevel(const int level) const
int _scratch_thread_level_b
higher (slower) level memory for Kokkos::parallel_for for thread access memory
KOKKOS_INLINE_FUNCTION int getTeamScratchSize(const int level) const
void CallFunctorWithTeamThreads(C functor, const global_index_type batch_size, std::string functor_name=typeid(C).name()) const
Calls a parallel_for. The parallel_for will break out over loops over teams with each thread executing cod...
int _thread_scratch_size_b
int _default_vector_lanes
int _thread_scratch_size_a
KOKKOS_INLINE_FUNCTION int getThreadScratchSize(const int level) const
void setTeamScratchLevel(const int level, const int value)
void setThreadScratchLevel(const int level, const int value)
int _default_threads
largest team size
int _scratch_team_level_b
lowest level memory for Kokkos::parallel_for for thread access memory
void setTeamScratchSize(const int level, const int value)
void CallFunctorWithTeamThreadsAndVectors(C functor, const global_index_type batch_size, const int threads_per_team=-1, const int vector_lanes_per_thread=-1) const
Calls a parallel_for. The parallel_for will break out over loops over teams with each vector lane executin...
void setThreadScratchSize(const int level, const int value)
int _scratch_thread_level_a
higher (slower) level memory for Kokkos::parallel_for for team access memory
void CallFunctorWithTeamThreads(C functor, const global_index_type batch_size) const
Calls a parallel_for. The parallel_for will break out over loops over teams with each thread executing cod...
int _scratch_team_level_a
lowest level memory for Kokkos::parallel_for for team access memory
void CallFunctorWithTeamThreadsAndVectors(C functor, const global_index_type batch_size, const int threads_per_team=-1, const int vector_lanes_per_thread=-1, std::string functor_name=typeid(C).name()) const
Calls a parallel_for. The parallel_for will break out over loops over teams with each vector lane executin...
KOKKOS_INLINE_FUNCTION int getThreadScratchLevel(const int level) const