9 #ifndef _COMPADRE_PARALLELMANAGER_HPP_ 
   10 #define _COMPADRE_PARALLELMANAGER_HPP_ 
   12 #include "Compadre_Config.h" 
   81 #ifdef COMPADRE_USE_CUDA 
   98         if (
const char* env_threads = std::getenv(
"THREADS")) {
 
  101         if (
const char* env_vector_lanes = std::getenv(
"VECTORLANES")) {
 
  104 #ifdef COMPADRE_EXTREME_DEBUG 
  124     Kokkos::TeamPolicy<device_execution_space> 
 
  126             const int vector_lanes_per_thread = -1)
 const {
 
  128         if (threads_per_team>0 && vector_lanes_per_thread>0) {
 
  131                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  138                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  144                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  150                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  154         } 
else if (threads_per_team>0) {
 
  157                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, 
_default_vector_lanes)
 
  164                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, 
_default_vector_lanes)
 
  170                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, 
_default_vector_lanes)
 
  176                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, threads_per_team, 
_default_vector_lanes)
 
  180         } 
else if (vector_lanes_per_thread>0) {
 
  183                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, 
_default_threads, vector_lanes_per_thread)
 
  190                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, 
_default_threads, vector_lanes_per_thread)
 
  196                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, 
_default_threads, vector_lanes_per_thread)
 
  202                 return Kokkos::TeamPolicy<device_execution_space>(batch_size, 
_default_threads, vector_lanes_per_thread)
 
  237     template<
typename Tag, 
class C>
 
  239             const int vector_lanes_per_thread = -1)
 const {
 
  241         if (threads_per_team>0 && vector_lanes_per_thread>0) {
 
  244                     Kokkos::parallel_for(
 
  246                         Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  254                 Kokkos::parallel_for(
 
  256                     Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  263                 Kokkos::parallel_for(
 
  265                     Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  272                 Kokkos::parallel_for(
 
  274                     Kokkos::TeamPolicy<Tag>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  279         } 
else if (threads_per_team>0) {
 
  282                     Kokkos::parallel_for(
 
  292                 Kokkos::parallel_for(
 
  301                 Kokkos::parallel_for(
 
  310                 Kokkos::parallel_for(
 
  317         } 
else if (vector_lanes_per_thread>0) {
 
  320                     Kokkos::parallel_for(
 
  322                         Kokkos::TeamPolicy<Tag>(batch_size, 
_default_threads, vector_lanes_per_thread)
 
  330                 Kokkos::parallel_for(
 
  332                     Kokkos::TeamPolicy<Tag>(batch_size, 
_default_threads, vector_lanes_per_thread)
 
  339                 Kokkos::parallel_for(
 
  341                     Kokkos::TeamPolicy<Tag>(batch_size, 
_default_threads, vector_lanes_per_thread)
 
  348                 Kokkos::parallel_for(
 
  350                     Kokkos::TeamPolicy<Tag>(batch_size, 
_default_threads, vector_lanes_per_thread)
 
  358                     Kokkos::parallel_for(
 
  368                 Kokkos::parallel_for(
 
  377                 Kokkos::parallel_for(
 
  386                 Kokkos::parallel_for(
 
  400             const int vector_lanes_per_thread = -1, std::string functor_name = 
typeid(C).name())
 const {
 
  402         if (threads_per_team>0 && vector_lanes_per_thread>0) {
 
  405                 Kokkos::parallel_for(
 
  407                     Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  415                 Kokkos::parallel_for(
 
  417                     Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  424                 Kokkos::parallel_for(
 
  426                     Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  433                 Kokkos::parallel_for(
 
  435                     Kokkos::TeamPolicy<>(batch_size, threads_per_team, vector_lanes_per_thread)
 
  440         } 
else if (threads_per_team>0) {
 
  443                 Kokkos::parallel_for(
 
  453                 Kokkos::parallel_for(
 
  462                 Kokkos::parallel_for(
 
  471                 Kokkos::parallel_for(
 
  478         } 
else if (vector_lanes_per_thread>0) {
 
  480                 Kokkos::parallel_for(
 
  490                 Kokkos::parallel_for(
 
  499                 Kokkos::parallel_for(
 
  508                 Kokkos::parallel_for(
 
  517                 Kokkos::parallel_for(
 
  527                 Kokkos::parallel_for(
 
  536                 Kokkos::parallel_for(
 
  545                 Kokkos::parallel_for(
 
  557     template<
typename Tag, 
class C>
 
  560         CallFunctorWithTeamThreadsAndVectors<Tag,C>(functor, batch_size, 
_default_threads, 1);
 
  568         CallFunctorWithTeamThreadsAndVectors<C>(functor, batch_size, 
_default_threads, 1, functor_name);
 
  571     KOKKOS_INLINE_FUNCTION
 
  580     KOKKOS_INLINE_FUNCTION
 
  589     KOKKOS_INLINE_FUNCTION
 
  598     KOKKOS_INLINE_FUNCTION
 
Kokkos::TeamPolicy< device_execution_space > TeamPolicyThreadsAndVectors(const global_index_type batch_size, const int threads_per_team=-1, const int vector_lanes_per_thread=-1) const 
Creates a team policy for a parallel_for parallel_for will break out over loops over teams with each ...
KOKKOS_INLINE_FUNCTION int getTeamScratchLevel(const int level) const 
int _scratch_thread_level_b
higher (slower) level memory for Kokkos::parallel_for for thread access memory 
KOKKOS_INLINE_FUNCTION int getTeamScratchSize(const int level) const 
void CallFunctorWithTeamThreads(C functor, const global_index_type batch_size, std::string functor_name=typeid(C).name()) const 
Calls a parallel_for parallel_for will break out over loops over teams with each thread executing cod...
int _thread_scratch_size_b
int _default_vector_lanes
int _thread_scratch_size_a
KOKKOS_INLINE_FUNCTION int getThreadScratchSize(const int level) const 
void setTeamScratchLevel(const int level, const int value)
std::size_t global_index_type
void setThreadScratchLevel(const int level, const int value)
int _default_threads
largest team size 
int _scratch_team_level_b
lowest level memory for Kokkos::parallel_for for thread access memory 
void setTeamScratchSize(const int level, const int value)
void CallFunctorWithTeamThreadsAndVectors(C functor, const global_index_type batch_size, const int threads_per_team=-1, const int vector_lanes_per_thread=-1) const 
Calls a parallel_for parallel_for will break out over loops over teams with each vector lane executin...
void setThreadScratchSize(const int level, const int value)
int _scratch_thread_level_a
higher (slower) level memory for Kokkos::parallel_for for team access memory 
void CallFunctorWithTeamThreads(C functor, const global_index_type batch_size) const 
Calls a parallel_for parallel_for will break out over loops over teams with each thread executing cod...
int _scratch_team_level_a
lowest level memory for Kokkos::parallel_for for team access memory 
void CallFunctorWithTeamThreadsAndVectors(C functor, const global_index_type batch_size, const int threads_per_team=-1, const int vector_lanes_per_thread=-1, std::string functor_name=typeid(C).name()) const 
Calls a parallel_for parallel_for will break out over loops over teams with each vector lane executin...
KOKKOS_INLINE_FUNCTION int getThreadScratchLevel(const int level) const