48 #include "Kokkos_Core.hpp" 
   54 #ifdef KOKKOS_ENABLE_CUDA 
   55 #include "cuda_runtime_api.h" 
   62               const int use_trials ,
 
   63               const int use_nodes[] ,
 
   67   const int entry_min = 8;
 
   68   const int entry_max = 48;
 
   69   const int entry_step = 8;
 
   71   const int entry_min = 4;
 
   72   const int entry_max = 32;
 
   73   const int entry_step = 4;
 
   79   performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
 
   80     comm, use_print, use_trials, use_nodes, check, dev_config);
 
   87               const int use_trials ,
 
   88               const int use_nodes[] ,
 
   91   const int entry_min = 16;
 
   92   const int entry_max = 64;
 
   93   const int entry_step = 16;
 
   94   performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
 
   95     comm, use_print, use_trials, use_nodes, check, dev_config);
 
  101   bool verbose = 
false;
 
  110     const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
 
  111     const size_t num_cores_per_socket =
 
  112       Kokkos::hwloc::get_available_cores_per_numa();
 
  113     const size_t num_threads_per_core =
 
  114       Kokkos::hwloc::get_available_threads_per_core();
 
  119       "This test performance of MP::Vector FEM assembly.\n");
 
  121     CLP.
setOption(
"n", &nGrid, 
"Number of mesh points in the each direction");
 
  123     CLP.
setOption(
"ni", &nIter, 
"Number of assembly iterations");
 
  125     CLP.
setOption(
"print", 
"no-print", &print, 
"Print debugging output");
 
  127     int num_cores = num_cores_per_socket * num_sockets;
 
  129                   "Number of CPU cores to use (defaults to all)");
 
  131     CLP.
setOption(
"hyperthreads", &num_hyper_threads,
 
  132                   "Number of hyper threads per core to use (defaults to all)");
 
  133     int threads_per_vector = 1;
 
  134     CLP.
setOption(
"threads_per_vector", &threads_per_vector,
 
  135                   "Number of threads to use within each vector");
 
  136     CLP.
setOption(
"check", 
"no-check", &check, 
"Check correctness");
 
  137 #ifdef KOKKOS_ENABLE_SERIAL 
  139     CLP.
setOption(
"serial", 
"no-serial", &serial, 
"Enable Serial device");
 
  141 #ifdef KOKKOS_ENABLE_THREADS 
  143     CLP.
setOption(
"threads", 
"no-threads", &threads, 
"Enable Threads device");
 
  145 #ifdef KOKKOS_ENABLE_OPENMP 
  147     CLP.
setOption(
"openmp", 
"no-openmp", &openmp, 
"Enable OpenMP device");
 
  149 #ifdef KOKKOS_ENABLE_CUDA 
  151     CLP.
setOption(
"cuda", 
"no-cuda", &cuda, 
"Enable Cuda device");
 
  152     int cuda_threads_per_vector = 16;
 
  153     CLP.
setOption(
"cuda_threads_per_vector", &cuda_threads_per_vector,
 
  154                   "Number of Cuda threads to use within each vector");
 
  155     int cuda_block_size = 256;
 
  156     CLP.
setOption(
"cuda_block_size", &cuda_block_size,
 
  158     int num_cuda_blocks = 0;
 
  159     CLP.
setOption(
"num_cuda_blocks", &num_cuda_blocks,
 
  160                   "Number of Cuda blocks (0 implies the default choice)");
 
  162     CLP.
setOption(
"device", &device_id, 
"CUDA device ID.  Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus");
 
  164     CLP.
setOption(
"ngpus", &ngpus, 
"Number of GPUs per node for multi-GPU runs via MPI");
 
  166     CLP.
parse( argc, argv );
 
  169     use_nodes[0] = nGrid; use_nodes[1] = nGrid; use_nodes[2] = nGrid;
 
  178 #ifdef KOKKOS_ENABLE_SERIAL 
  180       typedef Kokkos::Serial Device;
 
  183       Kokkos::InitArguments init_args;
 
  185       Kokkos::initialize( init_args );
 
  187       if (comm->getRank() == 0)
 
  188         std::cout << std::endl
 
  189                   << 
"Serial performance with " << comm->getSize()
 
  190                   << 
" MPI ranks" << std::endl;
 
  194       mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
 
  201 #ifdef KOKKOS_ENABLE_THREADS 
  203       typedef Kokkos::Threads Device;
 
  206       Kokkos::InitArguments init_args;
 
  208       Kokkos::initialize( init_args );
 
  210       if (comm->getRank() == 0)
 
  211         std::cout << std::endl
 
  212                   << 
"Threads performance with " << comm->getSize()
 
  213                   << 
" MPI ranks and " << num_cores*num_hyper_threads
 
  214                   << 
" threads per rank:" << std::endl;
 
  218                                        num_hyper_threads / threads_per_vector);
 
  220       mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
 
  227 #ifdef KOKKOS_ENABLE_OPENMP 
  229       typedef Kokkos::OpenMP Device;
 
  232       Kokkos::InitArguments init_args;
 
  234       Kokkos::initialize( init_args );
 
  236       if (comm->getRank() == 0)
 
  237         std::cout << std::endl
 
  238                   << 
"OpenMP performance with " << comm->getSize()
 
  239                   << 
" MPI ranks and " << num_cores*num_hyper_threads
 
  240                   << 
" threads per rank:" << std::endl;
 
  244                                        num_hyper_threads / threads_per_vector);
 
  246       mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
 
  253 #ifdef KOKKOS_ENABLE_CUDA 
  255       typedef Kokkos::Cuda Device;
 
  258       if (device_id == -1) {
 
  261         if ((str = std::getenv(
"SLURM_LOCALID")))
 
  262           local_rank = std::atoi(str);
 
  263         else if ((str = std::getenv(
"MV2_COMM_WORLD_LOCAL_RANK")))
 
  264           local_rank = std::atoi(str);
 
  265         else if ((str = getenv(
"OMPI_COMM_WORLD_LOCAL_RANK")))
 
  266           local_rank = std::atoi(str);
 
  267         device_id = local_rank % ngpus;
 
  270         int num_device; cudaGetDeviceCount(&num_device);
 
  272           device_id >= num_device, std::logic_error,
 
  273           "Invalid device ID " << device_id << 
".  You probably are trying" <<
 
  274           " to run with too many GPUs per node");
 
  277       Kokkos::InitArguments init_args;
 
  278       init_args.device_id = device_id;
 
  279       Kokkos::initialize( init_args );
 
  281       cudaDeviceProp deviceProp;
 
  282       cudaGetDeviceProperties(&deviceProp, device_id);
 
  283       if (comm->getRank() == 0)
 
  284         std::cout << std::endl
 
  285                   << 
"CUDA performance performance with " << comm->getSize()
 
  286                   << 
" MPI ranks and device " << device_id << 
" (" 
  287                   << deviceProp.name << 
"):" 
  292         cuda_threads_per_vector,
 
  293         cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);
 
  295       mainCuda<Storage,Method>(comm, print, nIter, use_nodes, check,
 
Stokhos::StandardStorage< int, double > Storage
void mainHost(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
Statically allocated storage class. 
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
void mainCuda(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
static Teuchos::RCP< const Comm< OrdinalType > > getComm()
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const 
int main(int argc, char **argv)
void setDocString(const char doc_string[])