45 #include "Kokkos_Core.hpp" 
   50 #ifdef KOKKOS_ENABLE_CUDA 
   51 #include "cuda_runtime_api.h" 
   54 template <
typename Scalar, 
typename Ordinal, 
typename Device>
 
   67     const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
 
   68     const size_t num_cores_per_socket =
 
   69       Kokkos::hwloc::get_available_cores_per_numa();
 
   78       "This test performance of mean-based UQ::PCE multiply routines.\n");
 
   80     CLP.
setOption(
"n", &nGrid, 
"Number of mesh points in the each direction");
 
   82     CLP.
setOption(
"ni", &nIter, 
"Number of multiply iterations");
 
   84     CLP.
setOption(
"order", &order, 
"Polynomial order");
 
   86     CLP.
setOption(
"dmin", &dim_min, 
"Starting stochastic dimension");
 
   88     CLP.
setOption(
"dmax", &dim_max, 
"Stopping stochastic dimension");
 
   89     int numa = num_sockets;
 
   90     CLP.
setOption(
"numa", &numa,  
"Number of numa nodes");
 
   91     int cores = num_cores_per_socket;
 
   92     CLP.
setOption(
"cores", &cores, 
"Cores per numa node");
 
   93 #ifdef KOKKOS_ENABLE_THREADS 
   95     CLP.
setOption(
"threads", &threads, 
"Number of threads for Threads device");
 
   97 #ifdef KOKKOS_ENABLE_OPENMP 
   99     CLP.
setOption(
"openmp", &openmp, 
"Number of threads for OpenMP device");
 
  101 #ifdef KOKKOS_ENABLE_CUDA 
  103     CLP.
setOption(
"cuda", 
"no-cuda", &cuda, 
"Enable Cuda device");
 
  105     CLP.
setOption(
"device", &device_id, 
"CUDA device ID");
 
  107     CLP.
parse( argc, argv );
 
  112 #ifdef KOKKOS_ENABLE_THREADS 
  114       typedef Kokkos::Threads Device;
 
  116       Kokkos::InitArguments init_args;
 
  117       init_args.num_threads = threads;
 
  118       init_args.num_numa = numa;
 
  119       Kokkos::initialize( init_args );
 
  121       std::cout << std::endl
 
  122                 << 
"Threads performance with " << threads
 
  123                 << 
" threads, " << numa << 
" numas, " << cores
 
  124                 << 
" cores/numa:" << std::endl;
 
  126       performance_test_driver<Scalar,Ordinal,Device>(
 
  127         nGrid, nIter, order, dim_min, dim_max);
 
  133 #ifdef KOKKOS_ENABLE_OPENMP 
  135       typedef Kokkos::OpenMP Device;
 
  137       Kokkos::InitArguments init_args;
 
  138       init_args.num_threads = openmp;
 
  139       init_args.num_numa = numa;
 
  140       Kokkos::initialize( init_args );
 
  142       std::cout << std::endl
 
  143                 << 
"OpenMP performance with " << openmp
 
  144                 << 
" threads, " << numa << 
" numas, " << cores
 
  145                 << 
" cores/numa:" << std::endl;
 
  147       performance_test_driver<Scalar,Ordinal,Device>(
 
  148         nGrid, nIter, order, dim_min, dim_max);
 
  154 #ifdef KOKKOS_ENABLE_CUDA 
  156       typedef Kokkos::Cuda Device;
 
  158       Kokkos::InitArguments init_args;
 
  159       init_args.device_id = device_id;
 
  160       Kokkos::initialize( init_args );
 
  162       cudaDeviceProp deviceProp;
 
  163       cudaGetDeviceProperties(&deviceProp, device_id);
 
  164       std::cout << std::endl
 
  165                 << 
"CUDA performance for device " << device_id << 
" (" 
  166                 << deviceProp.name << 
"):" 
  169       performance_test_driver<Scalar,Ordinal,Device>(
 
  170         nGrid, nIter, order, dim_min, dim_max);
 
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
 
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
 
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const 
 
int main(int argc, char **argv)
 
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
 
void setDocString(const char doc_string[])