13 #include "Kokkos_Core.hpp" 
   18 #ifdef KOKKOS_ENABLE_CUDA 
   19 #include "cuda_runtime_api.h" 
   22 template <
typename Scalar, 
typename Ordinal, 
typename Device>
 
   35 #ifdef KOKKOS_ENABLE_THREADS 
   36     const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
 
   37     const size_t num_cores_per_socket =
 
   38       Kokkos::hwloc::get_available_cores_per_numa();
 
   39     const size_t num_threads_per_core =
 
   40       Kokkos::hwloc::get_available_threads_per_core();
 
   46       "This test performance of MP::Vector multiply routines.\n");
 
   48     CLP.
setOption(
"n", &nGrid, 
"Number of mesh points in the each direction");
 
   50     CLP.
setOption(
"ni", &nIter, 
"Number of multiply iterations");
 
   52     CLP.
setOption(
"emin", &ensemble_min, 
"Staring ensemble size");
 
   53     int ensemble_max = 24;
 
   54     CLP.
setOption(
"emax", &ensemble_max, 
"Stoping ensemble size");
 
   55     int ensemble_step = 4;
 
   56     CLP.
setOption(
"estep", &ensemble_step, 
"Ensemble increment");
 
   57 #ifdef KOKKOS_ENABLE_THREADS 
   59     CLP.
setOption(
"threads", 
"no-threads", &threads, 
"Enable Threads device");
 
   60     int num_cores = num_cores_per_socket * num_sockets;
 
   62                   "Number of CPU cores to use (defaults to all)");
 
   64     CLP.
setOption(
"hyperthreads", &num_hyper_threads,
 
   65                   "Number of hyper threads per core to use (defaults to all)");
 
   67 #ifdef KOKKOS_ENABLE_CUDA 
   69     CLP.
setOption(
"cuda", 
"no-cuda", &cuda, 
"Enable Cuda device");
 
   71     CLP.
setOption(
"device", &device_id, 
"CUDA device ID");
 
   73     CLP.
parse( argc, argv );
 
   75 #ifdef KOKKOS_ENABLE_THREADS 
   79       typedef Kokkos::Threads Device;
 
   81       Kokkos::InitializationSettings init_args;
 
   82       init_args.set_num_threads(num_cores*num_hyper_threads);
 
   83       Kokkos::initialize( init_args );
 
   85       std::cout << std::endl
 
   86                 << 
"Threads performance with " << num_cores*num_hyper_threads
 
   87                 << 
" threads:" << std::endl;
 
   89       performance_test_driver<Scalar,Ordinal,Device>(
 
   90         nGrid, nIter, ensemble_min, ensemble_max, ensemble_step);
 
   96 #ifdef KOKKOS_ENABLE_CUDA 
  100       typedef Kokkos::Cuda Device;
 
  102       Kokkos::InitializationSettings init_args;
 
  103       init_args.set_device_id(device_id);
 
  104       Kokkos::initialize( init_args );
 
  106       cudaDeviceProp deviceProp;
 
  107       cudaGetDeviceProperties(&deviceProp, device_id);
 
  108       std::cout << std::endl
 
  109                 << 
"CUDA performance for device " << device_id << 
" (" 
  110                 << deviceProp.name << 
"):" 
  113       performance_test_driver<Scalar,Ordinal,Device>(
 
  114         nGrid, nIter, ensemble_min, ensemble_max, ensemble_step);
 
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
 
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
 
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const 
 
int main(int argc, char **argv)
 
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
 
void setDocString(const char doc_string[])