13 #include "Kokkos_Core.hpp"
18 #ifdef KOKKOS_ENABLE_CUDA
19 #include "cuda_runtime_api.h"
22 template <
typename Scalar,
typename Ordinal,
typename Device>
// Query the hardware topology through Kokkos' hwloc wrappers. This section is
// only compiled when the Threads backend is enabled.
35 #ifdef KOKKOS_ENABLE_THREADS
// Available NUMA domain count, stored as the socket count.
36 const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
// Available cores per NUMA domain; multiplied by num_sockets further down to
// form the default core count.
37 const size_t num_cores_per_socket =
38 Kokkos::hwloc::get_available_cores_per_numa();
// Available hardware threads per core.
39 const size_t num_threads_per_core =
40 Kokkos::hwloc::get_available_threads_per_core();
46 "This test performance of MP::Vector multiply routines.\n");
// Register the problem-size and ensemble-sweep options with the command-line
// processor. Each call binds an option name to the variable that receives the
// parsed value; the final string is the help text printed for that option.

// "n": mesh points per direction, stored in nGrid.
48 CLP.
setOption(
"n", &nGrid,
"Number of mesh points in each direction");
// "ni": number of multiply iterations, stored in nIter.
50 CLP.
setOption(
"ni", &nIter,
"Number of multiply iterations");
// "emin": first ensemble size of the sweep, stored in ensemble_min.
52 CLP.
setOption(
"emin", &ensemble_min,
"Starting ensemble size");
// "emax": last ensemble size of the sweep (default 24).
53 int ensemble_max = 24;
54 CLP.
setOption(
"emax", &ensemble_max,
"Stopping ensemble size");
// "estep": increment between ensemble sizes (default 4).
55 int ensemble_step = 4;
56 CLP.
setOption(
"estep", &ensemble_step,
"Ensemble increment");
// Options that only exist when the Threads backend is compiled in.
57 #ifdef KOKKOS_ENABLE_THREADS
// Boolean switch: "threads" sets the flag, "no-threads" clears it.
59 CLP.
setOption(
"threads",
"no-threads", &threads,
"Enable Threads device");
// Default core count is every core on every socket.
// NOTE(review): the size_t product is narrowed to int here.
60 int num_cores = num_cores_per_socket * num_sockets;
// Remainder of a setOption call whose opening lines are not visible in this
// listing; presumably it binds num_cores -- confirm against the full file.
62 "Number of CPU cores to use (defaults to all)");
// "hyperthreads": hardware threads to use per core, stored in
// num_hyper_threads.
64 CLP.
setOption(
"hyperthreads", &num_hyper_threads,
65 "Number of hyper threads per core to use (defaults to all)");
// Options that only exist when the CUDA backend is compiled in.
67 #ifdef KOKKOS_ENABLE_CUDA
// Boolean switch: "cuda" sets the flag, "no-cuda" clears it.
69 CLP.
setOption(
"cuda",
"no-cuda", &cuda,
"Enable Cuda device");
// "device": CUDA device ID, stored in device_id and later passed to both
// Kokkos initialization and cudaGetDeviceProperties.
71 CLP.
setOption(
"device", &device_id,
"CUDA device ID");
// Parse the command line, filling in every variable registered via setOption.
// parse() returns an EParseCommandLineReturn status; it is not inspected here.
73 CLP.
parse( argc, argv );
// Threads backend run: initialize Kokkos with the requested thread count,
// print a banner, then run the performance driver on Kokkos::Threads.
78 #ifdef KOKKOS_ENABLE_THREADS
80 typedef Kokkos::Threads Device;
// Total thread count = cores * hyperthreads per core.
82 Kokkos::InitializationSettings init_args;
83 init_args.set_num_threads(num_cores*num_hyper_threads);
84 Kokkos::initialize( init_args );
// Banner reporting the same thread count that was handed to Kokkos.
86 std::cout << std::endl
87 <<
"Threads performance with " << num_cores*num_hyper_threads
88 <<
" threads:" << std::endl;
// Run the driver with the grid size, iteration count and ensemble sweep
// range taken from the command line.
90 performance_test_driver<Scalar,Ordinal,Device>(
91 nGrid, nIter, ensemble_min, ensemble_max, ensemble_step);
// CUDA backend run: initialize Kokkos on the requested device, print a banner
// with the device name, then run the performance driver on Kokkos::Cuda.
97 #ifdef KOKKOS_ENABLE_CUDA
99 typedef Kokkos::Cuda Device;
// Select the CUDA device given by device_id.
101 Kokkos::InitializationSettings init_args;
102 init_args.set_device_id(device_id);
103 Kokkos::initialize( init_args );
// Fetch the device properties so the banner can include the device name.
// NOTE(review): the status returned by cudaGetDeviceProperties is not checked.
105 cudaDeviceProp deviceProp;
106 cudaGetDeviceProperties(&deviceProp, device_id);
// Banner; the end of this output statement is not visible in this listing.
107 std::cout << std::endl
108 <<
"CUDA performance for device " << device_id <<
" ("
109 << deviceProp.name <<
"):"
// Run the driver with the grid size, iteration count and ensemble sweep
// range taken from the command line.
112 performance_test_driver<Scalar,Ordinal,Device>(
113 nGrid, nIter, ensemble_min, ensemble_max, ensemble_step);
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
int main(int argc, char **argv)
void performance_test_driver(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
void setDocString(const char doc_string[])