16 #include "Kokkos_Core.hpp"
22 #ifdef KOKKOS_ENABLE_CUDA
23 #include "cuda_runtime_api.h"
26 template <
typename Storage>
29 const int use_trials ,
30 const int use_nodes[] ,
34 const int entry_min = 8;
35 const int entry_max = 48;
36 const int entry_step = 8;
38 const int entry_min = 4;
39 const int entry_max = 32;
40 const int entry_step = 4;
46 performance_test_driver<Storage,entry_min,entry_max,entry_step>(
47 comm, use_print, use_trials, use_nodes, check, dev_config);
50 template <
typename Storage>
53 const int use_trials ,
54 const int use_nodes[] ,
57 const int entry_min = 16;
58 const int entry_max = 64;
59 const int entry_step = 16;
60 performance_test_driver<Storage,entry_min,entry_max,entry_step>(
61 comm, use_print, use_trials, use_nodes, check, dev_config);
76 const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
77 const size_t num_cores_per_socket =
78 Kokkos::hwloc::get_available_cores_per_numa();
79 const size_t num_threads_per_core =
80 Kokkos::hwloc::get_available_threads_per_core();
85 "This test performance of MP::Vector FEM assembly.\n");
// Grid-size option: the parsed nGrid is later copied into all three entries of use_nodes.
87 CLP.
setOption(
"n", &nGrid,
"Number of mesh points in each direction");
89 CLP.
setOption(
"ni", &nIter,
"Number of assembly iterations");
91 CLP.
setOption(
"print",
"no-print", &print,
"Print debugging output");
93 int num_cores = num_cores_per_socket * num_sockets;
95 "Number of CPU cores to use (defaults to all)");
97 CLP.
setOption(
"hyperthreads", &num_hyper_threads,
98 "Number of hyper threads per core to use (defaults to all)");
99 int threads_per_vector = 1;
100 CLP.
setOption(
"threads_per_vector", &threads_per_vector,
101 "Number of threads to use within each vector");
102 CLP.
setOption(
"check",
"no-check", &check,
"Check correctness");
103 #ifdef KOKKOS_ENABLE_SERIAL
105 CLP.
setOption(
"serial",
"no-serial", &serial,
"Enable Serial device");
107 #ifdef KOKKOS_ENABLE_THREADS
109 CLP.
setOption(
"threads",
"no-threads", &threads,
"Enable Threads device");
111 #ifdef KOKKOS_ENABLE_OPENMP
113 CLP.
setOption(
"openmp",
"no-openmp", &openmp,
"Enable OpenMP device");
115 #ifdef KOKKOS_ENABLE_CUDA
117 CLP.
setOption(
"cuda",
"no-cuda", &cuda,
"Enable Cuda device");
118 int cuda_threads_per_vector = 16;
119 CLP.
setOption(
"cuda_threads_per_vector", &cuda_threads_per_vector,
120 "Number of Cuda threads to use within each vector");
121 int cuda_block_size = 256;
122 CLP.
setOption(
"cuda_block_size", &cuda_block_size,
124 int num_cuda_blocks = 0;
125 CLP.
setOption(
"num_cuda_blocks", &num_cuda_blocks,
126 "Number of Cuda blocks (0 implies the default choice)");
128 CLP.
setOption(
"device", &device_id,
"CUDA device ID. Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus");
130 CLP.
setOption(
"ngpus", &ngpus,
"Number of GPUs per node for multi-GPU runs via MPI");
132 CLP.
parse( argc, argv );
135 use_nodes[0] = nGrid; use_nodes[1] = nGrid; use_nodes[2] = nGrid;
140 #ifdef KOKKOS_ENABLE_SERIAL
142 typedef Kokkos::Serial Device;
145 Kokkos::initialize();
147 if (comm->getRank() == 0)
148 std::cout << std::endl
149 <<
"Serial performance with " << comm->getSize()
150 <<
" MPI ranks" << std::endl;
154 mainHost<Storage>(comm, print, nIter, use_nodes, check,
161 #ifdef KOKKOS_ENABLE_THREADS
163 typedef Kokkos::Threads Device;
166 Kokkos::InitializationSettings init_args;
167 init_args.set_num_threads(num_cores*num_hyper_threads);
168 Kokkos::initialize( init_args );
170 if (comm->getRank() == 0)
171 std::cout << std::endl
172 <<
"Threads performance with " << comm->getSize()
173 <<
" MPI ranks and " << num_cores*num_hyper_threads
174 <<
" threads per rank:" << std::endl;
178 num_hyper_threads / threads_per_vector);
180 mainHost<Storage>(comm, print, nIter, use_nodes, check,
187 #ifdef KOKKOS_ENABLE_OPENMP
189 typedef Kokkos::OpenMP Device;
192 Kokkos::InitializationSettings init_args;
193 init_args.set_num_threads(num_cores*num_hyper_threads);
194 Kokkos::initialize( init_args );
196 if (comm->getRank() == 0)
197 std::cout << std::endl
198 <<
"OpenMP performance with " << comm->getSize()
199 <<
" MPI ranks and " << num_cores*num_hyper_threads
200 <<
" threads per rank:" << std::endl;
204 num_hyper_threads / threads_per_vector);
206 mainHost<Storage>(comm, print, nIter, use_nodes, check,
213 #ifdef KOKKOS_ENABLE_CUDA
215 typedef Kokkos::Cuda Device;
218 if (device_id == -1) {
// Probe the local-rank environment variables in a fixed order (Slurm, then
// MVAPICH2, then Open MPI naming); the first one that is set wins. When none
// is set, local_rank keeps the value it had before this chain.
221 if ((str = std::getenv(
"SLURM_LOCALID")))
222 local_rank = std::atoi(str);
223 else if ((str = std::getenv(
"MV2_COMM_WORLD_LOCAL_RANK")))
224 local_rank = std::atoi(str);
225 else if ((str = std::getenv(
"OMPI_COMM_WORLD_LOCAL_RANK")))
226 local_rank = std::atoi(str);
227 device_id = local_rank % ngpus;
// Zero-initialise the count: the status of cudaGetDeviceCount is not inspected
// here, so if the query fails the device-ID range check that follows compares
// against 0 rather than an indeterminate value.
230 int num_device = 0; cudaGetDeviceCount(&num_device);
232 device_id >= num_device, std::logic_error,
233 "Invalid device ID " << device_id <<
". You probably are trying" <<
234 " to run with too many GPUs per node");
237 Kokkos::InitializationSettings init_args;
238 init_args.set_device_id(device_id);
239 Kokkos::initialize( init_args );
241 cudaDeviceProp deviceProp;
242 cudaGetDeviceProperties(&deviceProp, device_id);
// Only rank 0 prints the banner; the wording follows the Serial/Threads/OpenMP
// banners ("<backend> performance with <N> MPI ranks ...").
243 if (comm->getRank() == 0)
244 std::cout << std::endl
245 <<
"CUDA performance with " << comm->getSize()
246 <<
" MPI ranks and device " << device_id <<
" ("
247 << deviceProp.name <<
"):"
252 cuda_threads_per_vector,
253 cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);
255 mainCuda<Storage>(comm, print, nIter, use_nodes, check,
Stokhos::StandardStorage< int, double > Storage
void mainHost(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
Statically allocated storage class.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
void mainCuda(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
static Teuchos::RCP< const Comm< OrdinalType > > getComm()
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
int main(int argc, char **argv)
void setDocString(const char doc_string[])