16 #include "Kokkos_Core.hpp"
22 #ifdef KOKKOS_ENABLE_CUDA
23 #include "cuda_runtime_api.h"
30 const int use_trials ,
31 const int use_nodes[] ,
35 const int entry_min = 8;
36 const int entry_max = 48;
37 const int entry_step = 8;
39 const int entry_min = 4;
40 const int entry_max = 32;
41 const int entry_step = 4;
47 performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
48 comm, use_print, use_trials, use_nodes, check, dev_config);
55 const int use_trials ,
56 const int use_nodes[] ,
59 const int entry_min = 16;
60 const int entry_max = 64;
61 const int entry_step = 16;
62 performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
63 comm, use_print, use_trials, use_nodes, check, dev_config);
78 const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
79 const size_t num_cores_per_socket =
80 Kokkos::hwloc::get_available_cores_per_numa();
81 const size_t num_threads_per_core =
82 Kokkos::hwloc::get_available_threads_per_core();
87 "This test performance of MP::Vector FEM assembly.\n");
89 CLP.
setOption(
"n", &nGrid,
"Number of mesh points in the each direction");
91 CLP.
setOption(
"ni", &nIter,
"Number of assembly iterations");
93 CLP.
setOption(
"print",
"no-print", &print,
"Print debugging output");
95 int num_cores = num_cores_per_socket * num_sockets;
97 "Number of CPU cores to use (defaults to all)");
99 CLP.
setOption(
"hyperthreads", &num_hyper_threads,
100 "Number of hyper threads per core to use (defaults to all)");
101 int threads_per_vector = 1;
102 CLP.
setOption(
"threads_per_vector", &threads_per_vector,
103 "Number of threads to use within each vector");
104 CLP.
setOption(
"check",
"no-check", &check,
"Check correctness");
105 #ifdef KOKKOS_ENABLE_SERIAL
107 CLP.
setOption(
"serial",
"no-serial", &serial,
"Enable Serial device");
109 #ifdef KOKKOS_ENABLE_THREADS
111 CLP.
setOption(
"threads",
"no-threads", &threads,
"Enable Threads device");
113 #ifdef KOKKOS_ENABLE_OPENMP
115 CLP.
setOption(
"openmp",
"no-openmp", &openmp,
"Enable OpenMP device");
117 #ifdef KOKKOS_ENABLE_CUDA
119 CLP.
setOption(
"cuda",
"no-cuda", &cuda,
"Enable Cuda device");
120 int cuda_threads_per_vector = 16;
121 CLP.
setOption(
"cuda_threads_per_vector", &cuda_threads_per_vector,
122 "Number of Cuda threads to use within each vector");
123 int cuda_block_size = 256;
124 CLP.
setOption(
"cuda_block_size", &cuda_block_size,
126 int num_cuda_blocks = 0;
127 CLP.
setOption(
"num_cuda_blocks", &num_cuda_blocks,
128 "Number of Cuda blocks (0 implies the default choice)");
130 CLP.
setOption(
"device", &device_id,
"CUDA device ID. Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus");
132 CLP.
setOption(
"ngpus", &ngpus,
"Number of GPUs per node for multi-GPU runs via MPI");
134 CLP.
parse( argc, argv );
137 use_nodes[0] = nGrid; use_nodes[1] = nGrid; use_nodes[2] = nGrid;
146 #ifdef KOKKOS_ENABLE_SERIAL
148 typedef Kokkos::Serial Device;
151 Kokkos::InitializationSettings init_args;
152 init_args.set_num_threads(num_cores*num_hyper_threads);
153 Kokkos::initialize( init_args );
155 if (comm->getRank() == 0)
156 std::cout << std::endl
157 <<
"Serial performance with " << comm->getSize()
158 <<
" MPI ranks" << std::endl;
162 mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
169 #ifdef KOKKOS_ENABLE_THREADS
171 typedef Kokkos::Threads Device;
174 Kokkos::InitializationSettings init_args;
175 init_args.set_num_threads(num_cores*num_hyper_threads);
176 Kokkos::initialize( init_args );
178 if (comm->getRank() == 0)
179 std::cout << std::endl
180 <<
"Threads performance with " << comm->getSize()
181 <<
" MPI ranks and " << num_cores*num_hyper_threads
182 <<
" threads per rank:" << std::endl;
186 num_hyper_threads / threads_per_vector);
188 mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
195 #ifdef KOKKOS_ENABLE_OPENMP
197 typedef Kokkos::OpenMP Device;
200 Kokkos::InitializationSettings init_args;
201 init_args.set_num_threads(num_cores*num_hyper_threads);
202 Kokkos::initialize( init_args );
204 if (comm->getRank() == 0)
205 std::cout << std::endl
206 <<
"OpenMP performance with " << comm->getSize()
207 <<
" MPI ranks and " << num_cores*num_hyper_threads
208 <<
" threads per rank:" << std::endl;
212 num_hyper_threads / threads_per_vector);
214 mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
221 #ifdef KOKKOS_ENABLE_CUDA
223 typedef Kokkos::Cuda Device;
226 if (device_id == -1) {
229 if ((str = std::getenv(
"SLURM_LOCALID")))
230 local_rank = std::atoi(str);
231 else if ((str = std::getenv(
"MV2_COMM_WORLD_LOCAL_RANK")))
232 local_rank = std::atoi(str);
233 else if ((str = getenv(
"OMPI_COMM_WORLD_LOCAL_RANK")))
234 local_rank = std::atoi(str);
235 device_id = local_rank % ngpus;
238 int num_device; cudaGetDeviceCount(&num_device);
240 device_id >= num_device, std::logic_error,
241 "Invalid device ID " << device_id <<
". You probably are trying" <<
242 " to run with too many GPUs per node");
245 Kokkos::InitializationSettings init_args;
246 init_args.set_device_id(device_id);
247 Kokkos::initialize( init_args );
249 cudaDeviceProp deviceProp;
250 cudaGetDeviceProperties(&deviceProp, device_id);
251 if (comm->getRank() == 0)
252 std::cout << std::endl
253 <<
"CUDA performance performance with " << comm->getSize()
254 <<
" MPI ranks and device " << device_id <<
" ("
255 << deviceProp.name <<
"):"
260 cuda_threads_per_vector,
261 cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);
263 mainCuda<Storage,Method>(comm, print, nIter, use_nodes, check,
Stokhos::StandardStorage< int, double > Storage
void mainHost(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
Statically allocated storage class.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
void mainCuda(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
static Teuchos::RCP< const Comm< OrdinalType > > getComm()
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
int main(int argc, char **argv)
void setDocString(const char doc_string[])