Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MPAssembly/TestAssembly.cpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #include <iostream>
11 
12 // Tests
13 #include "TestAssembly.hpp"
14 
15 // Devices
16 #include "Kokkos_Core.hpp"
17 
18 // Utilities
19 #include "Teuchos_DefaultComm.hpp"
22 #ifdef KOKKOS_ENABLE_CUDA
23 #include "cuda_runtime_api.h"
24 #endif
25 
26 template <typename Storage>
27 void mainHost(const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
28  const int use_print ,
29  const int use_trials ,
30  const int use_nodes[] ,
31  const bool check ,
33 #ifdef __MIC__
34  const int entry_min = 8;
35  const int entry_max = 48;
36  const int entry_step = 8;
37 #else
38  const int entry_min = 4;
39  const int entry_max = 32;
40  const int entry_step = 4;
41  // const int entry_min = 16;
42  // const int entry_max = 16;
43  // const int entry_step = 16;
44 #endif
45 
46  performance_test_driver<Storage,entry_min,entry_max,entry_step>(
47  comm, use_print, use_trials, use_nodes, check, dev_config);
48 }
49 
50 template <typename Storage>
51 void mainCuda(const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
52  const int use_print ,
53  const int use_trials ,
54  const int use_nodes[] ,
55  const bool check ,
57  const int entry_min = 16;
58  const int entry_max = 64;
59  const int entry_step = 16;
60  performance_test_driver<Storage,entry_min,entry_max,entry_step>(
61  comm, use_print, use_trials, use_nodes, check, dev_config);
62 }
63 
64 int main(int argc, char *argv[])
65 {
66  bool success = true;
67  bool verbose = false;
68  try {
69 
70  Teuchos::oblackholestream blackHole;
71  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackHole);
72 
75 
76  const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
77  const size_t num_cores_per_socket =
78  Kokkos::hwloc::get_available_cores_per_numa();
79  const size_t num_threads_per_core =
80  Kokkos::hwloc::get_available_threads_per_core();
81 
82  // Setup command line options
84  CLP.setDocString(
85  "This test performance of MP::Vector FEM assembly.\n");
86  int nGrid = 32;
87  CLP.setOption("n", &nGrid, "Number of mesh points in the each direction");
88  int nIter = 10;
89  CLP.setOption("ni", &nIter, "Number of assembly iterations");
90  bool print = false;
91  CLP.setOption("print", "no-print", &print, "Print debugging output");
92  bool check = false;
93  int num_cores = num_cores_per_socket * num_sockets;
94  CLP.setOption("cores", &num_cores,
95  "Number of CPU cores to use (defaults to all)");
96  int num_hyper_threads = num_threads_per_core;
97  CLP.setOption("hyperthreads", &num_hyper_threads,
98  "Number of hyper threads per core to use (defaults to all)");
99  int threads_per_vector = 1;
100  CLP.setOption("threads_per_vector", &threads_per_vector,
101  "Number of threads to use within each vector");
102  CLP.setOption("check", "no-check", &check, "Check correctness");
103 #ifdef KOKKOS_ENABLE_SERIAL
104  bool serial = true;
105  CLP.setOption("serial", "no-serial", &serial, "Enable Serial device");
106 #endif
107 #ifdef KOKKOS_ENABLE_THREADS
108  bool threads = true;
109  CLP.setOption("threads", "no-threads", &threads, "Enable Threads device");
110 #endif
111 #ifdef KOKKOS_ENABLE_OPENMP
112  bool openmp = true;
113  CLP.setOption("openmp", "no-openmp", &openmp, "Enable OpenMP device");
114 #endif
115 #ifdef KOKKOS_ENABLE_CUDA
116  bool cuda = true;
117  CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device");
118  int cuda_threads_per_vector = 16;
119  CLP.setOption("cuda_threads_per_vector", &cuda_threads_per_vector,
120  "Number of Cuda threads to use within each vector");
121  int cuda_block_size = 256;
122  CLP.setOption("cuda_block_size", &cuda_block_size,
123  "Cuda block size");
124  int num_cuda_blocks = 0;
125  CLP.setOption("num_cuda_blocks", &num_cuda_blocks,
126  "Number of Cuda blocks (0 implies the default choice)");
127  int device_id = -1;
128  CLP.setOption("device", &device_id, "CUDA device ID. Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus");
129  int ngpus = 1;
130  CLP.setOption("ngpus", &ngpus, "Number of GPUs per node for multi-GPU runs via MPI");
131 #endif
132  CLP.parse( argc, argv );
133 
134  int use_nodes[3];
135  use_nodes[0] = nGrid; use_nodes[1] = nGrid; use_nodes[2] = nGrid;
136 
137  typedef int Ordinal;
138  typedef double Scalar;
139 
140 #ifdef KOKKOS_ENABLE_SERIAL
141  if (serial) {
142  typedef Kokkos::Serial Device;
144 
145  Kokkos::initialize();
146 
147  if (comm->getRank() == 0)
148  std::cout << std::endl
149  << "Serial performance with " << comm->getSize()
150  << " MPI ranks" << std::endl;
151 
152  Kokkos::Example::FENL::DeviceConfig dev_config(1, 1, 1);
153 
154  mainHost<Storage>(comm, print, nIter, use_nodes, check,
155  dev_config);
156 
157  Kokkos::finalize();
158  }
159 #endif
160 
161 #ifdef KOKKOS_ENABLE_THREADS
162  if (threads) {
163  typedef Kokkos::Threads Device;
165 
166  Kokkos::InitializationSettings init_args;
167  init_args.set_num_threads(num_cores*num_hyper_threads);
168  Kokkos::initialize( init_args );
169 
170  if (comm->getRank() == 0)
171  std::cout << std::endl
172  << "Threads performance with " << comm->getSize()
173  << " MPI ranks and " << num_cores*num_hyper_threads
174  << " threads per rank:" << std::endl;
175 
176  Kokkos::Example::FENL::DeviceConfig dev_config(num_cores,
177  threads_per_vector,
178  num_hyper_threads / threads_per_vector);
179 
180  mainHost<Storage>(comm, print, nIter, use_nodes, check,
181  dev_config);
182 
183  Kokkos::finalize();
184  }
185 #endif
186 
187 #ifdef KOKKOS_ENABLE_OPENMP
188  if (openmp) {
189  typedef Kokkos::OpenMP Device;
191 
192  Kokkos::InitializationSettings init_args;
193  init_args.set_num_threads(num_cores*num_hyper_threads);
194  Kokkos::initialize( init_args );
195 
196  if (comm->getRank() == 0)
197  std::cout << std::endl
198  << "OpenMP performance with " << comm->getSize()
199  << " MPI ranks and " << num_cores*num_hyper_threads
200  << " threads per rank:" << std::endl;
201 
202  Kokkos::Example::FENL::DeviceConfig dev_config(num_cores,
203  threads_per_vector,
204  num_hyper_threads / threads_per_vector);
205 
206  mainHost<Storage>(comm, print, nIter, use_nodes, check,
207  dev_config);
208 
209  Kokkos::finalize();
210  }
211 #endif
212 
213 #ifdef KOKKOS_ENABLE_CUDA
214  if (cuda) {
215  typedef Kokkos::Cuda Device;
217 
218  if (device_id == -1) {
219  int local_rank = 0;
220  char *str;
221  if ((str = std::getenv("SLURM_LOCALID")))
222  local_rank = std::atoi(str);
223  else if ((str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK")))
224  local_rank = std::atoi(str);
225  else if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")))
226  local_rank = std::atoi(str);
227  device_id = local_rank % ngpus;
228 
229  // Check device is valid
230  int num_device; cudaGetDeviceCount(&num_device);
232  device_id >= num_device, std::logic_error,
233  "Invalid device ID " << device_id << ". You probably are trying" <<
234  " to run with too many GPUs per node");
235  }
236 
237  Kokkos::InitializationSettings init_args;
238  init_args.set_device_id(device_id);
239  Kokkos::initialize( init_args );
240 
241  cudaDeviceProp deviceProp;
242  cudaGetDeviceProperties(&deviceProp, device_id);
243  if (comm->getRank() == 0)
244  std::cout << std::endl
245  << "CUDA performance performance with " << comm->getSize()
246  << " MPI ranks and device " << device_id << " ("
247  << deviceProp.name << "):"
248  << std::endl;
249 
251  num_cuda_blocks,
252  cuda_threads_per_vector,
253  cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);
254 
255  mainCuda<Storage>(comm, print, nIter, use_nodes, check,
256  dev_config);
257 
258  Kokkos::finalize();
259  }
260 #endif
261 
262  }
263  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
264 
265  if (success)
266  return 0;
267  return -1;
268 }
Stokhos::StandardStorage< int, double > Storage
void mainHost(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
Statically allocated storage class.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
void mainCuda(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
static Teuchos::RCP< const Comm< OrdinalType > > getComm()
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
int main(int argc, char **argv)
void setDocString(const char doc_string[])