Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
FadMPAssembly/TestAssembly.cpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Stokhos Package
4 //
5 // Copyright 2009 NTESS and the Stokhos contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #include <iostream>
11 
12 // Tests
13 #include "TestAssembly.hpp"
14 
15 // Devices
16 #include "Kokkos_Core.hpp"
17 
18 // Utilities
19 #include "Teuchos_DefaultComm.hpp"
22 #ifdef KOKKOS_ENABLE_CUDA
23 #include "cuda_runtime_api.h"
24 #endif
25 
// mainHost: driver used for the host back ends (called from the Serial,
// Threads and OpenMP branches of main). It fixes three compile-time integers
// (entry_min, entry_max, entry_step) and forwards every runtime argument
// unchanged to performance_test_driver, which is instantiated with Storage,
// those three constants and Method.
//
// NOTE(review): this listing skips original lines 27 and 33 (the embedded
// numbers jump 26 -> 28 and 32 -> 34). The second template parameter (used
// below as Method), the last function parameter (used below as dev_config)
// and the opening brace are presumably on those lines -- confirm against the
// real source file before editing.
26 template <typename Storage,
28 void mainHost(const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
29  const int use_print ,
30  const int use_trials ,
31  const int use_nodes[] ,
32  const bool check ,
// The constants are selected at preprocessing time: 8/48/8 when __MIC__ is
// defined, 4/32/4 otherwise. They are passed as template arguments, so they
// must remain compile-time constants.
// Presumably they describe the range of sizes the driver sweeps over
// (min, max, step) -- TODO confirm in TestAssembly.hpp.
34 #ifdef __MIC__
35  const int entry_min = 8;
36  const int entry_max = 48;
37  const int entry_step = 8;
38 #else
39  const int entry_min = 4;
40  const int entry_max = 32;
41  const int entry_step = 4;
42  // const int entry_min = 1;
43  // const int entry_max = 1;
44  // const int entry_step = 1;
45 #endif
46 
// All work happens inside the driver; nothing is returned from here.
47  performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
48  comm, use_print, use_trials, use_nodes, check, dev_config);
49 }
50 
// mainCuda: driver used for the CUDA back end (called from the cuda branch of
// main). Same shape as mainHost, but with the compile-time constants fixed to
// 16/64/16 before forwarding every runtime argument unchanged to
// performance_test_driver.
//
// NOTE(review): this listing skips original lines 52 and 58 (the embedded
// numbers jump 51 -> 53 and 57 -> 59). The second template parameter (used
// below as Method), the last function parameter (used below as dev_config)
// and the opening brace are presumably on those lines -- confirm against the
// real source file before editing.
51 template <typename Storage,
53 void mainCuda(const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
54  const int use_print ,
55  const int use_trials ,
56  const int use_nodes[] ,
57  const bool check ,
// Passed as template arguments below, so these must stay compile-time
// constants.
59  const int entry_min = 16;
60  const int entry_max = 64;
61  const int entry_step = 16;
62  performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
63  comm, use_print, use_trials, use_nodes, check, dev_config);
64 }
65 
// Program entry point. Sets up MPI, registers and parses the command line
// options, then, for every Kokkos back end that is both compiled in
// (KOKKOS_ENABLE_*) and selected on the command line, initializes Kokkos,
// prints a banner on rank 0, runs the matching driver (mainHost or mainCuda)
// and finalizes Kokkos again. Returns 0 when no exception was caught by
// TEUCHOS_STANDARD_CATCH_STATEMENTS and -1 otherwise.
//
// NOTE(review): this listing is incomplete. The embedded line numbers skip
// 75-76, 85, 141-142, 149, 172, 198, 224, 239 and 258, so several names used
// below (comm, CLP, Method, Storage, the CUDA dev_config) have no visible
// declaration. Consult the real source file before changing anything here.
66 int main(int argc, char *argv[])
67 {
68  bool success = true;
69  bool verbose = false;
70  try {
71 
// The session object is handed the blackHole stream as its third argument.
72  Teuchos::oblackholestream blackHole;
73  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackHole);
74 
// NOTE(review): listing lines 75-76 are missing here; comm, used by every
// branch below, is presumably obtained on those lines -- confirm.
77 
// Hardware topology as reported by Kokkos::hwloc. In the visible code these
// values are only used to seed the defaults of the cores and hyperthreads
// options.
78  const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
79  const size_t num_cores_per_socket =
80  Kokkos::hwloc::get_available_cores_per_numa();
81  const size_t num_threads_per_core =
82  Kokkos::hwloc::get_available_threads_per_core();
83 
// NOTE(review): listing line 85 is missing; CLP is presumably declared there.
// NOTE(review): the doc string and the help text of option n below read
// awkwardly (This test performance..., in the each direction); they are
// user-facing strings and are left untouched here.
84  // Setup command line options
86  CLP.setDocString(
87  "This test performance of MP::Vector FEM assembly.\n");
88  int nGrid = 32;
89  CLP.setOption("n", &nGrid, "Number of mesh points in the each direction");
90  int nIter = 10;
91  CLP.setOption("ni", &nIter, "Number of assembly iterations");
92  bool print = false;
93  CLP.setOption("print", "no-print", &print, "Print debugging output");
94  bool check = false;
// The size_t product is converted to int here.
95  int num_cores = num_cores_per_socket * num_sockets;
96  CLP.setOption("cores", &num_cores,
97  "Number of CPU cores to use (defaults to all)");
98  int num_hyper_threads = num_threads_per_core;
99  CLP.setOption("hyperthreads", &num_hyper_threads,
100  "Number of hyper threads per core to use (defaults to all)");
101  int threads_per_vector = 1;
102  CLP.setOption("threads_per_vector", &threads_per_vector,
103  "Number of threads to use within each vector");
104  CLP.setOption("check", "no-check", &check, "Check correctness");
// Each compiled-in back end gets an on/off switch that defaults to on.
105 #ifdef KOKKOS_ENABLE_SERIAL
106  bool serial = true;
107  CLP.setOption("serial", "no-serial", &serial, "Enable Serial device");
108 #endif
109 #ifdef KOKKOS_ENABLE_THREADS
110  bool threads = true;
111  CLP.setOption("threads", "no-threads", &threads, "Enable Threads device");
112 #endif
113 #ifdef KOKKOS_ENABLE_OPENMP
114  bool openmp = true;
115  CLP.setOption("openmp", "no-openmp", &openmp, "Enable OpenMP device");
116 #endif
117 #ifdef KOKKOS_ENABLE_CUDA
118  bool cuda = true;
119  CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device");
120  int cuda_threads_per_vector = 16;
121  CLP.setOption("cuda_threads_per_vector", &cuda_threads_per_vector,
122  "Number of Cuda threads to use within each vector");
123  int cuda_block_size = 256;
124  CLP.setOption("cuda_block_size", &cuda_block_size,
125  "Cuda block size");
126  int num_cuda_blocks = 0;
127  CLP.setOption("num_cuda_blocks", &num_cuda_blocks,
128  "Number of Cuda blocks (0 implies the default choice)");
129  int device_id = -1;
130  CLP.setOption("device", &device_id, "CUDA device ID. Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus");
131  int ngpus = 1;
132  CLP.setOption("ngpus", &ngpus, "Number of GPUs per node for multi-GPU runs via MPI");
133 #endif
// NOTE(review): the value returned by parse is discarded, so a return code
// that does not signal success (for example after printing help, if the
// parser reports that via its return value) would not stop the run --
// confirm the intended behavior.
134  CLP.parse( argc, argv );
135 
// The same point count nGrid is used in all three mesh directions.
136  int use_nodes[3];
137  use_nodes[0] = nGrid; use_nodes[1] = nGrid; use_nodes[2] = nGrid;
138 
// Ordinal and Scalar are not referenced on any visible line; presumably they
// feed the Storage typedefs on the missing lines -- confirm.
139  typedef int Ordinal;
140  typedef double Scalar;
// NOTE(review): listing lines 141-142 are missing; Method, used in every
// driver call below, is presumably defined there. The commented-out lines
// that follow show an alternative definition using the Analytic enumerator.
143  // const Kokkos::Example::FENL::AssemblyMethod Method =
144  // Kokkos::Example::FENL::Analytic;
145 
// NOTE(review): every selected back end below calls Kokkos::initialize and
// Kokkos::finalize in turn. With more than one back end enabled this means
// initialize is called again after finalize -- confirm that the Kokkos
// version in use permits this.
146 #ifdef KOKKOS_ENABLE_SERIAL
147  if (serial) {
// NOTE(review): listing line 149 is missing; Storage is presumably
// typedef'ed there using Device -- confirm.
148  typedef Kokkos::Serial Device;
150 
// The thread count is set for the Serial run as well.
151  Kokkos::InitializationSettings init_args;
152  init_args.set_num_threads(num_cores*num_hyper_threads);
153  Kokkos::initialize( init_args );
154 
// Banner is printed by rank 0 only.
155  if (comm->getRank() == 0)
156  std::cout << std::endl
157  << "Serial performance with " << comm->getSize()
158  << " MPI ranks" << std::endl;
159 
160  Kokkos::Example::FENL::DeviceConfig dev_config(1, 1, 1);
161 
162  mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
163  dev_config);
164 
165  Kokkos::finalize();
166  }
167 #endif
168 
169 #ifdef KOKKOS_ENABLE_THREADS
170  if (threads) {
// NOTE(review): listing line 172 is missing; Storage is presumably
// typedef'ed there using Device -- confirm.
171  typedef Kokkos::Threads Device;
173 
174  Kokkos::InitializationSettings init_args;
175  init_args.set_num_threads(num_cores*num_hyper_threads);
176  Kokkos::initialize( init_args );
177 
178  if (comm->getRank() == 0)
179  std::cout << std::endl
180  << "Threads performance with " << comm->getSize()
181  << " MPI ranks and " << num_cores*num_hyper_threads
182  << " threads per rank:" << std::endl;
183 
// NOTE(review): integer division with no guard against
// threads_per_vector == 0; the CUDA branch guards its equivalent division.
184  Kokkos::Example::FENL::DeviceConfig dev_config(num_cores,
185  threads_per_vector,
186  num_hyper_threads / threads_per_vector);
187 
188  mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
189  dev_config);
190 
191  Kokkos::finalize();
192  }
193 #endif
194 
195 #ifdef KOKKOS_ENABLE_OPENMP
196  if (openmp) {
// NOTE(review): listing line 198 is missing; Storage is presumably
// typedef'ed there using Device -- confirm.
197  typedef Kokkos::OpenMP Device;
199 
200  Kokkos::InitializationSettings init_args;
201  init_args.set_num_threads(num_cores*num_hyper_threads);
202  Kokkos::initialize( init_args );
203 
204  if (comm->getRank() == 0)
205  std::cout << std::endl
206  << "OpenMP performance with " << comm->getSize()
207  << " MPI ranks and " << num_cores*num_hyper_threads
208  << " threads per rank:" << std::endl;
209 
// NOTE(review): same unguarded integer division as in the Threads branch.
210  Kokkos::Example::FENL::DeviceConfig dev_config(num_cores,
211  threads_per_vector,
212  num_hyper_threads / threads_per_vector);
213 
214  mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
215  dev_config);
216 
217  Kokkos::finalize();
218  }
219 #endif
220 
221 #ifdef KOKKOS_ENABLE_CUDA
222  if (cuda) {
// NOTE(review): listing line 224 is missing; Storage is presumably
// typedef'ed there using Device -- confirm.
223  typedef Kokkos::Cuda Device;
225 
// When no device was given on the command line, derive one from the local
// rank: the first of SLURM_LOCALID, MV2_COMM_WORLD_LOCAL_RANK and
// OMPI_COMM_WORLD_LOCAL_RANK that is set wins, otherwise rank 0 is assumed.
// NOTE(review): the third lookup calls getenv unqualified while the other
// two use std::getenv.
// NOTE(review): the modulo below is undefined for --ngpus=0.
226  if (device_id == -1) {
227  int local_rank = 0;
228  char *str;
229  if ((str = std::getenv("SLURM_LOCALID")))
230  local_rank = std::atoi(str);
231  else if ((str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK")))
232  local_rank = std::atoi(str);
233  else if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")))
234  local_rank = std::atoi(str);
235  device_id = local_rank % ngpus;
236 
// NOTE(review): the status returned by cudaGetDeviceCount is ignored and
// num_device has no initializer, so a failed call leaves the comparison
// below working on an indeterminate value.
// NOTE(review): listing line 239 is missing; the three lines after the count
// query look like the arguments of an exception-throwing macro opened on
// that line -- confirm. This validation only runs when the device was
// derived here, not when it was passed explicitly via the device option.
237  // Check device is valid
238  int num_device; cudaGetDeviceCount(&num_device);
240  device_id >= num_device, std::logic_error,
241  "Invalid device ID " << device_id << ". You probably are trying" <<
242  " to run with too many GPUs per node");
243  }
244 
245  Kokkos::InitializationSettings init_args;
246  init_args.set_device_id(device_id);
247  Kokkos::initialize( init_args );
248 
// The device properties are fetched only for the device name in the banner.
// NOTE(review): the banner text repeats the word performance; it is a
// user-facing string and is left untouched here.
249  cudaDeviceProp deviceProp;
250  cudaGetDeviceProperties(&deviceProp, device_id);
251  if (comm->getRank() == 0)
252  std::cout << std::endl
253  << "CUDA performance performance with " << comm->getSize()
254  << " MPI ranks and device " << device_id << " ("
255  << deviceProp.name << "):"
256  << std::endl;
257 
// NOTE(review): listing line 258 is missing; the next three lines are
// presumably the constructor arguments of the dev_config declared there.
// The last argument guards the division against
// cuda_threads_per_vector == 0.
259  num_cuda_blocks,
260  cuda_threads_per_vector,
261  cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);
262 
263  mainCuda<Storage,Method>(comm, print, nIter, use_nodes, check,
264  dev_config);
265 
266  Kokkos::finalize();
267  }
268 #endif
269 
270  }
// verbose, std::cerr and success are handed to the catch macro; success is
// the only one read afterwards.
271  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
272 
273  if (success)
274  return 0;
275  return -1;
276 }
Stokhos::StandardStorage< int, double > Storage
void mainHost(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
Statically allocated storage class.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
void mainCuda(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
static Teuchos::RCP< const Comm< OrdinalType > > getComm()
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
int main(int argc, char **argv)
void setDocString(const char doc_string[])