Stokhos Package Browser (Single Doxygen Collection)  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
FadMPAssembly/TestAssembly.cpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Stokhos Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Eric T. Phipps (etphipp@sandia.gov).
38 //
39 // ***********************************************************************
40 // @HEADER
41 
42 #include <iostream>
43 
44 // Tests
45 #include "TestAssembly.hpp"
46 
47 // Devices
48 #include "Kokkos_Core.hpp"
49 
50 // Utilities
51 #include "Teuchos_DefaultComm.hpp"
54 #ifdef KOKKOS_ENABLE_CUDA
55 #include "cuda_runtime_api.h"
56 #endif
57 
// Host-side driver: picks the compile-time entry range and forwards every
// argument unchanged to performance_test_driver.
//
// The range is (entry_min, entry_max, entry_step) = (8, 48, 8) when __MIC__
// is defined and (4, 32, 4) otherwise; the three values are passed as
// template arguments together with Storage and Method.
//
// NOTE(review): this listing is missing original lines 59 and 65 (the
// embedded numbers jump 58 -> 60 and 64 -> 66), i.e. the remainder of the
// template parameter list and the final parameter / opening brace. `Method`
// and `dev_config` are used below but declared on those missing lines --
// confirm against the real source file.
58 template <typename Storage,
60 void mainHost(const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
61  const int use_print ,
62  const int use_trials ,
63  const int use_nodes[] ,
64  const bool check ,
66 #ifdef __MIC__
67  const int entry_min = 8;
68  const int entry_max = 48;
69  const int entry_step = 8;
70 #else
71  const int entry_min = 4;
72  const int entry_max = 32;
73  const int entry_step = 4;
   // Alternative single-entry range, left disabled by the original author.
74  // const int entry_min = 1;
75  // const int entry_max = 1;
76  // const int entry_step = 1;
77 #endif
78 
   // The entry_* constants must be compile-time constants because they are
   // used as template arguments here.
79  performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
80  comm, use_print, use_trials, use_nodes, check, dev_config);
81 }
82 
// CUDA-side driver: same shape as the host driver, but with the fixed entry
// range (entry_min, entry_max, entry_step) = (16, 64, 16). All runtime
// arguments are forwarded unchanged to performance_test_driver.
//
// NOTE(review): this listing is missing original lines 84 and 90 (the
// embedded numbers jump 83 -> 85 and 89 -> 91), i.e. the remainder of the
// template parameter list and the final parameter / opening brace. `Method`
// and `dev_config` are used below but declared on those missing lines --
// confirm against the real source file.
83 template <typename Storage,
85 void mainCuda(const Teuchos::RCP<const Teuchos::Comm<int> >& comm ,
86  const int use_print ,
87  const int use_trials ,
88  const int use_nodes[] ,
89  const bool check ,
91  const int entry_min = 16;
92  const int entry_max = 64;
93  const int entry_step = 16;
94  performance_test_driver<Storage,entry_min,entry_max,entry_step,Method>(
95  comm, use_print, use_trials, use_nodes, check, dev_config);
96 }
97 
// Program entry point for the assembly performance test.
//
// Visible flow: construct a Teuchos::GlobalMPISession, read the hardware
// topology from Kokkos::hwloc, register and parse command-line options on
// CLP, then, for each execution space that is compiled in (KOKKOS_ENABLE_*)
// and whose enable flag is true: call Kokkos::initialize, print a banner on
// rank 0, build a DeviceConfig, run mainHost / mainCuda, and call
// Kokkos::finalize.
//
// Returns 0 when `success` is true after the try/catch, -1 otherwise.
//
// NOTE(review): this listing is missing several original lines (the embedded
// numbers skip 107-108, 117, 173-174, 181, 204, 230, 256, 271 and 290). The
// declarations of `comm`, `CLP`, `Storage`, `Method` and the CUDA-branch
// `dev_config`, and the opening of the device-count check, are therefore not
// visible here -- consult the real source file before changing this function.
98 int main(int argc, char *argv[])
99 {
100  bool success = true;
   // `verbose` is never modified in the visible code; it is only handed to
   // the catch macro at the end.
101  bool verbose = false;
102  try {
103 
104  Teuchos::oblackholestream blackHole;
105  Teuchos::GlobalMPISession mpiSession (&argc, &argv, &blackHole);
106 
109 
   // Hardware topology as reported by Kokkos::hwloc; used below as the
   // default values of the "cores" and "hyperthreads" options.
110  const size_t num_sockets = Kokkos::hwloc::get_available_numa_count();
111  const size_t num_cores_per_socket =
112  Kokkos::hwloc::get_available_cores_per_numa();
113  const size_t num_threads_per_core =
114  Kokkos::hwloc::get_available_threads_per_core();
115 
116  // Setup command line options
118  CLP.setDocString(
119  "This test performance of MP::Vector FEM assembly.\n");
120  int nGrid = 32;
121  CLP.setOption("n", &nGrid, "Number of mesh points in the each direction");
122  int nIter = 10;
123  CLP.setOption("ni", &nIter, "Number of assembly iterations");
124  bool print = false;
125  CLP.setOption("print", "no-print", &print, "Print debugging output");
   // `check` is declared here but only registered as an option further down
   // (original line 136).
126  bool check = false;
   // The hwloc counts are size_t; they are narrowed to int here because the
   // option variables are int.
127  int num_cores = num_cores_per_socket * num_sockets;
128  CLP.setOption("cores", &num_cores,
129  "Number of CPU cores to use (defaults to all)");
130  int num_hyper_threads = num_threads_per_core;
131  CLP.setOption("hyperthreads", &num_hyper_threads,
132  "Number of hyper threads per core to use (defaults to all)");
133  int threads_per_vector = 1;
134  CLP.setOption("threads_per_vector", &threads_per_vector,
135  "Number of threads to use within each vector");
136  CLP.setOption("check", "no-check", &check, "Check correctness");
   // Each device flag below exists only when the matching KOKKOS_ENABLE_*
   // macro is defined, and defaults to true.
137 #ifdef KOKKOS_ENABLE_SERIAL
138  bool serial = true;
139  CLP.setOption("serial", "no-serial", &serial, "Enable Serial device");
140 #endif
141 #ifdef KOKKOS_ENABLE_THREADS
142  bool threads = true;
143  CLP.setOption("threads", "no-threads", &threads, "Enable Threads device");
144 #endif
145 #ifdef KOKKOS_ENABLE_OPENMP
146  bool openmp = true;
147  CLP.setOption("openmp", "no-openmp", &openmp, "Enable OpenMP device");
148 #endif
149 #ifdef KOKKOS_ENABLE_CUDA
150  bool cuda = true;
151  CLP.setOption("cuda", "no-cuda", &cuda, "Enable Cuda device");
152  int cuda_threads_per_vector = 16;
153  CLP.setOption("cuda_threads_per_vector", &cuda_threads_per_vector,
154  "Number of Cuda threads to use within each vector");
155  int cuda_block_size = 256;
156  CLP.setOption("cuda_block_size", &cuda_block_size,
157  "Cuda block size");
158  int num_cuda_blocks = 0;
159  CLP.setOption("num_cuda_blocks", &num_cuda_blocks,
160  "Number of Cuda blocks (0 implies the default choice)");
161  int device_id = -1;
162  CLP.setOption("device", &device_id, "CUDA device ID. Set to default of -1 to use the default device as determined by the local node MPI rank and --ngpus");
163  int ngpus = 1;
164  CLP.setOption("ngpus", &ngpus, "Number of GPUs per node for multi-GPU runs via MPI");
165 #endif
   // NOTE(review): the value returned by parse() is not inspected here.
166  CLP.parse( argc, argv );
167 
   // The same grid size is used in all three directions.
168  int use_nodes[3];
169  use_nodes[0] = nGrid; use_nodes[1] = nGrid; use_nodes[2] = nGrid;
170 
171  typedef int Ordinal;
172  typedef double Scalar;
   // Original lines 173-174 are missing from this listing; `Method`, used in
   // every branch below, is presumably defined there -- confirm.
175  // const Kokkos::Example::FENL::AssemblyMethod Method =
176  // Kokkos::Example::FENL::Analytic;
177 
178 #ifdef KOKKOS_ENABLE_SERIAL
179  if (serial) {
180  typedef Kokkos::Serial Device;
182 
   // NOTE(review): num_threads is set from the cores/hyperthreads options
   // even in the Serial branch, while the DeviceConfig below is fixed at
   // (1, 1, 1).
183  Kokkos::InitArguments init_args;
184  init_args.num_threads = num_cores*num_hyper_threads;
185  Kokkos::initialize( init_args );
186 
   // Only rank 0 prints the banner.
187  if (comm->getRank() == 0)
188  std::cout << std::endl
189  << "Serial performance with " << comm->getSize()
190  << " MPI ranks" << std::endl;
191 
192  Kokkos::Example::FENL::DeviceConfig dev_config(1, 1, 1);
193 
194  mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
195  dev_config);
196 
197  Kokkos::finalize();
198  }
199 #endif
200 
201 #ifdef KOKKOS_ENABLE_THREADS
202  if (threads) {
203  typedef Kokkos::Threads Device;
205 
206  Kokkos::InitArguments init_args;
207  init_args.num_threads = num_cores*num_hyper_threads;
208  Kokkos::initialize( init_args );
209 
210  if (comm->getRank() == 0)
211  std::cout << std::endl
212  << "Threads performance with " << comm->getSize()
213  << " MPI ranks and " << num_cores*num_hyper_threads
214  << " threads per rank:" << std::endl;
215 
   // NOTE(review): integer division; threads_per_vector comes from the
   // command line and a value of 0 is not guarded here (the CUDA branch
   // below does guard its divisor).
216  Kokkos::Example::FENL::DeviceConfig dev_config(num_cores,
217  threads_per_vector,
218  num_hyper_threads / threads_per_vector);
219 
220  mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
221  dev_config);
222 
223  Kokkos::finalize();
224  }
225 #endif
226 
227 #ifdef KOKKOS_ENABLE_OPENMP
228  if (openmp) {
229  typedef Kokkos::OpenMP Device;
231 
232  Kokkos::InitArguments init_args;
233  init_args.num_threads = num_cores*num_hyper_threads;
234  Kokkos::initialize( init_args );
235 
236  if (comm->getRank() == 0)
237  std::cout << std::endl
238  << "OpenMP performance with " << comm->getSize()
239  << " MPI ranks and " << num_cores*num_hyper_threads
240  << " threads per rank:" << std::endl;
241 
   // NOTE(review): same unguarded integer division by threads_per_vector as
   // in the Threads branch.
242  Kokkos::Example::FENL::DeviceConfig dev_config(num_cores,
243  threads_per_vector,
244  num_hyper_threads / threads_per_vector);
245 
246  mainHost<Storage,Method>(comm, print, nIter, use_nodes, check,
247  dev_config);
248 
249  Kokkos::finalize();
250  }
251 #endif
252 
253 #ifdef KOKKOS_ENABLE_CUDA
254  if (cuda) {
255  typedef Kokkos::Cuda Device;
257 
   // No explicit device given: take a local rank from the first of these
   // environment variables that is set (they look like the SLURM, MVAPICH2
   // and Open MPI local-rank variables -- confirm), falling back to 0, and
   // map it onto ngpus devices.
   // NOTE(review): the third lookup calls unqualified getenv while the first
   // two use std::getenv.
258  if (device_id == -1) {
259  int local_rank = 0;
260  char *str;
261  if ((str = std::getenv("SLURM_LOCALID")))
262  local_rank = std::atoi(str);
263  else if ((str = std::getenv("MV2_COMM_WORLD_LOCAL_RANK")))
264  local_rank = std::atoi(str);
265  else if ((str = getenv("OMPI_COMM_WORLD_LOCAL_RANK")))
266  local_rank = std::atoi(str);
   // NOTE(review): ngpus is user-supplied; ngpus == 0 would make this a
   // modulo by zero.
267  device_id = local_rank % ngpus;
268 
269  // Check device is valid
   // NOTE(review): the status returned by cudaGetDeviceCount is ignored.
   // Original line 271, which opens the check below, is missing from this
   // listing (presumably a TEUCHOS_TEST_FOR_EXCEPTION( call -- confirm).
270  int num_device; cudaGetDeviceCount(&num_device);
272  device_id >= num_device, std::logic_error,
273  "Invalid device ID " << device_id << ". You probably are trying" <<
274  " to run with too many GPUs per node");
275  }
276 
277  Kokkos::InitArguments init_args;
278  init_args.device_id = device_id;
279  Kokkos::initialize( init_args );
280 
   // Device properties are fetched only to print the device name in the
   // banner.
   // NOTE(review): the banner text reads "performance performance" -- likely
   // an unintended repeated word; left unchanged here.
281  cudaDeviceProp deviceProp;
282  cudaGetDeviceProperties(&deviceProp, device_id);
283  if (comm->getRank() == 0)
284  std::cout << std::endl
285  << "CUDA performance performance with " << comm->getSize()
286  << " MPI ranks and device " << device_id << " ("
287  << deviceProp.name << "):"
288  << std::endl;
289 
   // Original line 290 (the start of the dev_config construction) is missing
   // from this listing. The last argument is 0 when cuda_threads_per_vector
   // is 0, which avoids a division by zero.
291  num_cuda_blocks,
292  cuda_threads_per_vector,
293  cuda_threads_per_vector == 0 ? 0 : cuda_block_size / cuda_threads_per_vector);
294 
295  mainCuda<Storage,Method>(comm, print, nIter, use_nodes, check,
296  dev_config);
297 
298  Kokkos::finalize();
299  }
300 #endif
301 
302  }
   // Teuchos-provided macro closing the try block; presumably catches
   // exceptions, reports them to std::cerr and clears `success` -- confirm
   // against the Teuchos documentation.
303  TEUCHOS_STANDARD_CATCH_STATEMENTS(verbose, std::cerr, success);
304 
   // Exit status: 0 on success, -1 otherwise.
305  if (success)
306  return 0;
307  return -1;
308 }
Stokhos::StandardStorage< int, double > Storage
void mainHost(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
Statically allocated storage class.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
void mainCuda(const Teuchos::RCP< const Teuchos::Comm< int > > &comm, const int use_print, const int use_trials, const int use_nodes[], const bool check, Kokkos::Example::FENL::DeviceConfig dev_config)
static Teuchos::RCP< const Comm< OrdinalType > > getComm()
int check(Epetra_CrsGraph &A, int NumMyRows1, int NumGlobalRows1, int NumMyNonzeros1, int NumGlobalNonzeros1, int *MyGlobalElements, bool verbose)
void setOption(const char option_true[], const char option_false[], bool *option_val, const char documentation[]=NULL)
#define TEUCHOS_STANDARD_CATCH_STATEMENTS(VERBOSE, ERR_STREAM, SUCCESS_FLAG)
EParseCommandLineReturn parse(int argc, char *argv[], std::ostream *errout=&std::cerr) const
int main(int argc, char **argv)
void setDocString(const char doc_string[])