MueLu_PerfModels_def.hpp
// @HEADER
// *****************************************************************************
// MueLu: A package for multigrid based preconditioning
//
// Copyright 2012 NTESS and the MueLu contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#include "MueLu_PerfModels_decl.hpp"

#include <cstdio>
#include <cmath>
#include <numeric>
#include <utility>
#include <chrono>
#include <iomanip>
#include <Teuchos_ScalarTraits.hpp>
#if KOKKOS_VERSION >= 40799
#include <KokkosKernels_ArithTraits.hpp>
#else
#include <Kokkos_ArithTraits.hpp>
#endif
#include <Xpetra_Import.hpp>
#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
#include <Xpetra_TpetraImport.hpp>
#include <Tpetra_Import.hpp>
#include <Tpetra_Distributor.hpp>
#include <mpi.h>
#endif

#ifdef HAVE_MPI
#include <mpi.h>
#endif

namespace MueLu {

namespace PerfDetails {
template <class Scalar, class Node>
double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE) {
  // PerfDetails' STREAM routines need to be instantiated on impl_scalar_type, not Scalar
#if KOKKOS_VERSION >= 40799
  using impl_scalar_type = typename KokkosKernels::ArithTraits<Scalar>::val_type;
#else
  using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
#endif

  using exec_space = typename Node::execution_space;
  using memory_space = typename Node::memory_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> c("c", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE * (double)i;
        b(i) = a(i);
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;

  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "stream/add", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {  // Vector addition
          c(j) = a(j) + b(j);
        });

    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
}
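
// Illustrative note (added for exposition, not in the original source): the value
// returned above is the average seconds per "stream/add" launch. The add kernel
// touches three Scalars per element (two reads, one write), so for N = 2^20
// doubles one launch moves 3 * 2^20 * 8 = 25,165,824 bytes; at 1.0e-3 s per
// launch that is roughly 23.4 GB/s using the 1024^3-byte GB defined below.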

template <class Scalar, class Node>
double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE) {
  // PerfDetails' STREAM routines need to be instantiated on impl_scalar_type, not Scalar
#if KOKKOS_VERSION >= 40799
  using impl_scalar_type = typename KokkosKernels::ArithTraits<Scalar>::val_type;
#else
  using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;
#endif

  using exec_space = typename Node::execution_space;
  using memory_space = typename Node::memory_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE;
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;
  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "stream/copy", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {  // Vector copy
          b(j) = a(j);
        });

    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
}

double table_lookup(const std::vector<int> &x, const std::vector<double> &y, int value) {
  // If there's no table, return NaN
  if (x.size() == 0) return Teuchos::ScalarTraits<double>::nan();

  // NOTE: This should probably be a binary search, but this isn't performance sensitive, so we'll go simple
  int N = (int)x.size();
  int hi = 0;
  for (; hi < N; hi++) {
    if (x[hi] > value)
      break;
  }

  if (hi == 0) {
    // Lower end (return the min time)
    // printf("Lower end: %d < %d\n",value,x[0]);
    return y[0];
  } else if (hi == N) {
    // Upper end (extrapolate from the last two points)
    // printf("Upper end: %d > %d\n",value,x[N-1]);
    hi = N - 1;
    int run = x[hi] - x[hi - 1];
    double rise = y[hi] - y[hi - 1];
    double slope = rise / run;
    int diff = value - x[hi - 1];

    return y[hi - 1] + slope * diff;
  } else {
    // Interpolate between the two bracketing points
    // printf("Middle: %d < %d < %d\n",x[hi-1],value,x[hi]);
    int run = x[hi] - x[hi - 1];
    double rise = y[hi] - y[hi - 1];
    double slope = rise / run;
    int diff = value - x[hi - 1];

    return y[hi - 1] + slope * diff;
  }
}
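
// Worked example (hypothetical table, added for exposition): with x = {1, 2, 4}
// and y = {1.0, 1.5, 2.5},
//   table_lookup(x, y, 0) = y[0] = 1.0                  (below the table),
//   table_lookup(x, y, 3) = 1.5 + 0.5 * (3 - 2) = 2.0   (interpolation),
//   table_lookup(x, y, 8) = 1.5 + 0.5 * (8 - 2) = 4.5   (extrapolation from the last two points).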

// Report bandwidth in GB / sec
const double GB = 1024.0 * 1024.0 * 1024.0;
double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes) {
  double time_per_call = time / num_calls;
  return memory_per_call_bytes / GB / time_per_call;
}
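
// Worked example (hypothetical numbers, added for exposition): a kernel timed at
// 0.5 s total over 2 calls, moving 2 GB (2 * 1024^3 bytes) per call, gives
// convert_time_to_bandwidth_gbs(0.5, 2, 2.0 * GB) = 2.0 / 0.25 = 8 GB/s.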

template <class exec_space, class memory_space>
void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm<int> &comm, std::vector<int> &sizes, std::vector<double> &times) {
#ifdef HAVE_MPI
  int rank = comm.getRank();
  int nproc = comm.getSize();

  if (nproc < 2) return;

  const int buff_size = (int)pow(2, MAX_SIZE);

  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  // Allocate memory for the buffers (and fill send)
  Kokkos::View<char *, memory_space> r_buf("recv", buff_size), s_buf("send", buff_size);
  Kokkos::deep_copy(s_buf, 1);

  // Send and receive.
  // NOTE: Do consecutive pair buddies here for simplicity. We should be smart later
  int odd = rank % 2;
  int buddy = odd ? rank - 1 : rank + 1;

  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);
    comm.barrier();

    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      if (buddy < nproc) {
        if (odd) {
          comm.send(msg_size, (char *)s_buf.data(), buddy);
          comm.receive(buddy, msg_size, (char *)r_buf.data());
        } else {
          comm.receive(buddy, msg_size, (char *)r_buf.data());
          comm.send(msg_size, (char *)s_buf.data(), buddy);
        }
      }
    }

    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i] = msg_size;
    times[i] = time_per_call;
  }
#else
  return;
#endif
}
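
// Illustrative note (added for exposition): with the consecutive-pair scheme
// above, 4 ranks form the pairs (0,1) and (2,3). With an odd rank count the last
// (even) rank gets buddy == nproc, fails the (buddy < nproc) test, and sits out,
// so it records near-zero times; the division by 2 * KERNEL_REPEATS converts each
// timed round trip into a one-way time per message.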

template <class exec_space, class memory_space, class LocalOrdinal, class GlobalOrdinal, class Node>
void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import, std::vector<int> &sizes, std::vector<double> &times) {
  int nproc = import->getSourceMap()->getComm()->getSize();
  if (nproc < 2) return;
#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
  // NOTE: We need to get the distributor here, which means we need Tpetra, since Xpetra does
  // not have a distributor interface
  using x_import_type = Xpetra::TpetraImport<LocalOrdinal, GlobalOrdinal, Node>;
  RCP<const x_import_type> Ximport = Teuchos::rcp_dynamic_cast<const x_import_type>(import);
  RCP<const Teuchos::MpiComm<int> > mcomm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(import->getSourceMap()->getComm());

  if (Ximport.is_null() || mcomm.is_null()) return;
  MPI_Comm communicator = *mcomm->getRawMpiComm();
  auto Timport = Ximport->getTpetra_Import();
  auto distor = Timport->getDistributor();

  // Distributor innards
  Teuchos::ArrayView<const int> procsFrom = distor.getProcsFrom();
  Teuchos::ArrayView<const int> procsTo = distor.getProcsTo();
  int num_recvs = (int)distor.getNumReceives();
  int num_sends = (int)distor.getNumSends();

  const int buff_size_per_msg = (int)pow(2, MAX_SIZE);
  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  // Allocate memory for the buffers (and fill send)
  Kokkos::View<char *, memory_space> f_recv_buf("forward_recv", buff_size_per_msg * num_recvs), f_send_buf("forward_send", buff_size_per_msg * num_sends);
  Kokkos::View<char *, memory_space> r_recv_buf("reverse_recv", buff_size_per_msg * num_sends), r_send_buf("reverse_send", buff_size_per_msg * num_recvs);
  Kokkos::deep_copy(f_send_buf, 1);
  Kokkos::deep_copy(r_send_buf, 1);

  std::vector<MPI_Request> requests(num_sends + num_recvs);
  std::vector<MPI_Status> status(num_sends + num_recvs);

  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);

    MPI_Barrier(communicator);

    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      int ct = 0;
      // Recv/Send the forward messages
      for (int r = 0; r < num_recvs; r++) {
        const int tag = 1000 + j;
        MPI_Irecv(f_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsFrom[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_sends; s++) {
        const int tag = 1000 + j;
        MPI_Isend(f_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsTo[s], tag, communicator, &requests[ct]);
        ct++;
      }
      // Wait for the forward messages
      MPI_Waitall(ct, requests.data(), status.data());

      ct = 0;
      // Recv/Send the reverse messages
      for (int r = 0; r < num_sends; r++) {
        const int tag = 2000 + j;
        MPI_Irecv(r_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsTo[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_recvs; s++) {
        const int tag = 2000 + j;
        MPI_Isend(r_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsFrom[s], tag, communicator, &requests[ct]);
        ct++;
      }
      // Wait for the reverse messages
      MPI_Waitall(ct, requests.data(), status.data());
    }

    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i] = msg_size;
    times[i] = time_per_call;
  }

#endif
}

}  // end namespace PerfDetails

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::PerfModels()
  : launch_and_wait_latency_(-1.0) {}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::~PerfModels() {}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE) {
  // We need launch/wait latency estimates for the latency-corrected stream numbers
  launch_latency_make_table(KERNEL_REPEATS);
  double latency = launch_latency_lookup();

  if (LOG_MAX_SIZE < 2)
    LOG_MAX_SIZE = 20;

  stream_sizes_.resize(LOG_MAX_SIZE + 1);
  stream_copy_times_.resize(LOG_MAX_SIZE + 1);
  stream_add_times_.resize(LOG_MAX_SIZE + 1);
  latency_corrected_stream_copy_times_.resize(LOG_MAX_SIZE + 1);
  latency_corrected_stream_add_times_.resize(LOG_MAX_SIZE + 1);

  for (int i = 0; i < LOG_MAX_SIZE + 1; i++) {
    int size = (int)pow(2, i);
    double c_time = PerfDetails::stream_vector_copy<Scalar, Node>(KERNEL_REPEATS, size);
    double a_time = PerfDetails::stream_vector_add<Scalar, Node>(KERNEL_REPEATS, size);

    stream_sizes_[i] = size;

    // Correct for the difference in memory transactions per element
    stream_copy_times_[i] = c_time / 2.0;
    stream_add_times_[i] = a_time / 3.0;

    // Correct for launch latency too. Note that the latency estimate is sometimes
    // higher than the measured copy/add time; if so, we skip the correction
    latency_corrected_stream_copy_times_[i] = (c_time - latency <= 0.0) ? c_time / 2.0 : ((c_time - latency) / 2.0);
    latency_corrected_stream_add_times_[i] = (a_time - latency <= 0.0) ? a_time / 3.0 : ((a_time - latency) / 3.0);
  }
}
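
// Worked example (hypothetical numbers, added for exposition): if a copy at some
// size measures c_time = 10 us against a launch latency of 4 us, the raw table
// entry is 10 / 2 = 5 us and the corrected entry is (10 - 4) / 2 = 3 us. If the
// latency estimate were instead 12 us > c_time, the correction is skipped and
// 5 us is stored in both tables.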

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_copy_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, stream_copy_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_add_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, stream_add_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_lookup(int SIZE_IN_BYTES) {
  return std::min(stream_vector_copy_lookup(SIZE_IN_BYTES), stream_vector_add_lookup(SIZE_IN_BYTES));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, latency_corrected_stream_copy_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, latency_corrected_stream_add_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES) {
  return std::min(latency_corrected_stream_vector_copy_lookup(SIZE_IN_BYTES), latency_corrected_stream_vector_add_lookup(SIZE_IN_BYTES));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table(std::ostream &out, const std::string &prefix) {
  print_stream_vector_table_impl(out, false, prefix);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix) {
  print_stream_vector_table_impl(out, true, prefix);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix) {
  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Length in Scalars" << setw(1) << " "
      << setw(20) << "COPY (us)" << setw(1) << " "
      << setw(20) << "ADD (us)" << setw(1) << " "
      << setw(20) << "COPY (GB/s)" << setw(1) << " "
      << setw(20) << "ADD (GB/s)" << std::endl;

  out << prefix
      << setw(20) << "-----------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "--------" << setw(1) << " "
      << setw(20) << "-----------" << setw(1) << " "
      << setw(20) << "----------" << std::endl;

  for (int i = 0; i < (int)stream_sizes_.size(); i++) {
    int size = stream_sizes_[i];
    double c_time = use_latency_correction ? latency_corrected_stream_copy_times_[i] : stream_copy_times_[i];
    double a_time = use_latency_correction ? latency_corrected_stream_add_times_[i] : stream_add_times_[i];
    // We've already corrected for the transactions-per-element difference
    double c_bw = PerfDetails::convert_time_to_bandwidth_gbs(c_time, 1, size * sizeof(Scalar));
    double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time, 1, size * sizeof(Scalar));

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (c_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (a_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm) {
  PerfDetails::pingpong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_host_times_);

  PerfDetails::pingpong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_device_times_);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_host_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(pingpong_sizes_, pingpong_host_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_device_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(pingpong_sizes_, pingpong_device_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_pingpong_table(std::ostream &out, const std::string &prefix) {
  if (pingpong_sizes_.size() == 0) return;

  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Message Size" << setw(1) << " "
      << setw(20) << "Host (us)" << setw(1) << " "
      << setw(20) << "Device (us)" << std::endl;

  out << prefix
      << setw(20) << "------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "-----------" << std::endl;

  for (int i = 0; i < (int)pingpong_sizes_.size(); i++) {
    int size = pingpong_sizes_[i];
    double h_time = pingpong_host_times_[i];
    double d_time = pingpong_device_times_[i];

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import) {
  PerfDetails::halopong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, import, halopong_sizes_, halopong_host_times_);

  PerfDetails::halopong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, import, halopong_sizes_, halopong_device_times_);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_host_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(halopong_sizes_, halopong_host_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_device_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(halopong_sizes_, halopong_device_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_halopong_table(std::ostream &out, const std::string &prefix) {
  if (halopong_sizes_.size() == 0) return;

  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Message Size" << setw(1) << " "
      << setw(20) << "Host (us)" << setw(1) << " "
      << setw(20) << "Device (us)" << std::endl;

  out << prefix
      << setw(20) << "------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "-----------" << std::endl;

  for (int i = 0; i < (int)halopong_sizes_.size(); i++) {
    int size = halopong_sizes_[i];
    double h_time = halopong_host_times_[i];
    double d_time = halopong_device_times_[i];

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::launch_latency_make_table(int KERNEL_REPEATS) {
  using exec_space = typename Node::execution_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;
  using clock = std::chrono::high_resolution_clock;

  double total_test_time = 0;
  clock::time_point start, stop;
  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "empty kernel", range_policy(0, 1), KOKKOS_LAMBDA(const size_t j) {
          ;
        });
    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  launch_and_wait_latency_ = total_test_time / KERNEL_REPEATS;
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::launch_latency_lookup() {
  return launch_and_wait_latency_;
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_launch_latency_table(std::ostream &out, const std::string &prefix) {
  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " "
      << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_ * 1e6) << std::endl;

  out.copyfmt(old_format);
}

}  // namespace MueLu
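
// Usage sketch (added for exposition; illustrative only, with type parameters
// abbreviated as SC/LO/GO/NO and a hypothetical communicator `comm`):
//
//   MueLu::PerfModels<SC, LO, GO, NO> model;
//   model.stream_vector_make_table(100, 20);        // also builds the launch-latency table
//   model.pingpong_make_table(100, 20, comm);       // comm is an RCP<const Teuchos::Comm<int> >
//   double t = model.stream_vector_lookup(n_bytes); // predicted seconds for an n_bytes vector op
//   model.print_stream_vector_table(std::cout);
//   model.print_pingpong_table(std::cout);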