MueLu_PerfModels_def.hpp
// @HEADER
// *****************************************************************************
// MueLu: A package for multigrid based preconditioning
//
// Copyright 2012 NTESS and the MueLu contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#include "MueLu_PerfModels_decl.hpp"

#include <cstdio>
#include <cmath>
#include <numeric>
#include <utility>
#include <chrono>
#include <iomanip>
#include <Teuchos_ScalarTraits.hpp>
#include <Kokkos_ArithTraits.hpp>
#include <Xpetra_Import.hpp>
#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
#include <Xpetra_TpetraImport.hpp>
#include <Tpetra_Import.hpp>
#include <Tpetra_Distributor.hpp>
#include <mpi.h>
#endif

#ifdef HAVE_MPI
#include <mpi.h>
#endif

namespace MueLu {

namespace PerfDetails {
template <class Scalar, class Node>
double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE) {
  // PerfDetails' STREAM routines need to be instantiated on impl_scalar_type, not Scalar
  using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;

  using exec_space = typename Node::execution_space;
  using memory_space = typename Node::memory_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> c("c", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE * (double)i;
        b(i) = a(i);
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;

  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "stream/add", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {  // vector addition
          c(j) = a(j) + b(j);
        });

    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
}

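// NOTE: Each add iteration streams three Scalars per element (two reads, one write),
// while the copy kernel below streams two (one read, one write). stream_vector_make_table()
// normalizes by these factors so the two kernels report comparable per-transaction times.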
template <class Scalar, class Node>
double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE) {
  // PerfDetails' STREAM routines need to be instantiated on impl_scalar_type, not Scalar
  using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;

  using exec_space = typename Node::execution_space;
  using memory_space = typename Node::memory_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE;
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;
  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "stream/copy", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {  // vector copy
          b(j) = a(j);
        });

    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
}
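
// Example: with VECTOR_SIZE = 1<<20 and Scalar = double, each copy iteration moves
// 2 * 2^20 * 8 B = 16 MiB; an average kernel time of 1 ms then corresponds to about
// 15.6 GB/s with the 1024^3-byte GB used by convert_time_to_bandwidth_gbs() below.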

double table_lookup(const std::vector<int> &x, const std::vector<double> &y, int value) {
  // If there's no table, nan
  if (x.size() == 0) return Teuchos::ScalarTraits<double>::nan();

  // NOTE: This should probably be a binary search, but this isn't performance sensitive, so we'll go simple
  int N = (int)x.size();
  int hi = 0;
  for (; hi < N; hi++) {
    if (x[hi] > value)
      break;
  }

  if (hi == 0) {
    // Lower end (return the min time)
    // printf("Lower end: %d < %d\n",value,x[0]);
    return y[0];
  } else if (hi == N) {
    // Higher end (extrapolate from the last two points)
    // printf("Upper end: %d > %d\n",value,x[N-1]);
    hi = N - 1;
    int run = x[hi] - x[hi - 1];
    double rise = y[hi] - y[hi - 1];
    double slope = rise / run;
    int diff = value - x[hi - 1];

    return y[hi - 1] + slope * diff;
  } else {
    // Interpolate
    // printf("Middle: %d < %d < %d\n",x[hi-1],value,x[hi]);
    int run = x[hi] - x[hi - 1];
    double rise = y[hi] - y[hi - 1];
    double slope = rise / run;
    int diff = value - x[hi - 1];

    return y[hi - 1] + slope * diff;
  }
}
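
// Worked example: with x = {1, 2, 4} and y = {1.0, 1.5, 2.5}, a lookup of value = 3
// lands between x[1] and x[2]: slope = (2.5 - 1.5) / (4 - 2) = 0.5, so the result is
// 1.5 + 0.5 * (3 - 2) = 2.0. Values beyond x.back() extrapolate from the last segment.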

// Report bandwidth in GB / sec
const double GB = 1024.0 * 1024.0 * 1024.0;
double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes) {
  double time_per_call = time / num_calls;
  return memory_per_call_bytes / GB / time_per_call;
}
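
// Example: 100 calls that together take 0.1 s and move 16 MiB each give
// time_per_call = 1e-3 s and (16 MiB / GB) / 1e-3 s = 0.015625 / 1e-3 = 15.625 GB/s.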

template <class exec_space, class memory_space>
void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm<int> &comm, std::vector<int> &sizes, std::vector<double> &times) {
#ifdef HAVE_MPI
  int rank = comm.getRank();
  int nproc = comm.getSize();

  if (nproc < 2) return;

  const int buff_size = (int)pow(2, MAX_SIZE);

  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  // Allocate memory for the buffers (and fill send)
  Kokkos::View<char *, memory_space> r_buf("recv", buff_size), s_buf("send", buff_size);
  Kokkos::deep_copy(s_buf, 1);

  // Send and receive.
  // NOTE: Do consecutive pair buddies here for simplicity. We should be smarter later.
  int odd = rank % 2;
  int buddy = odd ? rank - 1 : rank + 1;
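  // Example pairing with nproc = 5: (0,1) and (2,3) exchange messages, while rank 4's
  // buddy (5) is out of range, so rank 4 only participates in the barriers below.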

  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);
    comm.barrier();

    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      if (buddy < nproc) {
        if (odd) {
          comm.send(msg_size, (char *)s_buf.data(), buddy);
          comm.receive(buddy, msg_size, (char *)r_buf.data());
        } else {
          comm.receive(buddy, msg_size, (char *)r_buf.data());
          comm.send(msg_size, (char *)s_buf.data(), buddy);
        }
      }
    }

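    // Each repeat is a full round trip (one send + one receive), so dividing the
    // elapsed time by 2 * KERNEL_REPEATS gives the one-way time per message.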
    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i] = msg_size;
    times[i] = time_per_call;
  }
#else
  return;
#endif
}

template <class exec_space, class memory_space, class LocalOrdinal, class GlobalOrdinal, class Node>
void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import, std::vector<int> &sizes, std::vector<double> &times) {
  int nproc = import->getSourceMap()->getComm()->getSize();
  if (nproc < 2) return;
#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
  // NOTE: We need to get the distributor here, which means we need Tpetra, since Xpetra does
  // not have a distributor interface
  using x_import_type = Xpetra::TpetraImport<LocalOrdinal, GlobalOrdinal, Node>;
  RCP<const x_import_type> Ximport = Teuchos::rcp_dynamic_cast<const x_import_type>(import);
  RCP<const Teuchos::MpiComm<int> > mcomm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(import->getSourceMap()->getComm());

  if (Ximport.is_null() || mcomm.is_null()) return;
  MPI_Comm communicator = *mcomm->getRawMpiComm();
  auto Timport = Ximport->getTpetra_Import();
  auto distor = Timport->getDistributor();

  // Distributor innards
  Teuchos::ArrayView<const int> procsFrom = distor.getProcsFrom();
  Teuchos::ArrayView<const int> procsTo = distor.getProcsTo();
  int num_recvs = (int)distor.getNumReceives();
  int num_sends = (int)distor.getNumSends();

  const int buff_size_per_msg = (int)pow(2, MAX_SIZE);
  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  // Allocate memory for the buffers (and fill send)
  Kokkos::View<char *, memory_space> f_recv_buf("forward_recv", buff_size_per_msg * num_recvs), f_send_buf("forward_send", buff_size_per_msg * num_sends);
  Kokkos::View<char *, memory_space> r_recv_buf("reverse_recv", buff_size_per_msg * num_sends), r_send_buf("reverse_send", buff_size_per_msg * num_recvs);
  Kokkos::deep_copy(f_send_buf, 1);
  Kokkos::deep_copy(r_send_buf, 1);

  std::vector<MPI_Request> requests(num_sends + num_recvs);
  std::vector<MPI_Status> status(num_sends + num_recvs);

  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);

    MPI_Barrier(communicator);

    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      int ct = 0;
      // Recv/Send the forward messages
      for (int r = 0; r < num_recvs; r++) {
        const int tag = 1000 + j;
        MPI_Irecv(f_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsFrom[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_sends; s++) {
        const int tag = 1000 + j;
        MPI_Isend(f_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsTo[s], tag, communicator, &requests[ct]);
        ct++;
      }
      // Wait for the forward messages
      MPI_Waitall(ct, requests.data(), status.data());

      ct = 0;
      // Recv/Send the reverse messages
      for (int r = 0; r < num_sends; r++) {
        const int tag = 2000 + j;
        MPI_Irecv(r_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsTo[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_recvs; s++) {
        const int tag = 2000 + j;
        MPI_Isend(r_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsFrom[s], tag, communicator, &requests[ct]);
        ct++;
      }
      // Wait for the reverse messages
      MPI_Waitall(ct, requests.data(), status.data());
    }

    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i] = msg_size;
    times[i] = time_per_call;
  }

#endif
}
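
// The halo exchange pattern mirrors a sparse matvec: the forward pass sends along the
// importer's communication graph (procsTo / procsFrom) and the reverse pass returns the
// same messages, so each repeat counts as two exchanges in the 2.0 normalization above.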

}  // end namespace PerfDetails

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::PerfModels()
  : launch_and_wait_latency_(-1.0) {}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::~PerfModels() {}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE) {
  // We need launch/wait latency estimates for the corrected stream numbers
  launch_latency_make_table(KERNEL_REPEATS);
  double latency = launch_latency_lookup();

  if (LOG_MAX_SIZE < 2)
    LOG_MAX_SIZE = 20;

  stream_sizes_.resize(LOG_MAX_SIZE + 1);
  stream_copy_times_.resize(LOG_MAX_SIZE + 1);
  stream_add_times_.resize(LOG_MAX_SIZE + 1);
  latency_corrected_stream_copy_times_.resize(LOG_MAX_SIZE + 1);
  latency_corrected_stream_add_times_.resize(LOG_MAX_SIZE + 1);

  for (int i = 0; i < LOG_MAX_SIZE + 1; i++) {
    int size = (int)pow(2, i);
    double c_time = PerfDetails::stream_vector_copy<Scalar, Node>(KERNEL_REPEATS, size);
    double a_time = PerfDetails::stream_vector_add<Scalar, Node>(KERNEL_REPEATS, size);

    stream_sizes_[i] = size;

    // Correct for the difference in memory transactions per element
    stream_copy_times_[i] = c_time / 2.0;
    stream_add_times_[i] = a_time / 3.0;

    // Correct for launch latency too. We'll note that sometimes the latency estimate
    // is higher than the actual copy/add time estimate. If so, we don't correct.
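    // Example: c_time = 5 us with latency = 2 us yields (5 - 2) / 2 = 1.5 us per
    // transaction; if latency >= c_time we fall back to the uncorrected c_time / 2.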
    latency_corrected_stream_copy_times_[i] = (c_time - latency <= 0.0) ? c_time / 2.0 : ((c_time - latency) / 2.0);
    latency_corrected_stream_add_times_[i] = (a_time - latency <= 0.0) ? a_time / 3.0 : ((a_time - latency) / 3.0);
  }
}

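// The stream tables are indexed by vector length in Scalars, so the byte sizes passed
// to the lookup routines below are first converted via sizeof(Scalar).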
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_copy_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, stream_copy_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_add_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, stream_add_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_lookup(int SIZE_IN_BYTES) {
  return std::min(stream_vector_copy_lookup(SIZE_IN_BYTES), stream_vector_add_lookup(SIZE_IN_BYTES));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, latency_corrected_stream_copy_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, latency_corrected_stream_add_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES) {
  return std::min(latency_corrected_stream_vector_copy_lookup(SIZE_IN_BYTES), latency_corrected_stream_vector_add_lookup(SIZE_IN_BYTES));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table(std::ostream &out, const std::string &prefix) {
  print_stream_vector_table_impl(out, false, prefix);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix) {
  print_stream_vector_table_impl(out, true, prefix);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix) {
  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Length in Scalars" << setw(1) << " "
      << setw(20) << "COPY (us)" << setw(1) << " "
      << setw(20) << "ADD (us)" << setw(1) << " "
      << setw(20) << "COPY (GB/s)" << setw(1) << " "
      << setw(20) << "ADD (GB/s)" << std::endl;

  out << prefix
      << setw(20) << "-----------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "--------" << setw(1) << " "
      << setw(20) << "-----------" << setw(1) << " "
      << setw(20) << "----------" << std::endl;

  for (int i = 0; i < (int)stream_sizes_.size(); i++) {
    int size = stream_sizes_[i];
    double c_time = use_latency_correction ? latency_corrected_stream_copy_times_[i] : stream_copy_times_[i];
    double a_time = use_latency_correction ? latency_corrected_stream_add_times_[i] : stream_add_times_[i];
    // We've already corrected for the transactions-per-element difference
    double c_bw = PerfDetails::convert_time_to_bandwidth_gbs(c_time, 1, size * sizeof(Scalar));
    double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time, 1, size * sizeof(Scalar));

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (c_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (a_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm) {
  PerfDetails::pingpong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_host_times_);

  PerfDetails::pingpong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_device_times_);
}
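
// NOTE: The ping-pong is timed twice: once with host buffers and once with buffers in
// the Node's memory space. Both runs record the same message sizes, so pingpong_sizes_
// is simply overwritten by the second call.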

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_host_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(pingpong_sizes_, pingpong_host_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_device_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(pingpong_sizes_, pingpong_device_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_pingpong_table(std::ostream &out, const std::string &prefix) {
  if (pingpong_sizes_.size() == 0) return;

  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Message Size" << setw(1) << " "
      << setw(20) << "Host (us)" << setw(1) << " "
      << setw(20) << "Device (us)" << std::endl;

  out << prefix
      << setw(20) << "------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "-----------" << std::endl;

  for (int i = 0; i < (int)pingpong_sizes_.size(); i++) {
    int size = pingpong_sizes_[i];
    double h_time = pingpong_host_times_[i];
    double d_time = pingpong_device_times_[i];

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import) {
  PerfDetails::halopong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, import, halopong_sizes_, halopong_host_times_);

  PerfDetails::halopong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, import, halopong_sizes_, halopong_device_times_);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_host_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(halopong_sizes_, halopong_host_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_device_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(halopong_sizes_, halopong_device_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_halopong_table(std::ostream &out, const std::string &prefix) {
  if (halopong_sizes_.size() == 0) return;

  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Message Size" << setw(1) << " "
      << setw(20) << "Host (us)" << setw(1) << " "
      << setw(20) << "Device (us)" << std::endl;

  out << prefix
      << setw(20) << "------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "-----------" << std::endl;

  for (int i = 0; i < (int)halopong_sizes_.size(); i++) {
    int size = halopong_sizes_[i];
    double h_time = halopong_host_times_[i];
    double d_time = halopong_device_times_[i];

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::launch_latency_make_table(int KERNEL_REPEATS) {
  using exec_space = typename Node::execution_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;
  using clock = std::chrono::high_resolution_clock;

  double total_test_time = 0;
  clock::time_point start, stop;
  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "empty kernel", range_policy(0, 1), KOKKOS_LAMBDA(const size_t j) {
          ;
        });
    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  launch_and_wait_latency_ = total_test_time / KERNEL_REPEATS;
}
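
// The empty kernel above isolates the launch + fence overhead; stream_vector_make_table()
// subtracts this estimate from the raw copy/add timings to build the latency-corrected tables.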

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::launch_latency_lookup() {
  return launch_and_wait_latency_;
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_launch_latency_table(std::ostream &out, const std::string &prefix) {
  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " "
      << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_ * 1e6) << std::endl;

  out.copyfmt(old_format);
}

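// Example (hypothetical sketch, not part of this file): a driver might build and query
// the models like this, assuming SC/LO/GO/NO are the usual Scalar/LocalOrdinal/
// GlobalOrdinal/Node typedefs and n is a local vector length.
//
//   MueLu::PerfModels<SC, LO, GO, NO> model;
//   model.stream_vector_make_table(100, 20);  // also builds the launch-latency table
//   model.print_stream_vector_table(std::cout);
//   double t_est = model.stream_vector_lookup((int)(n * sizeof(SC)));
//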
}  // namespace MueLu