MueLu_PerfModels_def.hpp
// @HEADER
//
// ***********************************************************************
//
// MueLu: A package for multigrid based preconditioning
// Copyright 2012 Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact
// Jonathan Hu (jhu@sandia.gov)
// Andrey Prokopenko (aprokop@sandia.gov)
// Ray Tuminaro (rstumin@sandia.gov)
//
// ***********************************************************************
//
// @HEADER

#include <cstdio>
#include <cmath>
#include <numeric>
#include <utility>
#include <chrono>
#include <iomanip>
#include <Teuchos_ScalarTraits.hpp>
#include <Kokkos_ArithTraits.hpp>
#include <Xpetra_Import.hpp>
#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
#include <Xpetra_TpetraImport.hpp>
#include <Tpetra_Import.hpp>
#include <Tpetra_Distributor.hpp>
#include <mpi.h>
#endif

#ifdef HAVE_MPI
#include <mpi.h>
#endif

namespace MueLu {

namespace PerfDetails {
template <class Scalar, class Node>
double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE) {
  // PerfDetails' STREAM routines need to be instantiated on impl_scalar_type, not Scalar
  using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;

  using exec_space   = typename Node::execution_space;
  using memory_space = typename Node::memory_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> c("c", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE * (double)i;
        b(i) = a(i);
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;

  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "stream/add", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {  // Vector addition
          c(j) = a(j) + b(j);
        });

    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
}
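
// NOTE: stream_vector_add() and stream_vector_copy() below return the average wall-clock time
// (in seconds) of a single kernel launch; stream_vector_make_table() later converts these raw
// times into per-memory-transaction times (3 transactions per element for add, 2 for copy).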

template <class Scalar, class Node>
double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE) {
  // PerfDetails' STREAM routines need to be instantiated on impl_scalar_type, not Scalar
  using impl_scalar_type = typename Kokkos::ArithTraits<Scalar>::val_type;

  using exec_space   = typename Node::execution_space;
  using memory_space = typename Node::memory_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;

  Kokkos::View<impl_scalar_type *, memory_space> a("a", VECTOR_SIZE);
  Kokkos::View<impl_scalar_type *, memory_space> b("b", VECTOR_SIZE);
  double total_test_time = 0.0;

  impl_scalar_type ONE = Teuchos::ScalarTraits<impl_scalar_type>::one();

  Kokkos::parallel_for(
      "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t i) {
        a(i) = ONE;
      });
  exec_space().fence();

  using clock = std::chrono::high_resolution_clock;
  clock::time_point start, stop;

  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "stream/copy", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(const size_t j) {  // Vector copy
          b(j) = a(j);
        });

    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  return total_test_time / KERNEL_REPEATS;
}

double table_lookup(const std::vector<int> &x, const std::vector<double> &y, int value) {
  // If there's no table, return NaN
  if (x.size() == 0) return Teuchos::ScalarTraits<double>::nan();

  // NOTE: This should probably be a binary search, but this isn't performance sensitive, so we'll go simple
  int N  = (int)x.size();
  int hi = 0;
  for (; hi < N; hi++) {
    if (x[hi] > value)
      break;
  }

  if (hi == 0) {
    // Lower end (return the min time)
    // printf("Lower end: %d < %d\n",value,x[0]);
    return y[0];
  } else if (hi == N) {
    // Higher end (extrapolate from the last two points)
    // printf("Upper end: %d > %d\n",value,x[N-1]);
    hi = N - 1;
    int run      = x[hi] - x[hi - 1];
    double rise  = y[hi] - y[hi - 1];
    double slope = rise / run;
    int diff     = value - x[hi - 1];

    return y[hi - 1] + slope * diff;
  } else {
    // Interpolate
    // printf("Middle: %d < %d < %d\n",x[hi-1],value,x[hi]);
    int run      = x[hi] - x[hi - 1];
    double rise  = y[hi] - y[hi - 1];
    double slope = rise / run;
    int diff     = value - x[hi - 1];

    return y[hi - 1] + slope * diff;
  }
}
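
// Worked example of the interpolation above: with x = {1, 2, 4} and y = {1.0, 2.0, 4.0},
// table_lookup(x, y, 3) falls between x[1] and x[2], so slope = (4.0 - 2.0) / (4 - 2) = 1.0
// and the result is y[1] + 1.0 * (3 - 2) = 3.0; table_lookup(x, y, 8) extrapolates from the
// last two points to 8.0, and table_lookup(x, y, 0) returns y[0] = 1.0.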

// Report bandwidth in GB / sec
const double GB = 1024.0 * 1024.0 * 1024.0;
double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes) {
  double time_per_call = time / num_calls;
  return memory_per_call_bytes / GB / time_per_call;
}
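
// Example: moving 2^30 bytes (1 GiB) in a single call that takes 0.01 s reports
// convert_time_to_bandwidth_gbs(0.01, 1, 1073741824.0) = 100 GB/s.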

template <class exec_space, class memory_space>
void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm<int> &comm, std::vector<int> &sizes, std::vector<double> &times) {
#ifdef HAVE_MPI
  int rank  = comm.getRank();
  int nproc = comm.getSize();

  if (nproc < 2) return;

  const int buff_size = (int)pow(2, MAX_SIZE);

  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  // Allocate memory for the buffers (and fill send)
  Kokkos::View<char *, memory_space> r_buf("recv", buff_size), s_buf("send", buff_size);
  Kokkos::deep_copy(s_buf, 1);

  // Send and receive.
  // NOTE: We pair up consecutive ranks here for simplicity.  We could be smarter later.
  int odd   = rank % 2;
  int buddy = odd ? rank - 1 : rank + 1;

  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);
    comm.barrier();

    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      if (buddy < nproc) {
        if (odd) {
          comm.send(msg_size, (char *)s_buf.data(), buddy);
          comm.receive(buddy, msg_size, (char *)r_buf.data());
        } else {
          comm.receive(buddy, msg_size, (char *)r_buf.data());
          comm.send(msg_size, (char *)s_buf.data(), buddy);
        }
      }
    }

    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i] = msg_size;
    times[i] = time_per_call;
  }
#else
  return;
#endif
}
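
// Example invocation (host-side ping-pong over message sizes 2^0 .. 2^LOG_MAX_SIZE bytes),
// mirroring what PerfModels::pingpong_make_table() does below:
//   std::vector<int> sizes;
//   std::vector<double> times;
//   PerfDetails::pingpong_basic<Kokkos::HostSpace::execution_space,
//                               Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, sizes, times);
// On return, times[i] is the average one-way time for a 2^i-byte message between paired ranks.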

template <class exec_space, class memory_space, class LocalOrdinal, class GlobalOrdinal, class Node>
void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import, std::vector<int> &sizes, std::vector<double> &times) {
  int nproc = import->getSourceMap()->getComm()->getSize();
  if (nproc < 2) return;
#if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
  // NOTE: We need to get the distributor here, which means we need Tpetra, since Xpetra does
  // not have a distributor interface
  using x_import_type = Xpetra::TpetraImport<LocalOrdinal, GlobalOrdinal, Node>;
  RCP<const x_import_type> Ximport        = Teuchos::rcp_dynamic_cast<const x_import_type>(import);
  RCP<const Teuchos::MpiComm<int> > mcomm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(import->getSourceMap()->getComm());

  if (Ximport.is_null() || mcomm.is_null()) return;
  MPI_Comm communicator = *mcomm->getRawMpiComm();
  auto Timport          = Ximport->getTpetra_Import();
  auto distor           = Timport->getDistributor();

  // Distributor innards
  Teuchos::ArrayView<const int> procsFrom = distor.getProcsFrom();
  Teuchos::ArrayView<const int> procsTo   = distor.getProcsTo();
  int num_recvs = (int)distor.getNumReceives();
  int num_sends = (int)distor.getNumSends();

  const int buff_size_per_msg = (int)pow(2, MAX_SIZE);
  sizes.resize(MAX_SIZE + 1);
  times.resize(MAX_SIZE + 1);

  // Allocate memory for the buffers (and fill send)
  Kokkos::View<char *, memory_space> f_recv_buf("forward_recv", buff_size_per_msg * num_recvs), f_send_buf("forward_send", buff_size_per_msg * num_sends);
  Kokkos::View<char *, memory_space> r_recv_buf("reverse_recv", buff_size_per_msg * num_sends), r_send_buf("reverse_send", buff_size_per_msg * num_recvs);
  Kokkos::deep_copy(f_send_buf, 1);
  Kokkos::deep_copy(r_send_buf, 1);

  std::vector<MPI_Request> requests(num_sends + num_recvs);
  std::vector<MPI_Status> status(num_sends + num_recvs);

  for (int i = 0; i < MAX_SIZE + 1; i++) {
    int msg_size = (int)pow(2, i);

    MPI_Barrier(communicator);

    double t0 = MPI_Wtime();
    for (int j = 0; j < KERNEL_REPEATS; j++) {
      int ct = 0;
      // Recv/Send the forward messages
      for (int r = 0; r < num_recvs; r++) {
        const int tag = 1000 + j;
        MPI_Irecv(f_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsFrom[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_sends; s++) {
        const int tag = 1000 + j;
        MPI_Isend(f_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsTo[s], tag, communicator, &requests[ct]);
        ct++;
      }
      // Wait for the forward messages
      MPI_Waitall(ct, requests.data(), status.data());

      ct = 0;
      // Recv/Send the reverse messages
      for (int r = 0; r < num_sends; r++) {
        const int tag = 2000 + j;
        MPI_Irecv(r_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsTo[r], tag, communicator, &requests[ct]);
        ct++;
      }
      for (int s = 0; s < num_recvs; s++) {
        const int tag = 2000 + j;
        MPI_Isend(r_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsFrom[s], tag, communicator, &requests[ct]);
        ct++;
      }
      // Wait for the reverse messages
      MPI_Waitall(ct, requests.data(), status.data());
    }

    double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
    sizes[i] = msg_size;
    times[i] = time_per_call;
  }

#endif
}

}  // end namespace PerfDetails

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::PerfModels()
  : launch_and_wait_latency_(-1.0) {}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE) {
  // We need a launch/wait latency estimate for the latency-corrected stream numbers
  launch_latency_make_table(KERNEL_REPEATS);
  double latency = launch_latency_lookup();

  if (LOG_MAX_SIZE < 2)
    LOG_MAX_SIZE = 20;

  stream_sizes_.resize(LOG_MAX_SIZE + 1);
  stream_copy_times_.resize(LOG_MAX_SIZE + 1);
  stream_add_times_.resize(LOG_MAX_SIZE + 1);
  latency_corrected_stream_copy_times_.resize(LOG_MAX_SIZE + 1);
  latency_corrected_stream_add_times_.resize(LOG_MAX_SIZE + 1);

  for (int i = 0; i < LOG_MAX_SIZE + 1; i++) {
    int size      = (int)pow(2, i);
    double c_time = PerfDetails::stream_vector_copy<Scalar, Node>(KERNEL_REPEATS, size);
    double a_time = PerfDetails::stream_vector_add<Scalar, Node>(KERNEL_REPEATS, size);

    stream_sizes_[i] = size;

    // Correct for the difference in memory transactions per element
    stream_copy_times_[i] = c_time / 2.0;
    stream_add_times_[i]  = a_time / 3.0;

    // Correct for launch latency too.  Note that the latency estimate is sometimes higher
    // than the measured copy/add time; if so, we skip the correction.
    latency_corrected_stream_copy_times_[i] = (c_time - latency <= 0.0) ? c_time / 2.0 : ((c_time - latency) / 2.0);
    latency_corrected_stream_add_times_[i]  = (a_time - latency <= 0.0) ? a_time / 3.0 : ((a_time - latency) / 3.0);
  }
}
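
// Worked example of the correction above: if a raw copy of 2^20 scalars takes 80 us and the
// measured launch latency is 10 us, then stream_copy_times_[20] = 80/2 = 40 us and
// latency_corrected_stream_copy_times_[20] = (80 - 10)/2 = 35 us.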

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_copy_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, stream_copy_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_add_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, stream_add_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::stream_vector_lookup(int SIZE_IN_BYTES) {
  return std::min(stream_vector_copy_lookup(SIZE_IN_BYTES), stream_vector_add_lookup(SIZE_IN_BYTES));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, latency_corrected_stream_copy_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(stream_sizes_, latency_corrected_stream_add_times_, SIZE_IN_BYTES / sizeof(Scalar));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES) {
  return std::min(latency_corrected_stream_vector_copy_lookup(SIZE_IN_BYTES), latency_corrected_stream_vector_add_lookup(SIZE_IN_BYTES));
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table(std::ostream &out, const std::string &prefix) {
  print_stream_vector_table_impl(out, false, prefix);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix) {
  print_stream_vector_table_impl(out, true, prefix);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix) {
  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Length in Scalars" << setw(1) << " "
      << setw(20) << "COPY (us)" << setw(1) << " "
      << setw(20) << "ADD (us)" << setw(1) << " "
      << setw(20) << "COPY (GB/s)" << setw(1) << " "
      << setw(20) << "ADD (GB/s)" << std::endl;

  out << prefix
      << setw(20) << "-----------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "--------" << setw(1) << " "
      << setw(20) << "-----------" << setw(1) << " "
      << setw(20) << "----------" << std::endl;

  for (int i = 0; i < (int)stream_sizes_.size(); i++) {
    int size      = stream_sizes_[i];
    double c_time = use_latency_correction ? latency_corrected_stream_copy_times_[i] : stream_copy_times_[i];
    double a_time = use_latency_correction ? latency_corrected_stream_add_times_[i] : stream_add_times_[i];
    // We've already corrected for the transactions per element difference
    double c_bw = PerfDetails::convert_time_to_bandwidth_gbs(c_time, 1, size * sizeof(Scalar));
    double a_bw = PerfDetails::convert_time_to_bandwidth_gbs(a_time, 1, size * sizeof(Scalar));

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (c_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (a_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << c_bw << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm) {
  PerfDetails::pingpong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_host_times_);

  PerfDetails::pingpong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_device_times_);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_host_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(pingpong_sizes_, pingpong_host_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::pingpong_device_lookup(int SIZE_IN_BYTES) {
  return PerfDetails::table_lookup(pingpong_sizes_, pingpong_device_times_, SIZE_IN_BYTES);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_pingpong_table(std::ostream &out, const std::string &prefix) {
  if (pingpong_sizes_.size() == 0) return;

  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Message Size" << setw(1) << " "
      << setw(20) << "Host (us)" << setw(1) << " "
      << setw(20) << "Device (us)" << std::endl;

  out << prefix
      << setw(20) << "------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "-----------" << std::endl;

  for (int i = 0; i < (int)pingpong_sizes_.size(); i++) {
    int size      = pingpong_sizes_[i];
    double h_time = pingpong_host_times_[i];
    double d_time = pingpong_device_times_[i];

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import) {
  PerfDetails::halopong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, import, halopong_sizes_, halopong_host_times_);

  PerfDetails::halopong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, import, halopong_sizes_, halopong_device_times_);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE) {
  return PerfDetails::table_lookup(halopong_sizes_, halopong_host_times_, SIZE_IN_BYTES_PER_MESSAGE);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE) {
  return PerfDetails::table_lookup(halopong_sizes_, halopong_device_times_, SIZE_IN_BYTES_PER_MESSAGE);
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_halopong_table(std::ostream &out, const std::string &prefix) {
  if (halopong_sizes_.size() == 0) return;

  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Message Size" << setw(1) << " "
      << setw(20) << "Host (us)" << setw(1) << " "
      << setw(20) << "Device (us)" << std::endl;

  out << prefix
      << setw(20) << "------------" << setw(1) << " "
      << setw(20) << "---------" << setw(1) << " "
      << setw(20) << "-----------" << std::endl;

  for (int i = 0; i < (int)halopong_sizes_.size(); i++) {
    int size      = halopong_sizes_[i];
    double h_time = halopong_host_times_[i];
    double d_time = halopong_device_times_[i];

    out << prefix
        << setw(20) << size << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) << " "
        << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
  }

  out.copyfmt(old_format);
}

/****************************************************************************************/
/****************************************************************************************/
/****************************************************************************************/

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::launch_latency_make_table(int KERNEL_REPEATS) {
  using exec_space   = typename Node::execution_space;
  using range_policy = Kokkos::RangePolicy<exec_space>;
  using clock        = std::chrono::high_resolution_clock;

  double total_test_time = 0;
  clock::time_point start, stop;
  for (int i = 0; i < KERNEL_REPEATS; i++) {
    start = clock::now();
    Kokkos::parallel_for(
        "empty kernel", range_policy(0, 1), KOKKOS_LAMBDA(const size_t j) {
          ;
        });
    exec_space().fence();
    stop = clock::now();
    double my_test_time = std::chrono::duration<double>(stop - start).count();
    total_test_time += my_test_time;
  }

  launch_and_wait_latency_ = total_test_time / KERNEL_REPEATS;
}
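
// launch_and_wait_latency_ is the average cost (in seconds) of launching an empty kernel and
// fencing, i.e. an estimate of the fixed launch + synchronization overhead paid by every
// kernel timed above.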

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
double
PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::launch_latency_lookup() {
  return launch_and_wait_latency_;
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::print_launch_latency_table(std::ostream &out, const std::string &prefix) {
  using namespace std;
  std::ios old_format(NULL);
  old_format.copyfmt(out);

  out << prefix
      << setw(20) << "Launch+Wait Latency (us)" << setw(1) << " "
      << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_ * 1e6) << std::endl;

  out.copyfmt(old_format);
}

}  // namespace MueLu
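
// Minimal usage sketch (illustrative only; assumes an existing RCP'd Teuchos communicator
// `comm`, an RCP'd Xpetra::Import `importer`, and suitable Scalar/LocalOrdinal/GlobalOrdinal/Node
// types):
//
//   MueLu::PerfModels<Scalar, LocalOrdinal, GlobalOrdinal, Node> model;
//   model.stream_vector_make_table(10, 20);        // 10 repeats, sizes 2^0 .. 2^20 scalars
//   model.pingpong_make_table(10, 20, comm);       // host & device MPI ping-pong tables
//   model.halopong_make_table(10, 20, importer);   // halo-exchange tables from the importer
//   model.launch_latency_make_table(10);
//
//   double t = model.stream_vector_lookup(1000 * sizeof(Scalar));  // est. time to stream 1000 scalars
//   model.print_stream_vector_table(std::cout);
//   model.print_pingpong_table(std::cout);
//   model.print_launch_latency_table(std::cout);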