19 #if KOKKOS_VERSION >= 40799
20 #include <KokkosKernels_ArithTraits.hpp>
22 #include <Kokkos_ArithTraits.hpp>
25 #if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
26 #include <Xpetra_TpetraImport.hpp>
27 #include <Tpetra_Import.hpp>
28 #include <Tpetra_Distributor.hpp>
38 namespace PerfDetails {
39 template <
class Scalar,
class Node>
42 #if KOKKOS_VERSION >= 40799
43 using impl_scalar_type =
typename KokkosKernels::ArithTraits<Scalar>::val_type;
45 using impl_scalar_type =
typename Kokkos::ArithTraits<Scalar>::val_type;
48 using exec_space =
typename Node::execution_space;
49 using memory_space =
typename Node::memory_space;
50 using range_policy = Kokkos::RangePolicy<exec_space>;
52 Kokkos::View<impl_scalar_type *, memory_space> a(
"a", VECTOR_SIZE);
53 Kokkos::View<impl_scalar_type *, memory_space> b(
"b", VECTOR_SIZE);
54 Kokkos::View<impl_scalar_type *, memory_space> c(
"c", VECTOR_SIZE);
55 double total_test_time = 0.0;
60 "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(
const size_t i) {
61 a(i) = ONE * (double)i;
66 using clock = std::chrono::high_resolution_clock;
70 for (
int i = 0; i < KERNEL_REPEATS; i++) {
73 "stream/add", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(
const size_t j) {
79 double my_test_time = std::chrono::duration<double>(stop - start).count();
80 total_test_time += my_test_time;
83 return total_test_time / KERNEL_REPEATS;
86 template <
class Scalar,
class Node>
89 #if KOKKOS_VERSION >= 40799
90 using impl_scalar_type =
typename KokkosKernels::ArithTraits<Scalar>::val_type;
92 using impl_scalar_type =
typename Kokkos::ArithTraits<Scalar>::val_type;
95 using exec_space =
typename Node::execution_space;
96 using memory_space =
typename Node::memory_space;
97 using range_policy = Kokkos::RangePolicy<exec_space>;
99 Kokkos::View<impl_scalar_type *, memory_space> a(
"a", VECTOR_SIZE);
100 Kokkos::View<impl_scalar_type *, memory_space> b(
"b", VECTOR_SIZE);
101 double total_test_time = 0.0;
105 Kokkos::parallel_for(
106 "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(
const size_t i) {
109 exec_space().fence();
111 using clock = std::chrono::high_resolution_clock;
114 for (
int i = 0; i < KERNEL_REPEATS; i++) {
115 start = clock::now();
116 Kokkos::parallel_for(
117 "stream/copy", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(
const size_t j) {
121 exec_space().fence();
123 double my_test_time = std::chrono::duration<double>(stop - start).count();
124 total_test_time += my_test_time;
127 return total_test_time / KERNEL_REPEATS;
130 double table_lookup(
const std::vector<int> &x,
const std::vector<double> &y,
int value) {
135 int N = (int)x.size();
137 for (; hi < N; hi++) {
146 }
else if (hi == N) {
150 int run = x[hi] - x[hi - 1];
151 double rise = y[hi] - y[hi - 1];
152 double slope = rise / run;
153 int diff = value - x[hi - 1];
155 return y[hi - 1] + slope * diff;
159 int run = x[hi] - x[hi - 1];
160 double rise = y[hi] - y[hi - 1];
161 double slope = rise / run;
162 int diff = value - x[hi - 1];
164 return y[hi - 1] + slope * diff;
169 const double GB = 1024.0 * 1024.0 * 1024.0;
171 double time_per_call = time / num_calls;
172 return memory_per_call_bytes /
GB / time_per_call;
175 template <
class exec_space,
class memory_space>
181 if (nproc < 2)
return;
183 const int buff_size = (int)pow(2, MAX_SIZE);
185 sizes.resize(MAX_SIZE + 1);
186 times.resize(MAX_SIZE + 1);
189 Kokkos::View<char *, memory_space> r_buf(
"recv", buff_size), s_buf(
"send", buff_size);
190 Kokkos::deep_copy(s_buf, 1);
195 int buddy = odd ? rank - 1 : rank + 1;
197 for (
int i = 0; i < MAX_SIZE + 1; i++) {
198 int msg_size = (int)pow(2, i);
201 double t0 = MPI_Wtime();
202 for (
int j = 0; j < KERNEL_REPEATS; j++) {
205 comm.
send(msg_size, (
char *)s_buf.data(), buddy);
206 comm.
receive(buddy, msg_size, (
char *)r_buf.data());
208 comm.
receive(buddy, msg_size, (
char *)r_buf.data());
209 comm.
send(msg_size, (
char *)s_buf.data(), buddy);
214 double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
216 times[i] = time_per_call;
223 template <
class exec_space,
class memory_space,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
225 int nproc =
import->getSourceMap()->getComm()->getSize();
226 if (nproc < 2)
return;
227 #if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
233 MPI_Comm communicator = *mcomm->getRawMpiComm();
235 if (Ximport.
is_null() || mcomm.is_null())
return;
236 auto Timport = Ximport->getTpetra_Import();
237 auto distor = Timport->getDistributor();
242 int num_recvs = (int)distor.getNumReceives();
243 int num_sends = (int)distor.getNumSends();
245 const int buff_size_per_msg = (int)pow(2, MAX_SIZE);
246 sizes.resize(MAX_SIZE + 1);
247 times.resize(MAX_SIZE + 1);
250 Kokkos::View<char *, memory_space> f_recv_buf(
"forward_recv", buff_size_per_msg * num_recvs), f_send_buf(
"forward_send", buff_size_per_msg * num_sends);
251 Kokkos::View<char *, memory_space> r_recv_buf(
"reverse_recv", buff_size_per_msg * num_sends), r_send_buf(
"reverse_send", buff_size_per_msg * num_recvs);
252 Kokkos::deep_copy(f_send_buf, 1);
253 Kokkos::deep_copy(r_send_buf, 1);
255 std::vector<MPI_Request> requests(num_sends + num_recvs);
256 std::vector<MPI_Status> status(num_sends + num_recvs);
258 for (
int i = 0; i < MAX_SIZE + 1; i++) {
259 int msg_size = (int)pow(2, i);
261 MPI_Barrier(communicator);
263 double t0 = MPI_Wtime();
264 for (
int j = 0; j < KERNEL_REPEATS; j++) {
267 for (
int r = 0; r < num_recvs; r++) {
268 const int tag = 1000 + j;
269 MPI_Irecv(f_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsFrom[r], tag, communicator, &requests[ct]);
272 for (
int s = 0; s < num_sends; s++) {
273 const int tag = 1000 + j;
274 MPI_Isend(f_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsTo[s], tag, communicator, &requests[ct]);
278 MPI_Waitall(ct, requests.data(), status.data());
282 for (
int r = 0; r < num_sends; r++) {
283 const int tag = 2000 + j;
284 MPI_Irecv(r_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsTo[r], tag, communicator, &requests[ct]);
287 for (
int s = 0; s < num_recvs; s++) {
288 const int tag = 2000 + j;
289 MPI_Isend(r_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsFrom[s], tag, communicator, &requests[ct]);
293 MPI_Waitall(ct, requests.data(), status.data());
296 double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
298 times[i] = time_per_call;
306 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
308 : launch_and_wait_latency_(-1.0) {}
310 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
317 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
320 launch_latency_make_table(KERNEL_REPEATS);
321 double latency = launch_latency_lookup();
323 if (LOG_MAX_SIZE < 2)
326 stream_sizes_.resize(LOG_MAX_SIZE + 1);
327 stream_copy_times_.resize(LOG_MAX_SIZE + 1);
328 stream_add_times_.resize(LOG_MAX_SIZE + 1);
329 latency_corrected_stream_copy_times_.resize(LOG_MAX_SIZE + 1);
330 latency_corrected_stream_add_times_.resize(LOG_MAX_SIZE + 1);
332 for (
int i = 0; i < LOG_MAX_SIZE + 1; i++) {
333 int size = (int)pow(2, i);
334 double c_time = PerfDetails::stream_vector_copy<Scalar, Node>(KERNEL_REPEATS, size);
335 double a_time = PerfDetails::stream_vector_add<Scalar, Node>(KERNEL_REPEATS, size);
337 stream_sizes_[i] = size;
340 stream_copy_times_[i] = c_time / 2.0;
341 stream_add_times_[i] = a_time / 3.0;
345 latency_corrected_stream_copy_times_[i] = (c_time - latency <= 0.0) ? c_time / 2.0 : ((c_time - latency) / 2.0);
346 latency_corrected_stream_add_times_[i] = (a_time - latency <= 0.0) ? a_time / 3.0 : ((a_time - latency) / 3.0);
350 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
356 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
362 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
365 return std::min(stream_vector_copy_lookup(SIZE_IN_BYTES), stream_vector_add_lookup(SIZE_IN_BYTES));
368 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
374 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
380 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
383 return std::min(latency_corrected_stream_vector_copy_lookup(SIZE_IN_BYTES), latency_corrected_stream_vector_add_lookup(SIZE_IN_BYTES));
386 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
388 print_stream_vector_table_impl(out,
false, prefix);
391 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
393 print_stream_vector_table_impl(out,
true, prefix);
396 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
399 std::ios old_format(NULL);
400 old_format.copyfmt(out);
403 << setw(20) <<
"Length in Scalars" << setw(1) <<
" "
404 << setw(20) <<
"COPY (us)" << setw(1) <<
" "
405 << setw(20) <<
"ADD (us)" << setw(1) <<
" "
406 << setw(20) <<
"COPY (GB/s)" << setw(1) <<
" "
407 << setw(20) <<
"ADD (GB/s)" << std::endl;
410 << setw(20) <<
"-----------------" << setw(1) <<
" "
411 << setw(20) <<
"---------" << setw(1) <<
" "
412 << setw(20) <<
"--------" << setw(1) <<
" "
413 << setw(20) <<
"-----------" << setw(1) <<
" "
414 << setw(20) <<
"----------" << std::endl;
416 for (
int i = 0; i < (int)stream_sizes_.size(); i++) {
417 int size = stream_sizes_[i];
418 double c_time = use_latency_correction ? latency_corrected_stream_copy_times_[i] : stream_copy_times_[i];
419 double a_time = use_latency_correction ? latency_corrected_stream_add_times_[i] : stream_add_times_[i];
425 << setw(20) << size << setw(1) <<
" "
426 << setw(20) << fixed << setprecision(4) << (c_time * 1e6) << setw(1) <<
" "
427 << setw(20) << fixed << setprecision(4) << (a_time * 1e6) << setw(1) <<
" "
428 << setw(20) << fixed << setprecision(4) << c_bw << setw(1) <<
" "
429 << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
432 out.copyfmt(old_format);
439 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
441 PerfDetails::pingpong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_host_times_);
443 PerfDetails::pingpong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_device_times_);
446 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
452 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
458 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
460 if (pingpong_sizes_.size() == 0)
return;
463 std::ios old_format(NULL);
464 old_format.copyfmt(out);
467 << setw(20) <<
"Message Size" << setw(1) <<
" "
468 << setw(20) <<
"Host (us)" << setw(1) <<
" "
469 << setw(20) <<
"Device (us)" << std::endl;
472 << setw(20) <<
"------------" << setw(1) <<
" "
473 << setw(20) <<
"---------" << setw(1) <<
" "
474 << setw(20) <<
"-----------" << std::endl;
476 for (
int i = 0; i < (int)pingpong_sizes_.size(); i++) {
477 int size = pingpong_sizes_[i];
478 double h_time = pingpong_host_times_[i];
479 double d_time = pingpong_device_times_[i];
482 << setw(20) << size << setw(1) <<
" "
483 << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) <<
" "
484 << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
487 out.copyfmt(old_format);
493 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
495 PerfDetails::halopong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE,
import, halopong_sizes_, halopong_host_times_);
497 PerfDetails::halopong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE,
import, halopong_sizes_, halopong_device_times_);
500 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
506 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
512 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
514 if (halopong_sizes_.size() == 0)
return;
517 std::ios old_format(NULL);
518 old_format.copyfmt(out);
521 << setw(20) <<
"Message Size" << setw(1) <<
" "
522 << setw(20) <<
"Host (us)" << setw(1) <<
" "
523 << setw(20) <<
"Device (us)" << std::endl;
526 << setw(20) <<
"------------" << setw(1) <<
" "
527 << setw(20) <<
"---------" << setw(1) <<
" "
528 << setw(20) <<
"-----------" << std::endl;
530 for (
int i = 0; i < (int)halopong_sizes_.size(); i++) {
531 int size = halopong_sizes_[i];
532 double h_time = halopong_host_times_[i];
533 double d_time = halopong_device_times_[i];
536 << setw(20) << size << setw(1) <<
" "
537 << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) <<
" "
538 << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
541 out.copyfmt(old_format);
548 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
550 using exec_space =
typename Node::execution_space;
551 using range_policy = Kokkos::RangePolicy<exec_space>;
552 using clock = std::chrono::high_resolution_clock;
554 double total_test_time = 0;
556 for (
int i = 0; i < KERNEL_REPEATS; i++) {
557 start = clock::now();
558 Kokkos::parallel_for(
559 "empty kernel", range_policy(0, 1), KOKKOS_LAMBDA(
const size_t j) {
562 exec_space().fence();
564 double my_test_time = std::chrono::duration<double>(stop - start).count();
565 total_test_time += my_test_time;
568 launch_and_wait_latency_ = total_test_time / KERNEL_REPEATS;
571 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
574 return launch_and_wait_latency_;
577 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
580 std::ios old_format(NULL);
581 old_format.copyfmt(out);
584 << setw(20) <<
"Launch+Wait Latency (us)" << setw(1) <<
" "
585 << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_ * 1e6) << std::endl;
587 out.copyfmt(old_format);
void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE, const RCP< const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > &import, std::vector< int > &sizes, std::vector< double > ×)
virtual int getSize() const =0
virtual int getRank() const =0
double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes)
double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE)
double pingpong_device_lookup(int SIZE_IN_BYTES)
virtual int receive(const int sourceRank, const Ordinal bytes, char recvBuffer[]) const =0
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
void print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix="")
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm< int > &comm, std::vector< int > &sizes, std::vector< double > ×)
double table_lookup(const std::vector< int > &x, const std::vector< double > &y, int value)
virtual void barrier() const =0
double stream_vector_lookup(int SIZE_IN_BYTES)
MueLu::DefaultScalar Scalar
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix)
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
void print_launch_latency_table(std::ostream &out, const std::string &prefix="")
void print_halopong_table(std::ostream &out, const std::string &prefix="")
void print_pingpong_table(std::ostream &out, const std::string &prefix="")
double pingpong_host_lookup(int SIZE_IN_BYTES)
void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > &import)
virtual void send(const Ordinal bytes, const char sendBuffer[], const int destRank) const =0
void launch_latency_make_table(int KERNEL_REPEATS)
double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE)
double launch_latency_lookup()
double stream_vector_add_lookup(int SIZE_IN_BYTES)
void print_stream_vector_table(std::ostream &out, const std::string &prefix="")
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)