19 #include <Kokkos_ArithTraits.hpp>
21 #if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
22 #include <Xpetra_TpetraImport.hpp>
23 #include <Tpetra_Import.hpp>
24 #include <Tpetra_Distributor.hpp>
34 namespace PerfDetails {
35 template <
class Scalar,
class Node>
38 using impl_scalar_type =
typename Kokkos::ArithTraits<Scalar>::val_type;
40 using exec_space =
typename Node::execution_space;
41 using memory_space =
typename Node::memory_space;
42 using range_policy = Kokkos::RangePolicy<exec_space>;
44 Kokkos::View<impl_scalar_type *, memory_space> a(
"a", VECTOR_SIZE);
45 Kokkos::View<impl_scalar_type *, memory_space> b(
"b", VECTOR_SIZE);
46 Kokkos::View<impl_scalar_type *, memory_space> c(
"c", VECTOR_SIZE);
47 double total_test_time = 0.0;
52 "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(
const size_t i) {
53 a(i) = ONE * (double)i;
58 using clock = std::chrono::high_resolution_clock;
62 for (
int i = 0; i < KERNEL_REPEATS; i++) {
65 "stream/add", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(
const size_t j) {
71 double my_test_time = std::chrono::duration<double>(stop - start).count();
72 total_test_time += my_test_time;
75 return total_test_time / KERNEL_REPEATS;
78 template <
class Scalar,
class Node>
81 using impl_scalar_type =
typename Kokkos::ArithTraits<Scalar>::val_type;
83 using exec_space =
typename Node::execution_space;
84 using memory_space =
typename Node::memory_space;
85 using range_policy = Kokkos::RangePolicy<exec_space>;
87 Kokkos::View<impl_scalar_type *, memory_space> a(
"a", VECTOR_SIZE);
88 Kokkos::View<impl_scalar_type *, memory_space> b(
"b", VECTOR_SIZE);
89 double total_test_time = 0.0;
94 "stream/fill", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(
const size_t i) {
99 using clock = std::chrono::high_resolution_clock;
102 for (
int i = 0; i < KERNEL_REPEATS; i++) {
103 start = clock::now();
104 Kokkos::parallel_for(
105 "stream/copy", range_policy(0, VECTOR_SIZE), KOKKOS_LAMBDA(
const size_t j) {
109 exec_space().fence();
111 double my_test_time = std::chrono::duration<double>(stop - start).count();
112 total_test_time += my_test_time;
115 return total_test_time / KERNEL_REPEATS;
118 double table_lookup(
const std::vector<int> &x,
const std::vector<double> &y,
int value) {
123 int N = (int)x.size();
125 for (; hi < N; hi++) {
134 }
else if (hi == N) {
138 int run = x[hi] - x[hi - 1];
139 double rise = y[hi] - y[hi - 1];
140 double slope = rise / run;
141 int diff = value - x[hi - 1];
143 return y[hi - 1] + slope * diff;
147 int run = x[hi] - x[hi - 1];
148 double rise = y[hi] - y[hi - 1];
149 double slope = rise / run;
150 int diff = value - x[hi - 1];
152 return y[hi - 1] + slope * diff;
157 const double GB = 1024.0 * 1024.0 * 1024.0;
159 double time_per_call = time / num_calls;
160 return memory_per_call_bytes /
GB / time_per_call;
163 template <
class exec_space,
class memory_space>
169 if (nproc < 2)
return;
171 const int buff_size = (int)pow(2, MAX_SIZE);
173 sizes.resize(MAX_SIZE + 1);
174 times.resize(MAX_SIZE + 1);
177 Kokkos::View<char *, memory_space> r_buf(
"recv", buff_size), s_buf(
"send", buff_size);
178 Kokkos::deep_copy(s_buf, 1);
183 int buddy = odd ? rank - 1 : rank + 1;
185 for (
int i = 0; i < MAX_SIZE + 1; i++) {
186 int msg_size = (int)pow(2, i);
189 double t0 = MPI_Wtime();
190 for (
int j = 0; j < KERNEL_REPEATS; j++) {
193 comm.
send(msg_size, (
char *)s_buf.data(), buddy);
194 comm.
receive(buddy, msg_size, (
char *)r_buf.data());
196 comm.
receive(buddy, msg_size, (
char *)r_buf.data());
197 comm.
send(msg_size, (
char *)s_buf.data(), buddy);
202 double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
204 times[i] = time_per_call;
211 template <
class exec_space,
class memory_space,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
213 int nproc =
import->getSourceMap()->getComm()->getSize();
214 if (nproc < 2)
return;
215 #if defined(HAVE_MUELU_TPETRA) && defined(HAVE_MPI)
221 MPI_Comm communicator = *mcomm->getRawMpiComm();
223 if (Ximport.
is_null() || mcomm.is_null())
return;
224 auto Timport = Ximport->getTpetra_Import();
225 auto distor = Timport->getDistributor();
230 int num_recvs = (int)distor.getNumReceives();
231 int num_sends = (int)distor.getNumSends();
233 const int buff_size_per_msg = (int)pow(2, MAX_SIZE);
234 sizes.resize(MAX_SIZE + 1);
235 times.resize(MAX_SIZE + 1);
238 Kokkos::View<char *, memory_space> f_recv_buf(
"forward_recv", buff_size_per_msg * num_recvs), f_send_buf(
"forward_send", buff_size_per_msg * num_sends);
239 Kokkos::View<char *, memory_space> r_recv_buf(
"reverse_recv", buff_size_per_msg * num_sends), r_send_buf(
"reverse_send", buff_size_per_msg * num_recvs);
240 Kokkos::deep_copy(f_send_buf, 1);
241 Kokkos::deep_copy(r_send_buf, 1);
243 std::vector<MPI_Request> requests(num_sends + num_recvs);
244 std::vector<MPI_Status> status(num_sends + num_recvs);
246 for (
int i = 0; i < MAX_SIZE + 1; i++) {
247 int msg_size = (int)pow(2, i);
249 MPI_Barrier(communicator);
251 double t0 = MPI_Wtime();
252 for (
int j = 0; j < KERNEL_REPEATS; j++) {
255 for (
int r = 0; r < num_recvs; r++) {
256 const int tag = 1000 + j;
257 MPI_Irecv(f_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsFrom[r], tag, communicator, &requests[ct]);
260 for (
int s = 0; s < num_sends; s++) {
261 const int tag = 1000 + j;
262 MPI_Isend(f_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsTo[s], tag, communicator, &requests[ct]);
266 MPI_Waitall(ct, requests.data(), status.data());
270 for (
int r = 0; r < num_sends; r++) {
271 const int tag = 2000 + j;
272 MPI_Irecv(r_recv_buf.data() + msg_size * r, msg_size, MPI_CHAR, procsTo[r], tag, communicator, &requests[ct]);
275 for (
int s = 0; s < num_recvs; s++) {
276 const int tag = 2000 + j;
277 MPI_Isend(r_send_buf.data() + msg_size * s, msg_size, MPI_CHAR, procsFrom[s], tag, communicator, &requests[ct]);
281 MPI_Waitall(ct, requests.data(), status.data());
284 double time_per_call = (MPI_Wtime() - t0) / (2.0 * KERNEL_REPEATS);
286 times[i] = time_per_call;
294 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
296 : launch_and_wait_latency_(-1.0) {}
298 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
305 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
308 launch_latency_make_table(KERNEL_REPEATS);
309 double latency = launch_latency_lookup();
311 if (LOG_MAX_SIZE < 2)
314 stream_sizes_.resize(LOG_MAX_SIZE + 1);
315 stream_copy_times_.resize(LOG_MAX_SIZE + 1);
316 stream_add_times_.resize(LOG_MAX_SIZE + 1);
317 latency_corrected_stream_copy_times_.resize(LOG_MAX_SIZE + 1);
318 latency_corrected_stream_add_times_.resize(LOG_MAX_SIZE + 1);
320 for (
int i = 0; i < LOG_MAX_SIZE + 1; i++) {
321 int size = (int)pow(2, i);
322 double c_time = PerfDetails::stream_vector_copy<Scalar, Node>(KERNEL_REPEATS, size);
323 double a_time = PerfDetails::stream_vector_add<Scalar, Node>(KERNEL_REPEATS, size);
325 stream_sizes_[i] = size;
328 stream_copy_times_[i] = c_time / 2.0;
329 stream_add_times_[i] = a_time / 3.0;
333 latency_corrected_stream_copy_times_[i] = (c_time - latency <= 0.0) ? c_time / 2.0 : ((c_time - latency) / 2.0);
334 latency_corrected_stream_add_times_[i] = (a_time - latency <= 0.0) ? a_time / 3.0 : ((a_time - latency) / 3.0);
338 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
344 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
350 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
353 return std::min(stream_vector_copy_lookup(SIZE_IN_BYTES), stream_vector_add_lookup(SIZE_IN_BYTES));
356 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
362 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
368 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
371 return std::min(latency_corrected_stream_vector_copy_lookup(SIZE_IN_BYTES), latency_corrected_stream_vector_add_lookup(SIZE_IN_BYTES));
374 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
376 print_stream_vector_table_impl(out,
false, prefix);
379 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
381 print_stream_vector_table_impl(out,
true, prefix);
384 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
387 std::ios old_format(NULL);
388 old_format.copyfmt(out);
391 << setw(20) <<
"Length in Scalars" << setw(1) <<
" "
392 << setw(20) <<
"COPY (us)" << setw(1) <<
" "
393 << setw(20) <<
"ADD (us)" << setw(1) <<
" "
394 << setw(20) <<
"COPY (GB/s)" << setw(1) <<
" "
395 << setw(20) <<
"ADD (GB/s)" << std::endl;
398 << setw(20) <<
"-----------------" << setw(1) <<
" "
399 << setw(20) <<
"---------" << setw(1) <<
" "
400 << setw(20) <<
"--------" << setw(1) <<
" "
401 << setw(20) <<
"-----------" << setw(1) <<
" "
402 << setw(20) <<
"----------" << std::endl;
404 for (
int i = 0; i < (int)stream_sizes_.size(); i++) {
405 int size = stream_sizes_[i];
406 double c_time = use_latency_correction ? latency_corrected_stream_copy_times_[i] : stream_copy_times_[i];
407 double a_time = use_latency_correction ? latency_corrected_stream_add_times_[i] : stream_add_times_[i];
413 << setw(20) << size << setw(1) <<
" "
414 << setw(20) << fixed << setprecision(4) << (c_time * 1e6) << setw(1) <<
" "
415 << setw(20) << fixed << setprecision(4) << (a_time * 1e6) << setw(1) <<
" "
416 << setw(20) << fixed << setprecision(4) << c_bw << setw(1) <<
" "
417 << setw(20) << fixed << setprecision(4) << a_bw << std::endl;
420 out.copyfmt(old_format);
427 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
429 PerfDetails::pingpong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_host_times_);
431 PerfDetails::pingpong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE, *comm, pingpong_sizes_, pingpong_device_times_);
434 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
440 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
446 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
448 if (pingpong_sizes_.size() == 0)
return;
451 std::ios old_format(NULL);
452 old_format.copyfmt(out);
455 << setw(20) <<
"Message Size" << setw(1) <<
" "
456 << setw(20) <<
"Host (us)" << setw(1) <<
" "
457 << setw(20) <<
"Device (us)" << std::endl;
460 << setw(20) <<
"------------" << setw(1) <<
" "
461 << setw(20) <<
"---------" << setw(1) <<
" "
462 << setw(20) <<
"-----------" << std::endl;
464 for (
int i = 0; i < (int)pingpong_sizes_.size(); i++) {
465 int size = pingpong_sizes_[i];
466 double h_time = pingpong_host_times_[i];
467 double d_time = pingpong_device_times_[i];
470 << setw(20) << size << setw(1) <<
" "
471 << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) <<
" "
472 << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
475 out.copyfmt(old_format);
481 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
483 PerfDetails::halopong_basic<Kokkos::HostSpace::execution_space, Kokkos::HostSpace::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE,
import, halopong_sizes_, halopong_host_times_);
485 PerfDetails::halopong_basic<typename Node::execution_space, typename Node::memory_space>(KERNEL_REPEATS, LOG_MAX_SIZE,
import, halopong_sizes_, halopong_device_times_);
488 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
494 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
500 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
502 if (halopong_sizes_.size() == 0)
return;
505 std::ios old_format(NULL);
506 old_format.copyfmt(out);
509 << setw(20) <<
"Message Size" << setw(1) <<
" "
510 << setw(20) <<
"Host (us)" << setw(1) <<
" "
511 << setw(20) <<
"Device (us)" << std::endl;
514 << setw(20) <<
"------------" << setw(1) <<
" "
515 << setw(20) <<
"---------" << setw(1) <<
" "
516 << setw(20) <<
"-----------" << std::endl;
518 for (
int i = 0; i < (int)halopong_sizes_.size(); i++) {
519 int size = halopong_sizes_[i];
520 double h_time = halopong_host_times_[i];
521 double d_time = halopong_device_times_[i];
524 << setw(20) << size << setw(1) <<
" "
525 << setw(20) << fixed << setprecision(4) << (h_time * 1e6) << setw(1) <<
" "
526 << setw(20) << fixed << setprecision(4) << (d_time * 1e6) << setw(1) << std::endl;
529 out.copyfmt(old_format);
536 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
538 using exec_space =
typename Node::execution_space;
539 using range_policy = Kokkos::RangePolicy<exec_space>;
540 using clock = std::chrono::high_resolution_clock;
542 double total_test_time = 0;
544 for (
int i = 0; i < KERNEL_REPEATS; i++) {
545 start = clock::now();
546 Kokkos::parallel_for(
547 "empty kernel", range_policy(0, 1), KOKKOS_LAMBDA(
const size_t j) {
550 exec_space().fence();
552 double my_test_time = std::chrono::duration<double>(stop - start).count();
553 total_test_time += my_test_time;
556 launch_and_wait_latency_ = total_test_time / KERNEL_REPEATS;
559 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
562 return launch_and_wait_latency_;
565 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node>
568 std::ios old_format(NULL);
569 old_format.copyfmt(out);
572 << setw(20) <<
"Launch+Wait Latency (us)" << setw(1) <<
" "
573 << setw(20) << fixed << setprecision(4) << (launch_and_wait_latency_ * 1e6) << std::endl;
575 out.copyfmt(old_format);
void halopong_basic(int KERNEL_REPEATS, int MAX_SIZE, const RCP< const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > &import, std::vector< int > &sizes, std::vector< double > ×)
virtual int getSize() const =0
virtual int getRank() const =0
double convert_time_to_bandwidth_gbs(double time, int num_calls, double memory_per_call_bytes)
double stream_vector_copy(int KERNEL_REPEATS, int VECTOR_SIZE)
double pingpong_device_lookup(int SIZE_IN_BYTES)
virtual int receive(const int sourceRank, const Ordinal bytes, char recvBuffer[]) const =0
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
void print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix="")
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
void pingpong_basic(int KERNEL_REPEATS, int MAX_SIZE, const Teuchos::Comm< int > &comm, std::vector< int > &sizes, std::vector< double > ×)
double table_lookup(const std::vector< int > &x, const std::vector< double > &y, int value)
virtual void barrier() const =0
double stream_vector_lookup(int SIZE_IN_BYTES)
MueLu::DefaultScalar Scalar
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix)
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
void print_launch_latency_table(std::ostream &out, const std::string &prefix="")
void print_halopong_table(std::ostream &out, const std::string &prefix="")
void print_pingpong_table(std::ostream &out, const std::string &prefix="")
double pingpong_host_lookup(int SIZE_IN_BYTES)
void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > &import)
virtual void send(const Ordinal bytes, const char sendBuffer[], const int destRank) const =0
void launch_latency_make_table(int KERNEL_REPEATS)
double stream_vector_add(int KERNEL_REPEATS, int VECTOR_SIZE)
double launch_latency_lookup()
double stream_vector_add_lookup(int SIZE_IN_BYTES)
void print_stream_vector_table(std::ostream &out, const std::string &prefix="")
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)