MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_PerfModels_decl.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // MueLu: A package for multigrid based preconditioning
4 //
5 // Copyright 2012 NTESS and the MueLu contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef MUELU_PERFMODELS_HPP
11 #define MUELU_PERFMODELS_HPP
12 
13 #include "MueLu_ConfigDefs.hpp"
14 #include "Xpetra_Import_fwd.hpp"
15 
16 #include <vector>
17 #include <ostream>
18 #include <Teuchos_DefaultComm.hpp>
19 
20 #include "MueLu_PerfModels_fwd.hpp"
21 
22 namespace MueLu {
23 
24 template <class Scalar,
27  class Node = DefaultNode>
28 class PerfModels {
29  public:
30  PerfModels();
31  ~PerfModels();
32 
33  /* Single Node tests based upon the STREAM benchmark for measuring memory
34  * bandwidth and computation rate. These processes compute either the addition
35  * of two vectors or the multiplication of dense matrices of any given size.
36  * Many iterations occur which then return a vector containing the individual
37  * lengths of time per iteration.
38  *
39  * See further here:
40  * - https://www.cs.virginia.edu/stream/ref.html
41  * - https://github.com/UoB-HPC/BabelStream
42  */
43 
44  /* This version is for table interpolation and works on chars, so the LOG_MAX_SIZE is for bytes */
45  void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE = 20);
46  bool has_stream_vector_table() const { return stream_sizes_.size() > 0; }
47 
48  /* Lookup in the stream_vector table */
49  double stream_vector_copy_lookup(int SIZE_IN_BYTES);
50  double stream_vector_add_lookup(int SIZE_IN_BYTES);
51  double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES);
52  double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES);
53 
54  // Uses the faster of the tables. The time is then divided by the number of memory transactions
55  // per element in the kernel (e.g. 2 for COPY and 3 for ADD).
56  double stream_vector_lookup(int SIZE_IN_BYTES);
57  double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES);
58 
59  /* Print table */
60  void print_stream_vector_table(std::ostream &out, const std::string &prefix = "");
61  void print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix = "");
62 
63  /* A latency test between two processes based upon the MVAPICH OSU Micro-Benchmarks.
64  * The sender process sends a message and then waits for confirmation of reception.
65  * Many iterations occur with various message sizes and the average latency values
66  * are returned within a map. Utilizes blocking send and receive.
67  *
68  * See further: https://mvapich.cse.ohio-state.edu/benchmarks/
69  */
70  void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm);
71  bool has_pingpong_table() const { return pingpong_sizes_.size() > 0; }
72 
73  /* Lookup in the pingpong table */
74  double pingpong_host_lookup(int SIZE_IN_BYTES);
75  double pingpong_device_lookup(int SIZE_IN_BYTES);
76 
77  /* Print table */
78  void print_pingpong_table(std::ostream &out, const std::string &prefix = "");
79 
80  /* A halo-exchange based ping-pong, inspired by halo-mode in MPPTEST from ANL.
81  * Here we use exactly the communication pattern specified in the import object
82  * and send messages accordingly. We vary the size in bytes sent per message,
83  * which should capture max-rate effects to some degree.
84  *
85  * See further: https://www.mcs.anl.gov/research/projects/mpi/mpptest/
86  */
87  void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import);
88  bool has_halopong_table() const { return halopong_sizes_.size() > 0; }
89 
90  /* Lookup in the halopong table */
91  double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
92  double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
93 
94  /* Print table */
95  void print_halopong_table(std::ostream &out, const std::string &prefix = "");
96 
97  /* Estimate launch latency based on the cost of submitting an empty Kokkos::parallel_for.
98  * This is necessary to correct the memory bandwidth costs for models on high latency platforms,
99  * e.g., GPUs.
100  */
101  void launch_latency_make_table(int KERNEL_REPEATS);
103 
104  /* Lookup launch latency */
105  double launch_latency_lookup();
106 
107  /* Print table */
108  void print_launch_latency_table(std::ostream &out, const std::string &prefix = "");
109 
110  private:
111  void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix);
112 
113  std::vector<int> stream_sizes_;
114  std::vector<double> stream_copy_times_;
115  std::vector<double> stream_add_times_;
118 
119  std::vector<int> pingpong_sizes_;
120  std::vector<double> pingpong_host_times_;
121  std::vector<double> pingpong_device_times_;
122 
123  std::vector<int> halopong_sizes_;
124  std::vector<double> halopong_host_times_;
125  std::vector<double> halopong_device_times_;
126 
128 
129 }; // class PerfModels
130 
131 } // namespace MueLu
132 
133 #endif // ifndef MUELU_PERFMODELS_HPP
std::vector< double > latency_corrected_stream_copy_times_
bool has_halopong_table() const
MueLu::DefaultLocalOrdinal LocalOrdinal
std::vector< double > halopong_device_times_
std::vector< int > halopong_sizes_
Tpetra::KokkosClassic::DefaultNode::DefaultNodeType DefaultNode
double pingpong_device_lookup(int SIZE_IN_BYTES)
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
void print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix="")
std::vector< double > latency_corrected_stream_add_times_
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
MueLu::DefaultNode Node
std::vector< double > stream_add_times_
double stream_vector_lookup(int SIZE_IN_BYTES)
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
std::vector< int > pingpong_sizes_
std::vector< int > stream_sizes_
double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix)
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
void print_launch_latency_table(std::ostream &out, const std::string &prefix="")
void print_halopong_table(std::ostream &out, const std::string &prefix="")
std::vector< double > halopong_host_times_
std::vector< double > pingpong_device_times_
bool has_launch_latency_table() const
void print_pingpong_table(std::ostream &out, const std::string &prefix="")
double pingpong_host_lookup(int SIZE_IN_BYTES)
void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > &import)
void launch_latency_make_table(int KERNEL_REPEATS)
std::vector< double > stream_copy_times_
double stream_vector_add_lookup(int SIZE_IN_BYTES)
void print_stream_vector_table(std::ostream &out, const std::string &prefix="")
std::vector< double > pingpong_host_times_
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)
bool has_stream_vector_table() const
bool has_pingpong_table() const