MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_PerfModels_decl.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 //
43 // ***********************************************************************
44 //
45 // @HEADER
46 #ifndef MUELU_PERFMODELS_HPP
47 #define MUELU_PERFMODELS_HPP
48 
49 #include "MueLu_ConfigDefs.hpp"
50 #include "Xpetra_Import_fwd.hpp"
51 
52 #include <vector>
53 #include <ostream>
54 #include <Teuchos_DefaultComm.hpp>
55 
56 #include "MueLu_PerfModels_fwd.hpp"
57 
58 namespace MueLu {
59 
60 template <class Scalar,
63  class Node = DefaultNode>
64 class PerfModels {
65  public:
66  PerfModels();
67 
68  /* Single Node tests based upon the STREAM benchmark for measuring memory
69  * bandwidth and computation rate. These processes compute either the addition
70  * of two vectors or the multiplication of dense matrices of any given size.
71  * Many iterations occur which then return a vector containing the individual
72  * lengths of time per iteration. (NOTE(review): stream_vector_make_table is declared void - confirm where the timings end up.)
73  *
74  * See further here:
75  * - https://www.cs.virginia.edu/stream/ref.html
76  * - https://github.com/UoB-HPC/BabelStream
77  */
78 
79  /* This version is for table interpolation and works on chars, so the LOG_MAX_SIZE is for bytes */
80  void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE = 20);
81  bool has_stream_vector_table() const { return stream_sizes_.size() > 0; }
82 
83  /* Lookup in the stream_vector table */
84  double stream_vector_copy_lookup(int SIZE_IN_BYTES);
85  double stream_vector_add_lookup(int SIZE_IN_BYTES);
86  double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES);
87  double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES);
88 
89  // Uses the faster of the tables. The time is then divided by the number of memory transactions
90  // per element in the kernel (e.g. 2 for COPY and 3 for ADD).
91  double stream_vector_lookup(int SIZE_IN_BYTES);
92  double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES);
93 
94  /* Print table */
95  void print_stream_vector_table(std::ostream &out, const std::string &prefix = "");
96  void print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix = "");
97 
98  /* A latency test between two processes based upon the MVAPICH OSU Micro-Benchmarks.
99  * The sender process sends a message and then waits for confirmation of reception.
100  * Many iterations occur with various message sizes and the average latency values
101  * are returned within a map. Utilizes blocking send and receive. (NOTE(review): pingpong_make_table is declared void - confirm "returned within a map".)
102  *
103  * See further: https://mvapich.cse.ohio-state.edu/benchmarks/
104  */
105  void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int> > &comm);
106  bool has_pingpong_table() const { return pingpong_sizes_.size() > 0; }
107 
108  /* Lookup in the pingpong table */
109  double pingpong_host_lookup(int SIZE_IN_BYTES);
110  double pingpong_device_lookup(int SIZE_IN_BYTES);
111 
112  /* Print table */
113  void print_pingpong_table(std::ostream &out, const std::string &prefix = "");
114 
115  /* A halo-exchange based ping-pong, inspired by halo-mode in MPPTEST from ANL.
116  * Here we use exactly the communication pattern specified in the import object
117  * and send messages accordingly. We vary the size in bytes sent per message,
118  * which should capture max-rate effects to some degree.
119  *
120  * See further: https://www.mcs.anl.gov/research/projects/mpi/mpptest/
121  */
122  void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node> > &import);
123  bool has_halopong_table() const { return halopong_sizes_.size() > 0; }
124 
125  /* Lookup in the halopong table */
126  double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
127  double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
128 
129  /* Print table */
130  void print_halopong_table(std::ostream &out, const std::string &prefix = "");
131 
132  /* Estimate launch latency based on the cost of submitting an empty Kokkos::parallel_for.
133  * This is necessary to correct the memory bandwidth costs for models on high latency platforms,
134  * e.g., GPUs.
135  */
136  void launch_latency_make_table(int KERNEL_REPEATS);
138 
139  /* Lookup launch latency */
140  double launch_latency_lookup();
141 
142  /* Print table */
143  void print_launch_latency_table(std::ostream &out, const std::string &prefix = "");
144 
145  private:
146  void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix);
147 
148  std::vector<int> stream_sizes_;
149  std::vector<double> stream_copy_times_;
150  std::vector<double> stream_add_times_;
153 
154  std::vector<int> pingpong_sizes_;
155  std::vector<double> pingpong_host_times_;
156  std::vector<double> pingpong_device_times_;
157 
158  std::vector<int> halopong_sizes_;
159  std::vector<double> halopong_host_times_;
160  std::vector<double> halopong_device_times_;
161 
163 
164 }; // class PerfModels
165 
166 } // namespace MueLu
167 
168 #endif // ifndef MUELU_PERFMODELS_HPP
std::vector< double > latency_corrected_stream_copy_times_
bool has_halopong_table() const
MueLu::DefaultLocalOrdinal LocalOrdinal
std::vector< double > halopong_device_times_
std::vector< int > halopong_sizes_
Tpetra::KokkosClassic::DefaultNode::DefaultNodeType DefaultNode
double pingpong_device_lookup(int SIZE_IN_BYTES)
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
void print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix="")
std::vector< double > latency_corrected_stream_add_times_
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
MueLu::DefaultNode Node
std::vector< double > stream_add_times_
double stream_vector_lookup(int SIZE_IN_BYTES)
MueLu::DefaultScalar Scalar
MueLu::DefaultGlobalOrdinal GlobalOrdinal
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
std::vector< int > pingpong_sizes_
std::vector< int > stream_sizes_
double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix)
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
void print_launch_latency_table(std::ostream &out, const std::string &prefix="")
void print_halopong_table(std::ostream &out, const std::string &prefix="")
std::vector< double > halopong_host_times_
std::vector< double > pingpong_device_times_
bool has_launch_latency_table() const
void print_pingpong_table(std::ostream &out, const std::string &prefix="")
double pingpong_host_lookup(int SIZE_IN_BYTES)
void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > &import)
void launch_latency_make_table(int KERNEL_REPEATS)
std::vector< double > stream_copy_times_
double stream_vector_add_lookup(int SIZE_IN_BYTES)
void print_stream_vector_table(std::ostream &out, const std::string &prefix="")
std::vector< double > pingpong_host_times_
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)
bool has_stream_vector_table() const
bool has_pingpong_table() const