doc/html/MueLu__PerfModels__decl_8hpp_source.html

 // @HEADER

 // *****************************************************************************

 //        MueLu: A package for multigrid based preconditioning

 //

 // Copyright 2012 NTESS and the MueLu contributors.

 // SPDX-License-Identifier: BSD-3-Clause

 // *****************************************************************************

 // @HEADER


 #ifndef MUELU_PERFMODELS_HPP

 #define MUELU_PERFMODELS_HPP


 #include "MueLu_ConfigDefs.hpp"

 #include "Xpetra_Import_fwd.hpp"


 #include <vector>

 #include <ostream>

 #include <Teuchos_DefaultComm.hpp>


 #include "MueLu_PerfModels_fwd.hpp"


 namespace MueLu {


 /// Holds benchmark-derived timing tables (STREAM-style memory kernels, MPI
 /// ping-pong, halo exchange, and kernel launch latency) together with the
 /// lookup and print routines that operate on them.
 ///
 /// Every `*_make_table` routine is declared here and defined elsewhere; the
 /// matching `has_*_table` predicate reports whether the associated storage
 /// currently holds data.
 template <class Scalar,
           class LocalOrdinal  = DefaultLocalOrdinal,
           class GlobalOrdinal = DefaultGlobalOrdinal,
           class Node          = DefaultNode>
 class PerfModels {
  public:
   PerfModels();
   ~PerfModels();

   // ---------------------------------------------------------------------
   // STREAM-style single-node kernels
   //
   // Single-node tests modeled on the STREAM benchmark for measuring memory
   // bandwidth and computation rate.  The kernels are repeated many times and
   // a time is kept per table entry.
   //
   // References:
   //    - https://www.cs.virginia.edu/stream/ref.html
   //    - https://github.com/UoB-HPC/BabelStream
   // ---------------------------------------------------------------------

   /// Build the stream-vector table used for interpolation.  This variant
   /// operates on chars, so LOG_MAX_SIZE refers to a size in bytes.
   void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE = 20);

   /// True once the stream-vector size list is non-empty.
   bool has_stream_vector_table() const { return !stream_sizes_.empty(); }

   /// Lookups in the stream-vector table (COPY and ADD kernels, with and
   /// without latency correction).
   double stream_vector_copy_lookup(int SIZE_IN_BYTES);
   double stream_vector_add_lookup(int SIZE_IN_BYTES);
   double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES);
   double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES);

   /// Combined lookups: take the faster of the tables, then divide the time by
   /// the number of memory transactions per element in the kernel
   /// (e.g. 2 for COPY and 3 for ADD).
   double stream_vector_lookup(int SIZE_IN_BYTES);
   double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES);

   /// Print the stream-vector table (plain or latency-corrected) to `out`.
   void print_stream_vector_table(std::ostream& out, const std::string& prefix = "");
   void print_latency_corrected_stream_vector_table(std::ostream& out, const std::string& prefix = "");

   // ---------------------------------------------------------------------
   // Ping-pong latency
   //
   // A latency test between two processes modeled on the MVAPICH OSU
   // Micro-Benchmarks.  The sender transmits a message and then waits for
   // confirmation of reception.  This is repeated many times over a range of
   // message sizes and the average latencies are kept.  Blocking send and
   // receive are used.
   //
   // Reference: https://mvapich.cse.ohio-state.edu/benchmarks/
   // ---------------------------------------------------------------------

   /// Build the ping-pong table on the given communicator.
   void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Teuchos::Comm<int>>& comm);

   /// True once the ping-pong size list is non-empty.
   bool has_pingpong_table() const { return !pingpong_sizes_.empty(); }

   /// Lookups in the ping-pong table (host and device timings).
   double pingpong_host_lookup(int SIZE_IN_BYTES);
   double pingpong_device_lookup(int SIZE_IN_BYTES);

   /// Print the ping-pong table to `out`.
   void print_pingpong_table(std::ostream& out, const std::string& prefix = "");

   // ---------------------------------------------------------------------
   // Halo-exchange ping-pong
   //
   // A halo-exchange flavored ping-pong, inspired by the halo mode of MPPTEST
   // from ANL.  Messages follow exactly the communication pattern specified
   // in the import object.  The size in bytes sent per message is varied,
   // which should capture max-rate effects to some degree.
   //
   // Reference: https://www.mcs.anl.gov/research/projects/mpi/mpptest/
   // ---------------------------------------------------------------------

   /// Build the halo-pong table using the communication pattern of `import`.
   void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP<const Xpetra::Import<LocalOrdinal, GlobalOrdinal, Node>>& import);

   /// True once the halo-pong size list is non-empty.
   bool has_halopong_table() const { return !halopong_sizes_.empty(); }

   /// Lookups in the halo-pong table (host and device timings); the argument
   /// is a size in bytes per message.
   double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE);
   double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE);

   /// Print the halo-pong table to `out`.
   void print_halopong_table(std::ostream& out, const std::string& prefix = "");

   // ---------------------------------------------------------------------
   // Launch latency
   //
   // Estimates launch latency from the cost of submitting an empty
   // Kokkos::parallel_for.  This is necessary to correct the memory bandwidth
   // costs for models on high-latency platforms, e.g., GPUs.
   // ---------------------------------------------------------------------

   /// Measure the launch latency.
   void launch_latency_make_table(int KERNEL_REPEATS);

   /// True when the stored launch latency is strictly positive.
   bool has_launch_latency_table() const { return launch_and_wait_latency_ > 0.0; }

   /// Look up the launch latency.
   double launch_latency_lookup();

   /// Print the launch latency to `out`.
   void print_launch_latency_table(std::ostream& out, const std::string& prefix = "");

  private:
   /// Shared printer behind the two public stream-vector print routines;
   /// `use_latency_correction` selects which variant is printed.
   void print_stream_vector_table_impl(std::ostream& out, bool use_latency_correction, const std::string& prefix);

   // Stream-vector table storage.
   std::vector<int> stream_sizes_;
   std::vector<double> stream_copy_times_;
   std::vector<double> stream_add_times_;
   std::vector<double> latency_corrected_stream_copy_times_;
   std::vector<double> latency_corrected_stream_add_times_;

   // Ping-pong table storage.
   std::vector<int> pingpong_sizes_;
   std::vector<double> pingpong_host_times_;
   std::vector<double> pingpong_device_times_;

   // Halo-pong table storage.
   std::vector<int> halopong_sizes_;
   std::vector<double> halopong_host_times_;
   std::vector<double> halopong_device_times_;

   // Launch latency; a value > 0 is what has_launch_latency_table() treats as
   // "table present".
   double launch_and_wait_latency_;

 };  // class PerfModels


 }  // namespace MueLu


 #endif  // ifndef MUELU_PERFMODELS_HPP

MueLu::PerfModels::latency_corrected_stream_copy_times_
std::vector< double > latency_corrected_stream_copy_times_
Definition: MueLu_PerfModels_decl.hpp:116

MueLu::PerfModels::has_halopong_table
bool has_halopong_table() const
Definition: MueLu_PerfModels_decl.hpp:88

MueLu_PerfModels_fwd.hpp

LocalOrdinal
MueLu::DefaultLocalOrdinal LocalOrdinal
Definition: MueLu_UseDefaultTypes.hpp:13

MueLu::PerfModels::halopong_device_times_
std::vector< double > halopong_device_times_
Definition: MueLu_PerfModels_decl.hpp:125

MueLu::PerfModels::halopong_sizes_
std::vector< int > halopong_sizes_
Definition: MueLu_PerfModels_decl.hpp:123

MueLu::DefaultNode
Tpetra::KokkosClassic::DefaultNode::DefaultNodeType DefaultNode
Definition: MueLu_Details_DefaultTypes.hpp:32

Xpetra::Import

MueLu::PerfModels::pingpong_device_lookup
double pingpong_device_lookup(int SIZE_IN_BYTES)
Definition: MueLu_PerfModels_def.hpp:442

MueLu::PerfModels::stream_vector_make_table
void stream_vector_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE=20)
Definition: MueLu_PerfModels_def.hpp:306

MueLu::PerfModels::launch_and_wait_latency_
double launch_and_wait_latency_
Definition: MueLu_PerfModels_decl.hpp:127

MueLu::PerfModels::print_latency_corrected_stream_vector_table
void print_latency_corrected_stream_vector_table(std::ostream &out, const std::string &prefix="")
Definition: MueLu_PerfModels_def.hpp:380

MueLu::PerfModels::latency_corrected_stream_add_times_
std::vector< double > latency_corrected_stream_add_times_
Definition: MueLu_PerfModels_decl.hpp:117

MueLu::PerfModels::latency_corrected_stream_vector_add_lookup
double latency_corrected_stream_vector_add_lookup(int SIZE_IN_BYTES)
Definition: MueLu_PerfModels_def.hpp:364

MueLu::PerfModels::latency_corrected_stream_vector_lookup
double latency_corrected_stream_vector_lookup(int SIZE_IN_BYTES)
Definition: MueLu_PerfModels_def.hpp:370

MueLu_ConfigDefs.hpp

Node
MueLu::DefaultNode Node
Definition: MueLu_UseDefaultTypes.hpp:15

MueLu::PerfModels::stream_add_times_
std::vector< double > stream_add_times_
Definition: MueLu_PerfModels_decl.hpp:115

MueLu::PerfModels::~PerfModels
~PerfModels()
Definition: MueLu_PerfModels_def.hpp:299

MueLu::PerfModels::stream_vector_lookup
double stream_vector_lookup(int SIZE_IN_BYTES)
Definition: MueLu_PerfModels_def.hpp:352

Scalar
MueLu::DefaultScalar Scalar
Definition: MueLu_UseDefaultTypes.hpp:12

GlobalOrdinal
MueLu::DefaultGlobalOrdinal GlobalOrdinal
Definition: MueLu_UseDefaultTypes.hpp:14

MueLu::PerfModels::stream_vector_copy_lookup
double stream_vector_copy_lookup(int SIZE_IN_BYTES)
Definition: MueLu_PerfModels_def.hpp:340

MueLu::PerfModels::pingpong_sizes_
std::vector< int > pingpong_sizes_
Definition: MueLu_PerfModels_decl.hpp:119

MueLu::PerfModels::stream_sizes_
std::vector< int > stream_sizes_
Definition: MueLu_PerfModels_decl.hpp:113

MueLu::DefaultLocalOrdinal
int DefaultLocalOrdinal
Definition: MueLu_Details_DefaultTypes.hpp:22

MueLu::PerfModels::halopong_host_lookup
double halopong_host_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
Definition: MueLu_PerfModels_def.hpp:490

MueLu::PerfModels::halopong_device_lookup
double halopong_device_lookup(int SIZE_IN_BYTES_PER_MESSAGE)
Definition: MueLu_PerfModels_def.hpp:496

MueLu::PerfModels::print_stream_vector_table_impl
void print_stream_vector_table_impl(std::ostream &out, bool use_latency_correction, const std::string &prefix)
Definition: MueLu_PerfModels_def.hpp:385

MueLu::PerfModels::PerfModels
PerfModels()
Definition: MueLu_PerfModels_def.hpp:295

MueLu::PerfModels::latency_corrected_stream_vector_copy_lookup
double latency_corrected_stream_vector_copy_lookup(int SIZE_IN_BYTES)
Definition: MueLu_PerfModels_def.hpp:358

Teuchos::Comm
Definition: MueLu_Memory.hpp:22

MueLu::PerfModels::print_launch_latency_table
void print_launch_latency_table(std::ostream &out, const std::string &prefix="")
Definition: MueLu_PerfModels_def.hpp:566

MueLu::PerfModels::print_halopong_table
void print_halopong_table(std::ostream &out, const std::string &prefix="")
Definition: MueLu_PerfModels_def.hpp:501

MueLu::PerfModels::halopong_host_times_
std::vector< double > halopong_host_times_
Definition: MueLu_PerfModels_decl.hpp:124

MueLu::PerfModels::pingpong_device_times_
std::vector< double > pingpong_device_times_
Definition: MueLu_PerfModels_decl.hpp:121

MueLu::PerfModels::has_launch_latency_table
bool has_launch_latency_table() const
Definition: MueLu_PerfModels_decl.hpp:102

MueLu::PerfModels::print_pingpong_table
void print_pingpong_table(std::ostream &out, const std::string &prefix="")
Definition: MueLu_PerfModels_def.hpp:447

MueLu::PerfModels::pingpong_host_lookup
double pingpong_host_lookup(int SIZE_IN_BYTES)
Definition: MueLu_PerfModels_def.hpp:436

MueLu::PerfModels::halopong_make_table
void halopong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > > &import)
Definition: MueLu_PerfModels_def.hpp:482

MueLu::PerfModels::launch_latency_make_table
void launch_latency_make_table(int KERNEL_REPEATS)
Definition: MueLu_PerfModels_def.hpp:537

Teuchos::RCP

MueLu::PerfModels::launch_latency_lookup
double launch_latency_lookup()
Definition: MueLu_PerfModels_def.hpp:561

MueLu::PerfModels::stream_copy_times_
std::vector< double > stream_copy_times_
Definition: MueLu_PerfModels_decl.hpp:114

MueLu::PerfModels::stream_vector_add_lookup
double stream_vector_add_lookup(int SIZE_IN_BYTES)
Definition: MueLu_PerfModels_def.hpp:346

MueLu::PerfModels::print_stream_vector_table
void print_stream_vector_table(std::ostream &out, const std::string &prefix="")
Definition: MueLu_PerfModels_def.hpp:375

MueLu::PerfModels::pingpong_host_times_
std::vector< double > pingpong_host_times_
Definition: MueLu_PerfModels_decl.hpp:120

MueLu::DefaultGlobalOrdinal
int DefaultGlobalOrdinal
Definition: MueLu_Details_DefaultTypes.hpp:29

Xpetra_Import_fwd.hpp

MueLu::PerfModels::pingpong_make_table
void pingpong_make_table(int KERNEL_REPEATS, int LOG_MAX_SIZE, const RCP< const Teuchos::Comm< int > > &comm)
Definition: MueLu_PerfModels_def.hpp:428

MueLu::PerfModels::has_stream_vector_table
bool has_stream_vector_table() const
Definition: MueLu_PerfModels_decl.hpp:46

MueLu::PerfModels::has_pingpong_table
bool has_pingpong_table() const
Definition: MueLu_PerfModels_decl.hpp:71