MueLu_RepartitionHeuristicFactory_def.hpp
// @HEADER
// *****************************************************************************
// MueLu: A package for multigrid based preconditioning
//
// Copyright 2012 NTESS and the MueLu contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_
#define PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_

#include <algorithm>
#include <iostream>
#include <sstream>

#ifdef HAVE_MPI
#include <Teuchos_CommHelpers.hpp>

//#include <Xpetra_Map.hpp>
#include <Xpetra_Matrix.hpp>

#include "MueLu_RAPFactory.hpp"
#include "MueLu_BlockedRAPFactory.hpp"
#include "MueLu_SubBlockAFactory.hpp"
#include "MueLu_Level.hpp"
#include "MueLu_MasterList.hpp"
#include "MueLu_Monitor.hpp"

#include "MueLu_RepartitionHeuristicFactory_decl.hpp"

namespace MueLu {

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
RepartitionHeuristicFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::RepartitionHeuristicFactory() = default;

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
RepartitionHeuristicFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::~RepartitionHeuristicFactory() = default;

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
RCP<const ParameterList> RepartitionHeuristicFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::GetValidParameterList() const {
  RCP<ParameterList> validParamList = rcp(new ParameterList());

#define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
  SET_VALID_ENTRY("repartition: start level");
  SET_VALID_ENTRY("repartition: use map");
  SET_VALID_ENTRY("repartition: node repartition level");
  SET_VALID_ENTRY("repartition: min rows per proc");
  SET_VALID_ENTRY("repartition: target rows per proc");
  SET_VALID_ENTRY("repartition: min rows per thread");
  SET_VALID_ENTRY("repartition: target rows per thread");
  SET_VALID_ENTRY("repartition: max imbalance");
  SET_VALID_ENTRY("repartition: put on single proc");
#undef SET_VALID_ENTRY

  validParamList->set<RCP<const FactoryBase> >("A", Teuchos::null, "Factory of the matrix A");
  validParamList->set<RCP<const FactoryBase> >("Map", Teuchos::null, "Factory of the map Map");
  validParamList->set<RCP<const FactoryBase> >("Node Comm", Teuchos::null, "Generating factory of the node level communicator");

  return validParamList;
}

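// A minimal usage sketch (illustration only, not part of this file): these are the kinds of entries
// the factory validates above. The parameter names come from the MasterList entries registered above;
// the factory object name and the values below are hypothetical.
//
//   Teuchos::ParameterList heuristicParams;
//   heuristicParams.set("repartition: start level", 2);
//   heuristicParams.set("repartition: min rows per proc", 800);
//   heuristicParams.set("repartition: max imbalance", 1.2);
//   repartitionHeuristicFact->SetParameterList(heuristicParams);
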
template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void RepartitionHeuristicFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::DeclareInput(Level& currentLevel) const {
  const Teuchos::ParameterList& pL = GetParameterList();
  if (pL.isParameter("repartition: use map")) {
    const bool useMap = pL.get<bool>("repartition: use map");
    if (useMap)
      Input(currentLevel, "Map");
    else
      Input(currentLevel, "A");
  } else
    Input(currentLevel, "A");
  if (pL.isParameter("repartition: node repartition level")) {
    const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
    if (currentLevel.GetLevelID() == nodeRepartLevel) {
      Input(currentLevel, "Node Comm");
    }
  }
}

template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
void RepartitionHeuristicFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::Build(Level& currentLevel) const {
  FactoryMonitor m(*this, "Build", currentLevel);

  const Teuchos::ParameterList& pL = GetParameterList();
  // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
  // TODO (JG): I don't really know if we want to do this.
  const int startLevel = pL.get<int>("repartition: start level");
  const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
  LO minRowsPerProcess = pL.get<LO>("repartition: min rows per proc");
  LO targetRowsPerProcess = pL.get<LO>("repartition: target rows per proc");
  LO minRowsPerThread = pL.get<LO>("repartition: min rows per thread");
  LO targetRowsPerThread = pL.get<LO>("repartition: target rows per thread");
  LO putOnSingleProc = pL.get<LO>("repartition: put on single proc");
  const double nonzeroImbalance = pL.get<double>("repartition: max imbalance");
  const bool useMap = pL.get<bool>("repartition: use map");

  int thread_per_mpi_rank = 1;
#if defined(KOKKOS_ENABLE_OPENMP)
  using execution_space = typename Node::device_type::execution_space;
  if (std::is_same<execution_space, Kokkos::OpenMP>::value)
    thread_per_mpi_rank = execution_space().concurrency();
#endif

  if (minRowsPerThread > 0)
    // We ignore the value given by minRowsPerProcess and repartition based on threads instead
    minRowsPerProcess = minRowsPerThread * thread_per_mpi_rank;

  if (targetRowsPerThread == 0)
    targetRowsPerThread = minRowsPerThread;

  if (targetRowsPerThread > 0)
    // We ignore the value given by targetRowsPerProcess and repartition based on threads instead
    targetRowsPerProcess = targetRowsPerThread * thread_per_mpi_rank;

  if (targetRowsPerProcess == 0)
    targetRowsPerProcess = minRowsPerProcess;
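
  // Worked example (illustrative numbers, not taken from any MueLu default): with 8 OpenMP threads
  // per MPI rank, "repartition: min rows per thread" = 1000 and "repartition: target rows per thread" = 0,
  // the logic above gives
  //   minRowsPerProcess    = 1000 * 8 = 8000
  //   targetRowsPerThread  = 1000           (falls back to the per-thread minimum)
  //   targetRowsPerProcess = 1000 * 8 = 8000
  // i.e. the per-thread settings override whatever was set per process.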

  // Stick this on the level so Zoltan2Interface can use this later
  Set<LO>(currentLevel, "repartition: heuristic target rows per process", targetRowsPerProcess);

  // Check for validity of the node repartition option
  TEUCHOS_TEST_FOR_EXCEPTION(nodeRepartLevel >= startLevel, Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): If 'repartition: node repartition level' is set, it must be smaller than 'repartition: start level'");

  RCP<Matrix> A;
  RCP<const FactoryBase> Afact;
  RCP<const Map> map;
  if (!useMap) {
    Afact = GetFactory("A");
    if (!Afact.is_null() && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) == Teuchos::null &&
        Teuchos::rcp_dynamic_cast<const BlockedRAPFactory>(Afact) == Teuchos::null &&
        Teuchos::rcp_dynamic_cast<const SubBlockAFactory>(Afact) == Teuchos::null) {
      GetOStream(Warnings) << "MueLu::RepartitionHeuristicFactory::Build: The generation factory for A must "
                              "be a RAPFactory or a SubBlockAFactory providing the non-rebalanced matrix information! "
                              "It specifically must not be of type Rebalance(Blocked)AcFactory or similar. "
                              "Please check the input. Make also sure that \"number of partitions\" is provided to "
                              "the Interface class and the RepartitionFactory instance. Instead, we have a "
                           << Afact->description() << std::endl;
    }
    // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
    A = Get<RCP<Matrix> >(currentLevel, "A");
    map = A->getRowMap();
  } else
    map = Get<RCP<const Map> >(currentLevel, "Map");
147 
148  // ======================================================================================================
149  // Determine whether partitioning is needed
150  // ======================================================================================================
151  // NOTE: most tests include some global communication, which is why we currently only do tests until we make
152  // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to
153  // rebalance an operator. So, it would probably be beneficial to do and report *all* tests.
154 
155  // NOTE: We should probably consider merging all of these sumAll/minAll things at some point.
156 
157  // Test0: Should we do node repartitioning?
158  if (currentLevel.GetLevelID() == nodeRepartLevel && map->getComm()->getSize() > 1) {
159  RCP<const Teuchos::Comm<int> > NodeComm = Get<RCP<const Teuchos::Comm<int> > >(currentLevel, "Node Comm");
160  TEUCHOS_TEST_FOR_EXCEPTION(NodeComm.is_null(), Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): NodeComm is null.");
161 
162  // If we only have one node, then we don't want to pop down to one rank
163  if (NodeComm()->getSize() != map->getComm()->getSize()) {
164  GetOStream(Statistics1) << "Repartitioning? YES: \n Within node only" << std::endl;
165  int nodeRank = NodeComm->getRank();
166 
167  // Do a reduction to get the total number of nodes
168  int isZero = (nodeRank == 0);
169  int numNodes = 0;
170  Teuchos::reduceAll(*map->getComm(), Teuchos::REDUCE_SUM, isZero, Teuchos::outArg(numNodes));
171  Set(currentLevel, "number of partitions", numNodes);
172  return;
173  }
174  }
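
  // Example of the node-level reduction above (hypothetical run, not from this file): on 4 nodes
  // with 8 ranks each, exactly one rank per node has nodeRank == 0, so the REDUCE_SUM of isZero over
  // all 32 ranks yields numNodes = 4, and the level requests 4 partitions (one per node).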

  // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning
  if (currentLevel.GetLevelID() < startLevel) {
    GetOStream(Statistics1) << "Repartitioning? NO:"
                            << "\n current level = " << Teuchos::toString(currentLevel.GetLevelID()) << ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;

    // a negative number of processors means: no repartitioning
    Set(currentLevel, "number of partitions", -1);

    return;
  }

  RCP<const Teuchos::Comm<int> > origComm = map->getComm();
  RCP<const Teuchos::Comm<int> > comm = origComm;

  // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
  // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created)
  // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size

  // TODO: The block transfer factories do not check correctly whether or not repartitioning actually took place.
  {
    if (comm->getSize() == 1 && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) != Teuchos::null) {
      GetOStream(Statistics1) << "Repartitioning? NO:"
                              << "\n comm size = 1" << std::endl;

      Set(currentLevel, "number of partitions", -1);
      return;
    }

    int numActiveProcesses = 0;
    MueLu_sumAll(comm, Teuchos::as<int>((map->getLocalNumElements() > 0) ? 1 : 0), numActiveProcesses);

    if (numActiveProcesses == 1) {
      GetOStream(Statistics1) << "Repartitioning? NO:"
                              << "\n # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;

      Set(currentLevel, "number of partitions", 1);
      return;
    }
  }
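
  // Example of Test 2 (hypothetical run, not from this file): after aggressive coarsening only one
  // of the 64 ranks still owns rows, so numActiveProcesses == 1 and the heuristic requests a single
  // partition instead of a full repartitioning.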

  // Test 2.5: Repartition down to one rank if we're below "repartition: put on single proc"
  // This is (mostly) an ML-compatibility thing
  if (putOnSingleProc && map->getGlobalNumElements() < (Xpetra::global_size_t)putOnSingleProc) {
    GetOStream(Statistics1) << "Repartitioning? YES:"
                            << "\n # rows is below the single-proc threshold = " << putOnSingleProc << std::endl;

    Set(currentLevel, "number of partitions", 1);
    return;
  }

  bool test3 = false, test4 = false;
  std::string msg3, msg4;

  // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement
  // NOTE: Test2 ensures that repartitioning is not done when there is only one processor (it may or may not satisfy Test3)
  if (minRowsPerProcess > 0) {
    LO numMyRows = Teuchos::as<LO>(map->getLocalNumElements()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
    LO haveFewRows = (numMyRows < minRowsPerProcess ? 1 : 0), numWithFewRows = 0;
    MueLu_sumAll(comm, haveFewRows, numWithFewRows);
    MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);

    // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
    // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
    // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
    if (numWithFewRows > 0)
      test3 = true;

    msg3 = "\n min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcess);
  }
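
  // Worked example for Test3 (hypothetical numbers, not from this file): with
  // "repartition: min rows per proc" = 800 and local row counts {1200, 500, 900} on three ranks,
  // the rank owning 500 rows sets haveFewRows = 1, so numWithFewRows = 1 > 0 and test3 becomes true;
  // minNumRows is reported as 500.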

  // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold
  if (!test3) {
    if (useMap)
      msg4 = "";
    else {
      GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getLocalNumEntries());
      MueLu_maxAll(comm, numMyNnz, maxNnz);
      MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz);  // min nnz over all active processors
      double imbalance = Teuchos::as<double>(maxNnz) / minNnz;

      if (imbalance > nonzeroImbalance)
        test4 = true;

      msg4 = "\n nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
    }
  }
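
  // Worked example for Test4 (hypothetical numbers, not from this file): maxNnz = 2,000,000 and
  // minNnz = 1,250,000 give imbalance = 1.6; with "repartition: max imbalance" = 1.2 this exceeds
  // the threshold, so test4 becomes true and repartitioning is requested.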

  if (!test3 && !test4) {
    GetOStream(Statistics1) << "Repartitioning? NO:" << msg3 + msg4 << std::endl;

    // A negative number of partitions means: no repartitioning
    Set(currentLevel, "number of partitions", -1);
    return;
  }

  GetOStream(Statistics1) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;

  // ======================================================================================================
  // Calculate number of partitions
  // ======================================================================================================
  // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
  // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold.

  // The number of partitions is calculated by the RepartitionFactory and stored in "number of partitions" variable on
  // the current level. If this variable is already set (e.g., by another instance of RepartitionFactory) then this number
  // is used. The "number of partitions" variable serves as basic communication between the RepartitionFactory (which
  // requests a certain number of partitions) and the *Interface classes which call the underlying partitioning algorithms
  // and produce the "Partition" array with the requested number of partitions.
  const auto globalNumRows = Teuchos::as<GO>(map->getGlobalNumElements());
  int numPartitions = 1;
  if (globalNumRows >= targetRowsPerProcess) {
    // Make sure that each CPU thread has approximately targetRowsPerProcess
    numPartitions = std::max(Teuchos::as<int>(globalNumRows / targetRowsPerProcess), 1);
  }
  numPartitions = std::min(numPartitions, comm->getSize());
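
  // Worked example (hypothetical numbers, not from this file): globalNumRows = 1,000,000 and
  // targetRowsPerProcess = 8000 give numPartitions = 125; if the communicator has only 64 ranks,
  // the std::min above caps the request at 64 partitions.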

  Set(currentLevel, "number of partitions", numPartitions);

  GetOStream(Statistics1) << "Number of partitions to use = " << numPartitions << std::endl;
}  // Build
}  // namespace MueLu

#endif  // ifdef HAVE_MPI
#endif /* PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_ */