MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_RepartitionHeuristicFactory_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // MueLu: A package for multigrid based preconditioning
4 //
5 // Copyright 2012 NTESS and the MueLu contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_
11 #define PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_
12 
13 #include <algorithm>
14 #include <iostream>
15 #include <sstream>
16 
17 #ifdef HAVE_MPI
19 #include <Teuchos_CommHelpers.hpp>
20 
21 //#include <Xpetra_Map.hpp>
22 #include <Xpetra_Matrix.hpp>
23 
24 #include "MueLu_RAPFactory.hpp"
25 #include "MueLu_BlockedRAPFactory.hpp"
26 #include "MueLu_SubBlockAFactory.hpp"
27 #include "MueLu_Level.hpp"
28 #include "MueLu_MasterList.hpp"
29 #include "MueLu_Monitor.hpp"
30 
32 
33 namespace MueLu {
34 
35 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
37 
38 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
40 
41 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
43  RCP<ParameterList> validParamList = rcp(new ParameterList());
44 
45 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
46  SET_VALID_ENTRY("repartition: start level");
47  SET_VALID_ENTRY("repartition: use map");
48  SET_VALID_ENTRY("repartition: node repartition level");
49  SET_VALID_ENTRY("repartition: min rows per proc");
50  SET_VALID_ENTRY("repartition: target rows per proc");
51  SET_VALID_ENTRY("repartition: min rows per thread");
52  SET_VALID_ENTRY("repartition: target rows per thread");
53  SET_VALID_ENTRY("repartition: max imbalance");
54 #undef SET_VALID_ENTRY
55 
56  validParamList->set<RCP<const FactoryBase> >("A", Teuchos::null, "Factory of the matrix A");
57  validParamList->set<RCP<const FactoryBase> >("Map", Teuchos::null, "Factory of the map Map");
58  validParamList->set<RCP<const FactoryBase> >("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
59 
60  return validParamList;
61 }
62 
63 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
65  const Teuchos::ParameterList& pL = GetParameterList();
66  if (pL.isParameter("repartition: use map")) {
67  const bool useMap = pL.get<bool>("repartition: use map");
68  if (useMap)
69  Input(currentLevel, "Map");
70  else
71  Input(currentLevel, "A");
72  } else
73  Input(currentLevel, "A");
74  if (pL.isParameter("repartition: node repartition level")) {
75  const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
76  if (currentLevel.GetLevelID() == nodeRepartLevel) {
77  Input(currentLevel, "Node Comm");
78  }
79  }
80 }
81 
82 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
84  FactoryMonitor m(*this, "Build", currentLevel);
85 
86  const Teuchos::ParameterList& pL = GetParameterList();
87  // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
88  // TODO (JG): I don't really know if we want to do this.
89  const int startLevel = pL.get<int>("repartition: start level");
90  const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
91  LO minRowsPerProcess = pL.get<LO>("repartition: min rows per proc");
92  LO targetRowsPerProcess = pL.get<LO>("repartition: target rows per proc");
93  LO minRowsPerThread = pL.get<LO>("repartition: min rows per thread");
94  LO targetRowsPerThread = pL.get<LO>("repartition: target rows per thread");
95  const double nonzeroImbalance = pL.get<double>("repartition: max imbalance");
96  const bool useMap = pL.get<bool>("repartition: use map");
97 
98  int thread_per_mpi_rank = 1;
99 #if defined(KOKKOS_ENABLE_OPENMP)
100  using execution_space = typename Node::device_type::execution_space;
101  if (std::is_same<execution_space, Kokkos::OpenMP>::value)
102  thread_per_mpi_rank = execution_space().concurrency();
103 #endif
104 
105  if (minRowsPerThread > 0)
106  // We ignore the value given by minRowsPerProcess and repartition based on threads instead
107  minRowsPerProcess = minRowsPerThread * thread_per_mpi_rank;
108 
109  if (targetRowsPerThread == 0)
110  targetRowsPerThread = minRowsPerThread;
111 
112  if (targetRowsPerThread > 0)
113  // We ignore the value given by targetRowsPerProcess and repartition based on threads instead
114  targetRowsPerProcess = targetRowsPerThread * thread_per_mpi_rank;
115 
116  if (targetRowsPerProcess == 0)
117  targetRowsPerProcess = minRowsPerProcess;
118 
119  // Stick this on the level so Zoltan2Interface can use this later
120  Set<LO>(currentLevel, "repartition: heuristic target rows per process", targetRowsPerProcess);
121 
122  // Check for validity of the node repartition option
123  TEUCHOS_TEST_FOR_EXCEPTION(nodeRepartLevel >= startLevel, Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): If 'repartition: node repartition level' is set, it must be less than or equal to 'repartition: start level'");
124 
125  RCP<Matrix> A;
127  RCP<const Map> map;
128  if (!useMap) {
129  Afact = GetFactory("A");
130  if (!Afact.is_null() && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) == Teuchos::null &&
131  Teuchos::rcp_dynamic_cast<const BlockedRAPFactory>(Afact) == Teuchos::null &&
132  Teuchos::rcp_dynamic_cast<const SubBlockAFactory>(Afact) == Teuchos::null) {
133  GetOStream(Warnings) << "MueLu::RepartitionHeuristicFactory::Build: The generation factory for A must "
134  "be a RAPFactory or a SubBlockAFactory providing the non-rebalanced matrix information! "
135  "It specifically must not be of type Rebalance(Blocked)AcFactory or similar. "
136  "Please check the input. Make also sure that \"number of partitions\" is provided to "
137  "the Interface class and the RepartitionFactory instance. Instead, we have a "
138  << Afact->description() << std::endl;
139  }
140  // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
141  A = Get<RCP<Matrix> >(currentLevel, "A");
142  map = A->getRowMap();
143  } else
144  map = Get<RCP<const Map> >(currentLevel, "Map");
145 
146  // ======================================================================================================
147  // Determine whether partitioning is needed
148  // ======================================================================================================
149  // NOTE: most tests include some global communication, which is why we currently only do tests until we make
150  // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to
151  // rebalance an operator. So, it would probably be beneficial to do and report *all* tests.
152 
153  // Test0: Should we do node repartitioning?
154  if (currentLevel.GetLevelID() == nodeRepartLevel && map->getComm()->getSize() > 1) {
155  RCP<const Teuchos::Comm<int> > NodeComm = Get<RCP<const Teuchos::Comm<int> > >(currentLevel, "Node Comm");
156  TEUCHOS_TEST_FOR_EXCEPTION(NodeComm.is_null(), Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): NodeComm is null.");
157 
158  // If we only have one node, then we don't want to pop down to one rank
159  if (NodeComm()->getSize() != map->getComm()->getSize()) {
160  GetOStream(Statistics1) << "Repartitioning? YES: \n Within node only" << std::endl;
161  int nodeRank = NodeComm->getRank();
162 
163  // Do a reduction to get the total number of nodes
164  int isZero = (nodeRank == 0);
165  int numNodes = 0;
166  Teuchos::reduceAll(*map->getComm(), Teuchos::REDUCE_SUM, isZero, Teuchos::outArg(numNodes));
167  Set(currentLevel, "number of partitions", numNodes);
168  return;
169  }
170  }
171 
172  // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning
173  if (currentLevel.GetLevelID() < startLevel) {
174  GetOStream(Statistics1) << "Repartitioning? NO:"
175  << "\n current level = " << Teuchos::toString(currentLevel.GetLevelID()) << ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;
176 
177  // a negative number of processors means: no repartitioning
178  Set(currentLevel, "number of partitions", -1);
179 
180  return;
181  }
182 
183  RCP<const Teuchos::Comm<int> > origComm = map->getComm();
184  RCP<const Teuchos::Comm<int> > comm = origComm;
185 
186  // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
187  // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created)
188  // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size
189 
190  // TODO: The block transfer factories do not check correctly whether or not repartitioning actually took place.
191  {
192  if (comm->getSize() == 1 && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) != Teuchos::null) {
193  GetOStream(Statistics1) << "Repartitioning? NO:"
194  << "\n comm size = 1" << std::endl;
195 
196  Set(currentLevel, "number of partitions", -1);
197  return;
198  }
199 
200  int numActiveProcesses = 0;
201  MueLu_sumAll(comm, Teuchos::as<int>((map->getLocalNumElements() > 0) ? 1 : 0), numActiveProcesses);
202 
203  if (numActiveProcesses == 1) {
204  GetOStream(Statistics1) << "Repartitioning? NO:"
205  << "\n # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;
206 
207  Set(currentLevel, "number of partitions", 1);
208  return;
209  }
210  }
211 
212  bool test3 = false, test4 = false;
213  std::string msg3, msg4;
214 
215  // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement
216  // NOTE: Test2 ensures that repartitionning is not done when there is only one processor (it may or may not satisfy Test3)
217  if (minRowsPerProcess > 0) {
218  LO numMyRows = Teuchos::as<LO>(map->getLocalNumElements()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
219  LO haveFewRows = (numMyRows < minRowsPerProcess ? 1 : 0), numWithFewRows = 0;
220  MueLu_sumAll(comm, haveFewRows, numWithFewRows);
221  MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);
222 
223  // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
224  // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
225  // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
226  if (numWithFewRows > 0)
227  test3 = true;
228 
229  msg3 = "\n min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcess);
230  }
231 
232  // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold
233  if (!test3) {
234  if (useMap)
235  msg4 = "";
236  else {
237  GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getLocalNumEntries());
238  MueLu_maxAll(comm, numMyNnz, maxNnz);
239  MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors
240  double imbalance = Teuchos::as<double>(maxNnz) / minNnz;
241 
242  if (imbalance > nonzeroImbalance)
243  test4 = true;
244 
245  msg4 = "\n nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
246  }
247  }
248 
249  if (!test3 && !test4) {
250  GetOStream(Statistics1) << "Repartitioning? NO:" << msg3 + msg4 << std::endl;
251 
252  // A negative number of partitions means: no repartitioning
253  Set(currentLevel, "number of partitions", -1);
254  return;
255  }
256 
257  GetOStream(Statistics1) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;
258 
259  // ======================================================================================================
260  // Calculate number of partitions
261  // ======================================================================================================
262  // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
263  // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold.
264 
265  // The number of partitions is calculated by the RepartitionFactory and stored in "number of partitions" variable on
266  // the current level. If this variable is already set (e.g., by another instance of RepartitionFactory) then this number
267  // is used. The "number of partitions" variable serves as basic communication between the RepartitionFactory (which
268  // requests a certain number of partitions) and the *Interface classes which call the underlying partitioning algorithms
269  // and produce the "Partition" array with the requested number of partitions.
270  const auto globalNumRows = Teuchos::as<GO>(map->getGlobalNumElements());
271  int numPartitions = 1;
272  if (globalNumRows >= targetRowsPerProcess) {
273  // Make sure that each CPU thread has approximately targetRowsPerProcess
274  numPartitions = std::max(Teuchos::as<int>(globalNumRows / targetRowsPerProcess), 1);
275  }
276  numPartitions = std::min(numPartitions, comm->getSize());
277 
278  Set(currentLevel, "number of partitions", numPartitions);
279 
280  GetOStream(Statistics1) << "Number of partitions to use = " << numPartitions << std::endl;
281 } // Build
282 } // namespace MueLu
283 
284 #endif // ifdef HAVE_MPI
285 #endif /* PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_ */
#define MueLu_sumAll(rcpComm, in, out)
virtual ~RepartitionHeuristicFactory()
Destructor.
#define MueLu_maxAll(rcpComm, in, out)
void Build(Level &currentLevel) const
Build an object with this factory.
GlobalOrdinal GO
T & get(const std::string &name, T def_value)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Print more statistics.
LocalOrdinal LO
#define SET_VALID_ENTRY(name)
ParameterList & set(std::string const &name, T &&value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
#define MueLu_minAll(rcpComm, in, out)
bool isParameter(const std::string &name) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:63
Factory for building a thresholded operator.
void DeclareInput(Level &currentLevel) const
Determines the data that RepartitionHeuristicFactory needs, and the factories that generate that data...
int GetLevelID() const
Return level number.
Definition: MueLu_Level.cpp:51
Exception throws to report errors in the internal logical of the program.
Print all warning messages.
Factory for building coarse matrices.
RepartitionHeuristicFactory()
Constructor.
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
virtual std::string description() const
Return a simple one-line description of this object.
std::string toString(const T &t)
bool is_null() const