MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_RepartitionHeuristicFactory_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 //
3 // ***********************************************************************
4 //
5 // MueLu: A package for multigrid based preconditioning
6 // Copyright 2012 Sandia Corporation
7 //
8 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
9 // the U.S. Government retains certain rights in this software.
10 //
11 // Redistribution and use in source and binary forms, with or without
12 // modification, are permitted provided that the following conditions are
13 // met:
14 //
15 // 1. Redistributions of source code must retain the above copyright
16 // notice, this list of conditions and the following disclaimer.
17 //
18 // 2. Redistributions in binary form must reproduce the above copyright
19 // notice, this list of conditions and the following disclaimer in the
20 // documentation and/or other materials provided with the distribution.
21 //
22 // 3. Neither the name of the Corporation nor the names of the
23 // contributors may be used to endorse or promote products derived from
24 // this software without specific prior written permission.
25 //
26 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
27 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
29 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
30 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
31 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
32 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
33 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
34 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
35 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
36 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
37 //
38 // Questions? Contact
39 // Jonathan Hu (jhu@sandia.gov)
40 // Andrey Prokopenko (aprokop@sandia.gov)
41 // Ray Tuminaro (rstumin@sandia.gov)
42 // Tobias Wiesner (tawiesn@sandia.gov)
43 //
44 // ***********************************************************************
45 //
46 // @HEADER
47 #ifndef PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_
48 #define PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_
49 
50 #include <algorithm>
51 #include <iostream>
52 #include <sstream>
53 
54 #ifdef HAVE_MPI
56 #include <Teuchos_CommHelpers.hpp>
57 
58 //#include <Xpetra_Map.hpp>
59 #include <Xpetra_Matrix.hpp>
60 
61 #include "MueLu_RAPFactory.hpp"
62 #include "MueLu_BlockedRAPFactory.hpp"
63 #include "MueLu_SubBlockAFactory.hpp"
64 #include "MueLu_Level.hpp"
65 #include "MueLu_MasterList.hpp"
66 #include "MueLu_Monitor.hpp"
67 
69 
70 namespace MueLu {
71 
72 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
74  RCP<ParameterList> validParamList = rcp(new ParameterList());
75 
76 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
77  SET_VALID_ENTRY("repartition: start level");
78  SET_VALID_ENTRY("repartition: use map");
79  SET_VALID_ENTRY("repartition: node repartition level");
80  SET_VALID_ENTRY("repartition: min rows per proc");
81  SET_VALID_ENTRY("repartition: target rows per proc");
82  SET_VALID_ENTRY("repartition: min rows per thread");
83  SET_VALID_ENTRY("repartition: target rows per thread");
84  SET_VALID_ENTRY("repartition: max imbalance");
85 #undef SET_VALID_ENTRY
86 
87  validParamList->set<RCP<const FactoryBase> >("A", Teuchos::null, "Factory of the matrix A");
88  validParamList->set<RCP<const FactoryBase> >("Map", Teuchos::null, "Factory of the map Map");
89  validParamList->set<RCP<const FactoryBase> >("Node Comm", Teuchos::null, "Generating factory of the node level communicator");
90 
91  return validParamList;
92 }
93 
94 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
96  const Teuchos::ParameterList& pL = GetParameterList();
97  if (pL.isParameter("repartition: use map")) {
98  const bool useMap = pL.get<bool>("repartition: use map");
99  if (useMap)
100  Input(currentLevel, "Map");
101  else
102  Input(currentLevel, "A");
103  } else
104  Input(currentLevel, "A");
105  if (pL.isParameter("repartition: node repartition level")) {
106  const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
107  if (currentLevel.GetLevelID() == nodeRepartLevel) {
108  Input(currentLevel, "Node Comm");
109  }
110  }
111 }
112 
113 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
115  FactoryMonitor m(*this, "Build", currentLevel);
116 
117  const Teuchos::ParameterList& pL = GetParameterList();
118  // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
119  // TODO (JG): I don't really know if we want to do this.
120  const int startLevel = pL.get<int>("repartition: start level");
121  const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
122  LO minRowsPerProcess = pL.get<LO>("repartition: min rows per proc");
123  LO targetRowsPerProcess = pL.get<LO>("repartition: target rows per proc");
124  LO minRowsPerThread = pL.get<LO>("repartition: min rows per thread");
125  LO targetRowsPerThread = pL.get<LO>("repartition: target rows per thread");
126  const double nonzeroImbalance = pL.get<double>("repartition: max imbalance");
127  const bool useMap = pL.get<bool>("repartition: use map");
128 
129  int thread_per_mpi_rank = 1;
130 #if defined(KOKKOS_ENABLE_OPENMP)
131  using execution_space = typename Node::device_type::execution_space;
132  if (std::is_same<execution_space, Kokkos::OpenMP>::value)
133  thread_per_mpi_rank = execution_space().concurrency();
134 #endif
135 
136  if (minRowsPerThread > 0)
137  // We ignore the value given by minRowsPerProcess and repartition based on threads instead
138  minRowsPerProcess = minRowsPerThread * thread_per_mpi_rank;
139 
140  if (targetRowsPerThread == 0)
141  targetRowsPerThread = minRowsPerThread;
142 
143  if (targetRowsPerThread > 0)
144  // We ignore the value given by targetRowsPerProcess and repartition based on threads instead
145  targetRowsPerProcess = targetRowsPerThread * thread_per_mpi_rank;
146 
147  if (targetRowsPerProcess == 0)
148  targetRowsPerProcess = minRowsPerProcess;
149 
150  // Stick this on the level so Zoltan2Interface can use this later
151  Set<LO>(currentLevel, "repartition: heuristic target rows per process", targetRowsPerProcess);
152 
153  // Check for validity of the node repartition option
154  TEUCHOS_TEST_FOR_EXCEPTION(nodeRepartLevel >= startLevel, Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): If 'repartition: node repartition level' is set, it must be less than or equal to 'repartition: start level'");
155 
156  RCP<Matrix> A;
158  RCP<const Map> map;
159  if (!useMap) {
160  Afact = GetFactory("A");
161  if (!Afact.is_null() && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) == Teuchos::null &&
162  Teuchos::rcp_dynamic_cast<const BlockedRAPFactory>(Afact) == Teuchos::null &&
163  Teuchos::rcp_dynamic_cast<const SubBlockAFactory>(Afact) == Teuchos::null) {
164  GetOStream(Warnings) << "MueLu::RepartitionHeuristicFactory::Build: The generation factory for A must "
165  "be a RAPFactory or a SubBlockAFactory providing the non-rebalanced matrix information! "
166  "It specifically must not be of type Rebalance(Blocked)AcFactory or similar. "
167  "Please check the input. Make also sure that \"number of partitions\" is provided to "
168  "the Interface class and the RepartitionFactory instance. Instead, we have a "
169  << Afact->description() << std::endl;
170  }
171  // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
172  A = Get<RCP<Matrix> >(currentLevel, "A");
173  map = A->getRowMap();
174  } else
175  map = Get<RCP<const Map> >(currentLevel, "Map");
176 
177  // ======================================================================================================
178  // Determine whether partitioning is needed
179  // ======================================================================================================
180  // NOTE: most tests include some global communication, which is why we currently only do tests until we make
181  // a decision on whether to repartition. However, there is value in knowing how "close" we are to having to
182  // rebalance an operator. So, it would probably be beneficial to do and report *all* tests.
183 
184  // Test0: Should we do node repartitioning?
185  if (currentLevel.GetLevelID() == nodeRepartLevel && map->getComm()->getSize() > 1) {
186  RCP<const Teuchos::Comm<int> > NodeComm = Get<RCP<const Teuchos::Comm<int> > >(currentLevel, "Node Comm");
187  TEUCHOS_TEST_FOR_EXCEPTION(NodeComm.is_null(), Exceptions::RuntimeError, "MueLu::RepartitionHeuristicFactory::Build(): NodeComm is null.");
188 
189  // If we only have one node, then we don't want to pop down to one rank
190  if (NodeComm()->getSize() != map->getComm()->getSize()) {
191  GetOStream(Statistics1) << "Repartitioning? YES: \n Within node only" << std::endl;
192  int nodeRank = NodeComm->getRank();
193 
194  // Do a reduction to get the total number of nodes
195  int isZero = (nodeRank == 0);
196  int numNodes = 0;
197  Teuchos::reduceAll(*map->getComm(), Teuchos::REDUCE_SUM, isZero, Teuchos::outArg(numNodes));
198  Set(currentLevel, "number of partitions", numNodes);
199  return;
200  }
201  }
202 
203  // Test1: skip repartitioning if current level is less than the specified minimum level for repartitioning
204  if (currentLevel.GetLevelID() < startLevel) {
205  GetOStream(Statistics1) << "Repartitioning? NO:"
206  << "\n current level = " << Teuchos::toString(currentLevel.GetLevelID()) << ", first level where repartitioning can happen is " + Teuchos::toString(startLevel) << std::endl;
207 
208  // a negative number of processors means: no repartitioning
209  Set(currentLevel, "number of partitions", -1);
210 
211  return;
212  }
213 
214  RCP<const Teuchos::Comm<int> > origComm = map->getComm();
215  RCP<const Teuchos::Comm<int> > comm = origComm;
216 
217  // Test 2: check whether A is actually distributed, i.e. more than one processor owns part of A
218  // TODO: this global communication can be avoided if we store the information with the matrix (it is known when matrix is created)
219  // TODO: further improvements could be achieved when we use subcommunicator for the active set. Then we only need to check its size
220 
221  // TODO: The block transfer factories do not check correctly whether or not repartitioning actually took place.
222  {
223  if (comm->getSize() == 1 && Teuchos::rcp_dynamic_cast<const RAPFactory>(Afact) != Teuchos::null) {
224  GetOStream(Statistics1) << "Repartitioning? NO:"
225  << "\n comm size = 1" << std::endl;
226 
227  Set(currentLevel, "number of partitions", -1);
228  return;
229  }
230 
231  int numActiveProcesses = 0;
232  MueLu_sumAll(comm, Teuchos::as<int>((map->getLocalNumElements() > 0) ? 1 : 0), numActiveProcesses);
233 
234  if (numActiveProcesses == 1) {
235  GetOStream(Statistics1) << "Repartitioning? NO:"
236  << "\n # processes with rows = " << Teuchos::toString(numActiveProcesses) << std::endl;
237 
238  Set(currentLevel, "number of partitions", 1);
239  return;
240  }
241  }
242 
243  bool test3 = false, test4 = false;
244  std::string msg3, msg4;
245 
246  // Test3: check whether number of rows on any processor satisfies the minimum number of rows requirement
247  // NOTE: Test2 ensures that repartitionning is not done when there is only one processor (it may or may not satisfy Test3)
248  if (minRowsPerProcess > 0) {
249  LO numMyRows = Teuchos::as<LO>(map->getLocalNumElements()), minNumRows, LOMAX = Teuchos::OrdinalTraits<LO>::max();
250  LO haveFewRows = (numMyRows < minRowsPerProcess ? 1 : 0), numWithFewRows = 0;
251  MueLu_sumAll(comm, haveFewRows, numWithFewRows);
252  MueLu_minAll(comm, (numMyRows > 0 ? numMyRows : LOMAX), minNumRows);
253 
254  // TODO: we could change it to repartition only if the number of processors with numRows < minNumRows is larger than some
255  // percentage of the total number. This way, we won't repartition if 2 out of 1000 processors don't have enough elements.
256  // I'm thinking maybe 20% threshold. To implement, simply add " && numWithFewRows < .2*numProcs" to the if statement.
257  if (numWithFewRows > 0)
258  test3 = true;
259 
260  msg3 = "\n min # rows per proc = " + Teuchos::toString(minNumRows) + ", min allowable = " + Teuchos::toString(minRowsPerProcess);
261  }
262 
263  // Test4: check whether the balance in the number of nonzeros per processor is greater than threshold
264  if (!test3) {
265  if (useMap)
266  msg4 = "";
267  else {
268  GO minNnz, maxNnz, numMyNnz = Teuchos::as<GO>(A->getLocalNumEntries());
269  MueLu_maxAll(comm, numMyNnz, maxNnz);
270  MueLu_minAll(comm, (numMyNnz > 0 ? numMyNnz : maxNnz), minNnz); // min nnz over all active processors
271  double imbalance = Teuchos::as<double>(maxNnz) / minNnz;
272 
273  if (imbalance > nonzeroImbalance)
274  test4 = true;
275 
276  msg4 = "\n nonzero imbalance = " + Teuchos::toString(imbalance) + ", max allowable = " + Teuchos::toString(nonzeroImbalance);
277  }
278  }
279 
280  if (!test3 && !test4) {
281  GetOStream(Statistics1) << "Repartitioning? NO:" << msg3 + msg4 << std::endl;
282 
283  // A negative number of partitions means: no repartitioning
284  Set(currentLevel, "number of partitions", -1);
285  return;
286  }
287 
288  GetOStream(Statistics1) << "Repartitioning? YES:" << msg3 + msg4 << std::endl;
289 
290  // ======================================================================================================
291  // Calculate number of partitions
292  // ======================================================================================================
293  // FIXME Quick way to figure out how many partitions there should be (same algorithm as ML)
294  // FIXME Should take into account nnz? Perhaps only when user is using min #nnz per row threshold.
295 
296  // The number of partitions is calculated by the RepartitionFactory and stored in "number of partitions" variable on
297  // the current level. If this variable is already set (e.g., by another instance of RepartitionFactory) then this number
298  // is used. The "number of partitions" variable serves as basic communication between the RepartitionFactory (which
299  // requests a certain number of partitions) and the *Interface classes which call the underlying partitioning algorithms
300  // and produce the "Partition" array with the requested number of partitions.
301  const auto globalNumRows = Teuchos::as<GO>(map->getGlobalNumElements());
302  int numPartitions = 1;
303  if (globalNumRows >= targetRowsPerProcess) {
304  // Make sure that each CPU thread has approximately targetRowsPerProcess
305  numPartitions = std::max(Teuchos::as<int>(globalNumRows / targetRowsPerProcess), 1);
306  }
307  numPartitions = std::min(numPartitions, comm->getSize());
308 
309  Set(currentLevel, "number of partitions", numPartitions);
310 
311  GetOStream(Statistics1) << "Number of partitions to use = " << numPartitions << std::endl;
312 } // Build
313 } // namespace MueLu
314 
315 #endif // ifdef HAVE_MPI
316 #endif /* PACKAGES_MUELU_SRC_REBALANCING_MUELU_REPARTITIONHEURISTICFACTORY_DEF_HPP_ */
#define MueLu_sumAll(rcpComm, in, out)
#define MueLu_maxAll(rcpComm, in, out)
void Build(Level &currentLevel) const
Build an object with this factory.
GlobalOrdinal GO
T & get(const std::string &name, T def_value)
ParameterList & set(std::string const &name, T const &value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Print more statistics.
LocalOrdinal LO
#define SET_VALID_ENTRY(name)
#define MueLu_minAll(rcpComm, in, out)
bool isParameter(const std::string &name) const
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:99
Factory for building a thresholded operator.
void DeclareInput(Level &currentLevel) const
Determines the data that RepartitionHeuristicFactory needs, and the factories that generate that data...
int GetLevelID() const
Return level number.
Definition: MueLu_Level.cpp:76
Exception thrown to report errors in the internal logic of the program.
Print all warning messages.
Factory for building coarse matrices.
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
virtual std::string description() const
Return a simple one-line description of this object.
std::string toString(const T &t)
bool is_null() const