MueLu  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
MueLu_RepartitionFactory_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // MueLu: A package for multigrid based preconditioning
4 //
5 // Copyright 2012 NTESS and the MueLu contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef MUELU_REPARTITIONFACTORY_DEF_HPP
11 #define MUELU_REPARTITIONFACTORY_DEF_HPP
12 
13 #include <algorithm>
14 #include <iostream>
15 #include <sstream>
16 
17 #include "MueLu_RepartitionFactory_decl.hpp" // TMP JG NOTE: before other includes, otherwise I cannot test the fwd declaration in _def
18 
19 #ifdef HAVE_MPI
21 #include <Teuchos_CommHelpers.hpp>
23 
24 #include <Xpetra_Map.hpp>
25 #include <Xpetra_MapFactory.hpp>
26 #include <Xpetra_MultiVectorFactory.hpp>
27 #include <Xpetra_VectorFactory.hpp>
28 #include <Xpetra_Import.hpp>
29 #include <Xpetra_ImportFactory.hpp>
30 #include <Xpetra_Export.hpp>
31 #include <Xpetra_ExportFactory.hpp>
32 #include <Xpetra_Matrix.hpp>
33 #include <Xpetra_MatrixFactory.hpp>
34 
35 #include "MueLu_Utilities.hpp"
36 
37 #include "MueLu_CloneRepartitionInterface.hpp"
38 
39 #include "MueLu_Level.hpp"
40 #include "MueLu_MasterList.hpp"
41 #include "MueLu_Monitor.hpp"
42 #include "MueLu_PerfUtils.hpp"
43 
44 namespace MueLu {
45 
46 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
48 
49 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
51 
52 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
54  RCP<ParameterList> validParamList = rcp(new ParameterList());
55 
56 #define SET_VALID_ENTRY(name) validParamList->setEntry(name, MasterList::getEntry(name))
57  SET_VALID_ENTRY("repartition: print partition distribution");
58  SET_VALID_ENTRY("repartition: remap parts");
59  SET_VALID_ENTRY("repartition: remap num values");
60  SET_VALID_ENTRY("repartition: remap accept partition");
61  SET_VALID_ENTRY("repartition: node repartition level");
62  SET_VALID_ENTRY("repartition: save importer");
63 #undef SET_VALID_ENTRY
64 
65  validParamList->set<RCP<const FactoryBase> >("A", Teuchos::null, "Factory of the matrix A");
66  validParamList->set<RCP<const FactoryBase> >("number of partitions", Teuchos::null, "Instance of RepartitionHeuristicFactory.");
67  validParamList->set<RCP<const FactoryBase> >("Partition", Teuchos::null, "Factory of the partition");
68 
69  return validParamList;
70 }
71 
72 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
74  Input(currentLevel, "A");
75  Input(currentLevel, "number of partitions");
76  Input(currentLevel, "Partition");
77 }
78 
79 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
81  FactoryMonitor m(*this, "Build", currentLevel);
82 
83  const Teuchos::ParameterList& pL = GetParameterList();
84  // Access parameters here to make sure that we set the parameter entry flag to "used" even in case of short-circuit evaluation.
85  // TODO (JG): I don't really know if we want to do this.
86  bool remapPartitions = pL.get<bool>("repartition: remap parts");
87 
88  // TODO: We only need a CrsGraph. This class does not have to be templated on Scalar types.
89  RCP<Matrix> A = Get<RCP<Matrix> >(currentLevel, "A");
90  if (A == Teuchos::null) {
91  Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
92  return;
93  }
94  RCP<const Map> rowMap = A->getRowMap();
95  GO indexBase = rowMap->getIndexBase();
96  Xpetra::UnderlyingLib lib = rowMap->lib();
97 
98  RCP<const Teuchos::Comm<int> > origComm = rowMap->getComm();
99  RCP<const Teuchos::Comm<int> > comm = origComm;
100 
101  int myRank = comm->getRank();
102  int numProcs = comm->getSize();
103 
104  RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
105  TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
106  RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();
107 
109  int numPartitions = Get<int>(currentLevel, "number of partitions");
110 
111  // ======================================================================================================
112  // Construct decomposition vector
113  // ======================================================================================================
114  RCP<GOVector> decomposition = Get<RCP<GOVector> >(currentLevel, "Partition");
115 
116  // check which factory provides "Partition"
117  if (remapPartitions == true && Teuchos::rcp_dynamic_cast<const CloneRepartitionInterface>(GetFactory("Partition")) != Teuchos::null) {
118  // if "Partition" is provided by a CloneRepartitionInterface class we have to switch of remapPartitions
119  // as we can assume the processor Ids in Partition to be the expected ones. If we would do remapping we
120  // would get different processors for the different blocks which screws up matrix-matrix multiplication.
121  remapPartitions = false;
122  }
123 
124  // check special cases
125  if (numPartitions == 1) {
126  // Trivial case: decomposition is the trivial one, all zeros. We skip the call to Zoltan_Interface
127  // (this is mostly done to avoid extra output messages, as even if we didn't skip there is a shortcut
128  // in Zoltan[12]Interface).
129  // TODO: We can probably skip more work in this case (like building all extra data structures)
130  GetOStream(Runtime0) << "Only one partition: Skip call to the repartitioner." << std::endl;
131  } else if (numPartitions == -1) {
132  // No repartitioning necessary: decomposition should be Teuchos::null
133  GetOStream(Runtime0) << "No repartitioning necessary: partitions were left unchanged by the repartitioner" << std::endl;
134  Set<RCP<const Import> >(currentLevel, "Importer", Teuchos::null);
135  return;
136  }
137 
138  // If we're doing node away, we need to be sure to get the mapping to the NodeComm's rank 0.
139  const int nodeRepartLevel = pL.get<int>("repartition: node repartition level");
140  if (currentLevel.GetLevelID() == nodeRepartLevel) {
141  // NodePartitionInterface returns the *ranks* of the guy who gets the info, not the *partition number*
142  // In a sense, we've already done remap here.
143 
144  // FIXME: We need a low-comm import construction
145  remapPartitions = false;
146  }
147 
148  // ======================================================================================================
149  // Remap if necessary
150  // ======================================================================================================
151  // From a user perspective, we want user to not care about remapping, thinking of it as only a performance feature.
152  // There are two problems, however.
153  // (1) Next level aggregation depends on the order of GIDs in the vector, if one uses "natural" or "random" orderings.
154  // This also means that remapping affects next level aggregation, despite the fact that the _set_ of GIDs for
155  // each partition is the same.
156  // (2) Even with the fixed order of GIDs, the remapping may influence the aggregation for the next-next level.
157  // Let us consider the following example. Lets assume that when we don't do remapping, processor 0 would have
158  // GIDs {0,1,2}, and processor 1 GIDs {3,4,5}, and if we do remapping processor 0 would contain {3,4,5} and
159  // processor 1 {0,1,2}. Now, when we run repartitioning algorithm on the next level (say Zoltan1 RCB), it may
160  // be dependent on whether whether it is [{0,1,2}, {3,4,5}] or [{3,4,5}, {0,1,2}]. Specifically, the tie-breaking
161  // algorithm can resolve these differently. For instance, running
162  // mpirun -np 5 ./MueLu_ScalingTestParamList.exe --xml=easy_sa.xml --nx=12 --ny=12 --nz=12
163  // with
164  // <ParameterList name="MueLu">
165  // <Parameter name="coarse: max size" type="int" value="1"/>
166  // <Parameter name="repartition: enable" type="bool" value="true"/>
167  // <Parameter name="repartition: min rows per proc" type="int" value="2"/>
168  // <ParameterList name="level 1">
169  // <Parameter name="repartition: remap parts" type="bool" value="false/true"/>
170  // </ParameterList>
171  // </ParameterList>
172  // produces different repartitioning for level 2.
173  // This different repartitioning may then escalate into different aggregation for the next level.
174  //
175  // We fix (1) by fixing the order of GIDs in a vector by sorting the resulting vector.
176  // Fixing (2) is more complicated.
177  // FIXME: Fixing (2) in Zoltan may not be enough, as we may use some arbitration in MueLu,
178  // for instance with CoupledAggregation. What we really need to do is to use the same order of processors containing
179  // the same order of GIDs. To achieve that, the newly created subcommunicator must be conforming with the order. For
180  // instance, if we have [{0,1,2}, {3,4,5}], we create a subcommunicator where processor 0 gets rank 0, and processor 1
181  // gets rank 1. If, on the other hand, we have [{3,4,5}, {0,1,2}], we assign rank 1 to processor 0, and rank 0 to processor 1.
182  // This rank permutation requires help from Epetra/Tpetra, both of which have no such API in place.
183  // One should also be concerned that if we had such API in place, rank 0 in subcommunicator may no longer be rank 0 in
184  // MPI_COMM_WORLD, which may lead to issues for logging.
185  if (remapPartitions) {
186  SubFactoryMonitor m1(*this, "DeterminePartitionPlacement", currentLevel);
187 
188  bool acceptPartition = pL.get<bool>("repartition: remap accept partition");
189  bool allSubdomainsAcceptPartitions;
190  int localNumAcceptPartition = acceptPartition;
191  int globalNumAcceptPartition;
192  MueLu_sumAll(comm, localNumAcceptPartition, globalNumAcceptPartition);
193  GetOStream(Statistics2) << "Number of ranks that accept partitions: " << globalNumAcceptPartition << std::endl;
194  if (globalNumAcceptPartition < numPartitions) {
195  GetOStream(Warnings0) << "Not enough ranks are willing to accept a partition, allowing partitions on all ranks." << std::endl;
196  acceptPartition = true;
197  allSubdomainsAcceptPartitions = true;
198  } else if (numPartitions > numProcs) {
199  // We are trying to repartition to a larger communicator.
200  allSubdomainsAcceptPartitions = true;
201  } else {
202  allSubdomainsAcceptPartitions = false;
203  }
204 
205  DeterminePartitionPlacement(*A, *decomposition, numPartitions, acceptPartition, allSubdomainsAcceptPartitions);
206  }
207 
208  // ======================================================================================================
209  // Construct importer
210  // ======================================================================================================
211  // At this point, the following is true:
212  // * Each processors owns 0 or 1 partitions
213  // * If a processor owns a partition, that partition number is equal to the processor rank
214  // * The decomposition vector contains the partitions ids that the corresponding GID belongs to
215 
216  ArrayRCP<const GO> decompEntries;
217  if (decomposition->getLocalLength() > 0)
218  decompEntries = decomposition->getData(0);
219 
220 #ifdef HAVE_MUELU_DEBUG
221  // Test range of partition ids
222  int incorrectRank = -1;
223  for (int i = 0; i < decompEntries.size(); i++)
224  if (decompEntries[i] >= numProcs || decompEntries[i] < 0) {
225  incorrectRank = myRank;
226  break;
227  }
228 
229  int incorrectGlobalRank = -1;
230  MueLu_maxAll(comm, incorrectRank, incorrectGlobalRank);
231  TEUCHOS_TEST_FOR_EXCEPTION(incorrectGlobalRank > -1, Exceptions::RuntimeError, "pid " + Teuchos::toString(incorrectGlobalRank) + " encountered a partition number is that out-of-range");
232 #endif
233 
234  Array<GO> myGIDs;
235  myGIDs.reserve(decomposition->getLocalLength());
236 
237  // Step 0: Construct mapping
238  // part number -> GIDs I own which belong to this part
239  // NOTE: my own part GIDs are not part of the map
240  typedef std::map<GO, Array<GO> > map_type;
241  map_type sendMap;
242  for (LO i = 0; i < decompEntries.size(); i++) {
243  GO id = decompEntries[i];
244  GO GID = rowMap->getGlobalElement(i);
245 
246  if (id == myRank)
247  myGIDs.push_back(GID);
248  else
249  sendMap[id].push_back(GID);
250  }
251  decompEntries = Teuchos::null;
252 
253  if (IsPrint(Statistics2)) {
254  GO numLocalKept = myGIDs.size(), numGlobalKept, numGlobalRows = A->getGlobalNumRows();
255  MueLu_sumAll(comm, numLocalKept, numGlobalKept);
256  GetOStream(Statistics2) << "Unmoved rows: " << numGlobalKept << " / " << numGlobalRows << " (" << 100 * Teuchos::as<double>(numGlobalKept) / numGlobalRows << "%)" << std::endl;
257  }
258 
259  int numSend = sendMap.size(), numRecv;
260 
261  // Arrayify map keys
262  Array<GO> myParts(numSend), myPart(1);
263  int cnt = 0;
264  myPart[0] = myRank;
265  for (typename map_type::const_iterator it = sendMap.begin(); it != sendMap.end(); it++)
266  myParts[cnt++] = it->first;
267 
268  // Step 1: Find out how many processors send me data
269  // partsIndexBase starts from zero, as the processors ids start from zero
270  {
271  SubFactoryMonitor m1(*this, "Mapping Step 1", currentLevel);
272  GO partsIndexBase = 0;
273  RCP<Map> partsIHave = MapFactory ::Build(lib, Teuchos::OrdinalTraits<Xpetra::global_size_t>::invalid(), myParts(), partsIndexBase, comm);
274  RCP<Map> partsIOwn = MapFactory ::Build(lib, numProcs, myPart(), partsIndexBase, comm);
275  RCP<Export> partsExport = ExportFactory::Build(partsIHave, partsIOwn);
276 
279  if (numSend) {
280  ArrayRCP<GO> partsISendData = partsISend->getDataNonConst(0);
281  for (int i = 0; i < numSend; i++)
282  partsISendData[i] = 1;
283  }
284  (numPartsIRecv->getDataNonConst(0))[0] = 0;
285 
286  numPartsIRecv->doExport(*partsISend, *partsExport, Xpetra::ADD);
287  numRecv = (numPartsIRecv->getData(0))[0];
288  }
289 
290  // Step 2: Get my GIDs from everybody else
291  MPI_Datatype MpiType = Teuchos::Details::MpiTypeTraits<GO>::getType();
292  int msgTag = 12345; // TODO: use Comm::dup for all internal messaging
293 
294  // Post sends
295  Array<MPI_Request> sendReqs(numSend);
296  cnt = 0;
297  for (typename map_type::iterator it = sendMap.begin(); it != sendMap.end(); it++)
298  MPI_Isend(static_cast<void*>(it->second.getRawPtr()), it->second.size(), MpiType, Teuchos::as<GO>(it->first), msgTag, *rawMpiComm, &sendReqs[cnt++]);
299 
300  map_type recvMap;
301  size_t totalGIDs = myGIDs.size();
302  for (int i = 0; i < numRecv; i++) {
303  MPI_Status status;
304  MPI_Probe(MPI_ANY_SOURCE, msgTag, *rawMpiComm, &status);
305 
306  // Get rank and number of elements from status
307  int fromRank = status.MPI_SOURCE, count;
308  MPI_Get_count(&status, MpiType, &count);
309 
310  recvMap[fromRank].resize(count);
311  MPI_Recv(static_cast<void*>(recvMap[fromRank].getRawPtr()), count, MpiType, fromRank, msgTag, *rawMpiComm, &status);
312 
313  totalGIDs += count;
314  }
315 
316  // Do waits on send requests
317  if (numSend) {
318  Array<MPI_Status> sendStatuses(numSend);
319  MPI_Waitall(numSend, sendReqs.getRawPtr(), sendStatuses.getRawPtr());
320  }
321 
322  // Merge GIDs
323  myGIDs.reserve(totalGIDs);
324  for (typename map_type::const_iterator it = recvMap.begin(); it != recvMap.end(); it++) {
325  int offset = myGIDs.size(), len = it->second.size();
326  if (len) {
327  myGIDs.resize(offset + len);
328  memcpy(myGIDs.getRawPtr() + offset, it->second.getRawPtr(), len * sizeof(GO));
329  }
330  }
331  // NOTE 2: The general sorting algorithm could be sped up by using the knowledge that original myGIDs and all received chunks
332  // (i.e. it->second) are sorted. Therefore, a merge sort would work well in this situation.
333  std::sort(myGIDs.begin(), myGIDs.end());
334 
335  // Step 3: Construct importer
336  RCP<Map> newRowMap;
337  {
338  SubFactoryMonitor m1(*this, "Map construction", currentLevel);
339  newRowMap = MapFactory ::Build(lib, rowMap->getGlobalNumElements(), myGIDs(), indexBase, origComm);
340  }
341 
342  RCP<const Import> rowMapImporter;
343 
344  RCP<const BlockedMap> blockedRowMap = Teuchos::rcp_dynamic_cast<const BlockedMap>(rowMap);
345 
346  {
347  SubFactoryMonitor m1(*this, "Import construction", currentLevel);
348  // Generate a nonblocked rowmap if we need one
349  if (blockedRowMap.is_null())
350  rowMapImporter = ImportFactory::Build(rowMap, newRowMap);
351  else {
352  rowMapImporter = ImportFactory::Build(blockedRowMap->getMap(), newRowMap);
353  }
354  }
355 
356  // If we're running BlockedCrs we should chop up the newRowMap into a newBlockedRowMap here (and do likewise for importers)
357  if (!blockedRowMap.is_null()) {
358  SubFactoryMonitor m1(*this, "Blocking newRowMap and Importer", currentLevel);
359  RCP<const BlockedMap> blockedTargetMap = MueLu::UtilitiesBase<Scalar, LocalOrdinal, GlobalOrdinal, Node>::GeneratedBlockedTargetMap(*blockedRowMap, *rowMapImporter);
360 
361  // NOTE: This code qualifies as "correct but not particularly performant" If this needs to be sped up, we can probably read data from the existing importer to
362  // build sub-importers rather than generating new ones ex nihilo
363  size_t numBlocks = blockedRowMap->getNumMaps();
364  std::vector<RCP<const Import> > subImports(numBlocks);
365 
366  for (size_t i = 0; i < numBlocks; i++) {
367  RCP<const Map> source = blockedRowMap->getMap(i);
368  RCP<const Map> target = blockedTargetMap->getMap(i);
369  subImports[i] = ImportFactory::Build(source, target);
370  }
371  Set(currentLevel, "SubImporters", subImports);
372  }
373 
374  Set(currentLevel, "Importer", rowMapImporter);
375 
376  // Importer saving
377  bool save_importer = pL.get<bool>("repartition: save importer");
378  if (save_importer) {
379  currentLevel.Set("Importer", rowMapImporter, NoFactory::get());
380  currentLevel.AddKeepFlag("Importer", NoFactory::get(), MueLu::Final);
381  currentLevel.RemoveKeepFlag("Importer", NoFactory::get(), MueLu::UserData); // FIXME: This is a hack
382  }
383  // ======================================================================================================
384  // Print some data
385  // ======================================================================================================
386  if (!rowMapImporter.is_null() && IsPrint(Statistics2)) {
387  // int oldRank = SetProcRankVerbose(rebalancedAc->getRowMap()->getComm()->getRank());
388  GetOStream(Statistics2) << PerfUtils::PrintImporterInfo(rowMapImporter, "Importer for rebalancing");
389  // SetProcRankVerbose(oldRank);
390  }
391  if (pL.get<bool>("repartition: print partition distribution") && IsPrint(Statistics2)) {
392  // Print the grid of processors
393  GetOStream(Statistics2) << "Partition distribution over cores (ownership is indicated by '+')" << std::endl;
394 
395  char amActive = (myGIDs.size() ? 1 : 0);
396  std::vector<char> areActive(numProcs, 0);
397  MPI_Gather(&amActive, 1, MPI_CHAR, &areActive[0], 1, MPI_CHAR, 0, *rawMpiComm);
398 
399  int rowWidth = std::min(Teuchos::as<int>(ceil(sqrt(numProcs))), 100);
400  for (int proc = 0; proc < numProcs; proc += rowWidth) {
401  for (int j = 0; j < rowWidth; j++)
402  if (proc + j < numProcs)
403  GetOStream(Statistics2) << (areActive[proc + j] ? "+" : ".");
404  else
405  GetOStream(Statistics2) << " ";
406 
407  GetOStream(Statistics2) << " " << proc << ":" << std::min(proc + rowWidth, numProcs) - 1 << std::endl;
408  }
409  }
410 
411 } // Build
412 
413 //----------------------------------------------------------------------
// A weighted bipartite-graph edge: (i, j) with weight v.
// Used by DeterminePartitionPlacement, where i is a processor id,
// j is a partition id, and v is the edge weight.
template <typename T, typename W>
struct Triplet {
  T i, j;
  W v;
};
// Comparison predicate ordering triplets by weight, largest first
// (i.e. std::sort with this predicate yields descending weights).
template <typename T, typename W>
static bool compareTriplets(const Triplet<T, W>& a, const Triplet<T, W>& b) {
  return (a.v > b.v);  // descending order
}
423 
424 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
425 void RepartitionFactory<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
426  DeterminePartitionPlacement(const Matrix& A, GOVector& decomposition, GO numPartitions, bool willAcceptPartition, bool allSubdomainsAcceptPartitions) const {
427  RCP<const Map> rowMap = A.getRowMap();
428 
429  RCP<const Teuchos::Comm<int> > comm = rowMap->getComm()->duplicate();
430  int numProcs = comm->getSize();
431 
432  RCP<const Teuchos::MpiComm<int> > tmpic = rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm);
433  TEUCHOS_TEST_FOR_EXCEPTION(tmpic == Teuchos::null, Exceptions::RuntimeError, "Cannot cast base Teuchos::Comm to Teuchos::MpiComm object.");
434  RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawMpiComm = tmpic->getRawMpiComm();
435 
436  const Teuchos::ParameterList& pL = GetParameterList();
437 
438  // maxLocal is a constant which determins the number of largest edges which are being exchanged
439  // The idea is that we do not want to construct the full bipartite graph, but simply a subset of
440  // it, which requires less communication. By selecting largest local edges we hope to achieve
441  // similar results but at a lower cost.
442  const int maxLocal = pL.get<int>("repartition: remap num values");
443  const int dataSize = 2 * maxLocal;
444 
445  ArrayRCP<GO> decompEntries;
446  if (decomposition.getLocalLength() > 0)
447  decompEntries = decomposition.getDataNonConst(0);
448 
449  // Step 1: Sort local edges by weight
450  // Each edge of a bipartite graph corresponds to a triplet (i, j, v) where
451  // i: processor id that has some piece of part with part_id = j
452  // j: part id
453  // v: weight of the edge
454  // We set edge weights to be the total number of nonzeros in rows on this processor which
455  // correspond to this part_id. The idea is that when we redistribute matrix, this weight
456  // is a good approximation of the amount of data to move.
457  // We use two maps, original which maps a partition id of an edge to the corresponding weight,
458  // and a reverse one, which is necessary to sort by edges.
459  std::map<GO, GO> lEdges;
460  if (willAcceptPartition)
461  for (LO i = 0; i < decompEntries.size(); i++)
462  lEdges[decompEntries[i]] += A.getNumEntriesInLocalRow(i);
463 
464  // Reverse map, so that edges are sorted by weight.
465  // This results in multimap, as we may have edges with the same weight
466  std::multimap<GO, GO> revlEdges;
467  for (typename std::map<GO, GO>::const_iterator it = lEdges.begin(); it != lEdges.end(); it++)
468  revlEdges.insert(std::make_pair(it->second, it->first));
469 
470  // Both lData and gData are arrays of data which we communicate. The data is stored
471  // in pairs, so that data[2*i+0] is the part index, and data[2*i+1] is the corresponding edge weight.
472  // We do not store processor id in data, as we can compute that by looking on the offset in the gData.
473  Array<GO> lData(dataSize, -1), gData(numProcs * dataSize);
474  int numEdges = 0;
475  for (typename std::multimap<GO, GO>::reverse_iterator rit = revlEdges.rbegin(); rit != revlEdges.rend() && numEdges < maxLocal; rit++) {
476  lData[2 * numEdges + 0] = rit->second; // part id
477  lData[2 * numEdges + 1] = rit->first; // edge weight
478  numEdges++;
479  }
480 
481  // Step 2: Gather most edges
482  // Each processors contributes maxLocal edges by providing maxLocal pairs <part id, weight>, which is of size dataSize
483  MPI_Datatype MpiType = Teuchos::Details::MpiTypeTraits<GO>::getType();
484  MPI_Allgather(static_cast<void*>(lData.getRawPtr()), dataSize, MpiType, static_cast<void*>(gData.getRawPtr()), dataSize, MpiType, *rawMpiComm);
485 
486  // Step 3: Construct mapping
487 
488  // Construct the set of triplets
489  Teuchos::Array<Triplet<int, int> > gEdges(numProcs * maxLocal);
490  Teuchos::Array<bool> procWillAcceptPartition(numProcs, allSubdomainsAcceptPartitions);
491  size_t k = 0;
492  for (LO i = 0; i < gData.size(); i += 2) {
493  int procNo = i / dataSize; // determine the processor by its offset (since every processor sends the same amount)
494  GO part = gData[i + 0];
495  GO weight = gData[i + 1];
496  if (part != -1) { // skip nonexistent edges
497  gEdges[k].i = procNo;
498  gEdges[k].j = part;
499  gEdges[k].v = weight;
500  procWillAcceptPartition[procNo] = true;
501  k++;
502  }
503  }
504  gEdges.resize(k);
505 
506  // Sort edges by weight
507  // NOTE: compareTriplets is actually a reverse sort, so the edges weight is in decreasing order
508  std::sort(gEdges.begin(), gEdges.end(), compareTriplets<int, int>);
509 
510  // Do matching
511  std::map<int, int> match;
512  Teuchos::Array<char> matchedRanks(numProcs, 0), matchedParts(numPartitions, 0);
513  int numMatched = 0;
514  for (typename Teuchos::Array<Triplet<int, int> >::const_iterator it = gEdges.begin(); it != gEdges.end(); it++) {
515  GO rank = it->i;
516  GO part = it->j;
517  if (matchedRanks[rank] == 0 && matchedParts[part] == 0) {
518  matchedRanks[rank] = 1;
519  matchedParts[part] = 1;
520  match[part] = rank;
521  numMatched++;
522  }
523  }
524  GetOStream(Statistics1) << "Number of unassigned partitions before cleanup stage: " << (numPartitions - numMatched) << " / " << numPartitions << std::endl;
525 
526  // Step 4: Assign unassigned partitions if necessary.
527  // We do that through desperate matching for remaining partitions:
528  // We select the lowest rank that can still take a partition.
529  // The reason it is done this way is that we don't need any extra communication, as we don't
530  // need to know which parts are valid.
531  if (numPartitions - numMatched > 0) {
532  Teuchos::Array<char> partitionCounts(numPartitions, 0);
533  for (typename std::map<int, int>::const_iterator it = match.begin(); it != match.end(); it++)
534  partitionCounts[it->first] += 1;
535  for (int part = 0, matcher = 0; part < numPartitions; part++) {
536  if (partitionCounts[part] == 0) {
537  // Find first non-matched rank that accepts partitions
538  while (matchedRanks[matcher] || !procWillAcceptPartition[matcher])
539  matcher++;
540 
541  match[part] = matcher++;
542  numMatched++;
543  }
544  }
545  }
546 
547  TEUCHOS_TEST_FOR_EXCEPTION(numMatched != numPartitions, Exceptions::RuntimeError, "MueLu::RepartitionFactory::DeterminePartitionPlacement: Only " << numMatched << " partitions out of " << numPartitions << " got assigned to ranks.");
548 
549  // Step 5: Permute entries in the decomposition vector
550  for (LO i = 0; i < decompEntries.size(); i++)
551  decompEntries[i] = match[decompEntries[i]];
552 }
553 
554 } // namespace MueLu
555 
556 #endif // ifdef HAVE_MPI
557 
558 #endif // MUELU_REPARTITIONFACTORY_DEF_HPP
Important warning messages (one line)
void Build(Level &currentLevel) const
Build an object with this factory.
void reserve(size_type n)
#define MueLu_sumAll(rcpComm, in, out)
#define MueLu_maxAll(rcpComm, in, out)
GlobalOrdinal GO
T & get(const std::string &name, T def_value)
Timer to be used in factories. Similar to Monitor but with additional timers.
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Print more statistics.
User data are always kept. This flag is set automatically when Level::Set("data", data) is used...
LocalOrdinal LO
One-liner description of what is happening.
ParameterList & set(std::string const &name, T &&value, std::string const &docString="", RCP< const ParameterEntryValidator > const &validator=null)
size_type size() const
#define SET_VALID_ENTRY(name)
static const NoFactory * get()
static std::string PrintImporterInfo(RCP< const Import > importer, const std::string &msgTag)
Print even more statistics.
static bool compareTriplets(const Triplet< T, W > &a, const Triplet< T, W > &b)
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
Class that holds all level-specific information.
Definition: MueLu_Level.hpp:63
Timer to be used in factories. Similar to SubMonitor but adds a timer level by level.
Keep data only for this run. Used to keep data useful for Hierarchy::Iterate(). Data will be deleted ...
static RCP< const Xpetra::BlockedMap< LocalOrdinal, GlobalOrdinal, Node > > GeneratedBlockedTargetMap(const Xpetra::BlockedMap< LocalOrdinal, GlobalOrdinal, Node > &sourceBlockedMap, const Xpetra::Import< LocalOrdinal, GlobalOrdinal, Node > &Importer)
void resize(size_type new_size, const value_type &x=value_type())
static RCP< Vector > Build(const Teuchos::RCP< const Map > &map, bool zeroOut=true)
iterator end()
void DeclareInput(Level &currentLevel) const
Determines the data that RepartitionFactory needs, and the factories that generate that data...
RepartitionFactory()
Constructor.
void push_back(const value_type &x)
size_type size() const
virtual ~RepartitionFactory()
Destructor.
Exception thrown to report errors in the internal logic of the program.
RCP< const ParameterList > GetValidParameterList() const
Return a const parameter list of valid parameters that setParameterList() will accept.
iterator begin()
std::string toString(const T &t)