Tpetra_Details_DistributorPlan.cpp
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #include "Tpetra_Details_DistributorPlan.hpp"
11 
12 #include "Tpetra_Details_Behavior.hpp"
13 #include "Teuchos_StandardParameterEntryValidators.hpp"
14 #include "Tpetra_Util.hpp"
15 #include "Tpetra_Details_Profiling.hpp"
16 #include <numeric>
17 
18 namespace Tpetra {
19 namespace Details {
20 
21 std::string
22 DistributorSendTypeEnumToString(EDistributorSendType sendType) {
23  if (sendType == DISTRIBUTOR_ISEND) {
24  return "Isend";
25  } else if (sendType == DISTRIBUTOR_SEND) {
26  return "Send";
27  } else if (sendType == DISTRIBUTOR_ALLTOALL) {
28  return "Alltoall";
29  }
30 #if defined(HAVE_TPETRA_MPI)
31  else if (sendType == DISTRIBUTOR_IALLTOFEWV) {
32  return "Ialltofewv";
33  }
34 #endif
35 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
36  else if (sendType == DISTRIBUTOR_MPIADVANCE_ALLTOALL) {
37  return "MpiAdvanceAlltoall";
38  } else if (sendType == DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) {
39  return "MpiAdvanceNbralltoallv";
40  }
41 #endif
42  else {
43  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument,
44  "Invalid "
45  "EDistributorSendType enum value "
46  << sendType << ".");
47  }
48 }
49 
50 EDistributorSendType
51 DistributorSendTypeStringToEnum(const std::string_view s) {
52  if (s == "Isend") return DISTRIBUTOR_ISEND;
53  if (s == "Send") return DISTRIBUTOR_SEND;
54  if (s == "Alltoall") return DISTRIBUTOR_ALLTOALL;
55 #if defined(HAVE_TPETRA_MPI)
56  if (s == "Ialltofewv") return DISTRIBUTOR_IALLTOFEWV;
57 #endif
58 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
59  if (s == "MpiAdvanceAlltoall") return DISTRIBUTOR_MPIADVANCE_ALLTOALL;
60  if (s == "MpiAdvanceNbralltoallv") return DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV;
61 #endif
62  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid string to convert to EDistributorSendType enum value: " << s);
63 }
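// Example (illustrative): for valid values these two helpers are inverses,
// e.g. DistributorSendTypeStringToEnum("Isend") returns DISTRIBUTOR_ISEND and
// DistributorSendTypeEnumToString(DISTRIBUTOR_ISEND) returns "Isend".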
64 
66 const std::string& validSendTypeOrThrow(const std::string& s) {
67  const auto valids = distributorSendTypes();
68  if (std::find(valids.begin(), valids.end(), s) == valids.end()) {
69  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid string for EDistributorSendType enum value: " << s);
70  }
71  return s;
72 }
73 
74 std::string
75 DistributorHowInitializedEnumToString(EDistributorHowInitialized how) {
76  switch (how) {
77  case Details::DISTRIBUTOR_NOT_INITIALIZED:
78  return "Not initialized yet";
79  case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS:
80  return "By createFromSends";
81  case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS:
82  return "By createFromRecvs";
83  case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS:
84  return "By createFromSendsAndRecvs";
85  case Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE:
86  return "By createReverseDistributor";
87  case Details::DISTRIBUTOR_INITIALIZED_BY_COPY:
88  return "By copy constructor";
89  default:
90  return "INVALID";
91  }
92 }
93 
94 DistributorPlan::DistributorPlan(Teuchos::RCP<const Teuchos::Comm<int>> comm)
95  : comm_(comm)
96  ,
97 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
98  mpixComm_(Teuchos::null)
99  ,
100 #endif
101  howInitialized_(DISTRIBUTOR_NOT_INITIALIZED)
102  , reversePlan_(Teuchos::null)
103  , sendType_(DistributorSendTypeStringToEnum(Behavior::defaultSendType()))
104  , sendMessageToSelf_(false)
105  , numSendsToOtherProcs_(0)
106  , maxSendLength_(0)
107  , numReceives_(0)
108  , totalReceiveLength_(0) {
109 }
110 
111 DistributorPlan::DistributorPlan(const DistributorPlan& otherPlan)
112  : comm_(otherPlan.comm_)
113  ,
114 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
115  mpixComm_(otherPlan.mpixComm_)
116  ,
117 #endif
118  howInitialized_(DISTRIBUTOR_INITIALIZED_BY_COPY)
119  , reversePlan_(otherPlan.reversePlan_)
120  , sendType_(otherPlan.sendType_)
121  , sendMessageToSelf_(otherPlan.sendMessageToSelf_)
122  , numSendsToOtherProcs_(otherPlan.numSendsToOtherProcs_)
123  , procIdsToSendTo_(otherPlan.procIdsToSendTo_)
124  , startsTo_(otherPlan.startsTo_)
125  , lengthsTo_(otherPlan.lengthsTo_)
126  , maxSendLength_(otherPlan.maxSendLength_)
127  , indicesTo_(otherPlan.indicesTo_)
128  , numReceives_(otherPlan.numReceives_)
129  , totalReceiveLength_(otherPlan.totalReceiveLength_)
130  , lengthsFrom_(otherPlan.lengthsFrom_)
131  , procsFrom_(otherPlan.procsFrom_)
132  , startsFrom_(otherPlan.startsFrom_)
133  , indicesFrom_(otherPlan.indicesFrom_)
134 #if defined(HAVE_TPETRACORE_MPI)
135  , roots_(otherPlan.roots_)
136 #endif
137 {
138 }
139 
140 size_t DistributorPlan::createFromSends(const Teuchos::ArrayView<const int>& exportProcIDs) {
141  using std::endl;
142  using Teuchos::outArg;
143  using Teuchos::REDUCE_MAX;
144  using Teuchos::reduceAll;
145  const char rawPrefix[] = "Tpetra::DistributorPlan::createFromSends";
146 
147  const size_t numExports = exportProcIDs.size();
148  const int myProcID = comm_->getRank();
149  const int numProcs = comm_->getSize();
150  const bool debug = Details::Behavior::debug("Distributor");
151 
152  // exportProcIDs tells us the communication pattern for this
153  // distributor. It dictates the way that the export data will be
154  // interpreted in doPosts(). We want to perform at most one
155  // send per process in doPosts; this is for two reasons:
156  // * minimize latency / overhead in the comm routines (nice)
157  // * match the number of receives and sends between processes
158  // (necessary)
159  //
160  // Teuchos::Comm requires that the data for a send are contiguous
161  // in a send buffer. Therefore, if the data in the send buffer
162  // for doPosts() are not contiguous, they will need to be copied
163  // into a contiguous buffer. The user has specified this
164  // noncontiguous pattern and we can't do anything about it.
165  // However, if they do not provide an efficient pattern, we will
166  // warn them if the following compile-time option has been
167  // set:
168  // * HAVE_TPETRA_PRINT_EFFICIENCY_WARNINGS
169  //
170  // If the data are contiguous, then we can post the sends in situ
171  // (i.e., without needing to copy them into a send buffer).
172  //
173  // Determine contiguity. There are a number of ways to do this:
174  // * If the export IDs are sorted, then all exports to a
175  // particular proc must be contiguous. This is what Epetra does.
176  // * If the process ID of the current export has already been
177  // listed, then the immediately preceding entry must refer to the
178  // same process. This tests contiguity, but not sortedness.
179  //
180  // Both of these tests require O(n), where n is the number of
181  // exports. However, the latter will positively identify a greater
182  // portion of contiguous patterns. We use the latter method.
183  //
184  // Check to see if values are grouped by procs without gaps
185  // If so, indices_to -> 0.
186 
187  if (debug) {
188  // Test whether any process in the communicator got an invalid
189  // process ID. If badID != -1 on this process, then it equals
190  // this process' rank. The max of all badID over all processes
191  // is the max rank which has an invalid process ID.
192  int badID = -1;
193  for (size_t i = 0; i < numExports; ++i) {
194  const int exportID = exportProcIDs[i];
195  if (exportID >= numProcs || exportID < 0) {
196  badID = myProcID;
197  break;
198  }
199  }
200  int gbl_badID;
201  reduceAll<int, int>(*comm_, REDUCE_MAX, badID, outArg(gbl_badID));
202  TEUCHOS_TEST_FOR_EXCEPTION(gbl_badID >= 0, std::runtime_error, rawPrefix << "Proc " << gbl_badID << ", perhaps among other processes, got a bad "
203  "send process ID.");
204  }
205 
206  // Set up data structures for quick traversal of arrays.
207  // This contains the number of sends for each process ID.
208  //
209  // FIXME (mfh 20 Mar 2014) This is one of a few places in Tpetra
210  // that create an array of length the number of processes in the
211  // communicator (plus one). Given how this code uses this array,
212  // it should be straightforward to replace it with a hash table or
213  // some other more space-efficient data structure. In practice,
214  // most of the entries of starts should be zero for a sufficiently
215  // large process count, unless the communication pattern is dense.
216  // Note that it's important to be able to iterate through keys (i
217  // for which starts[i] is nonzero) in increasing order.
218  Teuchos::Array<size_t> starts(numProcs + 1, 0);
219 
220  // numActive is the number of sends that are not Null
221  size_t numActive = 0;
222  int needSendBuff = 0; // Boolean
223 
224  for (size_t i = 0; i < numExports; ++i) {
225  const int exportID = exportProcIDs[i];
226  if (exportID >= 0) {
227  // exportID is a valid process ID. Increment the number of
228  // messages this process will send to that process.
229  ++starts[exportID];
230 
231  // If we're sending more than one message to process exportID,
232  // then it is possible that the data are not contiguous.
233  // Check by seeing if the previous process ID in the list
234  // (exportProcIDs[i-1]) is the same. It's safe to use i-1,
235  // because if starts[exportID] > 1, then i must be > 0 (since
236  // the starts array was filled with zeros initially).
237 
238  // null entries break continuity.
239  // e.g., [ 0, 0, 0, 1, -99, 1, 2, 2, 2] is not contiguous
240  if (needSendBuff == 0 && starts[exportID] > 1 &&
241  exportID != exportProcIDs[i - 1]) {
242  needSendBuff = 1;
243  }
244  ++numActive;
245  }
246  }
247 
248 #if defined(HAVE_TPETRA_PRINT_EFFICIENCY_WARNINGS)
249  {
250  int global_needSendBuff;
251  reduceAll<int, int>(*comm_, REDUCE_MAX, needSendBuff,
252  outArg(global_needSendBuff));
253  TPETRA_EFFICIENCY_WARNING(
254  global_needSendBuff != 0,
255  "::createFromSends: Grouping export IDs together by process rank often "
256  "improves performance.");
257  }
258 #endif
259 
260  // Determine from the caller's data whether or not the current
261  // process should send (a) message(s) to itself.
262  if (starts[myProcID] != 0) {
263  sendMessageToSelf_ = true;
264  } else {
265  sendMessageToSelf_ = false;
266  }
267 
268  if (!needSendBuff) {
269  // grouped by proc, no send buffer or indicesTo_ needed
270  numSendsToOtherProcs_ = 0;
271  // Count total number of sends, i.e., total number of procs to
272  // which we are sending. This includes myself, if applicable.
273  for (int i = 0; i < numProcs; ++i) {
274  if (starts[i]) {
275  ++numSendsToOtherProcs_;
276  }
277  }
278 
279  // Not only do we not need these, but we must clear them, as
280  // empty status of indicesTo is a flag used later.
281  indicesTo_.resize(0);
282  // Size these to numSendsToOtherProcs_; note, at the moment, numSendsToOtherProcs_
283  // includes self sends. Set their values to zeros.
284  procIdsToSendTo_.assign(numSendsToOtherProcs_, 0);
285  startsTo_.assign(numSendsToOtherProcs_, 0);
286  lengthsTo_.assign(numSendsToOtherProcs_, 0);
287 
288  // set startsTo to the offset for each send (i.e., each proc ID)
289  // set procsTo to the proc ID for each send
290  // In interpreting this code, remember that we are assuming contiguity;
291  // that is why procIndex skips through the ranks.
292  {
293  size_t procIndex = 0;
294  for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
295  while (exportProcIDs[procIndex] < 0) {
296  ++procIndex; // skip all negative proc IDs
297  }
298  startsTo_[i] = procIndex;
299  int procID = exportProcIDs[procIndex];
300  procIdsToSendTo_[i] = procID;
301  procIndex += starts[procID];
302  }
303  }
304  // sort the startsTo and proc IDs together, in ascending order, according
305  // to proc IDs
306  if (numSendsToOtherProcs_ > 0) {
307  sort2(procIdsToSendTo_.begin(), procIdsToSendTo_.end(), startsTo_.begin());
308  }
309  // compute the maximum send length
310  maxSendLength_ = 0;
311  for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
312  int procID = procIdsToSendTo_[i];
313  lengthsTo_[i] = starts[procID];
314  if ((procID != myProcID) && (lengthsTo_[i] > maxSendLength_)) {
315  maxSendLength_ = lengthsTo_[i];
316  }
317  }
318  } else {
319  // not grouped by proc, need send buffer and indicesTo_
320 
321  // starts[i] is the number of sends to proc i
322  // numActive equals number of sends total, \sum_i starts[i]
323 
324  // this loop starts at starts[1], so explicitly check starts[0]
325  if (starts[0] == 0) {
326  numSendsToOtherProcs_ = 0;
327  } else {
328  numSendsToOtherProcs_ = 1;
329  }
330  for (Teuchos::Array<size_t>::iterator i = starts.begin() + 1,
331  im1 = starts.begin();
332  i != starts.end(); ++i) {
333  if (*i != 0) ++numSendsToOtherProcs_;
334  *i += *im1;
335  im1 = i;
336  }
337  // starts[i] now contains the number of exports to procs 0 through i
338 
339  for (Teuchos::Array<size_t>::reverse_iterator ip1 = starts.rbegin(),
340  i = starts.rbegin() + 1;
341  i != starts.rend(); ++i) {
342  *ip1 = *i;
343  ip1 = i;
344  }
345  starts[0] = 0;
346  // starts[i] now contains the number of exports to procs 0 through
347  // i-1, i.e., all procs before proc i
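  // Worked example (illustrative): with numProcs = 3 and per-proc counts
  // starts = [2, 0, 3, 0], the forward pass yields the inclusive prefix sums
  // [2, 2, 5, 5]; the backward shift plus starts[0] = 0 then yields the
  // exclusive prefix sums [0, 2, 2, 5], i.e., CSR-style offsets into the
  // send buffer.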
348 
349  indicesTo_.resize(numActive);
350 
351  for (size_t i = 0; i < numExports; ++i) {
352  if (exportProcIDs[i] >= 0) {
353  // record the offset to the sendBuffer for this export
354  indicesTo_[starts[exportProcIDs[i]]] = i;
355  // now increment the offset for this proc
356  ++starts[exportProcIDs[i]];
357  }
358  }
359  // our send buffer will contain the export data for each of the procs
360  // we communicate with, in order by proc id
361  // sendBuffer = {proc_0_data, proc_1_data, ..., proc_np-1_data}
362  // indicesTo now maps each export to the location in our send buffer
363  // associated with the export
364  // data for export i located at sendBuffer[indicesTo[i]]
365  //
366  // starts[i] once again contains the number of exports to
367  // procs 0 through i
368  for (int proc = numProcs - 1; proc != 0; --proc) {
369  starts[proc] = starts[proc - 1];
370  }
371  starts.front() = 0;
372  starts[numProcs] = numActive;
373  //
374  // starts[proc] once again contains the number of exports to
375  // procs 0 through proc-1
376  // i.e., the start of my data in the sendBuffer
377 
378  // this contains invalid data at procs we don't care about, that is okay
379  procIdsToSendTo_.resize(numSendsToOtherProcs_);
380  startsTo_.resize(numSendsToOtherProcs_);
381  lengthsTo_.resize(numSendsToOtherProcs_);
382 
383  // for each group of sends/exports, record the destination proc,
384  // the length, and the offset for this send into the
385  // send buffer (startsTo_)
386  maxSendLength_ = 0;
387  size_t snd = 0;
388  for (int proc = 0; proc < numProcs; ++proc) {
389  if (starts[proc + 1] != starts[proc]) {
390  lengthsTo_[snd] = starts[proc + 1] - starts[proc];
391  startsTo_[snd] = starts[proc];
392  // record max length for all off-proc sends
393  if ((proc != myProcID) && (lengthsTo_[snd] > maxSendLength_)) {
394  maxSendLength_ = lengthsTo_[snd];
395  }
396  procIdsToSendTo_[snd] = proc;
397  ++snd;
398  }
399  }
400  }
401 
402  if (sendMessageToSelf_) {
403  --numSendsToOtherProcs_;
404  }
405 
406  // Invert map to see what msgs are received and what length
407  computeReceives();
408 
409 #if defined(HAVE_TPETRA_MPI)
410  maybeInitializeRoots();
411 #endif
412 
413  // createFromRecvs() calls createFromSends(), but will set
414  // howInitialized_ again after calling createFromSends().
415  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS;
416 
417  return totalReceiveLength_;
418 }
419 
420 void DistributorPlan::createFromRecvs(const Teuchos::ArrayView<const int>& remoteProcIDs) {
421  *this = *getReversePlan();
422  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
423 }
424 
425 void DistributorPlan::createFromSendsAndRecvs(const Teuchos::ArrayView<const int>& exportProcIDs,
426  const Teuchos::ArrayView<const int>& remoteProcIDs) {
427  // Note that exportProcIDs and remoteProcIDs _must_ be lists with one
428  // entry per GID. If the export/remote proc IDs are taken from the
429  // getProcs{From|To} lists extracted from a previous Distributor, this
430  // will generate a wrong answer, because those lists have a unique entry
431  // for each process ID. A version of this that takes lengthsTo and
432  // lengthsFrom should be made.
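  // For example (illustrative): sending three GIDs to proc 5 requires three
  // entries equal to 5 in exportProcIDs, not a single entry of 5.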
433 
434  howInitialized_ = Tpetra::Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS;
435 
436  int myProcID = comm_->getRank();
437  int numProcs = comm_->getSize();
438 
439  const size_t numExportIDs = exportProcIDs.size();
440  Teuchos::Array<size_t> starts(numProcs + 1, 0);
441 
442  size_t numActive = 0;
443  int needSendBuff = 0; // Boolean
444 
445  for (size_t i = 0; i < numExportIDs; i++) {
446  if (needSendBuff == 0 && i && (exportProcIDs[i] < exportProcIDs[i - 1]))
447  needSendBuff = 1;
448  if (exportProcIDs[i] >= 0) {
449  ++starts[exportProcIDs[i]];
450  ++numActive;
451  }
452  }
453 
454  sendMessageToSelf_ = (starts[myProcID] != 0) ? 1 : 0;
455 
456  numSendsToOtherProcs_ = 0;
457 
458  if (needSendBuff) // not grouped by proc, need send buffer and indicesTo_
459  {
460  if (starts[0] == 0) {
461  numSendsToOtherProcs_ = 0;
462  } else {
463  numSendsToOtherProcs_ = 1;
464  }
465  for (Teuchos::Array<size_t>::iterator i = starts.begin() + 1,
466  im1 = starts.begin();
467  i != starts.end(); ++i) {
468  if (*i != 0) ++numSendsToOtherProcs_;
469  *i += *im1;
470  im1 = i;
471  }
472  // starts[i] now contains the number of exports to procs 0 through i
473 
474  for (Teuchos::Array<size_t>::reverse_iterator ip1 = starts.rbegin(),
475  i = starts.rbegin() + 1;
476  i != starts.rend(); ++i) {
477  *ip1 = *i;
478  ip1 = i;
479  }
480  starts[0] = 0;
481  // starts[i] now contains the number of exports to procs 0 through
482  // i-1, i.e., all procs before proc i
483 
484  indicesTo_.resize(numActive);
485 
486  for (size_t i = 0; i < numExportIDs; ++i) {
487  if (exportProcIDs[i] >= 0) {
488  // record the offset to the sendBuffer for this export
489  indicesTo_[starts[exportProcIDs[i]]] = i;
490  // now increment the offset for this proc
491  ++starts[exportProcIDs[i]];
492  }
493  }
494  for (int proc = numProcs - 1; proc != 0; --proc) {
495  starts[proc] = starts[proc - 1];
496  }
497  starts.front() = 0;
498  starts[numProcs] = numActive;
499  procIdsToSendTo_.resize(numSendsToOtherProcs_);
500  startsTo_.resize(numSendsToOtherProcs_);
501  lengthsTo_.resize(numSendsToOtherProcs_);
502  maxSendLength_ = 0;
503  size_t snd = 0;
504  for (int proc = 0; proc < numProcs; ++proc) {
505  if (starts[proc + 1] != starts[proc]) {
506  lengthsTo_[snd] = starts[proc + 1] - starts[proc];
507  startsTo_[snd] = starts[proc];
508  // record max length for all off-proc sends
509  if ((proc != myProcID) && (lengthsTo_[snd] > maxSendLength_)) {
510  maxSendLength_ = lengthsTo_[snd];
511  }
512  procIdsToSendTo_[snd] = proc;
513  ++snd;
514  }
515  }
516  } else {
517  // grouped by proc, no send buffer or indicesTo_ needed
518  numSendsToOtherProcs_ = 0;
519  // Count total number of sends, i.e., total number of procs to
520  // which we are sending. This includes myself, if applicable.
521  for (int i = 0; i < numProcs; ++i) {
522  if (starts[i]) {
523  ++numSendsToOtherProcs_;
524  }
525  }
526 
527  // Not only do we not need these, but we must clear them, as
528  // empty status of indicesTo is a flag used later.
529  indicesTo_.resize(0);
530  // Size these to numSendsToOtherProcs_; note, at the moment, numSendsToOtherProcs_
531  // includes self sends. Set their values to zeros.
532  procIdsToSendTo_.assign(numSendsToOtherProcs_, 0);
533  startsTo_.assign(numSendsToOtherProcs_, 0);
534  lengthsTo_.assign(numSendsToOtherProcs_, 0);
535 
536  // set startsTo to the offset for each send (i.e., each proc ID)
537  // set procsTo to the proc ID for each send
538  // In interpreting this code, remember that we are assuming contiguity;
539  // that is why procIndex skips through the ranks.
540  {
541  size_t procIndex = 0;
542  for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
543  while (exportProcIDs[procIndex] < 0) {
544  ++procIndex; // skip all negative proc IDs
545  }
546  startsTo_[i] = procIndex;
547  int procID = exportProcIDs[procIndex];
548  procIdsToSendTo_[i] = procID;
549  procIndex += starts[procID];
550  }
551  }
552  // sort the startsTo and proc IDs together, in ascending order, according
553  // to proc IDs
554  if (numSendsToOtherProcs_ > 0) {
555  sort2(procIdsToSendTo_.begin(), procIdsToSendTo_.end(), startsTo_.begin());
556  }
557  // compute the maximum send length
558  maxSendLength_ = 0;
559  for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
560  int procID = procIdsToSendTo_[i];
561  lengthsTo_[i] = starts[procID];
562  if ((procID != myProcID) && (lengthsTo_[i] > maxSendLength_)) {
563  maxSendLength_ = lengthsTo_[i];
564  }
565  }
566  }
567 
568  numSendsToOtherProcs_ -= sendMessageToSelf_;
569  std::vector<int> recv_list;
570  recv_list.reserve(numSendsToOtherProcs_); // reserve an initial guess for size needed
571 
572  int last_pid = -2;
573  for (int i = 0; i < remoteProcIDs.size(); i++) {
574  if (remoteProcIDs[i] > last_pid) {
575  recv_list.push_back(remoteProcIDs[i]);
576  last_pid = remoteProcIDs[i];
577  } else if (remoteProcIDs[i] < last_pid)
578  throw std::runtime_error("Tpetra::Distributor::createFromSendsAndRecvs expected RemotePIDs to be in sorted order");
579  }
580  numReceives_ = recv_list.size();
581  if (numReceives_) {
582  procsFrom_.assign(numReceives_, 0);
583  lengthsFrom_.assign(numReceives_, 0);
584  indicesFrom_.assign(numReceives_, 0);
585  startsFrom_.assign(numReceives_, 0);
586  }
587  for (size_t i = 0, j = 0; i < numReceives_; ++i) {
588  int jlast = j;
589  procsFrom_[i] = recv_list[i];
590  startsFrom_[i] = j;
591  for (; j < (size_t)remoteProcIDs.size() &&
592  remoteProcIDs[jlast] == remoteProcIDs[j];
593  j++) {
594  ;
595  }
596  lengthsFrom_[i] = j - jlast;
597  }
598  totalReceiveLength_ = remoteProcIDs.size();
599  indicesFrom_.clear();
600  numReceives_ -= sendMessageToSelf_;
601 
602 #if defined(HAVE_TPETRA_MPI)
603  maybeInitializeRoots();
604 #endif
605 }
606 
607 Teuchos::RCP<DistributorPlan> DistributorPlan::getReversePlan() const {
608  if (reversePlan_.is_null()) createReversePlan();
609  return reversePlan_;
610 }
611 
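// createReversePlan swaps the roles of the send and receive data: the forward
// plan's receive arrays (procsFrom_, lengthsFrom_, startsFrom_, indicesFrom_)
// become the reverse plan's send arrays, and vice versa.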
612 void DistributorPlan::createReversePlan() const {
613  reversePlan_ = Teuchos::rcp(new DistributorPlan(comm_));
614  reversePlan_->howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE;
615  reversePlan_->sendType_ = sendType_;
616 
617 #if defined(HAVE_TPETRACORE_MPI)
618  // If the forward plan matches an all-to-few communication pattern,
619  // the reverse plan is few-to-all, so don't use a special all-to-few
620  // implementation for it
621  if (DISTRIBUTOR_IALLTOFEWV == sendType_) {
622  if (Behavior::verbose()) {
623  std::stringstream ss;
624  ss << __FILE__ << ":" << __LINE__ << " WARNING (Ialltofewv send type): using default for reversed Ialltofewv\n";
625  std::cerr << ss.str();
626  }
627 
628  reversePlan_->sendType_ = DistributorSendTypeStringToEnum(Behavior::defaultSendType());
629  }
630 #endif
631 
632  // The total length of all the sends of this DistributorPlan. We
633  // calculate it because it's the total length of all the receives
634  // of the reverse DistributorPlan.
635  size_t totalSendLength =
636  std::accumulate(lengthsTo_.begin(), lengthsTo_.end(), 0);
637 
638  // The maximum length of any of the receives of this DistributorPlan.
639  // We calculate it because it's the maximum length of any of the
640  // sends of the reverse DistributorPlan.
641  size_t maxReceiveLength = 0;
642  const int myProcID = comm_->getRank();
643  for (size_t i = 0; i < numReceives_; ++i) {
644  if (procsFrom_[i] != myProcID) {
645  // Don't count receives for messages sent by myself to myself.
646  if (lengthsFrom_[i] > maxReceiveLength) {
647  maxReceiveLength = lengthsFrom_[i];
648  }
649  }
650  }
651 
652  reversePlan_->sendMessageToSelf_ = sendMessageToSelf_;
653  reversePlan_->numSendsToOtherProcs_ = numReceives_;
654  reversePlan_->procIdsToSendTo_ = procsFrom_;
655  reversePlan_->startsTo_ = startsFrom_;
656  reversePlan_->lengthsTo_ = lengthsFrom_;
657  reversePlan_->maxSendLength_ = maxReceiveLength;
658  reversePlan_->indicesTo_ = indicesFrom_;
659  reversePlan_->numReceives_ = numSendsToOtherProcs_;
660  reversePlan_->totalReceiveLength_ = totalSendLength;
661  reversePlan_->lengthsFrom_ = lengthsTo_;
662  reversePlan_->procsFrom_ = procIdsToSendTo_;
663  reversePlan_->startsFrom_ = startsTo_;
664  reversePlan_->indicesFrom_ = indicesTo_;
665 
666 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
667  // is there a smarter way to do this
668  reversePlan_->initializeMpiAdvance();
669 #endif
670 
671 #if defined(HAVE_TPETRA_MPI)
672  reversePlan_->maybeInitializeRoots();
673 #endif
674 }
675 
676 void DistributorPlan::computeReceives() {
677  using Teuchos::Array;
678  using Teuchos::ArrayRCP;
679  using Teuchos::as;
680  using Teuchos::CommRequest;
681  using Teuchos::CommStatus;
682  using Teuchos::ireceive;
683  using Teuchos::RCP;
684  using Teuchos::rcp;
685  using Teuchos::receive;
686  using Teuchos::reduce;
687  using Teuchos::REDUCE_SUM;
688  using Teuchos::scatter;
689  using Teuchos::send;
690  using Teuchos::waitAll;
691 
692  const int myRank = comm_->getRank();
693  const int numProcs = comm_->getSize();
694 
695  const int mpiTag = DEFAULT_MPI_TAG;
696 
697  // toProcsFromMe[i] == the number of messages sent by this process
698  // to process i. The data in numSendsToOtherProcs_, procIdsToSendTo_, and lengthsTo_
699  // concern the contiguous sends. Therefore, each process will be
700  // listed in procIdsToSendTo_ at most once, and so toProcsFromMe[i] will
701  // either be 0 or 1.
702  {
703  Array<int> toProcsFromMe(numProcs, 0);
704 #ifdef HAVE_TPETRA_DEBUG
705  bool counting_error = false;
706 #endif // HAVE_TPETRA_DEBUG
707  for (size_t i = 0; i < (numSendsToOtherProcs_ + (sendMessageToSelf_ ? 1 : 0)); ++i) {
708 #ifdef HAVE_TPETRA_DEBUG
709  if (toProcsFromMe[procIdsToSendTo_[i]] != 0) {
710  counting_error = true;
711  }
712 #endif // HAVE_TPETRA_DEBUG
713  toProcsFromMe[procIdsToSendTo_[i]] = 1;
714  }
715 #ifdef HAVE_TPETRA_DEBUG
716  // Note that SHARED_TEST_FOR_EXCEPTION does a global reduction
717  SHARED_TEST_FOR_EXCEPTION(counting_error, std::logic_error,
718  "Tpetra::Distributor::computeReceives: There was an error on at least "
719  "one process in counting the number of messages send by that process to "
720  "the other processs. Please report this bug to the Tpetra developers.",
721  *comm_);
722 #endif // HAVE_TPETRA_DEBUG
723 
724  // Compute the number of receives that this process needs to
725  // post. The number of receives includes any self sends (i.e.,
726  // messages sent by this process to itself).
727  //
728  // (We will use numReceives_ below to post exactly that
729  // number of receives, with MPI_ANY_SOURCE as the sending rank.
730  // This will tell us from which processes this process expects
731  // to receive, and how many packets of data we expect to receive
732  // from each process.)
733  //
734  // toProcsFromMe[i] is the number of messages sent by this
735  // process to process i. Compute the sum (elementwise) of all
736  // the toProcsFromMe arrays on all processes in the
737  // communicator. If the array x is that sum, then if this
738  // process has rank j, x[j] is the number of messages sent
739  // to process j, that is, the number of receives on process j
740  // (including any messages sent by process j to itself).
741  //
742  // Yes, this requires storing and operating on an array of
743  // length P, where P is the number of processes in the
744  // communicator. Epetra does this too. Avoiding this O(P)
745  // memory bottleneck would require some research.
746  //
747  // mfh 09 Jan 2012, 15 Jul 2015: There are three ways to
748  // implement this O(P) memory algorithm.
749  //
750  // 1. Use MPI_Reduce and MPI_Scatter: reduce on the root
751  // process (0) from toProcsFromMe, to numRecvsOnEachProc.
752  // Then, scatter the latter, so that each process p gets
753  // numRecvsOnEachProc[p].
754  //
755  // 2. Like #1, but use MPI_Reduce_scatter instead of
756  // MPI_Reduce and MPI_Scatter. MPI_Reduce_scatter might be
757  // optimized to reduce the number of messages, but
758  // MPI_Reduce_scatter is more general than we need (it
759  // allows the equivalent of MPI_Scatterv). See Bug 6336.
760  //
761  // 3. Do an all-reduce on toProcsFromMe, and let my process
762  // (with rank myRank) get numReceives_ from
763  // toProcsFromMe[myRank]. The HPCCG miniapp uses the
764  // all-reduce method.
765  //
766  // Approaches 1 and 3 have the same critical path length.
767  // However, #3 moves more data. This is because the final
768  // result is just one integer, but #3 moves a whole array of
769  // results to all the processes. This is why we use Approach 1
770  // here.
771  //
772  // mfh 12 Apr 2013: See discussion in createFromSends() about
773  // how we could use this communication to propagate an error
774  // flag for "free" in a release build.
775 
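  // Worked example (illustrative): with 3 processes whose toProcsFromMe
  // arrays are [0,1,1], [1,0,1], and [1,0,0], the elementwise sum on the
  // root is [2,1,2]; after the scatter, process p keeps entry p, so
  // numReceives_ becomes 2, 1, and 2 on processes 0, 1, and 2, respectively.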
776  const int root = 0; // rank of root process of the reduction
777  Array<int> numRecvsOnEachProc; // temp; only needed on root
778  if (myRank == root) {
779  numRecvsOnEachProc.resize(numProcs);
780  }
781  int numReceivesAsInt = 0; // output
782  reduce<int, int>(toProcsFromMe.getRawPtr(),
783  numRecvsOnEachProc.getRawPtr(),
784  numProcs, REDUCE_SUM, root, *comm_);
785  scatter<int, int>(numRecvsOnEachProc.getRawPtr(), 1,
786  &numReceivesAsInt, 1, root, *comm_);
787  numReceives_ = static_cast<size_t>(numReceivesAsInt);
788  }
789 
790  // Now we know numReceives_, which is this process' number of
791  // receives. Allocate the lengthsFrom_ and procsFrom_ arrays
792  // with this number of entries.
793  lengthsFrom_.assign(numReceives_, 0);
794  procsFrom_.assign(numReceives_, 0);
795 
796  //
797  // Ask (via nonblocking receive) each process from which we are
798  // receiving how many packets we should expect from it in the
799  // communication pattern.
800  //
801 
802  // At this point, numReceives_ includes any self message that
803  // there may be. At the end of this routine, we'll subtract off
804  // the self message (if there is one) from numReceives_. In this
805  // routine, we don't need to receive a message from ourselves in
806  // order to figure out our lengthsFrom_ and source process ID; we
807  // can just ask ourselves directly. Thus, the actual number of
808  // nonblocking receives we post here does not include the self
809  // message.
810  const size_t actualNumReceives = numReceives_ - (sendMessageToSelf_ ? 1 : 0);
811 
812  // Teuchos' wrapper for nonblocking receives requires receive
813  // buffers that it knows won't go away. This is why we use RCPs,
814  // one RCP per nonblocking receive request. They get allocated in
815  // the loop below.
816  Array<RCP<CommRequest<int>>> requests(actualNumReceives);
817  Array<ArrayRCP<size_t>> lengthsFromBuffers(actualNumReceives);
818  Array<RCP<CommStatus<int>>> statuses(actualNumReceives);
819 
820  // Teuchos::Comm treats a negative process ID as MPI_ANY_SOURCE
821  // (receive data from any process).
822 #ifdef HAVE_MPI
823  const int anySourceProc = MPI_ANY_SOURCE;
824 #else
825  const int anySourceProc = -1;
826 #endif
827 
828  // Post the (nonblocking) receives.
829  for (size_t i = 0; i < actualNumReceives; ++i) {
830  // Once the receive completes, we can ask the corresponding
831  // CommStatus object (output by wait()) for the sending process'
832  // ID (which we'll assign to procsFrom_[i] -- don't forget to
833  // do that!).
834  lengthsFromBuffers[i].resize(1);
835  lengthsFromBuffers[i][0] = as<size_t>(0);
836  requests[i] = ireceive<int, size_t>(lengthsFromBuffers[i], anySourceProc,
837  mpiTag, *comm_);
838  }
839 
840  // Post the sends: Tell each process to which we are sending how
841  // many packets it should expect from us in the communication
842  // pattern. We could use nonblocking sends here, as long as we do
843  // a waitAll() on all the sends and receives at once.
844  //
845  // We assume that numSendsToOtherProcs_ and sendMessageToSelf_ have already been
846  // set. The value of numSendsToOtherProcs_ (my process' number of sends) does
847  // not include any message that it might send to itself.
848  for (size_t i = 0; i < numSendsToOtherProcs_ + (sendMessageToSelf_ ? 1 : 0); ++i) {
849  if (procIdsToSendTo_[i] != myRank) {
850  // Send a message to procIdsToSendTo_[i], telling that process that
851  // this communication pattern will send that process
852  // lengthsTo_[i] blocks of packets.
853  const size_t* const lengthsTo_i = &lengthsTo_[i];
854  send<int, size_t>(lengthsTo_i, 1, as<int>(procIdsToSendTo_[i]), mpiTag, *comm_);
855  } else {
856  // We don't need a send in the self-message case. If this
857  // process will send a message to itself in the communication
858  // pattern, then the last element of lengthsFrom_ and
859  // procsFrom_ corresponds to the self-message. Of course
860  // this process knows how long the message is, and the process
861  // ID is its own process ID.
862  lengthsFrom_[numReceives_ - 1] = lengthsTo_[i];
863  procsFrom_[numReceives_ - 1] = myRank;
864  }
865  }
866 
867  //
868  // Wait on all the receives. When they arrive, check the status
869  // output of wait() for the receiving process ID, unpack the
870  // request buffers into lengthsFrom_, and set procsFrom_ from the
871  // status.
872  //
873  waitAll(*comm_, requests(), statuses());
874  for (size_t i = 0; i < actualNumReceives; ++i) {
875  lengthsFrom_[i] = *lengthsFromBuffers[i];
876  procsFrom_[i] = statuses[i]->getSourceRank();
877  }
878 
879  // Sort the procsFrom_ array, and apply the same permutation to
880  // lengthsFrom_. This ensures that procsFrom_[i] and
881  // lengthsFrom_[i] refer to the same thing.
882  sort2(procsFrom_.begin(), procsFrom_.end(), lengthsFrom_.begin());
883 
884  // Compute indicesFrom_
885  totalReceiveLength_ =
886  std::accumulate(lengthsFrom_.begin(), lengthsFrom_.end(), 0);
887  indicesFrom_.clear();
888 
889  startsFrom_.clear();
890  startsFrom_.reserve(numReceives_);
891  for (size_t i = 0, j = 0; i < numReceives_; ++i) {
892  startsFrom_.push_back(j);
893  j += lengthsFrom_[i];
894  }
895 
896  if (sendMessageToSelf_) {
897  --numReceives_;
898  }
899 }
900 
901 void DistributorPlan::setParameterList(const Teuchos::RCP<Teuchos::ParameterList>& plist) {
902  using std::endl;
903  using Teuchos::FancyOStream;
904  using Teuchos::getIntegralValue;
905  using Teuchos::ParameterList;
906  using Teuchos::parameterList;
907  using Teuchos::RCP;
908 
909  if (!plist.is_null()) {
910  RCP<const ParameterList> validParams = getValidParameters();
911  plist->validateParametersAndSetDefaults(*validParams);
912 
913  const Details::EDistributorSendType sendType =
914  getIntegralValue<Details::EDistributorSendType>(*plist, "Send type");
915 
916  // Now that we've validated the input list, save the results.
917  sendType_ = sendType;
918 
919 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
920  initializeMpiAdvance();
921 #endif
922 
923  // ParameterListAcceptor semantics require pointer identity of the
924  // sublist passed to setParameterList(), so we save the pointer.
925  this->setMyParamList(plist);
926 
927 #if defined(HAVE_TPETRA_MPI)
928  maybeInitializeRoots();
929 #endif
930  }
931 }
932 
933 Teuchos::Array<std::string> distributorSendTypes() {
934  Teuchos::Array<std::string> sendTypes;
935  sendTypes.push_back("Isend");
936  sendTypes.push_back("Send");
937  sendTypes.push_back("Alltoall");
938 #if defined(HAVE_TPETRA_MPI)
939  sendTypes.push_back("Ialltofewv");
940 #endif
941 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
942  sendTypes.push_back("MpiAdvanceAlltoall");
943  sendTypes.push_back("MpiAdvanceNbralltoallv");
944 #endif
945  return sendTypes;
946 }
947 
948 Teuchos::Array<EDistributorSendType> distributorSendTypeEnums() {
949  Teuchos::Array<EDistributorSendType> res;
950  res.push_back(DISTRIBUTOR_ISEND);
951  res.push_back(DISTRIBUTOR_SEND);
952  res.push_back(DISTRIBUTOR_ALLTOALL);
953 #if defined(HAVE_TPETRA_MPI)
954  res.push_back(DISTRIBUTOR_IALLTOFEWV);
955 #endif
956 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
957  res.push_back(DISTRIBUTOR_MPIADVANCE_ALLTOALL);
958  res.push_back(DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV);
959 #endif
960  return res;
961 }
962 
963 Teuchos::RCP<const Teuchos::ParameterList>
964 DistributorPlan::getValidParameters() const {
965  using Teuchos::Array;
966  using Teuchos::ParameterList;
967  using Teuchos::parameterList;
968  using Teuchos::RCP;
969  using Teuchos::setStringToIntegralParameter;
970 
971  Array<std::string> sendTypes = distributorSendTypes();
972  const Array<Details::EDistributorSendType> sendTypeEnums = distributorSendTypeEnums();
973 
974  const std::string validatedSendType = validSendTypeOrThrow(Behavior::defaultSendType());
975 
976  RCP<ParameterList> plist = parameterList("Tpetra::Distributor");
977 
978  setStringToIntegralParameter<Details::EDistributorSendType>("Send type",
979  validatedSendType,
980  "When using MPI, the variant of send to use in "
981  "do[Reverse]Posts()",
982  sendTypes(), sendTypeEnums(), plist.getRawPtr());
983  plist->set("Timer Label", "", "Label for Time Monitor output");
984 
985  return Teuchos::rcp_const_cast<const ParameterList>(plist);
986 }
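// Usage sketch (illustrative only): a caller could select the send type via
// the validated "Send type" parameter, e.g.
//   Teuchos::RCP<Teuchos::ParameterList> plist = Teuchos::parameterList();
//   plist->set("Send type", "Isend");
//   plan.setParameterList(plist);
// where "plan" is a DistributorPlan instance.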
987 
988 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
989 
990 // Used by Teuchos::RCP to clean up an owned MPIX_Comm*
991 struct MpixCommDeallocator {
992  void free(MPIX_Comm** comm) const {
993  MPIX_Comm_free(comm);
994  }
995 };
996 
997 void DistributorPlan::initializeMpiAdvance() {
998  // Assert that the MPIX communicator is null; if it is not, we need to figure out why.
999  TEUCHOS_ASSERT(mpixComm_.is_null());
1000 
1001  // Use the members to initialize the graph for neighborhood mode, or just the MPIX communicator for non-neighborhood mode.
1002  Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm_);
1003  Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm = mpiComm->getRawMpiComm();
1004  int err = 0;
1005  if (sendType_ == DISTRIBUTOR_MPIADVANCE_ALLTOALL ||
1006  sendType_ == DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) {
1007  MPIX_Comm** mpixComm = new (MPIX_Comm*);
1008  err = MPIX_Comm_init(mpixComm, (*rawComm)());
1009  mpixComm_ = Teuchos::RCP(mpixComm,
1010  MpixCommDeallocator(),
1011  true /*take ownership*/
1012  );
1013  }
1014 
1015  TEUCHOS_ASSERT(err == 0);
1016 }
1017 #endif
1018 
1019 #if defined(HAVE_TPETRA_MPI)
1020 // FIXME: probably need to rename this function since it might change the sendType
1021 void DistributorPlan::maybeInitializeRoots() {
1022  // Only IALLTOFEWV needs to know the roots
1023  if (DISTRIBUTOR_IALLTOFEWV != sendType_) {
1024  roots_.clear();
1025  return;
1026  }
1027 
1028  ProfilingRegion region_maybeInitializeRoots("Tpetra::DistributorPlan::maybeInitializeRoots");
1029 
1030  // send my number of recvs to everyone
1031  const int numRecvs = (int)(getNumReceives() + (hasSelfMessage() ? 1 : 0));
1032  std::vector<int> sendbuf(comm_->getSize(), numRecvs);
1033  std::vector<int> recvbuf(comm_->getSize());
1034 
1035  // FIXME: is there a more natural way to do this?
1036  // Maybe MPI_Allreduce is better; we only care whether anyone is sending anything to each process.
1037  // We just need to know all processes that receive anything (including a self message).
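  // Example (illustrative): on 4 ranks where only ranks 0 and 2 receive any
  // data, every rank sees recvbuf entries greater than zero at positions 0
  // and 2, so each rank computes roots_ = {0, 2}.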
1038  Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm_);
1039  Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm = mpiComm->getRawMpiComm();
1040  MPI_Comm comm = (*rawComm)();
1041  MPI_Alltoall(sendbuf.data(), 1, MPI_INT, recvbuf.data(), 1, MPI_INT, comm);
1042 
1043  roots_.clear();
1044  for (size_t root = 0; root < recvbuf.size(); ++root) {
1045  if (recvbuf[root] > 0) {
1046  roots_.push_back(root);
1047  }
1048  }
1049 
1050  // In "slow-path" communication, the data is not blocked according to sending / receiving proc.
1051  // The root-detection algorithm expects data to be blocked, so disable.
1052  int slow = !getIndicesTo().is_null() ? 1 : 0;
1053  MPI_Allreduce(MPI_IN_PLACE, &slow, 1, MPI_INT, MPI_LOR, comm);
1054  if (slow) {
1055  if (Behavior::verbose())
1056  {
1057  std::stringstream ss;
1058  ss << __FILE__ << ":" << __LINE__ << " " << comm_->getRank() << ": WARNING: Ialltofewv send mode set, but at least one rank's data is not grouped by rank. Setting send type to \"Send\"" << std::endl;
1059  std::cerr << ss.str();
1060  }
1061  }
1062 
1063  roots_.clear();
1064  sendType_ = DISTRIBUTOR_SEND;
1065  }
1066 
1067  // Ialltofewv is intended for patterns with only a few roots, e.g., someone
1068  // implementing a gather by reusing the import/export machinery; with many
1069  // roots it won't work well, so fall back to "Send" if the number of roots is at least sqrt of the comm size.
1070  if (roots_.size() * roots_.size() >= size_t(comm_->getSize())) {
1071  if (Behavior::verbose()) {
1072  std::stringstream ss;
1073  ss << __FILE__ << ":" << __LINE__ << " " << comm_->getRank() << ": WARNING (Ialltofewv send type): too many roots (" << roots_.size() << ") for " << comm_->getSize() << " ranks. Setting send-type to \"Send\"" << std::endl;
1074  std::cerr << ss.str();
1075  }
1076  roots_.clear();
1077  sendType_ = DISTRIBUTOR_SEND;
1078  }
1079 }
1080 #endif // HAVE_TPETRA_MPI
1081 
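// The get{Import,Export}ViewLimits functions below compute offset/length
// pairs that delimit each message's packets within a single contiguous
// import or export buffer (or each entry's packets, in the non-contiguous
// export case where indicesTo_ is non-empty).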
1082 DistributorPlan::SubViewLimits DistributorPlan::getImportViewLimits(size_t numPackets) const {
1083  const size_t actualNumReceives = getNumReceives() + (hasSelfMessage() ? 1 : 0);
1084 
1085  IndexView importStarts(actualNumReceives);
1086  IndexView importLengths(actualNumReceives);
1087 
1088  size_t offset = 0;
1089  for (size_t i = 0; i < actualNumReceives; ++i) {
1090  importStarts[i] = offset;
1091  offset += getLengthsFrom()[i] * numPackets;
1092  importLengths[i] = getLengthsFrom()[i] * numPackets;
1093  }
1094  return std::make_pair(importStarts, importLengths);
1095 }
1096 
1097 DistributorPlan::SubViewLimits DistributorPlan::getImportViewLimits(const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID) const {
1098  const size_t actualNumReceives = getNumReceives() + (hasSelfMessage() ? 1 : 0);
1099 
1100  IndexView importStarts(actualNumReceives);
1101  IndexView importLengths(actualNumReceives);
1102 
1103  size_t offset = 0;
1104  size_t curLIDoffset = 0;
1105  for (size_t i = 0; i < actualNumReceives; ++i) {
1106  size_t totalPacketsFrom_i = 0;
1107  for (size_t j = 0; j < getLengthsFrom()[i]; ++j) {
1108  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset + j];
1109  }
1110  curLIDoffset += getLengthsFrom()[i];
1111  importStarts[i] = offset;
1112  offset += totalPacketsFrom_i;
1113  importLengths[i] = totalPacketsFrom_i;
1114  }
1115  return std::make_pair(importStarts, importLengths);
1116 }
1117 
1118 DistributorPlan::SubViewLimits DistributorPlan::getExportViewLimits(size_t numPackets) const {
1119  if (getIndicesTo().is_null()) {
1120  const size_t actualNumSends = getNumSends() + (hasSelfMessage() ? 1 : 0);
1121  IndexView exportStarts(actualNumSends);
1122  IndexView exportLengths(actualNumSends);
1123  for (size_t pp = 0; pp < actualNumSends; ++pp) {
1124  exportStarts[pp] = getStartsTo()[pp] * numPackets;
1125  exportLengths[pp] = getLengthsTo()[pp] * numPackets;
1126  }
1127  return std::make_pair(exportStarts, exportLengths);
1128  } else {
1129  const size_t numIndices = getIndicesTo().size();
1130  IndexView exportStarts(numIndices);
1131  IndexView exportLengths(numIndices);
1132  for (size_t j = 0; j < numIndices; ++j) {
1133  exportStarts[j] = getIndicesTo()[j] * numPackets;
1134  exportLengths[j] = numPackets;
1135  }
1136  return std::make_pair(exportStarts, exportLengths);
1137  }
1138 }
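// Example (illustrative): with startsTo_ = [0, 4], lengthsTo_ = [4, 2], and
// numPackets = 3, the contiguous (indicesTo_ empty) path above yields
// exportStarts = [0, 12] and exportLengths = [12, 6].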
1139 
1140 DistributorPlan::SubViewLimits DistributorPlan::getExportViewLimits(const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID) const {
1141  if (getIndicesTo().is_null()) {
1142  const size_t actualNumSends = getNumSends() + (hasSelfMessage() ? 1 : 0);
1143  IndexView exportStarts(actualNumSends);
1144  IndexView exportLengths(actualNumSends);
1145  size_t offset = 0;
1146  for (size_t pp = 0; pp < actualNumSends; ++pp) {
1147  size_t numPackets = 0;
1148  for (size_t j = getStartsTo()[pp];
1149  j < getStartsTo()[pp] + getLengthsTo()[pp]; ++j) {
1150  numPackets += numExportPacketsPerLID[j];
1151  }
1152  exportStarts[pp] = offset;
1153  offset += numPackets;
1154  exportLengths[pp] = numPackets;
1155  }
1156  return std::make_pair(exportStarts, exportLengths);
1157  } else {
1158  const size_t numIndices = getIndicesTo().size();
1159  IndexView exportStarts(numIndices);
1160  IndexView exportLengths(numIndices);
1161  size_t offset = 0;
1162  for (size_t j = 0; j < numIndices; ++j) {
1163  exportStarts[j] = offset;
1164  offset += numExportPacketsPerLID[j];
1165  exportLengths[j] = numExportPacketsPerLID[j];
1166  }
1167  return std::make_pair(exportStarts, exportLengths);
1168  }
1169 }
1170 
1171 } // namespace Details
1172 } // namespace Tpetra