Tpetra_Details_DistributorPlan.cpp
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #include "Tpetra_Details_DistributorPlan.hpp"
11 
12 #include "Tpetra_Details_Behavior.hpp"
13 #include "Teuchos_StandardParameterEntryValidators.hpp"
14 #include "Tpetra_Util.hpp"
15 #include "Tpetra_Details_Profiling.hpp"
16 #include <numeric>
17 
18 namespace Tpetra {
19 namespace Details {
20 
21 std::string
22 DistributorSendTypeEnumToString (EDistributorSendType sendType)
23 {
24  if (sendType == DISTRIBUTOR_ISEND) {
25  return "Isend";
26  }
27  else if (sendType == DISTRIBUTOR_SEND) {
28  return "Send";
29  }
30  else if (sendType == DISTRIBUTOR_ALLTOALL) {
31  return "Alltoall";
32  }
33 #if defined(HAVE_TPETRA_MPI)
34  else if (sendType == DISTRIBUTOR_IALLTOFEWV) {
35  return "Ialltofewv";
36  }
37 #endif
38 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
39  else if (sendType == DISTRIBUTOR_MPIADVANCE_ALLTOALL) {
40  return "MpiAdvanceAlltoall";
41  }
42  else if (sendType == DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) {
43  return "MpiAdvanceNbralltoallv";
44  }
45 #endif
46  else {
47  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid "
48  "EDistributorSendType enum value " << sendType << ".");
49  }
50 }
51 
52 EDistributorSendType
53 DistributorSendTypeStringToEnum (const std::string_view s)
54 {
55  if (s == "Isend") return DISTRIBUTOR_ISEND;
56  if (s == "Send") return DISTRIBUTOR_SEND;
57  if (s == "Alltoall") return DISTRIBUTOR_ALLTOALL;
58 #if defined(HAVE_TPETRA_MPI)
59  if (s == "Ialltofewv") return DISTRIBUTOR_IALLTOFEWV;
60 #endif
61 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
62  if (s == "MpiAdvanceAlltoall") return DISTRIBUTOR_MPIADVANCE_ALLTOALL;
63  if (s == "MpiAdvanceNbralltoallv") return DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV;
64 #endif
65  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid string to convert to EDistributorSendType enum value: " << s);
66 }
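// Illustrative note (not from the original source): for every name listed in
// distributorSendTypes(), these two functions are inverses. For example,
// DistributorSendTypeStringToEnum("Alltoall") yields DISTRIBUTOR_ALLTOALL, and
// DistributorSendTypeEnumToString(DISTRIBUTOR_ALLTOALL) yields "Alltoall".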
67 
69 const std::string &validSendTypeOrThrow(const std::string &s) {
70  const auto valids = distributorSendTypes();
71  if (std::find(valids.begin(), valids.end(), s) == valids.end()) {
72  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, "Invalid string for EDistributorSendType enum value: " << s);
73  }
74  return s;
75 }
76 
77 std::string
78 DistributorHowInitializedEnumToString (EDistributorHowInitialized how)
79 {
80  switch (how) {
81  case Details::DISTRIBUTOR_NOT_INITIALIZED:
82  return "Not initialized yet";
83  case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS:
84  return "By createFromSends";
85  case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS:
86  return "By createFromRecvs";
87  case Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS:
88  return "By createFromSendsAndRecvs";
89  case Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE:
90  return "By createReverseDistributor";
91  case Details::DISTRIBUTOR_INITIALIZED_BY_COPY:
92  return "By copy constructor";
93  default:
94  return "INVALID";
95  }
96 }
97 
98 DistributorPlan::DistributorPlan(Teuchos::RCP<const Teuchos::Comm<int>> comm)
99  : comm_(comm),
100 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
101  mpixComm_(Teuchos::null),
102 #endif
103  howInitialized_(DISTRIBUTOR_NOT_INITIALIZED),
104  reversePlan_(Teuchos::null),
105  sendType_(DistributorSendTypeStringToEnum(Behavior::defaultSendType())),
106  sendMessageToSelf_(false),
107  numSendsToOtherProcs_(0),
108  maxSendLength_(0),
109  numReceives_(0),
110  totalReceiveLength_(0)
111 { }
112 
113 DistributorPlan::DistributorPlan(const DistributorPlan& otherPlan)
114  : comm_(otherPlan.comm_),
115 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
116  mpixComm_(otherPlan.mpixComm_),
117 #endif
118  howInitialized_(DISTRIBUTOR_INITIALIZED_BY_COPY),
119  reversePlan_(otherPlan.reversePlan_),
120  sendType_(otherPlan.sendType_),
121  sendMessageToSelf_(otherPlan.sendMessageToSelf_),
122  numSendsToOtherProcs_(otherPlan.numSendsToOtherProcs_),
123  procIdsToSendTo_(otherPlan.procIdsToSendTo_),
124  startsTo_(otherPlan.startsTo_),
125  lengthsTo_(otherPlan.lengthsTo_),
126  maxSendLength_(otherPlan.maxSendLength_),
127  indicesTo_(otherPlan.indicesTo_),
128  numReceives_(otherPlan.numReceives_),
129  totalReceiveLength_(otherPlan.totalReceiveLength_),
130  lengthsFrom_(otherPlan.lengthsFrom_),
131  procsFrom_(otherPlan.procsFrom_),
132  startsFrom_(otherPlan.startsFrom_),
133  indicesFrom_(otherPlan.indicesFrom_)
134 #if defined(HAVE_TPETRACORE_MPI)
135  ,
136  roots_(otherPlan.roots_)
137 #endif
138 { }
139 
140 size_t DistributorPlan::createFromSends(const Teuchos::ArrayView<const int>& exportProcIDs) {
141  using Teuchos::outArg;
142  using Teuchos::REDUCE_MAX;
143  using Teuchos::reduceAll;
144  using std::endl;
145  const char rawPrefix[] = "Tpetra::DistributorPlan::createFromSends";
146 
147  const size_t numExports = exportProcIDs.size();
148  const int myProcID = comm_->getRank();
149  const int numProcs = comm_->getSize();
150  const bool debug = Details::Behavior::debug("Distributor");
151 
152  // exportProcIDs tells us the communication pattern for this
153  // distributor. It dictates the way that the export data will be
154  // interpreted in doPosts(). We want to perform at most one
155  // send per process in doPosts; this is for two reasons:
156  // * minimize latency / overhead in the comm routines (nice)
157  // * match the number of receives and sends between processes
158  // (necessary)
159  //
160  // Teuchos::Comm requires that the data for a send are contiguous
161  // in a send buffer. Therefore, if the data in the send buffer
162  // for doPosts() are not contiguous, they will need to be copied
163  // into a contiguous buffer. The user has specified this
164  // noncontiguous pattern and we can't do anything about it.
165  // However, if they do not provide an efficient pattern, we will
166  // warn them if one of the following compile-time options has been
167  // set:
168  // * HAVE_TPETRA_PRINT_EFFICIENCY_WARNINGS
169  //
170  // If the data are contiguous, then we can post the sends in situ
171  // (i.e., without needing to copy them into a send buffer).
172  //
173  // Determine contiguity. There are a number of ways to do this:
174  // * If the export IDs are sorted, then all exports to a
175  // particular proc must be contiguous. This is what Epetra does.
176  // * If the process ID of the current export has already appeared
177  // earlier in the list, then the immediately preceding entry must be
178  // to the same process. This tests contiguity, but not sortedness.
179  //
180  // Both of these tests require O(n), where n is the number of
181  // exports. However, the latter will positively identify a greater
182  // portion of contiguous patterns. We use the latter method.
183  //
184  // Check to see if values are grouped by procs without gaps
185  // If so, indices_to -> 0.
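 // Illustrative example (hypothetical input, not from the original source):
 // exportProcIDs = {0, 0, 2, 2, 2, 1} is grouped by process (even though it is
 // not sorted), so no send buffer is needed and indicesTo_ stays empty; whereas
 // exportProcIDs = {0, 2, 0} is not grouped (process 0 appears twice,
 // non-adjacently), so a send buffer and indicesTo_ are built below.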
186 
187  if (debug) {
188  // Test whether any process in the communicator got an invalid
189  // process ID. If badID != -1 on this process, then it equals
190  // this process' rank. The max of all badID over all processes
191  // is the max rank which has an invalid process ID.
192  int badID = -1;
193  for (size_t i = 0; i < numExports; ++i) {
194  const int exportID = exportProcIDs[i];
195  if (exportID >= numProcs || exportID < 0) {
196  badID = myProcID;
197  break;
198  }
199  }
200  int gbl_badID;
201  reduceAll<int, int> (*comm_, REDUCE_MAX, badID, outArg (gbl_badID));
202  TEUCHOS_TEST_FOR_EXCEPTION
203  (gbl_badID >= 0, std::runtime_error, rawPrefix << "Proc "
204  << gbl_badID << ", perhaps among other processes, got a bad "
205  "send process ID.");
206  }
207 
208  // Set up data structures for quick traversal of arrays.
209  // This contains the number of sends for each process ID.
210  //
211  // FIXME (mfh 20 Mar 2014) This is one of a few places in Tpetra
212  // that create an array of length the number of processes in the
213  // communicator (plus one). Given how this code uses this array,
214  // it should be straightforward to replace it with a hash table or
215  // some other more space-efficient data structure. In practice,
216  // most of the entries of starts should be zero for a sufficiently
217  // large process count, unless the communication pattern is dense.
218  // Note that it's important to be able to iterate through keys (i
219  // for which starts[i] is nonzero) in increasing order.
220  Teuchos::Array<size_t> starts (numProcs + 1, 0);
221 
222  // numActive is the number of sends that are not Null
223  size_t numActive = 0;
224  int needSendBuff = 0; // Boolean
225 
226  for (size_t i = 0; i < numExports; ++i) {
227  const int exportID = exportProcIDs[i];
228  if (exportID >= 0) {
229  // exportID is a valid process ID. Increment the number of
230  // messages this process will send to that process.
231  ++starts[exportID];
232 
233  // If we're sending more than one message to process exportID,
234  // then it is possible that the data are not contiguous.
235  // Check by seeing if the previous process ID in the list
236  // (exportProcIDs[i-1]) is the same. It's safe to use i-1,
237  // because if starts[exportID] > 1, then i must be > 1 (since
238  // the starts array was filled with zeros initially).
239 
240  // null entries break continuity.
241  // e.g., [ 0, 0, 0, 1, -99, 1, 2, 2, 2] is not contiguous
242  if (needSendBuff == 0 && starts[exportID] > 1 &&
243  exportID != exportProcIDs[i-1]) {
244  needSendBuff = 1;
245  }
246  ++numActive;
247  }
248  }
249 
250 #if defined(HAVE_TPETRA_PRINT_EFFICIENCY_WARNINGS)
251  {
252  int global_needSendBuff;
253  reduceAll<int, int> (*comm_, REDUCE_MAX, needSendBuff,
254  outArg (global_needSendBuff));
255  TPETRA_EFFICIENCY_WARNING(
256  global_needSendBuff != 0,
257  "::createFromSends: Grouping export IDs together by process rank often "
258  "improves performance.");
259  }
260 #endif
261 
262  // Determine from the caller's data whether or not the current
263  // process should send (a) message(s) to itself.
264  if (starts[myProcID] != 0) {
265  sendMessageToSelf_ = true;
266  }
267  else {
268  sendMessageToSelf_ = false;
269  }
270 
271  if (! needSendBuff) {
272  // grouped by proc, no send buffer or indicesTo_ needed
273  numSendsToOtherProcs_ = 0;
274  // Count total number of sends, i.e., total number of procs to
275  // which we are sending. This includes myself, if applicable.
276  for (int i = 0; i < numProcs; ++i) {
277  if (starts[i]) {
278  ++numSendsToOtherProcs_;
279  }
280  }
281 
282  // Not only do we not need these, but we must clear them, as
283  // empty status of indicesTo is a flag used later.
284  indicesTo_.resize(0);
285  // Size these to numSendsToOtherProcs_; note, at the moment, numSendsToOtherProcs_
286  // includes self sends. Set their values to zeros.
287  procIdsToSendTo_.assign(numSendsToOtherProcs_,0);
288  startsTo_.assign(numSendsToOtherProcs_,0);
289  lengthsTo_.assign(numSendsToOtherProcs_,0);
290 
291  // set startsTo to the offset for each send (i.e., each proc ID)
292  // set procsTo to the proc ID for each send
293  // in interpreting this code, remember that we are assuming contiguity
294  // that is why index skips through the ranks
295  {
296  size_t procIndex = 0;
297  for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
298  while (exportProcIDs[procIndex] < 0) {
299  ++procIndex; // skip all negative proc IDs
300  }
301  startsTo_[i] = procIndex;
302  int procID = exportProcIDs[procIndex];
303  procIdsToSendTo_[i] = procID;
304  procIndex += starts[procID];
305  }
306  }
307  // sort the startsTo and proc IDs together, in ascending order, according
308  // to proc IDs
309  if (numSendsToOtherProcs_ > 0) {
310  sort2(procIdsToSendTo_.begin(), procIdsToSendTo_.end(), startsTo_.begin());
311  }
312  // compute the maximum send length
313  maxSendLength_ = 0;
314  for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
315  int procID = procIdsToSendTo_[i];
316  lengthsTo_[i] = starts[procID];
317  if ((procID != myProcID) && (lengthsTo_[i] > maxSendLength_)) {
318  maxSendLength_ = lengthsTo_[i];
319  }
320  }
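 // Worked example for this fast path (hypothetical input): if
 // exportProcIDs = {5, 5, 2, 2, 2}, then after the sort above
 // procIdsToSendTo_ = {2, 5}, startsTo_ = {2, 0}, lengthsTo_ = {3, 2}, and
 // maxSendLength_ = 3 (assuming neither 2 nor 5 is this process' rank).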
321  }
322  else {
323  // not grouped by proc, need send buffer and indicesTo_
324 
325  // starts[i] is the number of sends to proc i
326  // numActive equals number of sends total, \sum_i starts[i]
327 
328  // this loop starts at starts[1], so explicitly check starts[0]
329  if (starts[0] == 0 ) {
330  numSendsToOtherProcs_ = 0;
331  }
332  else {
333  numSendsToOtherProcs_ = 1;
334  }
335  for (Teuchos::Array<size_t>::iterator i=starts.begin()+1,
336  im1=starts.begin();
337  i != starts.end(); ++i)
338  {
339  if (*i != 0) ++numSendsToOtherProcs_;
340  *i += *im1;
341  im1 = i;
342  }
343  // starts[i] now contains the number of exports to procs 0 through i
344 
345  for (Teuchos::Array<size_t>::reverse_iterator ip1=starts.rbegin(),
346  i=starts.rbegin()+1;
347  i != starts.rend(); ++i)
348  {
349  *ip1 = *i;
350  ip1 = i;
351  }
352  starts[0] = 0;
353  // starts[i] now contains the number of exports to procs 0 through
354  // i-1, i.e., all procs before proc i
355 
356  indicesTo_.resize(numActive);
357 
358  for (size_t i = 0; i < numExports; ++i) {
359  if (exportProcIDs[i] >= 0) {
360  // record the offset to the sendBuffer for this export
361  indicesTo_[starts[exportProcIDs[i]]] = i;
362  // now increment the offset for this proc
363  ++starts[exportProcIDs[i]];
364  }
365  }
366  // our send buffer will contain the export data for each of the procs
367  // we communicate with, in order by proc id
368  // sendBuffer = {proc_0_data, proc_1_data, ..., proc_np-1_data}
369  // indicesTo now maps each slot of the send buffer to the export that
370  // fills it:
371  // send buffer slot j holds the data of export indicesTo[j]
372  //
373  // starts[i] once again contains the number of exports to
374  // procs 0 through i
375  for (int proc = numProcs-1; proc != 0; --proc) {
376  starts[proc] = starts[proc-1];
377  }
378  starts.front() = 0;
379  starts[numProcs] = numActive;
380  //
381  // starts[proc] once again contains the number of exports to
382  // procs 0 through proc-1
383  // i.e., the start of my data in the sendBuffer
384 
385  // this contains invalid data at procs we don't care about, that is okay
386  procIdsToSendTo_.resize(numSendsToOtherProcs_);
387  startsTo_.resize(numSendsToOtherProcs_);
388  lengthsTo_.resize(numSendsToOtherProcs_);
389 
390  // for each group of sends/exports, record the destination proc,
391  // the length, and the offset for this send into the
392  // send buffer (startsTo_)
393  maxSendLength_ = 0;
394  size_t snd = 0;
395  for (int proc = 0; proc < numProcs; ++proc ) {
396  if (starts[proc+1] != starts[proc]) {
397  lengthsTo_[snd] = starts[proc+1] - starts[proc];
398  startsTo_[snd] = starts[proc];
399  // record max length for all off-proc sends
400  if ((proc != myProcID) && (lengthsTo_[snd] > maxSendLength_)) {
401  maxSendLength_ = lengthsTo_[snd];
402  }
403  procIdsToSendTo_[snd] = proc;
404  ++snd;
405  }
406  }
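 // Worked example for this slow path (hypothetical input): with 4 processes
 // and exportProcIDs = {2, 0, 2, 1}, the send buffer is ordered by process, so
 // indicesTo_ = {1, 3, 0, 2} (send buffer slot j holds export indicesTo_[j]),
 // procIdsToSendTo_ = {0, 1, 2}, startsTo_ = {0, 1, 2}, lengthsTo_ = {1, 1, 2},
 // and maxSendLength_ = 2 (assuming this process is not rank 0, 1, or 2).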
407  }
408 
409  if (sendMessageToSelf_) {
410  --numSendsToOtherProcs_;
411  }
412 
413  // Invert map to see what msgs are received and what length
414  computeReceives();
415 
416 #if defined(HAVE_TPETRA_MPI)
417  maybeInitializeRoots();
418 #endif
419 
420  // createFromRecvs() calls createFromSends(), but will set
421  // howInitialized_ again after calling createFromSends().
422  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS;
423 
424  return totalReceiveLength_;
425 }
426 
427 void DistributorPlan::createFromRecvs(const Teuchos::ArrayView<const int>& remoteProcIDs)
428 {
429  *this = *getReversePlan();
430  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
431 }
432 
433 void DistributorPlan::createFromSendsAndRecvs(const Teuchos::ArrayView<const int>& exportProcIDs,
434  const Teuchos::ArrayView<const int>& remoteProcIDs)
435 {
436  // Note: exportProcIDs and remoteProcIDs _must_ be lists that have an
437  // entry for each GID. If the export/remote proc IDs are taken from the
438  // getProcs{From|To} lists extracted from a previous distributor, this
439  // will give a wrong answer, because those lists have a unique entry per
440  // process ID. A version of this that takes lengthsTo and lengthsFrom
441  // should be written.
442 
443  howInitialized_ = Tpetra::Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS;
444 
445  int myProcID = comm_->getRank ();
446  int numProcs = comm_->getSize();
447 
448  const size_t numExportIDs = exportProcIDs.size();
449  Teuchos::Array<size_t> starts (numProcs + 1, 0);
450 
451  size_t numActive = 0;
452  int needSendBuff = 0; // Boolean
453 
454  for(size_t i = 0; i < numExportIDs; i++ )
455  {
456  if( needSendBuff==0 && i && (exportProcIDs[i] < exportProcIDs[i-1]) )
457  needSendBuff = 1;
458  if( exportProcIDs[i] >= 0 )
459  {
460  ++starts[ exportProcIDs[i] ];
461  ++numActive;
462  }
463  }
464 
465  sendMessageToSelf_ = ( starts[myProcID] != 0 ) ? 1 : 0;
466 
467  numSendsToOtherProcs_ = 0;
468 
469  if( needSendBuff ) // not grouped by proc; need send buffer and indicesTo_
470  {
471  if (starts[0] == 0 ) {
472  numSendsToOtherProcs_ = 0;
473  }
474  else {
475  numSendsToOtherProcs_ = 1;
476  }
477  for (Teuchos::Array<size_t>::iterator i=starts.begin()+1,
478  im1=starts.begin();
479  i != starts.end(); ++i)
480  {
481  if (*i != 0) ++numSendsToOtherProcs_;
482  *i += *im1;
483  im1 = i;
484  }
485  // starts[i] now contains the number of exports to procs 0 through i
486 
487  for (Teuchos::Array<size_t>::reverse_iterator ip1=starts.rbegin(),
488  i=starts.rbegin()+1;
489  i != starts.rend(); ++i)
490  {
491  *ip1 = *i;
492  ip1 = i;
493  }
494  starts[0] = 0;
495  // starts[i] now contains the number of exports to procs 0 through
496  // i-1, i.e., all procs before proc i
497 
498  indicesTo_.resize(numActive);
499 
500  for (size_t i = 0; i < numExportIDs; ++i) {
501  if (exportProcIDs[i] >= 0) {
502  // record the offset to the sendBuffer for this export
503  indicesTo_[starts[exportProcIDs[i]]] = i;
504  // now increment the offset for this proc
505  ++starts[exportProcIDs[i]];
506  }
507  }
508  for (int proc = numProcs-1; proc != 0; --proc) {
509  starts[proc] = starts[proc-1];
510  }
511  starts.front() = 0;
512  starts[numProcs] = numActive;
513  procIdsToSendTo_.resize(numSendsToOtherProcs_);
514  startsTo_.resize(numSendsToOtherProcs_);
515  lengthsTo_.resize(numSendsToOtherProcs_);
516  maxSendLength_ = 0;
517  size_t snd = 0;
518  for (int proc = 0; proc < numProcs; ++proc ) {
519  if (starts[proc+1] != starts[proc]) {
520  lengthsTo_[snd] = starts[proc+1] - starts[proc];
521  startsTo_[snd] = starts[proc];
522  // record max length for all off-proc sends
523  if ((proc != myProcID) && (lengthsTo_[snd] > maxSendLength_)) {
524  maxSendLength_ = lengthsTo_[snd];
525  }
526  procIdsToSendTo_[snd] = proc;
527  ++snd;
528  }
529  }
530  }
531  else {
532  // grouped by proc, no send buffer or indicesTo_ needed
533  numSendsToOtherProcs_ = 0;
534  // Count total number of sends, i.e., total number of procs to
535  // which we are sending. This includes myself, if applicable.
536  for (int i = 0; i < numProcs; ++i) {
537  if (starts[i]) {
538  ++numSendsToOtherProcs_;
539  }
540  }
541 
542  // Not only do we not need these, but we must clear them, as
543  // empty status of indicesTo is a flag used later.
544  indicesTo_.resize(0);
545  // Size these to numSendsToOtherProcs_; note, at the moment, numSendsToOtherProcs_
546  // includes self sends. Set their values to zeros.
547  procIdsToSendTo_.assign(numSendsToOtherProcs_,0);
548  startsTo_.assign(numSendsToOtherProcs_,0);
549  lengthsTo_.assign(numSendsToOtherProcs_,0);
550 
551  // set startsTo to the offset for each send (i.e., each proc ID)
552  // set procsTo to the proc ID for each send
553  // in interpreting this code, remember that we are assuming contiguity
554  // that is why index skips through the ranks
555  {
556  size_t procIndex = 0;
557  for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
558  while (exportProcIDs[procIndex] < 0) {
559  ++procIndex; // skip all negative proc IDs
560  }
561  startsTo_[i] = procIndex;
562  int procID = exportProcIDs[procIndex];
563  procIdsToSendTo_[i] = procID;
564  procIndex += starts[procID];
565  }
566  }
567  // sort the startsTo and proc IDs together, in ascending order, according
568  // to proc IDs
569  if (numSendsToOtherProcs_ > 0) {
570  sort2(procIdsToSendTo_.begin(), procIdsToSendTo_.end(), startsTo_.begin());
571  }
572  // compute the maximum send length
573  maxSendLength_ = 0;
574  for (size_t i = 0; i < numSendsToOtherProcs_; ++i) {
575  int procID = procIdsToSendTo_[i];
576  lengthsTo_[i] = starts[procID];
577  if ((procID != myProcID) && (lengthsTo_[i] > maxSendLength_)) {
578  maxSendLength_ = lengthsTo_[i];
579  }
580  }
581  }
582 
583 
584  numSendsToOtherProcs_ -= sendMessageToSelf_;
585  std::vector<int> recv_list;
586  recv_list.reserve(numSendsToOtherProcs_); //reserve an initial guess for size needed
587 
588  int last_pid=-2;
589  for(int i=0; i<remoteProcIDs.size(); i++) {
590  if(remoteProcIDs[i]>last_pid) {
591  recv_list.push_back(remoteProcIDs[i]);
592  last_pid = remoteProcIDs[i];
593  }
594  else if (remoteProcIDs[i]<last_pid)
595  throw std::runtime_error("Tpetra::Distributor::createFromSendsAndRecvs expected RemotePIDs to be in sorted order");
596  }
597  numReceives_ = recv_list.size();
598  if(numReceives_) {
599  procsFrom_.assign(numReceives_,0);
600  lengthsFrom_.assign(numReceives_,0);
601  indicesFrom_.assign(numReceives_,0);
602  startsFrom_.assign(numReceives_,0);
603  }
604  for(size_t i=0,j=0; i<numReceives_; ++i) {
605  int jlast=j;
606  procsFrom_[i] = recv_list[i];
607  startsFrom_[i] = j;
608  for( ; j<(size_t)remoteProcIDs.size() &&
609  remoteProcIDs[jlast]==remoteProcIDs[j] ; j++){;}
610  lengthsFrom_[i] = j-jlast;
611  }
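 // Worked example (hypothetical input): if remoteProcIDs = {0, 0, 3, 3, 3, 4},
 // then recv_list = {0, 3, 4}, so procsFrom_ = {0, 3, 4}, startsFrom_ = {0, 2, 5},
 // lengthsFrom_ = {2, 3, 1}, and totalReceiveLength_ is set to 6 below.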
612  totalReceiveLength_ = remoteProcIDs.size();
613  indicesFrom_.clear ();
614  numReceives_-=sendMessageToSelf_;
615 
616 #if defined(HAVE_TPETRA_MPI)
617  maybeInitializeRoots();
618 #endif
619 }
620 
621 Teuchos::RCP<DistributorPlan> DistributorPlan::getReversePlan() const {
622  if (reversePlan_.is_null()) createReversePlan();
623  return reversePlan_;
624 }
625 
626 void DistributorPlan::createReversePlan() const
627 {
628  reversePlan_ = Teuchos::rcp(new DistributorPlan(comm_));
629  reversePlan_->howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE;
630  reversePlan_->sendType_ = sendType_;
631 
632 #if defined(HAVE_TPETRACORE_MPI)
633  // If the forward plan matches an all-to-few communication pattern,
634  // the reverse plan is few-to-all, so don't use a special all-to-few
635  // implementation for it
636  if (DISTRIBUTOR_IALLTOFEWV == sendType_) {
637  if (Behavior::verbose()) {
638  std::stringstream ss;
639  ss << __FILE__ << ":" << __LINE__ << " WARNING (Ialltofewv send type): using default for reversed Ialltofewv\n";
640  std::cerr << ss.str();
641  }
642 
643  reversePlan_->sendType_ = DistributorSendTypeStringToEnum(Behavior::defaultSendType());
644  }
645 #endif
646 
647 
648  // The total length of all the sends of this DistributorPlan. We
649  // calculate it because it's the total length of all the receives
650  // of the reverse DistributorPlan.
651  size_t totalSendLength =
652  std::accumulate(lengthsTo_.begin(), lengthsTo_.end(), 0);
653 
654  // The maximum length of any of the receives of this DistributorPlan.
655  // We calculate it because it's the maximum length of any of the
656  // sends of the reverse DistributorPlan.
657  size_t maxReceiveLength = 0;
658  const int myProcID = comm_->getRank();
659  for (size_t i=0; i < numReceives_; ++i) {
660  if (procsFrom_[i] != myProcID) {
661  // Don't count receives for messages sent by myself to myself.
662  if (lengthsFrom_[i] > maxReceiveLength) {
663  maxReceiveLength = lengthsFrom_[i];
664  }
665  }
666  }
667 
668  reversePlan_->sendMessageToSelf_ = sendMessageToSelf_;
669  reversePlan_->numSendsToOtherProcs_ = numReceives_;
670  reversePlan_->procIdsToSendTo_ = procsFrom_;
671  reversePlan_->startsTo_ = startsFrom_;
672  reversePlan_->lengthsTo_ = lengthsFrom_;
673  reversePlan_->maxSendLength_ = maxReceiveLength;
674  reversePlan_->indicesTo_ = indicesFrom_;
675  reversePlan_->numReceives_ = numSendsToOtherProcs_;
676  reversePlan_->totalReceiveLength_ = totalSendLength;
677  reversePlan_->lengthsFrom_ = lengthsTo_;
678  reversePlan_->procsFrom_ = procIdsToSendTo_;
679  reversePlan_->startsFrom_ = startsTo_;
680  reversePlan_->indicesFrom_ = indicesTo_;
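 // Illustrative example (hypothetical): if this plan sends 3 entries to
 // process 7 and receives 2 entries from process 4, then the reverse plan
 // sends 2 entries to process 4 and receives 3 entries from process 7; the
 // send-side and receive-side arrays above simply trade places.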
681 
682 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
683  // is there a smarter way to do this
684  reversePlan_->initializeMpiAdvance();
685 #endif
686 
687 #if defined(HAVE_TPETRA_MPI)
688  reversePlan_->maybeInitializeRoots();
689 #endif
690 }
691 
692 void DistributorPlan::computeReceives()
693 {
694  using Teuchos::Array;
695  using Teuchos::ArrayRCP;
696  using Teuchos::as;
697  using Teuchos::CommStatus;
698  using Teuchos::CommRequest;
699  using Teuchos::ireceive;
700  using Teuchos::RCP;
701  using Teuchos::rcp;
702  using Teuchos::REDUCE_SUM;
703  using Teuchos::receive;
704  using Teuchos::reduce;
705  using Teuchos::scatter;
706  using Teuchos::send;
707  using Teuchos::waitAll;
708 
709  const int myRank = comm_->getRank();
710  const int numProcs = comm_->getSize();
711 
712  const int mpiTag = DEFAULT_MPI_TAG;
713 
714  // toProcsFromMe[i] == the number of messages sent by this process
715  // to process i. The data in numSendsToOtherProcs_, procIdsToSendTo_, and lengthsTo_
716  // concern the contiguous sends. Therefore, each process will be
717  // listed in procIdsToSendTo_ at most once, and so toProcsFromMe[i] will
718  // either be 0 or 1.
719  {
720  Array<int> toProcsFromMe (numProcs, 0);
721 #ifdef HAVE_TPETRA_DEBUG
722  bool counting_error = false;
723 #endif // HAVE_TPETRA_DEBUG
724  for (size_t i = 0; i < (numSendsToOtherProcs_ + (sendMessageToSelf_ ? 1 : 0)); ++i) {
725 #ifdef HAVE_TPETRA_DEBUG
726  if (toProcsFromMe[procIdsToSendTo_[i]] != 0) {
727  counting_error = true;
728  }
729 #endif // HAVE_TPETRA_DEBUG
730  toProcsFromMe[procIdsToSendTo_[i]] = 1;
731  }
732 #ifdef HAVE_TPETRA_DEBUG
733  // Note that SHARED_TEST_FOR_EXCEPTION does a global reduction
734  SHARED_TEST_FOR_EXCEPTION(counting_error, std::logic_error,
735  "Tpetra::Distributor::computeReceives: There was an error on at least "
736  "one process in counting the number of messages sent by that process to "
737  "the other processes. Please report this bug to the Tpetra developers.",
738  *comm_);
739 #endif // HAVE_TPETRA_DEBUG
740 
741  // Compute the number of receives that this process needs to
742  // post. The number of receives includes any self sends (i.e.,
743  // messages sent by this process to itself).
744  //
745  // (We will use numReceives_ below to post exactly that
746  // number of receives, with MPI_ANY_SOURCE as the sending rank.
747  // This will tell us from which processes this process expects
748  // to receive, and how many packets of data we expect to receive
749  // from each process.)
750  //
751  // toProcsFromMe[i] is the number of messages sent by this
752  // process to process i. Compute the sum (elementwise) of all
753  // the toProcsFromMe arrays on all processes in the
754  // communicator. If the array x is that sum, then if this
755  // process has rank j, x[j] is the number of messages sent
756  // to process j, that is, the number of receives on process j
757  // (including any messages sent by process j to itself).
758  //
759  // Yes, this requires storing and operating on an array of
760  // length P, where P is the number of processes in the
761  // communicator. Epetra does this too. Avoiding this O(P)
762  // memory bottleneck would require some research.
763  //
764  // mfh 09 Jan 2012, 15 Jul 2015: There are three ways to
765  // implement this O(P) memory algorithm.
766  //
767  // 1. Use MPI_Reduce and MPI_Scatter: reduce on the root
768  // process (0) from toProcsFromMe, to numRecvsOnEachProc.
769  // Then, scatter the latter, so that each process p gets
770  // numRecvsOnEachProc[p].
771  //
772  // 2. Like #1, but use MPI_Reduce_scatter instead of
773  // MPI_Reduce and MPI_Scatter. MPI_Reduce_scatter might be
774  // optimized to reduce the number of messages, but
775  // MPI_Reduce_scatter is more general than we need (it
776  // allows the equivalent of MPI_Scatterv). See Bug 6336.
777  //
778  // 3. Do an all-reduce on toProcsFromMe, and let my process
779  // (with rank myRank) get numReceives_ from
780  // toProcsFromMe[myRank]. The HPCCG miniapp uses the
781  // all-reduce method.
782  //
783  // Approaches 1 and 3 have the same critical path length.
784  // However, #3 moves more data. This is because the final
785  // result is just one integer, but #3 moves a whole array of
786  // results to all the processes. This is why we use Approach 1
787  // here.
788  //
789  // mfh 12 Apr 2013: See discussion in createFromSends() about
790  // how we could use this communication to propagate an error
791  // flag for "free" in a release build.
792 
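 // Worked example of Approach 1 (hypothetical 3-process pattern): if process 0
 // sends to {1, 2}, process 1 sends to {0}, and process 2 sends to {2} (itself),
 // then toProcsFromMe is [0, 1, 1] on process 0, [1, 0, 0] on process 1, and
 // [0, 0, 1] on process 2. The elementwise sum is [1, 1, 2], so after the
 // reduce and scatter below, numReceives_ is 1 on processes 0 and 1, and 2 on
 // process 2.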
793  const int root = 0; // rank of root process of the reduction
794  Array<int> numRecvsOnEachProc; // temp; only needed on root
795  if (myRank == root) {
796  numRecvsOnEachProc.resize (numProcs);
797  }
798  int numReceivesAsInt = 0; // output
799  reduce<int, int> (toProcsFromMe.getRawPtr (),
800  numRecvsOnEachProc.getRawPtr (),
801  numProcs, REDUCE_SUM, root, *comm_);
802  scatter<int, int> (numRecvsOnEachProc.getRawPtr (), 1,
803  &numReceivesAsInt, 1, root, *comm_);
804  numReceives_ = static_cast<size_t> (numReceivesAsInt);
805  }
806 
807  // Now we know numReceives_, which is this process' number of
808  // receives. Allocate the lengthsFrom_ and procsFrom_ arrays
809  // with this number of entries.
810  lengthsFrom_.assign (numReceives_, 0);
811  procsFrom_.assign (numReceives_, 0);
812 
813  //
814  // Ask (via nonblocking receive) each process from which we are
815  // receiving how many packets we should expect from it in the
816  // communication pattern.
817  //
818 
819  // At this point, numReceives_ includes any self message that
820  // there may be. At the end of this routine, we'll subtract off
821  // the self message (if there is one) from numReceives_. In this
822  // routine, we don't need to receive a message from ourselves in
823  // order to figure out our lengthsFrom_ and source process ID; we
824  // can just ask ourselves directly. Thus, the actual number of
825  // nonblocking receives we post here does not include the self
826  // message.
827  const size_t actualNumReceives = numReceives_ - (sendMessageToSelf_ ? 1 : 0);
828 
829  // Teuchos' wrapper for nonblocking receives requires receive
830  // buffers that it knows won't go away. This is why we use RCPs,
831  // one RCP per nonblocking receive request. They get allocated in
832  // the loop below.
833  Array<RCP<CommRequest<int> > > requests (actualNumReceives);
834  Array<ArrayRCP<size_t> > lengthsFromBuffers (actualNumReceives);
835  Array<RCP<CommStatus<int> > > statuses (actualNumReceives);
836 
837  // Teuchos::Comm treats a negative process ID as MPI_ANY_SOURCE
838  // (receive data from any process).
839 #ifdef HAVE_MPI
840  const int anySourceProc = MPI_ANY_SOURCE;
841 #else
842  const int anySourceProc = -1;
843 #endif
844 
845  // Post the (nonblocking) receives.
846  for (size_t i = 0; i < actualNumReceives; ++i) {
847  // Once the receive completes, we can ask the corresponding
848  // CommStatus object (output by wait()) for the sending process'
849  // ID (which we'll assign to procsFrom_[i] -- don't forget to
850  // do that!).
851  lengthsFromBuffers[i].resize (1);
852  lengthsFromBuffers[i][0] = as<size_t> (0);
853  requests[i] = ireceive<int, size_t> (lengthsFromBuffers[i], anySourceProc,
854  mpiTag, *comm_);
855  }
856 
857  // Post the sends: Tell each process to which we are sending how
858  // many packets it should expect from us in the communication
859  // pattern. We could use nonblocking sends here, as long as we do
860  // a waitAll() on all the sends and receives at once.
861  //
862  // We assume that numSendsToOtherProcs_ and sendMessageToSelf_ have already been
863  // set. The value of numSendsToOtherProcs_ (my process' number of sends) does
864  // not include any message that it might send to itself.
865  for (size_t i = 0; i < numSendsToOtherProcs_ + (sendMessageToSelf_ ? 1 : 0); ++i) {
866  if (procIdsToSendTo_[i] != myRank) {
867  // Send a message to procIdsToSendTo_[i], telling that process that
868  // this communication pattern will send that process
869  // lengthsTo_[i] blocks of packets.
870  const size_t* const lengthsTo_i = &lengthsTo_[i];
871  send<int, size_t> (lengthsTo_i, 1, as<int> (procIdsToSendTo_[i]), mpiTag, *comm_);
872  }
873  else {
874  // We don't need a send in the self-message case. If this
875  // process will send a message to itself in the communication
876  // pattern, then the last element of lengthsFrom_ and
877  // procsFrom_ corresponds to the self-message. Of course
878  // this process knows how long the message is, and the process
879  // ID is its own process ID.
880  lengthsFrom_[numReceives_-1] = lengthsTo_[i];
881  procsFrom_[numReceives_-1] = myRank;
882  }
883  }
884 
885  //
886  // Wait on all the receives. When they arrive, check the status
887  // output of wait() for the receiving process ID, unpack the
888  // request buffers into lengthsFrom_, and set procsFrom_ from the
889  // status.
890  //
891  waitAll (*comm_, requests (), statuses ());
892  for (size_t i = 0; i < actualNumReceives; ++i) {
893  lengthsFrom_[i] = *lengthsFromBuffers[i];
894  procsFrom_[i] = statuses[i]->getSourceRank ();
895  }
896 
897  // Sort the procsFrom_ array, and apply the same permutation to
898  // lengthsFrom_. This ensures that procsFrom_[i] and
899  // lengthsFrom_[i] refer to the same message.
900  sort2 (procsFrom_.begin(), procsFrom_.end(), lengthsFrom_.begin());
901 
902  // Compute indicesFrom_
903  totalReceiveLength_ =
904  std::accumulate (lengthsFrom_.begin (), lengthsFrom_.end (), 0);
905  indicesFrom_.clear ();
906 
907  startsFrom_.clear ();
908  startsFrom_.reserve (numReceives_);
909  for (size_t i = 0, j = 0; i < numReceives_; ++i) {
910  startsFrom_.push_back(j);
911  j += lengthsFrom_[i];
912  }
913 
914  if (sendMessageToSelf_) {
915  --numReceives_;
916  }
917 }
918 
919 void DistributorPlan::setParameterList(const Teuchos::RCP<Teuchos::ParameterList>& plist)
920 {
921  using Teuchos::FancyOStream;
922  using Teuchos::getIntegralValue;
923  using Teuchos::ParameterList;
924  using Teuchos::parameterList;
925  using Teuchos::RCP;
926  using std::endl;
927 
928  if (! plist.is_null()) {
929  RCP<const ParameterList> validParams = getValidParameters ();
930  plist->validateParametersAndSetDefaults (*validParams);
931 
932  const Details::EDistributorSendType sendType =
933  getIntegralValue<Details::EDistributorSendType> (*plist, "Send type");
934 
935  // Now that we've validated the input list, save the results.
936  sendType_ = sendType;
937 
938  #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
939  initializeMpiAdvance();
940  #endif
941 
942  // ParameterListAcceptor semantics require pointer identity of the
943  // sublist passed to setParameterList(), so we save the pointer.
944  this->setMyParamList (plist);
945 
946 #if defined(HAVE_TPETRA_MPI)
947  maybeInitializeRoots();
948 #endif
949  }
950 }
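// Minimal usage sketch (hypothetical; "plan" is an assumed DistributorPlan
// instance, not part of this file): a caller could select the send type
// through the validated parameter list, e.g.
//
//   Teuchos::RCP<Teuchos::ParameterList> plist =
//     Teuchos::parameterList ("Tpetra::Distributor");
//   plist->set ("Send type", "Isend");
//   plan.setParameterList (plist); // validates against getValidParameters()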
951 
952 Teuchos::Array<std::string> distributorSendTypes()
953 {
954  Teuchos::Array<std::string> sendTypes;
955  sendTypes.push_back ("Isend");
956  sendTypes.push_back ("Send");
957  sendTypes.push_back ("Alltoall");
958 #if defined(HAVE_TPETRA_MPI)
959  sendTypes.push_back ("Ialltofewv");
960 #endif
961 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
962  sendTypes.push_back ("MpiAdvanceAlltoall");
963  sendTypes.push_back ("MpiAdvanceNbralltoallv");
964 #endif
965  return sendTypes;
966 }
967 
968 Teuchos::Array<EDistributorSendType> distributorSendTypeEnums() {
969  Teuchos::Array<EDistributorSendType> res;
970  res.push_back (DISTRIBUTOR_ISEND);
971  res.push_back (DISTRIBUTOR_SEND);
972  res.push_back (DISTRIBUTOR_ALLTOALL);
973 #if defined(HAVE_TPETRA_MPI)
974  res.push_back (DISTRIBUTOR_IALLTOFEWV);
975 #endif
976 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
977  res.push_back (DISTRIBUTOR_MPIADVANCE_ALLTOALL);
978  res.push_back (DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV);
979 #endif
980  return res;
981 }
982 
983 Teuchos::RCP<const Teuchos::ParameterList>
984 DistributorPlan::getValidParameters() const
985 {
986  using Teuchos::Array;
987  using Teuchos::ParameterList;
988  using Teuchos::parameterList;
989  using Teuchos::RCP;
990  using Teuchos::setStringToIntegralParameter;
991 
992  Array<std::string> sendTypes = distributorSendTypes ();
993  const Array<Details::EDistributorSendType> sendTypeEnums = distributorSendTypeEnums ();
994 
995  const std::string validatedSendType = validSendTypeOrThrow(Behavior::defaultSendType());
996 
997  RCP<ParameterList> plist = parameterList ("Tpetra::Distributor");
998 
999  setStringToIntegralParameter<Details::EDistributorSendType> ("Send type",
1000  validatedSendType, "When using MPI, the variant of send to use in "
1001  "do[Reverse]Posts()", sendTypes(), sendTypeEnums(), plist.getRawPtr());
1002  plist->set ("Timer Label","","Label for Time Monitor output");
1003 
1004  return Teuchos::rcp_const_cast<const ParameterList> (plist);
1005 }
1006 
1007 #if defined(HAVE_TPETRACORE_MPI_ADVANCE)
1008 
1009 // Used by Teuchos::RCP to clean up an owned MPIX_Comm*
1010 struct MpixCommDeallocator {
1011  void free(MPIX_Comm **comm) const {
1012  MPIX_Comm_free(comm);
1013  }
1014 };
1015 
1016 void DistributorPlan::initializeMpiAdvance() {
1017 
1018  // Assert that the MPIX communicator is still null; if it is not, we need to figure out why.
1019  TEUCHOS_ASSERT(mpixComm_.is_null());
1020 
1021  // Use the members to initialize the graph for neighborhood mode, or just the MPIX communicator for non-neighborhood mode.
1022  Teuchos::RCP<const Teuchos::MpiComm<int> > mpiComm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm_);
1023  Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawComm = mpiComm->getRawMpiComm();
1024  int err = 0;
1025  if (sendType_ == DISTRIBUTOR_MPIADVANCE_ALLTOALL ||
1026  sendType_ == DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV ) {
1027  MPIX_Comm **mpixComm = new(MPIX_Comm*);
1028  err = MPIX_Comm_init(mpixComm, (*rawComm)());
1029  mpixComm_ = Teuchos::RCP(mpixComm,
1030  MpixCommDeallocator(),
1031  true /*take ownership*/
1032  );
1033  }
1034 
1035  TEUCHOS_ASSERT(err == 0);
1036 }
1037 #endif
1038 
1039 #if defined(HAVE_TPETRA_MPI)
1040  // FIXME: probably need to rename this function since it might change the sendType
1041  void DistributorPlan::maybeInitializeRoots() {
1042 
1043  // Only IALLTOFEWV needs to know the roots
1044  if (DISTRIBUTOR_IALLTOFEWV != sendType_) {
1045  roots_.clear();
1046  return;
1047  }
1048 
1049  ProfilingRegion region_maybeInitializeRoots ("Tpetra::DistributorPlan::maybeInitializeRoots");
1050 
1051  // send my number of recvs to everyone
1052  const int numRecvs = (int)(getNumReceives() + (hasSelfMessage() ? 1 : 0));
1053  std::vector<int> sendbuf(comm_->getSize(), numRecvs);
1054  std::vector<int> recvbuf(comm_->getSize());
1055 
1056  // FIXME: is there a more natural way to do this?
1057  // Maybe MPI_Allreduce is better, we just care if anyone is sending anything to each process
1058  // we just need to know all processes that receive anything (including a self message)
1059  Teuchos::RCP<const Teuchos::MpiComm<int> > mpiComm = Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int> >(comm_);
1060  Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm> > rawComm = mpiComm->getRawMpiComm();
1061  MPI_Comm comm = (*rawComm)();
1062  MPI_Alltoall(sendbuf.data(), 1, MPI_INT, recvbuf.data(), 1, MPI_INT, comm);
1063 
1064  roots_.clear();
1065  for (size_t root = 0; root < recvbuf.size(); ++root) {
1066  if (recvbuf[root] > 0) {
1067  roots_.push_back(root);
1068  }
1069  }
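 // Illustrative example (hypothetical): on a 4-rank communicator, if recvbuf
 // ends up as {2, 0, 0, 1} on every rank, then only ranks 0 and 3 receive
 // anything, so roots_ = {0, 3}.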
1070 
1071  // In "slow-path" communication, the data is not blocked according to sending / receiving proc.
1072  // The root-detection algorithm expects data to be blocked, so disable.
1073  int slow = !getIndicesTo().is_null() ? 1 : 0;
1074  MPI_Allreduce(MPI_IN_PLACE, &slow, 1, MPI_INT, MPI_LOR, comm);
1075  if (slow) {
1076 
1077  if (Behavior::verbose()) {
1078  {
1079  std::stringstream ss;
1080  ss << __FILE__ << ":" << __LINE__ << " " << comm_->getRank() << ": WARNING: Ialltofewv send mode set, at least one rank's data is not grouped by rank. Setting to \"Send\"" << std::endl;
1081  std::cerr << ss.str();
1082  }
1083  }
1084 
1085  roots_.clear();
1086  sendType_ = DISTRIBUTOR_SEND;
1087  }
1088 
1089  // Ialltofewv is intended for gather-like patterns with only a few roots. If there are many
1090  // roots, this is probably a general import/export pattern that Ialltofewv will not handle well,
1091  // so fall back to "Send" when the number of roots exceeds the square root of the communicator size.
1092  if (roots_.size() * roots_.size() >= size_t(comm_->getSize())) {
1093  if (Behavior::verbose()) {
1094  std::stringstream ss;
1095  ss << __FILE__ << ":" << __LINE__ << " " << comm_->getRank() << ": WARNING (Ialltofewv send type): too many roots (" << roots_.size() << ") for " << comm_->getSize() << " ranks. Setting send-type to \"Send\"" << std::endl;
1096  std::cerr << ss.str();
1097  }
1098  roots_.clear();
1099  sendType_ = DISTRIBUTOR_SEND;
1100  }
1101  }
1102 #endif // HAVE_TPETRA_MPI
1103 
1104  DistributorPlan::SubViewLimits DistributorPlan::getImportViewLimits(size_t numPackets) const {
1105  const size_t actualNumReceives = getNumReceives() + (hasSelfMessage() ? 1 : 0);
1106 
1107  IndexView importStarts(actualNumReceives);
1108  IndexView importLengths(actualNumReceives);
1109 
1110  size_t offset = 0;
1111  for (size_t i = 0; i < actualNumReceives; ++i) {
1112  importStarts[i] = offset;
1113  offset += getLengthsFrom()[i] * numPackets;
1114  importLengths[i] = getLengthsFrom()[i] * numPackets;
1115  }
1116  return std::make_pair(importStarts, importLengths);
1117  }
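 // Worked example (hypothetical): if getLengthsFrom() = {3, 1} and
 // numPackets = 2, then importStarts = {0, 6} and importLengths = {6, 2}.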
1118 
1119  DistributorPlan::SubViewLimits DistributorPlan::getImportViewLimits(const Teuchos::ArrayView<const size_t> &numImportPacketsPerLID) const {
1120 
1121  const size_t actualNumReceives = getNumReceives() + (hasSelfMessage() ? 1 : 0);
1122 
1123  IndexView importStarts(actualNumReceives);
1124  IndexView importLengths(actualNumReceives);
1125 
1126  size_t offset = 0;
1127  size_t curLIDoffset = 0;
1128  for (size_t i = 0; i < actualNumReceives; ++i) {
1129  size_t totalPacketsFrom_i = 0;
1130  for (size_t j = 0; j < getLengthsFrom()[i]; ++j) {
1131  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset + j];
1132  }
1133  curLIDoffset += getLengthsFrom()[i];
1134  importStarts[i] = offset;
1135  offset += totalPacketsFrom_i;
1136  importLengths[i] = totalPacketsFrom_i;
1137  }
1138  return std::make_pair(importStarts, importLengths);
1139  }
1140 
1141 
1142  DistributorPlan::SubViewLimits DistributorPlan::getExportViewLimits(size_t numPackets) const {
1143  if (getIndicesTo().is_null()) {
1144 
1145  const size_t actualNumSends = getNumSends() + (hasSelfMessage() ? 1 : 0);
1146  IndexView exportStarts(actualNumSends);
1147  IndexView exportLengths(actualNumSends);
1148  for (size_t pp = 0; pp < actualNumSends; ++pp) {
1149  exportStarts[pp] = getStartsTo()[pp] * numPackets;
1150  exportLengths[pp] = getLengthsTo()[pp] * numPackets;
1151  }
1152  return std::make_pair(exportStarts, exportLengths);
1153  } else {
1154  const size_t numIndices = getIndicesTo().size();
1155  IndexView exportStarts(numIndices);
1156  IndexView exportLengths(numIndices);
1157  for (size_t j = 0; j < numIndices; ++j) {
1158  exportStarts[j] = getIndicesTo()[j]*numPackets;
1159  exportLengths[j] = numPackets;
1160  }
1161  return std::make_pair(exportStarts, exportLengths);
1162  }
1163  }
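 // Worked examples (hypothetical): in the fast path, getStartsTo() = {0, 4},
 // getLengthsTo() = {4, 2}, and numPackets = 3 give exportStarts = {0, 12} and
 // exportLengths = {12, 6}. In the slow path, getIndicesTo() = {1, 3, 0, 2} and
 // numPackets = 2 give exportStarts = {2, 6, 0, 4} with every exportLengths
 // entry equal to 2.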
1164 
1165  DistributorPlan::SubViewLimits DistributorPlan::getExportViewLimits(const Teuchos::ArrayView<const size_t> &numExportPacketsPerLID) const {
1166  if (getIndicesTo().is_null()) {
1167  const size_t actualNumSends = getNumSends() + (hasSelfMessage() ? 1 : 0);
1168  IndexView exportStarts(actualNumSends);
1169  IndexView exportLengths(actualNumSends);
1170  size_t offset = 0;
1171  for (size_t pp = 0; pp < actualNumSends; ++pp) {
1172  size_t numPackets = 0;
1173  for (size_t j = getStartsTo()[pp];
1174  j < getStartsTo()[pp] + getLengthsTo()[pp]; ++j) {
1175  numPackets += numExportPacketsPerLID[j];
1176  }
1177  exportStarts[pp] = offset;
1178  offset += numPackets;
1179  exportLengths[pp] = numPackets;
1180  }
1181  return std::make_pair(exportStarts, exportLengths);
1182  } else {
1183  const size_t numIndices = getIndicesTo().size();
1184  IndexView exportStarts(numIndices);
1185  IndexView exportLengths(numIndices);
1186  size_t offset = 0;
1187  for (size_t j = 0; j < numIndices; ++j) {
1188  exportStarts[j] = offset;
1189  offset += numExportPacketsPerLID[j];
1190  exportLengths[j] = numExportPacketsPerLID[j];
1191  }
1192  return std::make_pair(exportStarts, exportLengths);
1193  }
1194  }
1195 
1196 }
1197 }