Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_DistObject_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DISTOBJECT_DEF_HPP
11 #define TPETRA_DISTOBJECT_DEF_HPP
12 
20 
21 #include "Tpetra_Distributor.hpp"
24 #include "Tpetra_Details_checkGlobalError.hpp"
26 #include "Tpetra_Util.hpp" // Details::createPrefix
27 #include "Teuchos_CommHelpers.hpp"
28 #include "Teuchos_TypeNameTraits.hpp"
29 #include <typeinfo>
30 #include <memory>
31 #include <sstream>
32 
33 namespace Tpetra {
34 
35 namespace { // (anonymous)
36 template <class DeviceType, class IndexType = size_t>
37 struct SumFunctor {
38  SumFunctor(const Kokkos::View<const size_t*, DeviceType>& viewToSum)
39  : viewToSum_(viewToSum) {}
40  KOKKOS_INLINE_FUNCTION void operator()(const IndexType i, size_t& lclSum) const {
41  lclSum += viewToSum_(i);
42  }
43  Kokkos::View<const size_t*, DeviceType> viewToSum_;
44 };
45 
46 template <class DeviceType, class IndexType = size_t>
47 size_t
48 countTotalImportPackets(const Kokkos::View<const size_t*, DeviceType>& numImportPacketsPerLID) {
49  using Kokkos::parallel_reduce;
50  typedef DeviceType DT;
51  typedef typename DT::execution_space DES;
52  typedef Kokkos::RangePolicy<DES, IndexType> range_type;
53 
54  const IndexType numOut = numImportPacketsPerLID.extent(0);
55  size_t totalImportPackets = 0;
56  parallel_reduce("Count import packets",
57  range_type(0, numOut),
58  SumFunctor<DeviceType, IndexType>(numImportPacketsPerLID),
59  totalImportPackets);
60  return totalImportPackets;
61 }
62 } // namespace
63 
// Constructor: stores the given Map in map_. Nothing else is
// initialized explicitly here.
//
// map: the Map to associate with this object.
64 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
66  DistObject(const Teuchos::RCP<const map_type>& map)
67  : map_(map) {}
68 
69 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
70 std::string
72  description() const {
73  using Teuchos::TypeNameTraits;
74 
75  std::ostringstream os;
76  os << "\"Tpetra::DistObject\": {"
77  << "Packet: " << TypeNameTraits<packet_type>::name()
78  << ", LocalOrdinal: " << TypeNameTraits<local_ordinal_type>::name()
79  << ", GlobalOrdinal: " << TypeNameTraits<global_ordinal_type>::name()
80  << ", Node: " << TypeNameTraits<Node>::name();
81  if (this->getObjectLabel() != "") {
82  os << "Label: \"" << this->getObjectLabel() << "\"";
83  }
84  os << "}";
85  return os.str();
86 }
87 
// Print a multi-line description of this object to `out`.
//
// out:       stream to print to.
// verbLevel: requested verbosity. VERB_DEFAULT is treated as VERB_LOW;
//            VERB_NONE prints nothing.
//
// The process with rank 0 prints the header, the template parameters
// and (if nonempty) the object label. Every process calls the Map's
// describe(). Above VERB_LOW, the processes take turns, in rank order,
// printing the sizes of their export and import buffers.
//
// If the Map's communicator is null, this process is treated as rank 0
// of a single-process run and no barriers are executed.
88 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
90  describe(Teuchos::FancyOStream& out,
91  const Teuchos::EVerbosityLevel verbLevel) const {
92  using std::endl;
93  using Teuchos::rcpFromRef;
94  using Teuchos::TypeNameTraits;
95  const Teuchos::EVerbosityLevel vl = (verbLevel == Teuchos::VERB_DEFAULT) ? Teuchos::VERB_LOW : verbLevel;
96  Teuchos::RCP<const Teuchos::Comm<int>> comm = this->getMap()->getComm();
97  const int myRank = comm.is_null() ? 0 : comm->getRank();
98  const int numProcs = comm.is_null() ? 1 : comm->getSize();
99 
100  if (vl != Teuchos::VERB_NONE) {
101  Teuchos::OSTab tab0(out);
102  if (myRank == 0) {
103  out << "\"Tpetra::DistObject\":" << endl;
104  }
105  Teuchos::OSTab tab1(out);
106  if (myRank == 0) {
107  out << "Template parameters:" << endl;
108  {
109  Teuchos::OSTab tab2(out);
110  out << "Packet: " << TypeNameTraits<packet_type>::name() << endl
111  << "LocalOrdinal: " << TypeNameTraits<local_ordinal_type>::name() << endl
112  << "GlobalOrdinal: " << TypeNameTraits<global_ordinal_type>::name() << endl
113  << "Node: " << TypeNameTraits<node_type>::name() << endl;
114  }
115  if (this->getObjectLabel() != "") {
116  out << "Label: \"" << this->getObjectLabel() << "\"" << endl;
117  }
118  } // if myRank == 0
119 
120  // Describe the Map.
121  {
122  if (myRank == 0) {
123  out << "Map:" << endl;
124  }
125  Teuchos::OSTab tab2(out);
126  map_->describe(out, vl);
127  }
128 
129  // At verbosity > VERB_LOW, each process prints something.
130  if (vl > Teuchos::VERB_LOW) {
     // Loop over all ranks; only the process whose rank equals p prints
     // in iteration p, and all processes then meet at the barriers.
131  for (int p = 0; p < numProcs; ++p) {
132  if (myRank == p) {
133  out << "Process " << myRank << ":" << endl;
134  Teuchos::OSTab tab2(out);
135  out << "Export buffer size (in packets): "
136  << exports_.extent(0)
137  << endl
138  << "Import buffer size (in packets): "
139  << imports_.extent(0)
140  << endl;
141  }
142  if (!comm.is_null()) {
143  comm->barrier(); // give output time to finish
144  comm->barrier();
145  comm->barrier();
146  }
147  } // for each process rank p
148  } // if vl > VERB_LOW
149  } // if vl != VERB_NONE
150 }
151 
// Default implementation: always throws std::logic_error with a
// "Not implemented" message. The newMap argument is unused here.
152 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
154  removeEmptyProcessesInPlace(const Teuchos::RCP<const map_type>& /* newMap */) {
155  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
156  "Tpetra::DistObject::removeEmptyProcessesInPlace: Not implemented");
157 }
158 
159 /* These are provided in base DistObject template
160 template<class DistObjectType>
161 void
162 removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input,
163  const Teuchos::RCP<const Map<typename DistObjectType::local_ordinal_type,
164  typename DistObjectType::global_ordinal_type,
165  typename DistObjectType::node_type> >& newMap)
166 {
167  input->removeEmptyProcessesInPlace (newMap);
168  if (newMap.is_null ()) { // my process is excluded
169  input = Teuchos::null;
170  }
171 }
172 
173 template<class DistObjectType>
174 void
175 removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input)
176 {
177  using Teuchos::RCP;
178  typedef typename DistObjectType::local_ordinal_type LO;
179  typedef typename DistObjectType::global_ordinal_type GO;
180  typedef typename DistObjectType::node_type NT;
181  typedef Map<LO, GO, NT> map_type;
182 
183  RCP<const map_type> newMap = input->getMap ()->removeEmptyProcesses ();
184  removeEmptyProcessesInPlace<DistObjectType> (input, newMap);
185 }
186 */
187 
// Forward-mode Import: calls beginImport() and then endImport() with
// the same arguments.
//
// source:         the object to transfer data from.
// importer:       forwarded to beginImport()/endImport().
//                 NOTE(review): its declaration line is not visible in
//                 this listing; presumably an Import object — confirm.
// CM:             combine mode, forwarded unchanged.
// restrictedMode: forwarded unchanged.
//
// In verbose mode, "Start" and "Done" messages, prefixed with the
// result of createPrefix(), are written to std::cerr.
188 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
190  doImport(const SrcDistObject& source,
192  const CombineMode CM,
193  const bool restrictedMode) {
194  using Details::Behavior;
195  using std::endl;
196  const char modeString[] = "doImport (forward mode)";
197 
198  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
199  // output to std::cerr on every MPI process. This is unwise for
200  // runs with large numbers of MPI processes.
201  const bool verbose = Behavior::verbose("DistObject");
     // prefix is only assigned, and only dereferenced, when verbose is true.
202  std::unique_ptr<std::string> prefix;
203  if (verbose) {
204  prefix = this->createPrefix("DistObject", modeString);
205  std::ostringstream os;
206  os << *prefix << "Start" << endl;
207  std::cerr << os.str();
208  }
209  this->beginImport(source, importer, CM, restrictedMode);
210  this->endImport(source, importer, CM, restrictedMode);
211  if (verbose) {
212  std::ostringstream os;
213  os << *prefix << "Done" << endl;
214  std::cerr << os.str();
215  }
216 }
217 
// Forward-mode Export: calls beginExport() and then endExport() with
// the same arguments.
//
// source:         the object to transfer data from.
// exporter:       forwarded to beginExport()/endExport().
//                 NOTE(review): its declaration line is not visible in
//                 this listing; presumably an Export object — confirm.
// CM:             combine mode, forwarded unchanged.
// restrictedMode: forwarded unchanged.
//
// In verbose mode, "Start" and "Done" messages, prefixed with the
// result of createPrefix(), are written to std::cerr.
218 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
220  doExport(const SrcDistObject& source,
222  const CombineMode CM,
223  const bool restrictedMode) {
224  using Details::Behavior;
225  using std::endl;
226  const char modeString[] = "doExport (forward mode)";
227 
228  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
229  // output to std::cerr on every MPI process. This is unwise for
230  // runs with large numbers of MPI processes.
231  const bool verbose = Behavior::verbose("DistObject");
     // prefix is only assigned, and only dereferenced, when verbose is true.
232  std::unique_ptr<std::string> prefix;
233  if (verbose) {
234  prefix = this->createPrefix("DistObject", modeString);
235  std::ostringstream os;
236  os << *prefix << "Start" << endl;
237  std::cerr << os.str();
238  }
239  this->beginExport(source, exporter, CM, restrictedMode);
240  this->endExport(source, exporter, CM, restrictedMode);
241  if (verbose) {
242  std::ostringstream os;
243  os << *prefix << "Done" << endl;
244  std::cerr << os.str();
245  }
246 }
247 
// Reverse-mode Import: calls beginImport() and then endImport(),
// passing `exporter` as the transfer object.
//
// source:         the object to transfer data from.
// exporter:       forwarded to beginImport()/endImport().
//                 NOTE(review): its declaration line is not visible in
//                 this listing; presumably an Export object — confirm.
// CM:             combine mode, forwarded unchanged.
// restrictedMode: forwarded unchanged.
//
// In verbose mode, "Start" and "Done" messages, prefixed with the
// result of createPrefix(), are written to std::cerr.
248 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
250  doImport(const SrcDistObject& source,
252  const CombineMode CM,
253  const bool restrictedMode) {
254  using Details::Behavior;
255  using std::endl;
256  const char modeString[] = "doImport (reverse mode)";
257 
258  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
259  // output to std::cerr on every MPI process. This is unwise for
260  // runs with large numbers of MPI processes.
261  const bool verbose = Behavior::verbose("DistObject");
     // prefix is only assigned, and only dereferenced, when verbose is true.
262  std::unique_ptr<std::string> prefix;
263  if (verbose) {
264  prefix = this->createPrefix("DistObject", modeString);
265  std::ostringstream os;
266  os << *prefix << "Start" << endl;
267  std::cerr << os.str();
268  }
269  this->beginImport(source, exporter, CM, restrictedMode);
270  this->endImport(source, exporter, CM, restrictedMode);
271  if (verbose) {
272  std::ostringstream os;
273  os << *prefix << "Done" << endl;
274  std::cerr << os.str();
275  }
276 }
277 
// Reverse-mode Export: calls beginExport() and then endExport(),
// passing `importer` as the transfer object.
//
// source:         the object to transfer data from.
// importer:       forwarded to beginExport()/endExport().
//                 NOTE(review): its declaration line is not visible in
//                 this listing; presumably an Import object — confirm.
// CM:             combine mode, forwarded unchanged.
// restrictedMode: forwarded unchanged.
//
// In verbose mode, "Start" and "Done" messages, prefixed with the
// result of createPrefix(), are written to std::cerr.
278 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
280  doExport(const SrcDistObject& source,
282  const CombineMode CM,
283  const bool restrictedMode) {
284  using Details::Behavior;
285  using std::endl;
286  const char modeString[] = "doExport (reverse mode)";
287 
288  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
289  // output to std::cerr on every MPI process. This is unwise for
290  // runs with large numbers of MPI processes.
291  const bool verbose = Behavior::verbose("DistObject");
     // prefix is only assigned, and only dereferenced, when verbose is true.
292  std::unique_ptr<std::string> prefix;
293  if (verbose) {
294  prefix = this->createPrefix("DistObject", modeString);
295  std::ostringstream os;
296  os << *prefix << "Start" << endl;
297  std::cerr << os.str();
298  }
299  this->beginExport(source, importer, CM, restrictedMode);
300  this->endExport(source, importer, CM, restrictedMode);
301  if (verbose) {
302  std::ostringstream os;
303  os << *prefix << "Done" << endl;
304  std::cerr << os.str();
305  }
306 }
307 
// Start a forward-mode Import by calling beginTransfer() with
// DoForward.
//
// source:         the object to transfer data from.
// importer:       forwarded to beginTransfer().
//                 NOTE(review): its declaration line is not visible in
//                 this listing; presumably an Import object — confirm.
// CM:             combine mode, forwarded unchanged.
// restrictedMode: forwarded unchanged.
//
// modeString is passed on to beginTransfer(). In verbose mode, "Start"
// and "Done" messages, prefixed with the result of createPrefix(), are
// written to std::cerr.
308 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
310  beginImport(const SrcDistObject& source,
312  const CombineMode CM,
313  const bool restrictedMode) {
314  using Details::Behavior;
315  using std::endl;
316  const char modeString[] = "beginImport (forward mode)";
317 
318  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
319  // output to std::cerr on every MPI process. This is unwise for
320  // runs with large numbers of MPI processes.
321  const bool verbose = Behavior::verbose("DistObject");
     // prefix is only assigned, and only dereferenced, when verbose is true.
322  std::unique_ptr<std::string> prefix;
323  if (verbose) {
324  prefix = this->createPrefix("DistObject", modeString);
325  std::ostringstream os;
326  os << *prefix << "Start" << endl;
327  std::cerr << os.str();
328  }
329  this->beginTransfer(source, importer, modeString, DoForward, CM, restrictedMode);
330  if (verbose) {
331  std::ostringstream os;
332  os << *prefix << "Done" << endl;
333  std::cerr << os.str();
334  }
335 }
336 
337 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
338 void DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
339  beginExport(const SrcDistObject& source,
340  const Export<LocalOrdinal, GlobalOrdinal, Node>& exporter,
341  const CombineMode CM,
342  const bool restrictedMode) {
343  using Details::Behavior;
344  using std::endl;
345  const char modeString[] = "beginExport (forward mode)";
346 
347  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
348  // output to std::cerr on every MPI process. This is unwise for
349  // runs with large numbers of MPI processes.
350  const bool verbose = Behavior::verbose("DistObject");
351  std::unique_ptr<std::string> prefix;
352  if (verbose) {
353  prefix = this->createPrefix("DistObject", modeString);
354  std::ostringstream os;
355  os << *prefix << "Start" << endl;
356  std::cerr << os.str();
357  }
358  this->beginTransfer(source, exporter, modeString, DoForward, CM, restrictedMode);
359  if (verbose) {
360  std::ostringstream os;
361  os << *prefix << "Done" << endl;
362  std::cerr << os.str();
363  }
364 }
365 
366 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
367 void DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
368  beginImport(const SrcDistObject& source,
369  const Export<LocalOrdinal, GlobalOrdinal, Node>& exporter,
370  const CombineMode CM,
371  const bool restrictedMode) {
372  using Details::Behavior;
373  using std::endl;
374  const char modeString[] = "beginImport (reverse mode)";
375 
376  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
377  // output to std::cerr on every MPI process. This is unwise for
378  // runs with large numbers of MPI processes.
379  const bool verbose = Behavior::verbose("DistObject");
380  std::unique_ptr<std::string> prefix;
381  if (verbose) {
382  prefix = this->createPrefix("DistObject", modeString);
383  std::ostringstream os;
384  os << *prefix << "Start" << endl;
385  std::cerr << os.str();
386  }
387  this->beginTransfer(source, exporter, modeString, DoReverse, CM, restrictedMode);
388  if (verbose) {
389  std::ostringstream os;
390  os << *prefix << "Done" << endl;
391  std::cerr << os.str();
392  }
393 }
394 
395 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
396 void DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
397  beginExport(const SrcDistObject& source,
398  const Import<LocalOrdinal, GlobalOrdinal, Node>& importer,
399  const CombineMode CM,
400  const bool restrictedMode) {
401  using Details::Behavior;
402  using std::endl;
403  const char modeString[] = "beginExport (reverse mode)";
404 
405  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
406  // output to std::cerr on every MPI process. This is unwise for
407  // runs with large numbers of MPI processes.
408  const bool verbose = Behavior::verbose("DistObject");
409  std::unique_ptr<std::string> prefix;
410  if (verbose) {
411  prefix = this->createPrefix("DistObject", modeString);
412  std::ostringstream os;
413  os << *prefix << "Start" << endl;
414  std::cerr << os.str();
415  }
416  this->beginTransfer(source, importer, modeString, DoReverse, CM, restrictedMode);
417  if (verbose) {
418  std::ostringstream os;
419  os << *prefix << "Done" << endl;
420  std::cerr << os.str();
421  }
422 }
423 
424 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
425 void DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
426  endImport(const SrcDistObject& source,
427  const Import<LocalOrdinal, GlobalOrdinal, Node>& importer,
428  const CombineMode CM,
429  const bool restrictedMode) {
430  using Details::Behavior;
431  using std::endl;
432  const char modeString[] = "endImport (forward mode)";
433 
434  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
435  // output to std::cerr on every MPI process. This is unwise for
436  // runs with large numbers of MPI processes.
437  const bool verbose = Behavior::verbose("DistObject");
438  std::unique_ptr<std::string> prefix;
439  if (verbose) {
440  prefix = this->createPrefix("DistObject", modeString);
441  std::ostringstream os;
442  os << *prefix << "Start" << endl;
443  std::cerr << os.str();
444  }
445  this->endTransfer(source, importer, modeString, DoForward, CM, restrictedMode);
446  if (verbose) {
447  std::ostringstream os;
448  os << *prefix << "Done" << endl;
449  std::cerr << os.str();
450  }
451 }
452 
453 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
454 void DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
455  endExport(const SrcDistObject& source,
456  const Export<LocalOrdinal, GlobalOrdinal, Node>& exporter,
457  const CombineMode CM,
458  const bool restrictedMode) {
459  using Details::Behavior;
460  using std::endl;
461  const char modeString[] = "endExport (forward mode)";
462 
463  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
464  // output to std::cerr on every MPI process. This is unwise for
465  // runs with large numbers of MPI processes.
466  const bool verbose = Behavior::verbose("DistObject");
467  std::unique_ptr<std::string> prefix;
468  if (verbose) {
469  prefix = this->createPrefix("DistObject", modeString);
470  std::ostringstream os;
471  os << *prefix << "Start" << endl;
472  std::cerr << os.str();
473  }
474  this->endTransfer(source, exporter, modeString, DoForward, CM, restrictedMode);
475  if (verbose) {
476  std::ostringstream os;
477  os << *prefix << "Done" << endl;
478  std::cerr << os.str();
479  }
480 }
481 
482 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
483 void DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
484  endImport(const SrcDistObject& source,
485  const Export<LocalOrdinal, GlobalOrdinal, Node>& exporter,
486  const CombineMode CM,
487  const bool restrictedMode) {
488  using Details::Behavior;
489  using std::endl;
490  const char modeString[] = "endImport (reverse mode)";
491 
492  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
493  // output to std::cerr on every MPI process. This is unwise for
494  // runs with large numbers of MPI processes.
495  const bool verbose = Behavior::verbose("DistObject");
496  std::unique_ptr<std::string> prefix;
497  if (verbose) {
498  prefix = this->createPrefix("DistObject", modeString);
499  std::ostringstream os;
500  os << *prefix << "Start" << endl;
501  std::cerr << os.str();
502  }
503  this->endTransfer(source, exporter, modeString, DoReverse, CM, restrictedMode);
504  if (verbose) {
505  std::ostringstream os;
506  os << *prefix << "Done" << endl;
507  std::cerr << os.str();
508  }
509 }
510 
511 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
512 void DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
513  endExport(const SrcDistObject& source,
514  const Import<LocalOrdinal, GlobalOrdinal, Node>& importer,
515  const CombineMode CM,
516  const bool restrictedMode) {
517  using Details::Behavior;
518  using std::endl;
519  const char modeString[] = "endExport (reverse mode)";
520 
521  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
522  // output to std::cerr on every MPI process. This is unwise for
523  // runs with large numbers of MPI processes.
524  const bool verbose = Behavior::verbose("DistObject");
525  std::unique_ptr<std::string> prefix;
526  if (verbose) {
527  prefix = this->createPrefix("DistObject", modeString);
528  std::ostringstream os;
529  os << *prefix << "Start" << endl;
530  std::cerr << os.str();
531  }
532  this->endTransfer(source, importer, modeString, DoReverse, CM, restrictedMode);
533  if (verbose) {
534  std::ostringstream os;
535  os << *prefix << "Done" << endl;
536  std::cerr << os.str();
537  }
538 }
539 
// Returns the result of distributorActor_.isReady().
// NOTE(review): the lines carrying this method's return type and name
// are not visible in this listing — confirm the signature against the
// class declaration.
540 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
543  return distributorActor_.isReady();
544 }
545 
// Returns whatever this object's Map reports from its own
// isDistributed() method. Dereferences map_ unconditionally.
546 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
548  isDistributed() const {
549  return map_->isDistributed();
550 }
551 
// Default implementation: always returns 0.
// NOTE(review): the line carrying this method's name is not visible in
// this listing; presumably constantNumberOfPackets() — confirm against
// the class declaration.
552 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
553 size_t
556  return 0; // default implementation; subclasses may override
557 }
558 
// Perform a complete transfer: calls beginTransfer() and then
// endTransfer() with the same arguments.
// NOTE(review): the lines carrying this method's name and its first
// parameter (`src`) are not visible in this listing; presumably
// doTransfer() — confirm against the class declaration.
//
// transfer:       the transfer object, forwarded unchanged.
// modeString:     forwarded unchanged.
// revOp:          forward or reverse option, forwarded unchanged.
// CM:             combine mode, forwarded unchanged.
// restrictedMode: forwarded unchanged.
559 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
562  const ::Tpetra::Details::Transfer<local_ordinal_type, global_ordinal_type, node_type>& transfer,
563  const char modeString[],
564  const ReverseOption revOp,
565  const CombineMode CM,
566  bool restrictedMode) {
567  beginTransfer(src, transfer, modeString, revOp, CM, restrictedMode);
568  endTransfer(src, transfer, modeString, revOp, CM, restrictedMode);
569 }
570 
// Ask reallocDualViewIfNeeded() to resize imports_ to newSize, and
// return its result.
//
// newSize: requested size of imports_.
// verbose: if true, write before/after messages to std::cerr.
// prefix:  string prepended to each verbose message; dereferenced only
//          when verbose is true, so it must be non-null in that case.
// The last two parameters (remoteLIDsContiguous, CM) are unused here.
//
// Returns the value returned by reallocDualViewIfNeeded()
// (presumably true if a reallocation happened — confirm against that
// helper).
571 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
573  reallocImportsIfNeeded(const size_t newSize,
574  const bool verbose,
575  const std::string* prefix,
576  const bool /*remoteLIDsContiguous*/,
577  const CombineMode /*CM*/) {
578  if (verbose) {
579  std::ostringstream os;
580  os << *prefix << "Realloc (if needed) imports_ from "
581  << imports_.extent(0) << " to " << newSize << std::endl;
582  std::cerr << os.str();
583  }
585  const bool reallocated =
586  reallocDualViewIfNeeded(this->imports_, newSize, "imports");
587  if (verbose) {
588  std::ostringstream os;
589  os << *prefix << "Finished realloc'ing imports_" << std::endl;
590  std::cerr << os.str();
591  }
592  return reallocated;
593 }
594 
// Ask reallocDualViewIfNeeded() to resize numExportPacketsPerLID_ to
// numExportLIDs entries and numImportPacketsPerLID_ to numImportLIDs
// entries.
//
// numExportLIDs: requested length of numExportPacketsPerLID_.
// numImportLIDs: requested length of numImportPacketsPerLID_.
//
// Returns true if either reallocDualViewIfNeeded() call returned true.
// In verbose mode, the status of both DualViews is written to
// std::cerr before and after.
595 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
597  reallocArraysForNumPacketsPerLid(const size_t numExportLIDs,
598  const size_t numImportLIDs) {
599  using Details::Behavior;
600  using std::endl;
603  // If an array is already allocated, and if it is at least
604  // tooBigFactor times bigger than it needs to be, free it and
605  // reallocate to the size we need, in order to save space.
606  // Otherwise, take subviews to reduce allocation size.
607  constexpr size_t tooBigFactor = 10;
608 
609  const bool verbose = Behavior::verbose("DistObject");
     // prefix is only assigned, and only dereferenced, when verbose is true.
610  std::unique_ptr<std::string> prefix;
611  if (verbose) {
612  prefix = this->createPrefix("DistObject",
613  "reallocArraysForNumPacketsPerLid");
614  std::ostringstream os;
615  os << *prefix
616  << "numExportLIDs: " << numExportLIDs
617  << ", numImportLIDs: " << numImportLIDs
618  << endl;
619  os << *prefix << "DualView status before:" << endl
620  << *prefix
621  << dualViewStatusToString(this->numExportPacketsPerLID_,
622  "numExportPacketsPerLID_")
623  << endl
624  << *prefix
625  << dualViewStatusToString(this->numImportPacketsPerLID_,
626  "numImportPacketsPerLID_")
627  << endl;
628  std::cerr << os.str();
629  }
630 
631  // Reallocate numExportPacketsPerLID_ if needed.
632  const bool firstReallocated =
633  reallocDualViewIfNeeded(this->numExportPacketsPerLID_,
634  numExportLIDs,
635  "numExportPacketsPerLID",
636  tooBigFactor,
637  true); // need fence before, if realloc'ing
638 
639  // If we reallocated above, then we fenced after that
640  // reallocation. This means that we don't need to fence again,
641  // before the next reallocation.
642  const bool needFenceBeforeNextAlloc = !firstReallocated;
643  const bool secondReallocated =
644  reallocDualViewIfNeeded(this->numImportPacketsPerLID_,
645  numImportLIDs,
646  "numImportPacketsPerLID",
647  tooBigFactor,
648  needFenceBeforeNextAlloc);
649 
650  if (verbose) {
651  std::ostringstream os;
652  os << *prefix << "DualView status after:" << endl
653  << *prefix << dualViewStatusToString(this->numExportPacketsPerLID_, "numExportPacketsPerLID_")
654  << endl
655  << *prefix << dualViewStatusToString(this->numImportPacketsPerLID_, "numImportPacketsPerLID_")
656  << endl;
657  std::cerr << os.str();
658  }
659 
660  return firstReallocated || secondReallocated;
661 }
662 
663 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
666  const ::Tpetra::Details::Transfer<local_ordinal_type, global_ordinal_type, node_type>& transfer,
667  const char modeString[],
668  const ReverseOption revOp,
669  const CombineMode CM,
670  bool restrictedMode) {
671  using Details::Behavior;
674  using Kokkos::Compat::create_const_view;
675  using Kokkos::Compat::getArrayView;
676  using Kokkos::Compat::getConstArrayView;
677  using Kokkos::Compat::getKokkosViewDeepCopy;
678  using std::endl;
681 
682  const bool commOnHost = !Behavior::assumeMpiIsGPUAware();
683  const char funcNameHost[] = "Tpetra::DistObject::beginTransfer[Host]";
684  const char funcNameDevice[] = "Tpetra::DistObject::beginTransfer[Device]";
685  const char* funcName = commOnHost ? funcNameHost : funcNameDevice;
686 
687  ProfilingRegion region_doTransfer(funcName);
688  const bool verbose = Behavior::verbose("DistObject");
689  std::shared_ptr<std::string> prefix;
690  if (verbose) {
691  std::ostringstream os;
692  prefix = this->createPrefix("DistObject", "doTransfer");
693  os << *prefix << "Source type: " << Teuchos::typeName(src)
694  << ", Target type: " << Teuchos::typeName(*this) << endl;
695  std::cerr << os.str();
696  }
697 
698  // "Restricted Mode" does two things:
699  // 1) Skips copyAndPermute
700  // 2) Allows the "target" Map of the transfer to be a subset of
701  // the Map of *this, in a "locallyFitted" sense.
702  //
703  // This cannot be used if #2 is not true, OR there are permutes.
704  // Source Maps still need to match
705 
706  // mfh 18 Oct 2017: Set TPETRA_DEBUG to true to enable extra debug
707  // checks. These may communicate more.
708  const bool debug = Behavior::debug("DistObject");
709  if (debug) {
710  if (!restrictedMode && revOp == DoForward) {
711  const bool myMapSameAsTransferTgtMap =
712  this->getMap()->isSameAs(*(transfer.getTargetMap()));
713  TEUCHOS_TEST_FOR_EXCEPTION(!myMapSameAsTransferTgtMap, std::invalid_argument,
714  "Tpetra::DistObject::" << modeString << ": For forward-mode "
715  "communication, the target DistObject's Map must be the same "
716  "(in the sense of Tpetra::Map::isSameAs) as the input "
717  "Export/Import object's target Map.");
718  } else if (!restrictedMode && revOp == DoReverse) {
719  const bool myMapSameAsTransferSrcMap =
720  this->getMap()->isSameAs(*(transfer.getSourceMap()));
721  TEUCHOS_TEST_FOR_EXCEPTION(!myMapSameAsTransferSrcMap, std::invalid_argument,
722  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
723  "communication, the target DistObject's Map must be the same "
724  "(in the sense of Tpetra::Map::isSameAs) as the input "
725  "Export/Import object's source Map.");
726  } else if (restrictedMode && revOp == DoForward) {
727  const bool myMapLocallyFittedTransferTgtMap =
728  this->getMap()->isLocallyFitted(*(transfer.getTargetMap()));
729  TEUCHOS_TEST_FOR_EXCEPTION(!myMapLocallyFittedTransferTgtMap, std::invalid_argument,
730  "Tpetra::DistObject::" << modeString << ": For forward-mode "
731  "communication using restricted mode, Export/Import object's "
732  "target Map must be locally fitted (in the sense of "
733  "Tpetra::Map::isLocallyFitted) to target DistObject's Map.");
734  } else { // if (restrictedMode && revOp == DoReverse)
735  const bool myMapLocallyFittedTransferSrcMap =
736  this->getMap()->isLocallyFitted(*(transfer.getSourceMap()));
737  TEUCHOS_TEST_FOR_EXCEPTION(!myMapLocallyFittedTransferSrcMap, std::invalid_argument,
738  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
739  "communication using restricted mode, Export/Import object's "
740  "source Map must be locally fitted (in the sense of "
741  "Tpetra::Map::isLocallyFitted) to target DistObject's Map.");
742  }
743 
744  // SrcDistObject need not even _have_ Maps. However, if the
745  // source object is a DistObject, it has a Map, and we may
746  // compare that Map with the Transfer's Maps.
747  const this_type* srcDistObj = dynamic_cast<const this_type*>(&src);
748  if (srcDistObj != nullptr) {
749  if (revOp == DoForward) {
750  const bool srcMapSameAsImportSrcMap =
751  srcDistObj->getMap()->isSameAs(*(transfer.getSourceMap()));
752  TEUCHOS_TEST_FOR_EXCEPTION(!srcMapSameAsImportSrcMap, std::invalid_argument,
753  "Tpetra::DistObject::" << modeString << ": For forward-mode "
754  "communication, the source DistObject's Map must be the same "
755  "as the input Export/Import object's source Map.");
756  } else { // revOp == DoReverse
757  const bool srcMapSameAsImportTgtMap =
758  srcDistObj->getMap()->isSameAs(*(transfer.getTargetMap()));
759  TEUCHOS_TEST_FOR_EXCEPTION(!srcMapSameAsImportTgtMap, std::invalid_argument,
760  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
761  "communication, the source DistObject's Map must be the same "
762  "as the input Export/Import object's target Map.");
763  }
764  }
765  }
766 
767  const size_t numSameIDs = transfer.getNumSameIDs();
768  Distributor& distor = transfer.getDistributor();
769  const Details::DistributorPlan& distributorPlan = (revOp == DoForward) ? distor.getPlan() : *distor.getPlan().getReversePlan();
770 
771  TEUCHOS_TEST_FOR_EXCEPTION(debug && restrictedMode &&
772  (transfer.getPermuteToLIDs_dv().extent(0) != 0 ||
773  transfer.getPermuteFromLIDs_dv().extent(0) != 0),
774  std::invalid_argument,
775  "Tpetra::DistObject::" << modeString << ": Transfer object "
776  "cannot have permutes in restricted mode.");
777 
778  // Do we need all communication buffers to live on host?
779  if (verbose) {
780  std::ostringstream os;
781  os << *prefix << "doTransfer: Use new interface; "
782  "commOnHost="
783  << (commOnHost ? "true" : "false") << endl;
784  std::cerr << os.str();
785  }
786 
787  using const_lo_dv_type =
788  Kokkos::DualView<const local_ordinal_type*, buffer_device_type>;
789  const_lo_dv_type permuteToLIDs = (revOp == DoForward) ? transfer.getPermuteToLIDs_dv() : transfer.getPermuteFromLIDs_dv();
790  const_lo_dv_type permuteFromLIDs = (revOp == DoForward) ? transfer.getPermuteFromLIDs_dv() : transfer.getPermuteToLIDs_dv();
791  const_lo_dv_type remoteLIDs = (revOp == DoForward) ? transfer.getRemoteLIDs_dv() : transfer.getExportLIDs_dv();
792  const_lo_dv_type exportLIDs = (revOp == DoForward) ? transfer.getExportLIDs_dv() : transfer.getRemoteLIDs_dv();
793  const bool canTryAliasing = (revOp == DoForward) ? transfer.areRemoteLIDsContiguous() : transfer.areExportLIDsContiguous();
794  // const bool canTryAliasing = false;
795 
796  ProfilingRegion region_dTN(funcName);
797 
798  if (verbose) {
799  std::ostringstream os;
800  os << *prefix << "Input arguments:" << endl
801  << *prefix << " combineMode: " << combineModeToString(CM) << endl
802  << *prefix << " numSameIDs: " << numSameIDs << endl
803  << *prefix << " "
804  << dualViewStatusToString(permuteToLIDs, "permuteToLIDs") << endl
805  << *prefix << " "
806  << dualViewStatusToString(permuteFromLIDs, "permuteFromLIDs") << endl
807  << *prefix << " "
808  << dualViewStatusToString(remoteLIDs, "remoteLIDs") << endl
809  << *prefix << " "
810  << dualViewStatusToString(exportLIDs, "exportLIDs") << endl
811  << *prefix << " revOp: Do" << (revOp == DoReverse ? "Reverse" : "Forward") << endl
812  << *prefix << " commOnHost: " << (commOnHost ? "true" : "false") << endl;
813  std::cerr << os.str();
814  }
815 
816  {
817  ProfilingRegion region_cs("Tpetra::DistObject::beginTransfer::checkSizes");
818  if (verbose) {
819  std::ostringstream os;
820  os << *prefix << "1. checkSizes" << endl;
821  std::cerr << os.str();
822  }
823  const bool checkSizesResult = this->checkSizes(src);
824  TEUCHOS_TEST_FOR_EXCEPTION(!checkSizesResult, std::invalid_argument,
825  "Tpetra::DistObject::doTransfer: checkSizes() indicates that the "
826  "destination object is not a legal target for redistribution from the "
827  "source object. This probably means that they do not have the same "
828  "dimensions. For example, MultiVectors must have the same number of "
829  "rows and columns.");
830  }
831 
832  // The method may return zero even if the implementation actually
833  // does have a constant number of packets per LID. However, if it
834  // returns nonzero, we may use this information to avoid
835  // (re)allocating num{Ex,Im}portPacketsPerLID_. packAndPrepare()
836  // will set this to its final value.
837  //
838  // We only need this if CM != ZERO, but it has to be lifted out of
839  // that scope because there are multiple tests for CM != ZERO.
840  size_t constantNumPackets = this->constantNumberOfPackets();
841  if (verbose) {
842  std::ostringstream os;
843  os << *prefix << "constantNumPackets=" << constantNumPackets << endl;
844  std::cerr << os.str();
845  }
846 
847  // Do we need to do communication?
848  bool needCommunication = true;
849  // We only need to send data if the combine mode is not ZERO.
850  if (CM != ZERO) {
851  // This may be NULL. It will be used below.
852  const this_type* srcDistObj = dynamic_cast<const this_type*>(&src);
853 
854  if (revOp == DoReverse && !this->isDistributed()) {
855  needCommunication = false;
856  }
857  // FIXME (mfh 30 Jun 2013): Checking whether the source object
858  // is distributed requires a cast to DistObject. If it's not a
859  // DistObject, then I'm not quite sure what to do. Perhaps it
860  // would be more appropriate for SrcDistObject to have an
861  // isDistributed() method. For now, I'll just assume that we
862  // need to do communication unless the cast succeeds and the
863  // source is not distributed.
864  else if (revOp == DoForward && srcDistObj != NULL &&
865  !srcDistObj->isDistributed()) {
866  needCommunication = false;
867  }
868  } // if (CM != ZERO)
869  else {
870  needCommunication = false;
871  }
872 
873  // The operations for the transfer can be performed in different
874  // order. The "safe" way is
875  //
876  // - copyAndPermute |
877  // - packAndPrepare |--- beginTransfer
878  // - doPostRecvs |
879  // - doPostSends |
880  //
881  // - doWaitsRecv |
882  // - unpackAndCombine |--- endTransfer
883  // - doWaitsSend |
884 
885  // This is "safe" because the local computation steps
886  // copyAndPermute and packAndPrepare are free to run on host or
887  // device provided that the data is appropriately synced.
888  // Afterwards, all the communication options can run independently
889  // of the computation. This means that there are no constraints in
890  // terms of memory spaces out of which the different steps need to
891  // run.
892 
893  // However, for performance it can be beneficial to overlap
894  // communication and computation, leading to this sequence of
895  // operations:
896  //
897  // - doPostRecvs |
898  // - packAndPrepare |--- beginTransfer
899  // - doPostSends |
900  // - copyAndPermute |
901  //
902  // - doWaitsRecv |
903  // - unpackAndCombine |--- endTransfer
904  // - doWaitsSend |
905  //
906  // Note that this is not the same as overlap of communication and
907  // computation in the sparse matrix-vector product which would involve
908  // performing computation between beginTransfer and endTransfer.
909  //
910  // The second approach has two advantages:
911  // 1) Receives and sends are separated by computation. This
912  // decreases the likelihood of MPI having to allocate temporary
913  // buffers for unexpectedly received messages.
914  // 2) Sends and doWaitsRecv in endTransfer are separated by
915  // copyAndPermute, giving MPI time to make progress.
916  //
917  // The downside of this approach is as follows. The imports view
918  // used for the receives is potentially aliased to a subview of
919  // the target. This means that MPI will modify the target in the
920  // memory space that is determined by GPU awareness
921  // (Behavior::assumeMpiIsGPUAware). Since copyAndPermute will also
922  // be writing to the target at the same time, it will need to
923  // modify target in the same space.
924  //
925  // Given these additional constraints, we currently only enable
926  // the overlapping of communication and computation when constantNumPackets > 0
927  // and Behavior::enableGranularTransfers().
928 
929  const bool overlapTransferSteps = (constantNumPackets != 0) && Behavior::enableGranularTransfers();
930 
931  if (verbose) {
932  std::ostringstream os;
933  os << *prefix << "overlapTransferSteps=" << overlapTransferSteps << endl;
934  std::cerr << os.str();
935  }
936 
937  // Decide whether copyAndPermute needs to be run.
938  const bool thereAreIDsToCopy = (numSameIDs + permuteToLIDs.extent(0) != 0);
939  const bool needCopyAndPermute = (!restrictedMode && thereAreIDsToCopy);
940 
941  if (!overlapTransferSteps) {
943  // copyAndPermute
944 
945  // NOTE (mfh 26 Apr 2016) Chris Baker's implementation understood
946  // that if CM == INSERT || CM == REPLACE, the target object could
947  // be write only. We don't optimize for that here.
948 
949  if (needCopyAndPermute) {
950  // There is at least one GID to copy or permute.
951 
952  if (verbose) {
953  std::ostringstream os;
954  os << *prefix << "2. copyAndPermute" << endl;
955  std::cerr << os.str();
956  }
957  {
958  ProfilingRegion region_cp("Tpetra::DistObject::beginTransfer::copyAndPermute");
959 
960  this->copyAndPermute(src, numSameIDs, permuteToLIDs, permuteFromLIDs, CM);
961  }
962  if (verbose) {
963  std::ostringstream os;
964  os << *prefix << "After copyAndPermute:" << endl
965  << *prefix << " "
966  << dualViewStatusToString(permuteToLIDs, "permuteToLIDs")
967  << endl
968  << *prefix << " "
969  << dualViewStatusToString(permuteFromLIDs, "permuteFromLIDs")
970  << endl;
971  std::cerr << os.str();
972  }
973  }
974 
975  if (!needCommunication) {
976  if (verbose) {
977  std::ostringstream os;
978  os << *prefix << "Comm not needed; skipping" << endl;
979  std::cerr << os.str();
980  }
981  } else {
983  // packAndPrepare
984 
985  if (constantNumPackets == 0) {
986  if (verbose) {
987  std::ostringstream os;
988  os << *prefix << "3. (Re)allocate num{Ex,Im}portPacketsPerLID"
989  << endl;
990  std::cerr << os.str();
991  }
992  // This only reallocates if necessary, that is, if the sizes
993  // don't match.
994  this->reallocArraysForNumPacketsPerLid(exportLIDs.extent(0),
995  remoteLIDs.extent(0));
996  }
997 
998  if (verbose) {
999  std::ostringstream os;
1000  os << *prefix << "4. packAndPrepare: before, "
1001  << dualViewStatusToString(this->exports_, "exports_")
1002  << endl;
1003  std::cerr << os.str();
1004  }
1005 
1006  doPackAndPrepare(src, exportLIDs, constantNumPackets, execution_space());
1007  if (commOnHost) {
1008  this->exports_.sync_host();
1009  } else {
1010  this->exports_.sync_device();
1011  }
1012 
1013  if (verbose) {
1014  std::ostringstream os;
1015  os << *prefix << "5.1. After packAndPrepare, "
1016  << dualViewStatusToString(this->exports_, "exports_")
1017  << endl;
1018  std::cerr << os.str();
1019  }
1020 
1022  // reallocImportsIfNeeded
1023  if (constantNumPackets != 0) {
1024  ProfilingRegion region_reallocImportsIfNeeded("Tpetra::DistObject::beginTransfer::reallocImportsIfNeeded");
1025 
1026  // There are a constant number of packets per element. We
1027  // already know (from the number of "remote" (incoming)
1028  // elements) how many incoming elements we expect, so we can
1029  // resize the buffer accordingly.
1030  const size_t rbufLen = remoteLIDs.extent(0) * constantNumPackets;
1031  reallocImportsIfNeeded(rbufLen, verbose, prefix.get(), canTryAliasing, CM);
1032  }
1033 
1035  // doPostRecvs
1036 
1037  // If only one round of communication is required: post receives.
1038  // If two rounds are required: complete first round and post receives for second round.
1039  if (verbose) {
1040  std::ostringstream os;
1041  os << *prefix << "7.0. "
1042  << (revOp == DoReverse ? "Reverse" : "Forward")
1043  << " mode" << endl;
1044  std::cerr << os.str();
1045  }
1046 
1047  doPostRecvs(distributorPlan, constantNumPackets, commOnHost, prefix, canTryAliasing, CM);
1048 
1050  // doPostSends
1051 
1052  doPostSends(distributorPlan, constantNumPackets, commOnHost, prefix);
1053  } // if ( needCommunication )
1054 
1055  } // if ( ! overlapTransferSteps )
1056  else {
1057  // Overlap local computation with communication
1058 
1059  if (!needCommunication) {
1060  if (verbose) {
1061  std::ostringstream os;
1062  os << *prefix << "Comm not needed; skipping" << endl;
1063  std::cerr << os.str();
1064  }
1065  } else {
1067  // doPostRecvs
1068 
1070  // reallocImportsIfNeeded
1071  if (constantNumPackets != 0) {
1072  ProfilingRegion region_reallocImportsIfNeeded("Tpetra::DistObject::beginTransfer::reallocImportsIfNeeded");
1073 
1074  // There are a constant number of packets per element. We
1075  // already know (from the number of "remote" (incoming)
1076  // elements) how many incoming elements we expect, so we can
1077  // resize the buffer accordingly.
1078  const size_t rbufLen = remoteLIDs.extent(0) * constantNumPackets;
1079  reallocImportsIfNeeded(rbufLen, verbose, prefix.get(), canTryAliasing, CM);
1080  }
1081 
1082  // If only one round of communication is required: post receives.
1083  // If two rounds are required: complete first round and post receives for second round.
1084  if (verbose) {
1085  std::ostringstream os;
1086  os << *prefix << "7.0. "
1087  << (revOp == DoReverse ? "Reverse" : "Forward")
1088  << " mode" << endl;
1089  std::cerr << os.str();
1090  }
1091 
1092  doPostRecvs(distributorPlan, constantNumPackets, commOnHost, prefix, canTryAliasing, CM);
1093 
1095  // packAndPrepare
1096 
1097  if (constantNumPackets == 0) {
1098  if (verbose) {
1099  std::ostringstream os;
1100  os << *prefix << "3. (Re)allocate num{Ex,Im}portPacketsPerLID"
1101  << endl;
1102  std::cerr << os.str();
1103  }
1104  // This only reallocates if necessary, that is, if the sizes
1105  // don't match.
1106  this->reallocArraysForNumPacketsPerLid(exportLIDs.extent(0),
1107  remoteLIDs.extent(0));
1108  }
1109 
1110  if (verbose) {
1111  std::ostringstream os;
1112  os << *prefix << "4. packAndPrepare: before, "
1113  << dualViewStatusToString(this->exports_, "exports_")
1114  << endl;
1115  std::cerr << os.str();
1116  }
1117 
1118  doPackAndPrepare(src, exportLIDs, constantNumPackets, execution_space());
1119 
1120  if (commOnHost) {
1121  this->exports_.sync_host();
1122  } else {
1123  this->exports_.sync_device();
1124  }
1125 
1126  if (verbose) {
1127  std::ostringstream os;
1128  os << *prefix << "5.1. After packAndPrepare, "
1129  << dualViewStatusToString(this->exports_, "exports_")
1130  << endl;
1131  std::cerr << os.str();
1132  }
1133 
1135  // doPostSends
1136 
1137  doPostSends(distributorPlan, constantNumPackets, commOnHost, prefix);
1138 
1139  } // if ( needCommunication )
1140 
1142  // copyAndPermute
1143 
1144  // NOTE (mfh 26 Apr 2016) Chris Baker's implementation understood
1145  // that if CM == INSERT || CM == REPLACE, the target object could
1146  // be write only. We don't optimize for that here.
1147 
1148  if (needCopyAndPermute) {
1149  // There is at least one GID to copy or permute.
1150  if (verbose) {
1151  std::ostringstream os;
1152  os << *prefix << "2. copyAndPermute" << endl;
1153  std::cerr << os.str();
1154  }
1155 
1156  {
1157  ProfilingRegion region_cp("Tpetra::DistObject::beginTransfer::copyAndPermute");
1158 
1159  this->copyAndPermute(src, numSameIDs, permuteToLIDs, permuteFromLIDs, CM);
1160  }
1161 
1162  if (verbose) {
1163  std::ostringstream os;
1164  os << *prefix << "After copyAndPermute:" << endl
1165  << *prefix << " "
1166  << dualViewStatusToString(permuteToLIDs, "permuteToLIDs")
1167  << endl
1168  << *prefix << " "
1169  << dualViewStatusToString(permuteFromLIDs, "permuteFromLIDs")
1170  << endl;
1171  std::cerr << os.str();
1172  }
1173  }
1174  } // if ( overlapTransferSteps )
1175 }
1176 
// Second half of a transfer. From the body below: waits for the receives of
// the selected Distributor plan, unpacks/combines the received data into
// *this (doUnpackAndCombine), and then waits for the sends.
//
// Parameters, as used in this body:
//  - src: source object; used for the verbose type output, the debug-mode
//    Map checks, and (via dynamic_cast to this_type) to decide whether
//    communication is needed in forward mode.
//  - transfer: supplies the Distributor, the source/target Maps, and the
//    LID lists.
//  - modeString: only used when building exception messages.
//  - revOp: DoForward or DoReverse; selects the plan, which Maps are
//    checked, and which LID list acts as the "remote" LIDs.
//  - CM: combine mode; if CM == ZERO, no waits and no unpack are performed.
//  - restrictedMode: in debug mode, switches the target-side Map check from
//    isSameAs to isLocallyFitted and forbids permutes.
//
// Debug-mode violations are reported through TEUCHOS_TEST_FOR_EXCEPTION
// with std::invalid_argument.
1177 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1179  endTransfer(const SrcDistObject& src,
1180  const ::Tpetra::Details::Transfer<local_ordinal_type, global_ordinal_type, node_type>& transfer,
1181  const char modeString[],
1182  const ReverseOption revOp,
1183  const CombineMode CM,
1184  bool restrictedMode) {
1185  using Details::Behavior;
  // NOTE(review): the Kokkos::Compat using-declarations below are not
  // referenced anywhere in the visible body of this function.
1188  using Kokkos::Compat::create_const_view;
1189  using Kokkos::Compat::getArrayView;
1190  using Kokkos::Compat::getConstArrayView;
1191  using Kokkos::Compat::getKokkosViewDeepCopy;
1192  using std::endl;
1195 
  // In this function, commOnHost only selects the profiling-region name and
  // appears in verbose output.
1196  const bool commOnHost = !Behavior::assumeMpiIsGPUAware();
1197  const char funcNameHost[] = "Tpetra::DistObject::endTransfer[Host]";
1198  const char funcNameDevice[] = "Tpetra::DistObject::endTransfer[Device]";
1199  const char* funcName = commOnHost ? funcNameHost : funcNameDevice;
1200  ProfilingRegion region_doTransfer(funcName);
1201  const bool verbose = Behavior::verbose("DistObject");
  // prefix stays null unless verbose is true; every dereference below is
  // inside an "if (verbose)" block.
1202  std::shared_ptr<std::string> prefix;
1203  if (verbose) {
1204  std::ostringstream os;
  // Note: the verbose prefix is labeled "doTransfer", not "endTransfer".
1205  prefix = this->createPrefix("DistObject", "doTransfer");
1206  os << *prefix << "Source type: " << Teuchos::typeName(src)
1207  << ", Target type: " << Teuchos::typeName(*this) << endl;
1208  std::cerr << os.str();
1209  }
1210 
1211  // "Restricted Mode" does two things:
1212  // 1) Skips copyAndPermute
1213  // 2) Allows the "target" Map of the transfer to be a subset of
1214  // the Map of *this, in a "locallyFitted" sense.
1215  //
1216  // This cannot be used if #2 is not true, OR there are permutes.
1217  // Source Maps still need to match
1218 
1219  // mfh 18 Oct 2017: Set TPETRA_DEBUG to true to enable extra debug
1220  // checks. These may communicate more.
1221  const bool debug = Behavior::debug("DistObject");
1222  if (debug) {
  // Check the Map of *this against the transfer's Map on the receiving
  // side: target Map for forward mode, source Map for reverse mode.
1223  if (!restrictedMode && revOp == DoForward) {
1224  const bool myMapSameAsTransferTgtMap =
1225  this->getMap()->isSameAs(*(transfer.getTargetMap()));
1226  TEUCHOS_TEST_FOR_EXCEPTION(!myMapSameAsTransferTgtMap, std::invalid_argument,
1227  "Tpetra::DistObject::" << modeString << ": For forward-mode "
1228  "communication, the target DistObject's Map must be the same "
1229  "(in the sense of Tpetra::Map::isSameAs) as the input "
1230  "Export/Import object's target Map.");
1231  } else if (!restrictedMode && revOp == DoReverse) {
1232  const bool myMapSameAsTransferSrcMap =
1233  this->getMap()->isSameAs(*(transfer.getSourceMap()));
1234  TEUCHOS_TEST_FOR_EXCEPTION(!myMapSameAsTransferSrcMap, std::invalid_argument,
1235  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
1236  "communication, the target DistObject's Map must be the same "
1237  "(in the sense of Tpetra::Map::isSameAs) as the input "
1238  "Export/Import object's source Map.");
1239  } else if (restrictedMode && revOp == DoForward) {
1240  const bool myMapLocallyFittedTransferTgtMap =
1241  this->getMap()->isLocallyFitted(*(transfer.getTargetMap()));
1242  TEUCHOS_TEST_FOR_EXCEPTION(!myMapLocallyFittedTransferTgtMap, std::invalid_argument,
1243  "Tpetra::DistObject::" << modeString << ": For forward-mode "
1244  "communication using restricted mode, Export/Import object's "
1245  "target Map must be locally fitted (in the sense of "
1246  "Tpetra::Map::isLocallyFitted) to target DistObject's Map.");
1247  } else { // if (restrictedMode && revOp == DoReverse)
1248  const bool myMapLocallyFittedTransferSrcMap =
1249  this->getMap()->isLocallyFitted(*(transfer.getSourceMap()));
1250  TEUCHOS_TEST_FOR_EXCEPTION(!myMapLocallyFittedTransferSrcMap, std::invalid_argument,
1251  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
1252  "communication using restricted mode, Export/Import object's "
1253  "source Map must be locally fitted (in the sense of "
1254  "Tpetra::Map::isLocallyFitted) to target DistObject's Map.");
1255  }
1256 
1257  // SrcDistObject need not even _have_ Maps. However, if the
1258  // source object is a DistObject, it has a Map, and we may
1259  // compare that Map with the Transfer's Maps.
1260  const this_type* srcDistObj = dynamic_cast<const this_type*>(&src);
1261  if (srcDistObj != nullptr) {
1262  if (revOp == DoForward) {
1263  const bool srcMapSameAsImportSrcMap =
1264  srcDistObj->getMap()->isSameAs(*(transfer.getSourceMap()));
1265  TEUCHOS_TEST_FOR_EXCEPTION(!srcMapSameAsImportSrcMap, std::invalid_argument,
1266  "Tpetra::DistObject::" << modeString << ": For forward-mode "
1267  "communication, the source DistObject's Map must be the same "
1268  "as the input Export/Import object's source Map.");
1269  } else { // revOp == DoReverse
1270  const bool srcMapSameAsImportTgtMap =
1271  srcDistObj->getMap()->isSameAs(*(transfer.getTargetMap()));
1272  TEUCHOS_TEST_FOR_EXCEPTION(!srcMapSameAsImportTgtMap, std::invalid_argument,
1273  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
1274  "communication, the source DistObject's Map must be the same "
1275  "as the input Export/Import object's target Map.");
1276  }
1277  }
1278  }
1279 
  // Forward mode uses the Distributor's own plan; reverse mode uses the
  // reverse plan obtained from it.
1280  Distributor& distor = transfer.getDistributor();
1281  const Details::DistributorPlan& distributorPlan = (revOp == DoForward) ? distor.getPlan() : *distor.getPlan().getReversePlan();
1282 
  // This check is only active when debug is true.
1283  TEUCHOS_TEST_FOR_EXCEPTION(debug && restrictedMode &&
1284  (transfer.getPermuteToLIDs_dv().extent(0) != 0 ||
1285  transfer.getPermuteFromLIDs_dv().extent(0) != 0),
1286  std::invalid_argument,
1287  "Tpetra::DistObject::" << modeString << ": Transfer object "
1288  "cannot have permutes in restricted mode.");
1289 
1290  // Do we need all communication buffers to live on host?
1291  if (verbose) {
1292  std::ostringstream os;
1293  os << *prefix << "doTransfer: Use new interface; "
1294  "commOnHost="
1295  << (commOnHost ? "true" : "false") << endl;
1296  std::cerr << os.str();
1297  }
1298 
  // In reverse mode, the transfer's export LIDs act as the remote
  // (incoming) LIDs.
1299  using const_lo_dv_type =
1300  Kokkos::DualView<const local_ordinal_type*, buffer_device_type>;
1301  const_lo_dv_type remoteLIDs = (revOp == DoForward) ? transfer.getRemoteLIDs_dv() : transfer.getExportLIDs_dv();
1302 
1303  size_t constantNumPackets = this->constantNumberOfPackets();
1304 
1305  // We only need to send data if the combine mode is not ZERO.
1306  if (CM != ZERO) {
1307  // Do we need to do communication (via doWaitsRecv and doWaitsSend)?
1308  bool needCommunication = true;
1309 
1310  // This may be NULL. It will be used below.
  // Same cast as in the debug-only block above; that variable is scoped
  // to the "if (debug)" branch.
1311  const this_type* srcDistObj = dynamic_cast<const this_type*>(&src);
1312 
1313  if (revOp == DoReverse && !this->isDistributed()) {
1314  needCommunication = false;
1315  }
1316  // FIXME (mfh 30 Jun 2013): Checking whether the source object
1317  // is distributed requires a cast to DistObject. If it's not a
1318  // DistObject, then I'm not quite sure what to do. Perhaps it
1319  // would be more appropriate for SrcDistObject to have an
1320  // isDistributed() method. For now, I'll just assume that we
1321  // need to do communication unless the cast succeeds and the
1322  // source is not distributed.
1323  else if (revOp == DoForward && srcDistObj != NULL &&
1324  !srcDistObj->isDistributed()) {
1325  needCommunication = false;
1326  }
1327 
1328  if (!needCommunication) {
1329  if (verbose) {
1330  std::ostringstream os;
1331  os << *prefix << "Comm not needed; skipping" << endl;
1332  std::cerr << os.str();
1333  }
1334  } else {
  // Order matters here: wait for receives, then unpack, then wait for
  // sends.
1335  distributorActor_.doWaitsRecv(distributorPlan);
1336 
1337  if (verbose) {
1338  std::ostringstream os;
1339  os << *prefix << "8. unpackAndCombine - remoteLIDs " << remoteLIDs.extent(0) << ", constantNumPackets " << constantNumPackets << endl;
1340  std::cerr << os.str();
1341  }
1342  doUnpackAndCombine(remoteLIDs, constantNumPackets, CM, execution_space());
1343 
1344  distributorActor_.doWaitsSend(distributorPlan);
1345  } // if (needCommunication)
1346  } // if (CM != ZERO)
1347 
  // Two separate completion messages are printed in verbose mode.
1348  if (verbose) {
1349  std::ostringstream os;
1350  os << *prefix << "9. Done!" << endl;
1351  std::cerr << os.str();
1352  }
1353 
1354  if (verbose) {
1355  std::ostringstream os;
1356  os << *prefix << "Tpetra::DistObject::doTransfer: Done!" << endl;
1357  std::cerr << os.str();
1358  }
1359 }
1360 
1361 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1362 void DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1363  doPostRecvs(const Details::DistributorPlan& distributorPlan,
1364  size_t constantNumPackets,
1365  bool commOnHost,
1366  std::shared_ptr<std::string> prefix,
1367  const bool canTryAliasing,
1368  const CombineMode CM) {
1369  using Details::ProfilingRegion;
1370  using Kokkos::Compat::create_const_view;
1371  using std::endl;
1374 
1375  const char funcNameHost[] = "Tpetra::DistObject::doPostRecvs[Host]";
1376  const char funcNameDevice[] = "Tpetra::DistObject::doPostRecvs[Device]";
1377  const char* funcName = commOnHost ? funcNameHost : funcNameDevice;
1378  ProfilingRegion region_dpr(funcName);
1379 
1380  const bool verbose = Details::Behavior::verbose("DistObject");
1381 
1382  if (constantNumPackets == 0) { // variable num packets per LID
1383  if (verbose) {
1384  std::ostringstream os;
1385  os << *prefix << "7.1. Variable # packets / LID: first comm "
1386  << "(commOnHost = " << (commOnHost ? "true" : "false") << ")"
1387  << endl;
1388  std::cerr << os.str();
1389  }
1390  size_t totalImportPackets = 0;
1391  if (commOnHost) {
1392  if (this->numExportPacketsPerLID_.need_sync_host()) {
1393  this->numExportPacketsPerLID_.sync_host();
1394  }
1395  if (this->numImportPacketsPerLID_.need_sync_host()) {
1396  this->numImportPacketsPerLID_.sync_host();
1397  }
1398  this->numImportPacketsPerLID_.modify_host(); // out arg
1399  auto numExp_h =
1400  create_const_view(this->numExportPacketsPerLID_.view_host());
1401  auto numImp_h = this->numImportPacketsPerLID_.view_host();
1402 
1403  // MPI communication happens here.
1404  if (verbose) {
1405  std::ostringstream os;
1406  os << *prefix << "Call doPostsAndWaits"
1407  << endl;
1408  std::cerr << os.str();
1409  }
1410  distributorActor_.doPostsAndWaits(distributorPlan, numExp_h, 1, numImp_h);
1411 
1412  if (verbose) {
1413  std::ostringstream os;
1414  os << *prefix << "Count totalImportPackets" << std::endl;
1415  std::cerr << os.str();
1416  }
1417  using the_dev_type = typename decltype(numImp_h)::device_type;
1418  totalImportPackets = countTotalImportPackets<the_dev_type>(numImp_h);
1419  } else { // ! commOnHost
1420  this->numExportPacketsPerLID_.sync_device();
1421  this->numImportPacketsPerLID_.sync_device();
1422  this->numImportPacketsPerLID_.modify_device(); // out arg
1423  auto numExp_d = create_const_view(this->numExportPacketsPerLID_.view_device());
1424  auto numImp_d = this->numImportPacketsPerLID_.view_device();
1425 
1426  // MPI communication happens here.
1427  if (verbose) {
1428  std::ostringstream os;
1429  os << *prefix << "Call doPostsAndWaits"
1430  << endl;
1431  std::cerr << os.str();
1432  }
1433 
1434  distributorActor_.doPostsAndWaits(distributorPlan, numExp_d, 1, numImp_d);
1435 
1436  if (verbose) {
1437  std::ostringstream os;
1438  os << *prefix << "Count totalImportPackets" << std::endl;
1439  std::cerr << os.str();
1440  }
1441  using the_dev_type = typename decltype(numImp_d)::device_type;
1442  totalImportPackets = countTotalImportPackets<the_dev_type>(numImp_d);
1443  }
1444 
1445  if (verbose) {
1446  std::ostringstream os;
1447  os << *prefix << "totalImportPackets=" << totalImportPackets << endl;
1448  std::cerr << os.str();
1449  }
1450  this->reallocImportsIfNeeded(totalImportPackets, verbose,
1451  prefix.get(), canTryAliasing, CM);
1452  if (verbose) {
1453  std::ostringstream os;
1454  os << *prefix << "7.3. Second comm" << std::endl;
1455  std::cerr << os.str();
1456  }
1457 
1458  // mfh 04 Feb 2019: Distributor expects the "num packets per
1459  // LID" arrays on host, so that it can issue MPI sends and
1460  // receives correctly.
1461  this->numImportPacketsPerLID_.sync_host();
1462 
1463  // NOTE (mfh 25 Apr 2016, 01 Aug 2017) doPostsAndWaits and
1464  // doReversePostsAndWaits currently want
1465  // numExportPacketsPerLID and numImportPacketsPerLID as
1466  // Teuchos::ArrayView, rather than as Kokkos::View.
1467  //
1468  // NOTE (mfh 04 Feb 2019) This does NOT copy from host to
1469  // device. The above syncs might.
1470  auto numImportPacketsPerLID_av =
1471  getArrayViewFromDualView(this->numImportPacketsPerLID_);
1472 
1473  // imports_ is for output only, so we don't need to sync it
1474  // before marking it as modified. However, in order to
1475  // prevent spurious debug-mode errors (e.g., "modified on
1476  // both device and host"), we first need to clear its
1477  // "modified" flags.
1478  this->imports_.clear_sync_state();
1479 
1480  if (verbose) {
1481  std::ostringstream os;
1482  os << *prefix << "Comm on "
1483  << (commOnHost ? "host" : "device")
1484  << "; call doPostRecvs" << endl;
1485  std::cerr << os.str();
1486  }
1487 
1488  if (commOnHost) {
1489  this->imports_.modify_host();
1490  distributorActor_.doPostRecvs(distributorPlan,
1491  this->imports_.view_host(),
1492  numImportPacketsPerLID_av);
1493  } else { // pack on device
1494  this->imports_.modify_device();
1495  distributorActor_.doPostRecvs(distributorPlan,
1496  this->imports_.view_device(),
1497  numImportPacketsPerLID_av);
1498  }
1499  } else { // constant number of packets per LID
1500  if (verbose) {
1501  std::ostringstream os;
1502  os << *prefix << "7.1. Const # packets per LID: " << endl
1503  << *prefix << " "
1504  << dualViewStatusToString(this->exports_, "exports_")
1505  << endl
1506  << *prefix << " "
1507  << dualViewStatusToString(this->exports_, "imports_")
1508  << endl;
1509  std::cerr << os.str();
1510  }
1511  // imports_ is for output only, so we don't need to sync it
1512  // before marking it as modified. However, in order to
1513  // prevent spurious debug-mode errors (e.g., "modified on
1514  // both device and host"), we first need to clear its
1515  // "modified" flags.
1516  this->imports_.clear_sync_state();
1517 
1518  if (verbose) {
1519  std::ostringstream os;
1520  os << *prefix << "7.2. Comm on "
1521  << (commOnHost ? "host" : "device")
1522  << "; call doPostRecvs" << endl;
1523  std::cerr << os.str();
1524  }
1525  if (commOnHost) {
1526  this->imports_.modify_host();
1527  distributorActor_.doPostRecvs(distributorPlan,
1528  constantNumPackets,
1529  this->imports_.view_host());
1530  } else { // pack on device
1531  this->imports_.modify_device();
1532  distributorActor_.doPostRecvs(distributorPlan,
1533  constantNumPackets,
1534  this->imports_.view_device());
1535  } // commOnHost
1536  } // constant or variable num packets per LID
1537 }
1538 
1539 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1540 void DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1541  doPostSends(const Details::DistributorPlan& distributorPlan,
1542  size_t constantNumPackets,
1543  bool commOnHost,
1544  std::shared_ptr<std::string> prefix) {
1545  using Details::ProfilingRegion;
1546  using Kokkos::Compat::create_const_view;
1547  using std::endl;
1549 
1550  const char funcNameHost[] = "Tpetra::DistObject::doPostSends[Host]";
1551  const char funcNameDevice[] = "Tpetra::DistObject::doPostSends[Device]";
1552  const char* funcName = commOnHost ? funcNameHost : funcNameDevice;
1553  ProfilingRegion region_dps(funcName);
1554 
1555  const bool verbose = Details::Behavior::verbose("DistObject");
1556  if (verbose) {
1557  std::ostringstream os;
1558  os << *prefix << "Comm on "
1559  << (commOnHost ? "host" : "device")
1560  << "; call doPostSends" << endl;
1561  std::cerr << os.str();
1562  }
1563 
1564  if (constantNumPackets == 0) { // variable num packets per LID
1565  // mfh 04 Feb 2019: Distributor expects the "num packets per
1566  // LID" arrays on host, so that it can issue MPI sends and
1567  // receives correctly.
1568  this->numExportPacketsPerLID_.sync_host();
1569  this->numImportPacketsPerLID_.sync_host();
1570 
1571  // NOTE (mfh 25 Apr 2016, 01 Aug 2017) doPostsAndWaits and
1572  // doReversePostsAndWaits currently want
1573  // numExportPacketsPerLID and numImportPacketsPerLID as
1574  // Teuchos::ArrayView, rather than as Kokkos::View.
1575  //
1576  // NOTE (mfh 04 Feb 2019) This does NOT copy from host to
1577  // device. The above syncs might.
1578  auto numExportPacketsPerLID_av =
1579  getArrayViewFromDualView(this->numExportPacketsPerLID_);
1580  auto numImportPacketsPerLID_av =
1581  getArrayViewFromDualView(this->numImportPacketsPerLID_);
1582 
1583  if (commOnHost) {
1584  distributorActor_.doPostSends(distributorPlan,
1585  create_const_view(this->exports_.view_host()),
1586  numExportPacketsPerLID_av,
1587  this->imports_.view_host(),
1588  numImportPacketsPerLID_av);
1589  } else { // pack on device
1590  // We need to guarantee that packAndPrepare is done before we send.
1591  Kokkos::fence("DistObject::doPostSends-1"); // for UVM
1592  distributorActor_.doPostSends(distributorPlan,
1593  create_const_view(this->exports_.view_device()),
1594  numExportPacketsPerLID_av,
1595  this->imports_.view_device(),
1596  numImportPacketsPerLID_av);
1597  }
1598  } else { // constant number of packets per LID
1599  if (commOnHost) {
1600  distributorActor_.doPostSends(distributorPlan,
1601  create_const_view(this->exports_.view_host()),
1602  constantNumPackets,
1603  this->imports_.view_host());
1604  } else { // pack on device
1605  // We need to guarantee that packAndPrepare is done before we send.
1606  Kokkos::fence("DistObject::doPostSends-2"); // for UVM
1607  distributorActor_.doPostSends(distributorPlan,
1608  create_const_view(this->exports_.view_device()),
1609  constantNumPackets,
1610  this->imports_.view_device());
1611  } // commOnHost
1612  } // constant or variable num packets per LID
1613 }
1614 
1615 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1616 void DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1617  doPackAndPrepare(const SrcDistObject& src,
1618  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1619  size_t& constantNumPackets,
1620  const execution_space& space) {
1621  using Details::ProfilingRegion;
1622  using std::endl;
1623  const bool debug = Details::Behavior::debug("DistObject");
1624 
1625  ProfilingRegion region_pp("Tpetra::DistObject::doPackAndPrepare");
1626 
1627  // Ask the source to pack data. Also ask it whether there are
1628  // a constant number of packets per element
1629  // (constantNumPackets is an output argument). If there are,
1630  // constantNumPackets will come back nonzero. Otherwise, the
1631  // source will fill the numExportPacketsPerLID_ array.
1632 
1633  // FIXME (mfh 18 Oct 2017) if (! commOnHost), sync to device?
1634  // Alternately, make packAndPrepare take a "commOnHost"
1635  // argument to tell it where to leave the data?
1636  //
1637  // NOTE (mfh 04 Feb 2019) Subclasses of DistObject should have
1638  // the freedom to pack and unpack either on host or device.
1639  // We should prefer sync'ing only on demand. Thus, we can
1640  // answer the above question: packAndPrepare should not
1641  // take a commOnHost argument, and doTransferNew should sync
1642  // where needed, if needed.
1643  if (debug) {
1644  std::ostringstream lclErrStrm;
1645  bool lclSuccess = false;
1646  try {
1647  this->packAndPrepare(src, exportLIDs, this->exports_,
1648  this->numExportPacketsPerLID_,
1649  constantNumPackets, space);
1650  lclSuccess = true;
1651  } catch (std::exception& e) {
1652  lclErrStrm << "packAndPrepare threw an exception: "
1653  << endl
1654  << e.what();
1655  } catch (...) {
1656  lclErrStrm << "packAndPrepare threw an exception "
1657  "not a subclass of std::exception.";
1658  }
1659  const char gblErrMsgHeader[] =
1660  "Tpetra::DistObject "
1661  "threw an exception in packAndPrepare on "
1662  "one or more processes in the DistObject's communicator.";
1663  auto comm = getMap()->getComm();
1664  Details::checkGlobalError(std::cerr, lclSuccess,
1665  lclErrStrm.str().c_str(),
1666  gblErrMsgHeader, *comm);
1667  } else {
1668  this->packAndPrepare(src, exportLIDs, this->exports_,
1669  this->numExportPacketsPerLID_,
1670  constantNumPackets, space);
1671  }
1672 }
1673 
1674 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1675 void DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1676  doUnpackAndCombine(const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& remoteLIDs,
1677  size_t constantNumPackets,
1678  CombineMode CM,
1679  const execution_space& space) {
1680  using Details::ProfilingRegion;
1681  using std::endl;
1682  const bool debug = Details::Behavior::debug("DistObject");
1683 
1684  ProfilingRegion region_uc("Tpetra::DistObject::doUnpackAndCombine");
1685 
1686  if (debug) {
1687  std::ostringstream lclErrStrm;
1688  bool lclSuccess = false;
1689  try {
1690  this->unpackAndCombine(remoteLIDs, this->imports_,
1691  this->numImportPacketsPerLID_,
1692  constantNumPackets, CM, space);
1693  lclSuccess = true;
1694  } catch (std::exception& e) {
1695  lclErrStrm << "doUnpackAndCombine threw an exception: "
1696  << endl
1697  << e.what();
1698  } catch (...) {
1699  lclErrStrm << "doUnpackAndCombine threw an exception "
1700  "not a subclass of std::exception.";
1701  }
1702  const char gblErrMsgHeader[] =
1703  "Tpetra::DistObject "
1704  "threw an exception in unpackAndCombine on "
1705  "one or more processes in the DistObject's communicator.";
1706  auto comm = getMap()->getComm();
1707  Details::checkGlobalError(std::cerr, lclSuccess,
1708  lclErrStrm.str().c_str(),
1709  gblErrMsgHeader, *comm);
1710  } else {
1711  this->unpackAndCombine(remoteLIDs, this->imports_,
1712  this->numImportPacketsPerLID_,
1713  constantNumPackets, CM, space);
1714  }
1715 }
1716 
// Empty (no-op) default implementation: the body is {} and no argument is used.
// NOTE(review): presumably DistObject::copyAndPermute without an execution
// space argument; the declarator lines (1718-1719) and the DualView closing
// lines (1723, 1726) are absent from this listing -- confirm against the
// original file before editing.
1717 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1720  const size_t,
1721  const Kokkos::DualView<
1722  const local_ordinal_type*,
1724  const Kokkos::DualView<
1725  const local_ordinal_type*,
1727  const CombineMode CM) {}
1728 
// Fallback for the overload that takes an execution space instance: fence
// `space`, run the five-argument copyAndPermute, then fence a
// default-constructed execution_space.
// NOTE(review): the declarator line (1730) is absent from this listing;
// the fence labels and the forwarded call indicate DistObject::copyAndPermute.
1729 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1731  const SrcDistObject& source, const size_t numSameIDs,
1732  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
1733  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
1734  const CombineMode CM, const execution_space& space) {
1735  /*
1736  This is called if the derived class doesn't know how to copy and permute in
1737  an arbitrary execution space instance, but it was asked to anyway.
1738  Provide a safe illusion by actually doing the work in the default instance,
1739  and syncing the default instance with the provided instance.
1740  The caller expects
1741  1. any work in the provided instance to complete before this.
1742  2. This to complete before any following work in the provided instance.
1743  */
1744 
  // Expectation 1: wait for prior work in the provided instance.
1745  space.fence("Tpetra::DistObject::copyAndPermute-1"); // TODO: Tpetra::Details::Spaces::exec_space_wait
1746  copyAndPermute(source, numSameIDs, permuteToLIDs, permuteFromLIDs,
1747  CM); // default instance
  // Expectation 2: wait for the default instance before returning.
1748  execution_space().fence("Tpetra::DistObject::copyAndPermute-2"); // TODO:
1749  // Tpetra::Details::Spaces::exec_space_wait
1750 }
1751 
// Empty (no-op) default implementation: all visible parameters are unnamed and
// the body is {}.
// NOTE(review): presumably DistObject::packAndPrepare without an execution
// space argument; the declarator lines (1753-1754) and the DualView closing
// lines (1757, 1760, 1763) are absent from this listing -- confirm against the
// original file before editing.
1752 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1755  const Kokkos::DualView<
1756  const local_ordinal_type*,
1758  Kokkos::DualView<
1759  packet_type*,
1761  Kokkos::DualView<
1762  size_t*,
1764  size_t&) {}
1765 
// Fallback for the overload that takes an execution space instance: fence
// `space`, call the five-argument packAndPrepare with the same source,
// exportLIDs, exports, numPacketsPerLID and constantNumPackets, then fence a
// default-constructed execution_space.
// NOTE(review): the declarator line (1767) is absent from this listing;
// the fence labels and the forwarded call indicate DistObject::packAndPrepare.
1766 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1768  const SrcDistObject& source,
1769  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
1770  Kokkos::DualView<packet_type*, buffer_device_type>& exports,
1771  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
1772  size_t& constantNumPackets, const execution_space& space) {
1773  /*
1774  This is called if the derived class doesn't know how to pack and prepare in
1775  an arbitrary execution space instance, but it was asked to anyway.
1776  Provide a safe illusion by actually doing the work in the default instance,
1777  and syncing the default instance with the provided instance.
1778 
1779  The caller expects
1780  1. any work in the provided instance to complete before this.
1781  2. This to complete before any following work in the provided instance.
1782  */
1783 
1784  // wait for any work from prior operations in the provided instance to
1785  // complete
1786  space.fence("Tpetra::DistObject::packAndPrepare-1"); // TODO: Details::Spaces::exec_space_wait
1787 
1788  // pack and prepare in the default instance.
1789  packAndPrepare(source, exportLIDs, exports, numPacketsPerLID,
1790  constantNumPackets); // default instance
1791 
1792  // wait for the default instance to complete before returning, so any
1793  // following work inserted into the provided instance will be done after this
1794  execution_space().fence("Tpetra::DistObject::packAndPrepare-2"); // TODO: Details::Spaces::exec_space_wait
1795 }
1796 
// Empty (no-op) default implementation of unpackAndCombine without an
// execution space argument: every parameter name is commented out and the
// body is {}.
// NOTE(review): the line between the template header and the function name
// (1798, presumably "void DistObject<...>::") is absent from this listing.
1797 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1799  unpackAndCombine(const Kokkos::DualView<
1800  const local_ordinal_type*,
1801  buffer_device_type>& /* importLIDs */,
1802  Kokkos::DualView<
1803  packet_type*,
1804  buffer_device_type> /* imports */,
1805  Kokkos::DualView<
1806  size_t*,
1807  buffer_device_type> /* numPacketsPerLID */,
1808  const size_t /* constantNumPackets */,
1809  const CombineMode /* combineMode */) {}
1810 
// Fallback for the overload that takes an execution space instance: fence
// `space`, call the five-argument unpackAndCombine with the same importLIDs,
// imports, numPacketsPerLID, constantNumPackets and combineMode, then fence a
// default-constructed execution_space.
// NOTE(review): the declarator line (1812) is absent from this listing;
// the fence labels and the forwarded call indicate DistObject::unpackAndCombine.
1811 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1813  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
1814  Kokkos::DualView<packet_type*, buffer_device_type> imports,
1815  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
1816  const size_t constantNumPackets, const CombineMode combineMode,
1817  const execution_space& space) {
1818  // Wait for any work in the provided space to complete
1819  space.fence("Tpetra::DistObject::unpackAndCombine-1"); // TODO: Details::Spaces::exec_space_wait(execution_space(),
1820  // space);
1821  unpackAndCombine(importLIDs, imports, numPacketsPerLID, constantNumPackets,
1822  combineMode); // default instance
1823  // wait for unpack to finish in the default instance, since the caller
1824  // may be expecting sequential semantics in the `space` instance
1825  execution_space().fence("Tpetra::DistObject::unpackAndCombine-2"); // TODO: Details::Spaces::exec_space_wait(space,
1826  // execution_space());
1827 }
1828 
// Writes this object to `os` by wrapping the stream in a Teuchos::FancyOStream
// and calling describe() with Teuchos::VERB_DEFAULT.
// NOTE(review): the declarator line (1830) is absent from this listing;
// presumably this is DistObject::print -- confirm against the original file.
1829 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1831  std::ostream& os) const {
1832  using std::endl; // NOTE(review): endl is not used in this body
1833  using Teuchos::FancyOStream;
1834  using Teuchos::getFancyOStream;
1835  using Teuchos::RCP;
1836  using Teuchos::rcpFromRef;
1837 
  // rcpFromRef wraps the caller's stream (presumably without taking ownership
  // -- confirm in the Teuchos docs).
1838  RCP<FancyOStream> out = getFancyOStream(rcpFromRef(os));
1839  this->describe(*out, Teuchos::VERB_DEFAULT);
1840 }
1841 
// Returns the string produced by Details::createPrefix for the given class
// and method names, using the communicator of this object's Map.
// NOTE(review): the declarator line (1844) is absent from this listing;
// presumably this is DistObject::createPrefix -- confirm against the original.
1842 template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1843 std::unique_ptr<std::string>
1845  const char className[], const char methodName[]) const {
1846  auto map = this->getMap();
  // A null Map yields a null communicator, so getRawPtr() below passes a null
  // pointer to Details::createPrefix in that case.
1847  auto comm = map.is_null() ? Teuchos::null : map->getComm();
1848  return Details::createPrefix(comm.getRawPtr(), className, methodName);
1849 }
1850 
// Nonmember helper: calls input->removeEmptyProcessesInPlace(newMap) and then
// resets `input` to Teuchos::null when newMap is null.
// `input` is dereferenced unconditionally, so it must be non-null on entry.
// NOTE(review): the declarator line (1852) is absent from this listing;
// the Doxygen index names this function removeEmptyProcessesInPlace.
1851 template <class DistObjectType>
1853  Teuchos::RCP<DistObjectType>& input,
1854  const Teuchos::RCP<const Map<typename DistObjectType::local_ordinal_type,
1855  typename DistObjectType::global_ordinal_type,
1856  typename DistObjectType::node_type>>& newMap) {
1857  input->removeEmptyProcessesInPlace(newMap);
1858  if (newMap.is_null()) { // my process is excluded
1859  input = Teuchos::null;
1860  }
1861 }
1862 
1863 template <class DistObjectType>
1864 void removeEmptyProcessesInPlace(Teuchos::RCP<DistObjectType>& input) {
1865  auto newMap = input->getMap()->removeEmptyProcesses();
1866  removeEmptyProcessesInPlace<DistObjectType>(input, newMap);
1867 }
1868 
1869 // Explicit instantiation macro for general DistObject.
// Expands to an explicit instantiation of DistObject with SCALAR as the
// Packet type; the trailing semicolon is part of the expansion.
1870 #define TPETRA_DISTOBJECT_INSTANT(SCALAR, LO, GO, NODE) \
1871  template class DistObject<SCALAR, LO, GO, NODE>;
1872 
1873 // Explicit instantiation macro for DistObject<char, ...>.
1874 // The "SLGN" stuff above doesn't work for Packet=char.
// Takes only (LO, GO, NODE) because the Packet type is fixed to char.
1875 #define TPETRA_DISTOBJECT_INSTANT_CHAR(LO, GO, NODE) \
1876  template class DistObject<char, LO, GO, NODE>;
1877 
1878 } // namespace Tpetra
1879 
1880 #endif // TPETRA_DISTOBJECT_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
const Details::DistributorPlan & getPlan() const
Get this Distributor's DistributorPlan.
virtual void copyAndPermute(const SrcDistObject &source, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM)
Perform copies and permutations that are local to the calling (MPI) process.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void doImport(const SrcDistObject &source, const Import< LocalOrdinal, GlobalOrdinal, Node > &importer, const CombineMode CM, const bool restrictedMode=false)
Import data into this object using an Import object ("forward mode").
typename::Kokkos::ArithTraits< Packet >::val_type packet_type
The type of each datum being sent or received in an Import or Export.
void print(std::ostream &os) const
Print this object to the given output stream.
virtual bool reallocArraysForNumPacketsPerLid(const size_t numExportLIDs, const size_t numImportLIDs)
Reallocate numExportPacketsPerLID_ and/or numImportPacketsPerLID_, if necessary.
bool isDistributed() const
Whether this is a globally distributed object.
void removeEmptyProcessesInPlace(Teuchos::RCP< DistObjectType > &input, const Teuchos::RCP< const Map< typename DistObjectType::local_ordinal_type, typename DistObjectType::global_ordinal_type, typename DistObjectType::node_type > > &newMap)
Remove processes which contain no elements in this object's Map.
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode)
Perform any unpacking and combining after communication.
static bool debug()
Whether Tpetra is in debug mode.
virtual void doTransfer(const SrcDistObject &src, const ::Tpetra::Details::Transfer< local_ordinal_type, global_ordinal_type, node_type > &transfer, const char modeString[], const ReverseOption revOp, const CombineMode CM, const bool restrictedMode)
Redistribute data across (MPI) processes.
void beginTransfer(const SrcDistObject &src, const ::Tpetra::Details::Transfer< local_ordinal_type, global_ordinal_type, node_type > &transfer, const char modeString[], const ReverseOption revOp, const CombineMode CM, const bool restrictedMode)
Implementation detail of doTransfer.
typename device_type::execution_space execution_space
The Kokkos execution space.
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
virtual void packAndPrepare(const SrcDistObject &source, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets)
Pack data and metadata for communication (sends).
Sets up and executes a communication plan for a Tpetra DistObject.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
Abstract base class for objects that can be the source of an Import or Export operation.
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
LocalOrdinal local_ordinal_type
The type of local indices.
Replace old values with zero.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
ReverseOption
Whether the data transfer should be performed in forward or reverse mode.
DistObject(const Teuchos::RCP< const map_type > &map)
Constructor.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
virtual std::string description() const
One-line description of this object.
bool transferArrived() const
Whether the data from an import/export operation has arrived, and is ready for the unpack and combine...
virtual size_t constantNumberOfPackets() const
Whether the implementation's instance promises always to have a constant number of packets per LID (l...
virtual bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix, const bool remoteLIDsContiguous=false, const CombineMode CM=INSERT)
Reallocate imports_ if needed.
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
Stand-alone utility functions and macros.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print a description of this object to the given output stream.
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
Base class for distributed Tpetra objects that support data redistribution.
std::unique_ptr< std::string > createPrefix(const int myRank, const char prefix[])
Create string prefix for each line of verbose output.
Definition: Tpetra_Util.cpp:69
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap)
Remove processes which contain no entries in this object's Map.
Description of Tpetra's behavior.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.