Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Tpetra_DistObject_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DISTOBJECT_DEF_HPP
43 #define TPETRA_DISTOBJECT_DEF_HPP
44 
52 
53 #include "Tpetra_Distributor.hpp"
57 #include <typeinfo>
58 #include <memory>
59 #include <sstream>
60 
61 namespace Tpetra {
62 
63  namespace { // (anonymous)
64  template<class DeviceType, class IndexType = size_t>
65  struct SumFunctor {
66  SumFunctor (const Kokkos::View<const size_t*, DeviceType>& viewToSum) :
67  viewToSum_ (viewToSum) {}
68  KOKKOS_INLINE_FUNCTION void operator() (const IndexType i, size_t& lclSum) const {
69  lclSum += viewToSum_(i);
70  }
71  Kokkos::View<const size_t*, DeviceType> viewToSum_;
72  };
73 
74  template<class DeviceType, class IndexType = size_t>
75  size_t
76  countTotalImportPackets (const Kokkos::View<const size_t*, DeviceType>& numImportPacketsPerLID)
77  {
78  using Kokkos::parallel_reduce;
79  typedef DeviceType DT;
80  typedef typename DT::execution_space DES;
81  typedef Kokkos::RangePolicy<DES, IndexType> range_type;
82 
83  const IndexType numOut = numImportPacketsPerLID.extent (0);
84  size_t totalImportPackets = 0;
85  parallel_reduce ("Count import packets",
86  range_type (0, numOut),
87  SumFunctor<DeviceType, IndexType> (numImportPacketsPerLID),
88  totalImportPackets);
89  return totalImportPackets;
90  }
91  } // namespace (anonymous)
92 
93 
94  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
96  DistObject (const Teuchos::RCP<const map_type>& map) :
97  map_ (map)
98  {
99 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
100  using Teuchos::RCP;
101  using Teuchos::Time;
102  using Teuchos::TimeMonitor;
103 
104  RCP<Time> doXferTimer =
105  TimeMonitor::lookupCounter ("Tpetra::DistObject::doTransfer");
106  if (doXferTimer.is_null ()) {
107  doXferTimer =
108  TimeMonitor::getNewCounter ("Tpetra::DistObject::doTransfer");
109  }
110  doXferTimer_ = doXferTimer;
111 
112  RCP<Time> copyAndPermuteTimer =
113  TimeMonitor::lookupCounter ("Tpetra::DistObject::copyAndPermute");
114  if (copyAndPermuteTimer.is_null ()) {
115  copyAndPermuteTimer =
116  TimeMonitor::getNewCounter ("Tpetra::DistObject::copyAndPermute");
117  }
118  copyAndPermuteTimer_ = copyAndPermuteTimer;
119 
120  RCP<Time> packAndPrepareTimer =
121  TimeMonitor::lookupCounter ("Tpetra::DistObject::packAndPrepare");
122  if (packAndPrepareTimer.is_null ()) {
123  packAndPrepareTimer =
124  TimeMonitor::getNewCounter ("Tpetra::DistObject::packAndPrepare");
125  }
126  packAndPrepareTimer_ = packAndPrepareTimer;
127 
128  RCP<Time> doPostsAndWaitsTimer =
129  TimeMonitor::lookupCounter ("Tpetra::DistObject::doPostsAndWaits");
130  if (doPostsAndWaitsTimer.is_null ()) {
131  doPostsAndWaitsTimer =
132  TimeMonitor::getNewCounter ("Tpetra::DistObject::doPostsAndWaits");
133  }
134  doPostsAndWaitsTimer_ = doPostsAndWaitsTimer;
135 
136  RCP<Time> unpackAndCombineTimer =
137  TimeMonitor::lookupCounter ("Tpetra::DistObject::unpackAndCombine");
138  if (unpackAndCombineTimer.is_null ()) {
139  unpackAndCombineTimer =
140  TimeMonitor::getNewCounter ("Tpetra::DistObject::unpackAndCombine");
141  }
142  unpackAndCombineTimer_ = unpackAndCombineTimer;
143 #endif // HAVE_TPETRA_TRANSFER_TIMERS
144  }
145 
146  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
147  std::string
149  description () const
150  {
151  using Teuchos::TypeNameTraits;
152 
153  std::ostringstream os;
154  os << "\"Tpetra::DistObject\": {"
155  << "Packet: " << TypeNameTraits<packet_type>::name ()
156  << ", LocalOrdinal: " << TypeNameTraits<local_ordinal_type>::name ()
157  << ", GlobalOrdinal: " << TypeNameTraits<global_ordinal_type>::name ()
158  << ", Node: " << TypeNameTraits<Node>::name ();
159  if (this->getObjectLabel () != "") {
160  os << "Label: \"" << this->getObjectLabel () << "\"";
161  }
162  os << "}";
163  return os.str ();
164  }
165 
166  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
167  void
169  describe (Teuchos::FancyOStream &out,
170  const Teuchos::EVerbosityLevel verbLevel) const
171  {
172  using Teuchos::rcpFromRef;
173  using Teuchos::TypeNameTraits;
174  using std::endl;
175  const Teuchos::EVerbosityLevel vl = (verbLevel == Teuchos::VERB_DEFAULT) ?
176  Teuchos::VERB_LOW : verbLevel;
177  Teuchos::RCP<const Teuchos::Comm<int> > comm = this->getMap ()->getComm ();
178  const int myRank = comm.is_null () ? 0 : comm->getRank ();
179  const int numProcs = comm.is_null () ? 1 : comm->getSize ();
180 
181  if (vl != Teuchos::VERB_NONE) {
182  Teuchos::OSTab tab0 (out);
183  if (myRank == 0) {
184  out << "\"Tpetra::DistObject\":" << endl;
185  }
186  Teuchos::OSTab tab1 (out);
187  if (myRank == 0) {
188  out << "Template parameters:" << endl;
189  {
190  Teuchos::OSTab tab2 (out);
191  out << "Packet: " << TypeNameTraits<packet_type>::name () << endl
192  << "LocalOrdinal: " << TypeNameTraits<local_ordinal_type>::name () << endl
193  << "GlobalOrdinal: " << TypeNameTraits<global_ordinal_type>::name () << endl
194  << "Node: " << TypeNameTraits<node_type>::name () << endl;
195  }
196  if (this->getObjectLabel () != "") {
197  out << "Label: \"" << this->getObjectLabel () << "\"" << endl;
198  }
199  } // if myRank == 0
200 
201  // Describe the Map.
202  {
203  if (myRank == 0) {
204  out << "Map:" << endl;
205  }
206  Teuchos::OSTab tab2 (out);
207  map_->describe (out, vl);
208  }
209 
210  // At verbosity > VERB_LOW, each process prints something.
211  if (vl > Teuchos::VERB_LOW) {
212  for (int p = 0; p < numProcs; ++p) {
213  if (myRank == p) {
214  out << "Process " << myRank << ":" << endl;
215  Teuchos::OSTab tab2 (out);
216  out << "Export buffer size (in packets): "
217  << exports_.extent (0)
218  << endl
219  << "Import buffer size (in packets): "
220  << imports_.extent (0)
221  << endl;
222  }
223  if (! comm.is_null ()) {
224  comm->barrier (); // give output time to finish
225  comm->barrier ();
226  comm->barrier ();
227  }
228  } // for each process rank p
229  } // if vl > VERB_LOW
230  } // if vl != VERB_NONE
231  }
232 
233  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
234  void
236  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& /* newMap */)
237  {
238  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
239  "Tpetra::DistObject::removeEmptyProcessesInPlace: Not implemented");
240  }
241 
242  /* These are provided in base DistObject template
243  template<class DistObjectType>
244  void
245  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input,
246  const Teuchos::RCP<const Map<typename DistObjectType::local_ordinal_type,
247  typename DistObjectType::global_ordinal_type,
248  typename DistObjectType::node_type> >& newMap)
249  {
250  input->removeEmptyProcessesInPlace (newMap);
251  if (newMap.is_null ()) { // my process is excluded
252  input = Teuchos::null;
253  }
254  }
255 
256  template<class DistObjectType>
257  void
258  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input)
259  {
260  using Teuchos::RCP;
261  typedef typename DistObjectType::local_ordinal_type LO;
262  typedef typename DistObjectType::global_ordinal_type GO;
263  typedef typename DistObjectType::node_type NT;
264  typedef Map<LO, GO, NT> map_type;
265 
266  RCP<const map_type> newMap = input->getMap ()->removeEmptyProcesses ();
267  removeEmptyProcessesInPlace<DistObjectType> (input, newMap);
268  }
269  */
270 
271  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
272  void
274  doImport (const SrcDistObject& source,
276  const CombineMode CM,
277  const bool restrictedMode)
278  {
279  using std::endl;
280  const char modeString[] = "doImport (forward mode)";
281 
282  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
283  // output to std::cerr on every MPI process. This is unwise for
284  // runs with large numbers of MPI processes.
285  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
286  std::unique_ptr<std::string> prefix;
287  if (verbose) {
288  int myRank = 0;
289  auto map = this->getMap ();
290  if (! map.is_null ()) {
291  auto comm = map->getComm ();
292  if (! comm.is_null ()) {
293  myRank = comm->getRank ();
294  }
295  }
296  prefix = [myRank] () {
297  std::ostringstream os;
298  os << "Proc " << myRank << ": Tpetra::DistObject::doTransfer: ";
299  return std::unique_ptr<std::string> (new std::string (os.str ()));
300  } ();
301  std::ostringstream os;
302  os << *prefix << "Start" << endl;
303  std::cerr << os.str ();
304  }
305  this->doTransfer (source, importer, modeString, DoForward, CM, restrictedMode);
306  if (verbose) {
307  std::ostringstream os;
308  os << *prefix << "Done!" << endl;
309  std::cerr << os.str ();
310  }
311  }
312 
313  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
314  void
316  doExport (const SrcDistObject& source,
318  const CombineMode CM,
319  const bool restrictedMode)
320  {
321  using std::endl;
322  const char modeString[] = "doExport (forward mode)";
323 
324  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
325  // output to std::cerr on every MPI process. This is unwise for
326  // runs with large numbers of MPI processes.
327  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
328  std::unique_ptr<std::string> prefix;
329  if (verbose) {
330  int myRank = 0;
331  auto map = this->getMap ();
332  if (! map.is_null ()) {
333  auto comm = map->getComm ();
334  if (! comm.is_null ()) {
335  myRank = comm->getRank ();
336  }
337  }
338  prefix = [myRank] () {
339  std::ostringstream os;
340  os << "(Proc " << myRank << ") ";
341  return std::unique_ptr<std::string> (new std::string (os.str ()));
342  } ();
343  std::ostringstream os;
344  os << *prefix << "Tpetra::DistObject::" << modeString << ":" << endl;
345  std::cerr << os.str ();
346  }
347  this->doTransfer (source, exporter, modeString, DoForward, CM, restrictedMode);
348  if (verbose) {
349  std::ostringstream os;
350  os << *prefix << "Tpetra::DistObject::" << modeString << ": Done!"
351  << endl;
352  std::cerr << os.str ();
353  }
354  }
355 
356  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
357  void
359  doImport (const SrcDistObject& source,
361  const CombineMode CM,
362  const bool restrictedMode)
363  {
364  using std::endl;
365  const char modeString[] = "doImport (reverse mode)";
366 
367  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
368  // output to std::cerr on every MPI process. This is unwise for
369  // runs with large numbers of MPI processes.
370  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
371  std::unique_ptr<std::string> prefix;
372  if (verbose) {
373  int myRank = 0;
374  auto map = this->getMap ();
375  if (! map.is_null ()) {
376  auto comm = map->getComm ();
377  if (! comm.is_null ()) {
378  myRank = comm->getRank ();
379  }
380  }
381  prefix = [myRank] () {
382  std::ostringstream os;
383  os << "(Proc " << myRank << ") ";
384  return std::unique_ptr<std::string> (new std::string (os.str ()));
385  } ();
386  std::ostringstream os;
387  os << *prefix << "Tpetra::DistObject::" << modeString << ":" << endl;
388  std::cerr << os.str ();
389  }
390  this->doTransfer (source, exporter, modeString, DoReverse, CM, restrictedMode);
391  if (verbose) {
392  std::ostringstream os;
393  os << *prefix << "Tpetra::DistObject::" << modeString << ": Done!"
394  << endl;
395  std::cerr << os.str ();
396  }
397  }
398 
399  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
400  void
402  doExport (const SrcDistObject& source,
404  const CombineMode CM,
405  const bool restrictedMode)
406  {
407  using std::endl;
408  const char modeString[] = "doExport (reverse mode)";
409 
410  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
411  // output to std::cerr on every MPI process. This is unwise for
412  // runs with large numbers of MPI processes.
413  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
414  std::unique_ptr<std::string> prefix;
415  if (verbose) {
416  int myRank = 0;
417  auto map = this->getMap ();
418  if (! map.is_null ()) {
419  auto comm = map->getComm ();
420  if (! comm.is_null ()) {
421  myRank = comm->getRank ();
422  }
423  }
424  prefix = [myRank] () {
425  std::ostringstream os;
426  os << "(Proc " << myRank << ") ";
427  return std::unique_ptr<std::string> (new std::string (os.str ()));
428  } ();
429  std::ostringstream os;
430  os << *prefix << "Tpetra::DistObject::" << modeString << ":" << endl;
431  std::cerr << os.str ();
432  }
433  this->doTransfer (source, importer, modeString, DoReverse, CM, restrictedMode);
434  if (verbose) {
435  std::ostringstream os;
436  os << *prefix << "Tpetra::DistObject::" << modeString << ": Done!"
437  << endl;
438  std::cerr << os.str ();
439  }
440  }
441 
442  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
443  bool
445  isDistributed () const {
446  return map_->isDistributed ();
447  }
448 
449  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
450  size_t
453  return 0; // default implementation; subclasses may override
454  }
455 
456  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
457  void
460  const ::Tpetra::Details::Transfer<local_ordinal_type, global_ordinal_type, node_type>& transfer,
461  const char modeString[],
462  const ReverseOption revOp,
463  const CombineMode CM,
464  bool restrictedMode)
465  {
467  using ::Tpetra::Details::ProfilingRegion;
468  using std::endl;
469 
470  ProfilingRegion region_doTransfer ("Tpetra::DistObject::doTransfer");
471  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
472  std::unique_ptr<std::string> prefix;
473  if (verbose) {
474  auto map = this->getMap ();
475  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
476  const int myRank = comm.is_null () ? -1 : comm->getRank ();
477  std::ostringstream os;
478  os << "Proc " << myRank << ": Tpetra::DistObject::doTransfer: ";
479  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
480  os << *prefix << "Source type: " << typeid (src).name ()
481  << ", Target type: " << typeid (*this).name () << endl;
482  std::cerr << os.str ();
483  }
484 
485  // "Restricted Mode" does two things:
486  // 1) Skips copyAndPermute
487  // 2) Allows the "target" Map of the transfer to be a subset of
488  // the Map of *this, in a "locallyFitted" sense.
489  //
490  // This cannot be used if #2 is not true, OR there are permutes.
491  // Source Maps still need to match
492 
493  // mfh 18 Oct 2017: Set TPETRA_DEBUG to true to enable extra debug
494  // checks. These may communicate more.
495  const bool debug = ::Tpetra::Details::Behavior::debug ();
496  if (debug) {
497  if (!restrictedMode && revOp == DoForward) {
498  const bool myMapSameAsTransferTgtMap =
499  this->getMap ()->isSameAs (* (transfer.getTargetMap ()));
500  TEUCHOS_TEST_FOR_EXCEPTION
501  (! myMapSameAsTransferTgtMap, std::invalid_argument,
502  "Tpetra::DistObject::" << modeString << ": For forward-mode "
503  "communication, the target DistObject's Map must be the same "
504  "(in the sense of Tpetra::Map::isSameAs) as the input "
505  "Export/Import object's target Map.");
506  }
507  else if (!restrictedMode && revOp == DoReverse) {
508  const bool myMapSameAsTransferSrcMap =
509  this->getMap ()->isSameAs (* (transfer.getSourceMap ()));
510  TEUCHOS_TEST_FOR_EXCEPTION
511  (! myMapSameAsTransferSrcMap, std::invalid_argument,
512  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
513  "communication, the target DistObject's Map must be the same "
514  "(in the sense of Tpetra::Map::isSameAs) as the input "
515  "Export/Import object's source Map.");
516  }
517  else if (restrictedMode && revOp == DoForward) {
518  const bool myMapLocallyFittedTransferTgtMap =
519  this->getMap ()->isLocallyFitted (* (transfer.getTargetMap ()));
520  TEUCHOS_TEST_FOR_EXCEPTION
521  (! myMapLocallyFittedTransferTgtMap , std::invalid_argument,
522  "Tpetra::DistObject::" << modeString << ": For forward-mode "
523  "communication using restricted mode, Export/Import object's "
524  "target Map must be locally fitted (in the sense of "
525  "Tpetra::Map::isLocallyFitted) to target DistObject's Map.");
526  }
527  else { // if (restrictedMode && revOp == DoReverse) {
528  const bool myMapLocallyFittedTransferSrcMap =
529  this->getMap ()->isLocallyFitted (* (transfer.getSourceMap ()));
530  TEUCHOS_TEST_FOR_EXCEPTION
531  (! myMapLocallyFittedTransferSrcMap, std::invalid_argument,
532  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
533  "communication using restricted mode, Export/Import object's "
534  "source Map must be locally fitted (in the sense of "
535  "Tpetra::Map::isLocallyFitted) to target DistObject's Map.");
536  }
537 
538  // SrcDistObject need not even _have_ Maps. However, if the
539  // source object is a DistObject, it has a Map, and we may
540  // compare that Map with the Transfer's Maps.
541  const this_type* srcDistObj = dynamic_cast<const this_type*> (&src);
542  if (srcDistObj != nullptr) {
543  if (revOp == DoForward) {
544  const bool srcMapSameAsImportSrcMap =
545  srcDistObj->getMap ()->isSameAs (* (transfer.getSourceMap ()));
546  TEUCHOS_TEST_FOR_EXCEPTION
547  (! srcMapSameAsImportSrcMap, std::invalid_argument,
548  "Tpetra::DistObject::" << modeString << ": For forward-mode "
549  "communication, the source DistObject's Map must be the same "
550  "as the input Export/Import object's source Map.");
551  }
552  else { // revOp == DoReverse
553  const bool srcMapSameAsImportTgtMap =
554  srcDistObj->getMap ()->isSameAs (* (transfer.getTargetMap ()));
555  TEUCHOS_TEST_FOR_EXCEPTION
556  (! srcMapSameAsImportTgtMap, std::invalid_argument,
557  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
558  "communication, the source DistObject's Map must be the same "
559  "as the input Export/Import object's target Map.");
560  }
561  }
562  }
563 
564  const size_t numSameIDs = transfer.getNumSameIDs ();
565  Distributor& distor = transfer.getDistributor ();
566 
567  TEUCHOS_TEST_FOR_EXCEPTION
568  (debug && restrictedMode &&
569  (transfer.getPermuteToLIDs_dv().extent(0) != 0 ||
570  transfer.getPermuteFromLIDs_dv().extent(0) != 0),
571  std::invalid_argument,
572  "Tpetra::DistObject::" << modeString << ": Transfer object "
573  "cannot have permutes in restricted mode.");
574 
575 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
576  const bool useTheNewInterface = this->useNewInterface ();
577 #else
578  const bool useTheNewInterface = true;
579 #endif // TPETRA_ENABLE_DEPRECATED_CODE
580 
581  if (useTheNewInterface) {
582  using ::Tpetra::Details::Behavior;
583  // Do we need all communication buffers to live on host?
584  const bool commOnHost = ! Behavior::assumeMpiIsCudaAware ();
585  if (verbose) {
586  std::ostringstream os;
587  os << *prefix << "doTransfer: Use new interface; "
588  "commOnHost=" << (commOnHost ? "true" : "false") << endl;
589  std::cerr << os.str ();
590  }
591 
592  auto permToLIDs = (revOp == DoForward) ?
593  transfer.getPermuteToLIDs_dv () :
594  transfer.getPermuteFromLIDs_dv ();
595  auto permFromLIDs = (revOp == DoForward) ?
596  transfer.getPermuteFromLIDs_dv () :
597  transfer.getPermuteToLIDs_dv ();
598  auto remoteLIDs = (revOp == DoForward) ?
599  transfer.getRemoteLIDs_dv () :
600  transfer.getExportLIDs_dv ();
601  auto exportLIDs = (revOp == DoForward) ?
602  transfer.getExportLIDs_dv () :
603  transfer.getRemoteLIDs_dv ();
604  doTransferNew (src, CM, numSameIDs, permToLIDs, permFromLIDs,
605  remoteLIDs, exportLIDs, distor, revOp, commOnHost,restrictedMode);
606  }
607 
608 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
609  if (! useTheNewInterface) {
610  if (verbose) {
611  std::ostringstream os;
612  os << *prefix << "doTransfer: Use old interface" << endl;
613  std::cerr << os.str ();
614  }
615  const auto permToLIDs = (revOp == DoForward) ?
616  transfer.getPermuteToLIDs () : transfer.getPermuteFromLIDs ();
617  const auto permFromLIDs = (revOp == DoForward) ?
618  transfer.getPermuteFromLIDs () : transfer.getPermuteToLIDs ();
619  const auto exportLIDs = (revOp == DoForward) ?
620  transfer.getExportLIDs () : transfer.getRemoteLIDs ();
621  const auto remoteLIDs = (revOp == DoForward) ?
622  transfer.getRemoteLIDs () : transfer.getExportLIDs ();
623  doTransferOld (src, CM, numSameIDs, permToLIDs, permFromLIDs,
624  remoteLIDs, exportLIDs, distor, revOp, restrictedMode);
625  }
626 #endif // TPETRA_ENABLE_DEPRECATED_CODE
627 
628  if (verbose) {
629  std::ostringstream os;
630  os << *prefix << "Tpetra::DistObject::doTransfer: Done!" << endl;
631  std::cerr << os.str ();
632  }
633  }
634 
635  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
636  bool
638  reallocImportsIfNeeded (const size_t newSize,
639  const bool verbose,
640  const std::string* prefix)
641  {
642  if (verbose) {
643  std::ostringstream os;
644  os << *prefix << "Realloc (if needed) imports_ from "
645  << imports_.extent (0) << " to " << newSize << std::endl;
646  std::cerr << os.str ();
647  }
649  const bool reallocated =
650  reallocDualViewIfNeeded (this->imports_, newSize, "imports");
651  if (verbose) {
652  std::ostringstream os;
653  os << *prefix << "Finished realloc'ing imports_" << std::endl;
654  std::cerr << os.str ();
655  }
656  return reallocated;
657  }
658 
659  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
660  bool
662  reallocArraysForNumPacketsPerLid (const size_t numExportLIDs,
663  const size_t numImportLIDs)
664  {
667  using std::endl;
668  // If an array is already allocated, and if is at least
669  // tooBigFactor times bigger than it needs to be, free it and
670  // reallocate to the size we need, in order to save space.
671  // Otherwise, take subviews to reduce allocation size.
672  constexpr size_t tooBigFactor = 10;
673 
674  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
675  std::unique_ptr<std::string> prefix;
676  if (verbose) {
677  const int myRank = [&] () {
678  auto map = this->getMap ();
679  if (map.get () == nullptr) {
680  return -1;
681  }
682  auto comm = map->getComm ();
683  if (comm.get () == nullptr) {
684  return -2;
685  }
686  return comm->getRank ();
687  } ();
688  std::ostringstream os;
689  os << "Proc " << myRank << ": reallocArraysForNumPacketsPerLid("
690  << numExportLIDs << ", " << numImportLIDs << "): ";
691  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
692  }
693 
694  if (verbose) {
695  std::ostringstream os;
696  os << *prefix << "before:" << endl
697  << *prefix << dualViewStatusToString (this->numExportPacketsPerLID_,
698  "numExportPacketsPerLID_")
699  << endl
700  << *prefix << dualViewStatusToString (this->numImportPacketsPerLID_,
701  "numImportPacketsPerLID_")
702  << endl;
703  std::cerr << os.str ();
704  }
705 
706  // Reallocate numExportPacketsPerLID_ if needed.
707  const bool firstReallocated =
708  reallocDualViewIfNeeded (this->numExportPacketsPerLID_,
709  numExportLIDs,
710  "numExportPacketsPerLID",
711  tooBigFactor,
712  true); // need fence before, if realloc'ing
713 
714  // If we reallocated above, then we fenced after that
715  // reallocation. This means that we don't need to fence again,
716  // before the next reallocation.
717  const bool needFenceBeforeNextAlloc = ! firstReallocated;
718  const bool secondReallocated =
719  reallocDualViewIfNeeded (this->numImportPacketsPerLID_,
720  numImportLIDs,
721  "numImportPacketsPerLID",
722  tooBigFactor,
723  needFenceBeforeNextAlloc);
724 
725  if (verbose) {
726  std::ostringstream os;
727  os << *prefix << "after:" << endl
728  << *prefix << dualViewStatusToString (this->numExportPacketsPerLID_,
729  "numExportPacketsPerLID_")
730  << endl
731  << *prefix << dualViewStatusToString (this->numImportPacketsPerLID_,
732  "numImportPacketsPerLID_")
733  << endl;
734  std::cerr << os.str ();
735  }
736 
737  return firstReallocated || secondReallocated;
738  }
739 
740  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
741  void
744  const CombineMode CM,
745  const size_t numSameIDs,
746  const Kokkos::DualView<const local_ordinal_type*,
747  buffer_device_type>& permuteToLIDs,
748  const Kokkos::DualView<const local_ordinal_type*,
749  buffer_device_type>& permuteFromLIDs,
750  const Kokkos::DualView<const local_ordinal_type*,
751  buffer_device_type>& remoteLIDs,
752  const Kokkos::DualView<const local_ordinal_type*,
753  buffer_device_type>& exportLIDs,
754  Distributor& distor,
755  const ReverseOption revOp,
756  const bool commOnHost,
757  const bool restrictedMode)
758  {
761  using ::Tpetra::Details::ProfilingRegion;
762  using Kokkos::Compat::getArrayView;
763  using Kokkos::Compat::getConstArrayView;
764  using Kokkos::Compat::getKokkosViewDeepCopy;
765  using Kokkos::Compat::create_const_view;
766  using std::endl;
767  using DT = device_type;
768  using DES = typename DT::execution_space;
769 
770  ProfilingRegion region_dTN ("Tpetra::DistObject::doTransferNew");
771 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
772  // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in favor
773  // of Kokkos profiling.
774  Teuchos::TimeMonitor doXferMon (*doXferTimer_);
775 #endif // HAVE_TPETRA_TRANSFER_TIMERS
776 
777  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
778  // Prefix for verbose output. Use a pointer, so we don't pay for
779  // string construction unless needed. We set this below.
780  std::unique_ptr<std::string> prefix;
781  if (verbose) {
782  auto map = this->getMap ();
783  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
784  const int myRank = comm.is_null () ? 0 : comm->getRank ();
785  std::ostringstream os;
786  os << "Proc " << myRank << ": Tpetra::CrsMatrix::doTransferNew: ";
787  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
788  }
789 
790  if (verbose) {
791  std::ostringstream os;
792  os << *prefix << "Input arguments:" << endl
793  << *prefix << " combineMode: " << combineModeToString (CM) << endl
794  << *prefix << " numSameIDs: " << numSameIDs << endl
795  << *prefix << " "
796  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
797  << *prefix << " "
798  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl
799  << *prefix << " "
800  << dualViewStatusToString (remoteLIDs, "remoteLIDs") << endl
801  << *prefix << " "
802  << dualViewStatusToString (exportLIDs, "exportLIDs") << endl
803  << *prefix << " revOp: Do" << (revOp == DoReverse ? "Reverse" : "Forward") << endl
804  << *prefix << " commOnHost: " << (commOnHost ? "true" : "false") << endl;
805  std::cerr << os.str ();
806  }
807 
808  {
809  ProfilingRegion region_cs ("Tpetra::DistObject::doTransferNew::checkSizes");
810  if (verbose) {
811  std::ostringstream os;
812  os << *prefix << "1. checkSizes" << endl;
813  std::cerr << os.str ();
814  }
815  const bool checkSizesResult = this->checkSizes (src);
816  TEUCHOS_TEST_FOR_EXCEPTION
817  (! checkSizesResult, std::invalid_argument,
818  "Tpetra::DistObject::doTransfer: checkSizes() indicates that the "
819  "destination object is not a legal target for redistribution from the "
820  "source object. This probably means that they do not have the same "
821  "dimensions. For example, MultiVectors must have the same number of "
822  "rows and columns.");
823  }
824 
825  // NOTE (mfh 26 Apr 2016) Chris Baker's implementation understood
826  // that if CM == INSERT || CM == REPLACE, the target object could
827  // be write only. We don't optimize for that here.
828 
829  if (!restrictedMode && numSameIDs + permuteToLIDs.extent (0) != 0) {
830  // There is at least one GID to copy or permute.
831  if (verbose) {
832  std::ostringstream os;
833  os << *prefix << "2. copyAndPermute" << endl;
834  std::cerr << os.str ();
835  }
836  ProfilingRegion region_cp
837  ("Tpetra::DistObject::doTransferNew::copyAndPermute");
838 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
839  // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in favor
840  // of Kokkos profiling.
841  Teuchos::TimeMonitor copyAndPermuteMon (*copyAndPermuteTimer_);
842 #endif // HAVE_TPETRA_TRANSFER_TIMERS
843 
844  if (numSameIDs + permuteToLIDs.extent (0) != 0) {
845  // There is at least one GID to copy or permute.
846  if (verbose) {
847  std::ostringstream os;
848  os << *prefix << "2. copyAndPermute" << endl;
849  std::cerr << os.str ();
850  }
851 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
852  this->copyAndPermuteNew (src, numSameIDs, permuteToLIDs,
853  permuteFromLIDs);
854 #else // TPETRA_ENABLE_DEPRECATED_CODE
855  this->copyAndPermute (src, numSameIDs, permuteToLIDs,
856  permuteFromLIDs);
857 #endif // TPETRA_ENABLE_DEPRECATED_CODE
858  if (verbose) {
859  std::ostringstream os;
860  os << *prefix << "After copyAndPermute:" << endl
861  << *prefix << " "
862  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
863  << endl
864  << *prefix << " "
865  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
866  << endl;
867  std::cerr << os.str ();
868  }
869  }
870  }
871 
872  // The method may return zero even if the implementation actually
873  // does have a constant number of packets per LID. However, if it
874  // returns nonzero, we may use this information to avoid
875  // (re)allocating num{Ex,Im}portPacketsPerLID_. packAndPrepare()
876  // will set this to its final value.
877  //
878  // We only need this if CM != ZERO, but it has to be lifted out of
879  // that scope because there are multiple tests for CM != ZERO.
880  size_t constantNumPackets = this->constantNumberOfPackets ();
881  if (verbose) {
882  std::ostringstream os;
883  os << *prefix << "constantNumPackets=" << constantNumPackets << endl;
884  std::cerr << os.str ();
885  }
886 
887  // We only need to pack communication buffers if the combine mode
888  // is not ZERO. A "ZERO combine mode" means that the results are
889  // the same as if we had received all zeros, and added them to the
890  // existing values. That means we don't need to communicate.
891  if (CM != ZERO) {
892  if (constantNumPackets == 0) {
893  if (verbose) {
894  std::ostringstream os;
895  os << *prefix << "3. (Re)allocate num{Ex,Im}portPacketsPerLID"
896  << endl;
897  std::cerr << os.str ();
898  }
899  // This only reallocates if necessary, that is, if the sizes
900  // don't match.
901  this->reallocArraysForNumPacketsPerLid (exportLIDs.extent (0),
902  remoteLIDs.extent (0));
903  }
904 
905  if (verbose) {
906  std::ostringstream os;
907  os << *prefix << "4. packAndPrepare: before, "
908  << dualViewStatusToString (this->exports_, "exports_")
909  << endl;
910  std::cerr << os.str ();
911  }
912  {
913  ProfilingRegion region_pp
914  ("Tpetra::DistObject::doTransferNew::packAndPrepare");
915 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
916  // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in
917  // favor of Kokkos profiling.
918  Teuchos::TimeMonitor packAndPrepareMon (*packAndPrepareTimer_);
919 #endif // HAVE_TPETRA_TRANSFER_TIMERS
920 
921  // Ask the source to pack data. Also ask it whether there are
922  // a constant number of packets per element
923  // (constantNumPackets is an output argument). If there are,
924  // constantNumPackets will come back nonzero. Otherwise, the
925  // source will fill the numExportPacketsPerLID_ array.
926 
927  // FIXME (mfh 18 Oct 2017) if (! commOnHost), sync to device?
928  // Alternately, make packAndPrepare take a "commOnHost"
929  // argument to tell it where to leave the data?
930  //
931  // NOTE (mfh 04 Feb 2019) Subclasses of DistObject should have
932  // the freedom to pack and unpack either on host or device.
933  // We should prefer sync'ing only on demand. Thus, we can
934  // answer the above question: packAndPrepare should not
935  // take a commOnHost argument, and doTransferNew should sync
936  // where needed, if needed.
937 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
938  this->packAndPrepareNew (src, exportLIDs, this->exports_,
939  this->numExportPacketsPerLID_,
940  constantNumPackets, distor);
941 #else // TPETRA_ENABLE_DEPRECATED_CODE
942  this->packAndPrepare (src, exportLIDs, this->exports_,
943  this->numExportPacketsPerLID_,
944  constantNumPackets, distor);
945 #endif // TPETRA_ENABLE_DEPRECATED_CODE
946  if (commOnHost) {
947  if (this->exports_.need_sync_host ()) {
948  this->exports_.sync_host ();
949  }
950  }
951  else { // ! commOnHost
952  if (this->exports_.need_sync_device ()) {
953  this->exports_.sync_device ();
954  }
955  }
956  }
957  if (verbose) {
958  std::ostringstream os;
959  os << *prefix << "5.1. After packAndPrepare, "
960  << dualViewStatusToString (this->exports_, "exports_")
961  << endl;
962  std::cerr << os.str ();
963  }
964  } // if (CM != ZERO)
965 
966  // We only need to send data if the combine mode is not ZERO.
967  if (CM != ZERO) {
968  if (constantNumPackets != 0) {
969  // There are a constant number of packets per element. We
970  // already know (from the number of "remote" (incoming)
971  // elements) how many incoming elements we expect, so we can
972  // resize the buffer accordingly.
973  const size_t rbufLen = remoteLIDs.extent (0) * constantNumPackets;
974  reallocImportsIfNeeded (rbufLen, verbose, prefix.get ());
975  }
976 
977  // Do we need to do communication (via doPostsAndWaits)?
978  bool needCommunication = true;
979 
980  // This may be NULL. It will be used below.
981  const this_type* srcDistObj = dynamic_cast<const this_type*> (&src);
982 
983  if (revOp == DoReverse && ! this->isDistributed ()) {
984  needCommunication = false;
985  }
986  // FIXME (mfh 30 Jun 2013): Checking whether the source object
987  // is distributed requires a cast to DistObject. If it's not a
988  // DistObject, then I'm not quite sure what to do. Perhaps it
989  // would be more appropriate for SrcDistObject to have an
990  // isDistributed() method. For now, I'll just assume that we
991  // need to do communication unless the cast succeeds and the
992  // source is not distributed.
993  else if (revOp == DoForward && srcDistObj != NULL &&
994  ! srcDistObj->isDistributed ()) {
995  needCommunication = false;
996  }
997 
998  if (! needCommunication) {
999  if (verbose) {
1000  std::ostringstream os;
1001  os << *prefix << "Comm not needed; skipping" << endl;
1002  std::cerr << os.str ();
1003  }
1004  }
1005  else {
1006  ProfilingRegion region_dpw
1007  ("Tpetra::DistObject::doTransferNew::doPostsAndWaits");
1008 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1009  // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in
1010  // favor of Kokkos profiling.
1011  Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_);
1012 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1013 
1014  if (verbose) {
1015  std::ostringstream os;
1016  os << *prefix << "7.0. "
1017  << (revOp == DoReverse ? "Reverse" : "Forward")
1018  << " mode" << endl;
1019  std::cerr << os.str ();
1020  }
1021 
1022  if (constantNumPackets == 0) { // variable num packets per LID
1023  if (verbose) {
1024  std::ostringstream os;
1025  os << *prefix << "7.1. Variable # packets / LID: first comm "
1026  << "(commOnHost = " << (commOnHost ? "true" : "false") << ")"
1027  << endl;
1028  std::cerr << os.str ();
1029  }
1030  size_t totalImportPackets = 0;
1031  if (commOnHost) {
1032  if (this->numExportPacketsPerLID_.need_sync_host ()) {
1033  this->numExportPacketsPerLID_.sync_host ();
1034  }
1035  if (this->numImportPacketsPerLID_.need_sync_host ()) {
1036  this->numImportPacketsPerLID_.sync_host ();
1037  }
1038  this->numImportPacketsPerLID_.modify_host (); // out arg
1039  auto numExp_h =
1040  create_const_view (this->numExportPacketsPerLID_.view_host ());
1041  auto numImp_h = this->numImportPacketsPerLID_.view_host ();
1042 
1043  // MPI communication happens here.
1044  if (verbose) {
1045  std::ostringstream os;
1046  os << *prefix << "Call do"
1047  << (revOp == DoReverse ? "Reverse" : "") << "PostsAndWaits"
1048  << endl;
1049  std::cerr << os.str ();
1050  }
1051  if (revOp == DoReverse) {
1052  distor.doReversePostsAndWaits (numExp_h, 1, numImp_h);
1053  }
1054  else {
1055  distor.doPostsAndWaits (numExp_h, 1, numImp_h);
1056  }
1057  DES::fence (); // just in case UVM doesn't behave right
1058 
1059  if (verbose) {
1060  std::ostringstream os;
1061  os << *prefix << "Count totalImportPackets" << std::endl;
1062  std::cerr << os.str ();
1063  }
1064  using the_dev_type = typename decltype (numImp_h)::device_type;
1065  totalImportPackets = countTotalImportPackets<the_dev_type> (numImp_h);
1066  }
1067  else { // ! commOnHost
1068  if (this->numExportPacketsPerLID_.need_sync_device ()) {
1069  this->numExportPacketsPerLID_.sync_device ();
1070  }
1071  if (this->numImportPacketsPerLID_.need_sync_device ()) {
1072  this->numImportPacketsPerLID_.sync_device ();
1073  }
1074  this->numImportPacketsPerLID_.modify_device (); // out arg
1075  auto numExp_d = create_const_view
1076  (this->numExportPacketsPerLID_.view_device ());
1077  auto numImp_d = this->numImportPacketsPerLID_.view_device ();
1078 
1079  // MPI communication happens here.
1080  if (verbose) {
1081  std::ostringstream os;
1082  os << *prefix << "Call do"
1083  << (revOp == DoReverse ? "Reverse" : "") << "PostsAndWaits"
1084  << endl;
1085  std::cerr << os.str ();
1086  }
1087  if (revOp == DoReverse) {
1088  distor.doReversePostsAndWaits (numExp_d, 1, numImp_d);
1089  }
1090  else {
1091  distor.doPostsAndWaits (numExp_d, 1, numImp_d);
1092  }
1093  DES::fence (); // just in case UVM doesn't behave right
1094 
1095  if (verbose) {
1096  std::ostringstream os;
1097  os << *prefix << "Count totalImportPackets" << std::endl;
1098  std::cerr << os.str ();
1099  }
1100  using the_dev_type = typename decltype (numImp_d)::device_type;
1101  totalImportPackets = countTotalImportPackets<the_dev_type> (numImp_d);
1102  }
1103 
1104  if (verbose) {
1105  std::ostringstream os;
1106  os << *prefix << "totalImportPackets=" << totalImportPackets << endl;
1107  std::cerr << os.str ();
1108  }
1109  this->reallocImportsIfNeeded (totalImportPackets, verbose,
1110  prefix.get ());
1111  if (verbose) {
1112  std::ostringstream os;
1113  os << *prefix << "7.3. Second comm" << std::endl;
1114  std::cerr << os.str ();
1115  }
1116 
1117  // mfh 04 Feb 2019: Distributor expects the "num packets per
1118  // LID" arrays on host, so that it can issue MPI sends and
1119  // receives correctly.
1120  if (this->numExportPacketsPerLID_.need_sync_host ()) {
1121  this->numExportPacketsPerLID_.sync_host ();
1122  }
1123  if (this->numImportPacketsPerLID_.need_sync_host ()) {
1124  this->numImportPacketsPerLID_.sync_host ();
1125  }
1126 
1127  // NOTE (mfh 25 Apr 2016, 01 Aug 2017) doPostsAndWaits and
1128  // doReversePostsAndWaits currently want
1129  // numExportPacketsPerLID and numImportPacketsPerLID as
1130  // Teuchos::ArrayView, rather than as Kokkos::View.
1131  //
1132  // NOTE (mfh 04 Feb 2019) This does NOT copy from host to
1133  // device. The above syncs might.
1134  auto numExportPacketsPerLID_av =
1135  getArrayViewFromDualView (this->numExportPacketsPerLID_);
1136  auto numImportPacketsPerLID_av =
1137  getArrayViewFromDualView (this->numImportPacketsPerLID_);
1138 
1139  // imports_ is for output only, so we don't need to sync it
1140  // before marking it as modified. However, in order to
1141  // prevent spurious debug-mode errors (e.g., "modified on
1142  // both device and host"), we first need to clear its
1143  // "modified" flags.
1144  this->imports_.clear_sync_state ();
1145 
1146  if (verbose) {
1147  std::ostringstream os;
1148  os << *prefix << "Comm on "
1149  << (commOnHost ? "host" : "device")
1150  << "; call do" << (revOp == DoReverse ? "Reverse" : "")
1151  << "PostsAndWaits" << endl;
1152  std::cerr << os.str ();
1153  }
1154 
1155  if (commOnHost) {
1156  this->imports_.modify_host ();
1157  if (revOp == DoReverse) {
1158  distor.doReversePostsAndWaits
1159  (create_const_view (this->exports_.view_host ()),
1160  numExportPacketsPerLID_av,
1161  this->imports_.view_host (),
1162  numImportPacketsPerLID_av);
1163  }
1164  else {
1165  distor.doPostsAndWaits
1166  (create_const_view (this->exports_.view_host ()),
1167  numExportPacketsPerLID_av,
1168  this->imports_.view_host (),
1169  numImportPacketsPerLID_av);
1170  }
1171  }
1172  else { // pack on device
1173  this->imports_.modify_device ();
1174  if (revOp == DoReverse) {
1175  distor.doReversePostsAndWaits
1176  (create_const_view (this->exports_.view_device ()),
1177  numExportPacketsPerLID_av,
1178  this->imports_.view_device (),
1179  numImportPacketsPerLID_av);
1180  }
1181  else {
1182  distor.doPostsAndWaits
1183  (create_const_view (this->exports_.view_device ()),
1184  numExportPacketsPerLID_av,
1185  this->imports_.view_device (),
1186  numImportPacketsPerLID_av);
1187  }
1188  }
1189  }
1190  else { // constant number of packets per LID
1191  if (verbose) {
1192  std::ostringstream os;
1193  os << *prefix << "7.1. Const # packets per LID: " << endl
1194  << *prefix << " "
1195  << dualViewStatusToString (this->exports_, "exports_")
1196  << endl
1197  << *prefix << " "
1198  << dualViewStatusToString (this->exports_, "imports_")
1199  << endl;
1200  std::cerr << os.str ();
1201  }
1202  // imports_ is for output only, so we don't need to sync it
1203  // before marking it as modified. However, in order to
1204  // prevent spurious debug-mode errors (e.g., "modified on
1205  // both device and host"), we first need to clear its
1206  // "modified" flags.
1207  this->imports_.clear_sync_state ();
1208 
1209  if (verbose) {
1210  std::ostringstream os;
1211  os << *prefix << "7.2. Comm on "
1212  << (commOnHost ? "host" : "device")
1213  << "; call do" << (revOp == DoReverse ? "Reverse" : "")
1214  << "PostsAndWaits" << endl;
1215  std::cerr << os.str ();
1216  }
1217  if (commOnHost) {
1218  this->imports_.modify_host ();
1219  if (revOp == DoReverse) {
1220  distor.doReversePostsAndWaits
1221  (create_const_view (this->exports_.view_host ()),
1222  constantNumPackets,
1223  this->imports_.view_host ());
1224  }
1225  else {
1226  distor.doPostsAndWaits
1227  (create_const_view (this->exports_.view_host ()),
1228  constantNumPackets,
1229  this->imports_.view_host ());
1230  }
1231  }
1232  else { // pack on device
1233  this->imports_.modify_device ();
1234  if (revOp == DoReverse) {
1235  distor.doReversePostsAndWaits
1236  (create_const_view (this->exports_.view_device ()),
1237  constantNumPackets,
1238  this->imports_.view_device ());
1239  }
1240  else {
1241  distor.doPostsAndWaits
1242  (create_const_view (this->exports_.view_device ()),
1243  constantNumPackets,
1244  this->imports_.view_device ());
1245  }
1246  } // commOnHost
1247  } // constant or variable num packets per LID
1248 
1249  if (verbose) {
1250  std::ostringstream os;
1251  os << *prefix << "8. unpackAndCombine" << endl;
1252  std::cerr << os.str ();
1253  }
1254  ProfilingRegion region_uc
1255  ("Tpetra::DistObject::doTransferNew::unpackAndCombine");
1256 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1257  // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in
1258  // favor of Kokkos profiling.
1259  Teuchos::TimeMonitor unpackAndCombineMon (*unpackAndCombineTimer_);
1260 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1261 
1262  // NOTE (mfh 26 Apr 2016) We don't actually need to sync the
1263  // input DualViews, but they DO need to be most recently
1264  // updated in the same memory space.
1265  //
1266  // FIXME (mfh 26 Apr 2016) Check that all input DualViews
1267  // were most recently updated in the same memory space, and
1268  // sync them to the same place (based on commOnHost) if not.
1269 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1270  this->unpackAndCombineNew (remoteLIDs, this->imports_,
1271  this->numImportPacketsPerLID_,
1272  constantNumPackets, distor, CM);
1273 #else // TPETRA_ENABLE_DEPRECATED_CODE
1274  this->unpackAndCombine (remoteLIDs, this->imports_,
1275  this->numImportPacketsPerLID_,
1276  constantNumPackets, distor, CM);
1277 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1278  } // if (needCommunication)
1279  } // if (CM != ZERO)
1280 
1281  if (verbose) {
1282  std::ostringstream os;
1283  os << *prefix << "9. Done!" << endl;
1284  std::cerr << os.str ();
1285  }
1286  }
1287 
1288 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1289  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1290  void TPETRA_DEPRECATED
1292  doTransferOld (const SrcDistObject& src,
1293  CombineMode CM,
1294  size_t numSameIDs,
1295  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1296  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1297  const Teuchos::ArrayView<const LocalOrdinal>& remoteLIDs,
1298  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
1299  Distributor& distor,
1300  ReverseOption revOp,
1301  const bool restrictedMode)
1302  {
1304  using ::Tpetra::Details::ProfilingRegion;
1306  using std::endl;
1307  const char prefixRaw[] = "Tpetra::DistObject::doTransferOld: ";
1308 
1309  ProfilingRegion region_doTransferOld ("Tpetra::DistObject::doTransferOld");
1310 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1311  // FIXME (mfh 04 Feb 2019) Remove Teuchos::TimeMonitor and use
1312  // Kokkos profiling instead.
1313  Teuchos::TimeMonitor doXferMon (*doXferTimer_);
1314 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1315 
1316  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
1317  std::unique_ptr<std::string> prefix;
1318  if (verbose) {
1319  auto map = this->getMap ();
1320  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1321  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1322  std::ostringstream os;
1323  os << "Proc " << myRank << ": " << prefixRaw;
1324  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1325  }
1326 
1327  TEUCHOS_TEST_FOR_EXCEPTION(
1328  ! checkSizes (src), std::invalid_argument,
1329  prefixRaw << "checkSizes() indicates that the "
1330  "destination object is not a legal target for redistribution from the "
1331  "source object. This probably means that they do not have the same "
1332  "dimensions. For example, MultiVectors must have the same number of "
1333  "rows and columns.");
1334  KokkosClassic::ReadWriteOption rwo = KokkosClassic::ReadWrite;
1335  if (CM == INSERT || CM == REPLACE) {
1336  const size_t numIDsToWrite = numSameIDs +
1337  static_cast<size_t> (permuteToLIDs.size ()) +
1338  static_cast<size_t> (remoteLIDs.size ());
1339  if (numIDsToWrite == this->getMap ()->getNodeNumElements ()) {
1340  // We're overwriting all of our local data in the destination
1341  // object, so a write-only view suffices.
1342  //
1343  // FIXME (mfh 10 Apr 2012) This doesn't make sense for a
1344  // CrsMatrix with a dynamic graph. INSERT mode could mean
1345  // that we're adding new entries to the object, but we don't
1346  // want to get rid of the old ones.
1347  rwo = KokkosClassic::WriteOnly;
1348  }
1349  }
1350 
1351  if (verbose) {
1352  std::ostringstream os;
1353  os << *prefix << "ReadWriteOption: ";
1354  if (rwo == KokkosClassic::ReadWrite) {
1355  os << "ReadWrite";
1356  }
1357  else if (rwo == KokkosClassic::WriteOnly) {
1358  os << "ReadWrite";
1359  }
1360  else {
1361  os << "Something else; weird!";
1362  }
1363  os << endl;
1364  std::cerr << os.str ();
1365  }
1366 
1367  // Tell the source to create a read-only view of its data. On a
1368  // discrete accelerator such as a GPU, this brings EVERYTHING from
1369  // device memory to host memory.
1370  //
1371  // FIXME (mfh 23 Mar 2012) By passing in the list of GIDs (or
1372  // rather, local LIDs to send) and packet counts, createViews()
1373  // could create a "sparse view" that only brings in the necessary
1374  // data from device to host memory.
1375  const this_type* srcDistObj = dynamic_cast<const this_type*> (&src);
1376  if (srcDistObj != NULL) {
1377  if (verbose) {
1378  std::ostringstream os;
1379  os << *prefix << "Call srcDistObject->createViews()" << endl;
1380  std::cerr << os.str ();
1381  }
1382  srcDistObj->createViews ();
1383  }
1384  else {
1385  if (verbose) {
1386  std::ostringstream os;
1387  os << *prefix << "Source object has a different type than target object"
1388  << endl;
1389  std::cerr << os.str ();
1390  }
1391  }
1392 
1393  // Tell the target to create a view of its data. Depending on
1394  // rwo, this could be a write-only view or a read-and-write view.
1395  // On a discrete accelerator such as a GPU, a write-only view only
1396  // requires a transfer from host to device memory. A
1397  // read-and-write view requires a two-way transfer. This has the
1398  // same problem as createViews(): it transfers EVERYTHING, not
1399  // just the necessary data.
1400  //
1401  // FIXME (mfh 23 Mar 2012) By passing in the list of GIDs (or
1402  // rather, local LIDs into which to receive) and packet counts,
1403  // createViewsNonConst() could create a "sparse view" that only
1404  // transfers the necessary data.
1405  if (verbose) {
1406  std::ostringstream os;
1407  os << *prefix << "Call createViewsNonConst" << endl;
1408  std::cerr << os.str ();
1409  }
1410  this->createViewsNonConst (rwo);
1411 
1412  if (!restrictedMode && numSameIDs + permuteToLIDs.size()) {
1413 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1414  Teuchos::TimeMonitor copyAndPermuteMon (*copyAndPermuteTimer_);
1415 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1416  if (verbose) {
1417  std::ostringstream os;
1418  os << *prefix << "Call copyAndPermute" << endl;
1419  std::cerr << os.str ();
1420  }
1421  // There is at least one GID to copy or permute.
1422  copyAndPermute (src, numSameIDs, permuteToLIDs, permuteFromLIDs);
1423  }
1424  else {
1425  if (verbose) {
1426  std::ostringstream os;
1427  os << *prefix << "Skipping copyAndPermute" << endl;
1428  std::cerr << os.str ();
1429  }
1430  }
1431 
1432  // The method may return zero even if the implementation actually
1433  // does have a constant number of packets per LID. However, if it
1434  // returns nonzero, we may use this information to avoid
1435  // (re)allocating num{Ex,Im}portPacketsPerLID_. packAndPrepare()
1436  // will set this to its final value.
1437  //
1438  // We only need this if CM != ZERO, but it has to be lifted out of
1439  // that scope because there are multiple tests for CM != ZERO.
1440  size_t constantNumPackets = this->constantNumberOfPackets ();
1441  if (verbose) {
1442  std::ostringstream os;
1443  os << *prefix << "constantNumPackets=" << constantNumPackets << endl;
1444  std::cerr << os.str ();
1445  }
1446 
1447  // We only need to pack communication buffers if the combine mode
1448  // is not ZERO. A "ZERO combine mode" means that the results are
1449  // the same as if we had received all zeros, and added them to the
1450  // existing values. That means we don't need to communicate.
1451  if (CM != ZERO) {
1452  if (constantNumPackets == 0) {
1453  this->reallocArraysForNumPacketsPerLid (exportLIDs.size (),
1454  remoteLIDs.size ());
1455  }
1456 
1457  if (verbose) {
1458  std::ostringstream os;
1459  os << *prefix << "Preparing for packAndPrepare" << endl;
1460  std::cerr << os.str ();
1461  }
1462  {
1463 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1464  Teuchos::TimeMonitor packAndPrepareMon (*packAndPrepareTimer_);
1465 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1466  // Ask the source to pack data. Also ask it whether there are a
1467  // constant number of packets per element (constantNumPackets is
1468  // an output argument). If there are, constantNumPackets will
1469  // come back nonzero. Otherwise, the source will fill the
1470  // numExportPacketsPerLID_ array.
1471  numExportPacketsPerLID_.modify_host ();
1472  Teuchos::ArrayView<size_t> numExportPacketsPerLID =
1473  getArrayViewFromDualView (numExportPacketsPerLID_);
1474 
1475  // FIXME (mfh 26 Apr 2016) For backwards compatibility, use
1476  // the old packAndPrepare interface that takes and resizes the
1477  // exports buffer as a Teuchos::Array<packet_type>. Then,
1478  // copy out that buffer into the host version of exports_.
1479 
1480  Teuchos::Array<packet_type> exportsOld;
1481  if (verbose) {
1482  std::ostringstream os;
1483  os << *prefix << "Call packAndPrepare" << endl;
1484  std::cerr << os.str ();
1485  }
1486  packAndPrepare (src, exportLIDs, exportsOld, numExportPacketsPerLID,
1487  constantNumPackets, distor);
1488  const size_t exportsLen = static_cast<size_t> (exportsOld.size ());
1489  reallocDualViewIfNeeded (this->exports_, exportsLen, "exports");
1490  Kokkos::View<const packet_type*, Kokkos::HostSpace,
1491  Kokkos::MemoryUnmanaged> exportsOldK (exportsOld.getRawPtr (),
1492  exportsLen);
1493  exports_.modify_host ();
1494  Kokkos::deep_copy (exports_.view_host (),
1495  exportsOldK);
1496  }
1497  }
1498 
1499  // We don't need the source's data anymore, so it can let go of
1500  // its views. On an accelerator device with a separate memory
1501  // space (like a GPU), this frees host memory, since device memory
1502  // has the "master" version of the data.
1503  if (srcDistObj != nullptr) {
1504  if (verbose) {
1505  std::ostringstream os;
1506  os << *prefix << "Call srcDistObj->releaseViews()" << endl;
1507  std::cerr << os.str ();
1508  }
1509  srcDistObj->releaseViews ();
1510  }
1511  else {
1512  if (verbose) {
1513  std::ostringstream os;
1514  os << *prefix << "Skipping srcDistObj->releaseViews()" << endl;
1515  std::cerr << os.str ();
1516  }
1517  }
1518 
1519  // We only need to send data if the combine mode is not ZERO.
1520  if (CM != ZERO) {
1521  if (constantNumPackets != 0) {
1522  // There are a constant number of packets per element. We
1523  // already know (from the number of "remote" (incoming)
1524  // elements) how many incoming elements we expect, so we can
1525  // resize the buffer accordingly.
1526  const size_t rbufLen = remoteLIDs.size() * constantNumPackets;
1527  if (verbose) {
1528  std::ostringstream os;
1529  os << *prefix << "Const # packets: imports_.extent(0)="
1530  << imports_.extent (0) << ", ; calling reallocImportsIfNeeded("
1531  "rbufLen=" << rbufLen << ", verbose=true)" << endl;
1532  std::cerr << os.str ();
1533  }
1534  reallocImportsIfNeeded (rbufLen, verbose, prefix.get ());
1535  }
1536 
1537  // Do we need to do communication (via doPostsAndWaits)?
1538  bool needCommunication = true;
1539  if (revOp == DoReverse && ! isDistributed ()) {
1540  needCommunication = false;
1541  }
1542  // FIXME (mfh 30 Jun 2013): Checking whether the source object
1543  // is distributed requires a cast to DistObject. If it's not a
1544  // DistObject, then I'm not quite sure what to do. Perhaps it
1545  // would be more appropriate for SrcDistObject to have an
1546  // isDistributed() method. For now, I'll just assume that we
1547  // need to do communication unless the cast succeeds and the
1548  // source is not distributed.
1549  else if (revOp == DoForward && srcDistObj != NULL &&
1550  ! srcDistObj->isDistributed ()) {
1551  needCommunication = false;
1552  }
1553 
1554  if (verbose) {
1555  std::ostringstream os;
1556  os << *prefix << "needCommunication="
1557  << (needCommunication ? "true" : "false")
1558  << ", revOp="
1559  << (revOp == DoReverse ? "DoReverse" : "DoForward") << endl;
1560  std::cerr << os.str ();
1561  }
1562 
1563  if (needCommunication) {
1564  if (revOp == DoReverse) {
1565 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1566  Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_);
1567 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1568  if (constantNumPackets == 0) { //variable num-packets-per-LID:
1569  // First communicate the number of packets per LID to receive.
1570 
1571  // Make sure that host has the latest version, since we're
1572  // using the version on host. If host has the latest
1573  // version already, syncing to host does nothing.
1574  numExportPacketsPerLID_.sync_host ();
1575  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
1576  getArrayViewFromDualView (numExportPacketsPerLID_);
1577 
1578  // numImportPacketsPerLID_ is the output array here, so
1579  // mark it as modified. It's strictly output, so we don't
1580  // have to sync from device.
1581  //numImportPacketsPerLID_.sync_host ();
1582  numImportPacketsPerLID_.modify_host ();
1583  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
1584  getArrayViewFromDualView (numImportPacketsPerLID_);
1585 
1586  if (verbose) {
1587  std::ostringstream os;
1588  os << *prefix << "Call doReversePostsAndWaits (3-arg)" << endl;
1589  std::cerr << os.str ();
1590  }
1591  distor.doReversePostsAndWaits (numExportPacketsPerLID, 1,
1592  numImportPacketsPerLID);
1593 
1594  if (verbose) {
1595  std::ostringstream os;
1596  os << *prefix << "Compute totalImportPackets" << endl;
1597  std::cerr << os.str ();
1598  }
1599  size_t totalImportPackets = 0;
1600  {
1601  typedef typename Kokkos::DualView<size_t*,
1602  device_type>::t_host::execution_space host_exec_space;
1603  typedef Kokkos::RangePolicy<host_exec_space, Array_size_type> range_type;
1604  const size_t* const arrayToSum = numImportPacketsPerLID.getRawPtr ();
1605  Kokkos::parallel_reduce ("Count import packets",
1606  range_type (0, numImportPacketsPerLID.size ()),
1607  [=] (const Array_size_type& i, size_t& lclSum) {
1608  lclSum += arrayToSum[i];
1609  }, totalImportPackets);
1610  }
1611 
1612  if (verbose) {
1613  std::ostringstream os;
1614  os << *prefix << "totalImportPackets=" << totalImportPackets
1615  << "; calling reallocImportsIfNeeded" << endl;
1616  std::cerr << os.str ();
1617  }
1618  reallocImportsIfNeeded (totalImportPackets, verbose, prefix.get ());
1619 
1620  // We don't need to sync imports_, because it is only for
1621  // output here. Similarly, we don't need to mark exports_
1622  // as modified, since it is read only here. This legacy
1623  // version of doTransfer only uses host arrays.
1624  imports_.modify_host ();
1625  Teuchos::ArrayView<packet_type> hostImports =
1626  getArrayViewFromDualView (imports_);
1627  exports_.sync_host ();
1628  Teuchos::ArrayView<const packet_type> hostExports =
1629  getArrayViewFromDualView (exports_);
1630 
1631  if (verbose) {
1632  std::ostringstream os;
1633  os << *prefix << "Call doReversePostsAndWaits (4-arg)"
1634  << endl;
1635  std::cerr << os.str ();
1636  }
1637  distor.doReversePostsAndWaits (hostExports,
1638  numExportPacketsPerLID,
1639  hostImports,
1640  numImportPacketsPerLID);
1641  }
1642  else {
1643  // We don't need to sync imports_, because it is only for
1644  // output here. Similarly, we don't need to mark exports_
1645  // as modified, since it is read only here. This legacy
1646  // version of doTransfer only uses host arrays.
1647  imports_.modify_host ();
1648  Teuchos::ArrayView<packet_type> hostImports =
1649  getArrayViewFromDualView (imports_);
1650  exports_.sync_host ();
1651  Teuchos::ArrayView<const packet_type> hostExports =
1652  getArrayViewFromDualView (exports_);
1653 
1654  if (verbose) {
1655  std::ostringstream os;
1656  os << *prefix << "Call doReversePostsAndWaits (3-arg)"
1657  << endl;
1658  std::cerr << os.str ();
1659  }
1660  distor.doReversePostsAndWaits (hostExports,
1661  constantNumPackets,
1662  hostImports);
1663  }
1664  }
1665  else { // revOp == DoForward
1666 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1667  Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_);
1668 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1669  if (constantNumPackets == 0) { //variable num-packets-per-LID:
1670  // First communicate the number of packets per LID to receive.
1671 
1672  // Make sure that host has the latest version, since we're
1673  // using the version on host. If host has the latest
1674  // version already, syncing to host does nothing.
1675  numExportPacketsPerLID_.sync_host ();
1676  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
1677  getArrayViewFromDualView (numExportPacketsPerLID_);
1678 
1679  // numImportPacketsPerLID_ is the output array here, so
1680  // mark it as modified. It's strictly output, so we don't
1681  // have to sync from device.
1682  //numImportPacketsPerLID_.sync_host ();
1683  numImportPacketsPerLID_.modify_host ();
1684  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
1685  getArrayViewFromDualView (numImportPacketsPerLID_);
1686 
1687  if (verbose) {
1688  std::ostringstream os;
1689  os << *prefix << "Call doPostsAndWaits (3-arg)" << endl;
1690  std::cerr << os.str ();
1691  }
1692  distor.doPostsAndWaits (numExportPacketsPerLID, 1,
1693  numImportPacketsPerLID);
1694 
1695  if (verbose) {
1696  std::ostringstream os;
1697  os << *prefix << "Compute totalImportPackets" << endl;
1698  std::cerr << os.str ();
1699  }
1700  size_t totalImportPackets = 0;
1701  {
1702  typedef typename Kokkos::DualView<size_t*,
1703  device_type>::t_host::execution_space host_exec_space;
1704  typedef Kokkos::RangePolicy<host_exec_space, Array_size_type> range_type;
1705  const size_t* const arrayToSum = numImportPacketsPerLID.getRawPtr ();
1706  Kokkos::parallel_reduce ("Count import packets",
1707  range_type (0, numImportPacketsPerLID.size ()),
1708  [=] (const Array_size_type& i, size_t& lclSum) {
1709  lclSum += arrayToSum[i];
1710  }, totalImportPackets);
1711  }
1712 
1713  if (verbose) {
1714  std::ostringstream os;
1715  os << *prefix << "totalImportPackets=" << totalImportPackets
1716  << "; calling reallocImportsIfNeeded" << endl;
1717  std::cerr << os.str ();
1718  }
1719  reallocImportsIfNeeded (totalImportPackets, verbose, prefix.get ());
1720 
1721  // We don't need to sync imports_, because it is only for
1722  // output here. Similarly, we don't need to mark exports_
1723  // as modified, since it is read only here. This legacy
1724  // version of doTransfer only uses host arrays.
1725  imports_.modify_host ();
1726  Teuchos::ArrayView<packet_type> hostImports =
1727  getArrayViewFromDualView (imports_);
1728  exports_.sync_host ();
1729  Teuchos::ArrayView<const packet_type> hostExports =
1730  getArrayViewFromDualView (exports_);
1731 
1732  if (verbose) {
1733  std::ostringstream os;
1734  os << *prefix << "Call doPostsAndWaits (4-arg)" << endl;
1735  std::cerr << os.str ();
1736  }
1737  distor.doPostsAndWaits (hostExports,
1738  numExportPacketsPerLID,
1739  hostImports,
1740  numImportPacketsPerLID);
1741  }
1742  else {
1743  // We don't need to sync imports_, because it is only for
1744  // output here. Similarly, we don't need to mark exports_
1745  // as modified, since it is read only here. This legacy
1746  // version of doTransfer only uses host arrays.
1747  imports_.modify_host ();
1748  Teuchos::ArrayView<packet_type> hostImports =
1749  getArrayViewFromDualView (imports_);
1750  exports_.sync_host ();
1751  Teuchos::ArrayView<const packet_type> hostExports =
1752  getArrayViewFromDualView (exports_);
1753 
1754  if (verbose) {
1755  std::ostringstream os;
1756  os << *prefix << "Call doPostsAndWaits (3-arg)" << endl;
1757  std::cerr << os.str ();
1758  }
1759  distor.doPostsAndWaits (hostExports,
1760  constantNumPackets,
1761  hostImports);
1762  }
1763  }
1764 
1765  if (verbose) {
1766  std::ostringstream os;
1767  os << *prefix << "Preparing for unpackAndCombine" << endl;
1768  std::cerr << os.str ();
1769  }
1770  {
1771 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1772  Teuchos::TimeMonitor unpackAndCombineMon (*unpackAndCombineTimer_);
1773 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1774 
1775  // We don't need to sync imports_, because it is only for
1776  // output here. This legacy version of doTransfer only uses
1777  // host arrays.
1778  imports_.modify_host ();
1779  Teuchos::ArrayView<packet_type> hostImports =
1780  getArrayViewFromDualView (imports_);
1781  // NOTE (mfh 25 Apr 2016) unpackAndCombine doesn't actually
1782  // change its numImportPacketsPerLID argument, so we don't
1783  // have to mark it modified here.
1784  numImportPacketsPerLID_.sync_host ();
1785  // FIXME (mfh 25 Apr 2016) unpackAndCombine doesn't actually
1786  // change its numImportPacketsPerLID argument, so we should
1787  // be able to use a const Teuchos::ArrayView here.
1788  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
1789  getArrayViewFromDualView (numImportPacketsPerLID_);
1790 
1791  if (verbose) {
1792  std::ostringstream os;
1793  os << *prefix << "Call unpackAndCombine" << endl;
1794  std::cerr << os.str ();
1795  }
1796  unpackAndCombine (remoteLIDs, hostImports, numImportPacketsPerLID,
1797  constantNumPackets, distor, CM);
1798  }
1799  }
1800  } // if (CM != ZERO)
1801 
1802  if (verbose) {
1803  std::ostringstream os;
1804  os << *prefix << "Call releaseViews()" << endl;
1805  std::cerr << os.str ();
1806  }
1807  this->releaseViews ();
1808 
1809  if (verbose) {
1810  std::ostringstream os;
1811  os << *prefix << "Done!" << endl;
1812  std::cerr << os.str ();
1813  }
1814  }
1815 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1816 
// Default implementation of the DualView-based copy-and-permute hook.
// The body is empty and all four arguments are unnamed, so nothing is read
// or modified.  Per the class documentation the arguments are, in order:
// source, numSameIDs, permuteToLIDs, permuteFromLIDs.
// The member is named copyAndPermuteNew when TPETRA_ENABLE_DEPRECATED_CODE
// is defined, and copyAndPermute otherwise.
1817  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1818  void
1819  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1820 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1821  copyAndPermuteNew
1822 #else // TPETRA_ENABLE_DEPRECATED_CODE
1823  copyAndPermute
1824 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1825  (const SrcDistObject&,
1826  const size_t,
1827  const Kokkos::DualView<
1828  const local_ordinal_type*,
1829  buffer_device_type>&,
1830  const Kokkos::DualView<
1831  const local_ordinal_type*,
1832  buffer_device_type>&)
1833  {}
1834 
// Default implementation of the DualView-based pack hook.
// The body is empty and all six arguments are unnamed: the exports buffer,
// the per-LID packet counts, and the size_t& output are left untouched.
// Per the class documentation the arguments are, in order: source,
// exportLIDs, exports, numPacketsPerLID, constantNumPackets, distor.
// The member is named packAndPrepareNew when TPETRA_ENABLE_DEPRECATED_CODE
// is defined, and packAndPrepare otherwise.
1835  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1836  void
1837  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1838 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1839  packAndPrepareNew
1840 #else // TPETRA_ENABLE_DEPRECATED_CODE
1841  packAndPrepare
1842 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1843  (const SrcDistObject&,
1844  const Kokkos::DualView<
1845  const local_ordinal_type*,
1846  buffer_device_type>&,
1847  Kokkos::DualView<
1848  packet_type*,
1849  buffer_device_type>&,
1850  Kokkos::DualView<
1851  size_t*,
1852  buffer_device_type>,
1853  size_t&,
1854  Distributor&)
1855  {}
1856 
// Default implementation of the DualView-based unpack-and-combine hook.
// The body is empty and every parameter name is commented out, so none of
// the arguments is read or modified.
// The member is named unpackAndCombineNew when TPETRA_ENABLE_DEPRECATED_CODE
// is defined, and unpackAndCombine otherwise.
1857  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1858  void
1859  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1860 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1861  unpackAndCombineNew
1862 #else // TPETRA_ENABLE_DEPRECATED_CODE
1863  unpackAndCombine
1864 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1865  (const Kokkos::DualView<
1866  const local_ordinal_type*,
1867  buffer_device_type>& /* importLIDs */,
1868  Kokkos::DualView<
1869  packet_type*,
1870  buffer_device_type> /* imports */,
1871  Kokkos::DualView<
1872  size_t*,
1873  buffer_device_type> /* numPacketsPerLID */,
1874  const size_t /* constantNumPackets */,
1875  Distributor& /* distor */,
1876  const CombineMode /* combineMode */)
1877  {}
1878 
1879 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated (TPETRA_DEPRECATED) Teuchos::ArrayView overload of
// copyAndPermute, compiled only when TPETRA_ENABLE_DEPRECATED_CODE is
// defined.  Default implementation: the body is empty and every parameter
// name is commented out, so no argument is read or modified.
1880  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1881  void TPETRA_DEPRECATED
1882  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1883  copyAndPermute (const SrcDistObject& /* source */,
1884  const size_t /* numSameIDs */,
1885  const Teuchos::ArrayView<const local_ordinal_type>& /* permuteToLIDs */,
1886  const Teuchos::ArrayView<const local_ordinal_type>& /* permuteFromLIDs */)
1887  {}
1888 
// Deprecated (TPETRA_DEPRECATED) Teuchos::ArrayView / Teuchos::Array
// overload of packAndPrepare, compiled only when
// TPETRA_ENABLE_DEPRECATED_CODE is defined.  Default implementation: the
// body is empty, so exports, numPacketsPerLID and constantNumPackets are
// left exactly as the caller passed them.
1889  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1890  void TPETRA_DEPRECATED
1891  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1892  packAndPrepare (const SrcDistObject& /* source */,
1893  const Teuchos::ArrayView<const local_ordinal_type>& /* exportLIDs */,
1894  Teuchos::Array<packet_type>& /* exports */,
1895  const Teuchos::ArrayView<size_t>& /* numPacketsPerLID */,
1896  size_t& /* constantNumPackets */,
1897  Distributor& /* distor */)
1898  {}
1899 
// Deprecated (TPETRA_DEPRECATED) Teuchos::ArrayView overload of
// unpackAndCombine, compiled only when TPETRA_ENABLE_DEPRECATED_CODE is
// defined.  This is the overload the legacy host-array doTransfer path
// calls after communication.  Default implementation: the body is empty
// and every parameter name is commented out, so no argument is read or
// modified.
1900  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1901  void TPETRA_DEPRECATED
1902  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1903  unpackAndCombine (const Teuchos::ArrayView<const local_ordinal_type>& /* importLIDs */,
1904  const Teuchos::ArrayView<const packet_type>& /* imports */,
1905  const Teuchos::ArrayView<size_t>& /* numPacketsPerLID */,
1906  const size_t /* constantNumPackets */,
1907  Distributor& /* distor */,
1908  const CombineMode /* combineMode */)
1909  {}
1910 
// Deprecated (TPETRA_DEPRECATED) hook, compiled only when
// TPETRA_ENABLE_DEPRECATED_CODE is defined.  Default implementation: empty
// body, no effect on the object.
1911  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1912  void TPETRA_DEPRECATED
1913  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1914  createViews () const
1915  {}
1916 
// Deprecated (TPETRA_DEPRECATED) hook, compiled only when
// TPETRA_ENABLE_DEPRECATED_CODE is defined.  Default implementation: empty
// body; the read/write option argument is ignored.
1917  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1918  void TPETRA_DEPRECATED
1919  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1920  createViewsNonConst (KokkosClassic::ReadWriteOption /*rwo*/)
1921  {}
1922 
// Deprecated (TPETRA_DEPRECATED) hook, compiled only when
// TPETRA_ENABLE_DEPRECATED_CODE is defined.  The legacy doTransfer path
// calls it as its last step.  Default implementation: empty body, no effect
// on the object.
1923  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1924  void TPETRA_DEPRECATED
1925  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1926  releaseViews () const
1927  {}
1928 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1929 
// Print this object to the given output stream.
//
// os: the std::ostream to write to.
//
// The work is delegated to describe(), which takes a Teuchos::FancyOStream;
// os is therefore wrapped first, and describe() is invoked at
// Teuchos::VERB_DEFAULT verbosity.
1930  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1931  void
1932  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1933  print (std::ostream& os) const
1934  {
1935  using Teuchos::FancyOStream;
1936  using Teuchos::getFancyOStream;
1937  using Teuchos::RCP;
1938  using Teuchos::rcpFromRef;
1940 
// Wrap the caller's stream (passed by reference) for describe().
1941  RCP<FancyOStream> out = getFancyOStream (rcpFromRef (os));
1942  this->describe (*out, Teuchos::VERB_DEFAULT);
1943  }
1944 
// Nonmember wrapper around DistObjectType::removeEmptyProcessesInPlace.
//
// input:  handle to the object to modify.  It is dereferenced
//         unconditionally, so it must be non-null on entry.  On exit it is
//         set to Teuchos::null if newMap is null.
// newMap: the Map forwarded to input->removeEmptyProcessesInPlace().  A
//         null newMap means the calling process is excluded.
//
// DistObjectType must expose local_ordinal_type, global_ordinal_type and
// node_type, plus a removeEmptyProcessesInPlace(newMap) member.
1945  template<class DistObjectType>
1946  void
1947  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input,
1948  const Teuchos::RCP<const Map<typename DistObjectType::local_ordinal_type,
1949  typename DistObjectType::global_ordinal_type,
1950  typename DistObjectType::node_type> >& newMap)
1951  {
// The object must process the new Map before the handle is (possibly) reset.
1952  input->removeEmptyProcessesInPlace (newMap);
1953  if (newMap.is_null ()) { // my process is excluded
1954  input = Teuchos::null;
1955  }
1956  }
1957 
// Convenience overload that derives the new Map itself.
//
// It obtains newMap from input->getMap()->removeEmptyProcesses() and then
// delegates to the two-argument overload, which may reset input to null.
// Both input and its Map are dereferenced unconditionally, so both must be
// non-null on entry.
1958  template<class DistObjectType>
1959  void
1960  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input)
1961  {
1962  auto newMap = input->getMap ()->removeEmptyProcesses ();
1963  removeEmptyProcessesInPlace<DistObjectType> (input, newMap);
1964  }
1965 
1966 // Explicit instantiation macro for general DistObject.
// Expands to "template class DistObject< SCALAR , LO , GO , NODE >;".
// The expansion already ends in a semicolon, and it names DistObject
// unqualified.
1967 #define TPETRA_DISTOBJECT_INSTANT(SCALAR, LO, GO, NODE) \
1968  template class DistObject< SCALAR , LO , GO , NODE >;
1969 
1970 // Explicit instantiation macro for DistObject<char, ...>.
1971 // The "SLGN" stuff above doesn't work for Packet=char.
// Takes only LO, GO and NODE; the Packet argument is fixed to char.
// The expansion already ends in a semicolon.
1972 #define TPETRA_DISTOBJECT_INSTANT_CHAR(LO, GO, NODE) \
1973  template class DistObject< char , LO , GO , NODE >;
1974 
1975 } // namespace Tpetra
1976 
1977 #endif // TPETRA_DISTOBJECT_DEF_HPP
void doPostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the (forward) communication plan.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void doImport(const SrcDistObject &source, const Import< LocalOrdinal, GlobalOrdinal, Node > &importer, const CombineMode CM, const bool restrictedMode=false)
Import data into this object using an Import object ("forward mode").
virtual void packAndPrepare(const SrcDistObject &source, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets, Distributor &distor)
Pack data and metadata for communication (sends).
virtual void doTransferNew(const SrcDistObject &src, const CombineMode CM, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &remoteLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Distributor &distor, const ReverseOption revOp, const bool commOnHost, const bool restrictedMode)
Implementation detail of doTransfer.
void print(std::ostream &os) const
Print this object to the given output stream.
virtual bool reallocArraysForNumPacketsPerLid(const size_t numExportLIDs, const size_t numImportLIDs)
Reallocate numExportPacketsPerLID_ and/or numImportPacketsPerLID_, if necessary.
bool isDistributed() const
Whether this is a globally distributed object.
void removeEmptyProcessesInPlace(Teuchos::RCP< DistObjectType > &input, const Teuchos::RCP< const Map< typename DistObjectType::local_ordinal_type, typename DistObjectType::global_ordinal_type, typename DistObjectType::node_type > > &newMap)
Remove processes which contain no elements in this object's Map.
static bool debug()
Whether Tpetra is in debug mode.
int local_ordinal_type
Default value of LocalOrdinal template parameter.
virtual void doTransfer(const SrcDistObject &src, const ::Tpetra::Details::Transfer< local_ordinal_type, global_ordinal_type, node_type > &transfer, const char modeString[], const ReverseOption revOp, const CombineMode CM, const bool restrictedMode)
Redistribute data across (MPI) processes.
typename Node::device_type device_type
The Kokkos Device type.
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
void doReversePostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the reverse communication plan.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Sets up and executes a communication plan for a Tpetra DistObject.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
Abstract base class for objects that can be the source of an Import or Export operation.
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix)
Reallocate imports_ if needed.
Replace existing values with new values.
LocalOrdinal local_ordinal_type
The type of local indices.
Replace old values with zero.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
ReverseOption
Whether the data transfer should be performed in forward or reverse mode.
DistObject(const Teuchos::RCP< const map_type > &map)
Constructor.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
virtual std::string description() const
One-line description of this object.
virtual size_t constantNumberOfPackets() const
Whether the implementation's instance promises always to have a constant number of packets per LID (l...
A parallel distribution of indices over processes.
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print a description of this object to the given output stream.
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
virtual void copyAndPermute(const SrcDistObject &source, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs)
Perform copies and permutations that are local to the calling (MPI) process.
Base class for distributed Tpetra objects that support data redistribution.
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode)
Perform any unpacking and combining after communication.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap)
Remove processes which contain no entries in this object's Map.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.