Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Tpetra_DistObject_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DISTOBJECT_DEF_HPP
43 #define TPETRA_DISTOBJECT_DEF_HPP
44 
52 
53 #include "Tpetra_Distributor.hpp"
57 #include <typeinfo>
58 #include <memory>
59 #include <sstream>
60 
61 namespace Tpetra {
62 
63  namespace { // (anonymous)
64  template<class DeviceType, class IndexType = size_t>
65  struct SumFunctor {
66  SumFunctor (const Kokkos::View<const size_t*, DeviceType>& viewToSum) :
67  viewToSum_ (viewToSum) {}
68  KOKKOS_INLINE_FUNCTION void operator() (const IndexType i, size_t& lclSum) const {
69  lclSum += viewToSum_(i);
70  }
71  Kokkos::View<const size_t*, DeviceType> viewToSum_;
72  };
73 
74  template<class DeviceType, class IndexType = size_t>
75  size_t
76  countTotalImportPackets (const Kokkos::View<const size_t*, DeviceType>& numImportPacketsPerLID)
77  {
78  using Kokkos::parallel_reduce;
79  typedef DeviceType DT;
80  typedef typename DT::execution_space DES;
81  typedef Kokkos::RangePolicy<DES, IndexType> range_type;
82 
83  const IndexType numOut = numImportPacketsPerLID.extent (0);
84  size_t totalImportPackets = 0;
85  parallel_reduce ("Count import packets",
86  range_type (0, numOut),
87  SumFunctor<DeviceType, IndexType> (numImportPacketsPerLID),
88  totalImportPackets);
89  return totalImportPackets;
90  }
91  } // namespace (anonymous)
92 
93 
94  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
96  DistObject (const Teuchos::RCP<const map_type>& map) :
97  map_ (map)
98  {
99 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
100  using Teuchos::RCP;
101  using Teuchos::Time;
102  using Teuchos::TimeMonitor;
103 
104  RCP<Time> doXferTimer =
105  TimeMonitor::lookupCounter ("Tpetra::DistObject::doTransfer");
106  if (doXferTimer.is_null ()) {
107  doXferTimer =
108  TimeMonitor::getNewCounter ("Tpetra::DistObject::doTransfer");
109  }
110  doXferTimer_ = doXferTimer;
111 
112  RCP<Time> copyAndPermuteTimer =
113  TimeMonitor::lookupCounter ("Tpetra::DistObject::copyAndPermute");
114  if (copyAndPermuteTimer.is_null ()) {
115  copyAndPermuteTimer =
116  TimeMonitor::getNewCounter ("Tpetra::DistObject::copyAndPermute");
117  }
118  copyAndPermuteTimer_ = copyAndPermuteTimer;
119 
120  RCP<Time> packAndPrepareTimer =
121  TimeMonitor::lookupCounter ("Tpetra::DistObject::packAndPrepare");
122  if (packAndPrepareTimer.is_null ()) {
123  packAndPrepareTimer =
124  TimeMonitor::getNewCounter ("Tpetra::DistObject::packAndPrepare");
125  }
126  packAndPrepareTimer_ = packAndPrepareTimer;
127 
128  RCP<Time> doPostsAndWaitsTimer =
129  TimeMonitor::lookupCounter ("Tpetra::DistObject::doPostsAndWaits");
130  if (doPostsAndWaitsTimer.is_null ()) {
131  doPostsAndWaitsTimer =
132  TimeMonitor::getNewCounter ("Tpetra::DistObject::doPostsAndWaits");
133  }
134  doPostsAndWaitsTimer_ = doPostsAndWaitsTimer;
135 
136  RCP<Time> unpackAndCombineTimer =
137  TimeMonitor::lookupCounter ("Tpetra::DistObject::unpackAndCombine");
138  if (unpackAndCombineTimer.is_null ()) {
139  unpackAndCombineTimer =
140  TimeMonitor::getNewCounter ("Tpetra::DistObject::unpackAndCombine");
141  }
142  unpackAndCombineTimer_ = unpackAndCombineTimer;
143 #endif // HAVE_TPETRA_TRANSFER_TIMERS
144  }
145 
146  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
147  std::string
149  description () const
150  {
151  using Teuchos::TypeNameTraits;
152 
153  std::ostringstream os;
154  os << "\"Tpetra::DistObject\": {"
155  << "Packet: " << TypeNameTraits<packet_type>::name ()
156  << ", LocalOrdinal: " << TypeNameTraits<local_ordinal_type>::name ()
157  << ", GlobalOrdinal: " << TypeNameTraits<global_ordinal_type>::name ()
158  << ", Node: " << TypeNameTraits<Node>::name ();
159  if (this->getObjectLabel () != "") {
160  os << "Label: \"" << this->getObjectLabel () << "\"";
161  }
162  os << "}";
163  return os.str ();
164  }
165 
// Print a description of this object to `out` at the given verbosity.
//
// VERB_DEFAULT is treated as VERB_LOW; VERB_NONE prints nothing.
// Process 0 prints the class name, the template parameters and the
// object label (if nonempty).  The Map's describe() is then called
// on every process.  Above VERB_LOW, each process in turn prints the
// extents of its export and import buffers.
166  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
167  void
169  describe (Teuchos::FancyOStream &out,
170  const Teuchos::EVerbosityLevel verbLevel) const
171  {
172  using Teuchos::rcpFromRef;
173  using Teuchos::TypeNameTraits;
174  using std::endl;
// Map the default verbosity to VERB_LOW.
175  const Teuchos::EVerbosityLevel vl = (verbLevel == Teuchos::VERB_DEFAULT) ?
176  Teuchos::VERB_LOW : verbLevel;
177  Teuchos::RCP<const Teuchos::Comm<int> > comm = this->getMap ()->getComm ();
// With a null communicator, behave as a single process with rank 0.
178  const int myRank = comm.is_null () ? 0 : comm->getRank ();
179  const int numProcs = comm.is_null () ? 1 : comm->getSize ();
180 
181  if (vl != Teuchos::VERB_NONE) {
182  Teuchos::OSTab tab0 (out);
183  if (myRank == 0) {
184  out << "\"Tpetra::DistObject\":" << endl;
185  }
186  Teuchos::OSTab tab1 (out);
187  if (myRank == 0) {
188  out << "Template parameters:" << endl;
189  {
190  Teuchos::OSTab tab2 (out);
191  out << "Packet: " << TypeNameTraits<packet_type>::name () << endl
192  << "LocalOrdinal: " << TypeNameTraits<local_ordinal_type>::name () << endl
193  << "GlobalOrdinal: " << TypeNameTraits<global_ordinal_type>::name () << endl
194  << "Node: " << TypeNameTraits<node_type>::name () << endl;
195  }
196  if (this->getObjectLabel () != "") {
197  out << "Label: \"" << this->getObjectLabel () << "\"" << endl;
198  }
199  } // if myRank == 0
200 
201  // Describe the Map.
202  {
203  if (myRank == 0) {
204  out << "Map:" << endl;
205  }
206  Teuchos::OSTab tab2 (out);
// Called on every process, not just process 0.
207  map_->describe (out, vl);
208  }
209 
210  // At verbosity > VERB_LOW, each process prints something.
211  if (vl > Teuchos::VERB_LOW) {
// Loop over ranks so that only one process prints per iteration;
// the barriers below separate consecutive iterations.
212  for (int p = 0; p < numProcs; ++p) {
213  if (myRank == p) {
214  out << "Process " << myRank << ":" << endl;
215  Teuchos::OSTab tab2 (out);
216  out << "Export buffer size (in packets): "
217  << exports_.extent (0)
218  << endl
219  << "Import buffer size (in packets): "
220  << imports_.extent (0)
221  << endl;
222  }
223  if (! comm.is_null ()) {
224  comm->barrier (); // give output time to finish
225  comm->barrier ();
226  comm->barrier ();
227  }
228  } // for each process rank p
229  } // if vl > VERB_LOW
230  } // if vl != VERB_NONE
231  }
232 
// Default implementation: unconditionally throws std::logic_error
// ("Not implemented").  The newMap argument is ignored.
233  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
234  void
236  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& /* newMap */)
237  {
238  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error,
239  "Tpetra::DistObject::removeEmptyProcessesInPlace: Not implemented");
240  }
241 
242  /* These are provided in base DistObject template
243  template<class DistObjectType>
244  void
245  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input,
246  const Teuchos::RCP<const Map<typename DistObjectType::local_ordinal_type,
247  typename DistObjectType::global_ordinal_type,
248  typename DistObjectType::node_type> >& newMap)
249  {
250  input->removeEmptyProcessesInPlace (newMap);
251  if (newMap.is_null ()) { // my process is excluded
252  input = Teuchos::null;
253  }
254  }
255 
256  template<class DistObjectType>
257  void
258  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input)
259  {
260  using Teuchos::RCP;
261  typedef typename DistObjectType::local_ordinal_type LO;
262  typedef typename DistObjectType::global_ordinal_type GO;
263  typedef typename DistObjectType::node_type NT;
264  typedef Map<LO, GO, NT> map_type;
265 
266  RCP<const map_type> newMap = input->getMap ()->removeEmptyProcesses ();
267  removeEmptyProcessesInPlace<DistObjectType> (input, newMap);
268  }
269  */
270 
271  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
272  void
274  doImport (const SrcDistObject& source,
276  const CombineMode CM,
277  const bool restrictedMode)
278  {
279  using std::endl;
280  const char modeString[] = "doImport (forward mode)";
281 
282  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
283  // output to std::cerr on every MPI process. This is unwise for
284  // runs with large numbers of MPI processes.
285  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
286  std::unique_ptr<std::string> prefix;
287  if (verbose) {
288  int myRank = 0;
289  auto map = this->getMap ();
290  if (! map.is_null ()) {
291  auto comm = map->getComm ();
292  if (! comm.is_null ()) {
293  myRank = comm->getRank ();
294  }
295  }
296  prefix = [myRank] () {
297  std::ostringstream os;
298  os << "Proc " << myRank << ": Tpetra::DistObject::doTransfer: ";
299  return std::unique_ptr<std::string> (new std::string (os.str ()));
300  } ();
301  std::ostringstream os;
302  os << *prefix << "Start" << endl;
303  std::cerr << os.str ();
304  }
305  this->doTransfer (source, importer, modeString, DoForward, CM, restrictedMode);
306  if (verbose) {
307  std::ostringstream os;
308  os << *prefix << "Done!" << endl;
309  std::cerr << os.str ();
310  }
311  }
312 
// Forward-mode Export: hands `source` and `exporter` to doTransfer()
// with DoForward, passing CM and restrictedMode through unchanged.
// In verbose mode, writes a start line and a "Done!" line to std::cerr.
313  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
314  void
316  doExport (const SrcDistObject& source,
318  const CombineMode CM,
319  const bool restrictedMode)
320  {
321  using std::endl;
// Label used in the verbose output below and passed on to doTransfer().
322  const char modeString[] = "doExport (forward mode)";
323 
324  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
325  // output to std::cerr on every MPI process. This is unwise for
326  // runs with large numbers of MPI processes.
327  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
// Only allocated when verbose is true.
328  std::unique_ptr<std::string> prefix;
329  if (verbose) {
// Rank stays 0 if the Map or its communicator is null.
330  int myRank = 0;
331  auto map = this->getMap ();
332  if (! map.is_null ()) {
333  auto comm = map->getComm ();
334  if (! comm.is_null ()) {
335  myRank = comm->getRank ();
336  }
337  }
338  prefix = [myRank] () {
339  std::ostringstream os;
340  os << "(Proc " << myRank << ") ";
341  return std::unique_ptr<std::string> (new std::string (os.str ()));
342  } ();
// Build the whole line in a string stream, then write it in one call.
343  std::ostringstream os;
344  os << *prefix << "Tpetra::DistObject::" << modeString << ":" << endl;
345  std::cerr << os.str ();
346  }
347  this->doTransfer (source, exporter, modeString, DoForward, CM, restrictedMode);
348  if (verbose) {
349  std::ostringstream os;
350  os << *prefix << "Tpetra::DistObject::" << modeString << ": Done!"
351  << endl;
352  std::cerr << os.str ();
353  }
354  }
355 
// Reverse-mode Import: hands `source` and `exporter` to doTransfer()
// with DoReverse, passing CM and restrictedMode through unchanged.
// In verbose mode, writes a start line and a "Done!" line to std::cerr.
356  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
357  void
359  doImport (const SrcDistObject& source,
361  const CombineMode CM,
362  const bool restrictedMode)
363  {
364  using std::endl;
// Label used in the verbose output below and passed on to doTransfer().
365  const char modeString[] = "doImport (reverse mode)";
366 
367  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
368  // output to std::cerr on every MPI process. This is unwise for
369  // runs with large numbers of MPI processes.
370  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
// Only allocated when verbose is true.
371  std::unique_ptr<std::string> prefix;
372  if (verbose) {
// Rank stays 0 if the Map or its communicator is null.
373  int myRank = 0;
374  auto map = this->getMap ();
375  if (! map.is_null ()) {
376  auto comm = map->getComm ();
377  if (! comm.is_null ()) {
378  myRank = comm->getRank ();
379  }
380  }
381  prefix = [myRank] () {
382  std::ostringstream os;
383  os << "(Proc " << myRank << ") ";
384  return std::unique_ptr<std::string> (new std::string (os.str ()));
385  } ();
// Build the whole line in a string stream, then write it in one call.
386  std::ostringstream os;
387  os << *prefix << "Tpetra::DistObject::" << modeString << ":" << endl;
388  std::cerr << os.str ();
389  }
390  this->doTransfer (source, exporter, modeString, DoReverse, CM, restrictedMode);
391  if (verbose) {
392  std::ostringstream os;
393  os << *prefix << "Tpetra::DistObject::" << modeString << ": Done!"
394  << endl;
395  std::cerr << os.str ();
396  }
397  }
398 
// Reverse-mode Export: hands `source` and `importer` to doTransfer()
// with DoReverse, passing CM and restrictedMode through unchanged.
// In verbose mode, writes a start line and a "Done!" line to std::cerr.
399  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
400  void
402  doExport (const SrcDistObject& source,
404  const CombineMode CM,
405  const bool restrictedMode)
406  {
407  using std::endl;
// Label used in the verbose output below and passed on to doTransfer().
408  const char modeString[] = "doExport (reverse mode)";
409 
410  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
411  // output to std::cerr on every MPI process. This is unwise for
412  // runs with large numbers of MPI processes.
413  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
// Only allocated when verbose is true.
414  std::unique_ptr<std::string> prefix;
415  if (verbose) {
// Rank stays 0 if the Map or its communicator is null.
416  int myRank = 0;
417  auto map = this->getMap ();
418  if (! map.is_null ()) {
419  auto comm = map->getComm ();
420  if (! comm.is_null ()) {
421  myRank = comm->getRank ();
422  }
423  }
424  prefix = [myRank] () {
425  std::ostringstream os;
426  os << "(Proc " << myRank << ") ";
427  return std::unique_ptr<std::string> (new std::string (os.str ()));
428  } ();
// Build the whole line in a string stream, then write it in one call.
429  std::ostringstream os;
430  os << *prefix << "Tpetra::DistObject::" << modeString << ":" << endl;
431  std::cerr << os.str ();
432  }
433  this->doTransfer (source, importer, modeString, DoReverse, CM, restrictedMode);
434  if (verbose) {
435  std::ostringstream os;
436  os << *prefix << "Tpetra::DistObject::" << modeString << ": Done!"
437  << endl;
438  std::cerr << os.str ();
439  }
440  }
441 
// Returns whatever the Map's own isDistributed() reports.
442  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
443  bool
445  isDistributed () const {
446  return map_->isDistributed ();
447  }
448 
// Default implementation that always returns 0.
// NOTE(review): the line naming this member is missing from this
// listing; presumably this is constantNumberOfPackets() -- confirm
// against the class declaration.
449  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
450  size_t
453  return 0; // default implementation; subclasses may override
454  }
455 
456  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
457  void
460  const ::Tpetra::Details::Transfer<local_ordinal_type, global_ordinal_type, node_type>& transfer,
461  const char modeString[],
462  const ReverseOption revOp,
463  const CombineMode CM,
464  bool restrictedMode)
465  {
467  using ::Tpetra::Details::ProfilingRegion;
468  using std::endl;
469 
470  ProfilingRegion region_doTransfer ("Tpetra::DistObject::doTransfer");
471  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
472  std::unique_ptr<std::string> prefix;
473  if (verbose) {
474  auto map = this->getMap ();
475  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
476  const int myRank = comm.is_null () ? -1 : comm->getRank ();
477  std::ostringstream os;
478  os << "Proc " << myRank << ": Tpetra::DistObject::doTransfer: ";
479  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
480  os << *prefix << "Source type: " << typeid (src).name ()
481  << ", Target type: " << typeid (*this).name () << endl;
482  std::cerr << os.str ();
483  }
484 
485  // "Restricted Mode" does two things:
486  // 1) Skips copyAndPermute
487  // 2) Allows the "target" Map of the transfer to be a subset of
488  // the Map of *this, in a "locallyFitted" sense.
489  //
490  // This cannot be used if #2 is not true, OR there are permutes.
491  // Source Maps still need to match
492 
493  // mfh 18 Oct 2017: Set TPETRA_DEBUG to true to enable extra debug
494  // checks. These may communicate more.
495  const bool debug = ::Tpetra::Details::Behavior::debug ();
496  if (debug) {
497  if (!restrictedMode && revOp == DoForward) {
498  const bool myMapSameAsTransferTgtMap =
499  this->getMap ()->isSameAs (* (transfer.getTargetMap ()));
500  TEUCHOS_TEST_FOR_EXCEPTION
501  (! myMapSameAsTransferTgtMap, std::invalid_argument,
502  "Tpetra::DistObject::" << modeString << ": For forward-mode "
503  "communication, the target DistObject's Map must be the same "
504  "(in the sense of Tpetra::Map::isSameAs) as the input "
505  "Export/Import object's target Map.");
506  }
507  else if (!restrictedMode && revOp == DoReverse) {
508  const bool myMapSameAsTransferSrcMap =
509  this->getMap ()->isSameAs (* (transfer.getSourceMap ()));
510  TEUCHOS_TEST_FOR_EXCEPTION
511  (! myMapSameAsTransferSrcMap, std::invalid_argument,
512  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
513  "communication, the target DistObject's Map must be the same "
514  "(in the sense of Tpetra::Map::isSameAs) as the input "
515  "Export/Import object's source Map.");
516  }
517  else if (restrictedMode && revOp == DoForward) {
518  const bool myMapLocallyFittedTransferTgtMap =
519  this->getMap ()->isLocallyFitted (* (transfer.getTargetMap ()));
520  TEUCHOS_TEST_FOR_EXCEPTION
521  (! myMapLocallyFittedTransferTgtMap , std::invalid_argument,
522  "Tpetra::DistObject::" << modeString << ": For forward-mode "
523  "communication using restricted mode, Export/Import object's "
524  "target Map must be locally fitted (in the sense of "
525  "Tpetra::Map::isLocallyFitted) to target DistObject's Map.");
526  }
527  else { // if (restrictedMode && revOp == DoReverse) {
528  const bool myMapLocallyFittedTransferSrcMap =
529  this->getMap ()->isLocallyFitted (* (transfer.getSourceMap ()));
530  TEUCHOS_TEST_FOR_EXCEPTION
531  (! myMapLocallyFittedTransferSrcMap, std::invalid_argument,
532  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
533  "communication using restricted mode, Export/Import object's "
534  "source Map must be locally fitted (in the sense of "
535  "Tpetra::Map::isLocallyFitted) to target DistObject's Map.");
536  }
537 
538  // SrcDistObject need not even _have_ Maps. However, if the
539  // source object is a DistObject, it has a Map, and we may
540  // compare that Map with the Transfer's Maps.
541  const this_type* srcDistObj = dynamic_cast<const this_type*> (&src);
542  if (srcDistObj != nullptr) {
543  if (revOp == DoForward) {
544  const bool srcMapSameAsImportSrcMap =
545  srcDistObj->getMap ()->isSameAs (* (transfer.getSourceMap ()));
546  TEUCHOS_TEST_FOR_EXCEPTION
547  (! srcMapSameAsImportSrcMap, std::invalid_argument,
548  "Tpetra::DistObject::" << modeString << ": For forward-mode "
549  "communication, the source DistObject's Map must be the same "
550  "as the input Export/Import object's source Map.");
551  }
552  else { // revOp == DoReverse
553  const bool srcMapSameAsImportTgtMap =
554  srcDistObj->getMap ()->isSameAs (* (transfer.getTargetMap ()));
555  TEUCHOS_TEST_FOR_EXCEPTION
556  (! srcMapSameAsImportTgtMap, std::invalid_argument,
557  "Tpetra::DistObject::" << modeString << ": For reverse-mode "
558  "communication, the source DistObject's Map must be the same "
559  "as the input Export/Import object's target Map.");
560  }
561  }
562  }
563 
564  const size_t numSameIDs = transfer.getNumSameIDs ();
565  Distributor& distor = transfer.getDistributor ();
566 
567  TEUCHOS_TEST_FOR_EXCEPTION
568  (debug && restrictedMode &&
569  (transfer.getPermuteToLIDs_dv().extent(0) != 0 ||
570  transfer.getPermuteFromLIDs_dv().extent(0) != 0),
571  std::invalid_argument,
572  "Tpetra::DistObject::" << modeString << ": Transfer object "
573  "cannot have permutes in restricted mode.");
574 
575 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
576  const bool useTheNewInterface = this->useNewInterface ();
577 #else
578  const bool useTheNewInterface = true;
579 #endif // TPETRA_ENABLE_DEPRECATED_CODE
580 
581  if (useTheNewInterface) {
582  using ::Tpetra::Details::Behavior;
583  // Do we need all communication buffers to live on host?
584  const bool commOnHost = ! Behavior::assumeMpiIsCudaAware ();
585  if (verbose) {
586  std::ostringstream os;
587  os << *prefix << "doTransfer: Use new interface; "
588  "commOnHost=" << (commOnHost ? "true" : "false") << endl;
589  std::cerr << os.str ();
590  }
591 
592  using const_lo_dv_type =
593  Kokkos::DualView<const local_ordinal_type*, buffer_device_type>;
594  const_lo_dv_type permToLIDs = (revOp == DoForward) ?
595  transfer.getPermuteToLIDs_dv () :
596  transfer.getPermuteFromLIDs_dv ();
597  const_lo_dv_type permFromLIDs = (revOp == DoForward) ?
598  transfer.getPermuteFromLIDs_dv () :
599  transfer.getPermuteToLIDs_dv ();
600  const_lo_dv_type remoteLIDs = (revOp == DoForward) ?
601  transfer.getRemoteLIDs_dv () :
602  transfer.getExportLIDs_dv ();
603  const_lo_dv_type exportLIDs = (revOp == DoForward) ?
604  transfer.getExportLIDs_dv () :
605  transfer.getRemoteLIDs_dv ();
606  doTransferNew (src, CM, numSameIDs, permToLIDs, permFromLIDs,
607  remoteLIDs, exportLIDs, distor, revOp, commOnHost,restrictedMode);
608  }
609 
610 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
611  if (! useTheNewInterface) {
612  if (verbose) {
613  std::ostringstream os;
614  os << *prefix << "doTransfer: Use old interface" << endl;
615  std::cerr << os.str ();
616  }
617  const auto permToLIDs = (revOp == DoForward) ?
618  transfer.getPermuteToLIDs () : transfer.getPermuteFromLIDs ();
619  const auto permFromLIDs = (revOp == DoForward) ?
620  transfer.getPermuteFromLIDs () : transfer.getPermuteToLIDs ();
621  const auto exportLIDs = (revOp == DoForward) ?
622  transfer.getExportLIDs () : transfer.getRemoteLIDs ();
623  const auto remoteLIDs = (revOp == DoForward) ?
624  transfer.getRemoteLIDs () : transfer.getExportLIDs ();
625  doTransferOld (src, CM, numSameIDs, permToLIDs, permFromLIDs,
626  remoteLIDs, exportLIDs, distor, revOp, restrictedMode);
627  }
628 #endif // TPETRA_ENABLE_DEPRECATED_CODE
629 
630  if (verbose) {
631  std::ostringstream os;
632  os << *prefix << "Tpetra::DistObject::doTransfer: Done!" << endl;
633  std::cerr << os.str ();
634  }
635  }
636 
637  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
638  bool
640  reallocImportsIfNeeded (const size_t newSize,
641  const bool verbose,
642  const std::string* prefix)
643  {
644  if (verbose) {
645  std::ostringstream os;
646  os << *prefix << "Realloc (if needed) imports_ from "
647  << imports_.extent (0) << " to " << newSize << std::endl;
648  std::cerr << os.str ();
649  }
651  const bool reallocated =
652  reallocDualViewIfNeeded (this->imports_, newSize, "imports");
653  if (verbose) {
654  std::ostringstream os;
655  os << *prefix << "Finished realloc'ing imports_" << std::endl;
656  std::cerr << os.str ();
657  }
658  return reallocated;
659  }
660 
// Resize numExportPacketsPerLID_ to numExportLIDs entries and
// numImportPacketsPerLID_ to numImportLIDs entries, via
// reallocDualViewIfNeeded().  Returns true if either call reported a
// reallocation.  In verbose mode, writes the status of both arrays to
// std::cerr before and after.
661  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
662  bool
664  reallocArraysForNumPacketsPerLid (const size_t numExportLIDs,
665  const size_t numImportLIDs)
666  {
669  using std::endl;
670  // If an array is already allocated, and if it is at least
671  // tooBigFactor times bigger than it needs to be, free it and
672  // reallocate to the size we need, in order to save space.
673  // Otherwise, take subviews to reduce allocation size.
674  constexpr size_t tooBigFactor = 10;
675 
676  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
// Only allocated when verbose is true.
677  std::unique_ptr<std::string> prefix;
678  if (verbose) {
// Rank for the prefix: -1 if the Map is null, -2 if its communicator
// is null.
679  const int myRank = [&] () {
680  auto map = this->getMap ();
681  if (map.get () == nullptr) {
682  return -1;
683  }
684  auto comm = map->getComm ();
685  if (comm.get () == nullptr) {
686  return -2;
687  }
688  return comm->getRank ();
689  } ();
690  std::ostringstream os;
691  os << "Proc " << myRank << ": reallocArraysForNumPacketsPerLid("
692  << numExportLIDs << ", " << numImportLIDs << "): ";
693  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
694  }
695 
696  if (verbose) {
697  std::ostringstream os;
698  os << *prefix << "before:" << endl
699  << *prefix << dualViewStatusToString (this->numExportPacketsPerLID_,
700  "numExportPacketsPerLID_")
701  << endl
702  << *prefix << dualViewStatusToString (this->numImportPacketsPerLID_,
703  "numImportPacketsPerLID_")
704  << endl;
705  std::cerr << os.str ();
706  }
707 
708  // Reallocate numExportPacketsPerLID_ if needed.
709  const bool firstReallocated =
710  reallocDualViewIfNeeded (this->numExportPacketsPerLID_,
711  numExportLIDs,
712  "numExportPacketsPerLID",
713  tooBigFactor,
714  true); // need fence before, if realloc'ing
715 
716  // If we reallocated above, then we fenced after that
717  // reallocation. This means that we don't need to fence again,
718  // before the next reallocation.
719  const bool needFenceBeforeNextAlloc = ! firstReallocated;
720  const bool secondReallocated =
721  reallocDualViewIfNeeded (this->numImportPacketsPerLID_,
722  numImportLIDs,
723  "numImportPacketsPerLID",
724  tooBigFactor,
725  needFenceBeforeNextAlloc);
726 
727  if (verbose) {
728  std::ostringstream os;
729  os << *prefix << "after:" << endl
730  << *prefix << dualViewStatusToString (this->numExportPacketsPerLID_,
731  "numExportPacketsPerLID_")
732  << endl
733  << *prefix << dualViewStatusToString (this->numImportPacketsPerLID_,
734  "numImportPacketsPerLID_")
735  << endl;
736  std::cerr << os.str ();
737  }
738 
739  return firstReallocated || secondReallocated;
740  }
741 
742  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
743  void
746  const CombineMode CM,
747  const size_t numSameIDs,
748  const Kokkos::DualView<const local_ordinal_type*,
749  buffer_device_type>& permuteToLIDs,
750  const Kokkos::DualView<const local_ordinal_type*,
751  buffer_device_type>& permuteFromLIDs,
752  const Kokkos::DualView<const local_ordinal_type*,
753  buffer_device_type>& remoteLIDs,
754  const Kokkos::DualView<const local_ordinal_type*,
755  buffer_device_type>& exportLIDs,
756  Distributor& distor,
757  const ReverseOption revOp,
758  const bool commOnHost,
759  const bool restrictedMode)
760  {
763  using ::Tpetra::Details::ProfilingRegion;
764  using Kokkos::Compat::getArrayView;
765  using Kokkos::Compat::getConstArrayView;
766  using Kokkos::Compat::getKokkosViewDeepCopy;
767  using Kokkos::Compat::create_const_view;
768  using std::endl;
769  using DT = device_type;
770  using DES = typename DT::execution_space;
771 
772  ProfilingRegion region_dTN ("Tpetra::DistObject::doTransferNew");
773 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
774  // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in favor
775  // of Kokkos profiling.
776  Teuchos::TimeMonitor doXferMon (*doXferTimer_);
777 #endif // HAVE_TPETRA_TRANSFER_TIMERS
778 
779  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
780  // Prefix for verbose output. Use a pointer, so we don't pay for
781  // string construction unless needed. We set this below.
782  std::unique_ptr<std::string> prefix;
783  if (verbose) {
784  auto map = this->getMap ();
785  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
786  const int myRank = comm.is_null () ? 0 : comm->getRank ();
787  std::ostringstream os;
788  os << "Proc " << myRank << ": Tpetra::CrsMatrix::doTransferNew: ";
789  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
790  }
791 
792  if (verbose) {
793  std::ostringstream os;
794  os << *prefix << "Input arguments:" << endl
795  << *prefix << " combineMode: " << combineModeToString (CM) << endl
796  << *prefix << " numSameIDs: " << numSameIDs << endl
797  << *prefix << " "
798  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
799  << *prefix << " "
800  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl
801  << *prefix << " "
802  << dualViewStatusToString (remoteLIDs, "remoteLIDs") << endl
803  << *prefix << " "
804  << dualViewStatusToString (exportLIDs, "exportLIDs") << endl
805  << *prefix << " revOp: Do" << (revOp == DoReverse ? "Reverse" : "Forward") << endl
806  << *prefix << " commOnHost: " << (commOnHost ? "true" : "false") << endl;
807  std::cerr << os.str ();
808  }
809 
810  {
811  ProfilingRegion region_cs ("Tpetra::DistObject::doTransferNew::checkSizes");
812  if (verbose) {
813  std::ostringstream os;
814  os << *prefix << "1. checkSizes" << endl;
815  std::cerr << os.str ();
816  }
817  const bool checkSizesResult = this->checkSizes (src);
818  TEUCHOS_TEST_FOR_EXCEPTION
819  (! checkSizesResult, std::invalid_argument,
820  "Tpetra::DistObject::doTransfer: checkSizes() indicates that the "
821  "destination object is not a legal target for redistribution from the "
822  "source object. This probably means that they do not have the same "
823  "dimensions. For example, MultiVectors must have the same number of "
824  "rows and columns.");
825  }
826 
827  // NOTE (mfh 26 Apr 2016) Chris Baker's implementation understood
828  // that if CM == INSERT || CM == REPLACE, the target object could
829  // be write only. We don't optimize for that here.
830 
831  if (!restrictedMode && numSameIDs + permuteToLIDs.extent (0) != 0) {
832  // There is at least one GID to copy or permute.
833  if (verbose) {
834  std::ostringstream os;
835  os << *prefix << "2. copyAndPermute" << endl;
836  std::cerr << os.str ();
837  }
838  ProfilingRegion region_cp
839  ("Tpetra::DistObject::doTransferNew::copyAndPermute");
840 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
841  // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in favor
842  // of Kokkos profiling.
843  Teuchos::TimeMonitor copyAndPermuteMon (*copyAndPermuteTimer_);
844 #endif // HAVE_TPETRA_TRANSFER_TIMERS
845 
846  if (numSameIDs + permuteToLIDs.extent (0) != 0) {
847  // There is at least one GID to copy or permute.
848  if (verbose) {
849  std::ostringstream os;
850  os << *prefix << "2. copyAndPermute" << endl;
851  std::cerr << os.str ();
852  }
853 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
854  this->copyAndPermuteNew (src, numSameIDs, permuteToLIDs,
855  permuteFromLIDs);
856 #else // TPETRA_ENABLE_DEPRECATED_CODE
857  this->copyAndPermute (src, numSameIDs, permuteToLIDs,
858  permuteFromLIDs);
859 #endif // TPETRA_ENABLE_DEPRECATED_CODE
860  if (verbose) {
861  std::ostringstream os;
862  os << *prefix << "After copyAndPermute:" << endl
863  << *prefix << " "
864  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
865  << endl
866  << *prefix << " "
867  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
868  << endl;
869  std::cerr << os.str ();
870  }
871  }
872  }
873 
874  // The method may return zero even if the implementation actually
875  // does have a constant number of packets per LID. However, if it
876  // returns nonzero, we may use this information to avoid
877  // (re)allocating num{Ex,Im}portPacketsPerLID_. packAndPrepare()
878  // will set this to its final value.
879  //
880  // We only need this if CM != ZERO, but it has to be lifted out of
881  // that scope because there are multiple tests for CM != ZERO.
882  size_t constantNumPackets = this->constantNumberOfPackets ();
883  if (verbose) {
884  std::ostringstream os;
885  os << *prefix << "constantNumPackets=" << constantNumPackets << endl;
886  std::cerr << os.str ();
887  }
888 
889  // We only need to pack communication buffers if the combine mode
890  // is not ZERO. A "ZERO combine mode" means that the results are
891  // the same as if we had received all zeros, and added them to the
892  // existing values. That means we don't need to communicate.
893  if (CM != ZERO) {
894  if (constantNumPackets == 0) {
895  if (verbose) {
896  std::ostringstream os;
897  os << *prefix << "3. (Re)allocate num{Ex,Im}portPacketsPerLID"
898  << endl;
899  std::cerr << os.str ();
900  }
901  // This only reallocates if necessary, that is, if the sizes
902  // don't match.
903  this->reallocArraysForNumPacketsPerLid (exportLIDs.extent (0),
904  remoteLIDs.extent (0));
905  }
906 
907  if (verbose) {
908  std::ostringstream os;
909  os << *prefix << "4. packAndPrepare: before, "
910  << dualViewStatusToString (this->exports_, "exports_")
911  << endl;
912  std::cerr << os.str ();
913  }
914  {
915  ProfilingRegion region_pp
916  ("Tpetra::DistObject::doTransferNew::packAndPrepare");
917 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
918  // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in
919  // favor of Kokkos profiling.
920  Teuchos::TimeMonitor packAndPrepareMon (*packAndPrepareTimer_);
921 #endif // HAVE_TPETRA_TRANSFER_TIMERS
922 
923  // Ask the source to pack data. Also ask it whether there are
924  // a constant number of packets per element
925  // (constantNumPackets is an output argument). If there are,
926  // constantNumPackets will come back nonzero. Otherwise, the
927  // source will fill the numExportPacketsPerLID_ array.
928 
929  // FIXME (mfh 18 Oct 2017) if (! commOnHost), sync to device?
930  // Alternately, make packAndPrepare take a "commOnHost"
931  // argument to tell it where to leave the data?
932  //
933  // NOTE (mfh 04 Feb 2019) Subclasses of DistObject should have
934  // the freedom to pack and unpack either on host or device.
935  // We should prefer sync'ing only on demand. Thus, we can
936  // answer the above question: packAndPrepare should not
937  // take a commOnHost argument, and doTransferNew should sync
938  // where needed, if needed.
939 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
940  this->packAndPrepareNew (src, exportLIDs, this->exports_,
941  this->numExportPacketsPerLID_,
942  constantNumPackets, distor);
943 #else // TPETRA_ENABLE_DEPRECATED_CODE
944  this->packAndPrepare (src, exportLIDs, this->exports_,
945  this->numExportPacketsPerLID_,
946  constantNumPackets, distor);
947 #endif // TPETRA_ENABLE_DEPRECATED_CODE
948  if (commOnHost) {
949  if (this->exports_.need_sync_host ()) {
950  this->exports_.sync_host ();
951  }
952  }
953  else { // ! commOnHost
954  if (this->exports_.need_sync_device ()) {
955  this->exports_.sync_device ();
956  }
957  }
958  }
959  if (verbose) {
960  std::ostringstream os;
961  os << *prefix << "5.1. After packAndPrepare, "
962  << dualViewStatusToString (this->exports_, "exports_")
963  << endl;
964  std::cerr << os.str ();
965  }
966  } // if (CM != ZERO)
967 
968  // We only need to send data if the combine mode is not ZERO.
969  if (CM != ZERO) {
970  if (constantNumPackets != 0) {
971  // There are a constant number of packets per element. We
972  // already know (from the number of "remote" (incoming)
973  // elements) how many incoming elements we expect, so we can
974  // resize the buffer accordingly.
975  const size_t rbufLen = remoteLIDs.extent (0) * constantNumPackets;
976  reallocImportsIfNeeded (rbufLen, verbose, prefix.get ());
977  }
978 
979  // Do we need to do communication (via doPostsAndWaits)?
980  bool needCommunication = true;
981 
982  // This may be NULL. It will be used below.
983  const this_type* srcDistObj = dynamic_cast<const this_type*> (&src);
984 
985  if (revOp == DoReverse && ! this->isDistributed ()) {
986  needCommunication = false;
987  }
988  // FIXME (mfh 30 Jun 2013): Checking whether the source object
989  // is distributed requires a cast to DistObject. If it's not a
990  // DistObject, then I'm not quite sure what to do. Perhaps it
991  // would be more appropriate for SrcDistObject to have an
992  // isDistributed() method. For now, I'll just assume that we
993  // need to do communication unless the cast succeeds and the
994  // source is not distributed.
995  else if (revOp == DoForward && srcDistObj != NULL &&
996  ! srcDistObj->isDistributed ()) {
997  needCommunication = false;
998  }
999 
1000  if (! needCommunication) {
1001  if (verbose) {
1002  std::ostringstream os;
1003  os << *prefix << "Comm not needed; skipping" << endl;
1004  std::cerr << os.str ();
1005  }
1006  }
1007  else {
1008  ProfilingRegion region_dpw
1009  ("Tpetra::DistObject::doTransferNew::doPostsAndWaits");
1010 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1011  // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in
1012  // favor of Kokkos profiling.
1013  Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_);
1014 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1015 
1016  if (verbose) {
1017  std::ostringstream os;
1018  os << *prefix << "7.0. "
1019  << (revOp == DoReverse ? "Reverse" : "Forward")
1020  << " mode" << endl;
1021  std::cerr << os.str ();
1022  }
1023 
1024  if (constantNumPackets == 0) { // variable num packets per LID
1025  if (verbose) {
1026  std::ostringstream os;
1027  os << *prefix << "7.1. Variable # packets / LID: first comm "
1028  << "(commOnHost = " << (commOnHost ? "true" : "false") << ")"
1029  << endl;
1030  std::cerr << os.str ();
1031  }
1032  size_t totalImportPackets = 0;
1033  if (commOnHost) {
1034  if (this->numExportPacketsPerLID_.need_sync_host ()) {
1035  this->numExportPacketsPerLID_.sync_host ();
1036  }
1037  if (this->numImportPacketsPerLID_.need_sync_host ()) {
1038  this->numImportPacketsPerLID_.sync_host ();
1039  }
1040  this->numImportPacketsPerLID_.modify_host (); // out arg
1041  auto numExp_h =
1042  create_const_view (this->numExportPacketsPerLID_.view_host ());
1043  auto numImp_h = this->numImportPacketsPerLID_.view_host ();
1044 
1045  // MPI communication happens here.
1046  if (verbose) {
1047  std::ostringstream os;
1048  os << *prefix << "Call do"
1049  << (revOp == DoReverse ? "Reverse" : "") << "PostsAndWaits"
1050  << endl;
1051  std::cerr << os.str ();
1052  }
1053  if (revOp == DoReverse) {
1054  distor.doReversePostsAndWaits (numExp_h, 1, numImp_h);
1055  }
1056  else {
1057  distor.doPostsAndWaits (numExp_h, 1, numImp_h);
1058  }
1059  DES().fence (); // just in case UVM doesn't behave right
1060 
1061  if (verbose) {
1062  std::ostringstream os;
1063  os << *prefix << "Count totalImportPackets" << std::endl;
1064  std::cerr << os.str ();
1065  }
1066  using the_dev_type = typename decltype (numImp_h)::device_type;
1067  totalImportPackets = countTotalImportPackets<the_dev_type> (numImp_h);
1068  }
1069  else { // ! commOnHost
1070  if (this->numExportPacketsPerLID_.need_sync_device ()) {
1071  this->numExportPacketsPerLID_.sync_device ();
1072  }
1073  if (this->numImportPacketsPerLID_.need_sync_device ()) {
1074  this->numImportPacketsPerLID_.sync_device ();
1075  }
1076  this->numImportPacketsPerLID_.modify_device (); // out arg
1077  auto numExp_d = create_const_view
1078  (this->numExportPacketsPerLID_.view_device ());
1079  auto numImp_d = this->numImportPacketsPerLID_.view_device ();
1080 
1081  // MPI communication happens here.
1082  if (verbose) {
1083  std::ostringstream os;
1084  os << *prefix << "Call do"
1085  << (revOp == DoReverse ? "Reverse" : "") << "PostsAndWaits"
1086  << endl;
1087  std::cerr << os.str ();
1088  }
1089  if (revOp == DoReverse) {
1090  distor.doReversePostsAndWaits (numExp_d, 1, numImp_d);
1091  }
1092  else {
1093  distor.doPostsAndWaits (numExp_d, 1, numImp_d);
1094  }
1095  DES().fence (); // just in case UVM doesn't behave right
1096 
1097  if (verbose) {
1098  std::ostringstream os;
1099  os << *prefix << "Count totalImportPackets" << std::endl;
1100  std::cerr << os.str ();
1101  }
1102  using the_dev_type = typename decltype (numImp_d)::device_type;
1103  totalImportPackets = countTotalImportPackets<the_dev_type> (numImp_d);
1104  }
1105 
1106  if (verbose) {
1107  std::ostringstream os;
1108  os << *prefix << "totalImportPackets=" << totalImportPackets << endl;
1109  std::cerr << os.str ();
1110  }
1111  this->reallocImportsIfNeeded (totalImportPackets, verbose,
1112  prefix.get ());
1113  if (verbose) {
1114  std::ostringstream os;
1115  os << *prefix << "7.3. Second comm" << std::endl;
1116  std::cerr << os.str ();
1117  }
1118 
1119  // mfh 04 Feb 2019: Distributor expects the "num packets per
1120  // LID" arrays on host, so that it can issue MPI sends and
1121  // receives correctly.
1122  if (this->numExportPacketsPerLID_.need_sync_host ()) {
1123  this->numExportPacketsPerLID_.sync_host ();
1124  }
1125  if (this->numImportPacketsPerLID_.need_sync_host ()) {
1126  this->numImportPacketsPerLID_.sync_host ();
1127  }
1128 
1129  // NOTE (mfh 25 Apr 2016, 01 Aug 2017) doPostsAndWaits and
1130  // doReversePostsAndWaits currently want
1131  // numExportPacketsPerLID and numImportPacketsPerLID as
1132  // Teuchos::ArrayView, rather than as Kokkos::View.
1133  //
1134  // NOTE (mfh 04 Feb 2019) This does NOT copy from host to
1135  // device. The above syncs might.
1136  auto numExportPacketsPerLID_av =
1137  getArrayViewFromDualView (this->numExportPacketsPerLID_);
1138  auto numImportPacketsPerLID_av =
1139  getArrayViewFromDualView (this->numImportPacketsPerLID_);
1140 
1141  // imports_ is for output only, so we don't need to sync it
1142  // before marking it as modified. However, in order to
1143  // prevent spurious debug-mode errors (e.g., "modified on
1144  // both device and host"), we first need to clear its
1145  // "modified" flags.
1146  this->imports_.clear_sync_state ();
1147 
1148  if (verbose) {
1149  std::ostringstream os;
1150  os << *prefix << "Comm on "
1151  << (commOnHost ? "host" : "device")
1152  << "; call do" << (revOp == DoReverse ? "Reverse" : "")
1153  << "PostsAndWaits" << endl;
1154  std::cerr << os.str ();
1155  }
1156 
1157  if (commOnHost) {
1158  this->imports_.modify_host ();
1159  if (revOp == DoReverse) {
1160  distor.doReversePostsAndWaits
1161  (create_const_view (this->exports_.view_host ()),
1162  numExportPacketsPerLID_av,
1163  this->imports_.view_host (),
1164  numImportPacketsPerLID_av);
1165  }
1166  else {
1167  distor.doPostsAndWaits
1168  (create_const_view (this->exports_.view_host ()),
1169  numExportPacketsPerLID_av,
1170  this->imports_.view_host (),
1171  numImportPacketsPerLID_av);
1172  }
1173  }
1174  else { // ! commOnHost: communicate on device
1175  this->imports_.modify_device ();
1176  if (revOp == DoReverse) {
1177  distor.doReversePostsAndWaits
1178  (create_const_view (this->exports_.view_device ()),
1179  numExportPacketsPerLID_av,
1180  this->imports_.view_device (),
1181  numImportPacketsPerLID_av);
1182  }
1183  else {
1184  distor.doPostsAndWaits
1185  (create_const_view (this->exports_.view_device ()),
1186  numExportPacketsPerLID_av,
1187  this->imports_.view_device (),
1188  numImportPacketsPerLID_av);
1189  }
1190  }
1191  }
1192  else { // constant number of packets per LID
1193  if (verbose) {
1194  std::ostringstream os;
1195  os << *prefix << "7.1. Const # packets per LID: " << endl
1196  << *prefix << " "
1197  << dualViewStatusToString (this->exports_, "exports_")
1198  << endl
1199  << *prefix << " "
1200  << dualViewStatusToString (this->exports_, "imports_")
1201  << endl;
1202  std::cerr << os.str ();
1203  }
1204  // imports_ is for output only, so we don't need to sync it
1205  // before marking it as modified. However, in order to
1206  // prevent spurious debug-mode errors (e.g., "modified on
1207  // both device and host"), we first need to clear its
1208  // "modified" flags.
1209  this->imports_.clear_sync_state ();
1210 
1211  if (verbose) {
1212  std::ostringstream os;
1213  os << *prefix << "7.2. Comm on "
1214  << (commOnHost ? "host" : "device")
1215  << "; call do" << (revOp == DoReverse ? "Reverse" : "")
1216  << "PostsAndWaits" << endl;
1217  std::cerr << os.str ();
1218  }
1219  if (commOnHost) {
1220  this->imports_.modify_host ();
1221  if (revOp == DoReverse) {
1222  distor.doReversePostsAndWaits
1223  (create_const_view (this->exports_.view_host ()),
1224  constantNumPackets,
1225  this->imports_.view_host ());
1226  }
1227  else {
1228  distor.doPostsAndWaits
1229  (create_const_view (this->exports_.view_host ()),
1230  constantNumPackets,
1231  this->imports_.view_host ());
1232  }
1233  }
1234  else { // ! commOnHost: communicate on device
1235  this->imports_.modify_device ();
1236  if (revOp == DoReverse) {
1237  distor.doReversePostsAndWaits
1238  (create_const_view (this->exports_.view_device ()),
1239  constantNumPackets,
1240  this->imports_.view_device ());
1241  }
1242  else {
1243  distor.doPostsAndWaits
1244  (create_const_view (this->exports_.view_device ()),
1245  constantNumPackets,
1246  this->imports_.view_device ());
1247  }
1248  } // commOnHost
1249  } // constant or variable num packets per LID
1250 
1251  if (verbose) {
1252  std::ostringstream os;
1253  os << *prefix << "8. unpackAndCombine" << endl;
1254  std::cerr << os.str ();
1255  }
1256  ProfilingRegion region_uc
1257  ("Tpetra::DistObject::doTransferNew::unpackAndCombine");
1258 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1259  // FIXME (mfh 04 Feb 2019) Deprecate Teuchos::TimeMonitor in
1260  // favor of Kokkos profiling.
1261  Teuchos::TimeMonitor unpackAndCombineMon (*unpackAndCombineTimer_);
1262 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1263 
1264  // NOTE (mfh 26 Apr 2016) We don't actually need to sync the
1265  // input DualViews, but they DO need to be most recently
1266  // updated in the same memory space.
1267  //
1268  // FIXME (mfh 26 Apr 2016) Check that all input DualViews
1269  // were most recently updated in the same memory space, and
1270  // sync them to the same place (based on commOnHost) if not.
1271 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1272  this->unpackAndCombineNew (remoteLIDs, this->imports_,
1273  this->numImportPacketsPerLID_,
1274  constantNumPackets, distor, CM);
1275 #else // TPETRA_ENABLE_DEPRECATED_CODE
1276  this->unpackAndCombine (remoteLIDs, this->imports_,
1277  this->numImportPacketsPerLID_,
1278  constantNumPackets, distor, CM);
1279 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1280  } // if (needCommunication)
1281  } // if (CM != ZERO)
1282 
1283  if (verbose) {
1284  std::ostringstream os;
1285  os << *prefix << "9. Done!" << endl;
1286  std::cerr << os.str ();
1287  }
1288  }
1289 
1290 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1291  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1292  void TPETRA_DEPRECATED
1294  doTransferOld (const SrcDistObject& src,
1295  CombineMode CM,
1296  size_t numSameIDs,
1297  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1298  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1299  const Teuchos::ArrayView<const LocalOrdinal>& remoteLIDs,
1300  const Teuchos::ArrayView<const LocalOrdinal>& exportLIDs,
1301  Distributor& distor,
1302  ReverseOption revOp,
1303  const bool restrictedMode)
1304  {
1306  using ::Tpetra::Details::ProfilingRegion;
1308  using std::endl;
1309  const char prefixRaw[] = "Tpetra::DistObject::doTransferOld: ";
1310 
1311  ProfilingRegion region_doTransferOld ("Tpetra::DistObject::doTransferOld");
1312 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1313  // FIXME (mfh 04 Feb 2019) Remove Teuchos::TimeMonitor and use
1314  // Kokkos profiling instead.
1315  Teuchos::TimeMonitor doXferMon (*doXferTimer_);
1316 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1317 
1318  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
1319  std::unique_ptr<std::string> prefix;
1320  if (verbose) {
1321  auto map = this->getMap ();
1322  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
1323  const int myRank = comm.is_null () ? -1 : comm->getRank ();
1324  std::ostringstream os;
1325  os << "Proc " << myRank << ": " << prefixRaw;
1326  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1327  }
1328 
1329  TEUCHOS_TEST_FOR_EXCEPTION(
1330  ! checkSizes (src), std::invalid_argument,
1331  prefixRaw << "checkSizes() indicates that the "
1332  "destination object is not a legal target for redistribution from the "
1333  "source object. This probably means that they do not have the same "
1334  "dimensions. For example, MultiVectors must have the same number of "
1335  "rows and columns.");
1336  KokkosClassic::ReadWriteOption rwo = KokkosClassic::ReadWrite;
1337  if (CM == INSERT || CM == REPLACE) {
1338  const size_t numIDsToWrite = numSameIDs +
1339  static_cast<size_t> (permuteToLIDs.size ()) +
1340  static_cast<size_t> (remoteLIDs.size ());
1341  if (numIDsToWrite == this->getMap ()->getNodeNumElements ()) {
1342  // We're overwriting all of our local data in the destination
1343  // object, so a write-only view suffices.
1344  //
1345  // FIXME (mfh 10 Apr 2012) This doesn't make sense for a
1346  // CrsMatrix with a dynamic graph. INSERT mode could mean
1347  // that we're adding new entries to the object, but we don't
1348  // want to get rid of the old ones.
1349  rwo = KokkosClassic::WriteOnly;
1350  }
1351  }
1352 
1353  if (verbose) {
1354  std::ostringstream os;
1355  os << *prefix << "ReadWriteOption: ";
1356  if (rwo == KokkosClassic::ReadWrite) {
1357  os << "ReadWrite";
1358  }
1359  else if (rwo == KokkosClassic::WriteOnly) {
1360  os << "ReadWrite";
1361  }
1362  else {
1363  os << "Something else; weird!";
1364  }
1365  os << endl;
1366  std::cerr << os.str ();
1367  }
1368 
1369  // Tell the source to create a read-only view of its data. On a
1370  // discrete accelerator such as a GPU, this brings EVERYTHING from
1371  // device memory to host memory.
1372  //
1373  // FIXME (mfh 23 Mar 2012) By passing in the list of GIDs (or
1374  // rather, local LIDs to send) and packet counts, createViews()
1375  // could create a "sparse view" that only brings in the necessary
1376  // data from device to host memory.
1377  const this_type* srcDistObj = dynamic_cast<const this_type*> (&src);
1378  if (srcDistObj != NULL) {
1379  if (verbose) {
1380  std::ostringstream os;
1381  os << *prefix << "Call srcDistObject->createViews()" << endl;
1382  std::cerr << os.str ();
1383  }
1384  srcDistObj->createViews ();
1385  }
1386  else {
1387  if (verbose) {
1388  std::ostringstream os;
1389  os << *prefix << "Source object has a different type than target object"
1390  << endl;
1391  std::cerr << os.str ();
1392  }
1393  }
1394 
1395  // Tell the target to create a view of its data. Depending on
1396  // rwo, this could be a write-only view or a read-and-write view.
1397  // On a discrete accelerator such as a GPU, a write-only view only
1398  // requires a transfer from host to device memory. A
1399  // read-and-write view requires a two-way transfer. This has the
1400  // same problem as createViews(): it transfers EVERYTHING, not
1401  // just the necessary data.
1402  //
1403  // FIXME (mfh 23 Mar 2012) By passing in the list of GIDs (or
1404  // rather, local LIDs into which to receive) and packet counts,
1405  // createViewsNonConst() could create a "sparse view" that only
1406  // transfers the necessary data.
1407  if (verbose) {
1408  std::ostringstream os;
1409  os << *prefix << "Call createViewsNonConst" << endl;
1410  std::cerr << os.str ();
1411  }
1412  this->createViewsNonConst (rwo);
1413 
1414  if (!restrictedMode && numSameIDs + permuteToLIDs.size()) {
1415 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1416  Teuchos::TimeMonitor copyAndPermuteMon (*copyAndPermuteTimer_);
1417 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1418  if (verbose) {
1419  std::ostringstream os;
1420  os << *prefix << "Call copyAndPermute" << endl;
1421  std::cerr << os.str ();
1422  }
1423  // There is at least one GID to copy or permute.
1424  copyAndPermute (src, numSameIDs, permuteToLIDs, permuteFromLIDs);
1425  }
1426  else {
1427  if (verbose) {
1428  std::ostringstream os;
1429  os << *prefix << "Skipping copyAndPermute" << endl;
1430  std::cerr << os.str ();
1431  }
1432  }
1433 
1434  // The method may return zero even if the implementation actually
1435  // does have a constant number of packets per LID. However, if it
1436  // returns nonzero, we may use this information to avoid
1437  // (re)allocating num{Ex,Im}portPacketsPerLID_. packAndPrepare()
1438  // will set this to its final value.
1439  //
1440  // We only need this if CM != ZERO, but it has to be lifted out of
1441  // that scope because there are multiple tests for CM != ZERO.
1442  size_t constantNumPackets = this->constantNumberOfPackets ();
1443  if (verbose) {
1444  std::ostringstream os;
1445  os << *prefix << "constantNumPackets=" << constantNumPackets << endl;
1446  std::cerr << os.str ();
1447  }
1448 
1449  // We only need to pack communication buffers if the combine mode
1450  // is not ZERO. A "ZERO combine mode" means that the results are
1451  // the same as if we had received all zeros, and added them to the
1452  // existing values. That means we don't need to communicate.
1453  if (CM != ZERO) {
1454  if (constantNumPackets == 0) {
1455  this->reallocArraysForNumPacketsPerLid (exportLIDs.size (),
1456  remoteLIDs.size ());
1457  }
1458 
1459  if (verbose) {
1460  std::ostringstream os;
1461  os << *prefix << "Preparing for packAndPrepare" << endl;
1462  std::cerr << os.str ();
1463  }
1464  {
1465 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1466  Teuchos::TimeMonitor packAndPrepareMon (*packAndPrepareTimer_);
1467 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1468  // Ask the source to pack data. Also ask it whether there are a
1469  // constant number of packets per element (constantNumPackets is
1470  // an output argument). If there are, constantNumPackets will
1471  // come back nonzero. Otherwise, the source will fill the
1472  // numExportPacketsPerLID_ array.
1473  numExportPacketsPerLID_.modify_host ();
1474  Teuchos::ArrayView<size_t> numExportPacketsPerLID =
1475  getArrayViewFromDualView (numExportPacketsPerLID_);
1476 
1477  // FIXME (mfh 26 Apr 2016) For backwards compatibility, use
1478  // the old packAndPrepare interface that takes and resizes the
1479  // exports buffer as a Teuchos::Array<packet_type>. Then,
1480  // copy out that buffer into the host version of exports_.
1481 
1482  Teuchos::Array<packet_type> exportsOld;
1483  if (verbose) {
1484  std::ostringstream os;
1485  os << *prefix << "Call packAndPrepare" << endl;
1486  std::cerr << os.str ();
1487  }
1488  packAndPrepare (src, exportLIDs, exportsOld, numExportPacketsPerLID,
1489  constantNumPackets, distor);
1490  const size_t exportsLen = static_cast<size_t> (exportsOld.size ());
1491  reallocDualViewIfNeeded (this->exports_, exportsLen, "exports");
1492  Kokkos::View<const packet_type*, Kokkos::HostSpace,
1493  Kokkos::MemoryUnmanaged> exportsOldK (exportsOld.getRawPtr (),
1494  exportsLen);
1495  exports_.modify_host ();
1496  Kokkos::deep_copy (exports_.view_host (),
1497  exportsOldK);
1498  }
1499  }
1500 
1501  // We don't need the source's data anymore, so it can let go of
1502  // its views. On an accelerator device with a separate memory
1503  // space (like a GPU), this frees host memory, since device memory
1504  // has the "master" version of the data.
1505  if (srcDistObj != nullptr) {
1506  if (verbose) {
1507  std::ostringstream os;
1508  os << *prefix << "Call srcDistObj->releaseViews()" << endl;
1509  std::cerr << os.str ();
1510  }
1511  srcDistObj->releaseViews ();
1512  }
1513  else {
1514  if (verbose) {
1515  std::ostringstream os;
1516  os << *prefix << "Skipping srcDistObj->releaseViews()" << endl;
1517  std::cerr << os.str ();
1518  }
1519  }
1520 
1521  // We only need to send data if the combine mode is not ZERO.
1522  if (CM != ZERO) {
1523  if (constantNumPackets != 0) {
1524  // There are a constant number of packets per element. We
1525  // already know (from the number of "remote" (incoming)
1526  // elements) how many incoming elements we expect, so we can
1527  // resize the buffer accordingly.
1528  const size_t rbufLen = remoteLIDs.size() * constantNumPackets;
1529  if (verbose) {
1530  std::ostringstream os;
1531  os << *prefix << "Const # packets: imports_.extent(0)="
1532  << imports_.extent (0) << ", ; calling reallocImportsIfNeeded("
1533  "rbufLen=" << rbufLen << ", verbose=true)" << endl;
1534  std::cerr << os.str ();
1535  }
1536  reallocImportsIfNeeded (rbufLen, verbose, prefix.get ());
1537  }
1538 
1539  // Do we need to do communication (via doPostsAndWaits)?
1540  bool needCommunication = true;
1541  if (revOp == DoReverse && ! isDistributed ()) {
1542  needCommunication = false;
1543  }
1544  // FIXME (mfh 30 Jun 2013): Checking whether the source object
1545  // is distributed requires a cast to DistObject. If it's not a
1546  // DistObject, then I'm not quite sure what to do. Perhaps it
1547  // would be more appropriate for SrcDistObject to have an
1548  // isDistributed() method. For now, I'll just assume that we
1549  // need to do communication unless the cast succeeds and the
1550  // source is not distributed.
1551  else if (revOp == DoForward && srcDistObj != NULL &&
1552  ! srcDistObj->isDistributed ()) {
1553  needCommunication = false;
1554  }
1555 
1556  if (verbose) {
1557  std::ostringstream os;
1558  os << *prefix << "needCommunication="
1559  << (needCommunication ? "true" : "false")
1560  << ", revOp="
1561  << (revOp == DoReverse ? "DoReverse" : "DoForward") << endl;
1562  std::cerr << os.str ();
1563  }
1564 
1565  if (needCommunication) {
1566  if (revOp == DoReverse) {
1567 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1568  Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_);
1569 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1570  if (constantNumPackets == 0) { //variable num-packets-per-LID:
1571  // First communicate the number of packets per LID to receive.
1572 
1573  // Make sure that host has the latest version, since we're
1574  // using the version on host. If host has the latest
1575  // version already, syncing to host does nothing.
1576  numExportPacketsPerLID_.sync_host ();
1577  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
1578  getArrayViewFromDualView (numExportPacketsPerLID_);
1579 
1580  // numImportPacketsPerLID_ is the output array here, so
1581  // mark it as modified. It's strictly output, so we don't
1582  // have to sync from device.
1583  //numImportPacketsPerLID_.sync_host ();
1584  numImportPacketsPerLID_.modify_host ();
1585  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
1586  getArrayViewFromDualView (numImportPacketsPerLID_);
1587 
1588  if (verbose) {
1589  std::ostringstream os;
1590  os << *prefix << "Call doReversePostsAndWaits (3-arg)" << endl;
1591  std::cerr << os.str ();
1592  }
1593  distor.doReversePostsAndWaits (numExportPacketsPerLID, 1,
1594  numImportPacketsPerLID);
1595 
1596  if (verbose) {
1597  std::ostringstream os;
1598  os << *prefix << "Compute totalImportPackets" << endl;
1599  std::cerr << os.str ();
1600  }
1601  size_t totalImportPackets = 0;
1602  {
1603  typedef typename Kokkos::DualView<size_t*,
1604  device_type>::t_host::execution_space host_exec_space;
1605  typedef Kokkos::RangePolicy<host_exec_space, Array_size_type> range_type;
1606  const size_t* const arrayToSum = numImportPacketsPerLID.getRawPtr ();
1607  Kokkos::parallel_reduce ("Count import packets",
1608  range_type (0, numImportPacketsPerLID.size ()),
1609  [=] (const Array_size_type& i, size_t& lclSum) {
1610  lclSum += arrayToSum[i];
1611  }, totalImportPackets);
1612  }
1613 
1614  if (verbose) {
1615  std::ostringstream os;
1616  os << *prefix << "totalImportPackets=" << totalImportPackets
1617  << "; calling reallocImportsIfNeeded" << endl;
1618  std::cerr << os.str ();
1619  }
1620  reallocImportsIfNeeded (totalImportPackets, verbose, prefix.get ());
1621 
1622  // We don't need to sync imports_, because it is only for
1623  // output here. Similarly, we don't need to mark exports_
1624  // as modified, since it is read only here. This legacy
1625  // version of doTransfer only uses host arrays.
1626  imports_.modify_host ();
1627  Teuchos::ArrayView<packet_type> hostImports =
1628  getArrayViewFromDualView (imports_);
1629  exports_.sync_host ();
1630  Teuchos::ArrayView<const packet_type> hostExports =
1631  getArrayViewFromDualView (exports_);
1632 
1633  if (verbose) {
1634  std::ostringstream os;
1635  os << *prefix << "Call doReversePostsAndWaits (4-arg)"
1636  << endl;
1637  std::cerr << os.str ();
1638  }
1639  distor.doReversePostsAndWaits (hostExports,
1640  numExportPacketsPerLID,
1641  hostImports,
1642  numImportPacketsPerLID);
1643  }
1644  else {
1645  // We don't need to sync imports_, because it is only for
1646  // output here. Similarly, we don't need to mark exports_
1647  // as modified, since it is read only here. This legacy
1648  // version of doTransfer only uses host arrays.
1649  imports_.modify_host ();
1650  Teuchos::ArrayView<packet_type> hostImports =
1651  getArrayViewFromDualView (imports_);
1652  exports_.sync_host ();
1653  Teuchos::ArrayView<const packet_type> hostExports =
1654  getArrayViewFromDualView (exports_);
1655 
1656  if (verbose) {
1657  std::ostringstream os;
1658  os << *prefix << "Call doReversePostsAndWaits (3-arg)"
1659  << endl;
1660  std::cerr << os.str ();
1661  }
1662  distor.doReversePostsAndWaits (hostExports,
1663  constantNumPackets,
1664  hostImports);
1665  }
1666  }
1667  else { // revOp == DoForward
1668 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1669  Teuchos::TimeMonitor doPostsAndWaitsMon (*doPostsAndWaitsTimer_);
1670 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1671  if (constantNumPackets == 0) { //variable num-packets-per-LID:
1672  // First communicate the number of packets per LID to receive.
1673 
1674  // Make sure that host has the latest version, since we're
1675  // using the version on host. If host has the latest
1676  // version already, syncing to host does nothing.
1677  numExportPacketsPerLID_.sync_host ();
1678  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
1679  getArrayViewFromDualView (numExportPacketsPerLID_);
1680 
1681  // numImportPacketsPerLID_ is the output array here, so
1682  // mark it as modified. It's strictly output, so we don't
1683  // have to sync from device.
1684  //numImportPacketsPerLID_.sync_host ();
1685  numImportPacketsPerLID_.modify_host ();
1686  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
1687  getArrayViewFromDualView (numImportPacketsPerLID_);
1688 
1689  if (verbose) {
1690  std::ostringstream os;
1691  os << *prefix << "Call doPostsAndWaits (3-arg)" << endl;
1692  std::cerr << os.str ();
1693  }
1694  distor.doPostsAndWaits (numExportPacketsPerLID, 1,
1695  numImportPacketsPerLID);
1696 
1697  if (verbose) {
1698  std::ostringstream os;
1699  os << *prefix << "Compute totalImportPackets" << endl;
1700  std::cerr << os.str ();
1701  }
1702  size_t totalImportPackets = 0;
1703  {
1704  typedef typename Kokkos::DualView<size_t*,
1705  device_type>::t_host::execution_space host_exec_space;
1706  typedef Kokkos::RangePolicy<host_exec_space, Array_size_type> range_type;
1707  const size_t* const arrayToSum = numImportPacketsPerLID.getRawPtr ();
1708  Kokkos::parallel_reduce ("Count import packets",
1709  range_type (0, numImportPacketsPerLID.size ()),
1710  [=] (const Array_size_type& i, size_t& lclSum) {
1711  lclSum += arrayToSum[i];
1712  }, totalImportPackets);
1713  }
1714 
1715  if (verbose) {
1716  std::ostringstream os;
1717  os << *prefix << "totalImportPackets=" << totalImportPackets
1718  << "; calling reallocImportsIfNeeded" << endl;
1719  std::cerr << os.str ();
1720  }
1721  reallocImportsIfNeeded (totalImportPackets, verbose, prefix.get ());
1722 
1723  // We don't need to sync imports_, because it is only for
1724  // output here. Similarly, we don't need to mark exports_
1725  // as modified, since it is read only here. This legacy
1726  // version of doTransfer only uses host arrays.
1727  imports_.modify_host ();
1728  Teuchos::ArrayView<packet_type> hostImports =
1729  getArrayViewFromDualView (imports_);
1730  exports_.sync_host ();
1731  Teuchos::ArrayView<const packet_type> hostExports =
1732  getArrayViewFromDualView (exports_);
1733 
1734  if (verbose) {
1735  std::ostringstream os;
1736  os << *prefix << "Call doPostsAndWaits (4-arg)" << endl;
1737  std::cerr << os.str ();
1738  }
1739  distor.doPostsAndWaits (hostExports,
1740  numExportPacketsPerLID,
1741  hostImports,
1742  numImportPacketsPerLID);
1743  }
1744  else {
1745  // We don't need to sync imports_, because it is only for
1746  // output here. Similarly, we don't need to mark exports_
1747  // as modified, since it is read only here. This legacy
1748  // version of doTransfer only uses host arrays.
1749  imports_.modify_host ();
1750  Teuchos::ArrayView<packet_type> hostImports =
1751  getArrayViewFromDualView (imports_);
1752  exports_.sync_host ();
1753  Teuchos::ArrayView<const packet_type> hostExports =
1754  getArrayViewFromDualView (exports_);
1755 
1756  if (verbose) {
1757  std::ostringstream os;
1758  os << *prefix << "Call doPostsAndWaits (3-arg)" << endl;
1759  std::cerr << os.str ();
1760  }
1761  distor.doPostsAndWaits (hostExports,
1762  constantNumPackets,
1763  hostImports);
1764  }
1765  }
1766 
1767  if (verbose) {
1768  std::ostringstream os;
1769  os << *prefix << "Preparing for unpackAndCombine" << endl;
1770  std::cerr << os.str ();
1771  }
1772  {
1773 #ifdef HAVE_TPETRA_TRANSFER_TIMERS
1774  Teuchos::TimeMonitor unpackAndCombineMon (*unpackAndCombineTimer_);
1775 #endif // HAVE_TPETRA_TRANSFER_TIMERS
1776 
1777  // We don't need to sync imports_, because it is only for
1778  // output here. This legacy version of doTransfer only uses
1779  // host arrays.
1780  imports_.modify_host ();
1781  Teuchos::ArrayView<packet_type> hostImports =
1782  getArrayViewFromDualView (imports_);
1783  // NOTE (mfh 25 Apr 2016) unpackAndCombine doesn't actually
1784  // change its numImportPacketsPerLID argument, so we don't
1785  // have to mark it modified here.
1786  numImportPacketsPerLID_.sync_host ();
1787  // FIXME (mfh 25 Apr 2016) unpackAndCombine doesn't actually
1788  // change its numImportPacketsPerLID argument, so we should
1789  // be able to use a const Teuchos::ArrayView here.
1790  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
1791  getArrayViewFromDualView (numImportPacketsPerLID_);
1792 
1793  if (verbose) {
1794  std::ostringstream os;
1795  os << *prefix << "Call unpackAndCombine" << endl;
1796  std::cerr << os.str ();
1797  }
1798  unpackAndCombine (remoteLIDs, hostImports, numImportPacketsPerLID,
1799  constantNumPackets, distor, CM);
1800  }
1801  }
1802  } // if (CM != ZERO)
1803 
1804  if (verbose) {
1805  std::ostringstream os;
1806  os << *prefix << "Call releaseViews()" << endl;
1807  std::cerr << os.str ();
1808  }
1809  this->releaseViews ();
1810 
1811  if (verbose) {
1812  std::ostringstream os;
1813  os << *prefix << "Done!" << endl;
1814  std::cerr << os.str ();
1815  }
1816  }
1817 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1818 
1819  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1820  void
1821  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1822 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1823  copyAndPermuteNew
1824 #else // TPETRA_ENABLE_DEPRECATED_CODE
1825  copyAndPermute
1826 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1827  (const SrcDistObject&,
1828  const size_t,
1829  const Kokkos::DualView<
1830  const local_ordinal_type*,
1831  buffer_device_type>&,
1832  const Kokkos::DualView<
1833  const local_ordinal_type*,
1834  buffer_device_type>&)
1835  {}
1836 
1837  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1838  void
1839  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1840 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1841  packAndPrepareNew
1842 #else // TPETRA_ENABLE_DEPRECATED_CODE
1843  packAndPrepare
1844 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1845  (const SrcDistObject&,
1846  const Kokkos::DualView<
1847  const local_ordinal_type*,
1848  buffer_device_type>&,
1849  Kokkos::DualView<
1850  packet_type*,
1851  buffer_device_type>&,
1852  Kokkos::DualView<
1853  size_t*,
1854  buffer_device_type>,
1855  size_t&,
1856  Distributor&)
1857  {}
1858 
1859  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1860  void
1861  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1862 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1863  unpackAndCombineNew
1864 #else // TPETRA_ENABLE_DEPRECATED_CODE
1865  unpackAndCombine
1866 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1867  (const Kokkos::DualView<
1868  const local_ordinal_type*,
1869  buffer_device_type>& /* importLIDs */,
1870  Kokkos::DualView<
1871  packet_type*,
1872  buffer_device_type> /* imports */,
1873  Kokkos::DualView<
1874  size_t*,
1875  buffer_device_type> /* numPacketsPerLID */,
1876  const size_t /* constantNumPackets */,
1877  Distributor& /* distor */,
1878  const CombineMode /* combineMode */)
1879  {}
1880 
1881 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1882  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1883  void TPETRA_DEPRECATED
1885  copyAndPermute (const SrcDistObject& /* source */,
1886  const size_t /* numSameIDs */,
1887  const Teuchos::ArrayView<const local_ordinal_type>& /* permuteToLIDs */,
1888  const Teuchos::ArrayView<const local_ordinal_type>& /* permuteFromLIDs */)
1889  {}
1890 
1891  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1892  void TPETRA_DEPRECATED
1894  packAndPrepare (const SrcDistObject& /* source */,
1895  const Teuchos::ArrayView<const local_ordinal_type>& /* exportLIDs */,
1896  Teuchos::Array<packet_type>& /* exports */,
1897  const Teuchos::ArrayView<size_t>& /* numPacketsPerLID */,
1898  size_t& /* constantNumPackets */,
1899  Distributor& /* distor */)
1900  {}
1901 
1902  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1903  void TPETRA_DEPRECATED
1905  unpackAndCombine (const Teuchos::ArrayView<const local_ordinal_type>& /* importLIDs */,
1906  const Teuchos::ArrayView<const packet_type>& /* imports */,
1907  const Teuchos::ArrayView<size_t>& /* numPacketsPerLID */,
1908  const size_t /* constantNumPackets */,
1909  Distributor& /* distor */,
1910  const CombineMode /* combineMode */)
1911  {}
1912 
1913  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1914  void TPETRA_DEPRECATED
1915  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1916  createViews () const
1917  {}
1918 
1919  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1920  void TPETRA_DEPRECATED
1921  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1922  createViewsNonConst (KokkosClassic::ReadWriteOption /*rwo*/)
1923  {}
1924 
1925  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1926  void TPETRA_DEPRECATED
1927  DistObject<Packet, LocalOrdinal, GlobalOrdinal, Node>::
1928  releaseViews () const
1929  {}
1930 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1931 
1932  template <class Packet, class LocalOrdinal, class GlobalOrdinal, class Node>
1933  void
1935  print (std::ostream& os) const
1936  {
1937  using Teuchos::FancyOStream;
1938  using Teuchos::getFancyOStream;
1939  using Teuchos::RCP;
1940  using Teuchos::rcpFromRef;
1941  using std::endl;
1942 
1943  RCP<FancyOStream> out = getFancyOStream (rcpFromRef (os));
1944  this->describe (*out, Teuchos::VERB_DEFAULT);
1945  }
1946 
1947  template<class DistObjectType>
1948  void
1949  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input,
1950  const Teuchos::RCP<const Map<typename DistObjectType::local_ordinal_type,
1951  typename DistObjectType::global_ordinal_type,
1952  typename DistObjectType::node_type> >& newMap)
1953  {
1954  input->removeEmptyProcessesInPlace (newMap);
1955  if (newMap.is_null ()) { // my process is excluded
1956  input = Teuchos::null;
1957  }
1958  }
1959 
1960  template<class DistObjectType>
1961  void
1962  removeEmptyProcessesInPlace (Teuchos::RCP<DistObjectType>& input)
1963  {
1964  auto newMap = input->getMap ()->removeEmptyProcesses ();
1965  removeEmptyProcessesInPlace<DistObjectType> (input, newMap);
1966  }
1967 
// Explicit instantiation macro for the general DistObject class
// template.  SCALAR is used as the Packet template argument.  The
// expansion already ends with a semicolon.
#define TPETRA_DISTOBJECT_INSTANT(SCALAR, LO, GO, NODE) \
  template class DistObject<SCALAR, LO, GO, NODE>;
1971 
// Explicit instantiation macro for DistObject<char, ...>.
// The "SLGN" stuff above doesn't work for Packet=char, so this
// variant fixes Packet to char and takes only LO, GO, and NODE.
// The expansion already ends with a semicolon.
#define TPETRA_DISTOBJECT_INSTANT_CHAR(LO, GO, NODE) \
  template class DistObject<char, LO, GO, NODE>;
1976 
1977 } // namespace Tpetra
1978 
1979 #endif // TPETRA_DISTOBJECT_DEF_HPP
void doPostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the (forward) communication plan.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void doImport(const SrcDistObject &source, const Import< LocalOrdinal, GlobalOrdinal, Node > &importer, const CombineMode CM, const bool restrictedMode=false)
Import data into this object using an Import object ("forward mode").
virtual void packAndPrepare(const SrcDistObject &source, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets, Distributor &distor)
Pack data and metadata for communication (sends).
virtual void doTransferNew(const SrcDistObject &src, const CombineMode CM, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &remoteLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Distributor &distor, const ReverseOption revOp, const bool commOnHost, const bool restrictedMode)
Implementation detail of doTransfer.
void print(std::ostream &os) const
Print this object to the given output stream.
virtual bool reallocArraysForNumPacketsPerLid(const size_t numExportLIDs, const size_t numImportLIDs)
Reallocate numExportPacketsPerLID_ and/or numImportPacketsPerLID_, if necessary.
bool isDistributed() const
Whether this is a globally distributed object.
void removeEmptyProcessesInPlace(Teuchos::RCP< DistObjectType > &input, const Teuchos::RCP< const Map< typename DistObjectType::local_ordinal_type, typename DistObjectType::global_ordinal_type, typename DistObjectType::node_type > > &newMap)
Remove processes which contain no elements in this object's Map.
static bool debug()
Whether Tpetra is in debug mode.
virtual void doTransfer(const SrcDistObject &src, const ::Tpetra::Details::Transfer< local_ordinal_type, global_ordinal_type, node_type > &transfer, const char modeString[], const ReverseOption revOp, const CombineMode CM, const bool restrictedMode)
Redistribute data across (MPI) processes.
typename Node::device_type device_type
The Kokkos Device type.
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
int local_ordinal_type
Default value of LocalOrdinal template parameter.
void doReversePostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the reverse communication plan.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Kokkos::DualView< T *, DT > getDualViewCopyFromArrayView(const Teuchos::ArrayView< const T > &x_av, const char label[], const bool leaveOnHost)
Get a 1-D Kokkos::DualView which is a deep copy of the input Teuchos::ArrayView (which views host mem...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Sets up and executes a communication plan for a Tpetra DistObject.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
bool reallocDualViewIfNeeded(Kokkos::DualView< ValueType *, DeviceType > &dv, const size_t newSize, const char newLabel[], const size_t tooBigFactor=2, const bool needFenceBeforeRealloc=true)
Reallocate the DualView in/out argument, if needed.
Abstract base class for objects that can be the source of an Import or Export operation.
Declaration and definition of Tpetra::Details::reallocDualViewIfNeeded, an implementation detail of T...
bool reallocImportsIfNeeded(const size_t newSize, const bool verbose, const std::string *prefix)
Reallocate imports_ if needed.
Replace existing values with new values.
LocalOrdinal local_ordinal_type
The type of local indices.
Replace old values with zero.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
ReverseOption
Whether the data transfer should be performed in forward or reverse mode.
DistObject(const Teuchos::RCP< const map_type > &map)
Constructor.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
virtual std::string description() const
One-line description of this object.
virtual size_t constantNumberOfPackets() const
Whether the implementation's instance promises always to have a constant number of packets per LID (l...
A parallel distribution of indices over processes.
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
virtual void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Print a description of this object to the given output stream.
Kokkos::Device< typename device_type::execution_space, buffer_memory_space > buffer_device_type
Kokkos::Device specialization for communication buffers.
virtual void copyAndPermute(const SrcDistObject &source, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs)
Perform copies and permutations that are local to the calling (MPI) process.
Base class for distributed Tpetra objects that support data redistribution.
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode)
Perform any unpacking and combining after communication.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap)
Remove processes which contain no entries in this object's Map.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.