Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Tpetra_Distributor.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DISTRIBUTOR_HPP
43 #define TPETRA_DISTRIBUTOR_HPP
44 
45 #include "Tpetra_Util.hpp"
46 #include <Teuchos_as.hpp>
47 #include <Teuchos_Describable.hpp>
48 #include <Teuchos_ParameterListAcceptorDefaultBase.hpp>
49 #include <Teuchos_VerboseObject.hpp>
51 
52 // If TPETRA_DISTRIBUTOR_TIMERS is defined, Distributor will time
53 // doPosts (both versions) and doWaits, and register those timers with
54 // Teuchos::TimeMonitor so that summarize() or report() will show
55 // results.
56 
57 // #ifndef TPETRA_DISTRIBUTOR_TIMERS
58 // # define TPETRA_DISTRIBUTOR_TIMERS 1
59 // #endif // TPETRA_DISTRIBUTOR_TIMERS
60 
61 #ifdef TPETRA_DISTRIBUTOR_TIMERS
62 # undef TPETRA_DISTRIBUTOR_TIMERS
63 #endif // TPETRA_DISTRIBUTOR_TIMERS
64 
65 #include "KokkosCompat_View.hpp"
66 #include "Kokkos_Core.hpp"
67 #include "Kokkos_TeuchosCommAdapters.hpp"
68 #include <memory>
69 #include <sstream>
70 #include <type_traits>
71 
72 namespace Tpetra {
73 
// Namespace holding the enumerations that Distributor refers to as
// Details::EDistributorSendType and Details::EDistributorHowInitialized.
//
// NOTE(review): the embedded listing line numbers jump (74 -> 80, 84 -> 90,
// 92 -> 98, 104 -> 110), so lines are missing from this listing.  In
// particular the `enum ... {` header that should precede each enumerator
// list, and the declarator that should follow each lone `std::string`, are
// not present here.  Restore them from the original header before compiling.
74  namespace Details {
// Enumerators selecting which send primitive doPosts() uses
// (presumably the body of EDistributorSendType -- header line missing).
80  DISTRIBUTOR_ISEND, // Use MPI_Isend (Teuchos::isend)
81  DISTRIBUTOR_RSEND, // Use MPI_Rsend (Teuchos::readySend)
82  DISTRIBUTOR_SEND, // Use MPI_Send (Teuchos::send)
83  DISTRIBUTOR_SSEND // Use MPI_Ssend (Teuchos::ssend)
84  };
85 
// NOTE(review): return type of a function whose name and parameters are
// missing from this listing (presumably an enum-to-string converter).
90  std::string
92 
// Enumerators recording how a Distributor's communication plan was set up
// (presumably the body of EDistributorHowInitialized -- header line missing).
98  DISTRIBUTOR_NOT_INITIALIZED, // Not initialized yet
99  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS, // By createFromSends
100  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS, // By createFromRecvs
101  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS, // By createFromSendsAndRecvs
102  DISTRIBUTOR_INITIALIZED_BY_REVERSE, // By createReverseDistributor
103  DISTRIBUTOR_INITIALIZED_BY_COPY, // By copy constructor
104  };
105 
// NOTE(review): return type of a second function whose declarator is
// missing from this listing.
110  std::string
112 
113  } // namespace Details
114 
// Declaration only (defined elsewhere): takes no arguments and returns an
// array of strings.  NOTE(review): presumably the names of the valid send
// types, matching Details::DISTRIBUTOR_ISEND etc. -- confirm against the
// definition.
121  Teuchos::Array<std::string> distributorSendTypes ();
122 
// Distributor: holds a communication plan (who sends how much to whom) and
// executes it through doPosts() / doWaits() and their convenience wrappers.
// It is Describable and accepts a Teuchos::ParameterList.
//
// NOTE(review): this listing omits every line that carried a Doxygen
// comment (the embedded line numbers jump), so a few declarations below are
// truncated; each such spot is flagged.
190  class Distributor :
191  public Teuchos::Describable,
192  public Teuchos::ParameterListAcceptorDefaultBase {
193  public:
195 
196 
// --- Constructors / destructor -------------------------------------------
// All constructors take the communicator; the overloads additionally accept
// an output stream (`out`), a parameter list (`plist`), or both.
205  explicit Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm);
206 
218  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
219  const Teuchos::RCP<Teuchos::FancyOStream>& out);
220 
234  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
235  const Teuchos::RCP<Teuchos::ParameterList>& plist);
236 
253  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
254  const Teuchos::RCP<Teuchos::FancyOStream>& out,
255  const Teuchos::RCP<Teuchos::ParameterList>& plist);
256 
// Copy constructor.
258  Distributor (const Distributor& distributor);
259 
// Virtual because the class is used polymorphically via its base classes.
264  virtual ~Distributor () = default;
265 
// Exchange the contents of *this and rhs.
271  void swap (Distributor& rhs);
272 
274 
276 
// --- ParameterListAcceptor interface --------------------------------------
// doPosts() comments state that the send type and the barrier flag are
// run-time parameters that come from the list given to setParameterList().
281  void setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist);
282 
287  Teuchos::RCP<const Teuchos::ParameterList> getValidParameters () const;
288 
290 
292 
// --- Plan construction ------------------------------------------------------
// Declarations only; the definitions are not part of this listing.
// NOTE(review): the meaning of createFromSends' size_t return value is not
// visible here -- confirm against the definition.
312  size_t createFromSends (const Teuchos::ArrayView<const int>& exportProcIDs);
313 
347  template <class Ordinal>
348  void
349  createFromRecvs (const Teuchos::ArrayView<const Ordinal>& remoteIDs,
350  const Teuchos::ArrayView<const int>& remoteProcIDs,
351  Teuchos::Array<Ordinal>& exportIDs,
352  Teuchos::Array<int>& exportProcIDs);
353 
361  void
362  createFromSendsAndRecvs (const Teuchos::ArrayView<const int>& exportProcIDs,
363  const Teuchos::ArrayView<const int>& remoteProcIDs);
364 
366 
368 
// --- Plan queries -----------------------------------------------------------
// Read-only accessors (all const); their names mirror the private members
// declared further below.
372  size_t getNumReceives() const;
373 
377  size_t getNumSends() const;
378 
380  bool hasSelfMessage() const;
381 
383  size_t getMaxSendLength() const;
384 
386  size_t getTotalReceiveLength() const;
387 
392  Teuchos::ArrayView<const int> getProcsFrom() const;
393 
398  Teuchos::ArrayView<const int> getProcsTo() const;
399 
407  Teuchos::ArrayView<const size_t> getLengthsFrom() const;
408 
416  Teuchos::ArrayView<const size_t> getLengthsTo() const;
417 
// NOTE(review): the header line of this inline accessor is missing from the
// listing; only its body survives.  It returns howInitialized_.
423  return howInitialized_;
424  }
425 
427 
429 
// Returns a Distributor wrapped in an RCP.  NOTE(review): presumably the
// reverse-communication plan cached in reverseDistributor_ -- confirm.
440  Teuchos::RCP<Distributor> getReverse() const;
441 
443 
445 
// --- Communication with Teuchos::ArrayView / ArrayRCP buffers --------------
// Two flavours of every operation: one with a single `numPackets` count
// applied to every LID, and one with per-LID packet counts.
//
// doPostsAndWaits = doPosts followed by doWaits (see the definitions after
// the class).  Because it waits before returning, it accepts nonpersisting
// ArrayViews, whereas doPosts needs persisting ArrayRCPs.
466  template <class Packet>
467  void
468  doPostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
469  size_t numPackets,
470  const Teuchos::ArrayView<Packet> &imports);
471 
493  template <class Packet>
494  void
495  doPostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
496  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
497  const Teuchos::ArrayView<Packet> &imports,
498  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
499 
// Posts nonblocking receives into `imports`, then sends from `exports`.
524  template <class Packet>
525  void
526  doPosts (const Teuchos::ArrayRCP<const Packet> &exports,
527  size_t numPackets,
528  const Teuchos::ArrayRCP<Packet> &imports);
529 
548  template <class Packet>
549  void
550  doPosts (const Teuchos::ArrayRCP<const Packet> &exports,
551  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
552  const Teuchos::ArrayRCP<Packet> &imports,
553  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
554 
// Completes the communication started by doPosts().
561  void doWaits ();
562 
// Reverse-mode counterparts of the four functions above.
567  template <class Packet>
568  void
569  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
570  size_t numPackets,
571  const Teuchos::ArrayView<Packet> &imports);
572 
577  template <class Packet>
578  void
579  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
580  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
581  const Teuchos::ArrayView<Packet> &imports,
582  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
583 
588  template <class Packet>
589  void
590  doReversePosts (const Teuchos::ArrayRCP<const Packet> &exports,
591  size_t numPackets,
592  const Teuchos::ArrayRCP<Packet> &imports);
593 
598  template <class Packet>
599  void
600  doReversePosts (const Teuchos::ArrayRCP<const Packet> &exports,
601  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
602  const Teuchos::ArrayRCP<Packet> &imports,
603  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
604 
611  void doReverseWaits ();
612 
// --- Communication with Kokkos::View buffers --------------------------------
// Same set of operations; each overload participates in overload resolution
// only when both ExpView and ImpView are Kokkos views (enable_if below).
633  template <class ExpView, class ImpView>
634  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
// NOTE(review): listing line 635 (the function name and opening parenthesis)
// is missing here; by analogy with the overloads below it is presumably
// `doPostsAndWaits (` -- confirm against the original header.
636  const ExpView &exports,
637  size_t numPackets,
638  const ImpView &imports);
639 
661  template <class ExpView, class ImpView>
662  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
663  doPostsAndWaits (const ExpView &exports,
664  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
665  const ImpView &imports,
666  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
667 
692  template <class ExpView, class ImpView>
693  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
694  doPosts (const ExpView &exports,
695  size_t numPackets,
696  const ImpView &imports);
697 
716  template <class ExpView, class ImpView>
717  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
718  doPosts (const ExpView &exports,
719  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
720  const ImpView &imports,
721  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
722 
727  template <class ExpView, class ImpView>
728  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
729  doReversePostsAndWaits (const ExpView &exports,
730  size_t numPackets,
731  const ImpView &imports);
732 
737  template <class ExpView, class ImpView>
738  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
739  doReversePostsAndWaits (const ExpView &exports,
740  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
741  const ImpView &imports,
742  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
743 
748  template <class ExpView, class ImpView>
749  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
750  doReversePosts (const ExpView &exports,
751  size_t numPackets,
752  const ImpView &imports);
753 
758  template <class ExpView, class ImpView>
759  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
760  doReversePosts (const ExpView &exports,
761  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
762  const ImpView &imports,
763  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
764 
// Writes the byte counts recorded by the most recent exchange into the two
// output arguments.  The ArrayView doPostsAndWaits overloads set these
// counters to exports.size()*sizeof(Packet) and
// imports.size()*sizeof(Packet) after doWaits() returns.
768  void getLastDoStatistics(size_t & bytes_sent, size_t & bytes_recvd) const{
769  bytes_sent = lastRoundBytesSend_;
770  bytes_recvd = lastRoundBytesRecv_;
771  }
772 
774 
776 
// --- Teuchos::Describable interface -----------------------------------------
778  std::string description() const;
779 
801  void
802  describe (Teuchos::FancyOStream& out,
803  const Teuchos::EVerbosityLevel verbLevel =
804  Teuchos::Describable::verbLevel_default) const;
806 
807  private:
// Communicator over which all messages are exchanged.
809  Teuchos::RCP<const Teuchos::Comm<int> > comm_;
810 
// Stream that receives verbose output (see verbose_).
812  Teuchos::RCP<Teuchos::FancyOStream> out_;
813 
// How the plan was set up; doPosts() skips its imports-length check when
// this is DISTRIBUTOR_INITIALIZED_BY_REVERSE.
815  Details::EDistributorHowInitialized howInitialized_;
816 
// NOTE(review): doPosts() reads a member named sendType_ (of type
// Details::EDistributorSendType); its declaration falls in the listing
// lines that are missing here.
818 
819 
822 
// When true, doPosts() calls comm_->barrier() between posting receives and
// posting sends.  doPosts() requires this to be true for ready sends.
824  bool barrierBetween_;
825 
// When true, doPosts() writes progress messages to out_.
827  bool verbose_;
829 
// Whether this process sends a message to itself.  doPosts() implements the
// self message as a copy from exports to imports, not as a real send.
833  bool selfMessage_;
834 
// Number of sends; doPosts() adds selfMessage_ to it to obtain the number
// of send blocks, so the self message is not counted here.
844  size_t numSends_;
845 
// Per send block: destination rank.
850  Teuchos::Array<int> procsTo_;
851 
// Per send block: starting offset (in LIDs) of the block.  It indexes
// exports directly on the fast path and indicesTo_ on the slow path.
860  Teuchos::Array<size_t> startsTo_;
861 
// Per send block: number of LIDs in the block.
867  Teuchos::Array<size_t> lengthsTo_;
868 
// doPosts() sizes its slow-path send buffer as maxSendLength_ * numPackets.
872  size_t maxSendLength_;
873 
// Empty when the exports are already laid out by destination (fast path);
// otherwise holds, for each entry to send, its LID index into exports.
889  Teuchos::Array<size_t> indicesTo_;
890 
// Number of receives, not counting the self message (see doPosts()).
900  size_t numReceives_;
901 
// doPosts() requires imports to hold at least
// totalReceiveLength_ * numPackets entries.
908  size_t totalReceiveLength_;
909 
// Per receive: number of LIDs received from procsFrom_[i].
915  Teuchos::Array<size_t> lengthsFrom_;
916 
// Per receive: source rank.
922  Teuchos::Array<int> procsFrom_;
923 
// NOTE(review): not referenced by the code visible in this listing;
// presumably the receive-side analogues of startsTo_ / indicesTo_.
929  Teuchos::Array<size_t> startsFrom_;
930 
936  Teuchos::Array<size_t> indicesFrom_;
937 
// Outstanding nonblocking requests.  Its size is used as the number of
// pending messages; doPostsAndWaits() throws if it is nonzero on entry.
944  Teuchos::Array<Teuchos::RCP<Teuchos::CommRequest<int> > > requests_;
945 
// Mutable so that const members (createReverseDistributor() is const) can
// set it.  NOTE(review): presumably created lazily -- confirm.
950  mutable Teuchos::RCP<Distributor> reverseDistributor_;
951 
// Byte counts reported by getLastDoStatistics().
953  size_t lastRoundBytesSend_;
954 
956  size_t lastRoundBytesRecv_;
957 
// Timers exist only when TPETRA_DISTRIBUTOR_TIMERS is defined.  Note that
// the top of this header #undef's that macro, so as written they are
// compiled out.
958 #ifdef TPETRA_DISTRIBUTOR_TIMERS
959  Teuchos::RCP<Teuchos::Time> timer_doPosts3_;
960  Teuchos::RCP<Teuchos::Time> timer_doPosts4_;
961  Teuchos::RCP<Teuchos::Time> timer_doWaits_;
962  Teuchos::RCP<Teuchos::Time> timer_doPosts3_recvs_;
963  Teuchos::RCP<Teuchos::Time> timer_doPosts4_recvs_;
964  Teuchos::RCP<Teuchos::Time> timer_doPosts3_barrier_;
965  Teuchos::RCP<Teuchos::Time> timer_doPosts4_barrier_;
966  Teuchos::RCP<Teuchos::Time> timer_doPosts3_sends_;
967  Teuchos::RCP<Teuchos::Time> timer_doPosts4_sends_;
968 
970  void makeTimers ();
971 #endif // TPETRA_DISTRIBUTOR_TIMERS
972 
// NOTE(review): presumably selects whether getTag() returns a different tag
// per code path -- not visible in this listing, confirm.
984  bool useDistinctTags_;
985 
// Maps a "path tag" to the MPI tag to use; doPosts() calls it with 0.
990  int getTag (const int pathTag) const;
991 
// Private helpers used while building the plan (definitions not shown).
1002  void computeReceives ();
1003 
1016  template <class Ordinal>
1017  void computeSends (const Teuchos::ArrayView<const Ordinal> &remoteGIDs,
1018  const Teuchos::ArrayView<const int> &remoteProcIDs,
1019  Teuchos::Array<Ordinal> &exportGIDs,
1020  Teuchos::Array<int> &exportProcIDs);
1021 
1023  void createReverseDistributor() const;
1024 
1025 
// Returns a string for the given verbosity level.  NOTE(review): presumably
// this process' part of describe()'s output -- confirm.
1030  std::string
1031  localDescribeToString (const Teuchos::EVerbosityLevel vl) const;
1032  }; // class Distributor
1033 
1034 
1035  template <class Packet>
1036  void Distributor::
1037  doPostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1038  size_t numPackets,
1039  const Teuchos::ArrayView<Packet>& imports)
1040  {
1041  using Teuchos::arcp;
1042  using Teuchos::ArrayRCP;
1043  typedef typename ArrayRCP<const Packet>::size_type size_type;
1044 
1045  TEUCHOS_TEST_FOR_EXCEPTION(
1046  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1047  "doPostsAndWaits(3 args): There are " << requests_.size () <<
1048  " outstanding nonblocking messages pending. It is incorrect to call "
1049  "this method with posts outstanding.");
1050 
1051  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1052  // requiring that the memory location is persisting (as is
1053  // necessary for nonblocking receives). However, it need only
1054  // persist until doWaits() completes, so it is safe for us to use
1055  // a nonpersisting reference in this case. The use of a
1056  // nonpersisting reference is purely a performance optimization.
1057 
1058  //const Packet* exportsPtr = exports.getRawPtr();
1059  //ArrayRCP<const Packet> exportsArcp (exportsPtr, static_cast<size_type> (0),
1060  // exports.size(), false);
1061  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1062  static_cast<size_type> (0),
1063  exports.size(), false);
1064 
1065  // For some reason, neither of the options below (that use arcp)
1066  // compile for Packet=std::complex<double> with GCC 4.5.1. The
1067  // issue only arises with the exports array. This is why we
1068  // construct a separate nonowning ArrayRCP.
1069 
1070  // doPosts (arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1071  // numPackets,
1072  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1073  // doPosts (arcp<const Packet> (exportsPtr, 0, exports.size(), false),
1074  // numPackets,
1075  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1076  doPosts (exportsArcp,
1077  numPackets,
1078  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1079  doWaits ();
1080 
1081  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1082  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1083  }
1084 
1085  template <class Packet>
1086  void Distributor::
1087  doPostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1088  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1089  const Teuchos::ArrayView<Packet> &imports,
1090  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1091  {
1092  using Teuchos::arcp;
1093  using Teuchos::ArrayRCP;
1094 
1095  TEUCHOS_TEST_FOR_EXCEPTION(
1096  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1097  "doPostsAndWaits: There are " << requests_.size () << " outstanding "
1098  "nonblocking messages pending. It is incorrect to call doPostsAndWaits "
1099  "with posts outstanding.");
1100 
1101  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1102  // requiring that the memory location is persisting (as is
1103  // necessary for nonblocking receives). However, it need only
1104  // persist until doWaits() completes, so it is safe for us to use
1105  // a nonpersisting reference in this case.
1106 
1107  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1108  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1109  // with some versions of GCC. The issue only arises with the
1110  // exports array. This is why we construct a separate nonowning
1111  // ArrayRCP.
1112  typedef typename ArrayRCP<const Packet>::size_type size_type;
1113  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1114  static_cast<size_type> (0),
1115  exports.size (), false);
1116  // mfh 04 Apr 2012: This is the offending code. This statement
1117  // would normally be in place of "exportsArcp" in the
1118  // doPosts() call below.
1119  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1120  doPosts (exportsArcp,
1121  numExportPacketsPerLID,
1122  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1123  numImportPacketsPerLID);
1124  doWaits ();
1125 
1126  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1127  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1128  }
1129 
1130 
1131  template <class Packet>
1132  void Distributor::
1133  doPosts (const Teuchos::ArrayRCP<const Packet>& exports,
1134  size_t numPackets,
1135  const Teuchos::ArrayRCP<Packet>& imports)
1136  {
1137  using Teuchos::Array;
1138  using Teuchos::ArrayRCP;
1139  using Teuchos::ArrayView;
1140  using Teuchos::as;
1141  using Teuchos::FancyOStream;
1142  using Teuchos::includesVerbLevel;
1143  using Teuchos::ireceive;
1144  using Teuchos::isend;
1145  using Teuchos::OSTab;
1146  using Teuchos::readySend;
1147  using Teuchos::send;
1148  using Teuchos::ssend;
1149  using Teuchos::TypeNameTraits;
1150  using Teuchos::typeName;
1151  using std::endl;
1152  typedef Array<size_t>::size_type size_type;
1153 
1154 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1155  Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
1156 #endif // TPETRA_DISTRIBUTOR_TIMERS
1157 
1158  const int myRank = comm_->getRank ();
1159  // Run-time configurable parameters that come from the input
1160  // ParameterList set by setParameterList().
1161  const Details::EDistributorSendType sendType = sendType_;
1162  const bool doBarrier = barrierBetween_;
1163 
1164  Teuchos::OSTab tab0 (out_);
1165  std::unique_ptr<std::string> prefix;
1166  if (verbose_) {
1167  std::ostringstream os;
1168  os << "Proc " << myRank << ": Distributor::doPosts(3-arg, ArrayRCP): ";
1169  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
1170  os << endl;
1171  *out_ << os.str ();
1172  }
1173  Teuchos::OSTab tab1 (out_);
1174 
1175  TEUCHOS_TEST_FOR_EXCEPTION(
1176  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
1177  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): Ready-send "
1178  "version requires a barrier between posting receives and posting ready "
1179  "sends. This should have been checked before. "
1180  "Please report this bug to the Tpetra developers.");
1181 
1182  size_t selfReceiveOffset = 0;
1183 
1184  // mfh 30 Mar 2016: See Github Issue #227 to see why we need to
1185  // check whether we're doing reverse mode before checking the
1186  // length of the imports array.
1187  if (howInitialized_ != Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE) {
1188  // Each message has the same number of packets.
1189  //
1190  // FIXME (mfh 18 Jul 2014): Relaxing this test from strict
1191  // inequality to a less-than seems to have fixed Bug 6170. It's
1192  // OK for the 'imports' array to be longer than it needs to be;
1193  // I'm just curious why it would be.
1194  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
1195  TEUCHOS_TEST_FOR_EXCEPTION
1196  (static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1197  std::invalid_argument,
1198  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1199  "The 'imports' array must have enough entries to hold the expected number "
1200  "of import packets. imports.size() = " << imports.size () << " < "
1201  "totalNumImportPackets = " << totalNumImportPackets << ".");
1202  }
1203 
1204  // MPI tag for nonblocking receives and blocking sends in this
1205  // method. Some processes might take the "fast" path
1206  // (indicesTo_.empty()) and others might take the "slow" path for
1207  // the same doPosts() call, so the path tag must be the same for
1208  // both.
1209  const int pathTag = 0;
1210  const int tag = this->getTag (pathTag);
1211 
1212 #ifdef HAVE_TPETRA_DEBUG
1213  TEUCHOS_TEST_FOR_EXCEPTION
1214  (requests_.size () != 0,
1215  std::logic_error,
1216  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): Process "
1217  << myRank << ": requests_.size() = " << requests_.size () << " != 0.");
1218 #endif // HAVE_TPETRA_DEBUG
1219 
1220  // Distributor uses requests_.size() as the number of outstanding
1221  // nonblocking message requests, so we resize to zero to maintain
1222  // this invariant.
1223  //
1224  // numReceives_ does _not_ include the self message, if there is
1225  // one. Here, we do actually send a message to ourselves, so we
1226  // include any self message in the "actual" number of receives to
1227  // post.
1228  //
1229  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1230  // doesn't (re)allocate its array of requests. That happens in
1231  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1232  // demand), or Resize_().
1233  const size_type actualNumReceives = as<size_type> (numReceives_) +
1234  as<size_type> (selfMessage_ ? 1 : 0);
1235  requests_.resize (0);
1236 
1237  if (verbose_) {
1238  std::ostringstream os;
1239  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1240  << ": Post receives" << endl;
1241  *out_ << os.str ();
1242  }
1243 
1244  // Post the nonblocking receives. It's common MPI wisdom to post
1245  // receives before sends. In MPI terms, this means favoring
1246  // adding to the "posted queue" (of receive requests) over adding
1247  // to the "unexpected queue" (of arrived messages not yet matched
1248  // with a receive).
1249  {
1250 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1251  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
1252 #endif // TPETRA_DISTRIBUTOR_TIMERS
1253 
1254  size_t curBufOffset = 0;
1255  for (size_type i = 0; i < actualNumReceives; ++i) {
1256  const size_t curBufLen = lengthsFrom_[i] * numPackets;
1257  if (procsFrom_[i] != myRank) {
1258  if (verbose_) {
1259  std::ostringstream os;
1260  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1261  << ": Post irecv: {source: " << procsFrom_[i]
1262  << ", tag: " << tag << "}" << endl;
1263  *out_ << os.str ();
1264  }
1265  // If my process is receiving these packet(s) from another
1266  // process (not a self-receive):
1267  //
1268  // 1. Set up the persisting view (recvBuf) of the imports
1269  // array, given the offset and size (total number of
1270  // packets from process procsFrom_[i]).
1271  // 2. Start the Irecv and save the resulting request.
1272  TEUCHOS_TEST_FOR_EXCEPTION(
1273  curBufOffset + curBufLen > static_cast<size_t> (imports.size ()),
1274  std::logic_error,
1275  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1276  "Exceeded size of 'imports' array in packing loop on Process " <<
1277  myRank << ". imports.size() = " << imports.size () << " < "
1278  "curBufOffset(" << curBufOffset << ") + curBufLen(" << curBufLen
1279  << ").");
1280  ArrayRCP<Packet> recvBuf =
1281  imports.persistingView (curBufOffset, curBufLen);
1282  requests_.push_back (ireceive<int, Packet> (recvBuf, procsFrom_[i],
1283  tag, *comm_));
1284  }
1285  else { // Receiving from myself
1286  selfReceiveOffset = curBufOffset; // Remember the self-recv offset
1287  }
1288  curBufOffset += curBufLen;
1289  }
1290  }
1291 
1292  if (doBarrier) {
1293 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1294  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
1295 #endif // TPETRA_DISTRIBUTOR_TIMERS
1296 
1297  if (verbose_) {
1298  std::ostringstream os;
1299  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1300  << ": Barrier" << endl;
1301  *out_ << os.str ();
1302  }
1303  // If we are using ready sends (MPI_Rsend) below, we need to do
1304  // a barrier before we post the ready sends. This is because a
1305  // ready send requires that its matching receive has already
1306  // been posted before the send has been posted. The only way to
1307  // guarantee that in this case is to use a barrier.
1308  comm_->barrier ();
1309  }
1310 
1311 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1312  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
1313 #endif // TPETRA_DISTRIBUTOR_TIMERS
1314 
1315  // setup scan through procsTo_ list starting with higher numbered procs
1316  // (should help balance message traffic)
1317  //
1318  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
1319  // It doesn't depend on the input at all.
1320  size_t numBlocks = numSends_ + selfMessage_;
1321  size_t procIndex = 0;
1322  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myRank)) {
1323  ++procIndex;
1324  }
1325  if (procIndex == numBlocks) {
1326  procIndex = 0;
1327  }
1328 
1329  size_t selfNum = 0;
1330  size_t selfIndex = 0;
1331 
1332  if (verbose_) {
1333  std::ostringstream os;
1334  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1335  << ": Post sends" << endl;
1336  *out_ << os.str ();
1337  }
1338 
1339  if (indicesTo_.empty ()) {
1340  // Data are already blocked (laid out) by process, so we don't
1341  // need a separate send buffer (besides the exports array).
1342  for (size_t i = 0; i < numBlocks; ++i) {
1343  size_t p = i + procIndex;
1344  if (p > (numBlocks - 1)) {
1345  p -= numBlocks;
1346  }
1347 
1348  if (procsTo_[p] != myRank) {
1349  if (verbose_) {
1350  std::ostringstream os;
1351  os << *prefix << ": Post send: {target: "
1352  << procsTo_[p] << ", tag: " << tag << "}" << endl;
1353  *out_ << os.str ();
1354  }
1355 
1356  ArrayView<const Packet> tmpSend =
1357  exports.view (startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
1358 
1359  if (sendType == Details::DISTRIBUTOR_SEND) {
1360  send<int, Packet> (tmpSend.getRawPtr (),
1361  as<int> (tmpSend.size ()),
1362  procsTo_[p], tag, *comm_);
1363  }
1364  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1365  ArrayRCP<const Packet> tmpSendBuf =
1366  exports.persistingView (startsTo_[p] * numPackets,
1367  lengthsTo_[p] * numPackets);
1368  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1369  tag, *comm_));
1370  }
1371  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1372  readySend<int, Packet> (tmpSend.getRawPtr (),
1373  as<int> (tmpSend.size ()),
1374  procsTo_[p], tag, *comm_);
1375  }
1376  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1377  ssend<int, Packet> (tmpSend.getRawPtr (),
1378  as<int> (tmpSend.size ()),
1379  procsTo_[p], tag, *comm_);
1380  } else {
1381  TEUCHOS_TEST_FOR_EXCEPTION(
1382  true, std::logic_error,
1383  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1384  "Invalid send type. We should never get here. "
1385  "Please report this bug to the Tpetra developers.");
1386  }
1387  }
1388  else { // "Sending" the message to myself
1389  selfNum = p;
1390  }
1391  }
1392 
1393  if (selfMessage_) {
1394  if (verbose_) {
1395  std::ostringstream os;
1396  os << *prefix << "Fast: Self-send" << endl;
1397  *out_ << os.str ();
1398  }
1399  // This is how we "send a message to ourself": we copy from
1400  // the export buffer to the import buffer. That saves
1401  // Teuchos::Comm implementations other than MpiComm (in
1402  // particular, SerialComm) the trouble of implementing self
1403  // messages correctly. (To do this right, SerialComm would
1404  // need internal buffer space for messages, keyed on the
1405  // message's tag.)
1406  std::copy (exports.begin()+startsTo_[selfNum]*numPackets,
1407  exports.begin()+startsTo_[selfNum]*numPackets+lengthsTo_[selfNum]*numPackets,
1408  imports.begin()+selfReceiveOffset);
1409  }
1410  }
1411  else { // data are not blocked by proc, use send buffer
1412  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
1413  // sends), because the buffer is only long enough for one send.
1414  ArrayRCP<Packet> sendArray (maxSendLength_ * numPackets); // send buffer
1415 
1416  TEUCHOS_TEST_FOR_EXCEPTION(
1417  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
1418  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1419  "The \"send buffer\" code path doesn't currently work with "
1420  "nonblocking sends.");
1421 
1422  for (size_t i = 0; i < numBlocks; ++i) {
1423  size_t p = i + procIndex;
1424  if (p > (numBlocks - 1)) {
1425  p -= numBlocks;
1426  }
1427 
1428  if (procsTo_[p] != myRank) {
1429  if (verbose_) {
1430  std::ostringstream os;
1431  os << *prefix << "Slow: Post send: "
1432  "{target: " << procsTo_[p] << ", tag: " << tag << "}" << endl;
1433  *out_ << os.str ();
1434  }
1435 
1436  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1437  size_t sendArrayOffset = 0;
1438  size_t j = startsTo_[p];
1439  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1440  srcBegin = exports.begin() + indicesTo_[j]*numPackets;
1441  srcEnd = srcBegin + numPackets;
1442  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1443  sendArrayOffset += numPackets;
1444  }
1445  ArrayView<const Packet> tmpSend =
1446  sendArray.view (0, lengthsTo_[p]*numPackets);
1447 
1448  if (sendType == Details::DISTRIBUTOR_SEND) {
1449  send<int, Packet> (tmpSend.getRawPtr (),
1450  as<int> (tmpSend.size ()),
1451  procsTo_[p], tag, *comm_);
1452  }
1453  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1454  ArrayRCP<const Packet> tmpSendBuf =
1455  sendArray.persistingView (0, lengthsTo_[p] * numPackets);
1456  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1457  tag, *comm_));
1458  }
1459  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1460  readySend<int, Packet> (tmpSend.getRawPtr (),
1461  as<int> (tmpSend.size ()),
1462  procsTo_[p], tag, *comm_);
1463  }
1464  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1465  ssend<int, Packet> (tmpSend.getRawPtr (),
1466  as<int> (tmpSend.size ()),
1467  procsTo_[p], tag, *comm_);
1468  }
1469  else {
1470  TEUCHOS_TEST_FOR_EXCEPTION(
1471  true, std::logic_error,
1472  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1473  "Invalid send type. We should never get here. "
1474  "Please report this bug to the Tpetra developers.");
1475  }
1476  }
1477  else { // "Sending" the message to myself
1478  selfNum = p;
1479  selfIndex = startsTo_[p];
1480  }
1481  }
1482 
1483  if (selfMessage_) {
1484  if (verbose_) {
1485  std::ostringstream os;
1486  os << *prefix << "Slow: Self-send" << endl;
1487  *out_ << os.str ();
1488  }
1489  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1490  std::copy (exports.begin()+indicesTo_[selfIndex]*numPackets,
1491  exports.begin()+indicesTo_[selfIndex]*numPackets + numPackets,
1492  imports.begin() + selfReceiveOffset);
1493  ++selfIndex;
1494  selfReceiveOffset += numPackets;
1495  }
1496  }
1497  }
1498 
1499  if (verbose_) {
1500  std::ostringstream os;
1501  os << *prefix << "Done!" << endl;
1502  *out_ << os.str ();
1503  }
1504  }
1505 
1506  template <class Packet>
1507  void Distributor::
1508  doPosts (const Teuchos::ArrayRCP<const Packet>& exports,
1509  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1510  const Teuchos::ArrayRCP<Packet>& imports,
1511  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1512  {
1513  using Teuchos::Array;
1514  using Teuchos::ArrayRCP;
1515  using Teuchos::ArrayView;
1516  using Teuchos::as;
1517  using Teuchos::ireceive;
1518  using Teuchos::isend;
1519  using Teuchos::readySend;
1520  using Teuchos::send;
1521  using Teuchos::ssend;
1522  using Teuchos::TypeNameTraits;
1523 #ifdef HAVE_TEUCHOS_DEBUG
1524  using Teuchos::OSTab;
1525 #endif // HAVE_TEUCHOS_DEBUG
1526  using std::endl;
1527  typedef Array<size_t>::size_type size_type;
1528 
1529  Teuchos::OSTab tab (out_);
1530 
1531 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1532  Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
1533 #endif // TPETRA_DISTRIBUTOR_TIMERS
1534 
1535  // Run-time configurable parameters that come from the input
1536  // ParameterList set by setParameterList().
1537  const Details::EDistributorSendType sendType = sendType_;
1538  const bool doBarrier = barrierBetween_;
1539 
1540 // #ifdef HAVE_TEUCHOS_DEBUG
1541 // // Prepare for verbose output, if applicable.
1542 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
1543 // Teuchos::RCP<Teuchos::FancyOStream> out = this->getOStream ();
1544 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
1545 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
1546 
1547 // if (doPrint) {
1548 // // Only need one process to print out parameters.
1549 // *out << "Distributor::doPosts (4 args)" << endl;
1550 // }
1551 // // Add one tab level. We declare this outside the doPrint scopes
1552 // // so that the tab persists until the end of this method.
1553 // Teuchos::OSTab tab = this->getOSTab ();
1554 // if (doPrint) {
1555 // *out << "Parameters:" << endl;
1556 // {
1557 // OSTab tab2 (out);
1558 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
1559 // << endl << "barrierBetween: " << doBarrier << endl;
1560 // }
1561 // }
1562 // #endif // HAVE_TEUCHOS_DEBUG
1563 
1564  TEUCHOS_TEST_FOR_EXCEPTION(
1565  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
1566  std::logic_error,
1567  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): Ready-send "
1568  "version requires a barrier between posting receives and posting ready "
1569  "ends. This should have been checked before. "
1570  "Please report this bug to the Tpetra developers.");
1571 
1572  const int myProcID = comm_->getRank ();
1573  size_t selfReceiveOffset = 0;
1574 
1575 #ifdef HAVE_TEUCHOS_DEBUG
1576  // Different messages may have different numbers of packets.
1577  size_t totalNumImportPackets = 0;
1578  for (size_t ii = 0; ii < static_cast<size_t> (numImportPacketsPerLID.size ()); ++ii) {
1579  totalNumImportPackets += numImportPacketsPerLID[ii];
1580  }
1581  TEUCHOS_TEST_FOR_EXCEPTION(
1582  static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1583  std::runtime_error,
1584  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): The 'imports' "
1585  "array must have enough entries to hold the expected number of import "
1586  "packets. imports.size() = " << imports.size() << " < "
1587  "totalNumImportPackets = " << totalNumImportPackets << ".");
1588 #endif // HAVE_TEUCHOS_DEBUG
1589 
1590  // MPI tag for nonblocking receives and blocking sends in this
1591  // method. Some processes might take the "fast" path
1592  // (indicesTo_.empty()) and others might take the "slow" path for
1593  // the same doPosts() call, so the path tag must be the same for
1594  // both.
1595  const int pathTag = 1;
1596  const int tag = this->getTag (pathTag);
1597 
1598 #ifdef HAVE_TEUCHOS_DEBUG
1599  TEUCHOS_TEST_FOR_EXCEPTION
1600  (requests_.size () != 0,
1601  std::logic_error,
1602  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): Process "
1603  << myProcID << ": requests_.size() = " << requests_.size ()
1604  << " != 0.");
1605 #endif // HAVE_TEUCHOS_DEBUG
1606  if (verbose_) {
1607  std::ostringstream os;
1608  os << "Proc " << myProcID << ": doPosts(4 args, Teuchos::ArrayRCP, "
1609  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
1610  *out_ << os.str ();
1611  }
1612 
1613  // Distributor uses requests_.size() as the number of outstanding
1614  // nonblocking message requests, so we resize to zero to maintain
1615  // this invariant.
1616  //
1617  // numReceives_ does _not_ include the self message, if there is
1618  // one. Here, we do actually send a message to ourselves, so we
1619  // include any self message in the "actual" number of receives to
1620  // post.
1621  //
1622  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1623  // doesn't (re)allocate its array of requests. That happens in
1624  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1625  // demand), or Resize_().
1626  const size_type actualNumReceives = as<size_type> (numReceives_) +
1627  as<size_type> (selfMessage_ ? 1 : 0);
1628  requests_.resize (0);
1629 
1630  // Post the nonblocking receives. It's common MPI wisdom to post
1631  // receives before sends. In MPI terms, this means favoring
1632  // adding to the "posted queue" (of receive requests) over adding
1633  // to the "unexpected queue" (of arrived messages not yet matched
1634  // with a receive).
1635  {
1636 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1637  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
1638 #endif // TPETRA_DISTRIBUTOR_TIMERS
1639 
1640  size_t curBufferOffset = 0;
1641  size_t curLIDoffset = 0;
1642  for (size_type i = 0; i < actualNumReceives; ++i) {
1643  size_t totalPacketsFrom_i = 0;
1644  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
1645  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
1646  }
1647  curLIDoffset += lengthsFrom_[i];
1648  if (procsFrom_[i] != myProcID && totalPacketsFrom_i) {
1649  // If my process is receiving these packet(s) from another
1650  // process (not a self-receive), and if there is at least
1651  // one packet to receive:
1652  //
1653  // 1. Set up the persisting view (recvBuf) into the imports
1654  // array, given the offset and size (total number of
1655  // packets from process procsFrom_[i]).
1656  // 2. Start the Irecv and save the resulting request.
1657  ArrayRCP<Packet> recvBuf =
1658  imports.persistingView (curBufferOffset, totalPacketsFrom_i);
1659  requests_.push_back (ireceive<int, Packet> (recvBuf, procsFrom_[i],
1660  tag, *comm_));
1661  }
1662  else { // Receiving these packet(s) from myself
1663  selfReceiveOffset = curBufferOffset; // Remember the offset
1664  }
1665  curBufferOffset += totalPacketsFrom_i;
1666  }
1667  }
1668 
1669  if (doBarrier) {
1670 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1671  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
1672 #endif // TPETRA_DISTRIBUTOR_TIMERS
1673  // If we are using ready sends (MPI_Rsend) below, we need to do
1674  // a barrier before we post the ready sends. This is because a
1675  // ready send requires that its matching receive has already
1676  // been posted before the send has been posted. The only way to
1677  // guarantee that in this case is to use a barrier.
1678  comm_->barrier ();
1679  }
1680 
1681 #ifdef TPETRA_DISTRIBUTOR_TIMERS
1682  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
1683 #endif // TPETRA_DISTRIBUTOR_TIMERS
1684 
1685  // setup arrays containing starting-offsets into exports for each send,
1686  // and num-packets-to-send for each send.
1687  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
1688  size_t maxNumPackets = 0;
1689  size_t curPKToffset = 0;
1690  for (size_t pp=0; pp<numSends_; ++pp) {
1691  sendPacketOffsets[pp] = curPKToffset;
1692  size_t numPackets = 0;
1693  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
1694  numPackets += numExportPacketsPerLID[j];
1695  }
1696  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
1697  packetsPerSend[pp] = numPackets;
1698  curPKToffset += numPackets;
1699  }
1700 
1701  // setup scan through procsTo_ list starting with higher numbered procs
1702  // (should help balance message traffic)
1703  size_t numBlocks = numSends_+ selfMessage_;
1704  size_t procIndex = 0;
1705  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myProcID)) {
1706  ++procIndex;
1707  }
1708  if (procIndex == numBlocks) {
1709  procIndex = 0;
1710  }
1711 
1712  size_t selfNum = 0;
1713  size_t selfIndex = 0;
1714 
1715  if (indicesTo_.empty()) {
1716  if (verbose_) {
1717  std::ostringstream os;
1718  os << "Proc " << myProcID
1719  << ": doPosts(4 args, Teuchos::ArrayRCP, fast): posting sends" << endl;
1720  *out_ << os.str ();
1721  }
1722 
1723  // Data are already blocked (laid out) by process, so we don't
1724  // need a separate send buffer (besides the exports array).
1725  for (size_t i = 0; i < numBlocks; ++i) {
1726  size_t p = i + procIndex;
1727  if (p > (numBlocks - 1)) {
1728  p -= numBlocks;
1729  }
1730 
1731  if (procsTo_[p] != myProcID && packetsPerSend[p] > 0) {
1732  ArrayView<const Packet> tmpSend =
1733  exports.view (sendPacketOffsets[p], packetsPerSend[p]);
1734 
1735  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
1736  send<int, Packet> (tmpSend.getRawPtr (),
1737  as<int> (tmpSend.size ()),
1738  procsTo_[p], tag, *comm_);
1739  }
1740  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1741  readySend<int, Packet> (tmpSend.getRawPtr (),
1742  as<int> (tmpSend.size ()),
1743  procsTo_[p], tag, *comm_);
1744  }
1745  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1746  ArrayRCP<const Packet> tmpSendBuf =
1747  exports.persistingView (sendPacketOffsets[p], packetsPerSend[p]);
1748  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1749  tag, *comm_));
1750  }
1751  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1752  ssend<int, Packet> (tmpSend.getRawPtr (),
1753  as<int> (tmpSend.size ()),
1754  procsTo_[p], tag, *comm_);
1755  }
1756  else {
1757  TEUCHOS_TEST_FOR_EXCEPTION(
1758  true, std::logic_error,
1759  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): "
1760  "Invalid send type. We should never get here. Please report "
1761  "this bug to the Tpetra developers.");
1762  }
1763  }
1764  else { // "Sending" the message to myself
1765  selfNum = p;
1766  }
1767  }
1768 
1769  if (selfMessage_) {
1770  std::copy (exports.begin()+sendPacketOffsets[selfNum],
1771  exports.begin()+sendPacketOffsets[selfNum]+packetsPerSend[selfNum],
1772  imports.begin()+selfReceiveOffset);
1773  }
1774  if (verbose_) {
1775  std::ostringstream os;
1776  os << "Proc " << myProcID
1777  << ": doPosts(4 args, Teuchos::ArrayRCP, fast) done" << endl;
1778  *out_ << os.str ();
1779  }
1780  }
1781  else { // data are not blocked by proc, use send buffer
1782  if (verbose_) {
1783  std::ostringstream os;
1784  os << "Proc " << myProcID
1785  << ": doPosts(4 args, Teuchos::ArrayRCP, slow): posting sends" << endl;
1786  *out_ << os.str ();
1787  }
1788 
1789  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
1790  ArrayRCP<Packet> sendArray (maxNumPackets); // send buffer
1791 
1792  TEUCHOS_TEST_FOR_EXCEPTION(
1793  sendType == Details::DISTRIBUTOR_ISEND,
1794  std::logic_error,
1795  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): "
1796  "The \"send buffer\" code path may not necessarily work with nonblocking sends.");
1797 
1798  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
1799  size_t ioffset = 0;
1800  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
1801  indicesOffsets[j] = ioffset;
1802  ioffset += numExportPacketsPerLID[j];
1803  }
1804 
1805  for (size_t i = 0; i < numBlocks; ++i) {
1806  size_t p = i + procIndex;
1807  if (p > (numBlocks - 1)) {
1808  p -= numBlocks;
1809  }
1810 
1811  if (procsTo_[p] != myProcID) {
1812  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1813  size_t sendArrayOffset = 0;
1814  size_t j = startsTo_[p];
1815  size_t numPacketsTo_p = 0;
1816  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1817  srcBegin = exports.begin() + indicesOffsets[j];
1818  srcEnd = srcBegin + numExportPacketsPerLID[j];
1819  numPacketsTo_p += numExportPacketsPerLID[j];
1820  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1821  sendArrayOffset += numExportPacketsPerLID[j];
1822  }
1823  if (numPacketsTo_p > 0) {
1824  ArrayView<const Packet> tmpSend =
1825  sendArray.view (0, numPacketsTo_p);
1826 
1827  if (sendType == Details::DISTRIBUTOR_RSEND) {
1828  readySend<int, Packet> (tmpSend.getRawPtr (),
1829  as<int> (tmpSend.size ()),
1830  procsTo_[p], tag, *comm_);
1831  }
1832  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1833  ArrayRCP<const Packet> tmpSendBuf =
1834  sendArray.persistingView (0, numPacketsTo_p);
1835  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1836  tag, *comm_));
1837  }
1838  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1839  ssend<int, Packet> (tmpSend.getRawPtr (),
1840  as<int> (tmpSend.size ()),
1841  procsTo_[p], tag, *comm_);
1842  }
1843  else { // if (sendType == Details::DISTRIBUTOR_SSEND)
1844  send<int, Packet> (tmpSend.getRawPtr (),
1845  as<int> (tmpSend.size ()),
1846  procsTo_[p], tag, *comm_);
1847  }
1848  }
1849  }
1850  else { // "Sending" the message to myself
1851  selfNum = p;
1852  selfIndex = startsTo_[p];
1853  }
1854  }
1855 
1856  if (selfMessage_) {
1857  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1858  std::copy (exports.begin()+indicesOffsets[selfIndex],
1859  exports.begin()+indicesOffsets[selfIndex]+numExportPacketsPerLID[selfIndex],
1860  imports.begin() + selfReceiveOffset);
1861  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
1862  ++selfIndex;
1863  }
1864  }
1865  if (verbose_) {
1866  std::ostringstream os;
1867  os << "Proc " << myProcID
1868  << ": doPosts(4 args, Teuchos::ArrayRCP, slow) done" << endl;
1869  *out_ << os.str ();
1870  }
1871  }
1872  }
1873 
1874  template <class Packet>
1875  void Distributor::
1876  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1877  size_t numPackets,
1878  const Teuchos::ArrayView<Packet>& imports)
1879  {
1880  using Teuchos::arcp;
1881  using Teuchos::ArrayRCP;
1882  using Teuchos::as;
1883 
1884  // doReversePosts() takes exports and imports as ArrayRCPs,
1885  // requiring that the memory locations are persisting. However,
1886  // they need only persist within the scope of that routine, so it
1887  // is safe for us to use nonpersisting references in this case.
1888 
1889  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1890  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1891  // with some versions of GCC. The issue only arises with the
1892  // exports array. This is why we construct a separate nonowning
1893  // ArrayRCP.
1894  typedef typename ArrayRCP<const Packet>::size_type size_type;
1895  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0),
1896  exports.size(), false);
1897  // mfh 04 Apr 2012: This is the offending code. This statement
1898  // would normally be in place of "exportsArcp" in the
1899  // doReversePosts() call below.
1900  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false)
1901  doReversePosts (exportsArcp,
1902  numPackets,
1903  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1904  doReverseWaits ();
1905 
1906  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1907  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1908  }
1909 
1910  template <class Packet>
1911  void Distributor::
1912  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1913  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1914  const Teuchos::ArrayView<Packet> &imports,
1915  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1916  {
1917  using Teuchos::as;
1918  using Teuchos::arcp;
1919  using Teuchos::ArrayRCP;
1920 
1921  TEUCHOS_TEST_FOR_EXCEPTION(
1922  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1923  "doReversePostsAndWaits(4 args): There are " << requests_.size ()
1924  << " outstanding nonblocking messages pending. It is incorrect to call "
1925  "this method with posts outstanding.");
1926 
1927  // doReversePosts() accepts the exports and imports arrays as
1928  // ArrayRCPs, requiring that the memory location is persisting (as
1929  // is necessary for nonblocking receives). However, it need only
1930  // persist until doReverseWaits() completes, so it is safe for us
1931  // to use a nonpersisting reference in this case. The use of a
1932  // nonpersisting reference is purely a performance optimization.
1933 
1934  // mfh 02 Apr 2012: For some reason, calling arcp<const Packet>
1935  // for Packet=std::complex<double> fails to compile with some
1936  // versions of GCC. The issue only arises with the exports array.
1937  // This is why we construct a separate nonowning ArrayRCP.
1938  typedef typename ArrayRCP<const Packet>::size_type size_type;
1939  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), as<size_type> (0),
1940  exports.size (), false);
1941  doReversePosts (exportsArcp,
1942  numExportPacketsPerLID,
1943  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1944  numImportPacketsPerLID);
1945  doReverseWaits ();
1946 
1947  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1948  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1949  }
1950 
1951  template <class Packet>
1952  void Distributor::
1953  doReversePosts (const Teuchos::ArrayRCP<const Packet>& exports,
1954  size_t numPackets,
1955  const Teuchos::ArrayRCP<Packet>& imports)
1956  {
1957  // FIXME (mfh 29 Mar 2012) WHY?
1958  TEUCHOS_TEST_FOR_EXCEPTION(
1959  ! indicesTo_.empty (), std::runtime_error,
1960  "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
1961  "communication when original data are blocked by process.");
1962  if (reverseDistributor_.is_null ()) {
1963  createReverseDistributor ();
1964  }
1965  reverseDistributor_->doPosts (exports, numPackets, imports);
1966  }
1967 
1968  template <class Packet>
1969  void Distributor::
1970  doReversePosts (const Teuchos::ArrayRCP<const Packet>& exports,
1971  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1972  const Teuchos::ArrayRCP<Packet>& imports,
1973  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1974  {
1975  // FIXME (mfh 29 Mar 2012) WHY?
1976  TEUCHOS_TEST_FOR_EXCEPTION(
1977  ! indicesTo_.empty (), std::runtime_error,
1978  "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
1979  "communication when original data are blocked by process.");
1980  if (reverseDistributor_.is_null ()) {
1981  createReverseDistributor ();
1982  }
1983  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
1984  imports, numImportPacketsPerLID);
1985  }
1986 
1987  template <class ExpView, class ImpView>
1988  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
1989  Distributor::
1990  doPostsAndWaits (const ExpView& exports,
1991  size_t numPackets,
1992  const ImpView& imports)
1993  {
1994  using Teuchos::RCP;
1995  using Teuchos::rcp;
1996  using std::endl;
1997 
1998  RCP<Teuchos::OSTab> tab0, tab1;
1999  if (verbose_) {
2000  tab0 = rcp (new Teuchos::OSTab (out_));
2001  const int myRank = comm_->getRank ();
2002  std::ostringstream os;
2003  os << "Proc " << myRank
2004  << ": Distributor::doPostsAndWaits(3 args, Kokkos): "
2005  << "{sendType: " << DistributorSendTypeEnumToString (sendType_)
2006  << ", barrierBetween: " << barrierBetween_ << "}" << endl;
2007  *out_ << os.str ();
2008  tab1 = rcp (new Teuchos::OSTab (out_));
2009  }
2010 
2011  TEUCHOS_TEST_FOR_EXCEPTION(
2012  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
2013  "doPostsAndWaits(3 args): There are " << requests_.size () <<
2014  " outstanding nonblocking messages pending. It is incorrect to call "
2015  "this method with posts outstanding.");
2016 
2017  if (verbose_) {
2018  const int myRank = comm_->getRank ();
2019  std::ostringstream os;
2020  os << "Proc " << myRank
2021  << ": Distributor::doPostsAndWaits: Call doPosts" << endl;
2022  *out_ << os.str ();
2023  }
2024  doPosts (exports, numPackets, imports);
2025  if (verbose_) {
2026  const int myRank = comm_->getRank ();
2027  std::ostringstream os;
2028  os << "Proc " << myRank
2029  << ": Distributor::doPostsAndWaits: Call doWaits" << endl;
2030  *out_ << os.str ();
2031  }
2032  doWaits ();
2033  }
2034 
2035  template <class ExpView, class ImpView>
2036  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2037  Distributor::
2038  doPostsAndWaits (const ExpView& exports,
2039  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2040  const ImpView& imports,
2041  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2042  {
2043  TEUCHOS_TEST_FOR_EXCEPTION(
2044  requests_.size () != 0, std::runtime_error,
2045  "Tpetra::Distributor::doPostsAndWaits(4 args): There are "
2046  << requests_.size () << " outstanding nonblocking messages pending. "
2047  "It is incorrect to call this method with posts outstanding.");
2048 
2049  doPosts (exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
2050  doWaits ();
2051  }
2052 
2053 
2054  template <class ExpView, class ImpView>
2055  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2056  Distributor::
2057  doPosts (const ExpView &exports,
2058  size_t numPackets,
2059  const ImpView &imports)
2060  {
2061  using Teuchos::Array;
2062  using Teuchos::as;
2063  using Teuchos::FancyOStream;
2064  using Teuchos::includesVerbLevel;
2065  using Teuchos::ireceive;
2066  using Teuchos::isend;
2067  using Teuchos::OSTab;
2068  using Teuchos::readySend;
2069  using Teuchos::send;
2070  using Teuchos::ssend;
2071  using Teuchos::TypeNameTraits;
2072  using Teuchos::typeName;
2073  using std::endl;
2074  using Kokkos::Compat::create_const_view;
2075  using Kokkos::Compat::create_view;
2076  using Kokkos::Compat::subview_offset;
2077  using Kokkos::Compat::deep_copy_offset;
2078  typedef Array<size_t>::size_type size_type;
2079  typedef ExpView exports_view_type;
2080  typedef ImpView imports_view_type;
2081 
2082 #ifdef KOKKOS_ENABLE_CUDA
2083  static_assert (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
2084  ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
2085  "Please do not use Tpetra::Distributor with UVM "
2086  "allocations. See GitHub issue #1088.");
2087 #endif // KOKKOS_ENABLE_CUDA
2088 
2089 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2090  Teuchos::TimeMonitor timeMon (*timer_doPosts3_);
2091 #endif // TPETRA_DISTRIBUTOR_TIMERS
2092 
2093  const int myRank = comm_->getRank ();
2094  // Run-time configurable parameters that come from the input
2095  // ParameterList set by setParameterList().
2096  const Details::EDistributorSendType sendType = sendType_;
2097  const bool doBarrier = barrierBetween_;
2098 
2099  Teuchos::OSTab tab0 (out_);
2100  if (verbose_) {
2101  std::ostringstream os;
2102  os << "Proc " << myRank
2103  << ": Distributor::doPosts(3 args, Kokkos)" << endl;
2104  *out_ << os.str ();
2105  }
2106  Teuchos::OSTab tab1 (out_);
2107 
2108  TEUCHOS_TEST_FOR_EXCEPTION(
2109  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
2110  std::logic_error,
2111  "Tpetra::Distributor::doPosts(3 args, Kokkos): Ready-send version "
2112  "requires a barrier between posting receives and posting ready sends. "
2113  "This should have been checked before. "
2114  "Please report this bug to the Tpetra developers.");
2115 
2116  size_t selfReceiveOffset = 0;
2117 
2118  // mfh 30 Mar 2016: See Github Issue #227 to see why we need to
2119  // check whether we're doing reverse mode before checking the
2120  // length of the imports array.
2121  if (false /* howInitialized_ != Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE */) {
2122  // Each message has the same number of packets.
2123  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
2124 
2125  if (verbose_) {
2126  std::ostringstream os;
2127  os << "Proc " << myRank << ": doPosts: totalNumImportPackets = " <<
2128  totalNumImportPackets << " = " << totalReceiveLength_ << " * " <<
2129  numPackets << "; imports.extent(0) = " << imports.extent (0)
2130  << endl;
2131  *out_ << os.str ();
2132  }
2133 
2134 #ifdef HAVE_TPETRA_DEBUG
2135  // mfh 31 Mar 2016: Extra special all-reduce check to help diagnose #227.
2136  {
2137  const size_t importBufSize = static_cast<size_t> (imports.extent (0));
2138  const int lclBad = (importBufSize < totalNumImportPackets) ? 1 : 0;
2139  int gblBad = 0;
2140  using Teuchos::reduceAll;
2141  using Teuchos::REDUCE_MAX;
2142  using Teuchos::outArg;
2143  reduceAll (*comm_, REDUCE_MAX, lclBad, outArg (gblBad));
2144  TEUCHOS_TEST_FOR_EXCEPTION
2145  (gblBad != 0,
2146  std::runtime_error,
2147  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2148  "On one or more MPI processes, the 'imports' array "
2149  "does not have enough entries to hold the expected number of "
2150  "import packets. ");
2151  }
2152 #else
2153  TEUCHOS_TEST_FOR_EXCEPTION
2154  (static_cast<size_t> (imports.extent (0)) < totalNumImportPackets,
2155  std::runtime_error,
2156  "Tpetra::Distributor::doPosts(3 args, Kokkos): The 'imports' "
2157  "array must have enough entries to hold the expected number of import "
2158  "packets. imports.extent(0) = " << imports.extent (0) << " < "
2159  "totalNumImportPackets = " << totalNumImportPackets << " = "
2160  "totalReceiveLength_ (" << totalReceiveLength_ << ") * numPackets ("
2161  << numPackets << ").");
2162 #endif // HAVE_TPETRA_DEBUG
2163  }
2164 
2165  // MPI tag for nonblocking receives and blocking sends in this
2166  // method. Some processes might take the "fast" path
2167  // (indicesTo_.empty()) and others might take the "slow" path for
2168  // the same doPosts() call, so the path tag must be the same for
2169  // both.
2170  const int pathTag = 0;
2171  const int tag = this->getTag (pathTag);
2172 
2173 #ifdef HAVE_TPETRA_DEBUG
2174  TEUCHOS_TEST_FOR_EXCEPTION
2175  (requests_.size () != 0,
2176  std::logic_error,
2177  "Tpetra::Distributor::doPosts(3 args, Kokkos): Process "
2178  << myRank << ": requests_.size() = " << requests_.size () << " != 0.");
2179 #endif // HAVE_TPETRA_DEBUG
2180 
2181  // Distributor uses requests_.size() as the number of outstanding
2182  // nonblocking message requests, so we resize to zero to maintain
2183  // this invariant.
2184  //
2185  // numReceives_ does _not_ include the self message, if there is
2186  // one. Here, we do actually send a message to ourselves, so we
2187  // include any self message in the "actual" number of receives to
2188  // post.
2189  //
2190  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2191  // doesn't (re)allocate its array of requests. That happens in
2192  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2193  // demand), or Resize_().
2194  const size_type actualNumReceives = as<size_type> (numReceives_) +
2195  as<size_type> (selfMessage_ ? 1 : 0);
2196  requests_.resize (0);
2197 
2198  if (verbose_) {
2199  std::ostringstream os;
2200  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, "
2201  << (indicesTo_.empty () ? "fast" : "slow") << "): Post receives"
2202  << endl;
2203  *out_ << os.str ();
2204  }
2205 
2206  // Post the nonblocking receives. It's common MPI wisdom to post
2207  // receives before sends. In MPI terms, this means favoring
2208  // adding to the "posted queue" (of receive requests) over adding
2209  // to the "unexpected queue" (of arrived messages not yet matched
2210  // with a receive).
2211  {
2212 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2213  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3_recvs_);
2214 #endif // TPETRA_DISTRIBUTOR_TIMERS
2215 
2216  size_t curBufferOffset = 0;
2217  for (size_type i = 0; i < actualNumReceives; ++i) {
2218  const size_t curBufLen = lengthsFrom_[i] * numPackets;
2219  if (procsFrom_[i] != myRank) {
2220  if (verbose_) {
2221  std::ostringstream os;
2222  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, "
2223  << (indicesTo_.empty () ? "fast" : "slow") << "): "
2224  << "Post irecv: {source: " << procsFrom_[i]
2225  << ", tag: " << tag << "}" << endl;
2226  *out_ << os.str ();
2227  }
2228  // If my process is receiving these packet(s) from another
2229  // process (not a self-receive):
2230  //
2231  // 1. Set up the persisting view (recvBuf) of the imports
2232  // array, given the offset and size (total number of
2233  // packets from process procsFrom_[i]).
2234  // 2. Start the Irecv and save the resulting request.
2235  TEUCHOS_TEST_FOR_EXCEPTION(
2236  curBufferOffset + curBufLen > static_cast<size_t> (imports.size ()),
2237  std::logic_error, "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2238  "Exceeded size of 'imports' array in packing loop on Process " <<
2239  myRank << ". imports.size() = " << imports.size () << " < "
2240  "curBufferOffset(" << curBufferOffset << ") + curBufLen(" <<
2241  curBufLen << ").");
2242  imports_view_type recvBuf =
2243  subview_offset (imports, curBufferOffset, curBufLen);
2244  requests_.push_back (ireceive<int> (recvBuf, procsFrom_[i],
2245  tag, *comm_));
2246  }
2247  else { // Receiving from myself
2248  selfReceiveOffset = curBufferOffset; // Remember the self-recv offset
2249  }
2250  curBufferOffset += curBufLen;
2251  }
2252  }
2253 
2254  if (doBarrier) {
2255 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2256  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3_barrier_);
2257 #endif // TPETRA_DISTRIBUTOR_TIMERS
2258 
2259  if (verbose_) {
2260  std::ostringstream os;
2261  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, "
2262  << (indicesTo_.empty () ? "fast" : "slow") << "): Barrier" << endl;
2263  *out_ << os.str ();
2264  }
2265  // If we are using ready sends (MPI_Rsend) below, we need to do
2266  // a barrier before we post the ready sends. This is because a
2267  // ready send requires that its matching receive has already
2268  // been posted before the send has been posted. The only way to
2269  // guarantee that in this case is to use a barrier.
2270  comm_->barrier ();
2271  }
2272 
2273 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2274  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3_sends_);
2275 #endif // TPETRA_DISTRIBUTOR_TIMERS
2276 
2277  // setup scan through procsTo_ list starting with higher numbered procs
2278  // (should help balance message traffic)
2279  //
2280  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
2281  // It doesn't depend on the input at all.
2282  size_t numBlocks = numSends_ + selfMessage_;
2283  size_t procIndex = 0;
2284  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myRank)) {
2285  ++procIndex;
2286  }
2287  if (procIndex == numBlocks) {
2288  procIndex = 0;
2289  }
2290 
2291  size_t selfNum = 0;
2292  size_t selfIndex = 0;
2293 
2294  if (verbose_) {
2295  std::ostringstream os;
2296  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, "
2297  << (indicesTo_.empty () ? "fast" : "slow") << "): Post sends" << endl;
2298  *out_ << os.str ();
2299  }
2300 
2301  if (indicesTo_.empty()) {
2302  if (verbose_) {
2303  std::ostringstream os;
2304  os << "Proc " << myRank
2305  << ": doPosts(3 args, Kokkos, fast): posting sends" << endl;
2306  *out_ << os.str ();
2307  }
2308 
2309  // Data are already blocked (laid out) by process, so we don't
2310  // need a separate send buffer (besides the exports array).
2311  for (size_t i = 0; i < numBlocks; ++i) {
2312  size_t p = i + procIndex;
2313  if (p > (numBlocks - 1)) {
2314  p -= numBlocks;
2315  }
2316 
2317  if (procsTo_[p] != myRank) {
2318  if (verbose_) {
2319  std::ostringstream os;
2320  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, fast): Post send: "
2321  "{target: " << procsTo_[p] << ", tag: " << tag << "}" << endl;
2322  *out_ << os.str ();
2323  }
2324 
2325  exports_view_type tmpSend = subview_offset(
2326  exports, startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
2327 
2328  if (sendType == Details::DISTRIBUTOR_SEND) {
2329  send<int> (tmpSend,
2330  as<int> (tmpSend.size ()),
2331  procsTo_[p], tag, *comm_);
2332  }
2333  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2334  exports_view_type tmpSendBuf =
2335  subview_offset (exports, startsTo_[p] * numPackets,
2336  lengthsTo_[p] * numPackets);
2337  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2338  tag, *comm_));
2339  }
2340  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2341  readySend<int> (tmpSend,
2342  as<int> (tmpSend.size ()),
2343  procsTo_[p], tag, *comm_);
2344  }
2345  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2346  ssend<int> (tmpSend,
2347  as<int> (tmpSend.size ()),
2348  procsTo_[p], tag, *comm_);
2349  } else {
2350  TEUCHOS_TEST_FOR_EXCEPTION(
2351  true,
2352  std::logic_error,
2353  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2354  "Invalid send type. We should never get here. "
2355  "Please report this bug to the Tpetra developers.");
2356  }
2357  }
2358  else { // "Sending" the message to myself
2359  selfNum = p;
2360  }
2361  }
2362 
2363  if (selfMessage_) {
2364  if (verbose_) {
2365  std::ostringstream os;
2366  os << "Proc " << myRank
2367  << ": doPosts(3 args, Kokkos, fast): Self-send" << endl;
2368  *out_ << os.str ();
2369  }
2370  // This is how we "send a message to ourself": we copy from
2371  // the export buffer to the import buffer. That saves
2372  // Teuchos::Comm implementations other than MpiComm (in
2373  // particular, SerialComm) the trouble of implementing self
2374  // messages correctly. (To do this right, SerialComm would
2375  // need internal buffer space for messages, keyed on the
2376  // message's tag.)
2377  deep_copy_offset(imports, exports, selfReceiveOffset,
2378  startsTo_[selfNum]*numPackets,
2379  lengthsTo_[selfNum]*numPackets);
2380  }
2381  if (verbose_) {
2382  std::ostringstream os;
2383  os << "Proc " << myRank << ": doPosts(3 args, Kokkos, fast) done" << endl;
2384  *out_ << os.str ();
2385  }
2386  }
2387  else { // data are not blocked by proc, use send buffer
2388  if (verbose_) {
2389  std::ostringstream os;
2390  os << "Proc " << myRank
2391  << ": doPosts(3 args, Kokkos, slow): posting sends" << endl;
2392  *out_ << os.str ();
2393  }
2394 
2395  typedef typename ExpView::non_const_value_type Packet;
2396  typedef typename ExpView::array_layout Layout;
2397  typedef typename ExpView::device_type Device;
2398  typedef typename ExpView::memory_traits Mem;
2399  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray",
2400  maxSendLength_ * numPackets);
2401 
2402  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
2403  // sends), because the buffer is only long enough for one send.
2404  TEUCHOS_TEST_FOR_EXCEPTION(
2405  sendType == Details::DISTRIBUTOR_ISEND,
2406  std::logic_error,
2407  "Tpetra::Distributor::doPosts(3 args, Kokkos): The \"send buffer\" code path "
2408  "doesn't currently work with nonblocking sends.");
2409 
2410  for (size_t i = 0; i < numBlocks; ++i) {
2411  size_t p = i + procIndex;
2412  if (p > (numBlocks - 1)) {
2413  p -= numBlocks;
2414  }
2415 
2416  if (procsTo_[p] != myRank) {
2417  if (verbose_) {
2418  std::ostringstream os;
2419  os << "Proc " << myRank
2420  << ": doPosts(3 args, Kokkos, slow): Post send: {target: "
2421  << procsTo_[p] << ", tag: " << tag << "}" << endl;
2422  *out_ << os.str ();
2423  }
2424 
2425  size_t sendArrayOffset = 0;
2426  size_t j = startsTo_[p];
2427  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2428  deep_copy_offset(sendArray, exports, sendArrayOffset,
2429  indicesTo_[j]*numPackets, numPackets);
2430  sendArrayOffset += numPackets;
2431  }
2432  ImpView tmpSend =
2433  subview_offset(sendArray, size_t(0), lengthsTo_[p]*numPackets);
2434 
2435  if (sendType == Details::DISTRIBUTOR_SEND) {
2436  send<int> (tmpSend,
2437  as<int> (tmpSend.size ()),
2438  procsTo_[p], tag, *comm_);
2439  }
2440  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2441  exports_view_type tmpSendBuf =
2442  subview_offset (sendArray, size_t(0), lengthsTo_[p] * numPackets);
2443  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2444  tag, *comm_));
2445  }
2446  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2447  readySend<int> (tmpSend,
2448  as<int> (tmpSend.size ()),
2449  procsTo_[p], tag, *comm_);
2450  }
2451  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2452  ssend<int> (tmpSend,
2453  as<int> (tmpSend.size ()),
2454  procsTo_[p], tag, *comm_);
2455  }
2456  else {
2457  TEUCHOS_TEST_FOR_EXCEPTION(
2458  true,
2459  std::logic_error,
2460  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2461  "Invalid send type. We should never get here. "
2462  "Please report this bug to the Tpetra developers.");
2463  }
2464  }
2465  else { // "Sending" the message to myself
2466  selfNum = p;
2467  selfIndex = startsTo_[p];
2468  }
2469  }
2470 
2471  if (selfMessage_) {
2472  if (verbose_) {
2473  std::ostringstream os;
2474  os << "Proc " << myRank
2475  << ": doPosts(3 args, Kokkos, slow): Self-send" << endl;
2476  *out_ << os.str ();
2477  }
2478  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2479  deep_copy_offset(imports, exports, selfReceiveOffset,
2480  indicesTo_[selfIndex]*numPackets, numPackets);
2481  ++selfIndex;
2482  selfReceiveOffset += numPackets;
2483  }
2484  }
2485  if (verbose_) {
2486  std::ostringstream os;
2487  os << "Proc " << myRank
2488  << ": doPosts(3 args, Kokkos, slow) done" << endl;
2489  *out_ << os.str ();
2490  }
2491  }
2492 
2493  if (verbose_) {
2494  std::ostringstream os;
2495  os << "Proc " << myRank << ": doPosts done" << endl;
2496  *out_ << os.str ();
2497  }
2498  }
2499 
2500  template <class ExpView, class ImpView>
2501  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2502  Distributor::
2503  doPosts (const ExpView &exports,
2504  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2505  const ImpView &imports,
2506  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2507  {
2508  using Teuchos::Array;
2509  using Teuchos::as;
2510  using Teuchos::ireceive;
2511  using Teuchos::isend;
2512  using Teuchos::readySend;
2513  using Teuchos::send;
2514  using Teuchos::ssend;
2515  using Teuchos::TypeNameTraits;
2516 #ifdef HAVE_TEUCHOS_DEBUG
2517  using Teuchos::OSTab;
2518 #endif // HAVE_TEUCHOS_DEBUG
2519  using std::endl;
2520  using Kokkos::Compat::create_const_view;
2521  using Kokkos::Compat::create_view;
2522  using Kokkos::Compat::subview_offset;
2523  using Kokkos::Compat::deep_copy_offset;
2524  typedef Array<size_t>::size_type size_type;
2525  typedef ExpView exports_view_type;
2526  typedef ImpView imports_view_type;
2527 
2528 #ifdef KOKKOS_ENABLE_CUDA
2529  static_assert (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
2530  ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
2531  "Please do not use Tpetra::Distributor with UVM "
2532  "allocations. See GitHub issue #1088.");
2533 #endif // KOKKOS_ENABLE_CUDA
2534 
2535  Teuchos::OSTab tab (out_);
2536 
2537 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2538  Teuchos::TimeMonitor timeMon (*timer_doPosts4_);
2539 #endif // TPETRA_DISTRIBUTOR_TIMERS
2540 
2541  // Run-time configurable parameters that come from the input
2542  // ParameterList set by setParameterList().
2543  const Details::EDistributorSendType sendType = sendType_;
2544  const bool doBarrier = barrierBetween_;
2545 
2546 // #ifdef HAVE_TEUCHOS_DEBUG
2547 // // Prepare for verbose output, if applicable.
2548 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
2549 // RCP<Teuchos::FancyOStream> out = this->getOStream ();
2550 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
2551 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
2552 
2553 // if (doPrint) {
2554 // // Only need one process to print out parameters.
2555 // *out << "Distributor::doPosts (4 args)" << endl;
2556 // }
2557 // // Add one tab level. We declare this outside the doPrint scopes
2558 // // so that the tab persists until the end of this method.
2559 // Teuchos::OSTab tab = this->getOSTab ();
2560 // if (doPrint) {
2561 // *out << "Parameters:" << endl;
2562 // {
2563 // OSTab tab2 (out);
2564 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
2565 // << endl << "barrierBetween: " << doBarrier << endl;
2566 // }
2567 // }
2568 // #endif // HAVE_TEUCHOS_DEBUG
2569 
2570  TEUCHOS_TEST_FOR_EXCEPTION(
2571  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
2572  std::logic_error, "Tpetra::Distributor::doPosts(4 args, Kokkos): Ready-send "
2573  "version requires a barrier between posting receives and posting ready "
2574  "sends. This should have been checked before. "
2575  "Please report this bug to the Tpetra developers.");
2576 
2577  const int myProcID = comm_->getRank ();
2578  size_t selfReceiveOffset = 0;
2579 
2580 #ifdef HAVE_TEUCHOS_DEBUG
2581  // Different messages may have different numbers of packets.
2582  size_t totalNumImportPackets = 0;
2583  for (size_type ii = 0; ii < numImportPacketsPerLID.size (); ++ii) {
2584  totalNumImportPackets += numImportPacketsPerLID[ii];
2585  }
2586  TEUCHOS_TEST_FOR_EXCEPTION(
2587  imports.extent (0) < totalNumImportPackets, std::runtime_error,
2588  "Tpetra::Distributor::doPosts(4 args, Kokkos): The 'imports' array must have "
2589  "enough entries to hold the expected number of import packets. "
2590  "imports.extent(0) = " << imports.extent (0) << " < "
2591  "totalNumImportPackets = " << totalNumImportPackets << ".");
2592 #endif // HAVE_TEUCHOS_DEBUG
2593 
2594  // MPI tag for nonblocking receives and blocking sends in this
2595  // method. Some processes might take the "fast" path
2596  // (indicesTo_.empty()) and others might take the "slow" path for
2597  // the same doPosts() call, so the path tag must be the same for
2598  // both.
2599  const int pathTag = 1;
2600  const int tag = this->getTag (pathTag);
2601 
2602 #ifdef HAVE_TEUCHOS_DEBUG
2603  TEUCHOS_TEST_FOR_EXCEPTION
2604  (requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
2605  "doPosts(4 args, Kokkos): Process " << myProcID << ": requests_.size () = "
2606  << requests_.size () << " != 0.");
2607 #endif // HAVE_TEUCHOS_DEBUG
2608  if (verbose_) {
2609  std::ostringstream os;
2610  os << "Proc " << myProcID << ": doPosts(4 args, Kokkos, "
2611  << (indicesTo_.empty () ? "fast" : "slow") << ")" << endl;
2612  *out_ << os.str ();
2613  }
2614 
2615  // Distributor uses requests_.size() as the number of outstanding
2616  // nonblocking message requests, so we resize to zero to maintain
2617  // this invariant.
2618  //
2619  // numReceives_ does _not_ include the self message, if there is
2620  // one. Here, we do actually send a message to ourselves, so we
2621  // include any self message in the "actual" number of receives to
2622  // post.
2623  //
2624  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2625  // doesn't (re)allocate its array of requests. That happens in
2626  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2627  // demand), or Resize_().
2628  const size_type actualNumReceives = as<size_type> (numReceives_) +
2629  as<size_type> (selfMessage_ ? 1 : 0);
2630  requests_.resize (0);
2631 
2632  // Post the nonblocking receives. It's common MPI wisdom to post
2633  // receives before sends. In MPI terms, this means favoring
2634  // adding to the "posted queue" (of receive requests) over adding
2635  // to the "unexpected queue" (of arrived messages not yet matched
2636  // with a receive).
2637  {
2638 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2639  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4_recvs_);
2640 #endif // TPETRA_DISTRIBUTOR_TIMERS
2641 
2642  size_t curBufferOffset = 0;
2643  size_t curLIDoffset = 0;
2644  for (size_type i = 0; i < actualNumReceives; ++i) {
2645  size_t totalPacketsFrom_i = 0;
2646  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
2647  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
2648  }
2649  curLIDoffset += lengthsFrom_[i];
2650  if (procsFrom_[i] != myProcID && totalPacketsFrom_i) {
2651  // If my process is receiving these packet(s) from another
2652  // process (not a self-receive), and if there is at least
2653  // one packet to receive:
2654  //
2655  // 1. Set up the persisting view (recvBuf) into the imports
2656  // array, given the offset and size (total number of
2657  // packets from process procsFrom_[i]).
2658  // 2. Start the Irecv and save the resulting request.
2659  imports_view_type recvBuf =
2660  subview_offset (imports, curBufferOffset, totalPacketsFrom_i);
2661  requests_.push_back (ireceive<int> (recvBuf, procsFrom_[i],
2662  tag, *comm_));
2663  }
2664  else { // Receiving these packet(s) from myself
2665  selfReceiveOffset = curBufferOffset; // Remember the offset
2666  }
2667  curBufferOffset += totalPacketsFrom_i;
2668  }
2669  }
2670 
2671  if (doBarrier) {
2672 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2673  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4_barrier_);
2674 #endif // TPETRA_DISTRIBUTOR_TIMERS
2675  // If we are using ready sends (MPI_Rsend) below, we need to do
2676  // a barrier before we post the ready sends. This is because a
2677  // ready send requires that its matching receive has already
2678  // been posted before the send has been posted. The only way to
2679  // guarantee that in this case is to use a barrier.
2680  comm_->barrier ();
2681  }
2682 
2683 #ifdef TPETRA_DISTRIBUTOR_TIMERS
2684  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4_sends_);
2685 #endif // TPETRA_DISTRIBUTOR_TIMERS
2686 
2687  // setup arrays containing starting-offsets into exports for each send,
2688  // and num-packets-to-send for each send.
2689  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
2690  size_t maxNumPackets = 0;
2691  size_t curPKToffset = 0;
2692  for (size_t pp=0; pp<numSends_; ++pp) {
2693  sendPacketOffsets[pp] = curPKToffset;
2694  size_t numPackets = 0;
2695  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
2696  numPackets += numExportPacketsPerLID[j];
2697  }
2698  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
2699  packetsPerSend[pp] = numPackets;
2700  curPKToffset += numPackets;
2701  }
2702 
2703  // setup scan through procsTo_ list starting with higher numbered procs
2704  // (should help balance message traffic)
2705  size_t numBlocks = numSends_+ selfMessage_;
2706  size_t procIndex = 0;
2707  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myProcID)) {
2708  ++procIndex;
2709  }
2710  if (procIndex == numBlocks) {
2711  procIndex = 0;
2712  }
2713 
2714  size_t selfNum = 0;
2715  size_t selfIndex = 0;
2716  if (indicesTo_.empty()) {
2717  if (verbose_) {
2718  std::ostringstream os;
2719  os << "Proc " << myProcID
2720  << ": doPosts(4 args, Kokkos, fast): posting sends" << endl;
2721  *out_ << os.str ();
2722  }
2723 
2724  // Data are already blocked (laid out) by process, so we don't
2725  // need a separate send buffer (besides the exports array).
2726  for (size_t i = 0; i < numBlocks; ++i) {
2727  size_t p = i + procIndex;
2728  if (p > (numBlocks - 1)) {
2729  p -= numBlocks;
2730  }
2731 
2732  if (procsTo_[p] != myProcID && packetsPerSend[p] > 0) {
2733  exports_view_type tmpSend =
2734  subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]);
2735 
2736  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
2737  send<int> (tmpSend,
2738  as<int> (tmpSend.size ()),
2739  procsTo_[p], tag, *comm_);
2740  }
2741  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2742  readySend<int> (tmpSend,
2743  as<int> (tmpSend.size ()),
2744  procsTo_[p], tag, *comm_);
2745  }
2746  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2747  exports_view_type tmpSendBuf =
2748  subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]);
2749  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2750  tag, *comm_));
2751  }
2752  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2753  ssend<int> (tmpSend,
2754  as<int> (tmpSend.size ()),
2755  procsTo_[p], tag, *comm_);
2756  }
2757  else {
2758  TEUCHOS_TEST_FOR_EXCEPTION(
2759  true, std::logic_error,
2760  "Tpetra::Distributor::doPosts(4 args, Kokkos): "
2761  "Invalid send type. We should never get here. "
2762  "Please report this bug to the Tpetra developers.");
2763  }
2764  }
2765  else { // "Sending" the message to myself
2766  selfNum = p;
2767  }
2768  }
2769 
2770  if (selfMessage_) {
2771  deep_copy_offset(imports, exports, selfReceiveOffset,
2772  sendPacketOffsets[selfNum], packetsPerSend[selfNum]);
2773  }
2774  if (verbose_) {
2775  std::ostringstream os;
2776  os << "Proc " << myProcID << ": doPosts(4 args, Kokkos, fast) done" << endl;
2777  *out_ << os.str ();
2778  }
2779  }
2780  else { // data are not blocked by proc, use send buffer
2781  if (verbose_) {
2782  std::ostringstream os;
2783  os << "Proc " << myProcID << ": doPosts(4 args, Kokkos, slow): posting sends" << endl;
2784  *out_ << os.str ();
2785  }
2786 
2787  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
2788  typedef typename ExpView::non_const_value_type Packet;
2789  typedef typename ExpView::array_layout Layout;
2790  typedef typename ExpView::device_type Device;
2791  typedef typename ExpView::memory_traits Mem;
2792  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray", maxNumPackets); // send buffer
2793 
2794  TEUCHOS_TEST_FOR_EXCEPTION(
2795  sendType == Details::DISTRIBUTOR_ISEND,
2796  std::logic_error,
2797  "Tpetra::Distributor::doPosts(4 args, Kokkos): "
2798  "The \"send buffer\" code path may not necessarily work with nonblocking sends.");
2799 
2800  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
2801  size_t ioffset = 0;
2802  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
2803  indicesOffsets[j] = ioffset;
2804  ioffset += numExportPacketsPerLID[j];
2805  }
2806 
2807  for (size_t i = 0; i < numBlocks; ++i) {
2808  size_t p = i + procIndex;
2809  if (p > (numBlocks - 1)) {
2810  p -= numBlocks;
2811  }
2812 
2813  if (procsTo_[p] != myProcID) {
2814  size_t sendArrayOffset = 0;
2815  size_t j = startsTo_[p];
2816  size_t numPacketsTo_p = 0;
2817  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2818  numPacketsTo_p += numExportPacketsPerLID[j];
2819  deep_copy_offset(sendArray, exports, sendArrayOffset,
2820  indicesOffsets[j], numExportPacketsPerLID[j]);
2821  sendArrayOffset += numExportPacketsPerLID[j];
2822  }
2823  if (numPacketsTo_p > 0) {
2824  ImpView tmpSend =
2825  subview_offset(sendArray, size_t(0), numPacketsTo_p);
2826 
2827  if (sendType == Details::DISTRIBUTOR_RSEND) {
2828  readySend<int> (tmpSend,
2829  as<int> (tmpSend.size ()),
2830  procsTo_[p], tag, *comm_);
2831  }
2832  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2833  exports_view_type tmpSendBuf =
2834  subview_offset (sendArray, size_t(0), numPacketsTo_p);
2835  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2836  tag, *comm_));
2837  }
2838  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2839  ssend<int> (tmpSend,
2840  as<int> (tmpSend.size ()),
2841  procsTo_[p], tag, *comm_);
2842  }
2843  else { // if (sendType == Details::DISTRIBUTOR_SSEND)
2844  send<int> (tmpSend,
2845  as<int> (tmpSend.size ()),
2846  procsTo_[p], tag, *comm_);
2847  }
2848  }
2849  }
2850  else { // "Sending" the message to myself
2851  selfNum = p;
2852  selfIndex = startsTo_[p];
2853  }
2854  }
2855 
2856  if (selfMessage_) {
2857  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2858  deep_copy_offset(imports, exports, selfReceiveOffset,
2859  indicesOffsets[selfIndex],
2860  numExportPacketsPerLID[selfIndex]);
2861  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
2862  ++selfIndex;
2863  }
2864  }
2865  if (verbose_) {
2866  std::ostringstream os;
2867  os << "Proc " << myProcID
2868  << ": doPosts(4 args, Kokkos, slow) done" << endl;
2869  *out_ << os.str ();
2870  }
2871  }
2872  }
2873 
2874  template <class ExpView, class ImpView>
2875  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2876  Distributor::
2877  doReversePostsAndWaits (const ExpView& exports,
2878  size_t numPackets,
2879  const ImpView& imports)
2880  {
2881  doReversePosts (exports, numPackets, imports);
2882  doReverseWaits ();
2883  }
2884 
2885  template <class ExpView, class ImpView>
2886  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2887  Distributor::
2888  doReversePostsAndWaits (const ExpView& exports,
2889  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2890  const ImpView& imports,
2891  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2892  {
2893  TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
2894  "Tpetra::Distributor::doReversePostsAndWaits(4 args): There are "
2895  << requests_.size() << " outstanding nonblocking messages pending. It "
2896  "is incorrect to call this method with posts outstanding.");
2897 
2898  doReversePosts (exports, numExportPacketsPerLID, imports,
2899  numImportPacketsPerLID);
2900  doReverseWaits ();
2901  }
2902 
2903  template <class ExpView, class ImpView>
2904  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2905  Distributor::
2906  doReversePosts (const ExpView &exports,
2907  size_t numPackets,
2908  const ImpView &imports)
2909  {
2910  // FIXME (mfh 29 Mar 2012) WHY?
2911  TEUCHOS_TEST_FOR_EXCEPTION(
2912  ! indicesTo_.empty (), std::runtime_error,
2913  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2914  "reverse communication when original data are blocked by process.");
2915  if (reverseDistributor_.is_null ()) {
2916  createReverseDistributor ();
2917  }
2918  reverseDistributor_->doPosts (exports, numPackets, imports);
2919  }
2920 
2921  template <class ExpView, class ImpView>
2922  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2923  Distributor::
2924  doReversePosts (const ExpView &exports,
2925  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2926  const ImpView &imports,
2927  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2928  {
2929  // FIXME (mfh 29 Mar 2012) WHY?
2930  TEUCHOS_TEST_FOR_EXCEPTION(
2931  ! indicesTo_.empty (), std::runtime_error,
2932  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2933  "reverse communication when original data are blocked by process.");
2934  if (reverseDistributor_.is_null ()) {
2935  createReverseDistributor ();
2936  }
2937  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
2938  imports, numImportPacketsPerLID);
2939  }
2940 
2941  template <class OrdinalType>
2942  void Distributor::
2943  computeSends (const Teuchos::ArrayView<const OrdinalType> & importGIDs,
2944  const Teuchos::ArrayView<const int> & importProcIDs,
2945  Teuchos::Array<OrdinalType> & exportGIDs,
2946  Teuchos::Array<int> & exportProcIDs)
2947  {
2948  // NOTE (mfh 19 Apr 2012): There was a note on this code saying:
2949  // "assumes that size_t >= Ordinal". The code certainly does
2950  // assume that sizeof(size_t) >= sizeof(OrdinalType) as well as
2951  // sizeof(size_t) >= sizeof(int). This is because it casts the
2952  // OrdinalType elements of importGIDs (along with their
2953  // corresponding process IDs, as int) to size_t, and does a
2954  // doPostsAndWaits<size_t>() to send the packed data.
2955  using Teuchos::Array;
2956  using Teuchos::ArrayView;
2957  using std::endl;
2958  typedef typename ArrayView<const OrdinalType>::size_type size_type;
2959 
2960  Teuchos::OSTab tab (out_);
2961  const int myRank = comm_->getRank ();
2962  if (verbose_) {
2963  std::ostringstream os;
2964  os << "Proc " << myRank << ": computeSends" << endl;
2965  *out_ << os.str ();
2966  }
2967 
2968  TEUCHOS_TEST_FOR_EXCEPTION(
2969  importGIDs.size () != importProcIDs.size (), std::invalid_argument,
2970  "Tpetra::Distributor::computeSends: On Process " << myRank << ": "
2971  "importProcIDs.size() = " << importProcIDs.size ()
2972  << " != importGIDs.size() = " << importGIDs.size () << ".");
2973 
2974  const size_type numImports = importProcIDs.size ();
2975  Array<size_t> importObjs (2*numImports);
2976  // Pack pairs (importGIDs[i], my process ID) to send into importObjs.
2977  for (size_type i = 0; i < numImports; ++i) {
2978  importObjs[2*i] = static_cast<size_t> (importGIDs[i]);
2979  importObjs[2*i+1] = static_cast<size_t> (myRank);
2980  }
2981  //
2982  // Use a temporary Distributor to send the (importGIDs[i], myRank)
2983  // pairs to importProcIDs[i].
2984  //
2985  Distributor tempPlan (comm_, out_);
2986  if (verbose_) {
2987  std::ostringstream os;
2988  os << "Proc " << myRank << ": computeSends: tempPlan.createFromSends" << endl;
2989  *out_ << os.str ();
2990  }
2991 
2992  // mfh 20 Mar 2014: An extra-cautious cast from unsigned to
2993  // signed, in order to forestall any possible causes for Bug 6069.
2994  const size_t numExportsAsSizeT = tempPlan.createFromSends (importProcIDs);
2995  const size_type numExports = static_cast<size_type> (numExportsAsSizeT);
2996  TEUCHOS_TEST_FOR_EXCEPTION(
2997  numExports < 0, std::logic_error, "Tpetra::Distributor::computeSends: "
2998  "tempPlan.createFromSends() returned numExports = " << numExportsAsSizeT
2999  << " as a size_t, which overflows to " << numExports << " when cast to "
3000  << Teuchos::TypeNameTraits<size_type>::name () << ". "
3001  "Please report this bug to the Tpetra developers.");
3002  TEUCHOS_TEST_FOR_EXCEPTION(
3003  static_cast<size_type> (tempPlan.getTotalReceiveLength ()) != numExports,
3004  std::logic_error, "Tpetra::Distributor::computeSends: tempPlan.getTotal"
3005  "ReceiveLength() = " << tempPlan.getTotalReceiveLength () << " != num"
3006  "Exports = " << numExports << ". Please report this bug to the "
3007  "Tpetra developers.");
3008 
3009  if (numExports > 0) {
3010  exportGIDs.resize (numExports);
3011  exportProcIDs.resize (numExports);
3012  }
3013 
3014  // exportObjs: Packed receive buffer. (exportObjs[2*i],
3015  // exportObjs[2*i+1]) will give the (GID, PID) pair for export i,
3016  // after tempPlan.doPostsAndWaits(...) finishes below.
3017  //
3018  // FIXME (mfh 19 Mar 2014) This only works if OrdinalType fits in
3019  // size_t. This issue might come up, for example, on a 32-bit
3020  // machine using 64-bit global indices. I will add a check here
3021  // for that case.
3022  TEUCHOS_TEST_FOR_EXCEPTION(
3023  sizeof (size_t) < sizeof (OrdinalType), std::logic_error,
3024  "Tpetra::Distributor::computeSends: sizeof(size_t) = " << sizeof(size_t)
3025  << " < sizeof(" << Teuchos::TypeNameTraits<OrdinalType>::name () << ") = "
3026  << sizeof (OrdinalType) << ". This violates an assumption of the "
3027  "method. It's not hard to work around (just use Array<OrdinalType> as "
3028  "the export buffer, not Array<size_t>), but we haven't done that yet. "
3029  "Please report this bug to the Tpetra developers.");
3030 
3031  TEUCHOS_TEST_FOR_EXCEPTION(
3032  tempPlan.getTotalReceiveLength () < static_cast<size_t> (numExports),
3033  std::logic_error,
3034  "Tpetra::Distributor::computeSends: tempPlan.getTotalReceiveLength() = "
3035  << tempPlan.getTotalReceiveLength() << " < numExports = " << numExports
3036  << ". Please report this bug to the Tpetra developers.");
3037 
3038  Array<size_t> exportObjs (tempPlan.getTotalReceiveLength () * 2);
3039  if (verbose_) {
3040  std::ostringstream os;
3041  os << "Proc " << myRank << ": computeSends: tempPlan.doPostsAndWaits" << endl;
3042  *out_ << os.str ();
3043  }
3044  tempPlan.doPostsAndWaits<size_t> (importObjs (), 2, exportObjs ());
3045 
3046  // Unpack received (GID, PID) pairs into exportIDs resp. exportProcIDs.
3047  for (size_type i = 0; i < numExports; ++i) {
3048  exportGIDs[i] = static_cast<OrdinalType> (exportObjs[2*i]);
3049  exportProcIDs[i] = static_cast<int> (exportObjs[2*i+1]);
3050  }
3051 
3052  if (verbose_) {
3053  std::ostringstream os;
3054  os << "Proc " << myRank << ": computeSends done" << endl;
3055  *out_ << os.str ();
3056  }
3057  }
3058 
3059  template <class OrdinalType>
3060  void Distributor::
3061  createFromRecvs (const Teuchos::ArrayView<const OrdinalType> &remoteGIDs,
3062  const Teuchos::ArrayView<const int> &remoteProcIDs,
3063  Teuchos::Array<OrdinalType> &exportGIDs,
3064  Teuchos::Array<int> &exportProcIDs)
3065  {
3066  using std::endl;
3067 
3068  Teuchos::OSTab tab (out_);
3069  const int myRank = comm_->getRank();
3070 
3071  if (verbose_) {
3072  *out_ << "Proc " << myRank << ": createFromRecvs" << endl;
3073  }
3074 
3075 #ifdef HAVE_TPETRA_DEBUG
3076  using Teuchos::outArg;
3077  using Teuchos::reduceAll;
3078 
3079  // In debug mode, first test locally, then do an all-reduce to
3080  // make sure that all processes passed.
3081  const int errProc =
3082  (remoteGIDs.size () != remoteProcIDs.size ()) ? myRank : -1;
3083  int maxErrProc = -1;
3084  reduceAll<int, int> (*comm_, Teuchos::REDUCE_MAX, errProc, outArg (maxErrProc));
3085  TEUCHOS_TEST_FOR_EXCEPTION(maxErrProc != -1, std::runtime_error,
3086  Teuchos::typeName (*this) << "::createFromRecvs(): lists of remote IDs "
3087  "and remote process IDs must have the same size on all participating "
3088  "processes. Maximum process ID with error: " << maxErrProc << ".");
3089 #else // NOT HAVE_TPETRA_DEBUG
3090 
3091  // In non-debug mode, just test locally.
3092  TEUCHOS_TEST_FOR_EXCEPTION(
3093  remoteGIDs.size () != remoteProcIDs.size (), std::invalid_argument,
3094  Teuchos::typeName (*this) << "::createFromRecvs<" <<
3095  Teuchos::TypeNameTraits<OrdinalType>::name () << ">(): On Process " <<
3096  myRank << ": remoteGIDs.size() = " << remoteGIDs.size () << " != "
3097  "remoteProcIDs.size() = " << remoteProcIDs.size () << ".");
3098 #endif // HAVE_TPETRA_DEBUG
3099 
3100  computeSends (remoteGIDs, remoteProcIDs, exportGIDs, exportProcIDs);
3101 
3102  const size_t numProcsSendingToMe = createFromSends (exportProcIDs ());
3103 
3104  if (verbose_) {
3105  // NOTE (mfh 20 Mar 2014) If remoteProcIDs could contain
3106  // duplicates, then its length might not be the right check here,
3107  // even if we account for selfMessage_. selfMessage_ is set in
3108  // createFromSends.
3109  std::ostringstream os;
3110  os << "Proc " << myRank << ": {numProcsSendingToMe: "
3111  << numProcsSendingToMe << ", remoteProcIDs.size(): "
3112  << remoteProcIDs.size () << ", selfMessage_: "
3113  << (selfMessage_ ? "true" : "false") << "}" << std::endl;
3114  *out_ << os.str ();
3115  }
3116 
3117  if (verbose_) {
3118  *out_ << "Proc " << myRank << ": createFromRecvs done" << endl;
3119  }
3120 
3121  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
3122  }
3123 
3124 
3125 } // namespace Tpetra
3126 
3127 #endif // TPETRA_DISTRIBUTOR_HPP
void doPostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the (forward) communication plan.
size_t getNumReceives() const
The number of processes from which we will receive data.
std::string description() const
Return a one-line description of this object.
EDistributorHowInitialized
Enum indicating how and whether a Distributor was initialized.
Teuchos::RCP< const Teuchos::ParameterList > getValidParameters() const
List of valid Distributor parameters.
virtual ~Distributor()=default
Destructor (virtual for memory safety).
void swap(Distributor &rhs)
Swap the contents of rhs with those of *this.
std::string DistributorSendTypeEnumToString(EDistributorSendType sendType)
Convert an EDistributorSendType enum value to a string.
void doReversePostsAndWaits(const Teuchos::ArrayView< const Packet > &exports, size_t numPackets, const Teuchos::ArrayView< Packet > &imports)
Execute the reverse communication plan.
Teuchos::ArrayView< const size_t > getLengthsFrom() const
Number of values this process will receive from each process.
Teuchos::ArrayView< const int > getProcsFrom() const
Ranks of the processes sending values to this process.
size_t createFromSends(const Teuchos::ArrayView< const int > &exportProcIDs)
Set up Distributor using list of process ranks to which this process will send.
Details::EDistributorHowInitialized howInitialized() const
Return an enum indicating whether and how a Distributor was initialized.
void doPosts(const Teuchos::ArrayRCP< const Packet > &exports, size_t numPackets, const Teuchos::ArrayRCP< Packet > &imports)
Post the data for a forward plan, but do not execute the waits yet.
Teuchos::ArrayView< const int > getProcsTo() const
Ranks of the processes to which this process will send values.
void createFromSendsAndRecvs(const Teuchos::ArrayView< const int > &exportProcIDs, const Teuchos::ArrayView< const int > &remoteProcIDs)
Set up Distributor using list of process ranks to which to send, and list of process ranks from which to receive.
bool hasSelfMessage() const
Whether the calling process will send or receive messages to itself.
Sets up and executes a communication plan for a Tpetra DistObject.
size_t getTotalReceiveLength() const
Total number of values this process will receive from other processes.
void setParameterList(const Teuchos::RCP< Teuchos::ParameterList > &plist)
Set Distributor parameters.
void doReversePosts(const Teuchos::ArrayRCP< const Packet > &exports, size_t numPackets, const Teuchos::ArrayRCP< Packet > &imports)
Post the data for a reverse plan, but do not execute the waits yet.
Teuchos::ArrayView< const size_t > getLengthsTo() const
Number of values this process will send to each process.
Teuchos::Array< std::string > distributorSendTypes()
Valid values for Distributor's "Send type" parameter.
std::string DistributorHowInitializedEnumToString(EDistributorHowInitialized how)
Convert an EDistributorHowInitialized enum value to a string.
Stand-alone utility functions and macros.
void getLastDoStatistics(size_t &bytes_sent, size_t &bytes_recvd) const
Information on the last call to do/doReverse.
size_t getNumSends() const
The number of processes to which we will send data.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const
Describe this object in a human-readable way to the given output stream.
size_t getMaxSendLength() const
Maximum number of values this process will send to another single process.
Teuchos::RCP< Distributor > getReverse() const
A reverse communication plan Distributor.
void createFromRecvs(const Teuchos::ArrayView< const Ordinal > &remoteIDs, const Teuchos::ArrayView< const int > &remoteProcIDs, Teuchos::Array< Ordinal > &exportIDs, Teuchos::Array< int > &exportProcIDs)
Set up Distributor using list of process ranks from which to receive.
EDistributorSendType
The type of MPI send that Distributor should use.
Distributor(const Teuchos::RCP< const Teuchos::Comm< int > > &comm)
Construct using the specified communicator and default parameters.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.