Tpetra_Distributor.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_DISTRIBUTOR_HPP
41 #define TPETRA_DISTRIBUTOR_HPP
42 
43 #include "Tpetra_Util.hpp"
44 #include "Teuchos_as.hpp"
45 #include "Teuchos_Describable.hpp"
46 #include "Teuchos_ParameterListAcceptorDefaultBase.hpp"
47 #include "Teuchos_VerboseObject.hpp"
48 #include "Tpetra_Details_Behavior.hpp" // for Details::Behavior::debug, used below
49 
50 #include "KokkosCompat_View.hpp"
51 #include "Kokkos_Core.hpp"
52 #include "Kokkos_TeuchosCommAdapters.hpp"
53 #include <memory>
54 #include <sstream>
55 #include <type_traits>
56 
57 namespace Tpetra {
58 
59  namespace Details {
64  enum EDistributorSendType {
65  DISTRIBUTOR_ISEND, // Use MPI_Isend (Teuchos::isend)
66  DISTRIBUTOR_RSEND, // Use MPI_Rsend (Teuchos::readySend)
67  DISTRIBUTOR_SEND, // Use MPI_Send (Teuchos::send)
68  DISTRIBUTOR_SSEND // Use MPI_Ssend (Teuchos::ssend)
69  };
70 
75  std::string
76  DistributorSendTypeEnumToString (EDistributorSendType sendType);
77 
82  enum EDistributorHowInitialized {
83  DISTRIBUTOR_NOT_INITIALIZED, // Not initialized yet
84  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS, // By createFromSends
85  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS, // By createFromRecvs
86  DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_SENDS_N_RECVS, // By createFromSendsAndRecvs
87  DISTRIBUTOR_INITIALIZED_BY_REVERSE, // By createReverseDistributor
88  DISTRIBUTOR_INITIALIZED_BY_COPY, // By copy constructor
89  };
90 
95  std::string
96  DistributorHowInitializedEnumToString (EDistributorHowInitialized how);
97 
98  } // namespace Details
99 
106  Teuchos::Array<std::string> distributorSendTypes ();
107 
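  // A minimal configuration sketch (illustrative, not normative): the
  // "Send type" parameter name and the "Isend" value are assumptions
  // based on the send-type options above and on getValidParameters().
  //
  //   Teuchos::RCP<Teuchos::ParameterList> plist =
  //     Teuchos::parameterList ("Tpetra::Distributor");
  //   plist->set ("Send type", "Isend"); // any entry of distributorSendTypes()
  //   Tpetra::Distributor distributor (comm, plist);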
175  class Distributor :
176  public Teuchos::Describable,
177  public Teuchos::ParameterListAcceptorDefaultBase {
178  public:
180 
181 
190  explicit Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm);
191 
203  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
204  const Teuchos::RCP<Teuchos::FancyOStream>& out);
205 
219  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
220  const Teuchos::RCP<Teuchos::ParameterList>& plist);
221 
238  Distributor (const Teuchos::RCP<const Teuchos::Comm<int> >& comm,
239  const Teuchos::RCP<Teuchos::FancyOStream>& out,
240  const Teuchos::RCP<Teuchos::ParameterList>& plist);
241 
243  Distributor (const Distributor& distributor);
244 
249  virtual ~Distributor () = default;
250 
256  void swap (Distributor& rhs);
257 
259 
261 
266  void setParameterList (const Teuchos::RCP<Teuchos::ParameterList>& plist);
267 
272  Teuchos::RCP<const Teuchos::ParameterList> getValidParameters () const;
273 
275 
277 
297  size_t createFromSends (const Teuchos::ArrayView<const int>& exportProcIDs);
298 
332  template <class Ordinal>
333  void
334  createFromRecvs (const Teuchos::ArrayView<const Ordinal>& remoteIDs,
335  const Teuchos::ArrayView<const int>& remoteProcIDs,
336  Teuchos::Array<Ordinal>& exportIDs,
337  Teuchos::Array<int>& exportProcIDs);
338 
346  void
347  createFromSendsAndRecvs (const Teuchos::ArrayView<const int>& exportProcIDs,
348  const Teuchos::ArrayView<const int>& remoteProcIDs);
349 
351 
353 
357  size_t getNumReceives() const;
358 
362  size_t getNumSends() const;
363 
365  bool hasSelfMessage() const;
366 
368  size_t getMaxSendLength() const;
369 
371  size_t getTotalReceiveLength() const;
372 
377  Teuchos::ArrayView<const int> getProcsFrom() const;
378 
383  Teuchos::ArrayView<const int> getProcsTo() const;
384 
392  Teuchos::ArrayView<const size_t> getLengthsFrom() const;
393 
401  Teuchos::ArrayView<const size_t> getLengthsTo() const;
402 
407  Details::EDistributorHowInitialized howInitialized () const {
408  return howInitialized_;
409  }
410 
412 
414 
425  Teuchos::RCP<Distributor> getReverse(bool create=true) const;
426 
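  // Reverse-mode sketch (illustrative only): "plan" is a Distributor
  // already initialized via createFromSends(), and the fwd*/rev* arrays
  // are hypothetical caller-owned Teuchos::Array<double> objects sized
  // to match the plan.
  //
  //   // Forward: data flow follows the plan (exports -> imports).
  //   plan.doPostsAndWaits<double> (fwdExports (), 1, fwdImports ());
  //   // Reverse: data flow runs against the plan; former receivers now send.
  //   plan.doReversePostsAndWaits<double> (revExports (), 1, revImports ());
  //   // Alternatively, get the reverse plan itself:
  //   Teuchos::RCP<Distributor> reversePlan = plan.getReverse ();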
428 
430 
451  template <class Packet>
452  void
453  doPostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
454  size_t numPackets,
455  const Teuchos::ArrayView<Packet> &imports);
456 
478  template <class Packet>
479  void
480  doPostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
481  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
482  const Teuchos::ArrayView<Packet> &imports,
483  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
484 
509  template <class Packet>
510  void
511  doPosts (const Teuchos::ArrayRCP<const Packet> &exports,
512  size_t numPackets,
513  const Teuchos::ArrayRCP<Packet> &imports);
514 
533  template <class Packet>
534  void
535  doPosts (const Teuchos::ArrayRCP<const Packet> &exports,
536  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
537  const Teuchos::ArrayRCP<Packet> &imports,
538  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
539 
546  void doWaits ();
547 
552  template <class Packet>
553  void
554  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
555  size_t numPackets,
556  const Teuchos::ArrayView<Packet> &imports);
557 
562  template <class Packet>
563  void
564  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet> &exports,
565  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
566  const Teuchos::ArrayView<Packet> &imports,
567  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
568 
573  template <class Packet>
574  void
575  doReversePosts (const Teuchos::ArrayRCP<const Packet> &exports,
576  size_t numPackets,
577  const Teuchos::ArrayRCP<Packet> &imports);
578 
583  template <class Packet>
584  void
585  doReversePosts (const Teuchos::ArrayRCP<const Packet> &exports,
586  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
587  const Teuchos::ArrayRCP<Packet> &imports,
588  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
589 
596  void doReverseWaits ();
597 
618  template <class ExpView, class ImpView>
619  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
620  doPostsAndWaits (
621  const ExpView &exports,
622  size_t numPackets,
623  const ImpView &imports);
624 
646  template <class ExpView, class ImpView>
647  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
648  doPostsAndWaits (const ExpView &exports,
649  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
650  const ImpView &imports,
651  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
652 
677  template <class ExpView, class ImpView>
678  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
679  doPosts (const ExpView &exports,
680  size_t numPackets,
681  const ImpView &imports);
682 
701  template <class ExpView, class ImpView>
702  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
703  doPosts (const ExpView &exports,
704  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
705  const ImpView &imports,
706  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
707 
712  template <class ExpView, class ImpView>
713  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
714  doReversePostsAndWaits (const ExpView &exports,
715  size_t numPackets,
716  const ImpView &imports);
717 
722  template <class ExpView, class ImpView>
723  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
724  doReversePostsAndWaits (const ExpView &exports,
725  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
726  const ImpView &imports,
727  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
728 
733  template <class ExpView, class ImpView>
734  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
735  doReversePosts (const ExpView &exports,
736  size_t numPackets,
737  const ImpView &imports);
738 
743  template <class ExpView, class ImpView>
744  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
745  doReversePosts (const ExpView &exports,
746  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
747  const ImpView &imports,
748  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
749 
753  void getLastDoStatistics(size_t & bytes_sent, size_t & bytes_recvd) const{
754  bytes_sent = lastRoundBytesSend_;
755  bytes_recvd = lastRoundBytesRecv_;
756  }
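  // End-to-end sketch of one communication round (illustrative only):
  // exportProcIDs and exportValues are hypothetical caller-owned arrays,
  // with exportProcIDs[i] naming the rank that receives exportValues[i].
  //
  //   Tpetra::Distributor distributor (comm);
  //   const size_t numImports = distributor.createFromSends (exportProcIDs ());
  //   Teuchos::Array<double> importValues (numImports);
  //   distributor.doPostsAndWaits<double> (exportValues (), 1, importValues ());
  //   size_t bytesSent = 0, bytesRecvd = 0;
  //   distributor.getLastDoStatistics (bytesSent, bytesRecvd);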
757 
759 
761 
763  std::string description() const;
764 
786  void
787  describe (Teuchos::FancyOStream& out,
788  const Teuchos::EVerbosityLevel verbLevel =
789  Teuchos::Describable::verbLevel_default) const;
791 
792  private:
794  Teuchos::RCP<const Teuchos::Comm<int> > comm_;
795 
797  Details::EDistributorHowInitialized howInitialized_;
798 
800 
801 
803  Details::EDistributorSendType sendType_; // Which variant of send to use in do[Reverse]Posts()
804 
806  bool barrierBetween_;
807 
809  static bool getVerbose();
810 
815  std::unique_ptr<std::string>
816  createPrefix(const char methodName[]) const;
817 
819  bool verbose_ = getVerbose();
821 
825  bool selfMessage_;
826 
836  size_t numSends_;
837 
842  Teuchos::Array<int> procsTo_;
843 
852  Teuchos::Array<size_t> startsTo_;
853 
859  Teuchos::Array<size_t> lengthsTo_;
860 
864  size_t maxSendLength_;
865 
881  Teuchos::Array<size_t> indicesTo_;
882 
892  size_t numReceives_;
893 
900  size_t totalReceiveLength_;
901 
907  Teuchos::Array<size_t> lengthsFrom_;
908 
914  Teuchos::Array<int> procsFrom_;
915 
921  Teuchos::Array<size_t> startsFrom_;
922 
928  Teuchos::Array<size_t> indicesFrom_;
929 
936  Teuchos::Array<Teuchos::RCP<Teuchos::CommRequest<int> > > requests_;
937 
942  mutable Teuchos::RCP<Distributor> reverseDistributor_;
943 
945  size_t lastRoundBytesSend_;
946 
948  size_t lastRoundBytesRecv_;
949 
950 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
951  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_;
952  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_;
953  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_;
954  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_;
955  Teuchos::RCP<Teuchos::Time> timer_doWaits_;
956  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_recvs_;
957  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_recvs_;
958  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_barrier_;
959  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_barrier_;
960  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_sends_;
961  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_sends_;
962  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_sends_slow_;
963  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_sends_slow_;
964  Teuchos::RCP<Teuchos::Time> timer_doPosts3TA_sends_fast_;
965  Teuchos::RCP<Teuchos::Time> timer_doPosts4TA_sends_fast_;
966  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_recvs_;
967  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_recvs_;
968  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_barrier_;
969  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_barrier_;
970  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_sends_;
971  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_sends_;
972  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_sends_slow_;
973  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_sends_slow_;
974  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_sends_fast_;
975  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_sends_fast_;
976 
978  void makeTimers ();
979 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
980 
992  bool useDistinctTags_;
993 
998  int getTag (const int pathTag) const;
999 
1010  void computeReceives ();
1011 
1024  template <class Ordinal>
1025  void computeSends (const Teuchos::ArrayView<const Ordinal> &remoteGIDs,
1026  const Teuchos::ArrayView<const int> &remoteProcIDs,
1027  Teuchos::Array<Ordinal> &exportGIDs,
1028  Teuchos::Array<int> &exportProcIDs);
1029 
1031  void createReverseDistributor() const;
1032 
1033 
1038  std::string
1039  localDescribeToString (const Teuchos::EVerbosityLevel vl) const;
1040  }; // class Distributor
1041 
1042 
1043  template <class Packet>
1044  void Distributor::
1045  doPostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1046  size_t numPackets,
1047  const Teuchos::ArrayView<Packet>& imports)
1048  {
1049  using Teuchos::arcp;
1050  using Teuchos::ArrayRCP;
1051  typedef typename ArrayRCP<const Packet>::size_type size_type;
1052 
1053  TEUCHOS_TEST_FOR_EXCEPTION(
1054  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1055  "doPostsAndWaits(3 args): There are " << requests_.size () <<
1056  " outstanding nonblocking messages pending. It is incorrect to call "
1057  "this method with posts outstanding.");
1058 
1059  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1060  // requiring that the memory location is persisting (as is
1061  // necessary for nonblocking receives). However, it need only
1062  // persist until doWaits() completes, so it is safe for us to use
1063  // a nonpersisting reference in this case. The use of a
1064  // nonpersisting reference is purely a performance optimization.
1065 
1066  //const Packet* exportsPtr = exports.getRawPtr();
1067  //ArrayRCP<const Packet> exportsArcp (exportsPtr, static_cast<size_type> (0),
1068  // exports.size(), false);
1069  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1070  static_cast<size_type> (0),
1071  exports.size(), false);
1072 
1073  // For some reason, neither of the options below (that use arcp)
1074  // compile for Packet=std::complex<double> with GCC 4.5.1. The
1075  // issue only arises with the exports array. This is why we
1076  // construct a separate nonowning ArrayRCP.
1077 
1078  // doPosts (arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1079  // numPackets,
1080  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1081  // doPosts (arcp<const Packet> (exportsPtr, 0, exports.size(), false),
1082  // numPackets,
1083  // arcp<Packet> (imports.getRawPtr(), 0, imports.size(), false));
1084  doPosts (exportsArcp,
1085  numPackets,
1086  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1087  doWaits ();
1088 
1089  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1090  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1091  }
1092 
1093  template <class Packet>
1094  void Distributor::
1095  doPostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1096  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1097  const Teuchos::ArrayView<Packet> &imports,
1098  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1099  {
1100  using Teuchos::arcp;
1101  using Teuchos::ArrayRCP;
1102 
1103  TEUCHOS_TEST_FOR_EXCEPTION(
1104  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1105  "doPostsAndWaits: There are " << requests_.size () << " outstanding "
1106  "nonblocking messages pending. It is incorrect to call doPostsAndWaits "
1107  "with posts outstanding.");
1108 
1109  // doPosts() accepts the exports and imports arrays as ArrayRCPs,
1110  // requiring that the memory location is persisting (as is
1111  // necessary for nonblocking receives). However, it need only
1112  // persist until doWaits() completes, so it is safe for us to use
1113  // a nonpersisting reference in this case.
1114 
1115  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1116  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1117  // with some versions of GCC. The issue only arises with the
1118  // exports array. This is why we construct a separate nonowning
1119  // ArrayRCP.
1120  typedef typename ArrayRCP<const Packet>::size_type size_type;
1121  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (),
1122  static_cast<size_type> (0),
1123  exports.size (), false);
1124  // mfh 04 Apr 2012: This is the offending code. This statement
1125  // would normally be in place of "exportsArcp" in the
1126  // doPosts() call below.
1127  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false),
1128  doPosts (exportsArcp,
1129  numExportPacketsPerLID,
1130  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1131  numImportPacketsPerLID);
1132  doWaits ();
1133 
1134  lastRoundBytesSend_ = exports.size () * sizeof (Packet);
1135  lastRoundBytesRecv_ = imports.size () * sizeof (Packet);
1136  }
1137 
1138 
1139  template <class Packet>
1140  void Distributor::
1141  doPosts (const Teuchos::ArrayRCP<const Packet>& exports,
1142  size_t numPackets,
1143  const Teuchos::ArrayRCP<Packet>& imports)
1144  {
1145  using Teuchos::Array;
1146  using Teuchos::ArrayRCP;
1147  using Teuchos::ArrayView;
1148  using Teuchos::as;
1149  using Teuchos::FancyOStream;
1150  using Teuchos::includesVerbLevel;
1151  using Teuchos::ireceive;
1152  using Teuchos::isend;
1153  using Teuchos::readySend;
1154  using Teuchos::send;
1155  using Teuchos::ssend;
1156  using Teuchos::TypeNameTraits;
1157  using Teuchos::typeName;
1158  using std::endl;
1159  using size_type = Array<size_t>::size_type;
1160 
1161 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1162  Teuchos::TimeMonitor timeMon (*timer_doPosts3TA_);
1163 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1164 
1165  const bool debug = Details::Behavior::debug("Distributor");
1166  const int myRank = comm_->getRank ();
1167  // Run-time configurable parameters that come from the input
1168  // ParameterList set by setParameterList().
1169  const Details::EDistributorSendType sendType = sendType_;
1170  const bool doBarrier = barrierBetween_;
1171 
1172  std::unique_ptr<std::string> prefix;
1173  if (verbose_) {
1174  prefix = createPrefix("doPosts(3-arg, ArrayRCP)");
1175  std::ostringstream os;
1176  os << *prefix << "Start" << endl;
1177  std::cerr << os.str();
1178  }
1179 
1180  TEUCHOS_TEST_FOR_EXCEPTION(
1181  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier, std::logic_error,
1182  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): Ready-send "
1183  "version requires a barrier between posting receives and posting ready "
1184  "sends. This should have been checked before. "
1185  "Please report this bug to the Tpetra developers.");
1186 
1187  size_t selfReceiveOffset = 0;
1188 
1189  // mfh 30 Mar 2016: See Github Issue #227 to see why we need to
1190  // check whether we're doing reverse mode before checking the
1191  // length of the imports array.
1192  if (howInitialized_ != Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE) {
1193  // Each message has the same number of packets.
1194  //
1195  // FIXME (mfh 18 Jul 2014): Relaxing this test from strict
1196  // inequality to a less-than seems to have fixed Bug 6170. It's
1197  // OK for the 'imports' array to be longer than it needs to be;
1198  // I'm just curious why it would be.
1199  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
1200  TEUCHOS_TEST_FOR_EXCEPTION
1201  (static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1202  std::invalid_argument,
1203  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1204  "The 'imports' array must have enough entries to hold the expected number "
1205  "of import packets. imports.size() = " << imports.size () << " < "
1206  "totalNumImportPackets = " << totalNumImportPackets << ".");
1207  }
1208 
1209  // MPI tag for nonblocking receives and blocking sends in this
1210  // method. Some processes might take the "fast" path
1211  // (indicesTo_.empty()) and others might take the "slow" path for
1212  // the same doPosts() call, so the path tag must be the same for
1213  // both.
1214  const int pathTag = 0;
1215  const int tag = this->getTag (pathTag);
1216 
1217  if (debug) {
1218  TEUCHOS_TEST_FOR_EXCEPTION
1219  (requests_.size () != 0,
1220  std::logic_error,
1221  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): Process "
1222  << myRank << ": requests_.size() = " << requests_.size () << " != 0.");
1223  }
1224 
1225  // Distributor uses requests_.size() as the number of outstanding
1226  // nonblocking message requests, so we resize to zero to maintain
1227  // this invariant.
1228  //
1229  // numReceives_ does _not_ include the self message, if there is
1230  // one. Here, we do actually send a message to ourselves, so we
1231  // include any self message in the "actual" number of receives to
1232  // post.
1233  //
1234  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1235  // doesn't (re)allocate its array of requests. That happens in
1236  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1237  // demand), or Resize_().
1238  const size_type actualNumReceives = as<size_type> (numReceives_) +
1239  as<size_type> (selfMessage_ ? 1 : 0);
1240  requests_.resize (0);
1241 
1242  if (verbose_) {
1243  std::ostringstream os;
1244  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1245  << ": Post receives" << endl;
1246  std::cerr << os.str();
1247  }
1248 
1249  // Post the nonblocking receives. It's common MPI wisdom to post
1250  // receives before sends. In MPI terms, this means favoring
1251  // adding to the "posted queue" (of receive requests) over adding
1252  // to the "unexpected queue" (of arrived messages not yet matched
1253  // with a receive).
1254  {
1255 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1256  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3TA_recvs_);
1257 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1258 
1259  size_t curBufOffset = 0;
1260  for (size_type i = 0; i < actualNumReceives; ++i) {
1261  const size_t curBufLen = lengthsFrom_[i] * numPackets;
1262  if (procsFrom_[i] != myRank) {
1263  if (verbose_) {
1264  std::ostringstream os;
1265  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1266  << ": Post irecv: {source: " << procsFrom_[i]
1267  << ", tag: " << tag << "}" << endl;
1268  std::cerr << os.str();
1269  }
1270  // If my process is receiving these packet(s) from another
1271  // process (not a self-receive):
1272  //
1273  // 1. Set up the persisting view (recvBuf) of the imports
1274  // array, given the offset and size (total number of
1275  // packets from process procsFrom_[i]).
1276  // 2. Start the Irecv and save the resulting request.
1277  TEUCHOS_TEST_FOR_EXCEPTION(
1278  curBufOffset + curBufLen > static_cast<size_t> (imports.size ()),
1279  std::logic_error,
1280  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1281  "Exceeded size of 'imports' array in packing loop on Process " <<
1282  myRank << ". imports.size() = " << imports.size () << " < "
1283  "curBufOffset(" << curBufOffset << ") + curBufLen(" << curBufLen
1284  << ").");
1285  ArrayRCP<Packet> recvBuf =
1286  imports.persistingView (curBufOffset, curBufLen);
1287  requests_.push_back (ireceive<int, Packet> (recvBuf, procsFrom_[i],
1288  tag, *comm_));
1289  }
1290  else { // Receiving from myself
1291  selfReceiveOffset = curBufOffset; // Remember the self-recv offset
1292  }
1293  curBufOffset += curBufLen;
1294  }
1295  }
1296 
1297  if (doBarrier) {
1298 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1299  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3TA_barrier_);
1300 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1301 
1302  if (verbose_) {
1303  std::ostringstream os;
1304  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1305  << ": Barrier" << endl;
1306  std::cerr << os.str();
1307  }
1308  // If we are using ready sends (MPI_Rsend) below, we need to do
1309  // a barrier before we post the ready sends. This is because a
1310  // ready send requires that its matching receive has already
1311  // been posted before the send has been posted. The only way to
1312  // guarantee that in this case is to use a barrier.
1313  comm_->barrier ();
1314  }
1315 
1316 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1317  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3TA_sends_);
1318 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1319 
1320  // Set up the scan through the procsTo_ list, starting with higher-numbered procs
1321  // (should help balance message traffic)
1322  //
1323  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
1324  // It doesn't depend on the input at all.
1325  size_t numBlocks = numSends_ + selfMessage_;
1326  size_t procIndex = 0;
1327  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myRank)) {
1328  ++procIndex;
1329  }
1330  if (procIndex == numBlocks) {
1331  procIndex = 0;
1332  }
1333 
1334  size_t selfNum = 0;
1335  size_t selfIndex = 0;
1336 
1337  if (verbose_) {
1338  std::ostringstream os;
1339  os << *prefix << (indicesTo_.empty () ? "Fast" : "Slow")
1340  << ": Post sends" << endl;
1341  std::cerr << os.str();
1342  }
1343 
1344  if (indicesTo_.empty ()) {
1345 
1346 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1347  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts3TA_sends_fast_);
1348 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1349 
1350  // Data are already blocked (laid out) by process, so we don't
1351  // need a separate send buffer (besides the exports array).
1352  for (size_t i = 0; i < numBlocks; ++i) {
1353  size_t p = i + procIndex;
1354  if (p > (numBlocks - 1)) {
1355  p -= numBlocks;
1356  }
1357 
1358  if (procsTo_[p] != myRank) {
1359  if (verbose_) {
1360  std::ostringstream os;
1361  os << *prefix << ": Post send: {target: "
1362  << procsTo_[p] << ", tag: " << tag << "}" << endl;
1363  std::cerr << os.str();
1364  }
1365 
1366  ArrayView<const Packet> tmpSend =
1367  exports.view (startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
1368 
1369  if (sendType == Details::DISTRIBUTOR_SEND) {
1370  send<int, Packet> (tmpSend.getRawPtr (),
1371  as<int> (tmpSend.size ()),
1372  procsTo_[p], tag, *comm_);
1373  }
1374  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1375  ArrayRCP<const Packet> tmpSendBuf =
1376  exports.persistingView (startsTo_[p] * numPackets,
1377  lengthsTo_[p] * numPackets);
1378  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1379  tag, *comm_));
1380  }
1381  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1382  readySend<int, Packet> (tmpSend.getRawPtr (),
1383  as<int> (tmpSend.size ()),
1384  procsTo_[p], tag, *comm_);
1385  }
1386  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1387  ssend<int, Packet> (tmpSend.getRawPtr (),
1388  as<int> (tmpSend.size ()),
1389  procsTo_[p], tag, *comm_);
1390  } else {
1391  TEUCHOS_TEST_FOR_EXCEPTION(
1392  true, std::logic_error,
1393  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1394  "Invalid send type. We should never get here. "
1395  "Please report this bug to the Tpetra developers.");
1396  }
1397  }
1398  else { // "Sending" the message to myself
1399  selfNum = p;
1400  }
1401  }
1402 
1403  if (selfMessage_) {
1404  if (verbose_) {
1405  std::ostringstream os;
1406  os << *prefix << "Fast: Self-send" << endl;
1407  std::cerr << os.str();
1408  }
1409  // This is how we "send a message to ourself": we copy from
1410  // the export buffer to the import buffer. That saves
1411  // Teuchos::Comm implementations other than MpiComm (in
1412  // particular, SerialComm) the trouble of implementing self
1413  // messages correctly. (To do this right, SerialComm would
1414  // need internal buffer space for messages, keyed on the
1415  // message's tag.)
1416  std::copy (exports.begin()+startsTo_[selfNum]*numPackets,
1417  exports.begin()+startsTo_[selfNum]*numPackets+lengthsTo_[selfNum]*numPackets,
1418  imports.begin()+selfReceiveOffset);
1419  }
1420  }
1421  else { // data are not blocked by proc, use send buffer
1422 
1423 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1424  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts3TA_sends_slow_);
1425 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1426 
1427  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
1428  // sends), because the buffer is only long enough for one send.
1429  ArrayRCP<Packet> sendArray (maxSendLength_ * numPackets); // send buffer
1430 
1431  TEUCHOS_TEST_FOR_EXCEPTION(
1432  sendType == Details::DISTRIBUTOR_ISEND, std::logic_error,
1433  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1434  "The \"send buffer\" code path doesn't currently work with "
1435  "nonblocking sends.");
1436 
1437  for (size_t i = 0; i < numBlocks; ++i) {
1438  size_t p = i + procIndex;
1439  if (p > (numBlocks - 1)) {
1440  p -= numBlocks;
1441  }
1442 
1443  if (procsTo_[p] != myRank) {
1444  if (verbose_) {
1445  std::ostringstream os;
1446  os << *prefix << "Slow: Post send: "
1447  "{target: " << procsTo_[p] << ", tag: " << tag << "}" << endl;
1448  std::cerr << os.str();
1449  }
1450 
1451  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1452  size_t sendArrayOffset = 0;
1453  size_t j = startsTo_[p];
1454  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1455  srcBegin = exports.begin() + indicesTo_[j]*numPackets;
1456  srcEnd = srcBegin + numPackets;
1457  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1458  sendArrayOffset += numPackets;
1459  }
1460  ArrayView<const Packet> tmpSend =
1461  sendArray.view (0, lengthsTo_[p]*numPackets);
1462 
1463  if (sendType == Details::DISTRIBUTOR_SEND) {
1464  send<int, Packet> (tmpSend.getRawPtr (),
1465  as<int> (tmpSend.size ()),
1466  procsTo_[p], tag, *comm_);
1467  }
1468  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1469  ArrayRCP<const Packet> tmpSendBuf =
1470  sendArray.persistingView (0, lengthsTo_[p] * numPackets);
1471  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1472  tag, *comm_));
1473  }
1474  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1475  readySend<int, Packet> (tmpSend.getRawPtr (),
1476  as<int> (tmpSend.size ()),
1477  procsTo_[p], tag, *comm_);
1478  }
1479  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1480  ssend<int, Packet> (tmpSend.getRawPtr (),
1481  as<int> (tmpSend.size ()),
1482  procsTo_[p], tag, *comm_);
1483  }
1484  else {
1485  TEUCHOS_TEST_FOR_EXCEPTION(
1486  true, std::logic_error,
1487  "Tpetra::Distributor::doPosts(3 args, Teuchos::ArrayRCP): "
1488  "Invalid send type. We should never get here. "
1489  "Please report this bug to the Tpetra developers.");
1490  }
1491  }
1492  else { // "Sending" the message to myself
1493  selfNum = p;
1494  selfIndex = startsTo_[p];
1495  }
1496  }
1497 
1498  if (selfMessage_) {
1499  if (verbose_) {
1500  std::ostringstream os;
1501  os << *prefix << "Slow: Self-send" << endl;
1502  std::cerr << os.str();
1503  }
1504  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1505  std::copy (exports.begin()+indicesTo_[selfIndex]*numPackets,
1506  exports.begin()+indicesTo_[selfIndex]*numPackets + numPackets,
1507  imports.begin() + selfReceiveOffset);
1508  ++selfIndex;
1509  selfReceiveOffset += numPackets;
1510  }
1511  }
1512  }
1513 
1514  if (verbose_) {
1515  std::ostringstream os;
1516  os << *prefix << "Done!" << endl;
1517  std::cerr << os.str();
1518  }
1519  }
1520 
1521  template <class Packet>
1522  void Distributor::
1523  doPosts (const Teuchos::ArrayRCP<const Packet>& exports,
1524  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1525  const Teuchos::ArrayRCP<Packet>& imports,
1526  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1527  {
1528  using Teuchos::Array;
1529  using Teuchos::ArrayRCP;
1530  using Teuchos::ArrayView;
1531  using Teuchos::as;
1532  using Teuchos::ireceive;
1533  using Teuchos::isend;
1534  using Teuchos::readySend;
1535  using Teuchos::send;
1536  using Teuchos::ssend;
1537  using Teuchos::TypeNameTraits;
1538  using std::endl;
1539  typedef Array<size_t>::size_type size_type;
1540 
1541  std::unique_ptr<std::string> prefix;
1542  if (verbose_) {
1543  prefix = createPrefix("doPosts(4-arg, Teuchos)");
1544  std::ostringstream os;
1545  os << *prefix << "Start" << endl;
1546  std::cerr << os.str();
1547  }
1548 
1549 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1550  Teuchos::TimeMonitor timeMon (*timer_doPosts4TA_);
1551 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1552 
1553  // Run-time configurable parameters that come from the input
1554  // ParameterList set by setParameterList().
1555  const Details::EDistributorSendType sendType = sendType_;
1556  const bool doBarrier = barrierBetween_;
1557 
1558 // #ifdef HAVE_TEUCHOS_DEBUG
1559 // // Prepare for verbose output, if applicable.
1560 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
1561 // Teuchos::RCP<Teuchos::FancyOStream> out = this->getOStream ();
1562 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
1563 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
1564 
1565 // if (doPrint) {
1566 // // Only need one process to print out parameters.
1567 // *out << "Distributor::doPosts (4 args)" << endl;
1568 // }
1569 // // Add one tab level. We declare this outside the doPrint scopes
1570 // // so that the tab persists until the end of this method.
1571 // if (doPrint) {
1572 // *out << "Parameters:" << endl;
1573 // {
1574 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
1575 // << endl << "barrierBetween: " << doBarrier << endl;
1576 // }
1577 // }
1578 // #endif // HAVE_TEUCHOS_DEBUG
1579 
1580  TEUCHOS_TEST_FOR_EXCEPTION(
1581  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
1582  std::logic_error,
1583  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): Ready-send "
1584  "version requires a barrier between posting receives and posting ready "
1585  "sends. This should have been checked before. "
1586  "Please report this bug to the Tpetra developers.");
1587 
1588  const int myProcID = comm_->getRank ();
1589  size_t selfReceiveOffset = 0;
1590 
1591 #ifdef HAVE_TEUCHOS_DEBUG
1592  // Different messages may have different numbers of packets.
1593  size_t totalNumImportPackets = 0;
1594  for (size_t ii = 0; ii < static_cast<size_t> (numImportPacketsPerLID.size ()); ++ii) {
1595  totalNumImportPackets += numImportPacketsPerLID[ii];
1596  }
1597  TEUCHOS_TEST_FOR_EXCEPTION(
1598  static_cast<size_t> (imports.size ()) < totalNumImportPackets,
1599  std::runtime_error,
1600  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): The 'imports' "
1601  "array must have enough entries to hold the expected number of import "
1602  "packets. imports.size() = " << imports.size() << " < "
1603  "totalNumImportPackets = " << totalNumImportPackets << ".");
1604 #endif // HAVE_TEUCHOS_DEBUG
1605 
1606  // MPI tag for nonblocking receives and blocking sends in this
1607  // method. Some processes might take the "fast" path
1608  // (indicesTo_.empty()) and others might take the "slow" path for
1609  // the same doPosts() call, so the path tag must be the same for
1610  // both.
1611  const int pathTag = 1;
1612  const int tag = this->getTag (pathTag);
1613 
1614 #ifdef HAVE_TEUCHOS_DEBUG
1615  TEUCHOS_TEST_FOR_EXCEPTION
1616  (requests_.size () != 0,
1617  std::logic_error,
1618  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): Process "
1619  << myProcID << ": requests_.size() = " << requests_.size ()
1620  << " != 0.");
1621 #endif // HAVE_TEUCHOS_DEBUG
1622  if (verbose_) {
1623  std::ostringstream os;
1624  os << *prefix << (indicesTo_.empty () ? "fast" : "slow")
1625  << " path" << endl;
1626  std::cerr << os.str();
1627  }
1628 
1629  // Distributor uses requests_.size() as the number of outstanding
1630  // nonblocking message requests, so we resize to zero to maintain
1631  // this invariant.
1632  //
1633  // numReceives_ does _not_ include the self message, if there is
1634  // one. Here, we do actually send a message to ourselves, so we
1635  // include any self message in the "actual" number of receives to
1636  // post.
1637  //
1638  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
1639  // doesn't (re)allocate its array of requests. That happens in
1640  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
1641  // demand), or Resize_().
1642  const size_type actualNumReceives = as<size_type> (numReceives_) +
1643  as<size_type> (selfMessage_ ? 1 : 0);
1644  requests_.resize (0);
1645 
1646  // Post the nonblocking receives. It's common MPI wisdom to post
1647  // receives before sends. In MPI terms, this means favoring
1648  // adding to the "posted queue" (of receive requests) over adding
1649  // to the "unexpected queue" (of arrived messages not yet matched
1650  // with a receive).
1651  {
1652 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1653  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4TA_recvs_);
1654 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1655 
1656  size_t curBufferOffset = 0;
1657  size_t curLIDoffset = 0;
1658  for (size_type i = 0; i < actualNumReceives; ++i) {
1659  size_t totalPacketsFrom_i = 0;
1660  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
1661  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
1662  }
1663  curLIDoffset += lengthsFrom_[i];
1664  if (procsFrom_[i] != myProcID && totalPacketsFrom_i) {
1665  // If my process is receiving these packet(s) from another
1666  // process (not a self-receive), and if there is at least
1667  // one packet to receive:
1668  //
1669  // 1. Set up the persisting view (recvBuf) into the imports
1670  // array, given the offset and size (total number of
1671  // packets from process procsFrom_[i]).
1672  // 2. Start the Irecv and save the resulting request.
1673  ArrayRCP<Packet> recvBuf =
1674  imports.persistingView (curBufferOffset, totalPacketsFrom_i);
1675  requests_.push_back (ireceive<int, Packet> (recvBuf, procsFrom_[i],
1676  tag, *comm_));
1677  }
1678  else { // Receiving these packet(s) from myself
1679  selfReceiveOffset = curBufferOffset; // Remember the offset
1680  }
1681  curBufferOffset += totalPacketsFrom_i;
1682  }
1683  }
1684 
1685  if (doBarrier) {
1686 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1687  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4TA_barrier_);
1688 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1689  // If we are using ready sends (MPI_Rsend) below, we need to do
1690  // a barrier before we post the ready sends. This is because a
1691  // ready send requires that its matching receive has already
1692  // been posted before the send has been posted. The only way to
1693  // guarantee that in this case is to use a barrier.
1694  comm_->barrier ();
1695  }
1696 
1697 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1698  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4TA_sends_);
1699 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1700 
1701  // Set up arrays containing the starting offset into exports for each send,
1702  // and the number of packets to send for each send.
1703  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
1704  size_t maxNumPackets = 0;
1705  size_t curPKToffset = 0;
1706  for (size_t pp=0; pp<numSends_; ++pp) {
1707  sendPacketOffsets[pp] = curPKToffset;
1708  size_t numPackets = 0;
1709  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
1710  numPackets += numExportPacketsPerLID[j];
1711  }
1712  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
1713  packetsPerSend[pp] = numPackets;
1714  curPKToffset += numPackets;
1715  }
1716 
1717  // Set up the scan through the procsTo_ list, starting with higher-numbered procs
1718  // (should help balance message traffic)
1719  size_t numBlocks = numSends_+ selfMessage_;
1720  size_t procIndex = 0;
1721  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myProcID)) {
1722  ++procIndex;
1723  }
1724  if (procIndex == numBlocks) {
1725  procIndex = 0;
1726  }
1727 
1728  size_t selfNum = 0;
1729  size_t selfIndex = 0;
1730 
1731  if (indicesTo_.empty()) {
1732 
1733 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1734  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4TA_sends_fast_);
1735 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1736 
1737  if (verbose_) {
1738  std::ostringstream os;
1739  os << *prefix << "fast path: posting sends" << endl;
1740  std::cerr << os.str();
1741  }
1742 
1743  // Data are already blocked (laid out) by process, so we don't
1744  // need a separate send buffer (besides the exports array).
1745  for (size_t i = 0; i < numBlocks; ++i) {
1746  size_t p = i + procIndex;
1747  if (p > (numBlocks - 1)) {
1748  p -= numBlocks;
1749  }
1750 
1751  if (procsTo_[p] != myProcID && packetsPerSend[p] > 0) {
1752  ArrayView<const Packet> tmpSend =
1753  exports.view (sendPacketOffsets[p], packetsPerSend[p]);
1754 
1755  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
1756  send<int, Packet> (tmpSend.getRawPtr (),
1757  as<int> (tmpSend.size ()),
1758  procsTo_[p], tag, *comm_);
1759  }
1760  else if (sendType == Details::DISTRIBUTOR_RSEND) {
1761  readySend<int, Packet> (tmpSend.getRawPtr (),
1762  as<int> (tmpSend.size ()),
1763  procsTo_[p], tag, *comm_);
1764  }
1765  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1766  ArrayRCP<const Packet> tmpSendBuf =
1767  exports.persistingView (sendPacketOffsets[p], packetsPerSend[p]);
1768  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1769  tag, *comm_));
1770  }
1771  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1772  ssend<int, Packet> (tmpSend.getRawPtr (),
1773  as<int> (tmpSend.size ()),
1774  procsTo_[p], tag, *comm_);
1775  }
1776  else {
1777  TEUCHOS_TEST_FOR_EXCEPTION(
1778  true, std::logic_error,
1779  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): "
1780  "Invalid send type. We should never get here. Please report "
1781  "this bug to the Tpetra developers.");
1782  }
1783  }
1784  else { // "Sending" the message to myself
1785  selfNum = p;
1786  }
1787  }
1788 
1789  if (selfMessage_) {
1790  std::copy (exports.begin()+sendPacketOffsets[selfNum],
1791  exports.begin()+sendPacketOffsets[selfNum]+packetsPerSend[selfNum],
1792  imports.begin()+selfReceiveOffset);
1793  }
1794  if (verbose_) {
1795  std::ostringstream os;
1796  os << *prefix << "fast path: done" << endl;
1797  std::cerr << os.str();
1798  }
1799  }
1800  else { // data are not blocked by proc, use send buffer
1801 
1802 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1803  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4TA_sends_slow_);
1804 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
1805 
1806  if (verbose_) {
1807  std::ostringstream os;
1808  os << *prefix << "slow path: posting sends" << endl;
1809  std::cerr << os.str();
1810  }
1811 
1812  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
1813  ArrayRCP<Packet> sendArray (maxNumPackets); // send buffer
1814 
1815  TEUCHOS_TEST_FOR_EXCEPTION(
1816  sendType == Details::DISTRIBUTOR_ISEND,
1817  std::logic_error,
1818  "Tpetra::Distributor::doPosts(4 args, Teuchos::ArrayRCP): "
1819  "The \"send buffer\" code path may not necessarily work with nonblocking sends.");
1820 
1821  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
1822  size_t ioffset = 0;
1823  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
1824  indicesOffsets[j] = ioffset;
1825  ioffset += numExportPacketsPerLID[j];
1826  }
1827 
1828  for (size_t i = 0; i < numBlocks; ++i) {
1829  size_t p = i + procIndex;
1830  if (p > (numBlocks - 1)) {
1831  p -= numBlocks;
1832  }
1833 
1834  if (procsTo_[p] != myProcID) {
1835  typename ArrayView<const Packet>::iterator srcBegin, srcEnd;
1836  size_t sendArrayOffset = 0;
1837  size_t j = startsTo_[p];
1838  size_t numPacketsTo_p = 0;
1839  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
1840  srcBegin = exports.begin() + indicesOffsets[j];
1841  srcEnd = srcBegin + numExportPacketsPerLID[j];
1842  numPacketsTo_p += numExportPacketsPerLID[j];
1843  std::copy (srcBegin, srcEnd, sendArray.begin()+sendArrayOffset);
1844  sendArrayOffset += numExportPacketsPerLID[j];
1845  }
1846  if (numPacketsTo_p > 0) {
1847  ArrayView<const Packet> tmpSend =
1848  sendArray.view (0, numPacketsTo_p);
1849 
1850  if (sendType == Details::DISTRIBUTOR_RSEND) {
1851  readySend<int, Packet> (tmpSend.getRawPtr (),
1852  as<int> (tmpSend.size ()),
1853  procsTo_[p], tag, *comm_);
1854  }
1855  else if (sendType == Details::DISTRIBUTOR_ISEND) {
1856  ArrayRCP<const Packet> tmpSendBuf =
1857  sendArray.persistingView (0, numPacketsTo_p);
1858  requests_.push_back (isend<int, Packet> (tmpSendBuf, procsTo_[p],
1859  tag, *comm_));
1860  }
1861  else if (sendType == Details::DISTRIBUTOR_SSEND) {
1862  ssend<int, Packet> (tmpSend.getRawPtr (),
1863  as<int> (tmpSend.size ()),
1864  procsTo_[p], tag, *comm_);
1865  }
1866  else { // sendType == Details::DISTRIBUTOR_SEND (the default)
1867  send<int, Packet> (tmpSend.getRawPtr (),
1868  as<int> (tmpSend.size ()),
1869  procsTo_[p], tag, *comm_);
1870  }
1871  }
1872  }
1873  else { // "Sending" the message to myself
1874  selfNum = p;
1875  selfIndex = startsTo_[p];
1876  }
1877  }
1878 
1879  if (selfMessage_) {
1880  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
1881  std::copy (exports.begin()+indicesOffsets[selfIndex],
1882  exports.begin()+indicesOffsets[selfIndex]+numExportPacketsPerLID[selfIndex],
1883  imports.begin() + selfReceiveOffset);
1884  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
1885  ++selfIndex;
1886  }
1887  }
1888  if (verbose_) {
1889  std::ostringstream os;
1890  os << *prefix << "slow path: done" << endl;
1891  std::cerr << os.str();
1892  }
1893  }
1894  }
1895 
1896  template <class Packet>
1897  void Distributor::
1898  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1899  size_t numPackets,
1900  const Teuchos::ArrayView<Packet>& imports)
1901  {
1902  using Teuchos::arcp;
1903  using Teuchos::ArrayRCP;
1904  using Teuchos::as;
1905 
1906  // doReversePosts() takes exports and imports as ArrayRCPs,
1907  // requiring that the memory locations are persisting. However,
1908  // they need only persist within the scope of that routine, so it
1909  // is safe for us to use nonpersisting references in this case.
1910 
1911  // mfh 04 Apr 2012: For some reason, calling arcp<const Packet>
1912  // for Packet=std::complex<T> (e.g., T=float) fails to compile
1913  // with some versions of GCC. The issue only arises with the
1914  // exports array. This is why we construct a separate nonowning
1915  // ArrayRCP.
1916  typedef typename ArrayRCP<const Packet>::size_type size_type;
1917  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr(), as<size_type> (0),
1918  exports.size(), false);
1919  // mfh 04 Apr 2012: This is the offending code. This statement
1920  // would normally be in place of "exportsArcp" in the
1921  // doReversePosts() call below.
1922  //arcp<const Packet> (exports.getRawPtr(), 0, exports.size(), false)
1923  doReversePosts (exportsArcp,
1924  numPackets,
1925  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false));
1926  doReverseWaits ();
1927 
1928  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1929  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1930  }
1931 
1932  template <class Packet>
1933  void Distributor::
1934  doReversePostsAndWaits (const Teuchos::ArrayView<const Packet>& exports,
1935  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1936  const Teuchos::ArrayView<Packet> &imports,
1937  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1938  {
1939  using Teuchos::as;
1940  using Teuchos::arcp;
1941  using Teuchos::ArrayRCP;
1942 
1943  TEUCHOS_TEST_FOR_EXCEPTION(
1944  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
1945  "doReversePostsAndWaits(4 args): There are " << requests_.size ()
1946  << " outstanding nonblocking messages pending. It is incorrect to call "
1947  "this method with posts outstanding.");
1948 
1949  // doReversePosts() accepts the exports and imports arrays as
1950  // ArrayRCPs, requiring that the memory location is persisting (as
1951  // is necessary for nonblocking receives). However, it need only
1952  // persist until doReverseWaits() completes, so it is safe for us
1953  // to use a nonpersisting reference in this case. The use of a
1954  // nonpersisting reference is purely a performance optimization.
1955 
1956  // mfh 02 Apr 2012: For some reason, calling arcp<const Packet>
1957  // for Packet=std::complex<double> fails to compile with some
1958  // versions of GCC. The issue only arises with the exports array.
1959  // This is why we construct a separate nonowning ArrayRCP.
1960  typedef typename ArrayRCP<const Packet>::size_type size_type;
1961  ArrayRCP<const Packet> exportsArcp (exports.getRawPtr (), as<size_type> (0),
1962  exports.size (), false);
1963  doReversePosts (exportsArcp,
1964  numExportPacketsPerLID,
1965  arcp<Packet> (imports.getRawPtr (), 0, imports.size (), false),
1966  numImportPacketsPerLID);
1967  doReverseWaits ();
1968 
1969  lastRoundBytesSend_ = exports.size() * sizeof(Packet);
1970  lastRoundBytesRecv_ = imports.size() * sizeof(Packet);
1971  }
1972 
1973  template <class Packet>
1974  void Distributor::
1975  doReversePosts (const Teuchos::ArrayRCP<const Packet>& exports,
1976  size_t numPackets,
1977  const Teuchos::ArrayRCP<Packet>& imports)
1978  {
1979  // FIXME (mfh 29 Mar 2012) WHY?
1980  TEUCHOS_TEST_FOR_EXCEPTION(
1981  ! indicesTo_.empty (), std::runtime_error,
1982  "Tpetra::Distributor::doReversePosts(3 args): Can only do reverse "
1983  "communication when original data are blocked by process.");
1984  if (reverseDistributor_.is_null ()) {
1985  createReverseDistributor ();
1986  }
1987  reverseDistributor_->doPosts (exports, numPackets, imports);
1988  }
1989 
1990  template <class Packet>
1991  void Distributor::
1992  doReversePosts (const Teuchos::ArrayRCP<const Packet>& exports,
1993  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
1994  const Teuchos::ArrayRCP<Packet>& imports,
1995  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
1996  {
1997  // FIXME (mfh 29 Mar 2012) WHY?
1998  TEUCHOS_TEST_FOR_EXCEPTION(
1999  ! indicesTo_.empty (), std::runtime_error,
2000  "Tpetra::Distributor::doReversePosts(4 args): Can only do reverse "
2001  "communication when original data are blocked by process.");
2002  if (reverseDistributor_.is_null ()) {
2003  createReverseDistributor ();
2004  }
2005  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
2006  imports, numImportPacketsPerLID);
2007  }
2008 
2009  template <class ExpView, class ImpView>
2010  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2011  Distributor::
2012  doPostsAndWaits (const ExpView& exports,
2013  size_t numPackets,
2014  const ImpView& imports)
2015  {
2016  using Teuchos::RCP;
2017  using Teuchos::rcp;
2018  using std::endl;
2019 
2020  std::unique_ptr<std::string> prefix;
2021  if (verbose_) {
2022  prefix = createPrefix("doPostsAndWaits(3-arg, Kokkos)");
2023  std::ostringstream os;
2024  os << *prefix << "sendType: "
2025  << DistributorSendTypeEnumToString(sendType_)
2026  << ", barrierBetween: "
2027  << (barrierBetween_ ? "true" : "false") << endl;
2028  std::cerr << os.str();
2029  }
2030 
2031  TEUCHOS_TEST_FOR_EXCEPTION(
2032  requests_.size () != 0, std::runtime_error, "Tpetra::Distributor::"
2033  "doPostsAndWaits(3 args): There are " << requests_.size () <<
2034  " outstanding nonblocking messages pending. It is incorrect to call "
2035  "this method with posts outstanding.");
2036 
2037  if (verbose_) {
2038  std::ostringstream os;
2039  os << *prefix << "Call doPosts" << endl;
2040  std::cerr << os.str();
2041  }
2042  doPosts (exports, numPackets, imports);
2043  if (verbose_) {
2044  std::ostringstream os;
2045  os << *prefix << "Call doWaits" << endl;
2046  std::cerr << os.str();
2047  }
2048  doWaits ();
2049  if (verbose_) {
2050  std::ostringstream os;
2051  os << *prefix << "Done" << endl;
2052  std::cerr << os.str();
2053  }
2054  }
2055 
2056  template <class ExpView, class ImpView>
2057  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2058  Distributor::
2059  doPostsAndWaits(const ExpView& exports,
2060  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2061  const ImpView& imports,
2062  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2063  {
2064  using std::endl;
2065  const char rawPrefix[] = "doPostsAndWaits(4-arg, Kokkos)";
2066 
2067  std::unique_ptr<std::string> prefix;
2068  if (verbose_) {
2069  prefix = createPrefix(rawPrefix);
2070  std::ostringstream os;
2071  os << *prefix << "Start" << endl;
2072  std::cerr << os.str();
2073  }
2074  TEUCHOS_TEST_FOR_EXCEPTION
2075  (requests_.size() != 0, std::runtime_error,
2076  "Tpetra::Distributor::" << rawPrefix << ": There is/are "
2077  << requests_.size() << " outstanding nonblocking message(s) "
2078  "pending. It is incorrect to call this method with posts "
2079  "outstanding.");
2080  doPosts(exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
2081  doWaits();
2082  if (verbose_) {
2083  std::ostringstream os;
2084  os << *prefix << "Done" << endl;
2085  std::cerr << os.str();
2086  }
2087  }
2088 
2089 
2090  template <class ExpView, class ImpView>
2091  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2092  Distributor::
2093  doPosts (const ExpView &exports,
2094  size_t numPackets,
2095  const ImpView &imports)
2096  {
2097  using Teuchos::Array;
2098  using Teuchos::as;
2099  using Teuchos::FancyOStream;
2100  using Teuchos::includesVerbLevel;
2101  using Teuchos::ireceive;
2102  using Teuchos::isend;
2103  using Teuchos::readySend;
2104  using Teuchos::send;
2105  using Teuchos::ssend;
2106  using Teuchos::TypeNameTraits;
2107  using Teuchos::typeName;
2108  using std::endl;
2109  using Kokkos::Compat::create_const_view;
2110  using Kokkos::Compat::create_view;
2111  using Kokkos::Compat::subview_offset;
2112  using Kokkos::Compat::deep_copy_offset;
2113  typedef Array<size_t>::size_type size_type;
2114  typedef ExpView exports_view_type;
2115  typedef ImpView imports_view_type;
2116 
2117 #ifdef KOKKOS_ENABLE_CUDA
2118  static_assert
2119  (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
2120  ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
2121  "Please do not use Tpetra::Distributor with UVM allocations. "
2122  "See Trilinos GitHub issue #1088.");
2123 #endif // KOKKOS_ENABLE_CUDA
2124 
2125 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2126  Teuchos::TimeMonitor timeMon (*timer_doPosts3KV_);
2127 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2128 
2129  const int myRank = comm_->getRank ();
2130  // Run-time configurable parameters that come from the input
2131  // ParameterList set by setParameterList().
2132  const Details::EDistributorSendType sendType = sendType_;
2133  const bool doBarrier = barrierBetween_;
2134 
2135  std::unique_ptr<std::string> prefix;
2136  if (verbose_) {
2137  prefix = createPrefix("doPosts(3-arg, Kokkos)");
2138  std::ostringstream os;
2139  os << *prefix << "Start" << endl;
2140  std::cerr << os.str();
2141  }
2142 
2143  TEUCHOS_TEST_FOR_EXCEPTION(
2144  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
2145  std::logic_error,
2146  "Tpetra::Distributor::doPosts(3 args, Kokkos): Ready-send version "
2147  "requires a barrier between posting receives and posting ready sends. "
2148  "This should have been checked before. "
2149  "Please report this bug to the Tpetra developers.");
2150 
2151  size_t selfReceiveOffset = 0;
2152 
2153  // mfh 30 Mar 2016: See Github Issue #227 to see why we need to
2154  // check whether we're doing reverse mode before checking the
2155  // length of the imports array.
2156  if (howInitialized_ != Details::DISTRIBUTOR_INITIALIZED_BY_REVERSE) {
2157  // Each message has the same number of packets.
2158  const size_t totalNumImportPackets = totalReceiveLength_ * numPackets;
2159 
2160  if (verbose_) {
2161  std::ostringstream os;
2162  os << *prefix << "totalNumImportPackets = " <<
2163  totalNumImportPackets << " = " << totalReceiveLength_ << " * " <<
2164  numPackets << "; imports.extent(0) = " << imports.extent (0)
2165  << endl;
2166  std::cerr << os.str();
2167  }
2168 
2169 #ifdef HAVE_TPETRA_DEBUG
2170  // mfh 31 Mar 2016: Extra special all-reduce check to help diagnose #227.
2171  {
2172  const size_t importBufSize = static_cast<size_t> (imports.extent (0));
2173  const int lclBad = (importBufSize < totalNumImportPackets) ? 1 : 0;
2174  int gblBad = 0;
2175  using Teuchos::reduceAll;
2176  using Teuchos::REDUCE_MAX;
2177  using Teuchos::outArg;
2178  reduceAll (*comm_, REDUCE_MAX, lclBad, outArg (gblBad));
2179  TEUCHOS_TEST_FOR_EXCEPTION
2180  (gblBad != 0,
2181  std::runtime_error,
2182  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2183  "On one or more MPI processes, the 'imports' array "
2184  "does not have enough entries to hold the expected number of "
2185  "import packets. ");
2186  }
2187 #else
2188  TEUCHOS_TEST_FOR_EXCEPTION
2189  (static_cast<size_t> (imports.extent (0)) < totalNumImportPackets,
2190  std::runtime_error,
2191  "Tpetra::Distributor::doPosts(3 args, Kokkos): The 'imports' "
2192  "array must have enough entries to hold the expected number of import "
2193  "packets. imports.extent(0) = " << imports.extent (0) << " < "
2194  "totalNumImportPackets = " << totalNumImportPackets << " = "
2195  "totalReceiveLength_ (" << totalReceiveLength_ << ") * numPackets ("
2196  << numPackets << ").");
2197 #endif // HAVE_TPETRA_DEBUG
2198  }
2199 
2200  // MPI tag for nonblocking receives and blocking sends in this
2201  // method. Some processes might take the "fast" path
2202  // (indicesTo_.empty()) and others might take the "slow" path for
2203  // the same doPosts() call, so the path tag must be the same for
2204  // both.
2205  const int pathTag = 0;
2206  const int tag = this->getTag (pathTag);
2207 
2208 #ifdef HAVE_TPETRA_DEBUG
2209  TEUCHOS_TEST_FOR_EXCEPTION
2210  (requests_.size () != 0,
2211  std::logic_error,
2212  "Tpetra::Distributor::doPosts(3 args, Kokkos): Process "
2213  << myRank << ": requests_.size() = " << requests_.size () << " != 0.");
2214 #endif // HAVE_TPETRA_DEBUG
2215 
2216  // Distributor uses requests_.size() as the number of outstanding
2217  // nonblocking message requests, so we resize to zero to maintain
2218  // this invariant.
2219  //
2220  // numReceives_ does _not_ include the self message, if there is
2221  // one. Here, we do actually send a message to ourselves, so we
2222  // include any self message in the "actual" number of receives to
2223  // post.
2224  //
2225  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2226  // doesn't (re)allocate its array of requests. That happens in
2227  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2228  // demand), or Resize_().
2229  const size_type actualNumReceives = as<size_type> (numReceives_) +
2230  as<size_type> (selfMessage_ ? 1 : 0);
2231  requests_.resize (0);
2232 
2233  if (verbose_) {
2234  std::ostringstream os;
2235  os << *prefix << (indicesTo_.empty() ? "fast" : "slow")
2236  << " path: post receives" << endl;
2237  std::cerr << os.str();
2238  }
2239 
2240  // Post the nonblocking receives. It's common MPI wisdom to post
2241  // receives before sends. In MPI terms, this means favoring
2242  // adding to the "posted queue" (of receive requests) over adding
2243  // to the "unexpected queue" (of arrived messages not yet matched
2244  // with a receive).
2245  {
2246 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2247  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3KV_recvs_);
2248 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2249 
2250  size_t curBufferOffset = 0;
2251  for (size_type i = 0; i < actualNumReceives; ++i) {
2252  const size_t curBufLen = lengthsFrom_[i] * numPackets;
2253  if (procsFrom_[i] != myRank) {
2254  if (verbose_) {
2255  std::ostringstream os;
2256  os << *prefix
2257  << (indicesTo_.empty() ? "fast" : "slow") << " path: "
2258  << "post irecv: {source: " << procsFrom_[i]
2259  << ", tag: " << tag << "}" << endl;
2260  std::cerr << os.str();
2261  }
2262  // If my process is receiving these packet(s) from another
2263  // process (not a self-receive):
2264  //
2265  // 1. Set up the persisting view (recvBuf) of the imports
2266  // array, given the offset and size (total number of
2267  // packets from process procsFrom_[i]).
2268  // 2. Start the Irecv and save the resulting request.
2269  TEUCHOS_TEST_FOR_EXCEPTION(
2270  curBufferOffset + curBufLen > static_cast<size_t> (imports.size ()),
2271  std::logic_error, "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2272  "Exceeded size of 'imports' array in packing loop on Process " <<
2273  myRank << ". imports.size() = " << imports.size () << " < "
2274  "curBufferOffset(" << curBufferOffset << ") + curBufLen(" <<
2275  curBufLen << ").");
2276  imports_view_type recvBuf =
2277  subview_offset (imports, curBufferOffset, curBufLen);
2278  requests_.push_back (ireceive<int> (recvBuf, procsFrom_[i],
2279  tag, *comm_));
2280  }
2281  else { // Receiving from myself
2282  selfReceiveOffset = curBufferOffset; // Remember the self-recv offset
2283  }
2284  curBufferOffset += curBufLen;
2285  }
2286  }
2287 
2288  if (doBarrier) {
2289 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2290  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts3KV_barrier_);
2291 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2292 
2293  if (verbose_) {
2294  std::ostringstream os;
2295  os << *prefix << (indicesTo_.empty() ? "fast" : "slow")
2296  << " path: barrier" << endl;
2297  std::cerr << os.str();
2298  }
2299  // If we are using ready sends (MPI_Rsend) below, we need to do
2300  // a barrier before we post the ready sends. This is because a
2301  // ready send requires that its matching receive has already
2302  // been posted before the send has been posted. The only way to
2303  // guarantee that in this case is to use a barrier.
2304  comm_->barrier ();
2305  }
2306 
2307 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2308  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3KV_sends_);
2309 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2310 
2311  // Set up the scan through the procsTo_ list, starting with the
2312  // higher-numbered processes (this should help balance message traffic).
2313  //
2314  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
2315  // It doesn't depend on the input at all.
2316  size_t numBlocks = numSends_ + selfMessage_;
2317  size_t procIndex = 0;
2318  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myRank)) {
2319  ++procIndex;
2320  }
2321  if (procIndex == numBlocks) {
2322  procIndex = 0;
2323  }
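  // Illustration (hypothetical values, not from the original source):
  // suppose myRank = 3 and procsTo_ = {0, 2, 5, 7}, so numBlocks = 4.
  // The while loop above stops at procIndex = 2, the first target rank
  // >= myRank, so the send loops below visit targets in the rotated
  // order 5, 7, 0, 2. Each rank starting at a different point in its
  // own procsTo_ list is what spreads out the message traffic.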
2324 
2325  size_t selfNum = 0;
2326  size_t selfIndex = 0;
2327 
2328  if (verbose_) {
2329  std::ostringstream os;
2330  os << *prefix << (indicesTo_.empty() ? "fast" : "slow")
2331  << " path: post sends" << endl;
2332  std::cerr << os.str();
2333  }
2334 
2335  if (indicesTo_.empty()) {
2336 
2337 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2338  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts3KV_sends_fast_);
2339 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2340 
2341  if (verbose_) {
2342  std::ostringstream os;
2343  os << *prefix << "fast path: posting sends" << endl;
2344  std::cerr << os.str();
2345  }
2346 
2347  // Data are already blocked (laid out) by process, so we don't
2348  // need a separate send buffer (besides the exports array).
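  // Illustration (hypothetical values, not from the original source):
  // on this "fast" path the exports View is already laid out
  // contiguously by target process. With numPackets = 2,
  // lengthsTo_ = {2, 1}, and startsTo_ = {0, 2}, exports looks like
  // [ a0 a1 b0 b1 | c0 c1 ]: the send to procsTo_[0] is the subview at
  // offset startsTo_[0]*numPackets = 0 of length 4, and the send to
  // procsTo_[1] starts at offset startsTo_[1]*numPackets = 4. The
  // "slow" path below instead gathers through indicesTo_ into a
  // scratch sendArray before each send.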
2349  for (size_t i = 0; i < numBlocks; ++i) {
2350  size_t p = i + procIndex;
2351  if (p > (numBlocks - 1)) {
2352  p -= numBlocks;
2353  }
2354 
2355  if (procsTo_[p] != myRank) {
2356  if (verbose_) {
2357  std::ostringstream os;
2358  os << *prefix << "fast path: post send: {target: "
2359  << procsTo_[p] << ", tag: " << tag << "}" << endl;
2360  std::cerr << os.str();
2361  }
2362  exports_view_type tmpSend = subview_offset(
2363  exports, startsTo_[p]*numPackets, lengthsTo_[p]*numPackets);
2364 
2365  if (sendType == Details::DISTRIBUTOR_SEND) {
2366  send<int> (tmpSend,
2367  as<int> (tmpSend.size ()),
2368  procsTo_[p], tag, *comm_);
2369  }
2370  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2371  exports_view_type tmpSendBuf =
2372  subview_offset (exports, startsTo_[p] * numPackets,
2373  lengthsTo_[p] * numPackets);
2374  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2375  tag, *comm_));
2376  }
2377  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2378  readySend<int> (tmpSend,
2379  as<int> (tmpSend.size ()),
2380  procsTo_[p], tag, *comm_);
2381  }
2382  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2383  ssend<int> (tmpSend,
2384  as<int> (tmpSend.size ()),
2385  procsTo_[p], tag, *comm_);
2386  } else {
2387  TEUCHOS_TEST_FOR_EXCEPTION(
2388  true,
2389  std::logic_error,
2390  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2391  "Invalid send type. We should never get here. "
2392  "Please report this bug to the Tpetra developers.");
2393  }
2394  }
2395  else { // "Sending" the message to myself
2396  selfNum = p;
2397  }
2398  }
2399 
2400  if (selfMessage_) {
2401  if (verbose_) {
2402  std::ostringstream os;
2403  os << *prefix << "fast path: self-send" << endl;
2404  std::cerr << os.str();
2405  }
2406  // This is how we "send a message to ourself": we copy from
2407  // the export buffer to the import buffer. That saves
2408  // Teuchos::Comm implementations other than MpiComm (in
2409  // particular, SerialComm) the trouble of implementing self
2410  // messages correctly. (To do this right, SerialComm would
2411  // need internal buffer space for messages, keyed on the
2412  // message's tag.)
2413  deep_copy_offset(imports, exports, selfReceiveOffset,
2414  startsTo_[selfNum]*numPackets,
2415  lengthsTo_[selfNum]*numPackets);
2416  }
2417  if (verbose_) {
2418  std::ostringstream os;
2419  os << *prefix << "fast path: done" << endl;
2420  std::cerr << os.str();
2421  }
2422  }
2423  else { // data are not blocked by proc, use send buffer
2424 
2425 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2426  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts3KV_sends_slow_);
2427 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2428 
2429  if (verbose_) {
2430  std::ostringstream os;
2431  os << *prefix << "slow path: posting sends" << endl;
2432  std::cerr << os.str();
2433  }
2434  typedef typename ExpView::non_const_value_type Packet;
2435  typedef typename ExpView::array_layout Layout;
2436  typedef typename ExpView::device_type Device;
2437  typedef typename ExpView::memory_traits Mem;
2438  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray",
2439  maxSendLength_ * numPackets);
2440 
2441  // FIXME (mfh 05 Mar 2013) This is broken for Isend (nonblocking
2442  // sends), because the buffer is only long enough for one send.
2443  TEUCHOS_TEST_FOR_EXCEPTION(
2444  sendType == Details::DISTRIBUTOR_ISEND,
2445  std::logic_error,
2446  "Tpetra::Distributor::doPosts(3 args, Kokkos): The \"send buffer\" code path "
2447  "doesn't currently work with nonblocking sends.");
2448 
2449  for (size_t i = 0; i < numBlocks; ++i) {
2450  size_t p = i + procIndex;
2451  if (p > (numBlocks - 1)) {
2452  p -= numBlocks;
2453  }
2454 
2455  if (procsTo_[p] != myRank) {
2456  if (verbose_) {
2457  std::ostringstream os;
2458  os << *prefix << "slow path: post send: {target: "
2459  << procsTo_[p] << ", tag: " << tag << "}" << endl;
2460  std::cerr << os.str();
2461  }
2462 
2463  size_t sendArrayOffset = 0;
2464  size_t j = startsTo_[p];
2465  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2466  deep_copy_offset(sendArray, exports, sendArrayOffset,
2467  indicesTo_[j]*numPackets, numPackets);
2468  sendArrayOffset += numPackets;
2469  }
2470  ImpView tmpSend =
2471  subview_offset(sendArray, size_t(0), lengthsTo_[p]*numPackets);
2472 
2473  if (sendType == Details::DISTRIBUTOR_SEND) {
2474  send<int> (tmpSend,
2475  as<int> (tmpSend.size ()),
2476  procsTo_[p], tag, *comm_);
2477  }
2478  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2479  exports_view_type tmpSendBuf =
2480  subview_offset (sendArray, size_t(0), lengthsTo_[p] * numPackets);
2481  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2482  tag, *comm_));
2483  }
2484  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2485  readySend<int> (tmpSend,
2486  as<int> (tmpSend.size ()),
2487  procsTo_[p], tag, *comm_);
2488  }
2489  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2490  ssend<int> (tmpSend,
2491  as<int> (tmpSend.size ()),
2492  procsTo_[p], tag, *comm_);
2493  }
2494  else {
2495  TEUCHOS_TEST_FOR_EXCEPTION(
2496  true,
2497  std::logic_error,
2498  "Tpetra::Distributor::doPosts(3 args, Kokkos): "
2499  "Invalid send type. We should never get here. "
2500  "Please report this bug to the Tpetra developers.");
2501  }
2502  }
2503  else { // "Sending" the message to myself
2504  selfNum = p;
2505  selfIndex = startsTo_[p];
2506  }
2507  }
2508 
2509  if (selfMessage_) {
2510  if (verbose_) {
2511  std::ostringstream os;
2512  os << *prefix << "slow path: self-send" << endl;
2513  std::cerr << os.str();
2514  }
2515  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2516  deep_copy_offset(imports, exports, selfReceiveOffset,
2517  indicesTo_[selfIndex]*numPackets, numPackets);
2518  ++selfIndex;
2519  selfReceiveOffset += numPackets;
2520  }
2521  }
2522  if (verbose_) {
2523  std::ostringstream os;
2524  os << *prefix << "slow path: done" << endl;
2525  std::cerr << os.str();
2526  }
2527  }
2528 
2529  if (verbose_) {
2530  std::ostringstream os;
2531  os << *prefix << "Done" << endl;
2532  std::cerr << os.str();
2533  }
2534  }
2535 
2536  template <class ExpView, class ImpView>
2537  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2538  Distributor::
2539  doPosts (const ExpView &exports,
2540  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2541  const ImpView &imports,
2542  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2543  {
2544  using Teuchos::Array;
2545  using Teuchos::as;
2546  using Teuchos::ireceive;
2547  using Teuchos::isend;
2548  using Teuchos::readySend;
2549  using Teuchos::send;
2550  using Teuchos::ssend;
2551  using Teuchos::TypeNameTraits;
2552  using std::endl;
2553  using Kokkos::Compat::create_const_view;
2554  using Kokkos::Compat::create_view;
2555  using Kokkos::Compat::subview_offset;
2556  using Kokkos::Compat::deep_copy_offset;
2557  typedef Array<size_t>::size_type size_type;
2558  typedef ExpView exports_view_type;
2559  typedef ImpView imports_view_type;
2560 
2561 #ifdef KOKKOS_ENABLE_CUDA
2562  static_assert (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
2563  ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
2564  "Please do not use Tpetra::Distributor with UVM "
2565  "allocations. See GitHub issue #1088.");
2566 #endif // KOKKOS_ENABLE_CUDA
2567 
2568  std::unique_ptr<std::string> prefix;
2569  if (verbose_) {
2570  prefix = createPrefix("doPosts(4-arg, Kokkos)");
2571  std::ostringstream os;
2572  os << *prefix << "Start" << endl;
2573  std::cerr << os.str();
2574  }
2575 
2576 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2577  Teuchos::TimeMonitor timeMon (*timer_doPosts4KV_);
2578 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2579 
2580  // Run-time configurable parameters that come from the input
2581  // ParameterList set by setParameterList().
2582  const Details::EDistributorSendType sendType = sendType_;
2583  const bool doBarrier = barrierBetween_;
2584 
2585 // #ifdef HAVE_TEUCHOS_DEBUG
2586 // // Prepare for verbose output, if applicable.
2587 // Teuchos::EVerbosityLevel verbLevel = this->getVerbLevel ();
2588 // RCP<Teuchos::FancyOStream> out = this->getOStream ();
2589 // const bool doPrint = out.get () && (comm_->getRank () == 0) &&
2590 // includesVerbLevel (verbLevel, Teuchos::VERB_EXTREME, true);
2591 
2592 // if (doPrint) {
2593 // // Only need one process to print out parameters.
2594 // *out << "Distributor::doPosts (4 args)" << endl;
2595 // }
2596 // // Add one tab level. We declare this outside the doPrint scopes
2597 // // so that the tab persists until the end of this method.
2598 // if (doPrint) {
2599 // *out << "Parameters:" << endl;
2600 // {
2601 // *out << "sendType: " << DistributorSendTypeEnumToString (sendType)
2602 // << endl << "barrierBetween: " << doBarrier << endl;
2603 // }
2604 // }
2605 // #endif // HAVE_TEUCHOS_DEBUG
2606 
2607  TEUCHOS_TEST_FOR_EXCEPTION(
2608  sendType == Details::DISTRIBUTOR_RSEND && ! doBarrier,
2609  std::logic_error, "Tpetra::Distributor::doPosts(4 args, Kokkos): Ready-send "
2610  "version requires a barrier between posting receives and posting ready "
2611  "sends. This should have been checked before. "
2612  "Please report this bug to the Tpetra developers.");
2613 
2614  const int myProcID = comm_->getRank ();
2615  size_t selfReceiveOffset = 0;
2616 
2617 #ifdef HAVE_TEUCHOS_DEBUG
2618  // Different messages may have different numbers of packets.
2619  size_t totalNumImportPackets = 0;
2620  for (size_type ii = 0; ii < numImportPacketsPerLID.size (); ++ii) {
2621  totalNumImportPackets += numImportPacketsPerLID[ii];
2622  }
2623  TEUCHOS_TEST_FOR_EXCEPTION(
2624  imports.extent (0) < totalNumImportPackets, std::runtime_error,
2625  "Tpetra::Distributor::doPosts(4 args, Kokkos): The 'imports' array must have "
2626  "enough entries to hold the expected number of import packets. "
2627  "imports.extent(0) = " << imports.extent (0) << " < "
2628  "totalNumImportPackets = " << totalNumImportPackets << ".");
2629 #endif // HAVE_TEUCHOS_DEBUG
2630 
2631  // MPI tag for nonblocking receives and blocking sends in this
2632  // method. Some processes might take the "fast" path
2633  // (indicesTo_.empty()) and others might take the "slow" path for
2634  // the same doPosts() call, so the path tag must be the same for
2635  // both.
2636  const int pathTag = 1;
2637  const int tag = this->getTag (pathTag);
2638 
2639 #ifdef HAVE_TEUCHOS_DEBUG
2640  TEUCHOS_TEST_FOR_EXCEPTION
2641  (requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
2642  "doPosts(4 args, Kokkos): Process " << myProcID << ": requests_.size () = "
2643  << requests_.size () << " != 0.");
2644 #endif // HAVE_TEUCHOS_DEBUG
2645  if (verbose_) {
2646  std::ostringstream os;
2647  os << *prefix << (indicesTo_.empty() ? "fast" : "slow")
2648  << " path, tag=" << tag << endl;
2649  std::cerr << os.str();
2650  }
2651  // Distributor uses requests_.size() as the number of outstanding
2652  // nonblocking message requests, so we resize to zero to maintain
2653  // this invariant.
2654  //
2655  // numReceives_ does _not_ include the self message, if there is
2656  // one. Here, we do actually send a message to ourselves, so we
2657  // include any self message in the "actual" number of receives to
2658  // post.
2659  //
2660  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
2661  // doesn't (re)allocate its array of requests. That happens in
2662  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
2663  // demand), or Resize_().
2664  const size_type actualNumReceives = as<size_type> (numReceives_) +
2665  as<size_type> (selfMessage_ ? 1 : 0);
2666  requests_.resize (0);
2667 
2668  // Post the nonblocking receives. It's common MPI wisdom to post
2669  // receives before sends. In MPI terms, this means favoring
2670  // adding to the "posted queue" (of receive requests) over adding
2671  // to the "unexpected queue" (of arrived messages not yet matched
2672  // with a receive).
2673  {
2674 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2675  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4KV_recvs_);
2676 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2677 
2678  size_t curBufferOffset = 0;
2679  size_t curLIDoffset = 0;
2680  for (size_type i = 0; i < actualNumReceives; ++i) {
2681  size_t totalPacketsFrom_i = 0;
2682  for (size_t j = 0; j < lengthsFrom_[i]; ++j) {
2683  totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
2684  }
2685  curLIDoffset += lengthsFrom_[i];
2686  if (procsFrom_[i] != myProcID && totalPacketsFrom_i) {
2687  // If my process is receiving these packet(s) from another
2688  // process (not a self-receive), and if there is at least
2689  // one packet to receive:
2690  //
2691  // 1. Set up the persisting view (recvBuf) into the imports
2692  // array, given the offset and size (total number of
2693  // packets from process procsFrom_[i]).
2694  // 2. Start the Irecv and save the resulting request.
2695  imports_view_type recvBuf =
2696  subview_offset (imports, curBufferOffset, totalPacketsFrom_i);
2697  requests_.push_back (ireceive<int> (recvBuf, procsFrom_[i],
2698  tag, *comm_));
2699  }
2700  else { // Receiving these packet(s) from myself
2701  selfReceiveOffset = curBufferOffset; // Remember the offset
2702  }
2703  curBufferOffset += totalPacketsFrom_i;
2704  }
2705  }
2706 
2707  if (doBarrier) {
2708 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2709  Teuchos::TimeMonitor timeMonBarrier (*timer_doPosts4KV_barrier_);
2710 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2711  // If we are using ready sends (MPI_Rsend) below, we need to do
2712  // a barrier before we post the ready sends. This is because a
2713  // ready send requires that its matching receive has already
2714  // been posted before the send has been posted. The only way to
2715  // guarantee that in this case is to use a barrier.
2716  comm_->barrier ();
2717  }
2718 
2719 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2720  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4KV_sends_);
2721 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2722 
2723  // Set up arrays containing the starting offset into exports for each
2724  // send, and the number of packets to send for each send.
2725  Array<size_t> sendPacketOffsets(numSends_,0), packetsPerSend(numSends_,0);
2726  size_t maxNumPackets = 0;
2727  size_t curPKToffset = 0;
2728  for (size_t pp=0; pp<numSends_; ++pp) {
2729  sendPacketOffsets[pp] = curPKToffset;
2730  size_t numPackets = 0;
2731  for (size_t j=startsTo_[pp]; j<startsTo_[pp]+lengthsTo_[pp]; ++j) {
2732  numPackets += numExportPacketsPerLID[j];
2733  }
2734  if (numPackets > maxNumPackets) maxNumPackets = numPackets;
2735  packetsPerSend[pp] = numPackets;
2736  curPKToffset += numPackets;
2737  }
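  // Illustration (hypothetical values, not from the original source):
  // with numSends_ = 2, startsTo_ = {0, 2}, lengthsTo_ = {2, 1}, and
  // numExportPacketsPerLID = {3, 1, 2}, the loop above produces
  // packetsPerSend = {4, 2}, sendPacketOffsets = {0, 4}, and
  // maxNumPackets = 4 (used later to size the "slow" path's send buffer).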
2738 
2739  // Set up the scan through the procsTo_ list, starting with the
2740  // higher-numbered processes (this should help balance message traffic).
2741  size_t numBlocks = numSends_+ selfMessage_;
2742  size_t procIndex = 0;
2743  while ((procIndex < numBlocks) && (procsTo_[procIndex] < myProcID)) {
2744  ++procIndex;
2745  }
2746  if (procIndex == numBlocks) {
2747  procIndex = 0;
2748  }
2749 
2750  size_t selfNum = 0;
2751  size_t selfIndex = 0;
2752  if (indicesTo_.empty()) {
2753 
2754 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2755  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4KV_sends_fast_);
2756 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2757 
2758  if (verbose_) {
2759  std::ostringstream os;
2760  os << *prefix << "fast path: posting sends" << endl;
2761  std::cerr << os.str();
2762  }
2763 
2764  // Data are already blocked (laid out) by process, so we don't
2765  // need a separate send buffer (besides the exports array).
2766  for (size_t i = 0; i < numBlocks; ++i) {
2767  size_t p = i + procIndex;
2768  if (p > (numBlocks - 1)) {
2769  p -= numBlocks;
2770  }
2771 
2772  if (procsTo_[p] != myProcID && packetsPerSend[p] > 0) {
2773  exports_view_type tmpSend =
2774  subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]);
2775 
2776  if (sendType == Details::DISTRIBUTOR_SEND) { // the default, so put it first
2777  send<int> (tmpSend,
2778  as<int> (tmpSend.size ()),
2779  procsTo_[p], tag, *comm_);
2780  }
2781  else if (sendType == Details::DISTRIBUTOR_RSEND) {
2782  readySend<int> (tmpSend,
2783  as<int> (tmpSend.size ()),
2784  procsTo_[p], tag, *comm_);
2785  }
2786  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2787  exports_view_type tmpSendBuf =
2788  subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]);
2789  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2790  tag, *comm_));
2791  }
2792  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2793  ssend<int> (tmpSend,
2794  as<int> (tmpSend.size ()),
2795  procsTo_[p], tag, *comm_);
2796  }
2797  else {
2798  TEUCHOS_TEST_FOR_EXCEPTION(
2799  true, std::logic_error,
2800  "Tpetra::Distributor::doPosts(4 args, Kokkos): "
2801  "Invalid send type. We should never get here. "
2802  "Please report this bug to the Tpetra developers.");
2803  }
2804  }
2805  else { // "Sending" the message to myself
2806  selfNum = p;
2807  }
2808  }
2809 
2810  if (selfMessage_) {
2811  deep_copy_offset(imports, exports, selfReceiveOffset,
2812  sendPacketOffsets[selfNum], packetsPerSend[selfNum]);
2813  }
2814  if (verbose_) {
2815  std::ostringstream os;
2816  os << *prefix << "fast path: done" << endl;
2817  std::cerr << os.str();
2818  }
2819  }
2820  else { // data are not blocked by proc, use send buffer
2821 
2822 #ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2823  Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4KV_sends_slow_);
2824 #endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
2825 
2826  if (verbose_) {
2827  std::ostringstream os;
2828  os << *prefix << "slow path: posting sends" << endl;
2829  std::cerr << os.str();
2830  }
2831  // FIXME (mfh 05 Mar 2013) This may be broken for Isend.
2832  typedef typename ExpView::non_const_value_type Packet;
2833  typedef typename ExpView::array_layout Layout;
2834  typedef typename ExpView::device_type Device;
2835  typedef typename ExpView::memory_traits Mem;
2836  Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray", maxNumPackets); // send buffer
2837 
2838  TEUCHOS_TEST_FOR_EXCEPTION(
2839  sendType == Details::DISTRIBUTOR_ISEND,
2840  std::logic_error,
2841  "Tpetra::Distributor::doPosts(4 args, Kokkos): The \"send buffer\" "
2842  "code path does not currently work with nonblocking sends.");
2843 
2844  Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
2845  size_t ioffset = 0;
2846  for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
2847  indicesOffsets[j] = ioffset;
2848  ioffset += numExportPacketsPerLID[j];
2849  }
2850 
2851  for (size_t i = 0; i < numBlocks; ++i) {
2852  size_t p = i + procIndex;
2853  if (p > (numBlocks - 1)) {
2854  p -= numBlocks;
2855  }
2856 
2857  if (procsTo_[p] != myProcID) {
2858  size_t sendArrayOffset = 0;
2859  size_t j = startsTo_[p];
2860  size_t numPacketsTo_p = 0;
2861  for (size_t k = 0; k < lengthsTo_[p]; ++k, ++j) {
2862  numPacketsTo_p += numExportPacketsPerLID[j];
2863  deep_copy_offset(sendArray, exports, sendArrayOffset,
2864  indicesOffsets[j], numExportPacketsPerLID[j]);
2865  sendArrayOffset += numExportPacketsPerLID[j];
2866  }
2867  if (numPacketsTo_p > 0) {
2868  ImpView tmpSend =
2869  subview_offset(sendArray, size_t(0), numPacketsTo_p);
2870 
2871  if (sendType == Details::DISTRIBUTOR_RSEND) {
2872  readySend<int> (tmpSend,
2873  as<int> (tmpSend.size ()),
2874  procsTo_[p], tag, *comm_);
2875  }
2876  else if (sendType == Details::DISTRIBUTOR_ISEND) {
2877  exports_view_type tmpSendBuf =
2878  subview_offset (sendArray, size_t(0), numPacketsTo_p);
2879  requests_.push_back (isend<int> (tmpSendBuf, procsTo_[p],
2880  tag, *comm_));
2881  }
2882  else if (sendType == Details::DISTRIBUTOR_SSEND) {
2883  ssend<int> (tmpSend,
2884  as<int> (tmpSend.size ()),
2885  procsTo_[p], tag, *comm_);
2886  }
2887  else { // sendType == Details::DISTRIBUTOR_SEND (the default)
2888  send<int> (tmpSend,
2889  as<int> (tmpSend.size ()),
2890  procsTo_[p], tag, *comm_);
2891  }
2892  }
2893  }
2894  else { // "Sending" the message to myself
2895  selfNum = p;
2896  selfIndex = startsTo_[p];
2897  }
2898  }
2899 
2900  if (selfMessage_) {
2901  for (size_t k = 0; k < lengthsTo_[selfNum]; ++k) {
2902  deep_copy_offset(imports, exports, selfReceiveOffset,
2903  indicesOffsets[selfIndex],
2904  numExportPacketsPerLID[selfIndex]);
2905  selfReceiveOffset += numExportPacketsPerLID[selfIndex];
2906  ++selfIndex;
2907  }
2908  }
2909  if (verbose_) {
2910  std::ostringstream os;
2911  os << *prefix << "slow path: done" << endl;
2912  std::cerr << os.str();
2913  }
2914  }
2915  }
2916 
2917  template <class ExpView, class ImpView>
2918  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2919  Distributor::
2920  doReversePostsAndWaits (const ExpView& exports,
2921  size_t numPackets,
2922  const ImpView& imports)
2923  {
2924  doReversePosts (exports, numPackets, imports);
2925  doReverseWaits ();
2926  }
2927 
2928  template <class ExpView, class ImpView>
2929  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2930  Distributor::
2931  doReversePostsAndWaits (const ExpView& exports,
2932  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2933  const ImpView& imports,
2934  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2935  {
2936  TEUCHOS_TEST_FOR_EXCEPTION(requests_.size() != 0, std::runtime_error,
2937  "Tpetra::Distributor::doReversePostsAndWaits(4 args): There are "
2938  << requests_.size() << " outstanding nonblocking messages pending. It "
2939  "is incorrect to call this method with posts outstanding.");
2940 
2941  doReversePosts (exports, numExportPacketsPerLID, imports,
2942  numImportPacketsPerLID);
2943  doReverseWaits ();
2944  }
2945 
2946  template <class ExpView, class ImpView>
2947  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2948  Distributor::
2949  doReversePosts (const ExpView &exports,
2950  size_t numPackets,
2951  const ImpView &imports)
2952  {
2953  // FIXME (mfh 29 Mar 2012) WHY?
2954  TEUCHOS_TEST_FOR_EXCEPTION(
2955  ! indicesTo_.empty (), std::runtime_error,
2956  "Tpetra::Distributor::doReversePosts(3 args): Can only do "
2957  "reverse communication when original data are blocked by process.");
2958  if (reverseDistributor_.is_null ()) {
2959  createReverseDistributor ();
2960  }
2961  reverseDistributor_->doPosts (exports, numPackets, imports);
2962  }
2963 
2964  template <class ExpView, class ImpView>
2965  typename std::enable_if<(Kokkos::Impl::is_view<ExpView>::value && Kokkos::Impl::is_view<ImpView>::value)>::type
2966  Distributor::
2967  doReversePosts (const ExpView &exports,
2968  const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
2969  const ImpView &imports,
2970  const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
2971  {
2972  // FIXME (mfh 29 Mar 2012) WHY?
2973  TEUCHOS_TEST_FOR_EXCEPTION(
2974  ! indicesTo_.empty (), std::runtime_error,
2975  "Tpetra::Distributor::doReversePosts(4 args): Can only do "
2976  "reverse communication when original data are blocked by process.");
2977  if (reverseDistributor_.is_null ()) {
2978  createReverseDistributor ();
2979  }
2980  reverseDistributor_->doPosts (exports, numExportPacketsPerLID,
2981  imports, numImportPacketsPerLID);
2982  }
2983 
2984  template <class OrdinalType>
2985  void Distributor::
2986  computeSends(const Teuchos::ArrayView<const OrdinalType>& importGIDs,
2987  const Teuchos::ArrayView<const int>& importProcIDs,
2988  Teuchos::Array<OrdinalType>& exportGIDs,
2989  Teuchos::Array<int>& exportProcIDs)
2990  {
2991  // NOTE (mfh 19 Apr 2012): There was a note on this code saying:
2992  // "assumes that size_t >= Ordinal". The code certainly does
2993  // assume that sizeof(size_t) >= sizeof(OrdinalType) as well as
2994  // sizeof(size_t) >= sizeof(int). This is because it casts the
2995  // OrdinalType elements of importGIDs (along with their
2996  // corresponding process IDs, as int) to size_t, and does a
2997  // doPostsAndWaits<size_t>() to send the packed data.
2998  using Teuchos::Array;
2999  using Teuchos::ArrayView;
3000  using std::endl;
3001  using size_type = typename ArrayView<const OrdinalType>::size_type;
3002  const char errPrefix[] = "Tpetra::Distributor::computeSends: ";
3003  const char suffix[] =
3004  " Please report this bug to the Tpetra developers.";
3005 
3006  const int myRank = comm_->getRank ();
3007  std::unique_ptr<std::string> prefix;
3008  if (verbose_) {
3009  prefix = createPrefix("computeSends");
3010  std::ostringstream os;
3011  os << *prefix << "Start" << endl;
3012  std::cerr << os.str();
3013  }
3014 
3015  TEUCHOS_TEST_FOR_EXCEPTION
3016  (importGIDs.size () != importProcIDs.size (),
3017  std::invalid_argument, errPrefix << "On Process " << myRank
3018  << ": importProcIDs.size()=" << importProcIDs.size()
3019  << " != importGIDs.size()=" << importGIDs.size() << ".");
3020 
3021  const size_type numImports = importProcIDs.size();
3022  Array<size_t> importObjs(2*numImports);
3023  // Pack pairs (importGIDs[i], my process ID) to send into importObjs.
3024  for (size_type i = 0; i < numImports; ++i) {
3025  importObjs[2*i] = static_cast<size_t>(importGIDs[i]);
3026  importObjs[2*i+1] = static_cast<size_t>(myRank);
3027  }
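  // Illustration (hypothetical values, not from the original source):
  // on Process 3 with importGIDs = {42, 7}, the loop above packs
  // importObjs = {42, 3, 7, 3}; that is, importObjs[2*i] holds the GID
  // and importObjs[2*i+1] holds this process's rank. Each receiving
  // process unpacks its received pairs below as
  // (exportGIDs[i], exportProcIDs[i]).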
3028  //
3029  // Use a temporary Distributor to send the (importGIDs[i], myRank)
3030  // pairs to importProcIDs[i].
3031  //
3032  Distributor tempPlan(comm_);
3033  if (verbose_) {
3034  std::ostringstream os;
3035  os << *prefix << "Call tempPlan.createFromSends" << endl;
3036  std::cerr << os.str();
3037  }
3038  // mfh 20 Mar 2014: An extra-cautious cast from unsigned to
3039  // signed, in order to forestall any possible causes for Bug 6069.
3040  const size_t numExportsAsSizeT =
3041  tempPlan.createFromSends(importProcIDs);
3042  const size_type numExports =
3043  static_cast<size_type>(numExportsAsSizeT);
3044  TEUCHOS_TEST_FOR_EXCEPTION
3045  (numExports < 0, std::logic_error, errPrefix <<
3046  "tempPlan.createFromSends() returned numExports="
3047  << numExportsAsSizeT << " as a size_t, which overflows to "
3048  << numExports << " when cast to " <<
3049  Teuchos::TypeNameTraits<size_type>::name () << "." << suffix);
3050  TEUCHOS_TEST_FOR_EXCEPTION
3051  (size_type(tempPlan.getTotalReceiveLength()) != numExports,
3052  std::logic_error, errPrefix << "tempPlan.getTotalReceiveLength()="
3053  << tempPlan.getTotalReceiveLength () << " != numExports="
3054  << numExports << "." << suffix);
3055 
3056  if (numExports > 0) {
3057  exportGIDs.resize(numExports);
3058  exportProcIDs.resize(numExports);
3059  }
3060 
3061  // exportObjs: Packed receive buffer. (exportObjs[2*i],
3062  // exportObjs[2*i+1]) will give the (GID, PID) pair for export i,
3063  // after tempPlan.doPostsAndWaits(...) finishes below.
3064  //
3065  // NOTE (mfh 19 Mar 2014) This only works if OrdinalType fits in
3066  // size_t. That could fail, for example, on a 32-bit machine using
3067  // 64-bit global indices; the static_assert below checks for that
3068  // case.
3069  static_assert(sizeof(size_t) >= sizeof(OrdinalType),
3070  "Tpetra::Distributor::computeSends: "
3071  "sizeof(size_t) < sizeof(OrdinalType).");
3072 
3073  TEUCHOS_TEST_FOR_EXCEPTION
3074  (tempPlan.getTotalReceiveLength () < size_t(numExports),
3075  std::logic_error,
3076  errPrefix << "tempPlan.getTotalReceiveLength()="
3077  << tempPlan.getTotalReceiveLength() << " < numExports="
3078  << numExports << "." << suffix);
3079 
3080  Array<size_t> exportObjs (tempPlan.getTotalReceiveLength () * 2);
3081  if (verbose_) {
3082  std::ostringstream os;
3083  os << *prefix << "Call tempPlan.doPostsAndWaits" << endl;
3084  std::cerr << os.str();
3085  }
3086  tempPlan.doPostsAndWaits<size_t> (importObjs (), 2, exportObjs ());
3087 
3088  // Unpack the received (GID, PID) pairs into exportGIDs and exportProcIDs, respectively.
3089  for (size_type i = 0; i < numExports; ++i) {
3090  exportGIDs[i] = static_cast<OrdinalType> (exportObjs[2*i]);
3091  exportProcIDs[i] = static_cast<int> (exportObjs[2*i+1]);
3092  }
3093 
3094  if (verbose_) {
3095  std::ostringstream os;
3096  os << *prefix << "Done" << endl;
3097  std::cerr << os.str();
3098  }
3099  }
3100 
3101  template <class OrdinalType>
3102  void Distributor::
3103  createFromRecvs (const Teuchos::ArrayView<const OrdinalType> &remoteGIDs,
3104  const Teuchos::ArrayView<const int> &remoteProcIDs,
3105  Teuchos::Array<OrdinalType> &exportGIDs,
3106  Teuchos::Array<int> &exportProcIDs)
3107  {
3108  using std::endl;
3109  const char errPrefix[] = "Tpetra::Distributor::createFromRecvs: ";
3110  const int myRank = comm_->getRank();
3111 
3112  std::unique_ptr<std::string> prefix;
3113  if (verbose_) {
3114  prefix = createPrefix("createFromRecvs");
3115  std::ostringstream os;
3116  os << *prefix << "Start" << endl;
3117  std::cerr << os.str();
3118  }
3119 
3120  const bool debug = Details::Behavior::debug("Distributor");
3121  if (debug) {
3122  using Teuchos::outArg;
3123  using Teuchos::REDUCE_MAX;
3124  using Teuchos::reduceAll;
3125  // In debug mode, first test locally, then do an all-reduce to
3126  // make sure that all processes passed.
3127  const int errProc =
3128  (remoteGIDs.size () != remoteProcIDs.size ()) ? myRank : -1;
3129  int maxErrProc = -1;
3130  reduceAll(*comm_, REDUCE_MAX, errProc, outArg(maxErrProc));
3131  TEUCHOS_TEST_FOR_EXCEPTION
3132  (maxErrProc != -1, std::runtime_error, errPrefix << "Lists "
3133  "of remote IDs and remote process IDs must have the same "
3134  "size on all participating processes. Maximum process ID "
3135  "with error: " << maxErrProc << ".");
3136  }
3137  else { // in non-debug mode, just test locally
3138  // NOTE (mfh 13 Feb 2020) This needs to throw std::runtime_error
3139  // in order to make an existing Distributor unit test pass.
3140  TEUCHOS_TEST_FOR_EXCEPTION
3141  (remoteGIDs.size() != remoteProcIDs.size(), std::runtime_error,
3142  errPrefix << "On Process " << myRank << ": "
3143  "remoteGIDs.size()=" << remoteGIDs.size() <<
3144  " != remoteProcIDs.size()=" << remoteProcIDs.size() << ".");
3145  }
3146 
3147  computeSends(remoteGIDs, remoteProcIDs, exportGIDs, exportProcIDs);
3148 
3149  const size_t numProcsSendingToMe = createFromSends(exportProcIDs ());
3150 
3151  if (verbose_) {
3152  // NOTE (mfh 20 Mar 2014) If remoteProcIDs could contain
3153  // duplicates, then its length might not be the right check here,
3154  // even if we account for selfMessage_. selfMessage_ is set in
3155  // createFromSends.
3156  std::ostringstream os;
3157  os << *prefix << "numProcsSendingToMe: "
3158  << numProcsSendingToMe << ", remoteProcIDs.size(): "
3159  << remoteProcIDs.size () << ", selfMessage_: "
3160  << (selfMessage_ ? "true" : "false") << endl;
3161  std::cerr << os.str();
3162  }
3163 
3164  howInitialized_ = Details::DISTRIBUTOR_INITIALIZED_BY_CREATE_FROM_RECVS;
3165 
3166  if (verbose_) {
3167  std::ostringstream os;
3168  os << *prefix << "Done" << endl;
3169  std::cerr << os.str();
3170  }
3171  }
3172 
3173 } // namespace Tpetra
3174 
3175 #endif // TPETRA_DISTRIBUTOR_HPP
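
Usage sketch (editorial addition; not part of the header above). The function name exampleRoundRobin and all variable names are illustrative. Assuming MPI and Kokkos have been initialized, this shows the common pattern of createFromSends() followed by the Kokkos::View overload of doPostsAndWaits(): each process sends one double to rank (myRank + 1) % numProcs and receives one from its "left" neighbor.

#include "Tpetra_Distributor.hpp"
#include "Kokkos_Core.hpp"

void exampleRoundRobin (const Teuchos::RCP<const Teuchos::Comm<int> >& comm)
{
  const int myRank = comm->getRank ();
  const int numProcs = comm->getSize ();

  // Each process sends exactly one item, to rank (myRank + 1) % numProcs.
  Teuchos::Array<int> exportProcIDs (1, (myRank + 1) % numProcs);

  Tpetra::Distributor distributor (comm);
  // createFromSends returns the total number of items this process will receive.
  const size_t numReceives = distributor.createFromSends (exportProcIDs ());

  // One packet of type double per item. Use host allocations, since the
  // header above forbids UVM allocations.
  Kokkos::View<double*, Kokkos::HostSpace> exports ("exports", 1);
  Kokkos::View<double*, Kokkos::HostSpace> imports ("imports", numReceives);
  exports(0) = static_cast<double> (myRank);

  distributor.doPostsAndWaits (exports, 1, imports);
  // imports(0) now holds the rank of the sending ("left") neighbor.
}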