Tpetra_Details_DistributorActor.hpp
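Implementation detail of Tpetra::Distributor: the DistributorActor defined here performs the point-to-point or all-to-all communication that a DistributorPlan describes.
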
// @HEADER
// *****************************************************************************
// Tpetra: Templated Linear Algebra Services Package
//
// Copyright 2008 NTESS and the Tpetra contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_DISTRIBUTOR_ACTOR_HPP
#define TPETRA_DETAILS_DISTRIBUTOR_ACTOR_HPP

#include "Teuchos_Assert.hpp"
#include "Tpetra_Details_DistributorPlan.hpp"
#include "Tpetra_Details_MpiTypeTraits.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "Tpetra_Util.hpp"

#include "Teuchos_Array.hpp"
#include "Teuchos_Comm.hpp"
#include "Teuchos_CommHelpers.hpp"
#include "Teuchos_RCP.hpp"

#include "Kokkos_TeuchosCommAdapters.hpp"

#ifdef HAVE_TPETRA_MPI
#include "mpi.h"
#endif
namespace Tpetra::Details {

template <class View>
constexpr bool isKokkosView = Kokkos::is_view<View>::value;

template <class View1, class View2>
constexpr bool areKokkosViews = Kokkos::is_view<View1>::value && Kokkos::is_view<View2>::value;

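// DistributorActor carries out the communication that a DistributorPlan
// describes: it posts the nonblocking receives and sends for a data
// redistribution and waits on the resulting requests.  The plan records
// who talks to whom; the actor owns the MPI requests and moves the data.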
class DistributorActor {
  static constexpr int DEFAULT_MPI_TAG = 1;

  using IndexView = DistributorPlan::IndexView;
  using SubViewLimits = DistributorPlan::SubViewLimits;

public:
  DistributorActor();
  DistributorActor(const DistributorActor& otherActor);

  template <class ExpView, class ImpView>
  void doPostsAndWaits(const DistributorPlan& plan,
                       const ExpView& exports,
                       size_t numPackets,
                       const ImpView& imports);

  template <class ExpView, class ImpView>
  void doPostsAndWaits(const DistributorPlan& plan,
                       const ExpView& exports,
                       const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                       const ImpView& imports,
                       const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);

  template <class ImpView>
  void doPostRecvs(const DistributorPlan& plan,
                   size_t numPackets,
                   const ImpView& imports);

  template <class ImpView>
  void doPostRecvs(const DistributorPlan& plan,
                   const ImpView& imports,
                   const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);

  template <class ExpView, class ImpView>
  void doPostSends(const DistributorPlan& plan,
                   const ExpView& exports,
                   size_t numPackets,
                   const ImpView& imports);

  template <class ExpView, class ImpView>
  void doPostSends(const DistributorPlan& plan,
                   const ExpView& exports,
                   const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                   const ImpView& imports,
                   const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);

  template <class ExpView, class ImpView>
  void doPosts(const DistributorPlan& plan,
               const ExpView& exports,
               size_t numPackets,
               const ImpView& imports);

  template <class ExpView, class ImpView>
  void doPosts(const DistributorPlan& plan,
               const ExpView& exports,
               const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
               const ImpView& imports,
               const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);

  void doWaits(const DistributorPlan& plan);

  void doWaitsRecv(const DistributorPlan& plan);

  void doWaitsSend(const DistributorPlan& plan);

  bool isReady() const;

private:

  template <class ImpView>
  void doPostRecvsImpl(const DistributorPlan& plan,
                       const ImpView& imports,
                       const SubViewLimits& totalPacketsFrom);

  template <class ExpView, class ImpView>
  void doPostSendsImpl(const DistributorPlan& plan,
                       const ExpView& exports,
                       const SubViewLimits& exportSubViewLimits,
                       const ImpView& imports,
                       const SubViewLimits& importSubViewLimits);

#ifdef HAVE_TPETRA_MPI
  template <class ExpView, class ImpView>
  void doPostsAllToAllImpl(const DistributorPlan& plan,
                           const ExpView& exports,
                           const SubViewLimits& exportSubViewLimits,
                           const ImpView& imports,
                           const SubViewLimits& importSubViewLimits);

#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
  template <class ExpView, class ImpView>
  void doPostsNbrAllToAllVImpl(const DistributorPlan& plan,
                               const ExpView& exports,
                               const SubViewLimits& exportSubViewLimits,
                               const ImpView& imports,
                               const SubViewLimits& importSubViewLimits);
#endif // HAVE_TPETRACORE_MPI_ADVANCE
#endif // HAVE_TPETRA_MPI

  int mpiTag_;

  Teuchos::Array<Teuchos::RCP<Teuchos::CommRequest<int>>> requestsRecv_;
  Teuchos::Array<Teuchos::RCP<Teuchos::CommRequest<int>>> requestsSend_;
};
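
// A typical calling sequence, mirroring the definitions below: either post
// everything and wait in one call, or split the phases so that other work
// can overlap the communication.
//
//   DistributorActor actor;
//   actor.doPostsAndWaits(plan, exports, numPackets, imports);
//
//   // ... or, equivalently ...
//   actor.doPostRecvs(plan, numPackets, imports);   // post receives first
//   actor.doPostSends(plan, exports, numPackets, imports);
//   // ... other work ...
//   actor.doWaits(plan);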

template <class ExpView, class ImpView>
void DistributorActor::doPosts(const DistributorPlan& plan,
                               const ExpView& exports,
                               size_t numPackets,
                               const ImpView& imports)
{
  doPostRecvs(plan, numPackets, imports);
  doPostSends(plan, exports, numPackets, imports);
}

template <class ExpView, class ImpView>
void DistributorActor::doPostsAndWaits(const DistributorPlan& plan,
                                       const ExpView& exports,
                                       size_t numPackets,
                                       const ImpView& imports)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPostsAndWaits must be Kokkos::Views");
  doPosts(plan, exports, numPackets, imports);
  doWaits(plan);
}

template <class ExpView, class ImpView>
void DistributorActor::doPosts(const DistributorPlan& plan,
                               const ExpView& exports,
                               const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                               const ImpView& imports,
                               const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
{
  doPostRecvs(plan, imports, numImportPacketsPerLID);
  doPostSends(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
}

template <class ExpView, class ImpView>
void DistributorActor::doPostsAndWaits(const DistributorPlan& plan,
                                       const ExpView& exports,
                                       const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                                       const ImpView& imports,
                                       const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPostsAndWaits must be Kokkos::Views");
  doPosts(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
  doWaits(plan);
}

template <typename ViewType>
using HostAccessibility = Kokkos::SpaceAccessibility<Kokkos::DefaultHostExecutionSpace, typename ViewType::memory_space>;

template <typename DstViewType, typename SrcViewType>
using enableIfHostAccessible = std::enable_if_t<HostAccessibility<DstViewType>::accessible &&
                                                HostAccessibility<SrcViewType>::accessible>;

template <typename DstViewType, typename SrcViewType>
using enableIfNotHostAccessible = std::enable_if_t<!HostAccessibility<DstViewType>::accessible ||
                                                   !HostAccessibility<SrcViewType>::accessible>;

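// packOffset copies 'size' entries from src (starting at src_offset) into
// dst (starting at dst_offset).  The enable_if aliases above select the
// overload: when both Views are host accessible, a plain memcpy suffices;
// otherwise the copy goes through Kokkos' offset-based deep_copy, which
// handles device-resident Views.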
template <typename DstViewType, typename SrcViewType>
enableIfHostAccessible<DstViewType, SrcViewType>
packOffset(const DstViewType& dst,
           const SrcViewType& src,
           const size_t dst_offset,
           const size_t src_offset,
           const size_t size)
{
  memcpy((void*) (dst.data() + dst_offset), src.data() + src_offset,
         size * sizeof(typename DstViewType::value_type));
}

template <typename DstViewType, typename SrcViewType>
enableIfNotHostAccessible<DstViewType, SrcViewType>
packOffset(const DstViewType& dst,
           const SrcViewType& src,
           const size_t dst_offset,
           const size_t src_offset,
           const size_t size)
{
  Kokkos::Compat::deep_copy_offset(dst, src, dst_offset, src_offset, size);
}

#ifdef HAVE_TPETRA_MPI
template <class ExpView, class ImpView>
void DistributorActor::doPostsAllToAllImpl(const DistributorPlan& plan,
                                           const ExpView& exports,
                                           const SubViewLimits& exportSubViewLimits,
                                           const ImpView& imports,
                                           const SubViewLimits& importSubViewLimits) {
  TEUCHOS_TEST_FOR_EXCEPTION(
      !plan.getIndicesTo().is_null(), std::runtime_error,
      "Send Type=\"Alltoall\" only works for fast-path communication.");

  using size_type = Teuchos::Array<size_t>::size_type;

  auto comm = plan.getComm();
  std::vector<int> sendcounts(comm->getSize(), 0);
  std::vector<int> sdispls(comm->getSize(), 0);
  std::vector<int> recvcounts(comm->getSize(), 0);
  std::vector<int> rdispls(comm->getSize(), 0);

  auto& [importStarts, importLengths] = importSubViewLimits;
  auto& [exportStarts, exportLengths] = exportSubViewLimits;

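  // Fill the per-rank count and displacement arrays that MPI_Alltoallv
  // expects.  Ranks this process does not exchange data with keep counts
  // of zero.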
  for (size_t pp = 0; pp < plan.getNumSends(); ++pp) {
    sdispls[plan.getProcsTo()[pp]] = exportStarts[pp];
    size_t numPackets = exportLengths[pp];
    // numPackets is converted down to int, so make sure it can be represented
    TEUCHOS_TEST_FOR_EXCEPTION(numPackets > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPostsAllToAll: "
                               "Send count for send " << pp << " (" << numPackets
                               << ") is too large to be represented as int.");
    sendcounts[plan.getProcsTo()[pp]] = static_cast<int>(numPackets);
  }

  const size_type actualNumReceives =
      Teuchos::as<size_type>(plan.getNumReceives()) +
      Teuchos::as<size_type>(plan.hasSelfMessage() ? 1 : 0);

  for (size_type i = 0; i < actualNumReceives; ++i) {
    rdispls[plan.getProcsFrom()[i]] = importStarts[i];
    size_t totalPacketsFrom_i = importLengths[i];
    // totalPacketsFrom_i is converted down to int, so make sure it can be
    // represented
    TEUCHOS_TEST_FOR_EXCEPTION(totalPacketsFrom_i > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPostsAllToAll: "
                               "Recv count for receive " << i << " (" << totalPacketsFrom_i
                               << ") is too large to be represented as int.");
    recvcounts[plan.getProcsFrom()[i]] = static_cast<int>(totalPacketsFrom_i);
  }

  Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm =
      Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm);
  Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm =
      mpiComm->getRawMpiComm();
  using T = typename ExpView::non_const_value_type;
  MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits<T>::getType(T());

#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
  if (Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL == plan.getSendType()) {
    MPIX_Comm* mpixComm = *plan.getMPIXComm();
    TEUCHOS_TEST_FOR_EXCEPTION(!mpixComm, std::runtime_error,
                               "MPIX_Comm is null in doPostsAllToAll \""
                               << __FILE__ << ":" << __LINE__);

    const int err = MPIX_Alltoallv(
        exports.data(), sendcounts.data(), sdispls.data(), rawType,
        imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm);

    TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                               "MPIX_Alltoallv failed with error \""
                               << Teuchos::mpiErrorCodeToString(err)
                               << "\".");

    return;
  }
#endif // HAVE_TPETRACORE_MPI_ADVANCE

  const int err = MPI_Alltoallv(
      exports.data(), sendcounts.data(), sdispls.data(), rawType,
      imports.data(), recvcounts.data(), rdispls.data(), rawType, (*rawComm)());

  TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                             "MPI_Alltoallv failed with error \""
                             << Teuchos::mpiErrorCodeToString(err)
                             << "\".");
}

#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
template <class ExpView, class ImpView>
void DistributorActor::doPostsNbrAllToAllVImpl(const DistributorPlan& plan,
                                               const ExpView& exports,
                                               const SubViewLimits& exportSubViewLimits,
                                               const ImpView& imports,
                                               const SubViewLimits& importSubViewLimits) {
  TEUCHOS_TEST_FOR_EXCEPTION(
      !plan.getIndicesTo().is_null(), std::runtime_error,
      "Send Type=\"Alltoall\" only works for fast-path communication.");

  using size_type = Teuchos::Array<size_t>::size_type;

  const int myRank = plan.getComm()->getRank();
  MPIX_Comm* mpixComm = *plan.getMPIXComm();

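  // Unlike MPI_Alltoallv, the neighborhood variant takes counts and
  // displacements only for the processes this rank actually communicates
  // with, so the arrays below are sized by the number of sends and
  // receives rather than by the communicator size.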
  const size_t numSends = plan.getNumSends() + plan.hasSelfMessage();
  const size_t numRecvs = plan.getNumReceives() + plan.hasSelfMessage();
  std::vector<int> sendcounts(numSends, 0);
  std::vector<int> sdispls(numSends, 0);
  std::vector<int> recvcounts(numRecvs, 0);
  std::vector<int> rdispls(numRecvs, 0);

  auto& [importStarts, importLengths] = importSubViewLimits;
  auto& [exportStarts, exportLengths] = exportSubViewLimits;

  for (size_t pp = 0; pp < plan.getNumSends(); ++pp) {
    sdispls[plan.getProcsTo()[pp]] = exportStarts[pp];
    size_t numPackets = exportLengths[pp];
    // numPackets is converted down to int, so make sure it can be represented
    TEUCHOS_TEST_FOR_EXCEPTION(numPackets > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPostsNbrAllToAllV: "
                               "Send count for send " << pp << " (" << numPackets
                               << ") is too large to be represented as int.");
    sendcounts[plan.getProcsTo()[pp]] = static_cast<int>(numPackets);
  }

  const size_type actualNumReceives =
      Teuchos::as<size_type>(plan.getNumReceives()) +
      Teuchos::as<size_type>(plan.hasSelfMessage() ? 1 : 0);

  for (size_type i = 0; i < actualNumReceives; ++i) {
    rdispls[plan.getProcsFrom()[i]] = importStarts(i);
    size_t totalPacketsFrom_i = importLengths(i);
    // totalPacketsFrom_i is converted down to int, so make sure it can be
    // represented
    TEUCHOS_TEST_FOR_EXCEPTION(totalPacketsFrom_i > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPostsNbrAllToAllV: "
                               "Recv count for receive " << i << " (" << totalPacketsFrom_i
                               << ") is too large to be represented as int.");
    recvcounts[plan.getProcsFrom()[i]] = static_cast<int>(totalPacketsFrom_i);
  }

  using T = typename ExpView::non_const_value_type;
  MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits<T>::getType(T());

  const int err = MPIX_Neighbor_alltoallv(
      exports.data(), sendcounts.data(), sdispls.data(), rawType,
      imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm);

  TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                             "MPIX_Neighbor_alltoallv failed with error \""
                             << Teuchos::mpiErrorCodeToString(err)
                             << "\".");
}
#endif // HAVE_TPETRACORE_MPI_ADVANCE
#endif // HAVE_TPETRA_MPI

template <class ImpView>
void DistributorActor::doPostRecvs(const DistributorPlan& plan,
                                   size_t numPackets,
                                   const ImpView& imports)
{
  auto importSubViewLimits = plan.getImportViewLimits(numPackets);
  doPostRecvsImpl(plan, imports, importSubViewLimits);
}

template <class ImpView>
void DistributorActor::doPostRecvs(const DistributorPlan& plan,
                                   const ImpView& imports,
                                   const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
{
  auto importSubViewLimits = plan.getImportViewLimits(numImportPacketsPerLID);
  doPostRecvsImpl(plan, imports, importSubViewLimits);
}

template <class ImpView>
void DistributorActor::doPostRecvsImpl(const DistributorPlan& plan,
                                       const ImpView& imports,
                                       const SubViewLimits& importSubViewLimits)
{
  static_assert(isKokkosView<ImpView>,
                "Data arrays for DistributorActor::doPostRecvs must be Kokkos::Views");
  using Teuchos::Array;
  using Teuchos::as;
  using Teuchos::ireceive;
  using Kokkos::Compat::subview_offset;
  using size_type = Array<size_t>::size_type;
  using imports_view_type = ImpView;

#ifdef KOKKOS_ENABLE_CUDA
  static_assert (! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
                 "Please do not use Tpetra::Distributor with UVM "
                 "allocations. See GitHub issue #1088.");
#endif // KOKKOS_ENABLE_CUDA

#ifdef KOKKOS_ENABLE_SYCL
  static_assert (! std::is_same<typename ImpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value,
                 "Please do not use Tpetra::Distributor with SharedUSM "
                 "allocations. See GitHub issue #1088 (corresponding to CUDA).");
#endif // KOKKOS_ENABLE_SYCL

#if defined(HAVE_TPETRA_MPI)
  // All-to-all communication layout is quite different from
  // point-to-point, so we handle it separately.

  // These send options require no matching receives, so we just return.
  const Details::EDistributorSendType sendType = plan.getSendType();
  if ((sendType == Details::DISTRIBUTOR_ALLTOALL)
#ifdef HAVE_TPETRACORE_MPI_ADVANCE
      || (sendType == Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL)
      || (sendType == Details::DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV)
#endif
     ) {
    return;
  }
#endif // HAVE_TPETRA_MPI

  ProfilingRegion pr("Tpetra::Distributor::doPostRecvs");

  const int myProcID = plan.getComm()->getRank ();

  auto& [importStarts, importLengths] = importSubViewLimits;

  // Distributor uses requestsRecv_.size() and requestsSend_.size()
  // as the number of outstanding nonblocking message requests, so
  // we resize to zero to maintain this invariant.
  //
  // getNumReceives() does _not_ include the self message, if there is
  // one. Here, we do actually send a message to ourselves, so we
  // include any self message in the "actual" number of receives to
  // post.
  //
  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
  // doesn't (re)allocate its array of requests. That happens in
  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
  // demand), or Resize_().
  const size_type actualNumReceives = as<size_type> (plan.getNumReceives()) +
      as<size_type> (plan.hasSelfMessage() ? 1 : 0);

#ifdef HAVE_TPETRA_DEBUG
  size_t totalNumImportPackets = 0;
  for (size_t i = 0; i < Teuchos::as<size_t>(actualNumReceives); ++i) {
    totalNumImportPackets += importLengths[i];
  }
  TEUCHOS_TEST_FOR_EXCEPTION(
      imports.extent (0) < totalNumImportPackets, std::runtime_error,
      "Tpetra::Distributor::doPostRecvs: The 'imports' array must have "
      "enough entries to hold the expected number of import packets. "
      "imports.extent(0) = " << imports.extent (0) << " < "
      "totalNumImportPackets = " << totalNumImportPackets << ".");
  TEUCHOS_TEST_FOR_EXCEPTION
    (!requestsRecv_.empty(), std::logic_error, "Tpetra::Distributor::"
     "doPostRecvs: Process " << myProcID << ": requestsRecv_.size () = "
     << requestsRecv_.size () << " != 0.");
#endif // HAVE_TPETRA_DEBUG

  requestsRecv_.resize (0);

  // Post the nonblocking receives. It's common MPI wisdom to post
  // receives before sends. In MPI terms, this means favoring
  // adding to the "posted queue" (of receive requests) over adding
  // to the "unexpected queue" (of arrived messages not yet matched
  // with a receive).
  {
    ProfilingRegion prr("Tpetra::Distributor::doPostRecvs MPI_Irecv");

    for (size_type i = 0; i < actualNumReceives; ++i) {
      size_t totalPacketsFrom_i = importLengths[Teuchos::as<size_t>(i)];
      TEUCHOS_TEST_FOR_EXCEPTION(totalPacketsFrom_i > size_t(INT_MAX),
          std::logic_error, "Tpetra::Distributor::doPostRecvs: "
          "Recv count for receive " << i << " (" << totalPacketsFrom_i << ") is too large "
          "to be represented as int.");
      if (plan.getProcsFrom()[i] != myProcID && totalPacketsFrom_i) {
        // If my process is receiving these packet(s) from another
        // process (not a self-receive), and if there is at least
        // one packet to receive:
        //
        // 1. Set up the persisting view (recvBuf) into the imports
        //    array, given the offset and size (total number of
        //    packets from process getProcsFrom()[i]).
        // 2. Start the Irecv and save the resulting request.
        imports_view_type recvBuf =
            subview_offset (imports, importStarts[i], totalPacketsFrom_i);
        requestsRecv_.push_back (ireceive<int> (recvBuf, plan.getProcsFrom()[i],
                                                mpiTag_, *plan.getComm()));
      }
    }
  }
}

template <class ExpView, class ImpView>
void DistributorActor::doPostSends(const DistributorPlan& plan,
                                   const ExpView& exports,
                                   size_t numPackets,
                                   const ImpView& imports)
{
  auto exportSubViewLimits = plan.getExportViewLimits(numPackets);
  auto importSubViewLimits = plan.getImportViewLimits(numPackets);
  doPostSendsImpl(plan, exports, exportSubViewLimits, imports, importSubViewLimits);
}

template <class ExpView, class ImpView>
void DistributorActor::doPostSends(const DistributorPlan& plan,
                                   const ExpView& exports,
                                   const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                                   const ImpView& imports,
                                   const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
{
  auto exportSubViewLimits = plan.getExportViewLimits(numExportPacketsPerLID);
  auto importSubViewLimits = plan.getImportViewLimits(numImportPacketsPerLID);
  doPostSendsImpl(plan, exports, exportSubViewLimits, imports, importSubViewLimits);
}

template <class ExpView, class ImpView>
void DistributorActor::doPostSendsImpl(const DistributorPlan& plan,
                                       const ExpView& exports,
                                       const SubViewLimits& exportSubViewLimits,
                                       const ImpView& imports,
                                       const SubViewLimits& importSubViewLimits)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPostSends must be Kokkos::Views");
  using Teuchos::Array;
  using Teuchos::as;
  using Teuchos::isend;
  using Teuchos::send;
  using Kokkos::Compat::subview_offset;
  using Kokkos::Compat::deep_copy_offset;
  using size_type = Array<size_t>::size_type;
  using exports_view_type = ExpView;

#ifdef KOKKOS_ENABLE_CUDA
  static_assert
    (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
     ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
     "Please do not use Tpetra::Distributor with UVM allocations. "
     "See Trilinos GitHub issue #1088.");
#endif // KOKKOS_ENABLE_CUDA

#ifdef KOKKOS_ENABLE_SYCL
  static_assert
    (! std::is_same<typename ExpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value &&
     ! std::is_same<typename ImpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value,
     "Please do not use Tpetra::Distributor with SharedUSM allocations. "
     "See Trilinos GitHub issue #1088 (corresponding to CUDA).");
#endif // KOKKOS_ENABLE_SYCL

  ProfilingRegion ps("Tpetra::Distributor::doPostSends");

  const int myRank = plan.getComm()->getRank ();
  // Run-time configurable parameters that come from the input
  // ParameterList set by setParameterList().
  const Details::EDistributorSendType sendType = plan.getSendType();

  auto& [exportStarts, exportLengths] = exportSubViewLimits;
  auto& [importStarts, importLengths] = importSubViewLimits;

#if defined(HAVE_TPETRA_MPI)
  // All-to-all communication layout is quite different from
  // point-to-point, so we handle it separately.

  if (sendType == Details::DISTRIBUTOR_ALLTOALL) {
    doPostsAllToAllImpl(plan, exports, exportSubViewLimits, imports, importSubViewLimits);
    return;
  }
#ifdef HAVE_TPETRACORE_MPI_ADVANCE
  else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL) {
    doPostsAllToAllImpl(plan, exports, exportSubViewLimits, imports, importSubViewLimits);
    return;
  } else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) {
    doPostsNbrAllToAllVImpl(plan, exports, exportSubViewLimits, imports, importSubViewLimits);
    return;
  }
#endif // defined(HAVE_TPETRACORE_MPI_ADVANCE)

#else // HAVE_TPETRA_MPI
  if (plan.hasSelfMessage()) {
    // This is how we "send a message to ourself": we copy from
    // the export buffer to the import buffer. That saves
    // Teuchos::Comm implementations other than MpiComm (in
    // particular, SerialComm) the trouble of implementing self
    // messages correctly. (To do this right, SerialComm would
    // need internal buffer space for messages, keyed on the
    // message's tag.)
    size_t selfReceiveOffset = 0;
    deep_copy_offset(imports, exports, selfReceiveOffset,
                     exportStarts[0],
                     exportLengths[0]);
  }
  // should we just return here?
  // likely not as comm could be a serial comm
#endif // HAVE_TPETRA_MPI

  size_t selfReceiveOffset = 0;

#ifdef HAVE_TPETRA_DEBUG
  TEUCHOS_TEST_FOR_EXCEPTION
    (requestsSend_.size () != 0,
     std::logic_error,
     "Tpetra::Distributor::doPostSends: Process "
     << myRank << ": requestsSend_.size() = " << requestsSend_.size () << " != 0.");
#endif // HAVE_TPETRA_DEBUG

  // Distributor uses requestsRecv_.size() and requestsSend_.size()
  // as the number of outstanding nonblocking message requests, so
  // we resize to zero to maintain this invariant.
  //
  // getNumReceives() does _not_ include the self message, if there is
  // one. Here, we do actually send a message to ourselves, so we
  // include any self message in the "actual" number of receives to
  // post.
  //
  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
  // doesn't (re)allocate its array of requests. That happens in
  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
  // demand), or Resize_().
  const size_type actualNumReceives = as<size_type> (plan.getNumReceives()) +
      as<size_type> (plan.hasSelfMessage() ? 1 : 0);
  requestsSend_.resize (0);

  {
    for (size_type i = 0; i < actualNumReceives; ++i) {
      if (plan.getProcsFrom()[i] == myRank) { // Receiving from myself
        selfReceiveOffset = importStarts[i]; // Remember the self-recv offset
      }
    }
  }

  ProfilingRegion pss("Tpetra::Distributor::doPostSends sends");

  // setup scan through getProcsTo() list starting with higher numbered procs
  // (should help balance message traffic)
  //
  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
  // It doesn't depend on the input at all.
  size_t numBlocks = plan.getNumSends() + plan.hasSelfMessage();
  size_t procIndex = 0;
  while ((procIndex < numBlocks) && (plan.getProcsTo()[procIndex] < myRank)) {
    ++procIndex;
  }
  if (procIndex == numBlocks) {
    procIndex = 0;
  }

  size_t selfNum = 0;
  size_t selfIndex = 0;

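  // Fast path: when getIndicesTo() is null, the exports are already laid
  // out contiguously by destination process, so each message can be sent
  // directly out of the exports View.  Otherwise fall through to the
  // "slow" path below, which packs each message into a temporary send
  // buffer first.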
  if (plan.getIndicesTo().is_null()) {
    const char isend_region[] = "Tpetra::Distributor::doPostSends MPI_Isend FAST";
    const char send_region[] = "Tpetra::Distributor::doPostSends MPI_Send FAST";
    ProfilingRegion pssf((sendType == Details::DISTRIBUTOR_ISEND) ? isend_region : send_region);

    // Data are already blocked (laid out) by process, so we don't
    // need a separate send buffer (besides the exports array).
    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      if (plan.getProcsTo()[p] != myRank) {
        if (exportLengths[p] == 0) {
          // Do not attempt to send messages of length 0.
          continue;
        }

        exports_view_type tmpSend = subview_offset(exports, exportStarts[p], exportLengths[p]);

        if (sendType == Details::DISTRIBUTOR_ISEND) {
          // NOTE: This looks very similar to the tmpSend above, but removing
          // tmpSendBuf and using tmpSend instead leads to a performance hit
          // on Arm SerialNode builds
          exports_view_type tmpSendBuf =
              subview_offset (exports, exportStarts[p], exportLengths[p]);
          requestsSend_.push_back (isend<int> (tmpSendBuf, plan.getProcsTo()[p],
                                               mpiTag_, *plan.getComm()));
        }
        else { // DISTRIBUTOR_SEND
          send<int> (tmpSend,
                     as<int> (tmpSend.size ()),
                     plan.getProcsTo()[p], mpiTag_, *plan.getComm());
        }
      }
      else { // "Sending" the message to myself
        selfNum = p;
      }
    }

    if (plan.hasSelfMessage()) {
      // This is how we "send a message to ourself": we copy from
      // the export buffer to the import buffer. That saves
      // Teuchos::Comm implementations other than MpiComm (in
      // particular, SerialComm) the trouble of implementing self
      // messages correctly. (To do this right, SerialComm would
      // need internal buffer space for messages, keyed on the
      // message's tag.)
      deep_copy_offset(imports, exports, selfReceiveOffset,
                       exportStarts[selfNum], exportLengths[selfNum]);
    }

  }
  else { // data are not blocked by proc, use send buffer
    ProfilingRegion psss("Tpetra::Distributor::doPostSends: MPI_Send SLOW");

    using Packet = typename ExpView::non_const_value_type;
    using Layout = typename ExpView::array_layout;
    using Device = typename ExpView::device_type;
    using Mem = typename ExpView::memory_traits;

    // This buffer is long enough for only one message at a time.
    // Thus, we use DISTRIBUTOR_SEND always in this case, regardless
    // of sendType requested by user.
    // This code path formerly errored out with message:
    //   Tpetra::Distributor::doPosts(3 args):
    //   The "send buffer" code path
    //   doesn't currently work with nonblocking sends.
    // Now, we opt to just do the communication in a way that works.
#ifdef HAVE_TPETRA_DEBUG
    if (sendType != Details::DISTRIBUTOR_SEND) {
      if (plan.getComm()->getRank() == 0)
        std::cout << "The requested Tpetra send type "
                  << DistributorSendTypeEnumToString(sendType)
                  << " requires Distributor data to be ordered by"
                  << " the receiving processor rank. Since these"
                  << " data are not ordered, Tpetra will use Send"
                  << " instead." << std::endl;
    }
#endif

    size_t maxSendLength = 0;
    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      size_t sendArrayOffset = 0;
      size_t j = plan.getStartsTo()[p];
      for (size_t k = 0; k < plan.getLengthsTo()[p]; ++k, ++j) {
        sendArrayOffset += exportLengths[j];
      }
      maxSendLength = std::max(maxSendLength, sendArrayOffset);
    }
    Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray", maxSendLength);

    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      if (plan.getProcsTo()[p] != myRank) {
        size_t sendArrayOffset = 0;
        size_t j = plan.getStartsTo()[p];
        for (size_t k = 0; k < plan.getLengthsTo()[p]; ++k, ++j) {
          packOffset(sendArray, exports, sendArrayOffset, exportStarts[j], exportLengths[j]);
          sendArrayOffset += exportLengths[j];
        }
        typename ExpView::execution_space().fence();

        ImpView tmpSend =
            subview_offset(sendArray, size_t(0), sendArrayOffset);

        send<int> (tmpSend,
                   as<int> (tmpSend.size ()),
                   plan.getProcsTo()[p], mpiTag_, *plan.getComm());
      }
      else { // "Sending" the message to myself
        selfNum = p;
        selfIndex = plan.getStartsTo()[p];
      }
    }

    if (plan.hasSelfMessage()) {
      for (size_t k = 0; k < plan.getLengthsTo()[selfNum]; ++k) {
        packOffset(imports, exports, selfReceiveOffset, exportStarts[selfIndex], exportLengths[selfIndex]);
        selfReceiveOffset += exportLengths[selfIndex];
        ++selfIndex;
      }
    }
  }
}

} // namespace Tpetra::Details

#endif // TPETRA_DETAILS_DISTRIBUTOR_ACTOR_HPP