Tpetra_Details_DistributorActor.hpp
// @HEADER
// *****************************************************************************
// Tpetra: Templated Linear Algebra Services Package
//
// Copyright 2008 NTESS and the Tpetra contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_DISTRIBUTOR_ACTOR_HPP
#define TPETRA_DETAILS_DISTRIBUTOR_ACTOR_HPP

#include "Tpetra_Details_DistributorPlan.hpp"
#include "Tpetra_Details_Profiling.hpp"
#include "Tpetra_Util.hpp"

#include "Teuchos_Array.hpp"
#include "Teuchos_Assert.hpp"
#include "Teuchos_Comm.hpp"
#include "Teuchos_RCP.hpp"

#include "Kokkos_TeuchosCommAdapters.hpp"

#ifdef HAVE_TPETRA_MPI
#include "Tpetra_Details_MpiTypeTraits.hpp"
#include "mpi.h"
#endif

namespace Tpetra::Details {

template <class View>
constexpr bool isKokkosView = Kokkos::is_view<View>::value;

template <class View1, class View2>
constexpr bool areKokkosViews = Kokkos::is_view<View1>::value && Kokkos::is_view<View2>::value;

class DistributorActor {
  static constexpr int DEFAULT_MPI_TAG = 1;

  using IndexView = DistributorPlan::IndexView;
  using SubViewLimits = DistributorPlan::SubViewLimits;

public:
  DistributorActor();
  DistributorActor(const DistributorActor& otherActor);

  template <class ExpView, class ImpView>
  void doPostsAndWaits(const DistributorPlan& plan,
                       const ExpView& exports,
                       size_t numPackets,
                       const ImpView& imports);

  template <class ExpView, class ImpView>
  void doPostsAndWaits(const DistributorPlan& plan,
                       const ExpView& exports,
                       const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                       const ImpView& imports,
                       const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);

  template <class ImpView>
  void doPostRecvs(const DistributorPlan& plan,
                   size_t numPackets,
                   const ImpView& imports);

  template <class ImpView>
  void doPostRecvs(const DistributorPlan& plan,
                   const ImpView& imports,
                   const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);

  template <class ExpView, class ImpView>
  void doPostSends(const DistributorPlan& plan,
                   const ExpView& exports,
                   size_t numPackets,
                   const ImpView& imports);

  template <class ExpView, class ImpView>
  void doPostSends(const DistributorPlan& plan,
                   const ExpView& exports,
                   const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                   const ImpView& imports,
                   const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);

  template <class ExpView, class ImpView>
  void doPosts(const DistributorPlan& plan,
               const ExpView& exports,
               size_t numPackets,
               const ImpView& imports);

  template <class ExpView, class ImpView>
  void doPosts(const DistributorPlan& plan,
               const ExpView& exports,
               const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
               const ImpView& imports,
               const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);

  void doWaits(const DistributorPlan& plan);

  void doWaitsRecv(const DistributorPlan& plan);

  void doWaitsSend(const DistributorPlan& plan);

  bool isReady() const;

private:
  template <class ImpView>
  void doPostRecvsImpl(const DistributorPlan& plan,
                       const ImpView& imports,
                       const SubViewLimits& totalPacketsFrom);

  template <class ExpView, class ImpView>
  void doPostSendsImpl(const DistributorPlan& plan,
                       const ExpView& exports,
                       const SubViewLimits& exportSubViewLimits,
                       const ImpView& imports,
                       const SubViewLimits& importSubViewLimits);

#ifdef HAVE_TPETRA_MPI
  template <class ExpView, class ImpView>
  void doPostsAllToAllImpl(const DistributorPlan& plan,
                           const ExpView& exports,
                           const SubViewLimits& exportSubViewLimits,
                           const ImpView& imports,
                           const SubViewLimits& importSubViewLimits);

#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
  template <class ExpView, class ImpView>
  void doPostsNbrAllToAllVImpl(const DistributorPlan& plan,
                               const ExpView& exports,
                               const SubViewLimits& exportSubViewLimits,
                               const ImpView& imports,
                               const SubViewLimits& importSubViewLimits);
#endif // HAVE_TPETRACORE_MPI_ADVANCE
#endif // HAVE_TPETRA_MPI

  int mpiTag_;

  Teuchos::Array<Teuchos::RCP<Teuchos::CommRequest<int>>> requestsRecv_;
  Teuchos::Array<Teuchos::RCP<Teuchos::CommRequest<int>>> requestsSend_;
};
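
// A minimal usage sketch (illustrative only, not part of this header): a
// DistributorActor executes the communication pattern described by a
// DistributorPlan.  The plan construction and the buffer names/sizes below
// are assumptions for illustration; in Tpetra they normally come from a
// Tpetra::Distributor.
//
//   using buffer_type = Kokkos::View<double*>;          // hypothetical packet buffers
//   const DistributorPlan& plan = /* built elsewhere */;
//   DistributorActor actor;
//   buffer_type exports("exports", numExportPackets);   // sizes are hypothetical
//   buffer_type imports("imports", numImportPackets);
//
//   // Either post and wait in one call...
//   actor.doPostsAndWaits(plan, exports, packetsPerLID, imports);
//
//   // ...or post receives and sends separately to overlap communication
//   // with local work, then wait:
//   actor.doPostRecvs(plan, packetsPerLID, imports);
//   actor.doPostSends(plan, exports, packetsPerLID, imports);
//   /* ... local computation ... */
//   actor.doWaits(plan);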

template <class ExpView, class ImpView>
void DistributorActor::doPosts(const DistributorPlan& plan,
                               const ExpView& exports,
                               size_t numPackets,
                               const ImpView& imports)
{
  doPostRecvs(plan, numPackets, imports);
  doPostSends(plan, exports, numPackets, imports);
}

template <class ExpView, class ImpView>
void DistributorActor::doPostsAndWaits(const DistributorPlan& plan,
                                       const ExpView& exports,
                                       size_t numPackets,
                                       const ImpView& imports)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPostsAndWaits must be Kokkos::Views");
  doPosts(plan, exports, numPackets, imports);
  doWaits(plan);
}

template <class ExpView, class ImpView>
void DistributorActor::doPosts(const DistributorPlan& plan,
                               const ExpView& exports,
                               const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                               const ImpView& imports,
                               const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
{
  doPostRecvs(plan, imports, numImportPacketsPerLID);
  doPostSends(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
}

template <class ExpView, class ImpView>
void DistributorActor::doPostsAndWaits(const DistributorPlan& plan,
                                       const ExpView& exports,
                                       const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                                       const ImpView& imports,
                                       const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPostsAndWaits must be Kokkos::Views");
  doPosts(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
  doWaits(plan);
}

template <typename ViewType>
using HostAccessibility = Kokkos::SpaceAccessibility<Kokkos::DefaultHostExecutionSpace, typename ViewType::memory_space>;

template <typename DstViewType, typename SrcViewType>
using enableIfHostAccessible = std::enable_if_t<HostAccessibility<DstViewType>::accessible &&
                                                HostAccessibility<SrcViewType>::accessible>;

template <typename DstViewType, typename SrcViewType>
using enableIfNotHostAccessible = std::enable_if_t<!HostAccessibility<DstViewType>::accessible ||
                                                   !HostAccessibility<SrcViewType>::accessible>;

template <typename DstViewType, typename SrcViewType>
enableIfHostAccessible<DstViewType, SrcViewType>
packOffset(const DstViewType& dst,
           const SrcViewType& src,
           const size_t dst_offset,
           const size_t src_offset,
           const size_t size)
{
  memcpy((void*) (dst.data()+dst_offset), src.data()+src_offset, size*sizeof(typename DstViewType::value_type));
}

template <typename DstViewType, typename SrcViewType>
enableIfNotHostAccessible<DstViewType, SrcViewType>
packOffset(const DstViewType& dst,
           const SrcViewType& src,
           const size_t dst_offset,
           const size_t src_offset,
           const size_t size)
{
  Kokkos::Compat::deep_copy_offset(dst, src, dst_offset, src_offset, size);
}
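
// Illustration (an assumption-laden sketch, not part of the original header) of
// how the two packOffset overloads resolve.  With host-accessible memory spaces
// the memcpy overload is chosen; otherwise the Kokkos deep-copy overload is
// chosen.  The CUDA space below is only one example of a non-host-accessible
// space.
//
//   Kokkos::View<double*, Kokkos::HostSpace> h_dst("h_dst", n), h_src("h_src", n);
//   packOffset(h_dst, h_src, 0, 0, n);  // host-accessible: memcpy path
//
//   Kokkos::View<double*, Kokkos::CudaSpace> d_dst("d_dst", n), d_src("d_src", n);
//   packOffset(d_dst, d_src, 0, 0, n);  // not host-accessible: deep_copy_offset path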

#ifdef HAVE_TPETRA_MPI
template <class ExpView, class ImpView>
void DistributorActor::doPostsAllToAllImpl(const DistributorPlan& plan,
                                           const ExpView& exports,
                                           const SubViewLimits& exportSubViewLimits,
                                           const ImpView& imports,
                                           const SubViewLimits& importSubViewLimits) {
  TEUCHOS_TEST_FOR_EXCEPTION(
      !plan.getIndicesTo().is_null(), std::runtime_error,
      "Send Type=\"Alltoall\" only works for fast-path communication.");

  using size_type = Teuchos::Array<size_t>::size_type;

  auto comm = plan.getComm();
  std::vector<int> sendcounts(comm->getSize(), 0);
  std::vector<int> sdispls(comm->getSize(), 0);
  std::vector<int> recvcounts(comm->getSize(), 0);
  std::vector<int> rdispls(comm->getSize(), 0);

  auto& [importStarts, importLengths] = importSubViewLimits;
  auto& [exportStarts, exportLengths] = exportSubViewLimits;

  for (size_t pp = 0; pp < plan.getNumSends(); ++pp) {
    sdispls[plan.getProcsTo()[pp]] = exportStarts[pp];
    size_t numPackets = exportLengths[pp];
    // numPackets is converted down to int, so make sure it can be represented
    TEUCHOS_TEST_FOR_EXCEPTION(numPackets > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPostsAllToAll: "
                               "Send count for send "
                                   << pp << " (" << numPackets
                                   << ") is too large "
                                      "to be represented as int.");
    sendcounts[plan.getProcsTo()[pp]] = static_cast<int>(numPackets);
  }

  const size_type actualNumReceives =
      Teuchos::as<size_type>(plan.getNumReceives()) +
      Teuchos::as<size_type>(plan.hasSelfMessage() ? 1 : 0);

  for (size_type i = 0; i < actualNumReceives; ++i) {
    rdispls[plan.getProcsFrom()[i]] = importStarts[i];
    size_t totalPacketsFrom_i = importLengths[i];
    // totalPacketsFrom_i is converted down to int, so make sure it can be
    // represented
    TEUCHOS_TEST_FOR_EXCEPTION(totalPacketsFrom_i > size_t(INT_MAX),
                               std::logic_error,
                               "Tpetra::Distributor::doPostsAllToAll: "
                               "Recv count for receive "
                                   << i << " (" << totalPacketsFrom_i
                                   << ") is too large "
                                      "to be represented as int.");
    recvcounts[plan.getProcsFrom()[i]] = static_cast<int>(totalPacketsFrom_i);
  }

  Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm =
      Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm);
  Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm =
      mpiComm->getRawMpiComm();
  using T = typename ExpView::non_const_value_type;
  MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits<T>::getType(T());

#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
  if (Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL == plan.getSendType()) {
    MPIX_Comm *mpixComm = *plan.getMPIXComm();
    TEUCHOS_TEST_FOR_EXCEPTION(!mpixComm, std::runtime_error,
                               "MPIX_Comm is null in doPostsAllToAll \""
                                   << __FILE__ << ":" << __LINE__);

    const int err = MPIX_Alltoallv(
        exports.data(), sendcounts.data(), sdispls.data(), rawType,
        imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm);

    TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                               "MPIX_Alltoallv failed with error \""
                                   << Teuchos::mpiErrorCodeToString(err)
                                   << "\".");

    return;
  }
#endif // HAVE_TPETRACORE_MPI_ADVANCE

  const int err = MPI_Alltoallv(
      exports.data(), sendcounts.data(), sdispls.data(), rawType,
      imports.data(), recvcounts.data(), rdispls.data(), rawType, (*rawComm)());

  TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                             "MPI_Alltoallv failed with error \""
                                 << Teuchos::mpiErrorCodeToString(err)
                                 << "\".");
}
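
// Worked example (illustrative numbers, not from the source): on a 4-rank
// communicator, if this rank sends 3 packets to rank 1 starting at export
// offset 0 and 5 packets to rank 3 starting at export offset 3, the arrays
// passed to MPI_Alltoallv above would be
//
//   sendcounts = {0, 3, 0, 5}
//   sdispls    = {0, 0, 0, 3}
//
// with recvcounts and rdispls filled analogously from the import limits.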

#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
template <class ExpView, class ImpView>
void DistributorActor::doPostsNbrAllToAllVImpl(const DistributorPlan& plan,
                                               const ExpView& exports,
                                               const SubViewLimits& exportSubViewLimits,
                                               const ImpView& imports,
                                               const SubViewLimits& importSubViewLimits) {
  TEUCHOS_TEST_FOR_EXCEPTION(
      !plan.getIndicesTo().is_null(), std::runtime_error,
      "Send Type=\"Alltoall\" only works for fast-path communication.");

  const int myRank = plan.getComm()->getRank();
  MPIX_Comm *mpixComm = *plan.getMPIXComm();

  const size_t numSends = plan.getNumSends() + plan.hasSelfMessage();
  const size_t numRecvs = plan.getNumReceives() + plan.hasSelfMessage();
  std::vector<int> sendcounts(numSends, 0);
  std::vector<int> sdispls(numSends, 0);
  std::vector<int> recvcounts(numRecvs, 0);
  std::vector<int> rdispls(numRecvs, 0);

  auto& [importStarts, importLengths] = importSubViewLimits;
  auto& [exportStarts, exportLengths] = exportSubViewLimits;
  for (size_t pp = 0; pp < plan.getNumSends(); ++pp) {
    // Counts and displacements for the neighbor collective are ordered by
    // neighbor index (the position in getProcsTo()), not by global rank.
    sdispls[pp] = exportStarts[pp];
    size_t numPackets = exportLengths[pp];
    // numPackets is converted down to int, so make sure it can be represented
    TEUCHOS_TEST_FOR_EXCEPTION(numPackets > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPostsNbrAllToAllV: "
                               "Send count for send "
                                   << pp << " (" << numPackets
                                   << ") is too large "
                                      "to be represented as int.");
    sendcounts[pp] = static_cast<int>(numPackets);
  }

  for (size_t i = 0; i < numRecvs; ++i) {
    rdispls[i] = importStarts[i];
    size_t totalPacketsFrom_i = importLengths[i];
    // totalPacketsFrom_i is converted down to int, so make sure it can be
    // represented
    TEUCHOS_TEST_FOR_EXCEPTION(totalPacketsFrom_i > size_t(INT_MAX),
                               std::logic_error,
                               "Tpetra::Distributor::doPostsNbrAllToAllV: "
                               "Recv count for receive "
                                   << i << " (" << totalPacketsFrom_i
                                   << ") is too large "
                                      "to be represented as int.");
    recvcounts[i] = static_cast<int>(totalPacketsFrom_i);
  }

  using T = typename ExpView::non_const_value_type;
  MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits<T>::getType(T());

  const int err = MPIX_Neighbor_alltoallv(
      exports.data(), sendcounts.data(), sdispls.data(), rawType,
      imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm);

  TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                             "MPIX_Neighbor_alltoallv failed with error \""
                                 << Teuchos::mpiErrorCodeToString(err)
                                 << "\".");
}
#endif // HAVE_TPETRACORE_MPI_ADVANCE
#endif // HAVE_TPETRA_MPI
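
// Note: the all-to-all code paths above use blocking collectives, so they
// complete before returning and never append to requestsRecv_ or
// requestsSend_; for these send types a subsequent doWaits() has nothing
// to wait on.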

template <class ImpView>
void DistributorActor::doPostRecvs(const DistributorPlan& plan,
                                   size_t numPackets,
                                   const ImpView& imports)
{
  auto importSubViewLimits = plan.getImportViewLimits(numPackets);
  doPostRecvsImpl(plan, imports, importSubViewLimits);
}

template <class ImpView>
void DistributorActor::doPostRecvs(const DistributorPlan& plan,
                                   const ImpView& imports,
                                   const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
{
  auto importSubViewLimits = plan.getImportViewLimits(numImportPacketsPerLID);
  doPostRecvsImpl(plan, imports, importSubViewLimits);
}

template <class ImpView>
void DistributorActor::doPostRecvsImpl(const DistributorPlan& plan,
                                       const ImpView& imports,
                                       const SubViewLimits& importSubViewLimits)
{
  static_assert(isKokkosView<ImpView>,
                "Data arrays for DistributorActor::doPostRecvs must be Kokkos::Views");
  using Teuchos::Array;
  using Teuchos::as;
  using Teuchos::ireceive;
  using Kokkos::Compat::subview_offset;
  using size_type = Array<size_t>::size_type;
  using imports_view_type = ImpView;

#ifdef KOKKOS_ENABLE_CUDA
  static_assert (! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
                 "Please do not use Tpetra::Distributor with UVM "
                 "allocations. See GitHub issue #1088.");
#endif // KOKKOS_ENABLE_CUDA

#ifdef KOKKOS_ENABLE_SYCL
  static_assert (! std::is_same<typename ImpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value,
                 "Please do not use Tpetra::Distributor with SharedUSM "
                 "allocations. See GitHub issue #1088 (corresponding to CUDA).");
#endif // KOKKOS_ENABLE_SYCL

#if defined(HAVE_TPETRA_MPI)
  // All-to-all communication layout is quite different from
  // point-to-point, so we handle it separately.

  // These send options require no matching receives, so we just return.
  const Details::EDistributorSendType sendType = plan.getSendType();
  if ((sendType == Details::DISTRIBUTOR_ALLTOALL)
#ifdef HAVE_TPETRACORE_MPI_ADVANCE
      || (sendType == Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL)
      || (sendType == Details::DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV)
#endif
      ) {
    return;
  }
#endif // HAVE_TPETRA_MPI

  ProfilingRegion pr("Tpetra::Distributor: doPostRecvs");

  const int myProcID = plan.getComm()->getRank ();

  auto& [importStarts, importLengths] = importSubViewLimits;

  // Distributor uses requestsRecv_.size() and requestsSend_.size()
  // as the number of outstanding nonblocking message requests, so
  // we resize to zero to maintain this invariant.
  //
  // getNumReceives() does _not_ include the self message, if there is
  // one. Here, we do actually send a message to ourselves, so we
  // include any self message in the "actual" number of receives to
  // post.
  //
  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
  // doesn't (re)allocate its array of requests. That happens in
  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
  // demand), or Resize_().
  const size_type actualNumReceives = as<size_type> (plan.getNumReceives()) +
    as<size_type> (plan.hasSelfMessage() ? 1 : 0);

#ifdef HAVE_TPETRA_DEBUG
  size_t totalNumImportPackets = 0;
  for (size_t i = 0; i < Teuchos::as<size_t>(actualNumReceives); ++i) {
    totalNumImportPackets += importLengths[i];
  }
  TEUCHOS_TEST_FOR_EXCEPTION(
    imports.extent (0) < totalNumImportPackets, std::runtime_error,
    "Tpetra::Distributor::doPostRecvs: The 'imports' array must have "
    "enough entries to hold the expected number of import packets. "
    "imports.extent(0) = " << imports.extent (0) << " < "
    "totalNumImportPackets = " << totalNumImportPackets << ".");
  TEUCHOS_TEST_FOR_EXCEPTION
    (!requestsRecv_.empty(), std::logic_error, "Tpetra::Distributor::"
     "doPostRecvs: Process " << myProcID << ": requestsRecv_.size () = "
     << requestsRecv_.size () << " != 0.");
#endif // HAVE_TPETRA_DEBUG

  requestsRecv_.resize (0);

  // Post the nonblocking receives. It's common MPI wisdom to post
  // receives before sends. In MPI terms, this means favoring
  // adding to the "posted queue" (of receive requests) over adding
  // to the "unexpected queue" (of arrived messages not yet matched
  // with a receive).
  {
    ProfilingRegion prr("Tpetra::Distributor: doPostRecvs recvs");

    for (size_type i = 0; i < actualNumReceives; ++i) {
      size_t totalPacketsFrom_i = importLengths[Teuchos::as<size_t>(i)];
      TEUCHOS_TEST_FOR_EXCEPTION(totalPacketsFrom_i > size_t(INT_MAX),
                                 std::logic_error, "Tpetra::Distributor::doPostRecvs: "
                                 "Recv count for receive " << i << " (" << totalPacketsFrom_i << ") is too large "
                                 "to be represented as int.");
      if (plan.getProcsFrom()[i] != myProcID && totalPacketsFrom_i) {
        // If my process is receiving these packet(s) from another
        // process (not a self-receive), and if there is at least
        // one packet to receive:
        //
        // 1. Set up the persisting view (recvBuf) into the imports
        //    array, given the offset and size (total number of
        //    packets from process getProcsFrom()[i]).
        // 2. Start the Irecv and save the resulting request.
        imports_view_type recvBuf =
          subview_offset (imports, importStarts[i], totalPacketsFrom_i);
        requestsRecv_.push_back (ireceive<int> (recvBuf, plan.getProcsFrom()[i],
                                                mpiTag_, *plan.getComm()));
      }
    }
  }
}

template <class ExpView, class ImpView>
void DistributorActor::doPostSends(const DistributorPlan& plan,
                                   const ExpView& exports,
                                   size_t numPackets,
                                   const ImpView& imports)
{
  auto exportSubViewLimits = plan.getExportViewLimits(numPackets);
  auto importSubViewLimits = plan.getImportViewLimits(numPackets);
  doPostSendsImpl(plan, exports, exportSubViewLimits, imports, importSubViewLimits);
}

template <class ExpView, class ImpView>
void DistributorActor::doPostSends(const DistributorPlan& plan,
                                   const ExpView& exports,
                                   const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                                   const ImpView& imports,
                                   const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
{
  auto exportSubViewLimits = plan.getExportViewLimits(numExportPacketsPerLID);
  auto importSubViewLimits = plan.getImportViewLimits(numImportPacketsPerLID);
  doPostSendsImpl(plan, exports, exportSubViewLimits, imports, importSubViewLimits);
}

template <class ExpView, class ImpView>
void DistributorActor::doPostSendsImpl(const DistributorPlan& plan,
                                       const ExpView& exports,
                                       const SubViewLimits& exportSubViewLimits,
                                       const ImpView& imports,
                                       const SubViewLimits& importSubViewLimits)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPostSends must be Kokkos::Views");
  using Teuchos::Array;
  using Teuchos::as;
  using Teuchos::isend;
  using Teuchos::send;
  using Kokkos::Compat::subview_offset;
  using Kokkos::Compat::deep_copy_offset;
  using size_type = Array<size_t>::size_type;
  using exports_view_type = ExpView;

#ifdef KOKKOS_ENABLE_CUDA
  static_assert
    (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
     ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
     "Please do not use Tpetra::Distributor with UVM allocations. "
     "See Trilinos GitHub issue #1088.");
#endif // KOKKOS_ENABLE_CUDA

#ifdef KOKKOS_ENABLE_SYCL
  static_assert
    (! std::is_same<typename ExpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value &&
     ! std::is_same<typename ImpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value,
     "Please do not use Tpetra::Distributor with SharedUSM allocations. "
     "See Trilinos GitHub issue #1088 (corresponding to CUDA).");
#endif // KOKKOS_ENABLE_SYCL

  ProfilingRegion ps("Tpetra::Distributor: doPostSends");

  const int myRank = plan.getComm()->getRank ();
  // Run-time configurable parameters that come from the input
  // ParameterList set by setParameterList().
  const Details::EDistributorSendType sendType = plan.getSendType();

  auto& [exportStarts, exportLengths] = exportSubViewLimits;
  auto& [importStarts, importLengths] = importSubViewLimits;

#if defined(HAVE_TPETRA_MPI)
  // All-to-all communication layout is quite different from
  // point-to-point, so we handle it separately.

  if (sendType == Details::DISTRIBUTOR_ALLTOALL) {
    doPostsAllToAllImpl(plan, exports, exportSubViewLimits, imports, importSubViewLimits);
    return;
  }
#ifdef HAVE_TPETRACORE_MPI_ADVANCE
  else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL) {
    doPostsAllToAllImpl(plan, exports, exportSubViewLimits, imports, importSubViewLimits);
    return;
  } else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) {
    doPostsNbrAllToAllVImpl(plan, exports, exportSubViewLimits, imports, importSubViewLimits);
    return;
  }
#endif // defined(HAVE_TPETRACORE_MPI_ADVANCE)

#else // HAVE_TPETRA_MPI
  if (plan.hasSelfMessage()) {
    // This is how we "send a message to ourself": we copy from
    // the export buffer to the import buffer. That saves
    // Teuchos::Comm implementations other than MpiComm (in
    // particular, SerialComm) the trouble of implementing self
    // messages correctly. (To do this right, SerialComm would
    // need internal buffer space for messages, keyed on the
    // message's tag.)
    size_t selfReceiveOffset = 0;
    deep_copy_offset(imports, exports, selfReceiveOffset,
                     exportStarts[0],
                     exportLengths[0]);
  }
  // should we just return here?
  // likely not as comm could be a serial comm
#endif // HAVE_TPETRA_MPI

  size_t selfReceiveOffset = 0;

#ifdef HAVE_TPETRA_DEBUG
  TEUCHOS_TEST_FOR_EXCEPTION
    (requestsSend_.size () != 0,
     std::logic_error,
     "Tpetra::Distributor::doPostSends: Process "
     << myRank << ": requestsSend_.size() = " << requestsSend_.size () << " != 0.");
#endif // HAVE_TPETRA_DEBUG

  // Distributor uses requestsRecv_.size() and requestsSend_.size()
  // as the number of outstanding nonblocking message requests, so
  // we resize to zero to maintain this invariant.
  //
  // getNumReceives() does _not_ include the self message, if there is
  // one. Here, we do actually send a message to ourselves, so we
  // include any self message in the "actual" number of receives to
  // post.
  //
  // NOTE (mfh 19 Mar 2012): Epetra_MpiDistributor::DoPosts()
  // doesn't (re)allocate its array of requests. That happens in
  // CreateFromSends(), ComputeRecvs_(), DoReversePosts() (on
  // demand), or Resize_().
  const size_type actualNumReceives = as<size_type> (plan.getNumReceives()) +
    as<size_type> (plan.hasSelfMessage() ? 1 : 0);
  requestsSend_.resize (0);

  {
    for (size_type i = 0; i < actualNumReceives; ++i) {
      if (plan.getProcsFrom()[i] == myRank) { // Receiving from myself
        selfReceiveOffset = importStarts[i]; // Remember the self-recv offset
      }
    }
  }

  ProfilingRegion pss("Tpetra::Distributor: doPostSends sends");

  // setup scan through getProcsTo() list starting with higher numbered procs
  // (should help balance message traffic)
  //
  // FIXME (mfh 20 Feb 2013) Why haven't we precomputed this?
  // It doesn't depend on the input at all.
  size_t numBlocks = plan.getNumSends() + plan.hasSelfMessage();
  size_t procIndex = 0;
  while ((procIndex < numBlocks) && (plan.getProcsTo()[procIndex] < myRank)) {
    ++procIndex;
  }
  if (procIndex == numBlocks) {
    procIndex = 0;
  }

  size_t selfNum = 0;
  size_t selfIndex = 0;

  if (plan.getIndicesTo().is_null()) {
    ProfilingRegion pssf("Tpetra::Distributor: doPostSends sends FAST");

    // Data are already blocked (laid out) by process, so we don't
    // need a separate send buffer (besides the exports array).
    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      if (plan.getProcsTo()[p] != myRank) {
        if (exportLengths[p] == 0) {
          // Do not attempt to send messages of length 0.
          continue;
        }

        exports_view_type tmpSend = subview_offset(exports, exportStarts[p], exportLengths[p]);

        if (sendType == Details::DISTRIBUTOR_ISEND) {
          // NOTE: This looks very similar to the tmpSend above, but removing
          // tmpSendBuf and using tmpSend instead leads to a performance hit on
          // Arm SerialNode builds.
          exports_view_type tmpSendBuf =
            subview_offset (exports, exportStarts[p], exportLengths[p]);
          requestsSend_.push_back (isend<int> (tmpSendBuf, plan.getProcsTo()[p],
                                               mpiTag_, *plan.getComm()));
        }
        else { // DISTRIBUTOR_SEND
          send<int> (tmpSend,
                     as<int> (tmpSend.size ()),
                     plan.getProcsTo()[p], mpiTag_, *plan.getComm());
        }
      }
      else { // "Sending" the message to myself
        selfNum = p;
      }
    }

    if (plan.hasSelfMessage()) {
      // This is how we "send a message to ourself": we copy from
      // the export buffer to the import buffer. That saves
      // Teuchos::Comm implementations other than MpiComm (in
      // particular, SerialComm) the trouble of implementing self
      // messages correctly. (To do this right, SerialComm would
      // need internal buffer space for messages, keyed on the
      // message's tag.)
      deep_copy_offset(imports, exports, selfReceiveOffset,
                       exportStarts[selfNum], exportLengths[selfNum]);
    }

  }
  else { // data are not blocked by proc, use send buffer
    ProfilingRegion psss("Tpetra::Distributor: doPostSends: sends SLOW");

    using Packet = typename ExpView::non_const_value_type;
    using Layout = typename ExpView::array_layout;
    using Device = typename ExpView::device_type;
    using Mem = typename ExpView::memory_traits;

    // This buffer is long enough for only one message at a time.
    // Thus, we use DISTRIBUTOR_SEND always in this case, regardless
    // of the sendType requested by the user.
    // This code path formerly errored out with the message:
    //     Tpetra::Distributor::doPosts(3 args):
    //     The "send buffer" code path
    //     doesn't currently work with nonblocking sends.
    // Now, we opt to just do the communication in a way that works.
#ifdef HAVE_TPETRA_DEBUG
    if (sendType != Details::DISTRIBUTOR_SEND) {
      if (plan.getComm()->getRank() == 0)
        std::cout << "The requested Tpetra send type "
                  << DistributorSendTypeEnumToString(sendType)
                  << " requires Distributor data to be ordered by"
                  << " the receiving processor rank. Since these"
                  << " data are not ordered, Tpetra will use Send"
                  << " instead." << std::endl;
    }
#endif

    size_t maxSendLength = 0;
    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      size_t sendArrayOffset = 0;
      size_t j = plan.getStartsTo()[p];
      for (size_t k = 0; k < plan.getLengthsTo()[p]; ++k, ++j) {
        sendArrayOffset += exportLengths[j];
      }
      maxSendLength = std::max(maxSendLength, sendArrayOffset);
    }
    Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray", maxSendLength);

    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      if (plan.getProcsTo()[p] != myRank) {
        size_t sendArrayOffset = 0;
        size_t j = plan.getStartsTo()[p];
        for (size_t k = 0; k < plan.getLengthsTo()[p]; ++k, ++j) {
          packOffset(sendArray, exports, sendArrayOffset, exportStarts[j], exportLengths[j]);
          sendArrayOffset += exportLengths[j];
        }
        typename ExpView::execution_space().fence();

        ImpView tmpSend =
          subview_offset(sendArray, size_t(0), sendArrayOffset);

        send<int> (tmpSend,
                   as<int> (tmpSend.size ()),
                   plan.getProcsTo()[p], mpiTag_, *plan.getComm());
      }
      else { // "Sending" the message to myself
        selfNum = p;
        selfIndex = plan.getStartsTo()[p];
      }
    }

    if (plan.hasSelfMessage()) {
      for (size_t k = 0; k < plan.getLengthsTo()[selfNum]; ++k) {
        packOffset(imports, exports, selfReceiveOffset, exportStarts[selfIndex], exportLengths[selfIndex]);
        selfReceiveOffset += exportLengths[selfIndex];
        ++selfIndex;
      }
    }
  }
}
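
// Summary note: the branch on plan.getIndicesTo().is_null() above separates the
// "fast" path, where export data are already laid out contiguously per
// destination process and can be sent directly from the exports view, from the
// "slow" path, which first packs each destination's data into sendArray and
// always uses blocking sends regardless of the requested send type.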

} // namespace Tpetra::Details

#endif // TPETRA_DETAILS_DISTRIBUTOR_ACTOR_HPP