#ifndef TPETRA_DETAILS_DISTRIBUTOR_ACTOR_HPP
#define TPETRA_DETAILS_DISTRIBUTOR_ACTOR_HPP

#include "Tpetra_Details_DistributorPlan.hpp"

#include "Teuchos_Array.hpp"
#include "Teuchos_Comm.hpp"
#include "Teuchos_RCP.hpp"
#include "Teuchos_Time.hpp"

#include "Kokkos_TeuchosCommAdapters.hpp"
#include "Kokkos_StdAlgorithms.hpp"

#ifdef HAVE_TPETRA_MPI
#include "mpi.h"
#endif

namespace Tpetra {
namespace Details {
template <class View1, class View2>
constexpr bool areKokkosViews = Kokkos::is_view<View1>::value && Kokkos::is_view<View2>::value;
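// DistributorActor executes the communication pattern described by a
// DistributorPlan: it posts the sends and receives, tracks the outstanding
// MPI requests, and waits on them.  It is an implementation detail of
// Tpetra::Distributor, not part of the public Tpetra interface.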
class DistributorActor {
  static constexpr int DEFAULT_MPI_TAG = 1;

public:
  DistributorActor();
  DistributorActor(const DistributorActor& otherActor);

  template <class ExpView, class ImpView>
  void doPostsAndWaits(const DistributorPlan& plan,
                       const ExpView& exports,
                       size_t numPackets,
                       const ImpView& imports);

  template <class ExpView, class ImpView>
  void doPostsAndWaits(const DistributorPlan& plan,
                       const ExpView& exports,
                       const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                       const ImpView& imports,
                       const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);

  template <class ExpView, class ExpPacketsView, class ImpView, class ImpPacketsView>
  void doPostsAndWaitsKokkos(const DistributorPlan& plan,
                             const ExpView& exports,
                             const ExpPacketsView& numExportPacketsPerLID,
                             const ImpView& imports,
                             const ImpPacketsView& numImportPacketsPerLID);

  template <class ExpView, class ImpView>
  void doPosts(const DistributorPlan& plan,
               const ExpView& exports,
               size_t numPackets,
               const ImpView& imports);

  template <class ExpView, class ImpView>
  void doPosts(const DistributorPlan& plan,
               const ExpView& exports,
               const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
               const ImpView& imports,
               const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);

  template <class ExpView, class ExpPacketsView, class ImpView, class ImpPacketsView>
  void doPostsKokkos(const DistributorPlan& plan,
                     const ExpView& exports,
                     const ExpPacketsView& numExportPacketsPerLID,
                     const ImpView& imports,
                     const ImpPacketsView& numImportPacketsPerLID);

  template <class ExpView, class ExpPacketsView, class ImpView, class ImpPacketsView>
  void doPostsAllToAllKokkos(const DistributorPlan& plan,
                             const ExpView& exports,
                             const ExpPacketsView& numExportPacketsPerLID,
                             const ImpView& imports,
                             const ImpPacketsView& numImportPacketsPerLID);

  template <class ExpView, class ExpPacketsView, class ImpView, class ImpPacketsView>
  void doPostsNbrAllToAllVKokkos(const DistributorPlan& plan,
                                 const ExpView& exports,
                                 const ExpPacketsView& numExportPacketsPerLID,
                                 const ImpView& imports,
                                 const ImpPacketsView& numImportPacketsPerLID);

  void doWaits(const DistributorPlan& plan);

  bool isReady() const;

private:
#ifdef HAVE_TPETRA_MPI
  template <class ExpView, class ImpView>
  void doPostsAllToAll(const DistributorPlan& plan,
                       const ExpView& exports,
                       size_t numPackets,
                       const ImpView& imports);

  template <class ExpView, class ImpView>
  void doPostsAllToAll(const DistributorPlan& plan,
                       const ExpView& exports,
                       const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                       const ImpView& imports,
                       const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);

#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
  template <class ExpView, class ImpView>
  void doPostsNbrAllToAllV(const DistributorPlan& plan,
                           const ExpView& exports,
                           size_t numPackets,
                           const ImpView& imports);

  template <class ExpView, class ImpView>
  void doPostsNbrAllToAllV(const DistributorPlan& plan,
                           const ExpView& exports,
                           const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                           const ImpView& imports,
                           const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID);
#endif // HAVE_TPETRACORE_MPI_ADVANCE
#endif // HAVE_TPETRA_MPI

  // MPI tag used for the point-to-point messages posted by this actor.
  int mpiTag_ = DEFAULT_MPI_TAG;

  Teuchos::Array<Teuchos::RCP<Teuchos::CommRequest<int>>> requests_;

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_;
  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_;
  Teuchos::RCP<Teuchos::Time> timer_doWaits_;
  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_recvs_;
  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_recvs_;
  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_barrier_;
  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_barrier_;
  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_sends_;
  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_sends_;
  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_sends_slow_;
  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_sends_slow_;
  Teuchos::RCP<Teuchos::Time> timer_doPosts3KV_sends_fast_;
  Teuchos::RCP<Teuchos::Time> timer_doPosts4KV_sends_fast_;
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS
};
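// Illustrative call pattern (a sketch only; in Tpetra the Distributor class
// owns both the plan and the actor and drives these calls):
//
//   DistributorPlan plan = ...;   // describes who sends how many packets to whom
//   DistributorActor actor;
//   actor.doPostsAndWaits(plan, exports, numPackets, imports);
//   // ...or, equivalently, split into the two phases:
//   actor.doPosts(plan, exports, numPackets, imports);
//   actor.doWaits(plan);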
template <class ExpView, class ImpView>
void DistributorActor::doPostsAndWaits(const DistributorPlan& plan,
                                       const ExpView& exports,
                                       size_t numPackets,
                                       const ImpView& imports)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPostsAndWaits must be Kokkos::Views");
  doPosts(plan, exports, numPackets, imports);
  doWaits(plan);
}
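// Variant of doPostsAndWaits for the case in which each LID may carry a
// different number of packets, given by the per-LID count arrays.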
template <class ExpView, class ImpView>
void DistributorActor::doPostsAndWaits(const DistributorPlan& plan,
                                       const ExpView& exports,
                                       const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                                       const ImpView& imports,
                                       const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPostsAndWaits must be Kokkos::Views");
  doPosts(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
  doWaits(plan);
}
template <class ExpView, class ExpPacketsView, class ImpView, class ImpPacketsView>
void DistributorActor::doPostsAndWaitsKokkos(const DistributorPlan& plan,
                                             const ExpView& exports,
                                             const ExpPacketsView& numExportPacketsPerLID,
                                             const ImpView& imports,
                                             const ImpPacketsView& numImportPacketsPerLID)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPostsAndWaitsKokkos must be Kokkos::Views");
  static_assert(areKokkosViews<ExpPacketsView, ImpPacketsView>,
                "Num packets arrays for DistributorActor::doPostsAndWaitsKokkos must be Kokkos::Views");
  doPostsKokkos(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
  doWaits(plan);
}
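// packOffset copies 'size' contiguous entries from src (starting at
// src_offset) into dst (starting at dst_offset).  When both Views are
// host-accessible a plain memcpy suffices; otherwise the copy goes through
// Kokkos::Compat::deep_copy_offset.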
template <typename ViewType>
using HostAccessibility = Kokkos::SpaceAccessibility<Kokkos::DefaultHostExecutionSpace, typename ViewType::memory_space>;

template <typename DstViewType, typename SrcViewType>
using enableIfHostAccessible = std::enable_if_t<HostAccessibility<DstViewType>::accessible &&
                                                HostAccessibility<SrcViewType>::accessible>;

template <typename DstViewType, typename SrcViewType>
using enableIfNotHostAccessible = std::enable_if_t<!HostAccessibility<DstViewType>::accessible ||
                                                   !HostAccessibility<SrcViewType>::accessible>;

template <typename DstViewType, typename SrcViewType>
enableIfHostAccessible<DstViewType, SrcViewType>
packOffset(const DstViewType& dst,
           const SrcViewType& src,
           const size_t dst_offset,
           const size_t src_offset,
           const size_t size)
{
  memcpy((void*) (dst.data()+dst_offset), src.data()+src_offset, size*sizeof(typename DstViewType::value_type));
}

template <typename DstViewType, typename SrcViewType>
enableIfNotHostAccessible<DstViewType, SrcViewType>
packOffset(const DstViewType& dst,
           const SrcViewType& src,
           const size_t dst_offset,
           const size_t src_offset,
           const size_t size)
{
  Kokkos::Compat::deep_copy_offset(dst, src, dst_offset, src_offset, size);
}
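// MPI-only helpers.  Instead of posting individual point-to-point messages,
// these implementations express the whole exchange as a single collective
// (MPI_Alltoallv, or its MPI Advance / neighborhood counterparts).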
#ifdef HAVE_TPETRA_MPI
template <class ExpView, class ImpView>
void DistributorActor::doPostsAllToAll(const DistributorPlan &plan,
                                       const ExpView &exports,
                                       size_t numPackets,
                                       const ImpView &imports) {
  using size_type = Teuchos::Array<size_t>::size_type;

  TEUCHOS_TEST_FOR_EXCEPTION(
      !plan.getIndicesTo().is_null(), std::runtime_error,
      "Send Type=\"Alltoall\" only works for fast-path communication.");

  auto comm = plan.getComm();
  const int myRank = comm->getRank();
  std::vector<int> sendcounts(comm->getSize(), 0);
  std::vector<int> sdispls(comm->getSize(), 0);
  std::vector<int> recvcounts(comm->getSize(), 0);
  std::vector<int> rdispls(comm->getSize(), 0);

  size_t numBlocks = plan.getNumSends() + plan.hasSelfMessage();
  for (size_t p = 0; p < numBlocks; ++p) {
    sdispls[plan.getProcsTo()[p]] = plan.getStartsTo()[p] * numPackets;
    size_t sendcount = plan.getLengthsTo()[p] * numPackets;
    TEUCHOS_TEST_FOR_EXCEPTION(sendcount > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPosts(3 args, Kokkos): "
                               "Send count for block " << p << " (" << sendcount
                               << ") is too large "
                               "to be represented as int.");
    sendcounts[plan.getProcsTo()[p]] = static_cast<int>(sendcount);
  }

  const size_type actualNumReceives =
      Teuchos::as<size_type>(plan.getNumReceives()) +
      Teuchos::as<size_type>(plan.hasSelfMessage() ? 1 : 0);
  size_t curBufferOffset = 0;
  for (size_type i = 0; i < actualNumReceives; ++i) {
    const size_t curBufLen = plan.getLengthsFrom()[i] * numPackets;
    TEUCHOS_TEST_FOR_EXCEPTION(
        curBufferOffset + curBufLen > static_cast<size_t>(imports.size()),
        std::logic_error,
        "Tpetra::Distributor::doPosts(3 args, Kokkos): "
        "Exceeded size of 'imports' array in packing loop on Process "
        << myRank << ". imports.size() = " << imports.size() << " < "
        "curBufferOffset(" << curBufferOffset << ") + curBufLen(" << curBufLen << ").");
    rdispls[plan.getProcsFrom()[i]] = curBufferOffset;
    TEUCHOS_TEST_FOR_EXCEPTION(curBufLen > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPosts(3 args, Kokkos): "
                               "Recv count for receive " << i << " (" << curBufLen
                               << ") is too large "
                               "to be represented as int.");
    recvcounts[plan.getProcsFrom()[i]] = static_cast<int>(curBufLen);
    curBufferOffset += curBufLen;
  }

  using T = typename ExpView::non_const_value_type;
  MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits<T>::getType(T());

#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
  if (Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL == plan.getSendType()) {
    MPIX_Comm *mpixComm = *plan.getMPIXComm();
    TEUCHOS_TEST_FOR_EXCEPTION(
        !mpixComm, std::runtime_error,
        "plan's MPIX_Comm null in doPostsAllToAll, but "
        "DISTRIBUTOR_MPIADVANCE_ALLTOALL set: plan.howInitialized()="
        << DistributorHowInitializedEnumToString(plan.howInitialized()));

    const int err = MPIX_Alltoallv(
        exports.data(), sendcounts.data(), sdispls.data(), rawType,
        imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm);

    TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                               "MPIX_Alltoallv failed with error \""
                               << Teuchos::mpiErrorCodeToString(err)
                               << "\"");
    return;
  }
#endif // HAVE_TPETRACORE_MPI_ADVANCE

  Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm =
      Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm);
  Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm =
      mpiComm->getRawMpiComm();

  const int err = MPI_Alltoallv(
      exports.data(), sendcounts.data(), sdispls.data(), rawType,
      imports.data(), recvcounts.data(), rdispls.data(), rawType, (*rawComm)());

  TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                             "MPI_Alltoallv failed with error \""
                             << Teuchos::mpiErrorCodeToString(err)
                             << "\"");
}
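// Neighborhood variant: when MPI Advance is available, the exchange can be
// posted as an MPIX_Neighbor_alltoallv over the plan's MPIX communicator
// instead of a dense MPI_Alltoallv over the whole communicator.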
#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
template <class ExpView, class ImpView>
void DistributorActor::doPostsNbrAllToAllV(const DistributorPlan &plan,
                                           const ExpView &exports,
                                           size_t numPackets,
                                           const ImpView &imports) {
  TEUCHOS_TEST_FOR_EXCEPTION(
      !plan.getIndicesTo().is_null(), std::runtime_error,
      "Send Type=\"Alltoall\" only works for fast-path communication.");

  const int myRank = plan.getComm()->getRank();
  MPIX_Comm *mpixComm = *plan.getMPIXComm();

  const size_t numSends = plan.getNumSends() + plan.hasSelfMessage();
  const size_t numRecvs = plan.getNumReceives() + plan.hasSelfMessage();
  std::vector<int> sendcounts(numSends, 0);
  std::vector<int> sdispls(numSends, 0);
  std::vector<int> recvcounts(numRecvs, 0);
  std::vector<int> rdispls(numRecvs, 0);

  for (size_t p = 0; p < numSends; ++p) {
    sdispls[p] = plan.getStartsTo()[p] * numPackets;
    const size_t sendcount = plan.getLengthsTo()[p] * numPackets;
    TEUCHOS_TEST_FOR_EXCEPTION(sendcount > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPosts(3 args, Kokkos): "
                               "Send count for block " << p << " (" << sendcount
                               << ") is too large "
                               "to be represented as int.");
    sendcounts[p] = static_cast<int>(sendcount);
  }

  size_t curBufferOffset = 0;
  for (size_t i = 0; i < numRecvs; ++i) {
    const size_t curBufLen = plan.getLengthsFrom()[i] * numPackets;
    TEUCHOS_TEST_FOR_EXCEPTION(
        curBufferOffset + curBufLen > static_cast<size_t>(imports.size()),
        std::logic_error,
        "Tpetra::Distributor::doPosts(3 args, Kokkos): "
        "Exceeded size of 'imports' array in packing loop on Process "
        << myRank << ". imports.size() = " << imports.size() << " < "
        "curBufferOffset(" << curBufferOffset << ") + curBufLen(" << curBufLen << ").");
    rdispls[i] = curBufferOffset;
    TEUCHOS_TEST_FOR_EXCEPTION(curBufLen > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPosts(3 args, Kokkos): "
                               "Recv count for receive " << i << " (" << curBufLen
                               << ") is too large "
                               "to be represented as int.");
    recvcounts[i] = static_cast<int>(curBufLen);
    curBufferOffset += curBufLen;
  }

  using T = typename ExpView::non_const_value_type;
  MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits<T>::getType(T());

  const int err = MPIX_Neighbor_alltoallv(
      exports.data(), sendcounts.data(), sdispls.data(), rawType,
      imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm);

  TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                             "MPIX_Neighbor_alltoallv failed with error \""
                             << Teuchos::mpiErrorCodeToString(err)
                             << "\"");
}
#endif // HAVE_TPETRACORE_MPI_ADVANCE
#endif // HAVE_TPETRA_MPI
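// Point-to-point implementation of the posts for the constant-packet-count
// case.  Receives are posted first as nonblocking ireceives; sends then go
// out either directly from the exports View (the "fast" path, when data are
// already grouped by destination process) or through a scratch send buffer
// (the "slow" path, when getIndicesTo() is non-null).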
template <class ExpView, class ImpView>
void DistributorActor::doPosts(const DistributorPlan& plan,
                               const ExpView& exports,
                               size_t numPackets,
                               const ImpView& imports)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPosts must be Kokkos::Views");
  using Teuchos::Array;
  using Teuchos::as;
  using Teuchos::FancyOStream;
  using Teuchos::includesVerbLevel;
  using Teuchos::ireceive;
  using Teuchos::isend;
  using Teuchos::send;
  using Teuchos::TypeNameTraits;
  using Teuchos::typeName;
  using Kokkos::Compat::create_const_view;
  using Kokkos::Compat::create_view;
  using Kokkos::Compat::subview_offset;
  using Kokkos::Compat::deep_copy_offset;
  typedef Array<size_t>::size_type size_type;
  typedef ExpView exports_view_type;
  typedef ImpView imports_view_type;

#ifdef KOKKOS_ENABLE_CUDA
  static_assert
    (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
     ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
     "Please do not use Tpetra::Distributor with UVM allocations. "
     "See Trilinos GitHub issue #1088.");
#endif // KOKKOS_ENABLE_CUDA

#ifdef KOKKOS_ENABLE_SYCL
  static_assert
    (! std::is_same<typename ExpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value &&
     ! std::is_same<typename ImpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value,
     "Please do not use Tpetra::Distributor with SharedUSM allocations. "
     "See Trilinos GitHub issue #1088 (corresponding to CUDA).");
#endif // KOKKOS_ENABLE_SYCL

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
  Teuchos::TimeMonitor timeMon (*timer_doPosts3KV_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

  const int myRank = plan.getComm()->getRank ();
  const Details::EDistributorSendType sendType = plan.getSendType ();

#if defined(HAVE_TPETRA_MPI)
  // All-to-all communication layout is quite different from
  // point-to-point, so we handle it separately.
  if (sendType == Details::DISTRIBUTOR_ALLTOALL) {
    doPostsAllToAll(plan, exports, numPackets, imports);
    return;
  }
#ifdef HAVE_TPETRACORE_MPI_ADVANCE
  else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL) {
    doPostsAllToAll(plan, exports, numPackets, imports);
    return;
  } else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) {
    doPostsNbrAllToAllV(plan, exports, numPackets, imports);
    return;
  }
#endif // defined(HAVE_TPETRACORE_MPI_ADVANCE)

#else // HAVE_TPETRA_MPI
  if (plan.hasSelfMessage()) {
    // "Send a message to ourself" by copying directly from the
    // export buffer to the import buffer.
    size_t selfReceiveOffset = 0;
    deep_copy_offset(imports, exports, selfReceiveOffset,
                     plan.getStartsTo()[0]*numPackets,
                     plan.getLengthsTo()[0]*numPackets);
  }
#endif // HAVE_TPETRA_MPI

  size_t selfReceiveOffset = 0;

#ifdef HAVE_TPETRA_DEBUG
  TEUCHOS_TEST_FOR_EXCEPTION
    (requests_.size () != 0,
     std::logic_error,
     "Tpetra::Distributor::doPosts(3 args, Kokkos): Process "
     << myRank << ": requests_.size() = " << requests_.size () << " != 0.");
#endif // HAVE_TPETRA_DEBUG

  const size_type actualNumReceives = as<size_type> (plan.getNumReceives()) +
    as<size_type> (plan.hasSelfMessage() ? 1 : 0);
  requests_.resize (0);

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts3KV_recvs_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

  // Post the nonblocking receives.
  size_t curBufferOffset = 0;
  for (size_type i = 0; i < actualNumReceives; ++i) {
    const size_t curBufLen = plan.getLengthsFrom()[i] * numPackets;
    if (plan.getProcsFrom()[i] != myRank) {
      TEUCHOS_TEST_FOR_EXCEPTION(
        curBufferOffset + curBufLen > static_cast<size_t> (imports.size ()),
        std::logic_error, "Tpetra::Distributor::doPosts(3 args, Kokkos): "
        "Exceeded size of 'imports' array in packing loop on Process " <<
        myRank << ". imports.size() = " << imports.size () << " < "
        "curBufferOffset(" << curBufferOffset << ") + curBufLen(" << curBufLen << ").");
      imports_view_type recvBuf =
        subview_offset (imports, curBufferOffset, curBufLen);
      requests_.push_back (ireceive<int> (recvBuf, plan.getProcsFrom()[i],
                                          mpiTag_, *plan.getComm()));
    }
    else { // receiving this message from myself
      selfReceiveOffset = curBufferOffset;
    }
    curBufferOffset += curBufLen;
  }

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
  Teuchos::TimeMonitor timeMonSends (*timer_doPosts3KV_sends_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

  // Set up the sends, starting just past my own rank so that messages
  // tend to be spread over different destinations.
  size_t numBlocks = plan.getNumSends() + plan.hasSelfMessage();
  size_t procIndex = 0;
  while ((procIndex < numBlocks) && (plan.getProcsTo()[procIndex] < myRank)) {
    ++procIndex;
  }
  if (procIndex == numBlocks) {
    procIndex = 0;
  }

  size_t selfNum = 0;
  size_t selfIndex = 0;

  if (plan.getIndicesTo().is_null()) {

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
    Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts3KV_sends_fast_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

    // Data are already laid out contiguously by destination process,
    // so we can send directly out of the exports View ("fast" path).
    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      if (plan.getProcsTo()[p] != myRank) {
        exports_view_type tmpSend = subview_offset(
          exports, plan.getStartsTo()[p]*numPackets, plan.getLengthsTo()[p]*numPackets);

        if (sendType == Details::DISTRIBUTOR_ISEND) {
          exports_view_type tmpSendBuf =
            subview_offset (exports, plan.getStartsTo()[p] * numPackets,
                            plan.getLengthsTo()[p] * numPackets);
          requests_.push_back (isend<int> (tmpSendBuf, plan.getProcsTo()[p],
                                           mpiTag_, *plan.getComm()));
        }
        else { // DISTRIBUTOR_SEND
          send<int> (tmpSend,
                     as<int> (tmpSend.size ()),
                     plan.getProcsTo()[p], mpiTag_, *plan.getComm());
        }
      }
      else { // "sending" to myself; just remember which block it is
        selfNum = p;
      }
    }

    if (plan.hasSelfMessage()) {
      // Copy the self-message directly from exports to imports.
      deep_copy_offset(imports, exports, selfReceiveOffset,
                       plan.getStartsTo()[selfNum]*numPackets,
                       plan.getLengthsTo()[selfNum]*numPackets);
    }
  }
  else { // data are not blocked by process; use a scratch send buffer

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
    Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts3KV_sends_slow_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

    typedef typename ExpView::non_const_value_type Packet;
    typedef typename ExpView::array_layout Layout;
    typedef typename ExpView::device_type Device;
    typedef typename ExpView::memory_traits Mem;

#ifdef HAVE_TPETRA_DEBUG
    if (sendType != Details::DISTRIBUTOR_SEND) {
      if (plan.getComm()->getRank() == 0)
        std::cout << "The requested Tpetra send type "
                  << DistributorSendTypeEnumToString(sendType)
                  << " requires Distributor data to be ordered by"
                  << " the receiving processor rank. Since these"
                  << " data are not ordered, Tpetra will use Send"
                  << " instead." << std::endl;
    }
#endif // HAVE_TPETRA_DEBUG

    Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray",
                                                       plan.getMaxSendLength() * numPackets);

    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      if (plan.getProcsTo()[p] != myRank) {
        size_t sendArrayOffset = 0;
        size_t j = plan.getStartsTo()[p];
        // Pack this destination's data into the contiguous send buffer.
        for (size_t k = 0; k < plan.getLengthsTo()[p]; ++k, ++j) {
          packOffset(sendArray, exports, sendArrayOffset, plan.getIndicesTo()[j]*numPackets, numPackets);
          sendArrayOffset += numPackets;
        }
        typename ExpView::execution_space().fence();

        auto tmpSend =
          subview_offset(sendArray, size_t(0), plan.getLengthsTo()[p]*numPackets);

        send<int> (tmpSend,
                   as<int> (tmpSend.size ()),
                   plan.getProcsTo()[p], mpiTag_, *plan.getComm());
      }
      else { // "sending" to myself
        selfNum = p;
        selfIndex = plan.getStartsTo()[p];
      }
    }

    if (plan.hasSelfMessage()) {
      for (size_t k = 0; k < plan.getLengthsTo()[selfNum]; ++k) {
        packOffset(imports, exports, selfReceiveOffset, plan.getIndicesTo()[selfIndex]*numPackets, numPackets);
        ++selfIndex;
        selfReceiveOffset += numPackets;
      }
    }
  }
}
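// Collective (MPI_Alltoallv) implementation of the posts for the
// variable-packets-per-LID case: per-process counts and displacements are
// accumulated on the host from numExportPacketsPerLID / numImportPacketsPerLID.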
#ifdef HAVE_TPETRA_MPI
template <class ExpView, class ImpView>
void DistributorActor::doPostsAllToAll(
    const DistributorPlan &plan,
    const ExpView &exports,
    const Teuchos::ArrayView<const size_t> &numExportPacketsPerLID,
    const ImpView &imports,
    const Teuchos::ArrayView<const size_t> &numImportPacketsPerLID) {
  TEUCHOS_TEST_FOR_EXCEPTION(
      !plan.getIndicesTo().is_null(), std::runtime_error,
      "Send Type=\"Alltoall\" only works for fast-path communication.");

  using size_type = Teuchos::Array<size_t>::size_type;

  auto comm = plan.getComm();
  std::vector<int> sendcounts(comm->getSize(), 0);
  std::vector<int> sdispls(comm->getSize(), 0);
  std::vector<int> recvcounts(comm->getSize(), 0);
  std::vector<int> rdispls(comm->getSize(), 0);

  size_t curPKToffset = 0;
  for (size_t pp = 0; pp < plan.getNumSends(); ++pp) {
    sdispls[plan.getProcsTo()[pp]] = curPKToffset;
    size_t numPackets = 0;
    for (size_t j = plan.getStartsTo()[pp];
         j < plan.getStartsTo()[pp] + plan.getLengthsTo()[pp]; ++j) {
      numPackets += numExportPacketsPerLID[j];
    }
    TEUCHOS_TEST_FOR_EXCEPTION(numPackets > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPosts(4 args, Kokkos): "
                               "Send count for send " << pp << " (" << numPackets
                               << ") is too large "
                               "to be represented as int.");
    sendcounts[plan.getProcsTo()[pp]] = static_cast<int>(numPackets);
    curPKToffset += numPackets;
  }

  const size_type actualNumReceives =
      Teuchos::as<size_type>(plan.getNumReceives()) +
      Teuchos::as<size_type>(plan.hasSelfMessage() ? 1 : 0);

  size_t curBufferOffset = 0;
  size_t curLIDoffset = 0;
  for (size_type i = 0; i < actualNumReceives; ++i) {
    size_t totalPacketsFrom_i = 0;
    for (size_t j = 0; j < plan.getLengthsFrom()[i]; ++j) {
      totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset + j];
    }
    curLIDoffset += plan.getLengthsFrom()[i];

    rdispls[plan.getProcsFrom()[i]] = curBufferOffset;
    TEUCHOS_TEST_FOR_EXCEPTION(totalPacketsFrom_i > size_t(INT_MAX),
                               std::logic_error,
                               "Tpetra::Distributor::doPosts(4 args, Kokkos): "
                               "Recv count for receive " << i << " (" << totalPacketsFrom_i
                               << ") is too large "
                               "to be represented as int.");
    recvcounts[plan.getProcsFrom()[i]] = static_cast<int>(totalPacketsFrom_i);
    curBufferOffset += totalPacketsFrom_i;
  }

  Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm =
      Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm);
  Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm =
      mpiComm->getRawMpiComm();
  using T = typename ExpView::non_const_value_type;
  MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits<T>::getType(T());

#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
  if (Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL == plan.getSendType()) {
    MPIX_Comm *mpixComm = *plan.getMPIXComm();
    TEUCHOS_TEST_FOR_EXCEPTION(!mpixComm, std::runtime_error,
                               "MPIX_Comm is null in doPostsAllToAll \""
                               << __FILE__ << ":" << __LINE__ << "\"");

    const int err = MPIX_Alltoallv(
        exports.data(), sendcounts.data(), sdispls.data(), rawType,
        imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm);

    TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                               "MPIX_Alltoallv failed with error \""
                               << Teuchos::mpiErrorCodeToString(err)
                               << "\"");
    return;
  }
#endif // HAVE_TPETRACORE_MPI_ADVANCE

  const int err = MPI_Alltoallv(
      exports.data(), sendcounts.data(), sdispls.data(), rawType,
      imports.data(), recvcounts.data(), rdispls.data(), rawType, (*rawComm)());

  TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                             "MPI_Alltoallv failed with error \""
                             << Teuchos::mpiErrorCodeToString(err)
                             << "\"");
}
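// Kokkos variant of the all-to-all path: counts and displacements are
// computed on device with Kokkos::parallel_scan / parallel_reduce from the
// per-LID packet-count Views, then handed to MPI(-X)_Alltoallv.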
template <class ExpView, class ExpPacketsView, class ImpView, class ImpPacketsView>
void DistributorActor::doPostsAllToAllKokkos(
    const DistributorPlan &plan,
    const ExpView &exports,
    const ExpPacketsView &numExportPacketsPerLID,
    const ImpView &imports,
    const ImpPacketsView &numImportPacketsPerLID) {
  TEUCHOS_TEST_FOR_EXCEPTION(
      !plan.getIndicesTo().is_null(), std::runtime_error,
      "Send Type=\"Alltoall\" only works for fast-path communication.");

  using size_type = Teuchos::Array<size_t>::size_type;
  using ExpExecSpace = typename ExpPacketsView::execution_space;
  using ImpExecSpace = typename ImpPacketsView::execution_space;

  auto comm = plan.getComm();
  Kokkos::View<int*, Kokkos::DefaultHostExecutionSpace> sendcounts("sendcounts", comm->getSize());
  Kokkos::View<int*, Kokkos::DefaultHostExecutionSpace> sdispls("sdispls", comm->getSize());
  Kokkos::View<int*, Kokkos::DefaultHostExecutionSpace> recvcounts("recvcounts", comm->getSize());
  Kokkos::View<int*, Kokkos::DefaultHostExecutionSpace> rdispls("rdispls", comm->getSize());

  auto sendcounts_d = Kokkos::create_mirror_view(ExpExecSpace(), sendcounts);
  auto sdispls_d = Kokkos::create_mirror_view(ExpExecSpace(), sdispls);
  auto recvcounts_d = Kokkos::create_mirror_view(ImpExecSpace(), recvcounts);
  auto rdispls_d = Kokkos::create_mirror_view(ImpExecSpace(), rdispls);

  auto getStartsTo = Kokkos::Compat::getKokkosViewDeepCopy<ExpExecSpace>(plan.getStartsTo());
  auto getLengthsTo = Kokkos::Compat::getKokkosViewDeepCopy<ExpExecSpace>(plan.getLengthsTo());
  auto getProcsTo = Kokkos::Compat::getKokkosViewDeepCopy<ExpExecSpace>(plan.getProcsTo());

  // Compute send displacements and counts on device.
  size_t curPKToffset = 0;
  Kokkos::parallel_scan(Kokkos::RangePolicy<ExpExecSpace>(0, plan.getNumSends()),
      KOKKOS_LAMBDA(const size_t pp, size_t& offset, bool is_final) {
    sdispls_d(getProcsTo(pp)) = offset;
    size_t numPackets = 0;
    for (size_t j = getStartsTo(pp); j < getStartsTo(pp) + getLengthsTo(pp); ++j) {
      numPackets += numExportPacketsPerLID(j);
    }
    sendcounts_d(getProcsTo(pp)) = static_cast<int>(numPackets);
    offset += numPackets;
  }, curPKToffset);

  // A send count that overflowed int shows up as a negative value.
  int overflow = 0;
  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExpExecSpace>(0, plan.getNumSends()),
      KOKKOS_LAMBDA(const size_t pp, int& index) {
    if (sendcounts_d(getProcsTo(pp)) < 0) {
      index = pp + 1;
    }
  }, overflow);

  TEUCHOS_TEST_FOR_EXCEPTION(overflow, std::logic_error,
                             "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): "
                             "Send count for send " << overflow-1 << " is too large "
                             "to be represented as int.");

  const size_type actualNumReceives =
      Teuchos::as<size_type>(plan.getNumReceives()) +
      Teuchos::as<size_type>(plan.hasSelfMessage() ? 1 : 0);

  auto getLengthsFrom = Kokkos::Compat::getKokkosViewDeepCopy<ImpExecSpace>(plan.getLengthsFrom());
  auto getProcsFrom = Kokkos::Compat::getKokkosViewDeepCopy<ImpExecSpace>(plan.getProcsFrom());

  // Prefix-sum the receive lengths to get each receive's first LID offset.
  Kokkos::View<size_t*, ImpExecSpace> curLIDoffset("curLIDoffset", actualNumReceives);
  Kokkos::parallel_scan(Kokkos::RangePolicy<ImpExecSpace>(0, actualNumReceives),
      KOKKOS_LAMBDA(const size_type i, size_t& offset, bool is_final) {
    if(is_final) curLIDoffset(i) = offset;
    offset += getLengthsFrom(i);
  });

  // Compute receive displacements and counts on device.
  Kokkos::parallel_scan(Kokkos::RangePolicy<ImpExecSpace>(0, actualNumReceives),
      KOKKOS_LAMBDA(const size_type i, size_t& curBufferOffset, bool is_final) {
    size_t totalPacketsFrom_i = 0;
    for(size_t j = 0; j < getLengthsFrom(i); j++) {
      totalPacketsFrom_i += numImportPacketsPerLID(curLIDoffset(i) + j);
    }
    if(is_final) rdispls_d(getProcsFrom(i)) = curBufferOffset;
    if(is_final) recvcounts_d(getProcsFrom(i)) = static_cast<int>(totalPacketsFrom_i);
    curBufferOffset += totalPacketsFrom_i;
  });

  overflow = 0;
  Kokkos::parallel_reduce(Kokkos::RangePolicy<ImpExecSpace>(0, actualNumReceives),
      KOKKOS_LAMBDA(const size_type i, int& index) {
    if (recvcounts_d(getProcsFrom(i)) < 0) {
      index = i + 1;
    }
  }, overflow);

  TEUCHOS_TEST_FOR_EXCEPTION(overflow, std::logic_error,
                             "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): "
                             "Recv count for receive " << overflow-1 << " is too large "
                             "to be represented as int.");

  // Copy the device-computed counts and displacements back to the host
  // Views that MPI will read.
  Kokkos::deep_copy(sendcounts, sendcounts_d);
  Kokkos::deep_copy(sdispls, sdispls_d);
  Kokkos::deep_copy(recvcounts, recvcounts_d);
  Kokkos::deep_copy(rdispls, rdispls_d);

  Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm =
      Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm);
  Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm =
      mpiComm->getRawMpiComm();
  using T = typename ExpView::non_const_value_type;
  MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits<T>::getType(T());

#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
  if (Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL == plan.getSendType()) {
    MPIX_Comm *mpixComm = *plan.getMPIXComm();
    TEUCHOS_TEST_FOR_EXCEPTION(!mpixComm, std::runtime_error,
                               "MPIX_Comm is null in doPostsAllToAll \""
                               << __FILE__ << ":" << __LINE__ << "\"");

    const int err = MPIX_Alltoallv(
        exports.data(), sendcounts.data(), sdispls.data(), rawType,
        imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm);

    TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                               "MPIX_Alltoallv failed with error \""
                               << Teuchos::mpiErrorCodeToString(err)
                               << "\"");
    return;
  }
#endif // HAVE_TPETRACORE_MPI_ADVANCE

  const int err = MPI_Alltoallv(
      exports.data(), sendcounts.data(), sdispls.data(), rawType,
      imports.data(), recvcounts.data(), rdispls.data(), rawType, (*rawComm)());

  TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                             "MPI_Alltoallv failed with error \""
                             << Teuchos::mpiErrorCodeToString(err)
                             << "\"");
}
#if defined(HAVE_TPETRACORE_MPI_ADVANCE)
template <class ExpView, class ImpView>
void DistributorActor::doPostsNbrAllToAllV(
    const DistributorPlan &plan,
    const ExpView &exports,
    const Teuchos::ArrayView<const size_t> &numExportPacketsPerLID,
    const ImpView &imports,
    const Teuchos::ArrayView<const size_t> &numImportPacketsPerLID) {
  TEUCHOS_TEST_FOR_EXCEPTION(
      !plan.getIndicesTo().is_null(), std::runtime_error,
      "Send Type=\"Alltoall\" only works for fast-path communication.");

  const Teuchos_Ordinal numSends = plan.getProcsTo().size();
  const Teuchos_Ordinal numRecvs = plan.getProcsFrom().size();

  auto comm = plan.getComm();
  std::vector<int> sendcounts(numSends, 0);
  std::vector<int> sdispls(numSends, 0);
  std::vector<int> recvcounts(numRecvs, 0);
  std::vector<int> rdispls(numRecvs, 0);

  Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm =
      Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm);
  Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm =
      mpiComm->getRawMpiComm();
  using T = typename ExpView::non_const_value_type;
  MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits<T>::getType(T());

  size_t curPKToffset = 0;
  for (Teuchos_Ordinal pp = 0; pp < numSends; ++pp) {
    sdispls[pp] = curPKToffset;
    size_t numPackets = 0;
    for (size_t j = plan.getStartsTo()[pp];
         j < plan.getStartsTo()[pp] + plan.getLengthsTo()[pp]; ++j) {
      numPackets += numExportPacketsPerLID[j];
    }
    TEUCHOS_TEST_FOR_EXCEPTION(numPackets > size_t(INT_MAX), std::logic_error,
                               "Tpetra::Distributor::doPosts(4 args, Kokkos): "
                               "Send count for send " << pp << " (" << numPackets
                               << ") is too large "
                               "to be represented as int.");
    sendcounts[pp] = static_cast<int>(numPackets);
    curPKToffset += numPackets;
  }

  size_t curBufferOffset = 0;
  size_t curLIDoffset = 0;
  for (Teuchos_Ordinal i = 0; i < numRecvs; ++i) {
    size_t totalPacketsFrom_i = 0;
    for (size_t j = 0; j < plan.getLengthsFrom()[i]; ++j) {
      totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset + j];
    }
    curLIDoffset += plan.getLengthsFrom()[i];

    rdispls[i] = curBufferOffset;
    TEUCHOS_TEST_FOR_EXCEPTION(totalPacketsFrom_i > size_t(INT_MAX),
                               std::logic_error,
                               "Tpetra::Distributor::doPosts(4 args, Kokkos): "
                               "Recv count for receive " << i << " (" << totalPacketsFrom_i
                               << ") is too large "
                               "to be represented as int.");
    recvcounts[i] = static_cast<int>(totalPacketsFrom_i);
    curBufferOffset += totalPacketsFrom_i;
  }

  MPIX_Comm *mpixComm = *plan.getMPIXComm();
  const int err = MPIX_Neighbor_alltoallv(
      exports.data(), sendcounts.data(), sdispls.data(), rawType,
      imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm);

  TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                             "MPIX_Neighbor_alltoallv failed with error \""
                             << Teuchos::mpiErrorCodeToString(err)
                             << "\"");
}
template <class ExpView, class ExpPacketsView, class ImpView, class ImpPacketsView>
void DistributorActor::doPostsNbrAllToAllVKokkos(
    const DistributorPlan &plan,
    const ExpView &exports,
    const ExpPacketsView &numExportPacketsPerLID,
    const ImpView &imports,
    const ImpPacketsView &numImportPacketsPerLID) {
  TEUCHOS_TEST_FOR_EXCEPTION(
      !plan.getIndicesTo().is_null(), std::runtime_error,
      "Send Type=\"Alltoall\" only works for fast-path communication.");

  using ExpExecSpace = typename ExpPacketsView::execution_space;
  using ImpExecSpace = typename ImpPacketsView::execution_space;

  const Teuchos_Ordinal numSends = plan.getProcsTo().size();
  const Teuchos_Ordinal numRecvs = plan.getProcsFrom().size();

  auto comm = plan.getComm();
  Kokkos::View<int*, Kokkos::DefaultHostExecutionSpace> sendcounts("sendcounts", comm->getSize());
  Kokkos::View<int*, Kokkos::DefaultHostExecutionSpace> sdispls("sdispls", comm->getSize());
  Kokkos::View<int*, Kokkos::DefaultHostExecutionSpace> recvcounts("recvcounts", comm->getSize());
  Kokkos::View<int*, Kokkos::DefaultHostExecutionSpace> rdispls("rdispls", comm->getSize());

  auto sendcounts_d = Kokkos::create_mirror_view(ExpExecSpace(), sendcounts);
  auto sdispls_d = Kokkos::create_mirror_view(ExpExecSpace(), sdispls);
  auto recvcounts_d = Kokkos::create_mirror_view(ImpExecSpace(), recvcounts);
  auto rdispls_d = Kokkos::create_mirror_view(ImpExecSpace(), rdispls);

  auto getStartsTo = Kokkos::Compat::getKokkosViewDeepCopy<ExpExecSpace>(plan.getStartsTo());
  auto getLengthsTo = Kokkos::Compat::getKokkosViewDeepCopy<ExpExecSpace>(plan.getLengthsTo());

  Teuchos::RCP<const Teuchos::MpiComm<int>> mpiComm =
      Teuchos::rcp_dynamic_cast<const Teuchos::MpiComm<int>>(comm);
  Teuchos::RCP<const Teuchos::OpaqueWrapper<MPI_Comm>> rawComm =
      mpiComm->getRawMpiComm();
  using T = typename ExpView::non_const_value_type;
  MPI_Datatype rawType = ::Tpetra::Details::MpiTypeTraits<T>::getType(T());

  // Compute send displacements and counts on device.
  Kokkos::parallel_scan(Kokkos::RangePolicy<ExpExecSpace>(0, numSends),
      KOKKOS_LAMBDA(const Teuchos_Ordinal pp, size_t& curPKToffset, bool is_final) {
    sdispls_d(pp) = curPKToffset;
    size_t numPackets = 0;
    for (size_t j = getStartsTo(pp); j < getStartsTo(pp) + getLengthsTo(pp); ++j) {
      numPackets += numExportPacketsPerLID(j);
    }
    sendcounts_d(pp) = static_cast<int>(numPackets);
    curPKToffset += numPackets;
  });

  // A send count that overflowed int shows up as a negative value.
  int overflow = 0;
  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExpExecSpace>(0, numSends),
      KOKKOS_LAMBDA(const Teuchos_Ordinal pp, int& index) {
    if (sendcounts_d(pp) < 0) {
      index = pp + 1;
    }
  }, overflow);

  TEUCHOS_TEST_FOR_EXCEPTION(overflow, std::logic_error,
                             "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): "
                             "Send count for send " << overflow-1 << " is too large "
                             "to be represented as int.");

  auto getLengthsFrom = Kokkos::Compat::getKokkosViewDeepCopy<ImpExecSpace>(plan.getLengthsFrom());

  Kokkos::View<size_t*, ImpExecSpace> curLIDoffset("curLIDoffset", numRecvs);
  Kokkos::parallel_scan(Kokkos::RangePolicy<ImpExecSpace>(0, numRecvs),
      KOKKOS_LAMBDA(const Teuchos_Ordinal i, size_t& offset, bool is_final) {
    if(is_final) curLIDoffset(i) = offset;
    offset += getLengthsFrom(i);
  });

  // Compute receive displacements and counts on device.
  Kokkos::parallel_scan(Kokkos::RangePolicy<ImpExecSpace>(0, numRecvs),
      KOKKOS_LAMBDA(const Teuchos_Ordinal i, size_t& curBufferOffset, bool is_final) {
    rdispls_d(i) = curBufferOffset;
    size_t totalPacketsFrom_i = 0;
    for(size_t j = 0; j < getLengthsFrom(i); j++) {
      totalPacketsFrom_i += numImportPacketsPerLID(curLIDoffset(i) + j);
    }
    recvcounts_d(i) = static_cast<int>(totalPacketsFrom_i);
    curBufferOffset += totalPacketsFrom_i;
  });

  overflow = 0;
  Kokkos::parallel_reduce(Kokkos::RangePolicy<ImpExecSpace>(0, numRecvs),
      KOKKOS_LAMBDA(const Teuchos_Ordinal i, int& index) {
    if (recvcounts_d(i) < 0) {
      index = i + 1;
    }
  }, overflow);

  TEUCHOS_TEST_FOR_EXCEPTION(overflow, std::logic_error,
                             "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): "
                             "Recv count for receive " << overflow-1 << " is too large "
                             "to be represented as int.");

  // Copy the device-computed counts and displacements back to the host
  // Views that MPI will read.
  Kokkos::deep_copy(sendcounts, sendcounts_d);
  Kokkos::deep_copy(sdispls, sdispls_d);
  Kokkos::deep_copy(recvcounts, recvcounts_d);
  Kokkos::deep_copy(rdispls, rdispls_d);

  MPIX_Comm *mpixComm = *plan.getMPIXComm();
  const int err = MPIX_Neighbor_alltoallv(
      exports.data(), sendcounts.data(), sdispls.data(), rawType,
      imports.data(), recvcounts.data(), rdispls.data(), rawType, mpixComm);

  TEUCHOS_TEST_FOR_EXCEPTION(err != MPI_SUCCESS, std::runtime_error,
                             "MPIX_Neighbor_alltoallv failed with error \""
                             << Teuchos::mpiErrorCodeToString(err)
                             << "\"");
}
#endif // HAVE_TPETRACORE_MPI_ADVANCE
#endif // HAVE_TPETRA_MPI
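// Point-to-point implementation of the posts for the variable-packets-per-LID
// case.  The structure mirrors the three-argument doPosts above: ireceives are
// posted first, then sends go out either straight from the exports View or via
// a scratch send buffer when the data are not grouped by destination.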
template <class ExpView, class ImpView>
void DistributorActor::doPosts(const DistributorPlan& plan,
                               const ExpView &exports,
                               const Teuchos::ArrayView<const size_t>& numExportPacketsPerLID,
                               const ImpView &imports,
                               const Teuchos::ArrayView<const size_t>& numImportPacketsPerLID)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPosts must be Kokkos::Views");
  using Teuchos::Array;
  using Teuchos::as;
  using Teuchos::ireceive;
  using Teuchos::isend;
  using Teuchos::send;
  using Teuchos::TypeNameTraits;
  using Kokkos::Compat::create_const_view;
  using Kokkos::Compat::create_view;
  using Kokkos::Compat::subview_offset;
  using Kokkos::Compat::deep_copy_offset;
  typedef Array<size_t>::size_type size_type;
  typedef ExpView exports_view_type;
  typedef ImpView imports_view_type;

#ifdef KOKKOS_ENABLE_CUDA
  static_assert (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
                 ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
                 "Please do not use Tpetra::Distributor with UVM "
                 "allocations. See GitHub issue #1088.");
#endif // KOKKOS_ENABLE_CUDA

#ifdef KOKKOS_ENABLE_SYCL
  static_assert (! std::is_same<typename ExpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value &&
                 ! std::is_same<typename ImpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value,
                 "Please do not use Tpetra::Distributor with SharedUSM "
                 "allocations. See GitHub issue #1088 (corresponding to CUDA).");
#endif // KOKKOS_ENABLE_SYCL

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
  Teuchos::TimeMonitor timeMon (*timer_doPosts4KV_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

  const Details::EDistributorSendType sendType = plan.getSendType ();

#ifdef HAVE_TPETRA_MPI
  // All-to-all communication layout is quite different from
  // point-to-point, so we handle it separately.
  if (sendType == Details::DISTRIBUTOR_ALLTOALL) {
    doPostsAllToAll(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
    return;
  }
#ifdef HAVE_TPETRACORE_MPI_ADVANCE
  else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL) {
    doPostsAllToAll(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
    return;
  } else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) {
    doPostsNbrAllToAllV(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
    return;
  }
#endif // HAVE_TPETRACORE_MPI_ADVANCE

#else // HAVE_TPETRA_MPI
  if (plan.hasSelfMessage()) {

    size_t selfReceiveOffset = 0;

    // Compute the packet offset and count of the self-send.
    Array<size_t> sendPacketOffsets(plan.getNumSends(),0), packetsPerSend(plan.getNumSends(),0);
    size_t maxNumPackets = 0;
    size_t curPKToffset = 0;
    for (size_t pp=0; pp<plan.getNumSends(); ++pp) {
      sendPacketOffsets[pp] = curPKToffset;
      size_t numPackets = 0;
      for (size_t j=plan.getStartsTo()[pp]; j<plan.getStartsTo()[pp]+plan.getLengthsTo()[pp]; ++j) {
        numPackets += numExportPacketsPerLID[j];
      }
      if (numPackets > maxNumPackets) maxNumPackets = numPackets;
      packetsPerSend[pp] = numPackets;
      curPKToffset += numPackets;
    }

    deep_copy_offset(imports, exports, selfReceiveOffset,
                     sendPacketOffsets[0], packetsPerSend[0]);
  }
#endif // HAVE_TPETRA_MPI

  const int myProcID = plan.getComm()->getRank ();
  size_t selfReceiveOffset = 0;

#ifdef HAVE_TPETRA_DEBUG
  // Each message may carry a different number of packets.
  size_t totalNumImportPackets = 0;
  for (size_type ii = 0; ii < numImportPacketsPerLID.size (); ++ii) {
    totalNumImportPackets += numImportPacketsPerLID[ii];
  }
  TEUCHOS_TEST_FOR_EXCEPTION(
    imports.extent (0) < totalNumImportPackets, std::runtime_error,
    "Tpetra::Distributor::doPosts(4 args, Kokkos): The 'imports' array must have "
    "enough entries to hold the expected number of import packets. "
    "imports.extent(0) = " << imports.extent (0) << " < "
    "totalNumImportPackets = " << totalNumImportPackets << ".");
  TEUCHOS_TEST_FOR_EXCEPTION
    (requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
     "doPosts(4 args, Kokkos): Process " << myProcID << ": requests_.size () = "
     << requests_.size () << " != 0.");
#endif // HAVE_TPETRA_DEBUG

  const size_type actualNumReceives = as<size_type> (plan.getNumReceives()) +
    as<size_type> (plan.hasSelfMessage() ? 1 : 0);
  requests_.resize (0);

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4KV_recvs_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

  // Post the nonblocking receives.
  size_t curBufferOffset = 0;
  size_t curLIDoffset = 0;
  for (size_type i = 0; i < actualNumReceives; ++i) {
    size_t totalPacketsFrom_i = 0;
    for (size_t j = 0; j < plan.getLengthsFrom()[i]; ++j) {
      totalPacketsFrom_i += numImportPacketsPerLID[curLIDoffset+j];
    }
    TEUCHOS_TEST_FOR_EXCEPTION(totalPacketsFrom_i > size_t(INT_MAX),
      std::logic_error, "Tpetra::Distributor::doPosts(4 args, Kokkos): "
      "Recv count for receive " << i << " (" << totalPacketsFrom_i << ") is too large "
      "to be represented as int.");
    curLIDoffset += plan.getLengthsFrom()[i];
    if (plan.getProcsFrom()[i] != myProcID && totalPacketsFrom_i) {
      imports_view_type recvBuf =
        subview_offset (imports, curBufferOffset, totalPacketsFrom_i);
      requests_.push_back (ireceive<int> (recvBuf, plan.getProcsFrom()[i],
                                          mpiTag_, *plan.getComm()));
    }
    else { // receiving this message from myself
      selfReceiveOffset = curBufferOffset;
    }
    curBufferOffset += totalPacketsFrom_i;
  }

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4KV_sends_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

  // Compute the packet offset and count of each send.
  Array<size_t> sendPacketOffsets(plan.getNumSends(),0), packetsPerSend(plan.getNumSends(),0);
  size_t maxNumPackets = 0;
  size_t curPKToffset = 0;
  for (size_t pp=0; pp<plan.getNumSends(); ++pp) {
    sendPacketOffsets[pp] = curPKToffset;
    size_t numPackets = 0;
    for (size_t j=plan.getStartsTo()[pp]; j<plan.getStartsTo()[pp]+plan.getLengthsTo()[pp]; ++j) {
      numPackets += numExportPacketsPerLID[j];
    }
    if (numPackets > maxNumPackets) maxNumPackets = numPackets;
    TEUCHOS_TEST_FOR_EXCEPTION(numPackets > size_t(INT_MAX),
      std::logic_error, "Tpetra::Distributor::doPosts(4 args, Kokkos): "
      "packetsPerSend[" << pp << "] = " << numPackets << " is too large "
      "to be represented as int.");
    packetsPerSend[pp] = numPackets;
    curPKToffset += numPackets;
  }

  // Set up the sends, starting just past my own rank.
  size_t numBlocks = plan.getNumSends() + plan.hasSelfMessage();
  size_t procIndex = 0;
  while ((procIndex < numBlocks) && (plan.getProcsTo()[procIndex] < myProcID)) {
    ++procIndex;
  }
  if (procIndex == numBlocks) {
    procIndex = 0;
  }

  size_t selfNum = 0;
  size_t selfIndex = 0;
  if (plan.getIndicesTo().is_null()) {

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
    Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4KV_sends_fast_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

    // Data are already laid out contiguously by destination process ("fast" path).
    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      if (plan.getProcsTo()[p] != myProcID && packetsPerSend[p] > 0) {
        exports_view_type tmpSend =
          subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]);

        if (sendType == Details::DISTRIBUTOR_ISEND) {
          exports_view_type tmpSendBuf =
            subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]);
          requests_.push_back (isend<int> (tmpSendBuf, plan.getProcsTo()[p],
                                           mpiTag_, *plan.getComm()));
        }
        else { // DISTRIBUTOR_SEND
          send<int> (tmpSend,
                     as<int> (tmpSend.size ()),
                     plan.getProcsTo()[p], mpiTag_, *plan.getComm());
        }
      }
      else { // "sending" to myself
        selfNum = p;
      }
    }

    if (plan.hasSelfMessage()) {
      deep_copy_offset(imports, exports, selfReceiveOffset,
                       sendPacketOffsets[selfNum], packetsPerSend[selfNum]);
    }
  }
  else { // data are not blocked by process; use a scratch send buffer

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
    Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4KV_sends_slow_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

    typedef typename ExpView::non_const_value_type Packet;
    typedef typename ExpView::array_layout Layout;
    typedef typename ExpView::device_type Device;
    typedef typename ExpView::memory_traits Mem;

#ifdef HAVE_TPETRA_DEBUG
    if (sendType != Details::DISTRIBUTOR_SEND) {
      if (plan.getComm()->getRank() == 0)
        std::cout << "The requested Tpetra send type "
                  << DistributorSendTypeEnumToString(sendType)
                  << " requires Distributor data to be ordered by"
                  << " the receiving processor rank. Since these"
                  << " data are not ordered, Tpetra will use Send"
                  << " instead." << std::endl;
    }
#endif // HAVE_TPETRA_DEBUG

    Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray",
                                                       maxNumPackets);

    // Offset of the first packet of each LID within the exports View.
    Array<size_t> indicesOffsets (numExportPacketsPerLID.size(), 0);
    size_t ioffset = 0;
    for (int j=0; j<numExportPacketsPerLID.size(); ++j) {
      indicesOffsets[j] = ioffset;
      ioffset += numExportPacketsPerLID[j];
    }

    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      if (plan.getProcsTo()[p] != myProcID) {
        size_t sendArrayOffset = 0;
        size_t j = plan.getStartsTo()[p];
        size_t numPacketsTo_p = 0;
        // Pack this destination's data into the contiguous send buffer.
        for (size_t k = 0; k < plan.getLengthsTo()[p]; ++k, ++j) {
          numPacketsTo_p += numExportPacketsPerLID[j];
          deep_copy_offset(sendArray, exports, sendArrayOffset,
                           indicesOffsets[j], numExportPacketsPerLID[j]);
          sendArrayOffset += numExportPacketsPerLID[j];
        }
        typename ExpView::execution_space().fence();

        if (numPacketsTo_p > 0) {
          auto tmpSend =
            subview_offset(sendArray, size_t(0), numPacketsTo_p);

          send<int> (tmpSend,
                     as<int> (tmpSend.size ()),
                     plan.getProcsTo()[p], mpiTag_, *plan.getComm());
        }
      }
      else { // "sending" to myself
        selfNum = p;
        selfIndex = plan.getStartsTo()[p];
      }
    }

    if (plan.hasSelfMessage()) {
      for (size_t k = 0; k < plan.getLengthsTo()[selfNum]; ++k) {
        deep_copy_offset(imports, exports, selfReceiveOffset,
                         indicesOffsets[selfIndex],
                         numExportPacketsPerLID[selfIndex]);
        selfReceiveOffset += numExportPacketsPerLID[selfIndex];
        ++selfIndex;
      }
    }
  }
}
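// Kokkos variant of the four-argument doPosts: per-send offsets and counts are
// computed with device parallel_scan / parallel_reduce, and the slow-path
// packing uses Kokkos::Experimental::local_deep_copy inside a parallel_scan
// instead of host-side packOffset calls.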
template <class ExpView, class ExpPacketsView, class ImpView, class ImpPacketsView>
void DistributorActor::doPostsKokkos(const DistributorPlan& plan,
                                     const ExpView &exports,
                                     const ExpPacketsView &numExportPacketsPerLID,
                                     const ImpView &imports,
                                     const ImpPacketsView &numImportPacketsPerLID)
{
  static_assert(areKokkosViews<ExpView, ImpView>,
                "Data arrays for DistributorActor::doPostsKokkos must be Kokkos::Views");
  static_assert(areKokkosViews<ExpPacketsView, ImpPacketsView>,
                "Num packets arrays for DistributorActor::doPostsKokkos must be Kokkos::Views");
  using Teuchos::Array;
  using Teuchos::as;
  using Teuchos::ireceive;
  using Teuchos::isend;
  using Teuchos::send;
  using Teuchos::TypeNameTraits;
  using Kokkos::Compat::create_const_view;
  using Kokkos::Compat::create_view;
  using Kokkos::Compat::subview_offset;
  using Kokkos::Compat::deep_copy_offset;
  using ExpExecSpace = typename ExpPacketsView::execution_space;
  using ImpExecSpace = typename ImpPacketsView::execution_space;
  typedef Array<size_t>::size_type size_type;
  typedef ExpView exports_view_type;
  typedef ImpView imports_view_type;

#ifdef KOKKOS_ENABLE_CUDA
  static_assert (! std::is_same<typename ExpView::memory_space, Kokkos::CudaUVMSpace>::value &&
                 ! std::is_same<typename ImpView::memory_space, Kokkos::CudaUVMSpace>::value,
                 "Please do not use Tpetra::Distributor with UVM "
                 "allocations. See GitHub issue #1088.");
#endif // KOKKOS_ENABLE_CUDA

#ifdef KOKKOS_ENABLE_SYCL
  static_assert (! std::is_same<typename ExpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value &&
                 ! std::is_same<typename ImpView::memory_space, Kokkos::Experimental::SYCLSharedUSMSpace>::value,
                 "Please do not use Tpetra::Distributor with SharedUSM "
                 "allocations. See GitHub issue #1088 (corresponding to CUDA).");
#endif // KOKKOS_ENABLE_SYCL

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
  Teuchos::TimeMonitor timeMon (*timer_doPosts4KV_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

  const Details::EDistributorSendType sendType = plan.getSendType ();

#ifdef HAVE_TPETRA_MPI
  // All-to-all communication layout is quite different from
  // point-to-point, so we handle it separately.
  if (sendType == Details::DISTRIBUTOR_ALLTOALL) {
    doPostsAllToAllKokkos(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
    return;
  }
#ifdef HAVE_TPETRACORE_MPI_ADVANCE
  else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_ALLTOALL) {
    doPostsAllToAllKokkos(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
    return;
  } else if (sendType == Details::DISTRIBUTOR_MPIADVANCE_NBRALLTOALLV) {
    doPostsNbrAllToAllVKokkos(plan, exports, numExportPacketsPerLID, imports, numImportPacketsPerLID);
    return;
  }
#endif // HAVE_TPETRACORE_MPI_ADVANCE

#else // HAVE_TPETRA_MPI
  if (plan.hasSelfMessage()) {
    size_t packetsPerSend;
    Kokkos::parallel_reduce(Kokkos::RangePolicy<ExpExecSpace>(plan.getStartsTo()[0], plan.getStartsTo()[0]+plan.getLengthsTo()[0]),
        KOKKOS_LAMBDA(const size_t j, size_t& packets) {
      packets += numExportPacketsPerLID(j);
    }, packetsPerSend);

    deep_copy_offset(imports, exports, (size_t)0, (size_t)0, packetsPerSend);
  }
#endif // HAVE_TPETRA_MPI

  const int myProcID = plan.getComm()->getRank ();
  size_t selfReceiveOffset = 0;

#ifdef HAVE_TPETRA_DEBUG
  // Each message may carry a different number of packets.
  size_t totalNumImportPackets = Kokkos::Experimental::reduce(ImpExecSpace(), numImportPacketsPerLID);
  TEUCHOS_TEST_FOR_EXCEPTION(
    imports.extent (0) < totalNumImportPackets, std::runtime_error,
    "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): The 'imports' array must have "
    "enough entries to hold the expected number of import packets. "
    "imports.extent(0) = " << imports.extent (0) << " < "
    "totalNumImportPackets = " << totalNumImportPackets << ".");
  TEUCHOS_TEST_FOR_EXCEPTION
    (requests_.size () != 0, std::logic_error, "Tpetra::Distributor::"
     "doPostsKokkos(4 args, Kokkos): Process " << myProcID << ": requests_.size () = "
     << requests_.size () << " != 0.");
#endif // HAVE_TPETRA_DEBUG

  const size_type actualNumReceives = as<size_type> (plan.getNumReceives()) +
    as<size_type> (plan.hasSelfMessage() ? 1 : 0);
  requests_.resize (0);

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
  Teuchos::TimeMonitor timeMonRecvs (*timer_doPosts4KV_recvs_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

  // Post the nonblocking receives.
  size_t curBufferOffset = 0;
  size_t curLIDoffset = 0;
  for (size_type i = 0; i < actualNumReceives; ++i) {
    size_t totalPacketsFrom_i = 0;
    Kokkos::parallel_reduce(Kokkos::RangePolicy<ImpExecSpace>(0, plan.getLengthsFrom()[i]),
        KOKKOS_LAMBDA(const size_t j, size_t& total) {
      total += numImportPacketsPerLID(curLIDoffset+j);
    }, totalPacketsFrom_i);

    TEUCHOS_TEST_FOR_EXCEPTION(totalPacketsFrom_i > size_t(INT_MAX),
      std::logic_error, "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): "
      "Recv count for receive " << i << " (" << totalPacketsFrom_i << ") is too large "
      "to be represented as int.");
    curLIDoffset += plan.getLengthsFrom()[i];
    if (plan.getProcsFrom()[i] != myProcID && totalPacketsFrom_i) {
      imports_view_type recvBuf =
        subview_offset (imports, curBufferOffset, totalPacketsFrom_i);
      requests_.push_back (ireceive<int> (recvBuf, plan.getProcsFrom()[i],
                                          mpiTag_, *plan.getComm()));
    }
    else { // receiving this message from myself
      selfReceiveOffset = curBufferOffset;
    }
    curBufferOffset += totalPacketsFrom_i;
  }

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
  Teuchos::TimeMonitor timeMonSends (*timer_doPosts4KV_sends_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

  // Compute the packet offset and count of each send on device.
  Kokkos::View<size_t*, Kokkos::DefaultHostExecutionSpace> sendPacketOffsets("sendPacketOffsets", plan.getNumSends());
  Kokkos::View<size_t*, Kokkos::DefaultHostExecutionSpace> packetsPerSend("packetsPerSend", plan.getNumSends());
  auto sendPacketOffsets_d = Kokkos::create_mirror_view(ExpExecSpace(), sendPacketOffsets);
  auto packetsPerSend_d = Kokkos::create_mirror_view(ExpExecSpace(), packetsPerSend);

  auto starts = Kokkos::Compat::getKokkosViewDeepCopy<ExpExecSpace>(plan.getStartsTo());
  auto lengths = Kokkos::Compat::getKokkosViewDeepCopy<ExpExecSpace>(plan.getLengthsTo());

  Kokkos::parallel_scan(Kokkos::RangePolicy<ExpExecSpace>(0, plan.getNumSends()),
      KOKKOS_LAMBDA(const size_t pp, size_t& curPKToffset, bool final_pass) {
    if(final_pass) sendPacketOffsets_d(pp) = curPKToffset;
    size_t numPackets = 0;
    for(size_t j = starts(pp); j < starts(pp) + lengths(pp); j++) {
      numPackets += numExportPacketsPerLID(j);
    }
    if(final_pass) packetsPerSend_d(pp) = numPackets;
    curPKToffset += numPackets;
  });

  size_t maxNumPackets;
  Kokkos::parallel_reduce(Kokkos::RangePolicy<ExpExecSpace>(0, plan.getNumSends()),
      KOKKOS_LAMBDA(const size_t pp, size_t& max) {
    if(packetsPerSend_d(pp) > max) {
      max = packetsPerSend_d(pp);
    }
  }, Kokkos::Max<size_t>(maxNumPackets));

  // Copy the device-computed offsets and counts back to the host Views.
  Kokkos::deep_copy(sendPacketOffsets, sendPacketOffsets_d);
  Kokkos::deep_copy(packetsPerSend, packetsPerSend_d);

  TEUCHOS_TEST_FOR_EXCEPTION(maxNumPackets > size_t(INT_MAX),
    std::logic_error, "Tpetra::Distributor::doPostsKokkos(4 args, Kokkos): "
    "numPackets = " << maxNumPackets << " is too large "
    "to be represented as int.");

  // Set up the sends, starting just past my own rank.
  size_t numBlocks = plan.getNumSends() + plan.hasSelfMessage();
  size_t procIndex = 0;
  while ((procIndex < numBlocks) && (plan.getProcsTo()[procIndex] < myProcID)) {
    ++procIndex;
  }
  if (procIndex == numBlocks) {
    procIndex = 0;
  }

  size_t selfNum = 0;
  size_t selfIndex = 0;
  if (plan.getIndicesTo().is_null()) {

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
    Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4KV_sends_fast_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

    // Data are already laid out contiguously by destination process ("fast" path).
    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      if (plan.getProcsTo()[p] != myProcID && packetsPerSend[p] > 0) {
        exports_view_type tmpSend =
          subview_offset(exports, sendPacketOffsets[p], packetsPerSend[p]);

        if (sendType == Details::DISTRIBUTOR_ISEND) {
          exports_view_type tmpSendBuf =
            subview_offset (exports, sendPacketOffsets[p], packetsPerSend[p]);
          requests_.push_back (isend<int> (tmpSendBuf, plan.getProcsTo()[p],
                                           mpiTag_, *plan.getComm()));
        }
        else { // DISTRIBUTOR_SEND
          send<int> (tmpSend,
                     as<int> (tmpSend.size ()),
                     plan.getProcsTo()[p], mpiTag_, *plan.getComm());
        }
      }
      else { // "sending" to myself
        selfNum = p;
      }
    }

    if (plan.hasSelfMessage()) {
      deep_copy_offset(imports, exports, selfReceiveOffset,
                       sendPacketOffsets[selfNum], packetsPerSend[selfNum]);
    }
  }
  else { // data are not blocked by process; use a scratch send buffer

#ifdef HAVE_TPETRA_DISTRIBUTOR_TIMINGS
    Teuchos::TimeMonitor timeMonSends2 (*timer_doPosts4KV_sends_slow_);
#endif // HAVE_TPETRA_DISTRIBUTOR_TIMINGS

    typedef typename ExpView::non_const_value_type Packet;
    typedef typename ExpView::array_layout Layout;
    typedef typename ExpView::device_type Device;
    typedef typename ExpView::memory_traits Mem;

#ifdef HAVE_TPETRA_DEBUG
    if (sendType != Details::DISTRIBUTOR_SEND) {
      if (plan.getComm()->getRank() == 0)
        std::cout << "The requested Tpetra send type "
                  << DistributorSendTypeEnumToString(sendType)
                  << " requires Distributor data to be ordered by"
                  << " the receiving processor rank. Since these"
                  << " data are not ordered, Tpetra will use Send"
                  << " instead." << std::endl;
    }
#endif // HAVE_TPETRA_DEBUG

    Kokkos::View<Packet*,Layout,Device,Mem> sendArray ("sendArray",
                                                       maxNumPackets);

    // Offset of the first packet of each LID within the exports View.
    Kokkos::View<size_t*, ExpExecSpace> indicesOffsets ("indicesOffsets", numExportPacketsPerLID.extent(0));
    Kokkos::parallel_scan(Kokkos::RangePolicy<ExpExecSpace>(0, numExportPacketsPerLID.extent(0)),
        KOKKOS_LAMBDA(const size_t j, size_t& offset, bool is_final) {
      if(is_final) indicesOffsets(j) = offset;
      offset += numExportPacketsPerLID(j);
    });

    for (size_t i = 0; i < numBlocks; ++i) {
      size_t p = i + procIndex;
      if (p > (numBlocks - 1)) {
        p -= numBlocks;
      }

      if (plan.getProcsTo()[p] != myProcID) {
        size_t j = plan.getStartsTo()[p];
        size_t numPacketsTo_p = 0;

        // Pack this destination's data into the contiguous send buffer.
        auto sendArrayMirror = Kokkos::create_mirror_view_and_copy(ExpExecSpace(), sendArray);
        auto exportsMirror = Kokkos::create_mirror_view_and_copy(ExpExecSpace(), exports);
        Kokkos::parallel_scan(Kokkos::RangePolicy<ExpExecSpace>(0, plan.getLengthsTo()[p]),
            KOKKOS_LAMBDA(const size_t k, size_t& offset, bool is_final) {
          if(is_final) {
            const size_t dst_end = offset + numExportPacketsPerLID(j + k);
            const size_t src_end = indicesOffsets(j + k) + numExportPacketsPerLID(j + k);
            auto dst_sub = Kokkos::subview(sendArrayMirror, Kokkos::make_pair(offset, dst_end));
            auto src_sub = Kokkos::subview(exportsMirror, Kokkos::make_pair(indicesOffsets(j + k), src_end));
            Kokkos::Experimental::local_deep_copy(dst_sub, src_sub);
          }
          offset += numExportPacketsPerLID(j + k);
        }, numPacketsTo_p);
        Kokkos::deep_copy(sendArray, sendArrayMirror);
        typename ExpView::execution_space().fence();

        if (numPacketsTo_p > 0) {
          auto tmpSend =
            subview_offset(sendArray, size_t(0), numPacketsTo_p);

          send<int> (tmpSend,
                     as<int> (tmpSend.size ()),
                     plan.getProcsTo()[p], mpiTag_, *plan.getComm());
        }
      }
      else { // "sending" to myself
        selfNum = p;
        selfIndex = plan.getStartsTo()[p];
      }
    }

    if (plan.hasSelfMessage()) {
      size_t temp = 0;
      auto importsMirror = Kokkos::create_mirror_view_and_copy(ExpExecSpace(), imports);
      auto exportsMirror = Kokkos::create_mirror_view_and_copy(ExpExecSpace(), exports);
      Kokkos::parallel_scan(Kokkos::RangePolicy<ExpExecSpace>(0, plan.getLengthsTo()[selfNum]),
          KOKKOS_LAMBDA(const size_t k, size_t& offset, bool is_final) {
        if(is_final) {
          const size_t dst_end = selfReceiveOffset + offset + numExportPacketsPerLID(selfIndex + k);
          const size_t src_end = indicesOffsets(selfIndex + k) + numExportPacketsPerLID(selfIndex + k);
          auto dst_sub = Kokkos::subview(importsMirror, Kokkos::make_pair(selfReceiveOffset + offset, dst_end));
          auto src_sub = Kokkos::subview(exportsMirror, Kokkos::make_pair(indicesOffsets(selfIndex + k), src_end));
          Kokkos::Experimental::local_deep_copy(dst_sub, src_sub);
        }
        offset += numExportPacketsPerLID(selfIndex + k);
      }, temp);
      Kokkos::deep_copy(imports, importsMirror);

      selfIndex += plan.getLengthsTo()[selfNum];
      selfReceiveOffset += temp;
    }
  }
}

} // namespace Details
} // namespace Tpetra

#endif // TPETRA_DETAILS_DISTRIBUTOR_ACTOR_HPP