42 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
43 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
56 #include "Kokkos_Core.hpp"
81 #ifndef DOXYGEN_SHOULD_SKIP_THIS
84 #endif // DOXYGEN_SHOULD_SKIP_THIS
91 namespace UnpackAndCombineCrsGraphImpl {
// Unpack the packets belonging to one row out of the flat `imports` buffer.
//
// As read by the loops below, the row's data starts at `imports(offset)`:
// first `num_ent` global column indices, then -- consumed only when
// `pids_out` is non-empty -- `num_ent` process ranks starting at
// `offset + num_ent`, each narrowed to `int`.
//
// NOTE(review): this listing is missing several original lines (the return
// type, the declaration of `offset`, the closing braces). Callers in this
// file store the result in an `int`, so it presumably returns an error
// code -- confirm against the real header.
102 template<
class Packet,
class GO,
class Device,
class BufferDevice>
104 unpackRow (
const Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
105 const Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
106 const Kokkos::View<const Packet*,BufferDevice>& imports,
108 const size_t num_ent)
110 using size_type =
typename Kokkos::View<GO*,Device>::size_type;
// Copy the row's global indices.
118 for (size_type k=0; k<num_ent; k++)
119 gids_out(k) = imports(offset+k);
// An empty pids_out view is how a caller opts out of unpacking ranks.
122 if (pids_out.size() > 0) {
123 for (size_type k=0; k<num_ent; k++) {
124 pids_out(k) =
static_cast<int>(imports(offset+num_ent+k));
// Reduction functor that unpacks imported rows and appends their global
// indices to the target graph's index array.
//
// The reduction value is a (error code, row) pair: init() produces
// (0, invalid), and operator() overwrites it with (1, i), (2, i) or (3, i)
// when row i fails one of the checks below.
//
// NOTE(review): the struct header, the remaining template parameters, and
// the declarations of `max_num_ent` / `unpack_pids` are not present in this
// listing; the name UnpackAndCombineFunctor is taken from its use elsewhere
// in the file.
141 template<
class LocalOrdinal,
148 using LO = LocalOrdinal;
149 using GO =
typename IndicesView::value_type;
150 using packet_type = Packet;
151 using row_ptrs_type = RowView;
152 using indices_type = IndicesView;
153 using buffer_device_type = BufferDevice;
155 using device_type =
typename IndicesView::device_type;
156 using execution_space =
typename device_type::execution_space;
// View types for the inputs (buffer-device side) and the offsets.
158 using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
159 using offsets_type = Kokkos::View<const size_t*, device_type>;
160 using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
161 using import_lids_type = Kokkos::View<const LO*, buffer_device_type>;
// Scratch storage types for one row's unpacked indices and ranks.
163 using gids_scratch_type = Kokkos::View<GO*, device_type>;
164 using pids_scratch_type = Kokkos::View<int*,device_type>;
166 row_ptrs_type row_ptrs_beg;
167 row_ptrs_type row_ptrs_end;
168 indices_type indices;
169 input_buffer_type imports;
170 num_packets_per_lid_type num_packets_per_lid;
171 import_lids_type import_lids;
172 offsets_type offsets;
// Each acquired token selects a disjoint max_num_ent-long slice of the
// scratch views (see operator()).
175 Kokkos::Experimental::UniqueToken<execution_space,
176 Kokkos::Experimental::UniqueTokenScope::Global> tokens;
177 gids_scratch_type gids_scratch;
178 pids_scratch_type pids_scratch;
// first = error code (0 means success), second = offending index.
181 using value_type = Kokkos::pair<int, LO>;
// Constructor: stores the views and allocates scratch space of
// tokens.size() * max_num_ent entries for both gids and pids.
// NOTE(review): the initializers for indices/imports/offsets are not
// visible in this listing.
184 const row_ptrs_type& row_ptrs_beg_in,
185 const row_ptrs_type& row_ptrs_end_in,
186 const indices_type& indices_in,
187 const input_buffer_type& imports_in,
188 const num_packets_per_lid_type& num_packets_per_lid_in,
189 const import_lids_type& import_lids_in,
190 const offsets_type& offsets_in,
191 const size_t max_num_ent_in,
192 const bool unpack_pids_in) :
193 row_ptrs_beg(row_ptrs_beg_in),
194 row_ptrs_end(row_ptrs_end_in),
197 num_packets_per_lid(num_packets_per_lid_in),
198 import_lids(import_lids_in),
200 max_num_ent(max_num_ent_in),
201 unpack_pids(unpack_pids_in),
202 tokens(execution_space()),
203 gids_scratch(
"gids_scratch", tokens.size() * max_num_ent),
204 pids_scratch(
"pids_scratch", tokens.size() * max_num_ent)
// Reduction identity: no error, no offending row.
207 KOKKOS_INLINE_FUNCTION
void init(value_type& dst)
const
209 using Tpetra::Details::OrdinalTraits;
210 dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
// Combine two partial results. The visible conditions only act when `src`
// carries a valid row, and then when `dst` has none or `src`'s row is
// smaller. NOTE(review): the assignment itself is not visible here;
// presumably dst takes src so the first bad row wins -- confirm.
213 KOKKOS_INLINE_FUNCTION
void
214 join(
volatile value_type& dst,
const volatile value_type& src)
const
220 using Tpetra::Details::OrdinalTraits;
221 if (src.second != OrdinalTraits<LO>::invalid()) {
226 if (dst.second == OrdinalTraits<LO>::invalid() ||
227 src.second < dst.second) {
// Unpack import i and append its indices to row import_lids(i).
233 KOKKOS_INLINE_FUNCTION
234 void operator()(
const LO i, value_type& dst)
const
237 using Kokkos::subview;
238 using Kokkos::MemoryUnmanaged;
239 using size_type =
typename execution_space::size_type;
240 using slice =
typename Kokkos::pair<size_type, size_type>;
242 using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
243 using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
// With PIDs packed, each entry takes two packets (index + rank).
245 const size_t num_packets_this_lid = num_packets_per_lid(i);
246 const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
247 : num_packets_this_lid;
// Error 1: an odd packet count cannot hold (index, rank) pairs.
248 if (unpack_pids && num_packets_this_lid%2 != 0) {
251 dst = Kokkos::make_pair(1, i);
261 const size_t buf_size = imports.size();
262 const size_t offset = offsets(i);
// Error 2: this row's packets would run past the end of the buffer.
264 if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
265 dst = Kokkos::make_pair(2, i);
// Carve this token's [a, b) slice out of the scratch views. When PIDs are
// not unpacked the pids slice is the empty range [a, a).
272 const size_type token = tokens.acquire();
273 const size_t a =
static_cast<size_t>(token) * max_num_ent;
274 const size_t b = a + num_ent;
275 gids_out_type gids_out = subview(gids_scratch, slice(a, b));
276 pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
278 const int err = unpackRow (gids_out, pids_out, imports, offset, num_ent);
// Error 3 path; the token is released before leaving.
// NOTE(review): the guarding condition (presumably err != 0) is not
// visible in this listing.
281 dst = Kokkos::make_pair(3, i);
282 tokens.release(token);
// Append the unpacked indices at the row's current end, advancing the end
// pointer one entry at a time.
286 auto import_lid = import_lids(i);
287 for (
size_t k = 0; k < num_ent; ++k) {
288 indices(row_ptrs_end(import_lid)) = gids_out(k);
290 row_ptrs_end(import_lid) += 1;
293 tokens.release(token);
// Build a map from import LID to the number of entries that will be
// unpacked into that row: num_packets/2 when PIDs are packed alongside the
// indices, num_packets otherwise.
//
// Throws std::runtime_error if any insertion into the map failed.
//
// NOTE(review): the template parameter that appears below as "ImportL" /
// "ids" on two lines looks like a single name, `ImportLids` (the spelling
// used everywhere else in this function), split by the listing extraction.
// The return statement and closing braces are also not visible here.
298 template<
class NumPackets,
class ImportL
ids,
class Device>
299 Kokkos::UnorderedMap<
typename ImportLids::non_const_value_type,
300 typename NumPackets::non_const_value_type,
302 computeCrsPadding(
const NumPackets& num_packets_per_lid,
303 const ImportLids& import_lids,
304 const bool unpack_pids)
308 using key_type =
typename ImportLids::non_const_value_type;
309 using val_type =
typename NumPackets::non_const_value_type;
// Capacity hint: one slot per import LID.
310 Kokkos::UnorderedMap<key_type, val_type, Device> padding(import_lids.size());
311 auto policy = Kokkos::RangePolicy<typename Device::execution_space>(0, import_lids.size());
312 Kokkos::parallel_for(
"Fill padding", policy,
313 KOKKOS_LAMBDA(
typename ImportLids::size_type i) {
314 auto how_much_padding = (unpack_pids) ? num_packets_per_lid(i)/2
315 : num_packets_per_lid(i);
316 padding.insert(import_lids(i), how_much_padding);
// Inserts happen inside the kernel, so failure is checked afterwards.
319 TEUCHOS_TEST_FOR_EXCEPTION(padding.failed_insert(), std::runtime_error,
320 "computeCrsPadding: failed to insert one or more indices in to padding map");
// Driver for unpacking imported rows into an existing graph's arrays.
//
// Visible steps: return early-ish when there are no import LIDs, compute
// per-row padding and pad the CRS arrays, allocate an offsets view, find
// the largest per-row entry count, then run UnpackAndCombineFunctor as a
// parallel_reduce and throw std::runtime_error if it reports a non-zero
// error code.
//
// NOTE(review): the function name line, the declaration of `padding` and
// `max_num_ent`, and the code that fills `offsets` are not visible in this
// listing; the name comes from `prefix` below.
330 template<
class LocalOrdinal,
class Packet,
class RowView,
331 class IndicesView,
class BufferDevice>
334 (
const RowView& row_ptrs_beg,
335 const RowView& row_ptrs_end,
336 IndicesView& indices,
337 const Kokkos::View<const Packet*, BufferDevice, Kokkos::MemoryUnmanaged>& imports,
338 const Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>& num_packets_per_lid,
339 const Kokkos::View<const LocalOrdinal*, BufferDevice, Kokkos::MemoryUnmanaged>& import_lids,
340 const bool unpack_pids)
343 using ImportLidsView =
344 Kokkos::View<const LocalOrdinal*, BufferDevice, Kokkos::MemoryUnmanaged>;
345 using NumPacketsView =
346 Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>;
347 using LO = LocalOrdinal;
348 using execution_space =
typename BufferDevice::execution_space;
350 Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
351 using unpack_functor_type =
352 UnpackAndCombineFunctor<LO, Packet, RowView, IndicesView, BufferDevice>;
354 const char prefix[] =
355 "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
// Nothing to unpack (body of this branch is not visible in the listing).
357 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
358 if (num_import_lids == 0) {
363 using device_type =
typename IndicesView::device_type;
// Make room in the target rows for the incoming entries.
367 computeCrsPadding<NumPacketsView, ImportLidsView, device_type>
368 (num_packets_per_lid, import_lids, unpack_pids);
369 padCrsArrays<RowView, IndicesView, decltype (padding) > (row_ptrs_beg, row_ptrs_end, indices, padding);
// One offset per import plus a final end marker.
372 Kokkos::View<size_t*, device_type> offsets(
"offsets", num_import_lids+1);
// Max-reduce the per-row entry count; it sizes the functor's scratch.
379 Kokkos::parallel_reduce
381 range_policy (0, LO (num_packets_per_lid.size ())),
382 KOKKOS_LAMBDA (
const LO i,
size_t& running_max_num_ent) {
383 const size_t num_packets_this_lid = num_packets_per_lid(i);
384 const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2 :
385 num_packets_this_lid;
386 if (num_ent > running_max_num_ent) {
387 running_max_num_ent = num_ent;
389 }, Kokkos::Max<size_t> (max_num_ent));
392 unpack_functor_type f (row_ptrs_beg, row_ptrs_end, indices, imports,
393 num_packets_per_lid, import_lids, offsets,
394 max_num_ent, unpack_pids);
// x receives (error code, row) from the functor's reduction.
396 typename unpack_functor_type::value_type x;
397 Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
398 auto x_h = x.to_std_pair();
399 TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
400 prefix <<
"UnpackAndCombineFunctor reported error code "
401 << x_h.first <<
" for the first bad row " << x_h.second);
// Accumulate an entry count from three sources, as far as the visible
// reductions show:
//   1. the lengths of the first `num_same_ids` rows of `local_graph`,
//   2. the lengths of the rows named by `permute_from_lids`,
//   3. num_packets_per_lid(i) / 2 summed over all imports.
//
// The packet-buffer parameter is unnamed and therefore unused here.
//
// NOTE(review): the function name, return type, the declarations of
// `count` and `num_items`, and the first two parallel_reduce call heads are
// missing from this listing. The name is inferred from the same-named
// Details-level wrapper later in the file -- confirm.
404 template<
class Packet,
class LocalGraph,
class BufferDevice>
407 const LocalGraph& local_graph,
408 const Kokkos::View<
const typename LocalGraph::data_type*,
409 typename LocalGraph::device_type,
410 Kokkos::MemoryUnmanaged> permute_from_lids,
411 const Kokkos::View<const Packet*, BufferDevice>& ,
412 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
413 const size_t num_same_ids)
415 using Kokkos::parallel_reduce;
416 using local_graph_type = LocalGraph;
417 using LO =
typename local_graph_type::data_type;
418 using device_type =
typename local_graph_type::device_type;
419 using execution_space =
typename device_type::execution_space;
420 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
// (1) Rows shared by source and target: row length = row_map[lid+1] - row_map[lid].
426 num_items =
static_cast<LO
>(num_same_ids);
430 range_policy(0, num_items),
431 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
432 update +=
static_cast<size_t>(local_graph.row_map[lid+1]
433 -local_graph.row_map[lid]);
// (2) Permuted rows, looked up through permute_from_lids.
439 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
443 range_policy(0, num_items),
444 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
445 const LO lid = permute_from_lids(i);
446 update +=
static_cast<size_t>(local_graph.row_map[lid+1]
447 - local_graph.row_map[lid]);
// (3) Imported rows: half the packet count per LID.
454 size_t tot_num_ent = 0;
455 parallel_reduce(
"SumReduce",
456 num_packets_per_lid.size(),
457 KOKKOS_LAMBDA(
const int& i,
size_t& lsum) {
458 lsum += num_packets_per_lid(i) / 2;
459 }, Kokkos::Sum<size_t>(tot_num_ent));
460 count += tot_num_ent;
// For every import i, atomically add num_packets_per_lid(i) / 2 to
// tgt_rowptr(import_lids(i)). The packet-buffer parameter is unnamed and
// unused.
//
// NOTE(review): the only using-declaration visible is
// `using Kokkos::parallel_reduce;`, yet the kernel is launched with an
// unqualified `parallel_for`. Either a line is missing from this listing or
// the call resolves some other way; `using Kokkos::parallel_for;` looks
// like what was intended -- confirm. The return type and the range-policy
// argument line are also not visible.
467 template<
class Packet,
class LO,
class Device,
class BufferDevice>
469 setupRowPointersForRemotes(
470 const Kokkos::View<size_t*, Device>& tgt_rowptr,
471 const Kokkos::View<const LO*, BufferDevice>& import_lids,
472 const Kokkos::View<const Packet*, BufferDevice>& ,
473 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
475 using Kokkos::parallel_reduce;
476 using device_type = Device;
477 using execution_space =
typename device_type::execution_space;
478 using size_type =
typename Kokkos::View<size_t*,device_type>::size_type;
479 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
481 const size_type N = num_packets_per_lid.extent(0);
482 parallel_for(
"Setup row pointers for remotes",
484 KOKKOS_LAMBDA(
const size_t i){
// Value type of tgt_rowptr's elements, used to type the increment.
485 using atomic_incr_type =
typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
486 const size_t num_packets_this_lid = num_packets_per_lid(i);
487 const size_t num_ent = num_packets_this_lid / 2;
488 Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
// Scan kernel over tgt_rowptr. In the visible part of the lambda, the
// current value is read, tgt_rowptr(i) is overwritten with the running
// `update`, and the same value is mirrored into new_start_row(i).
//
// NOTE(review): the return type, the parallel_scan call head, the use of
// `final`, and the statement that advances `update` are not visible in this
// listing. From the name this presumably turns per-row lengths into row
// offsets (an exclusive prefix sum) -- confirm against the real header.
493 template<
class Device>
495 makeCrsRowPtrFromLengths(
496 const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
497 const Kokkos::View<size_t*,Device>& new_start_row)
499 using Kokkos::parallel_scan;
500 using device_type = Device;
501 using execution_space =
typename device_type::execution_space;
502 using size_type =
typename Kokkos::View<size_t*,device_type>::size_type;
503 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
// The scan length comes from new_start_row, not tgt_rowptr.
504 const size_type N = new_start_row.extent(0);
507 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
508 auto cur_val = tgt_rowptr(i);
510 tgt_rowptr(i) = update;
511 new_start_row(i) = tgt_rowptr(i);
// Copy the first `num_same_ids` rows of `local_graph` into the target
// arrays. Source and target LIDs are both i. For each entry the local
// column index is converted to a global index via `local_col_map`, and the
// target PID is the source PID, or -1 when that PID equals `my_pid`.
// new_start_row(tgt_lid) is advanced by the row length.
//
// NOTE(review): the function-name line and the `my_pid` parameter are not
// visible in this listing; the name comes from the call site elsewhere in
// the file.
518 template<
class LocalGraph,
class LocalMap>
521 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
522 typename LocalMap::device_type>& tgt_colind,
523 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
524 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
525 const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
526 const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
527 const LocalGraph& local_graph,
528 const LocalMap& local_col_map,
529 const size_t num_same_ids,
532 using Kokkos::parallel_for;
533 using device_type =
typename LocalMap::device_type;
534 using LO =
typename LocalMap::local_ordinal_type;
535 using execution_space =
typename device_type::execution_space;
536 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
539 range_policy(0, num_same_ids),
540 KOKKOS_LAMBDA(
const size_t i) {
541 using atomic_incr_type =
typename std::remove_reference<decltype(new_start_row(0))>::type;
// Start of the source row in the source graph.
543 const LO src_lid =
static_cast<LO
>(i);
544 size_t src_row = local_graph.row_map(src_lid);
// Start of the target row in the target arrays.
546 const LO tgt_lid =
static_cast<LO
>(i);
547 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Reserve the row's entries in new_start_row.
549 const size_t nsr = local_graph.row_map(src_lid+1)
550 - local_graph.row_map(src_lid);
551 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// j walks the source row; (j - src_row) is the position within the row.
553 for (
size_t j=local_graph.row_map(src_lid);
554 j<local_graph.row_map(src_lid+1); ++j) {
555 LO src_col = local_graph.entries(j);
556 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
557 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Copy permuted rows of `local_graph` into the target arrays: for each i,
// source row permute_from_lids(i) is written to target row
// permute_to_lids(i). Local column indices are converted to global ones via
// `local_col_map`; the target PID is the source PID, or -1 when it equals
// `my_pid`. new_start_row(tgt_lid) is advanced by the row length.
//
// NOTE(review): the return type, the `my_pid` parameter and the
// parallel_for call head are not visible in this listing.
563 template<
class LocalGraph,
class LocalMap,
class BufferDevice>
565 copyDataFromPermuteIDs(
566 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
567 typename LocalMap::device_type>& tgt_colind,
568 const Kokkos::View<
int*,
569 typename LocalMap::device_type>& tgt_pids,
570 const Kokkos::View<
size_t*,
571 typename LocalMap::device_type>& new_start_row,
572 const Kokkos::View<
size_t*,
573 typename LocalMap::device_type>& tgt_rowptr,
574 const Kokkos::View<
const int*,
575 typename LocalMap::device_type>& src_pids,
576 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
577 BufferDevice, Kokkos::MemoryUnmanaged>& permute_to_lids,
578 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
579 BufferDevice, Kokkos::MemoryUnmanaged>& permute_from_lids,
580 const LocalGraph& local_graph,
581 const LocalMap& local_col_map,
584 using Kokkos::parallel_for;
585 using device_type =
typename LocalMap::device_type;
586 using LO =
typename LocalMap::local_ordinal_type;
587 using execution_space =
typename device_type::execution_space;
588 using size_type =
typename Kokkos::View<LO*,device_type>::size_type;
589 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
// The loop length is taken from permute_to_lids only.
591 const size_type num_permute_to_lids = permute_to_lids.extent(0);
594 range_policy(0, num_permute_to_lids),
595 KOKKOS_LAMBDA(
const size_t i) {
596 using atomic_incr_type =
typename std::remove_reference<decltype(new_start_row(0))>::type;
598 const LO src_lid = permute_from_lids(i);
599 const size_t src_row = local_graph.row_map(src_lid);
601 const LO tgt_lid = permute_to_lids(i);
602 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Reserve the row's entries in new_start_row.
604 size_t nsr = local_graph.row_map(src_lid+1)
605 - local_graph.row_map(src_lid);
606 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// j walks the source row; (j - src_row) is the position within the row.
608 for (
size_t j=local_graph.row_map(src_lid);
609 j<local_graph.row_map(src_lid+1); ++j) {
610 LO src_col = local_graph.entries(j);
611 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
612 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Unpack every imported row directly into tgt_colind / tgt_pids.
//
// For import i, a block of num_packets/2 entries is claimed in row
// import_lids(i) by atomically advancing new_start_row, unpackRow() fills
// that block, and any unpacked PID equal to `my_pid` is rewritten to -1.
// unpackRow's return values are summed into the reduction variable, and
// std::invalid_argument is thrown when `gbl_err_count` is non-zero.
//
// NOTE(review): several parameter lines (including `my_pid`), the
// declaration of `gbl_err_count`, and the tail of the parallel_reduce call
// are not visible in this listing.
618 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
620 unpackAndCombineIntoCrsArrays2(
621 const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
622 const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
623 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
624 const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
626 const typename LocalMap::local_ordinal_type*,
628 Kokkos::MemoryUnmanaged>& import_lids,
629 const Kokkos::View<const Packet*, BufferDevice>& imports,
630 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
636 using Kokkos::subview;
637 using Kokkos::MemoryUnmanaged;
638 using Kokkos::parallel_reduce;
639 using Kokkos::atomic_fetch_add;
641 using device_type =
typename LocalMap::device_type;
642 using LO =
typename LocalMap::local_ordinal_type;
643 using GO =
typename LocalMap::global_ordinal_type;
644 using execution_space =
typename device_type::execution_space;
645 using size_type =
typename Kokkos::View<LO*, device_type>::size_type;
646 using slice =
typename Kokkos::pair<size_type, size_type>;
647 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
649 using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
650 using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
652 const size_type num_import_lids = import_lids.size();
653 const char prefix[] =
"UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
657 parallel_reduce(
"Unpack and combine into CRS",
658 range_policy(0, num_import_lids),
659 KOKKOS_LAMBDA(
const size_t i,
int& err) {
660 using atomic_incr_type =
typename std::remove_reference< decltype( new_start_row(0) )>::type;
// Index and PID are packed together, so entries = packets / 2.
661 const size_t num_packets_this_lid = num_packets_per_lid(i);
662 const size_t num_ent = num_packets_this_lid / 2;
663 const size_t offset = offsets(i);
664 const LO lcl_row = import_lids(i);
// Claim [start_row, end_row) in the target row; fetch_add returns the
// value before the increment.
665 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
666 const size_t end_row = start_row + num_ent;
668 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
669 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
671 err += unpackRow (gids_out, pids_out, imports, offset, num_ent);
// Entries owned by this process are marked with -1.
674 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
675 const int pid = pids_out(j);
676 pids_out(j) = (pid != my_pid) ? pid : -1;
680 TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
681 std::invalid_argument, prefix <<
682 "Attempting to unpack PIDs, but num_ent is not even; this should never "
683 "happen! Please report this bug to the Tpetra developers.");
// Kernel-level driver that assembles the target CRS arrays (tgt_rowptr,
// tgt_colind, tgt_pids) from local and imported rows.
//
// Visible sequence:
//   1. a kernel over [0, N+1) whose body is not shown here,
//   2. tgt_rowptr(lid) = source row length for the num_same_ids shared rows,
//   3. the same for permuted rows (permute_from_lids -> permute_to_lids),
//   4. an `offsets` view of num_import_lids+1 entries (debug builds check
//      its last entry against imports.extent(0)),
//   5. setupRowPointersForRemotes() adds the imported rows' counts,
//   6. makeCrsRowPtrFromLengths() post-processes tgt_rowptr and fills
//      new_start_row; tgt_rowptr(N) must then equal tgt_num_nonzeros,
//   7. copyDataFromSameIDs(), copyDataFromPermuteIDs(),
//   8. unpackAndCombineIntoCrsArrays2() for the imported rows.
//
// Throws std::invalid_argument if the final row pointer does not match
// tgt_num_nonzeros; std::logic_error from the debug-only offsets check.
//
// NOTE(review): the function-name line, the kernel call heads, the code
// that fills `offsets`, and the body of the empty-imports branch are not
// visible in this listing; the name comes from `prefix` below.
688 template<
class Packet,
class LocalGraph,
class LocalMap,
class BufferDevice>
691 const LocalGraph & local_graph,
692 const LocalMap & local_col_map,
693 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
695 Kokkos::MemoryUnmanaged>& import_lids,
696 const Kokkos::View<const Packet*, BufferDevice>& imports,
697 const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
698 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
700 Kokkos::MemoryUnmanaged>& permute_to_lids,
701 const Kokkos::View<
const typename LocalMap::local_ordinal_type*,
703 Kokkos::MemoryUnmanaged>& permute_from_lids,
704 const Kokkos::View<
size_t*,
705 typename LocalMap::device_type,
706 Kokkos::MemoryUnmanaged>& tgt_rowptr,
707 const Kokkos::View<
typename LocalMap::global_ordinal_type*,
708 typename LocalMap::device_type,
709 Kokkos::MemoryUnmanaged>& tgt_colind,
710 const Kokkos::View<
const int*,
711 typename LocalMap::device_type,
712 Kokkos::MemoryUnmanaged>& src_pids,
713 const Kokkos::View<
int*,
714 typename LocalMap::device_type,
715 Kokkos::MemoryUnmanaged>& tgt_pids,
716 const size_t num_same_ids,
717 const size_t tgt_num_rows,
718 const size_t tgt_num_nonzeros,
719 const int my_tgt_pid)
722 using Kokkos::subview;
723 using Kokkos::parallel_for;
724 using Kokkos::MemoryUnmanaged;
725 using packet_type = Packet;
726 using local_map_type = LocalMap;
727 using local_graph_type = LocalGraph;
728 using buffer_device_type = BufferDevice;
729 using device_type =
typename LocalMap::device_type;
730 using LO =
typename LocalMap::local_ordinal_type;
731 using execution_space =
typename device_type::execution_space;
732 using size_type =
typename Kokkos::View<LO*, device_type>::size_type;
733 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
735 const char prefix[] =
"UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
737 const size_t N = tgt_num_rows;
738 const size_t mynnz = tgt_num_nonzeros;
742 const int my_pid = my_tgt_pid;
// Step 1: kernel over all N+1 row-pointer slots (body not visible here).
751 range_policy(0, N+1),
752 KOKKOS_LAMBDA(
const size_t i) {
// Step 2: shared rows -- store each row's length in tgt_rowptr.
759 range_policy(0, num_same_ids),
760 KOKKOS_LAMBDA(
const size_t i) {
761 const LO tgt_lid =
static_cast<LO
>(i);
762 const LO src_lid =
static_cast<LO
>(i);
763 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
764 - local_graph.row_map(src_lid);
// Step 3: permuted rows -- same, with the LIDs remapped.
769 const size_type num_permute_to_lids = permute_to_lids.extent(0);
771 range_policy(0, num_permute_to_lids),
772 KOKKOS_LAMBDA(
const size_t i) {
773 const LO tgt_lid = permute_to_lids(i);
774 const LO src_lid = permute_from_lids(i);
775 tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
776 - local_graph.row_map(src_lid);
// Step 4: offsets into the import buffer, one per import plus an end marker.
781 const size_type num_import_lids = import_lids.extent(0);
782 View<size_t*, device_type> offsets(
"offsets", num_import_lids+1);
785 #ifdef HAVE_TPETRA_DEBUG
787 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
788 const bool condition =
789 nth_offset_h !=
static_cast<size_t>(imports.extent(0));
790 TEUCHOS_TEST_FOR_EXCEPTION
791 (condition, std::logic_error, prefix
792 <<
"The final offset in bytes " << nth_offset_h
793 <<
" != imports.size() = " << imports.extent(0)
794 <<
". Please report this bug to the Tpetra developers.");
796 #endif // HAVE_TPETRA_DEBUG
// Step 5: add the imported rows' entry counts to tgt_rowptr.
799 setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
800 tgt_rowptr, import_lids, imports, num_packets_per_lid);
// Step 6: post-process tgt_rowptr and initialise new_start_row from it.
804 View<size_t*, device_type> new_start_row(
"new_start_row", N+1);
807 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
// NOTE(review): the message below prints e.g. "CRS_rowptr[last] = 5!= mynnz"
// -- the literal has no space before "!=".
809 auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
810 bool condition = nth_tgt_rowptr_h != mynnz;
811 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
812 prefix <<
"CRS_rowptr[last] = " <<
813 nth_tgt_rowptr_h <<
"!= mynnz = " << mynnz <<
".");
// Step 7: copy locally available rows.
817 copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
818 tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
820 copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
821 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
822 local_graph, local_col_map, my_pid);
// NOTE(review): if extent(0) is unsigned, as is usual for Kokkos views,
// `<= 0` is just `== 0` -- confirm and consider spelling it that way.
824 if (imports.extent(0) <= 0) {
// Step 8: unpack the imported rows.
828 unpackAndCombineIntoCrsArrays2<
829 packet_type,local_graph_type,local_map_type,buffer_device_type>(
830 tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
831 num_packets_per_lid, local_graph, local_col_map, my_pid);
// Host-facing entry point: unpack received packets into a globally indexed
// CrsGraph.
//
// Visible steps: wrap the host ArrayViews for use on the buffer device
// (pointer, size, copy flag `true`, label), allocate `indices` and
// `row_ptrs_beg` with the same extents as the graph's k_gblInds1D_ and
// k_rowPtrs_, build `row_ptrs_end`, call the kernel-level
// unpackAndCombine() with unpack_pids = false, and finally run a
// "Fill num entries" kernel when refill_num_row_entries is set.
//
// NOTE(review): the function-name line, most parameters, the helper that
// creates the device mirrors, the copies into indices/row_ptrs_beg, and the
// condition that picks between the two "Fill end row pointers" kernels are
// not visible in this listing. The name is inferred from the
// explicit-instantiation macro at the end of the file.
//
// NOTE(review): range_policy uses IndexType<LO>, but the kernels below
// declare their index as `const size_t i` -- confirm this mismatch is
// intended.
871 template<
class LO,
class GO,
class Node>
876 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
877 const Teuchos::ArrayView<const LO>& importLIDs,
884 "Graph must be globally indexed!");
888 using UnpackAndCombineCrsGraphImpl::unpackAndCombine;
890 using device_type =
typename Node::device_type;
891 using buffer_device_type =
typename graph_type::buffer_device_type;
892 using execution_space =
typename device_type::execution_space;
893 using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
894 using row_ptrs_type =
typename graph_type::local_graph_type::row_map_type::non_const_type;
895 using indices_type =
typename graph_type::t_GlobalOrdinal_1D;
899 buffer_device_type bufferOutputDevice;
// Device-side views of the three host input arrays.
906 imports.getRawPtr(), imports.size(),
909 auto num_packets_per_lid_d =
911 numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
912 true,
"num_packets_per_lid");
916 importLIDs.getRawPtr(), importLIDs.size(),
917 true,
"import_lids");
// Fresh, non-const working arrays sized like the graph's own storage.
921 indices_type indices(
"indices", graph.
k_gblInds1D_.extent(0));
924 row_ptrs_type row_ptrs_beg(
"row_ptrs_beg", graph.
k_rowPtrs_.extent(0));
// N = number of rows; guards against an empty row-pointer array.
927 const size_t N = (row_ptrs_beg.extent(0) == 0 ? 0 : row_ptrs_beg.extent(0) - 1);
928 row_ptrs_type row_ptrs_end(
"row_ptrs_end", N);
930 bool refill_num_row_entries =
false;
// Variant A: row end = row begin + the stored per-row entry count.
933 refill_num_row_entries =
true;
935 Kokkos::parallel_for(
"Fill end row pointers", range_policy(0, N),
936 KOKKOS_LAMBDA(
const size_t i){
937 row_ptrs_end(i) = row_ptrs_beg(i) + num_row_entries(i);
// Variant B: rows are full, so row end = the next row's begin.
945 Kokkos::parallel_for(
"Fill end row pointers",
946 range_policy(0, N), KOKKOS_LAMBDA(
const size_t i){
947 row_ptrs_end(i) = row_ptrs_beg(i+1);
// Only global indices are unpacked here (unpack_pids = false).
952 unpackAndCombine<LO, GO, row_ptrs_type, indices_type, buffer_device_type>
953 (row_ptrs_beg, row_ptrs_end, indices, imports_d,
954 num_packets_per_lid_d, import_lids_d,
false);
958 if (refill_num_row_entries) {
959 Kokkos::parallel_for(
"Fill num entries",
960 range_policy(0, N), KOKKOS_LAMBDA(
const size_t i){
// DualView-based variant of unpackCrsGraphAndCombine. It is not finished:
// the first visible statement throws std::logic_error("METHOD NOT
// COMPLETE") unconditionally, so nothing after it executes.
//
// The code after the throw syncs the DualViews to device, takes their
// device views, and calls unpackAndCombine() on default-constructed
// (empty) indices / row-pointer views.
//
// NOTE(review): unpackAndCombine is called here with six explicit template
// arguments, while the definition visible earlier in this file shows five
// template parameters (no separate device type). This would need
// reconciling before the method is enabled -- confirm against the real
// header. Most parameters and the `*_nc` declarations are not visible in
// this listing.
970 template<
class LO,
class GO,
class Node>
972 unpackCrsGraphAndCombineNew(
976 const Kokkos::DualView<
const size_t*,
978 const Kokkos::DualView<
const LO*,
984 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::logic_error,
"METHOD NOT COMPLETE");
986 using UnpackAndCombineCrsGraphImpl::unpackAndCombine;
989 using device_type =
typename Node::device_type;
991 using packet_type =
typename graph_type::packet_type;
992 using local_graph_type =
typename graph_type::local_graph_type;
993 using buffer_device_type =
typename graph_type::buffer_device_type;
994 using buffer_memory_space =
typename buffer_device_type::memory_space;
995 using memory_space =
typename device_type::memory_space;
997 using row_ptrs_type =
typename graph_type::local_graph_type::row_map_type::non_const_type;
998 using execution_space =
typename device_type::execution_space;
999 using indices_type = Kokkos::View<GO*, execution_space>;
1001 static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
1002 "Node::device_type and LocalGraph::device_type must be "
// Make sure the device copies are current before taking device views.
1007 numPacketsPerLID_nc.sync_device ();
1009 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
// importLIDs is asserted to be already in sync rather than synced here.
1011 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1012 auto import_lids_d = importLIDs.view_device ();
1016 imports_nc.sync_device ();
1018 auto imports_d = imports.view_device ();
// Placeholders: these views are default-constructed and never filled.
1022 indices_type indices;
1023 row_ptrs_type row_ptrs_beg;
1024 row_ptrs_type row_ptrs_end;
1025 unpackAndCombine<LO,packet_type,row_ptrs_type,indices_type,device_type,buffer_device_type>(
1026 row_ptrs_beg, row_ptrs_end, indices, imports_d,
1027 num_packets_per_lid_d, import_lids_d,
false);
// Host-facing wrapper for the counting kernel.
//
// Validates its inputs, creates device-side views of the host arrays
// (pointer, size, copy flag `true`, label), and forwards them with the
// local graph and numSameIDs to the kernel-level implementation.
//
// Throws std::invalid_argument when:
//   - permuteToLIDs and permuteFromLIDs differ in size,
//   - the source graph is not locally indexed,
//   - importLIDs and numPacketsPerLID differ in size.
//
// NOTE(review): the function-name line, several parameters, the
// declarations of `locallyIndexed` / `local_graph` / `imports_d`, and the
// callee's name are not visible in this listing. The name is inferred from
// `prefix` below.
1080 template<
class LocalOrdinal,
class GlobalOrdinal,
class Node>
1084 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1086 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1091 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1092 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1094 using Kokkos::MemoryUnmanaged;
1096 using device_type =
typename Node::device_type;
1100 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
// The two permute arrays are indexed in lockstep by the kernels.
1102 TEUCHOS_TEST_FOR_EXCEPTION
1103 (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1104 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size() <<
" != "
1105 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1109 TEUCHOS_TEST_FOR_EXCEPTION
1110 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1111 "CrsGraph 'sourceGraph' must be locally indexed.");
// One packet count per import LID.
1112 TEUCHOS_TEST_FOR_EXCEPTION
1113 (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1114 prefix <<
"importLIDs.size() = " << importLIDs.size() <<
" != "
1115 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
// Device-side views of the host inputs.
1118 auto permute_from_lids_d =
1120 permuteFromLIDs.getRawPtr(),
1121 permuteFromLIDs.size(),
true,
1122 "permute_from_lids");
1125 imports.getRawPtr(),
1126 imports.size(),
true,
1128 auto num_packets_per_lid_d =
1130 numPacketsPerLID.getRawPtr(),
1131 numPacketsPerLID.size(),
true,
1132 "num_packets_per_lid");
// Delegate to the kernel-level implementation.
1135 packet_type,local_graph_type,buffer_device_type>(
1136 local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
// Host-facing wrapper that fills caller-provided CRS arrays (CRS_rowptr,
// CRS_colind, TargetPids) from a source graph plus received packets.
//
// Visible steps: validate sizes, size TargetPids to TargetNumNonzeros and
// fill it with -1, create device-side views of every host array (pointer,
// size, copy flag `true`, label), call the kernel-level implementation, and
// wrap the three output host arrays as HostMirror views.
//
// Throws std::invalid_argument when:
//   - CRS_rowptr.size() != TargetNumRows + 1,
//   - permuteToLIDs and permuteFromLIDs differ in size,
//   - importLIDs and numPacketsPerLID differ in size.
//
// NOTE(review): the function-name line, several parameters, the helper that
// creates the device views, and whatever uses the HostMirror views at the
// end (presumably copies back to the host arrays) are not visible in this
// listing. The name comes from `prefix` below.
1152 template<
class LocalOrdinal,
class GlobalOrdinal,
class Node>
1156 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1158 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1162 const size_t numSameIDs,
1163 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1164 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1165 size_t TargetNumRows,
1166 size_t TargetNumNonzeros,
1167 const int MyTargetPID,
1168 const Teuchos::ArrayView<size_t>& CRS_rowptr,
1169 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1170 const Teuchos::ArrayView<const int>& SourcePids,
1171 Teuchos::Array<int>& TargetPids)
1175 using Teuchos::outArg;
1176 using Teuchos::REDUCE_MAX;
1177 using Teuchos::reduceAll;
1178 using LO = LocalOrdinal;
1179 using GO = GlobalOrdinal;
1181 using packet_type =
typename crs_graph_type::packet_type;
1182 using local_graph_type =
typename crs_graph_type::local_graph_type;
1183 using buffer_device_type =
typename crs_graph_type::buffer_device_type;
1184 using device_type =
typename Node::device_type;
1185 using size_type =
typename Teuchos::ArrayView<const LO>::size_type;
1187 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
// NOTE(review): this message and the next have no space before "!=", so
// they print e.g. "CRS_rowptr.size() = 5!= TargetNumRows+1 = 6.".
1189 TEUCHOS_TEST_FOR_EXCEPTION(
1190 TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
1191 std::invalid_argument, prefix <<
"CRS_rowptr.size() = " <<
1192 CRS_rowptr.size() <<
"!= TargetNumRows+1 = " << TargetNumRows+1 <<
".");
1194 TEUCHOS_TEST_FOR_EXCEPTION(
1195 permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1196 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size()
1197 <<
"!= permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1198 const size_type numImportLIDs = importLIDs.size();
1200 TEUCHOS_TEST_FOR_EXCEPTION(
1201 numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
1202 prefix <<
"importLIDs.size() = " << numImportLIDs <<
" != "
1203 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
// Size the output PID array and reset every entry to -1.
// NOTE(review): if Teuchos::Array::assign(n, v) resizes the way
// std::vector::assign does, the explicit resize is redundant -- confirm.
1206 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1207 TargetPids.resize(TargetNumNonzeros);
1209 TargetPids.assign(TargetNumNonzeros, -1);
1213 auto local_col_map = sourceGraph.
getColMap()->getLocalMap();
// Device-side views: inputs on the buffer device, outputs on the node device.
1216 device_type outputDevice;
1217 buffer_device_type bufferOutputDevice;
1219 Kokkos::View<const LO*, buffer_device_type> import_lids_d =
1221 (bufferOutputDevice, importLIDs.getRawPtr(),
1222 importLIDs.size(),
true,
"import_lids");
1224 Kokkos::View<const packet_type*, buffer_device_type> imports_d =
1226 (bufferOutputDevice, imports.getRawPtr(),
1227 imports.size(),
true,
"imports");
1229 Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_d =
1231 numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1232 true,
"num_packets_per_lid");
1234 Kokkos::View<const LO*, buffer_device_type> permute_to_lids_d =
1236 permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1237 true,
"permute_to_lids");
1239 Kokkos::View<const LO*, buffer_device_type> permute_from_lids_d =
1241 permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1242 true,
"permute_from_lids");
1244 Kokkos::View<size_t*, device_type> crs_rowptr_d =
1246 CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1247 true,
"crs_rowptr");
1249 Kokkos::View<GO*, device_type> crs_colind_d =
1251 CRS_colind.getRawPtr(), CRS_colind.size(),
1252 true,
"crs_colidx");
1254 Kokkos::View<const int*, device_type> src_pids_d =
1256 SourcePids.getRawPtr(), SourcePids.size(),
1259 Kokkos::View<int*, device_type> tgt_pids_d =
1261 TargetPids.getRawPtr(), TargetPids.size(),
// Run the kernel-level implementation on the device views.
1264 using local_map_type = decltype(local_col_map);
1266 packet_type,local_graph_type,local_map_type,buffer_device_type>(
1267 local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1268 permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1269 tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
// Host views wrapping the caller's output arrays.
1274 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1275 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1278 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1279 CRS_colind.getRawPtr(), CRS_colind.size());
1282 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1283 TargetPids.getRawPtr(), TargetPids.size());
// Explicit-instantiation macro for one (LO, GO, NT) combination. The
// visible lines name four Details:: function templates:
// unpackCrsGraphAndCombine, unpackCrsGraphAndCombineNew,
// unpackAndCombineIntoCrsArrays and unpackAndCombineWithOwningPIDsCount.
//
// NOTE(review): the `template <return type>` lines that introduce each
// instantiation and several parameter lines are not visible in this
// listing. Do not add comments inside the macro body: every line but the
// last ends in a line continuation.
1291 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
1293 Details::unpackCrsGraphAndCombine<LO, GO, NT>( \
1294 CrsGraph<LO, GO, NT>&, \
1295 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1296 const Teuchos::ArrayView<const size_t>&, \
1297 const Teuchos::ArrayView<const LO>&, \
1302 Details::unpackCrsGraphAndCombineNew<LO, GO, NT>( \
1303 CrsGraph<LO, GO, NT>&, \
1304 const Kokkos::DualView<const CrsGraph<LO, GO, NT>::packet_type*, \
1305 CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1306 const Kokkos::DualView<const size_t*, \
1307 CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1308 const Kokkos::DualView<const LO*, \
1309 CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1312 const CombineMode); \
1314 Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
1315 const CrsGraph<LO, GO, NT> &, \
1316 const Teuchos::ArrayView<const LO>&, \
1317 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1318 const Teuchos::ArrayView<const size_t>&, \
1321 const CombineMode, \
1323 const Teuchos::ArrayView<const LO>&, \
1324 const Teuchos::ArrayView<const LO>&, \
1328 const Teuchos::ArrayView<size_t>&, \
1329 const Teuchos::ArrayView<GO>&, \
1330 const Teuchos::ArrayView<const int>&, \
1331 Teuchos::Array<int>&); \
1333 Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
1334 const CrsGraph<LO, GO, NT> &, \
1335 const Teuchos::ArrayView<const LO> &, \
1336 const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
1337 const Teuchos::ArrayView<const size_t>&, \
1342 const Teuchos::ArrayView<const LO>&, \
1343 const Teuchos::ArrayView<const LO>&);
1345 #endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
t_GlobalOrdinal_1D k_gblInds1D_
Global column indices for all rows.
Declaration of the Tpetra::CrsGraph class.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
Functions for manipulating CRS arrays.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsetsFromConstantCount.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
Sets up and executes a communication plan for a Tpetra DistObject.
local_graph_type::row_map_type::const_type k_rowPtrs_
Row offsets for "1-D" storage.
CombineMode
Rule for combining data in an Import or Export.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Unpacks and combines a single row of the CrsGraph.
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
void unpackCrsGraphAndCombine(CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode)
Unpack the imported column indices and combine into graph.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpetra.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
local_graph_type getLocalGraph() const
Get the local graph.
Declaration and definition of Tpetra::Details::getEntryOnHost.
global_ordinal_type packet_type
Type of each entry of the DistObject communication buffer.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.