40 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
43 #include "TpetraCore_config.h"
44 #include "Teuchos_Array.hpp"
45 #include "Teuchos_ArrayView.hpp"
46 #include "Teuchos_OrdinalTraits.hpp"
54 #include "Kokkos_Core.hpp"
81 #ifndef DOXYGEN_SHOULD_SKIP_THIS
84 #endif // DOXYGEN_SHOULD_SKIP_THIS
91 namespace UnpackAndCombineCrsMatrixImpl {
102 template<
class ST,
class LO,
class GO>
104 unpackRow(
const typename PackTraits<GO>::output_array_type& gids_out,
106 const typename PackTraits<ST>::output_array_type& vals_out,
107 const char imports[],
110 const size_t num_ent,
111 const size_t bytes_per_value)
117 bool unpack_pids = pids_out.size() > 0;
119 const size_t num_ent_beg = offset;
122 const size_t gids_beg = num_ent_beg + num_ent_len;
123 const size_t gids_len =
126 const size_t pids_beg = gids_beg + gids_len;
127 const size_t pids_len = unpack_pids ?
131 const size_t vals_beg = gids_beg + gids_len + pids_len;
132 const size_t vals_len = num_ent * bytes_per_value;
134 const char*
const num_ent_in = imports + num_ent_beg;
135 const char*
const gids_in = imports + gids_beg;
136 const char*
const pids_in = unpack_pids ? imports + pids_beg :
nullptr;
137 const char*
const vals_in = imports + vals_beg;
139 size_t num_bytes_out = 0;
142 if (static_cast<size_t> (num_ent_out) != num_ent) {
147 Kokkos::pair<int, size_t> p;
152 num_bytes_out += p.second;
159 num_bytes_out += p.second;
166 num_bytes_out += p.second;
169 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
170 if (num_bytes_out != expected_num_bytes) {
186 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
188 typedef LocalMatrix local_matrix_type;
191 typedef typename local_matrix_type::value_type ST;
192 typedef typename local_map_type::local_ordinal_type LO;
193 typedef typename local_map_type::global_ordinal_type GO;
194 typedef typename local_map_type::device_type DT;
195 typedef typename DT::execution_space XS;
197 typedef Kokkos::View<const size_t*, BufferDeviceType>
198 num_packets_per_lid_type;
199 typedef Kokkos::View<const size_t*, DT> offsets_type;
200 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
201 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
203 typedef Kokkos::View<int, DT> error_type;
204 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
206 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
207 "LocalMap::local_ordinal_type and "
208 "LocalMatrix::ordinal_type must be the same.");
210 local_matrix_type local_matrix;
212 input_buffer_type imports;
213 num_packets_per_lid_type num_packets_per_lid;
214 import_lids_type import_lids;
215 Kokkos::View<const LO*[2], DT> batch_info;
216 offsets_type offsets;
219 size_t bytes_per_value;
221 error_type error_code;
224 const local_matrix_type& local_matrix_in,
226 const input_buffer_type& imports_in,
227 const num_packets_per_lid_type& num_packets_per_lid_in,
228 const import_lids_type& import_lids_in,
229 const Kokkos::View<
const LO*[2], DT>& batch_info_in,
230 const offsets_type& offsets_in,
232 const size_t batch_size_in,
233 const size_t bytes_per_value_in,
234 const bool atomic_in) :
235 local_matrix (local_matrix_in),
236 local_col_map (local_col_map_in),
237 imports (imports_in),
238 num_packets_per_lid (num_packets_per_lid_in),
239 import_lids (import_lids_in),
240 batch_info (batch_info_in),
241 offsets (offsets_in),
242 combine_mode (combine_mode_in),
243 batch_size (batch_size_in),
244 bytes_per_value (bytes_per_value_in),
249 KOKKOS_INLINE_FUNCTION
250 void operator()(member_type team_member)
const
253 using Kokkos::subview;
254 using Kokkos::MemoryUnmanaged;
256 const LO batch = team_member.league_rank();
257 const LO lid_no = batch_info(batch, 0);
258 const LO batch_no = batch_info(batch, 1);
260 const size_t num_bytes = num_packets_per_lid(lid_no);
267 const LO import_lid = import_lids(lid_no);
268 const size_t buf_size = imports.size();
269 const size_t offset = offsets(lid_no);
273 const char*
const in_buf = imports.data() + offset;
275 const size_t num_entries_in_row =
static_cast<size_t>(num_ent_LO);
278 size_t expected_num_bytes = 0;
285 if (expected_num_bytes > num_bytes)
288 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
289 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
290 (
int) lid_no, (
int) expected_num_bytes, (
int) num_bytes
292 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
296 if (offset > buf_size || offset + num_bytes > buf_size)
299 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
300 "At row %d, the offset (%d) > buffer size (%d)\n",
301 (
int) lid_no, (
int) offset, (
int) buf_size
303 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
308 size_t num_entries_in_batch = 0;
309 if (num_entries_in_row <= batch_size)
310 num_entries_in_batch = num_entries_in_row;
311 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
312 num_entries_in_batch = batch_size;
314 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
317 const size_t num_ent_start = offset;
318 const size_t num_ent_end = num_ent_start + bytes_per_lid;
321 const size_t gids_start = num_ent_end;
322 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
324 const size_t vals_start = gids_end;
326 const size_t shift = batch_no * batch_size;
327 const char*
const num_ent_in = imports.data() + num_ent_start;
328 const char*
const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
329 const char*
const vals_in = imports.data() + vals_start + shift * bytes_per_value;
333 if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
336 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
337 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
338 (
int) lid_no, (
int) num_entries_in_row, (
int) num_ent_out
340 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
343 constexpr
bool matrix_has_sorted_rows =
true;
344 Kokkos::parallel_for(
345 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
346 KOKKOS_LAMBDA(
const LO& j)
351 distance = j * bytes_per_gid;
361 distance = j * bytes_per_value;
364 if (combine_mode ==
ADD) {
368 const bool use_atomic_updates = atomic;
369 (void)local_matrix.sumIntoValues(
374 matrix_has_sorted_rows,
377 }
else if (combine_mode ==
REPLACE) {
381 const bool use_atomic_updates =
false;
382 (void)local_matrix.replaceValues(
387 matrix_has_sorted_rows,
393 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
394 "At row %d, an unknown error occurred during unpack\n", (
int) lid_no
396 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
401 team_member.team_barrier();
407 auto error_code_h = Kokkos::create_mirror_view_and_copy(
408 Kokkos::HostSpace(), error_code
410 return error_code_h();
415 struct MaxNumEntTag {};
416 struct TotNumEntTag {};
426 template<
class LO,
class DT,
class BDT>
429 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
430 typedef Kokkos::View<const size_t*, DT> offsets_type;
431 typedef Kokkos::View<const char*, BDT> input_buffer_type;
434 typedef size_t value_type;
437 num_packets_per_lid_type num_packets_per_lid;
438 offsets_type offsets;
439 input_buffer_type imports;
443 const offsets_type& offsets_in,
444 const input_buffer_type& imports_in) :
445 num_packets_per_lid (num_packets_per_lid_in),
446 offsets (offsets_in),
450 KOKKOS_INLINE_FUNCTION
void
451 operator() (
const MaxNumEntTag,
const LO i, value_type& update)
const {
453 const size_t num_bytes = num_packets_per_lid(i);
456 const char*
const in_buf = imports.data () + offsets(i);
458 const size_t num_ent =
static_cast<size_t> (num_ent_LO);
460 update = (update < num_ent) ? num_ent : update;
464 KOKKOS_INLINE_FUNCTION
void
465 join (
const MaxNumEntTag,
466 volatile value_type& dst,
467 const volatile value_type& src)
const
469 if (dst < src) dst = src;
472 KOKKOS_INLINE_FUNCTION
void
473 operator() (
const TotNumEntTag,
const LO i, value_type& tot_num_ent)
const {
475 const size_t num_bytes = num_packets_per_lid(i);
478 const char*
const in_buf = imports.data () + offsets(i);
480 tot_num_ent +=
static_cast<size_t> (num_ent_LO);
492 template<
class LO,
class DT,
class BDT>
494 compute_maximum_num_entries (
495 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
496 const Kokkos::View<const size_t*, DT>& offsets,
497 const Kokkos::View<const char*, BDT>& imports)
499 typedef typename DT::execution_space XS;
500 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
501 MaxNumEntTag> range_policy;
505 const LO numRowsToUnpack =
506 static_cast<LO
> (num_packets_per_lid.extent (0));
507 size_t max_num_ent = 0;
508 Kokkos::parallel_reduce (
"Max num entries in CRS",
509 range_policy (0, numRowsToUnpack),
510 functor, max_num_ent);
521 template<
class LO,
class DT,
class BDT>
523 compute_total_num_entries (
524 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
525 const Kokkos::View<const size_t*, DT>& offsets,
526 const Kokkos::View<const char*, BDT>& imports)
528 typedef typename DT::execution_space XS;
529 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
530 size_t tot_num_ent = 0;
531 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
533 const LO numRowsToUnpack =
534 static_cast<LO
> (num_packets_per_lid.extent (0));
535 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
536 range_policy (0, numRowsToUnpack),
537 functor, tot_num_ent);
542 KOKKOS_INLINE_FUNCTION
544 unpackRowCount(
const char imports[],
546 const size_t num_bytes)
548 using PT = PackTraits<LO>;
552 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
553 if (p_num_bytes > num_bytes) {
554 return OrdinalTraits<size_t>::invalid();
556 const char*
const in_buf = imports + offset;
557 (void) PT::unpackValue(num_ent_LO, in_buf);
559 return static_cast<size_t>(num_ent_LO);
566 template<
class View1,
class View2>
570 const View1& batches_per_lid,
574 using LO =
typename View2::value_type;
576 for (
size_t i=0; i<batches_per_lid.extent(0); i++)
578 for (
size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
580 batch_info(batch, 0) =
static_cast<LO
>(i);
581 batch_info(batch, 1) = batch_no;
585 return batch == batch_info.extent(0);
595 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
597 unpackAndCombineIntoCrsMatrix(
598 const LocalMatrix& local_matrix,
599 const LocalMap& local_map,
600 const Kokkos::View<const char*, BufferDeviceType>& imports,
601 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
602 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
605 using ST =
typename LocalMatrix::value_type;
606 using LO =
typename LocalMap::local_ordinal_type;
607 using DT =
typename LocalMap::device_type;
608 using XS =
typename DT::execution_space;
609 const char prefix[] =
610 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
611 "unpackAndCombineIntoCrsMatrix: ";
613 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
614 if (num_import_lids == 0) {
621 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
ABSMAX,
622 std::invalid_argument,
623 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
624 "static graph (i.e., was constructed with the CrsMatrix constructor "
625 "that takes a const CrsGraph pointer).");
627 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
628 std::invalid_argument,
629 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
630 "(i.e., was constructed with the CrsMatrix constructor that takes a "
631 "const CrsGraph pointer).");
634 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode ==
ADD || combine_mode ==
REPLACE),
635 std::invalid_argument,
636 prefix <<
"Invalid combine mode; should never get "
637 "here! Please report this bug to the Tpetra developers.");
640 bool bad_num_import_lids =
641 num_import_lids !=
static_cast<size_t>(num_packets_per_lid.extent(0));
642 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
643 std::invalid_argument,
644 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != "
645 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
649 Kokkos::View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
653 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
655 const size_t batch_size = std::min(default_batch_size, max_num_ent);
658 size_t num_batches = 0;
659 Kokkos::View<LO*[2], DT> batch_info(
"", num_batches);
660 Kokkos::View<size_t*, DT> batches_per_lid(
"", num_import_lids);
662 Kokkos::parallel_reduce(
663 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
664 KOKKOS_LAMBDA(
const size_t i,
size_t& batches)
666 const size_t num_entries_in_row = unpackRowCount<LO>(
667 imports.data(), offsets(i), num_packets_per_lid(i)
670 (num_entries_in_row <= batch_size) ?
672 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
673 batches += batches_per_lid(i);
677 Kokkos::resize(batch_info, num_batches);
679 Kokkos::HostSpace host_space;
680 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
683 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
685 (void) compute_batch_info(batches_per_lid_h, batch_info_h);
693 const bool atomic = XS::concurrency() != 1;
694 using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
709 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
711 #if defined(KOKKOS_ENABLE_CUDA)
712 constexpr
bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
714 constexpr
bool is_cuda =
false;
716 if (!is_cuda || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
718 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
722 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
725 auto error_code = f.error();
726 TEUCHOS_TEST_FOR_EXCEPTION(
729 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
733 template<
class LocalMatrix,
class BufferDeviceType>
736 const LocalMatrix& local_matrix,
737 const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
738 const Kokkos::View<const char*, BufferDeviceType>& imports,
739 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
740 const size_t num_same_ids)
742 using Kokkos::parallel_reduce;
743 typedef typename LocalMatrix::ordinal_type LO;
744 typedef typename LocalMatrix::device_type device_type;
745 typedef typename device_type::execution_space XS;
746 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
747 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
748 typedef BufferDeviceType BDT;
754 num_items =
static_cast<LO
>(num_same_ids);
757 parallel_reduce(range_policy(0, num_items),
758 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
759 update +=
static_cast<size_t>(local_matrix.graph.row_map[lid+1]
760 -local_matrix.graph.row_map[lid]);
766 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
769 parallel_reduce(range_policy(0, num_items),
770 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
771 const LO lid = permute_from_lids(i);
772 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
773 - local_matrix.graph.row_map[lid]);
780 const size_type np = num_packets_per_lid.extent(0);
781 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
784 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
792 template<
class LO,
class DT,
class BDT>
794 setupRowPointersForRemotes(
795 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
797 const Kokkos::View<const char*, BDT>& imports,
798 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
799 const typename PackTraits<size_t>::input_array_type& offsets)
801 using Kokkos::parallel_reduce;
802 typedef typename DT::execution_space XS;
803 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
804 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
806 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
807 const size_type N = num_packets_per_lid.extent(0);
810 parallel_reduce (
"Setup row pointers for remotes",
812 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
813 typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
814 const size_t num_bytes = num_packets_per_lid(i);
815 const size_t offset = offsets(i);
816 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
817 if (num_ent == InvalidNum) {
820 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
828 makeCrsRowPtrFromLengths(
829 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
830 const Kokkos::View<size_t*,DT>& new_start_row)
832 using Kokkos::parallel_scan;
833 typedef typename DT::execution_space XS;
834 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
835 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
836 const size_type N = new_start_row.extent(0);
837 parallel_scan(range_policy(0, N),
838 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
839 auto cur_val = tgt_rowptr(i);
841 tgt_rowptr(i) = update;
842 new_start_row(i) = tgt_rowptr(i);
849 template<
class LocalMatrix,
class LocalMap>
852 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
854 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
855 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
856 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
858 const LocalMatrix& local_matrix,
859 const LocalMap& local_col_map,
860 const size_t num_same_ids,
863 using Kokkos::parallel_for;
864 typedef typename LocalMap::device_type DT;
865 typedef typename LocalMap::local_ordinal_type LO;
866 typedef typename DT::execution_space XS;
867 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
869 parallel_for(range_policy(0, num_same_ids),
870 KOKKOS_LAMBDA(
const size_t i) {
871 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
873 const LO src_lid =
static_cast<LO
>(i);
874 size_t src_row = local_matrix.graph.row_map(src_lid);
876 const LO tgt_lid =
static_cast<LO
>(i);
877 const size_t tgt_row = tgt_rowptr(tgt_lid);
879 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
880 - local_matrix.graph.row_map(src_lid);
881 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
883 for (
size_t j=local_matrix.graph.row_map(src_lid);
884 j<local_matrix.graph.row_map(src_lid+1); ++j) {
885 LO src_col = local_matrix.graph.entries(j);
886 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
887 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
888 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
894 template<
class LocalMatrix,
class LocalMap>
896 copyDataFromPermuteIDs(
897 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
899 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
900 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
901 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
903 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
904 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
905 const LocalMatrix& local_matrix,
906 const LocalMap& local_col_map,
909 using Kokkos::parallel_for;
910 typedef typename LocalMap::device_type DT;
911 typedef typename LocalMap::local_ordinal_type LO;
912 typedef typename DT::execution_space XS;
913 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
914 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
916 const size_type num_permute_to_lids = permute_to_lids.extent(0);
918 parallel_for(range_policy(0, num_permute_to_lids),
919 KOKKOS_LAMBDA(
const size_t i) {
920 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
922 const LO src_lid = permute_from_lids(i);
923 const size_t src_row = local_matrix.graph.row_map(src_lid);
925 const LO tgt_lid = permute_to_lids(i);
926 const size_t tgt_row = tgt_rowptr(tgt_lid);
928 size_t nsr = local_matrix.graph.row_map(src_lid+1)
929 - local_matrix.graph.row_map(src_lid);
930 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
932 for (
size_t j=local_matrix.graph.row_map(src_lid);
933 j<local_matrix.graph.row_map(src_lid+1); ++j) {
934 LO src_col = local_matrix.graph.entries(j);
935 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
936 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
937 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
943 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
945 unpackAndCombineIntoCrsArrays2(
946 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
948 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
949 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
950 const typename PackTraits<size_t>::input_array_type& offsets,
951 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
952 const Kokkos::View<const char*, BufferDeviceType>& imports,
953 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
957 const size_t bytes_per_value)
960 using Kokkos::subview;
961 using Kokkos::MemoryUnmanaged;
962 using Kokkos::parallel_reduce;
963 using Kokkos::atomic_fetch_add;
965 typedef typename LocalMap::device_type DT;
966 typedef typename LocalMap::local_ordinal_type LO;
967 typedef typename LocalMap::global_ordinal_type GO;
968 typedef typename LocalMatrix::value_type ST;
969 typedef typename DT::execution_space XS;
970 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
971 typedef typename Kokkos::pair<size_type, size_type> slice;
972 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
974 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
975 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
976 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
978 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
981 const size_type num_import_lids = import_lids.size();
984 parallel_reduce (
"Unpack and combine into CRS",
985 range_policy (0, num_import_lids),
986 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
987 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
988 const size_t num_bytes = num_packets_per_lid(i);
989 const size_t offset = offsets(i);
990 if (num_bytes == 0) {
994 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
995 if (num_ent == InvalidNum) {
999 const LO lcl_row = import_lids(i);
1000 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1001 const size_t end_row = start_row + num_ent;
1003 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1004 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1005 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1007 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1008 imports.data(), offset, num_bytes,
1009 num_ent, bytes_per_value);
1012 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1013 const int pid = pids_out(j);
1014 pids_out(j) = (pid != my_pid) ? pid : -1;
1021 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1024 const LocalMatrix & local_matrix,
1025 const LocalMap & local_col_map,
1026 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
1027 const Kokkos::View<const char*, BufferDeviceType>& imports,
1028 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1029 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1030 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1031 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1032 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1033 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1036 const size_t num_same_ids,
1037 const size_t tgt_num_rows,
1038 const size_t tgt_num_nonzeros,
1039 const int my_tgt_pid,
1040 const size_t bytes_per_value)
1043 using Kokkos::subview;
1044 using Kokkos::parallel_for;
1045 using Kokkos::MemoryUnmanaged;
1047 typedef typename LocalMap::device_type DT;
1048 typedef typename LocalMap::local_ordinal_type LO;
1049 typedef typename DT::execution_space XS;
1050 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1051 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1052 typedef BufferDeviceType BDT;
1054 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
1056 const size_t N = tgt_num_rows;
1060 const int my_pid = my_tgt_pid;
1063 parallel_for(range_policy(0, N+1),
1064 KOKKOS_LAMBDA(
const size_t i) {
1070 parallel_for(range_policy(0, num_same_ids),
1071 KOKKOS_LAMBDA(
const size_t i) {
1072 const LO tgt_lid =
static_cast<LO
>(i);
1073 const LO src_lid =
static_cast<LO
>(i);
1074 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1075 - local_matrix.graph.row_map(src_lid);
1080 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1081 parallel_for(range_policy(0, num_permute_to_lids),
1082 KOKKOS_LAMBDA(
const size_t i) {
1083 const LO tgt_lid = permute_to_lids(i);
1084 const LO src_lid = permute_from_lids(i);
1085 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1086 - local_matrix.graph.row_map(src_lid);
1091 const size_type num_import_lids = import_lids.extent(0);
1092 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
1095 #ifdef HAVE_TPETRA_DEBUG
1097 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1098 const bool condition =
1099 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1100 TEUCHOS_TEST_FOR_EXCEPTION
1101 (condition, std::logic_error, prefix
1102 <<
"The final offset in bytes " << nth_offset_h
1103 <<
" != imports.size() = " << imports.extent(0)
1104 <<
". Please report this bug to the Tpetra developers.");
1106 #endif // HAVE_TPETRA_DEBUG
1110 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1111 import_lids, imports, num_packets_per_lid, offsets);
1112 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1113 <<
" Error transferring data to target row pointers. "
1114 "Please report this bug to the Tpetra developers.");
1118 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1121 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1124 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1125 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1127 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1128 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1129 local_matrix, local_col_map, my_pid);
1131 if (imports.extent(0) <= 0) {
1135 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1136 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1137 local_matrix, local_col_map, my_pid, bytes_per_value);
1138 TEUCHOS_TEST_FOR_EXCEPTION(
1139 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1140 "should never happen. Please report this bug to the Tpetra developers.");
1186 template<
typename ST,
typename LO,
typename GO,
typename Node>
1190 const Teuchos::ArrayView<const char>& imports,
1191 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1192 const Teuchos::ArrayView<const LO>& importLIDs,
1198 typedef typename Node::device_type device_type;
1200 static_assert (std::is_same<device_type, typename local_matrix_type::device_type>::value,
1201 "Node::device_type and LocalMatrix::device_type must be the same.");
1204 typedef typename device_type::execution_space XS;
1207 typename XS::device_type outputDevice;
1212 auto num_packets_per_lid_d =
1214 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1216 auto import_lids_d =
1218 importLIDs.size(),
true,
"import_lids");
1222 imports.size(),
true,
"imports");
1225 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1227 for (
int i=0; i<importLIDs.size(); i++)
1229 auto lclRow = importLIDs[i];
1230 Teuchos::ArrayView<const LO> A_indices;
1231 Teuchos::ArrayView<const ST> A_values;
1235 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1236 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1237 import_lids_d, combineMode);
1241 template<
typename ST,
typename LO,
typename GO,
typename NT>
1243 unpackCrsMatrixAndCombineNew(
1245 Kokkos::DualView<
char*,
1247 Kokkos::DualView<
size_t*,
1249 const Kokkos::DualView<
const LO*,
1258 using device_type =
typename crs_matrix_type::device_type;
1259 using local_matrix_type =
typename crs_matrix_type::local_matrix_type;
1260 using buffer_device_type =
typename dist_object_type::buffer_device_type;
1263 (std::is_same<device_type, typename local_matrix_type::device_type>::value,
1264 "crs_matrix_type::device_type and local_matrix_type::device_type "
1265 "must be the same.");
1267 if (numPacketsPerLID.need_sync_device()) {
1268 numPacketsPerLID.sync_device ();
1270 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1272 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1273 auto import_lids_d = importLIDs.view_device ();
1275 if (imports.need_sync_device()) {
1276 imports.sync_device ();
1278 auto imports_d = imports.view_device ();
1281 auto local_col_map = sourceMatrix.
getColMap ()->getLocalMap ();
1282 typedef decltype (local_col_map) local_map_type;
1284 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1288 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1289 import_lids_d, combineMode);
1347 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1350 const
CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1351 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1352 const Teuchos::ArrayView<const
char> &imports,
1353 const Teuchos::ArrayView<const
size_t>& numPacketsPerLID,
1358 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1359 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1361 using Kokkos::MemoryUnmanaged;
1363 typedef typename Node::device_type DT;
1365 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
1367 TEUCHOS_TEST_FOR_EXCEPTION
1368 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1369 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size () <<
" != "
1370 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1373 const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1374 TEUCHOS_TEST_FOR_EXCEPTION
1375 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1376 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1377 TEUCHOS_TEST_FOR_EXCEPTION
1378 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1379 prefix <<
"importLIDs.size() = " << importLIDs.size () <<
" != "
1380 "numPacketsPerLID.size() = " << numPacketsPerLID.size () <<
".");
1382 auto local_matrix = sourceMatrix.getLocalMatrix ();
1383 auto permute_from_lids_d =
1385 permuteFromLIDs.getRawPtr (),
1386 permuteFromLIDs.size (),
true,
1387 "permute_from_lids");
1390 imports.getRawPtr (),
1391 imports.size (),
true,
1393 auto num_packets_per_lid_d =
1395 numPacketsPerLID.getRawPtr (),
1396 numPacketsPerLID.size (),
true,
1397 "num_packets_per_lid");
1400 local_matrix, permute_from_lids_d, imports_d,
1401 num_packets_per_lid_d, numSameIDs);
1418 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1422 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1423 const Teuchos::ArrayView<const char>& imports,
1424 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1428 const size_t numSameIDs,
1429 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1430 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1431 size_t TargetNumRows,
1432 size_t TargetNumNonzeros,
1433 const int MyTargetPID,
1434 const Teuchos::ArrayView<size_t>& CRS_rowptr,
1435 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1437 const Teuchos::ArrayView<const int>& SourcePids,
1438 Teuchos::Array<int>& TargetPids)
1445 using Teuchos::ArrayView;
1446 using Teuchos::outArg;
1447 using Teuchos::REDUCE_MAX;
1448 using Teuchos::reduceAll;
1450 typedef LocalOrdinal LO;
1452 typedef typename Node::device_type DT;
1453 typedef typename DT::execution_space XS;
1456 typedef typename matrix_type::impl_scalar_type ST;
1457 typedef typename ArrayView<const LO>::size_type size_type;
1459 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1461 TEUCHOS_TEST_FOR_EXCEPTION(
1462 TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1463 std::invalid_argument, prefix <<
"CRS_rowptr.size() = " <<
1464 CRS_rowptr.size () <<
"!= TargetNumRows+1 = " << TargetNumRows+1 <<
".");
1466 TEUCHOS_TEST_FOR_EXCEPTION(
1467 permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1468 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size ()
1469 <<
"!= permuteFromLIDs.size() = " << permuteFromLIDs.size () <<
".");
1470 const size_type numImportLIDs = importLIDs.size ();
1472 TEUCHOS_TEST_FOR_EXCEPTION(
1473 numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1474 prefix <<
"importLIDs.size() = " << numImportLIDs <<
" != "
1475 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
1478 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1479 TargetPids.resize (TargetNumNonzeros);
1481 TargetPids.assign (TargetNumNonzeros, -1);
1485 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1488 typename XS::device_type outputDevice;
1489 auto import_lids_d =
1491 importLIDs.size(),
true,
"import_lids");
1495 imports.size(),
true,
"imports");
1497 auto num_packets_per_lid_d =
1499 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1501 auto permute_from_lids_d =
1503 permuteFromLIDs.size(),
true,
"permute_from_lids");
1505 auto permute_to_lids_d =
1507 permuteToLIDs.size(),
true,
"permute_to_lids");
1511 CRS_rowptr.size(),
true,
"crs_rowptr");
1515 CRS_colind.size(),
true,
"crs_colidx");
1517 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1518 static_assert (! std::is_same<
1519 typename std::remove_const<
1520 typename std::decay<
1524 std::complex<double> >::value,
1525 "CRS_vals::value_type is std::complex<double>; this should never happen"
1526 ", since std::complex does not work in Kokkos::View objects.");
1527 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1531 CRS_vals.size(),
true,
"crs_vals");
1533 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1534 static_assert (! std::is_same<
1535 typename decltype (crs_vals_d)::non_const_value_type,
1536 std::complex<double> >::value,
1537 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1538 "never happen, since std::complex does not work in Kokkos::View objects.");
1539 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1543 SourcePids.size(),
true,
"src_pids");
1547 TargetPids.size(),
true,
"tgt_pids");
1549 size_t bytes_per_value = 0;
1563 size_t bytes_per_value_l = 0;
1564 if (local_matrix.values.extent(0) > 0) {
1565 const ST& val = local_matrix.values(0);
1568 const ST& val = crs_vals_d(0);
1571 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1572 Teuchos::REDUCE_MAX,
1574 outArg(bytes_per_value));
1577 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1578 static_assert (! std::is_same<
1579 typename decltype (crs_vals_d)::non_const_value_type,
1580 std::complex<double> >::value,
1581 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1582 "never happen, since std::complex does not work in Kokkos::View objects.");
1583 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1586 local_matrix, local_col_map, import_lids_d, imports_d,
1587 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1588 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1589 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1593 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1594 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1597 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1598 CRS_colind.getRawPtr(), CRS_colind.size());
1601 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1602 CRS_vals.getRawPtr(), CRS_vals.size());
1605 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1606 TargetPids.getRawPtr(), TargetPids.size());
1614 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1616 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1617 const CrsMatrix<ST, LO, GO, NT>&, \
1618 const Teuchos::ArrayView<const char>&, \
1619 const Teuchos::ArrayView<const size_t>&, \
1620 const Teuchos::ArrayView<const LO>&, \
1625 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1626 const CrsMatrix<ST, LO, GO, NT>&, \
1627 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1628 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1629 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1632 const CombineMode); \
1634 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1635 const CrsMatrix<ST, LO, GO, NT> &, \
1636 const Teuchos::ArrayView<const LO>&, \
1637 const Teuchos::ArrayView<const char>&, \
1638 const Teuchos::ArrayView<const size_t>&, \
1641 const CombineMode, \
1643 const Teuchos::ArrayView<const LO>&, \
1644 const Teuchos::ArrayView<const LO>&, \
1648 const Teuchos::ArrayView<size_t>&, \
1649 const Teuchos::ArrayView<GO>&, \
1650 const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1651 const Teuchos::ArrayView<const int>&, \
1652 Teuchos::Array<int>&); \
1654 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1655 const CrsMatrix<ST, LO, GO, NT> &, \
1656 const Teuchos::ArrayView<const LO> &, \
1657 const Teuchos::ArrayView<const char> &, \
1658 const Teuchos::ArrayView<const size_t>&, \
1663 const Teuchos::ArrayView<const LO>&, \
1664 const Teuchos::ArrayView<const LO>&);
1666 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index.
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
"Local" part of Map suitable for Kokkos kernels.
typename Kokkos::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
local_matrix_type getLocalMatrix() const
The local sparse matrix.
int error() const
Host function for getting the error.
void getLocalRowView(LocalOrdinal LocalRow, Teuchos::ArrayView< const LocalOrdinal > &indices, Teuchos::ArrayView< const Scalar > &values) const override
Get a constant, nonpersisting view of a row of this matrix, using local row and column indices...
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
Unpacks and combines a single row of the CrsMatrix.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...