40 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
43 #include "TpetraCore_config.h"
44 #include "Teuchos_Array.hpp"
45 #include "Teuchos_ArrayView.hpp"
46 #include "Teuchos_OrdinalTraits.hpp"
54 #include "Kokkos_Core.hpp"
86 namespace UnpackAndCombineCrsMatrixImpl {
97 template<
class ST,
class LO,
class GO>
99 unpackRow(
const typename PackTraits<GO>::output_array_type& gids_out,
101 const typename PackTraits<ST>::output_array_type& vals_out,
102 const char imports[],
105 const size_t num_ent,
106 const size_t bytes_per_value)
112 bool unpack_pids = pids_out.size() > 0;
114 const size_t num_ent_beg = offset;
117 const size_t gids_beg = num_ent_beg + num_ent_len;
118 const size_t gids_len =
121 const size_t pids_beg = gids_beg + gids_len;
122 const size_t pids_len = unpack_pids ?
126 const size_t vals_beg = gids_beg + gids_len + pids_len;
127 const size_t vals_len = num_ent * bytes_per_value;
129 const char*
const num_ent_in = imports + num_ent_beg;
130 const char*
const gids_in = imports + gids_beg;
131 const char*
const pids_in = unpack_pids ? imports + pids_beg :
nullptr;
132 const char*
const vals_in = imports + vals_beg;
134 size_t num_bytes_out = 0;
137 if (static_cast<size_t> (num_ent_out) != num_ent) {
142 Kokkos::pair<int, size_t> p;
147 num_bytes_out += p.second;
154 num_bytes_out += p.second;
161 num_bytes_out += p.second;
164 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
165 if (num_bytes_out != expected_num_bytes) {
181 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
183 typedef LocalMatrix local_matrix_type;
186 typedef typename local_matrix_type::value_type ST;
190 typedef typename DT::execution_space XS;
192 typedef Kokkos::View<const size_t*, BufferDeviceType>
193 num_packets_per_lid_type;
194 typedef Kokkos::View<const size_t*, DT> offsets_type;
195 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
196 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
198 typedef Kokkos::View<int, DT> error_type;
199 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
201 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
202 "LocalMap::local_ordinal_type and "
203 "LocalMatrix::ordinal_type must be the same.");
205 local_matrix_type local_matrix;
207 input_buffer_type imports;
208 num_packets_per_lid_type num_packets_per_lid;
209 import_lids_type import_lids;
210 Kokkos::View<const LO*[2], DT> batch_info;
211 offsets_type offsets;
214 size_t bytes_per_value;
216 error_type error_code;
219 const local_matrix_type& local_matrix_in,
221 const input_buffer_type& imports_in,
222 const num_packets_per_lid_type& num_packets_per_lid_in,
223 const import_lids_type& import_lids_in,
224 const Kokkos::View<
const LO*[2], DT>& batch_info_in,
225 const offsets_type& offsets_in,
227 const size_t batch_size_in,
228 const size_t bytes_per_value_in,
229 const bool atomic_in) :
230 local_matrix (local_matrix_in),
231 local_col_map (local_col_map_in),
232 imports (imports_in),
233 num_packets_per_lid (num_packets_per_lid_in),
234 import_lids (import_lids_in),
235 batch_info (batch_info_in),
236 offsets (offsets_in),
237 combine_mode (combine_mode_in),
238 batch_size (batch_size_in),
239 bytes_per_value (bytes_per_value_in),
244 KOKKOS_INLINE_FUNCTION
245 void operator()(member_type team_member)
const
248 using Kokkos::subview;
249 using Kokkos::MemoryUnmanaged;
251 const LO batch = team_member.league_rank();
252 const LO lid_no = batch_info(batch, 0);
253 const LO batch_no = batch_info(batch, 1);
255 const size_t num_bytes = num_packets_per_lid(lid_no);
262 const LO import_lid = import_lids(lid_no);
263 const size_t buf_size = imports.size();
264 const size_t offset = offsets(lid_no);
268 const char*
const in_buf = imports.data() + offset;
270 const size_t num_entries_in_row =
static_cast<size_t>(num_ent_LO);
273 size_t expected_num_bytes = 0;
280 if (expected_num_bytes > num_bytes)
283 #ifndef KOKKOS_ENABLE_SYCL
285 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
286 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
287 (
int) lid_no, (
int) expected_num_bytes, (
int) num_bytes
290 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
294 if (offset > buf_size || offset + num_bytes > buf_size)
297 #ifndef KOKKOS_ENABLE_SYCL
299 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
300 "At row %d, the offset (%d) > buffer size (%d)\n",
301 (
int) lid_no, (
int) offset, (
int) buf_size
304 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
309 size_t num_entries_in_batch = 0;
310 if (num_entries_in_row <= batch_size)
311 num_entries_in_batch = num_entries_in_row;
312 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
313 num_entries_in_batch = batch_size;
315 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
318 const size_t num_ent_start = offset;
319 const size_t num_ent_end = num_ent_start + bytes_per_lid;
322 const size_t gids_start = num_ent_end;
323 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
325 const size_t vals_start = gids_end;
327 const size_t shift = batch_no * batch_size;
328 const char*
const num_ent_in = imports.data() + num_ent_start;
329 const char*
const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
330 const char*
const vals_in = imports.data() + vals_start + shift * bytes_per_value;
334 if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
337 #ifndef KOKKOS_ENABLE_SYCL
339 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
340 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
341 (
int) lid_no, (
int) num_entries_in_row, (
int) num_ent_out
344 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
347 constexpr
bool matrix_has_sorted_rows =
true;
350 Kokkos::parallel_for(
351 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
357 distance = j * bytes_per_gid;
367 distance = j * bytes_per_value;
370 if (combine_mode ==
ADD) {
374 const bool use_atomic_updates = atomic;
375 (void)local_matrix.sumIntoValues(
380 matrix_has_sorted_rows,
383 }
else if (combine_mode ==
REPLACE) {
387 const bool use_atomic_updates =
false;
388 (void)local_matrix.replaceValues(
393 matrix_has_sorted_rows,
399 #ifndef KOKKOS_ENABLE_SYCL
401 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
402 "At row %d, an unknown error occurred during unpack\n", (
int) lid_no
405 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
410 team_member.team_barrier();
416 auto error_code_h = Kokkos::create_mirror_view_and_copy(
417 Kokkos::HostSpace(), error_code
419 return error_code_h();
424 struct MaxNumEntTag {};
425 struct TotNumEntTag {};
435 template<
class LO,
class DT,
class BDT>
438 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
439 typedef Kokkos::View<const size_t*, DT> offsets_type;
440 typedef Kokkos::View<const char*, BDT> input_buffer_type;
443 typedef size_t value_type;
446 num_packets_per_lid_type num_packets_per_lid;
447 offsets_type offsets;
448 input_buffer_type imports;
452 const offsets_type& offsets_in,
453 const input_buffer_type& imports_in) :
454 num_packets_per_lid (num_packets_per_lid_in),
455 offsets (offsets_in),
459 KOKKOS_INLINE_FUNCTION
void
460 operator() (
const MaxNumEntTag,
const LO i, value_type& update)
const {
462 const size_t num_bytes = num_packets_per_lid(i);
465 const char*
const in_buf = imports.data () + offsets(i);
467 const size_t num_ent =
static_cast<size_t> (num_ent_LO);
469 update = (update < num_ent) ? num_ent : update;
473 KOKKOS_INLINE_FUNCTION
void
474 join (
const MaxNumEntTag,
476 const value_type& src)
const
478 if (dst < src) dst = src;
481 KOKKOS_INLINE_FUNCTION
void
482 operator() (
const TotNumEntTag,
const LO i, value_type& tot_num_ent)
const {
484 const size_t num_bytes = num_packets_per_lid(i);
487 const char*
const in_buf = imports.data () + offsets(i);
489 tot_num_ent +=
static_cast<size_t> (num_ent_LO);
501 template<
class LO,
class DT,
class BDT>
503 compute_maximum_num_entries (
504 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
505 const Kokkos::View<const size_t*, DT>& offsets,
506 const Kokkos::View<const char*, BDT>& imports)
508 typedef typename DT::execution_space XS;
509 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
510 MaxNumEntTag> range_policy;
514 const LO numRowsToUnpack =
515 static_cast<LO
> (num_packets_per_lid.extent (0));
516 size_t max_num_ent = 0;
517 Kokkos::parallel_reduce (
"Max num entries in CRS",
518 range_policy (0, numRowsToUnpack),
519 functor, max_num_ent);
530 template<
class LO,
class DT,
class BDT>
532 compute_total_num_entries (
533 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
534 const Kokkos::View<const size_t*, DT>& offsets,
535 const Kokkos::View<const char*, BDT>& imports)
537 typedef typename DT::execution_space XS;
538 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
539 size_t tot_num_ent = 0;
540 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
542 const LO numRowsToUnpack =
543 static_cast<LO
> (num_packets_per_lid.extent (0));
544 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
545 range_policy (0, numRowsToUnpack),
546 functor, tot_num_ent);
551 KOKKOS_INLINE_FUNCTION
553 unpackRowCount(
const char imports[],
555 const size_t num_bytes)
557 using PT = PackTraits<LO>;
561 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
562 if (p_num_bytes > num_bytes) {
563 return OrdinalTraits<size_t>::invalid();
565 const char*
const in_buf = imports + offset;
566 (void) PT::unpackValue(num_ent_LO, in_buf);
568 return static_cast<size_t>(num_ent_LO);
575 template<
class View1,
class View2>
579 const View1& batches_per_lid,
583 using LO =
typename View2::value_type;
585 for (
size_t i=0; i<batches_per_lid.extent(0); i++)
587 for (
size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
589 batch_info(batch, 0) =
static_cast<LO
>(i);
590 batch_info(batch, 1) = batch_no;
594 return batch == batch_info.extent(0);
604 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
606 unpackAndCombineIntoCrsMatrix(
607 const LocalMatrix& local_matrix,
608 const LocalMap& local_map,
609 const Kokkos::View<const char*, BufferDeviceType>& imports,
610 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
611 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
614 using ST =
typename LocalMatrix::value_type;
617 using XS =
typename DT::execution_space;
618 const char prefix[] =
619 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
620 "unpackAndCombineIntoCrsMatrix: ";
622 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
623 if (num_import_lids == 0) {
630 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
ABSMAX,
631 std::invalid_argument,
632 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
633 "static graph (i.e., was constructed with the CrsMatrix constructor "
634 "that takes a const CrsGraph pointer).");
636 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
637 std::invalid_argument,
638 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
639 "(i.e., was constructed with the CrsMatrix constructor that takes a "
640 "const CrsGraph pointer).");
643 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode ==
ADD || combine_mode ==
REPLACE),
644 std::invalid_argument,
645 prefix <<
"Invalid combine mode; should never get "
646 "here! Please report this bug to the Tpetra developers.");
649 bool bad_num_import_lids =
650 num_import_lids !=
static_cast<size_t>(num_packets_per_lid.extent(0));
651 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
652 std::invalid_argument,
653 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != "
654 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
658 Kokkos::View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
662 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
664 const size_t batch_size = std::min(default_batch_size, max_num_ent);
667 size_t num_batches = 0;
668 Kokkos::View<LO*[2], DT> batch_info(
"", num_batches);
669 Kokkos::View<size_t*, DT> batches_per_lid(
"", num_import_lids);
671 Kokkos::parallel_reduce(
672 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
673 KOKKOS_LAMBDA(
const size_t i,
size_t& batches)
675 const size_t num_entries_in_row = unpackRowCount<LO>(
676 imports.data(), offsets(i), num_packets_per_lid(i)
679 (num_entries_in_row <= batch_size) ?
681 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
682 batches += batches_per_lid(i);
686 Kokkos::resize(batch_info, num_batches);
688 Kokkos::HostSpace host_space;
689 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
693 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
695 (void) compute_batch_info(batches_per_lid_h, batch_info_h);
704 const bool atomic = XS().concurrency() != 1;
705 using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
720 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
722 #if defined(KOKKOS_ENABLE_CUDA)
723 constexpr
bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
725 constexpr
bool is_cuda =
false;
727 if (!is_cuda || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
729 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
733 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
736 auto error_code = f.error();
737 TEUCHOS_TEST_FOR_EXCEPTION(
740 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
744 template<
class LocalMatrix,
class BufferDeviceType>
747 const LocalMatrix& local_matrix,
748 const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
749 const Kokkos::View<const char*, BufferDeviceType>& imports,
750 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
751 const size_t num_same_ids)
753 using Kokkos::parallel_reduce;
754 typedef typename LocalMatrix::ordinal_type LO;
755 typedef typename LocalMatrix::device_type device_type;
756 typedef typename device_type::execution_space XS;
757 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
758 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
759 typedef BufferDeviceType BDT;
765 num_items =
static_cast<LO
>(num_same_ids);
768 parallel_reduce(range_policy(0, num_items),
769 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
770 update +=
static_cast<size_t>(local_matrix.graph.row_map[lid+1]
771 -local_matrix.graph.row_map[lid]);
777 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
780 parallel_reduce(range_policy(0, num_items),
781 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
782 const LO lid = permute_from_lids(i);
783 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
784 - local_matrix.graph.row_map[lid]);
791 const size_type np = num_packets_per_lid.extent(0);
792 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
795 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
803 template<
class LO,
class DT,
class BDT>
805 setupRowPointersForRemotes(
806 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
808 const Kokkos::View<const char*, BDT>& imports,
809 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
810 const typename PackTraits<size_t>::input_array_type& offsets)
812 using Kokkos::parallel_reduce;
813 typedef typename DT::execution_space XS;
814 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
815 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
817 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
818 const size_type N = num_packets_per_lid.extent(0);
821 parallel_reduce (
"Setup row pointers for remotes",
823 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
824 typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
825 const size_t num_bytes = num_packets_per_lid(i);
826 const size_t offset = offsets(i);
827 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
828 if (num_ent == InvalidNum) {
831 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
839 makeCrsRowPtrFromLengths(
840 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
841 const Kokkos::View<size_t*,DT>& new_start_row)
843 using Kokkos::parallel_scan;
844 typedef typename DT::execution_space XS;
845 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
846 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
847 const size_type N = new_start_row.extent(0);
848 parallel_scan(range_policy(0, N),
849 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
850 auto cur_val = tgt_rowptr(i);
852 tgt_rowptr(i) = update;
853 new_start_row(i) = tgt_rowptr(i);
860 template<
class LocalMatrix,
class LocalMap>
863 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
865 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
866 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
867 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
869 const LocalMatrix& local_matrix,
870 const LocalMap& local_col_map,
871 const size_t num_same_ids,
874 using Kokkos::parallel_for;
877 typedef typename DT::execution_space XS;
878 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
880 parallel_for(range_policy(0, num_same_ids),
881 KOKKOS_LAMBDA(
const size_t i) {
882 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
884 const LO src_lid =
static_cast<LO
>(i);
885 size_t src_row = local_matrix.graph.row_map(src_lid);
887 const LO tgt_lid =
static_cast<LO
>(i);
888 const size_t tgt_row = tgt_rowptr(tgt_lid);
890 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
891 - local_matrix.graph.row_map(src_lid);
892 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
894 for (
size_t j=local_matrix.graph.row_map(src_lid);
895 j<local_matrix.graph.row_map(src_lid+1); ++j) {
896 LO src_col = local_matrix.graph.entries(j);
897 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
898 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
899 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
905 template<
class LocalMatrix,
class LocalMap>
907 copyDataFromPermuteIDs(
908 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
910 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
911 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
912 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
914 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
915 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
916 const LocalMatrix& local_matrix,
917 const LocalMap& local_col_map,
920 using Kokkos::parallel_for;
923 typedef typename DT::execution_space XS;
924 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
925 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
927 const size_type num_permute_to_lids = permute_to_lids.extent(0);
929 parallel_for(range_policy(0, num_permute_to_lids),
930 KOKKOS_LAMBDA(
const size_t i) {
931 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
933 const LO src_lid = permute_from_lids(i);
934 const size_t src_row = local_matrix.graph.row_map(src_lid);
936 const LO tgt_lid = permute_to_lids(i);
937 const size_t tgt_row = tgt_rowptr(tgt_lid);
939 size_t nsr = local_matrix.graph.row_map(src_lid+1)
940 - local_matrix.graph.row_map(src_lid);
941 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
943 for (
size_t j=local_matrix.graph.row_map(src_lid);
944 j<local_matrix.graph.row_map(src_lid+1); ++j) {
945 LO src_col = local_matrix.graph.entries(j);
946 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
947 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
948 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
954 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
956 unpackAndCombineIntoCrsArrays2(
957 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
959 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
960 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
961 const typename PackTraits<size_t>::input_array_type& offsets,
962 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
963 const Kokkos::View<const char*, BufferDeviceType>& imports,
964 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
968 const size_t bytes_per_value)
971 using Kokkos::subview;
972 using Kokkos::MemoryUnmanaged;
973 using Kokkos::parallel_reduce;
974 using Kokkos::atomic_fetch_add;
979 typedef typename LocalMatrix::value_type ST;
980 typedef typename DT::execution_space XS;
981 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
982 typedef typename Kokkos::pair<size_type, size_type> slice;
983 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
985 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
986 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
987 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
989 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
992 const size_type num_import_lids = import_lids.size();
995 parallel_reduce (
"Unpack and combine into CRS",
996 range_policy (0, num_import_lids),
997 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
998 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
999 const size_t num_bytes = num_packets_per_lid(i);
1000 const size_t offset = offsets(i);
1001 if (num_bytes == 0) {
1005 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
1006 if (num_ent == InvalidNum) {
1010 const LO lcl_row = import_lids(i);
1011 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1012 const size_t end_row = start_row + num_ent;
1014 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1015 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1016 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1018 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1019 imports.data(), offset, num_bytes,
1020 num_ent, bytes_per_value);
1023 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1024 const int pid = pids_out(j);
1025 pids_out(j) = (pid != my_pid) ? pid : -1;
1032 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1035 const LocalMatrix & local_matrix,
1036 const LocalMap & local_col_map,
1037 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
1038 const Kokkos::View<const char*, BufferDeviceType>& imports,
1039 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1040 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1041 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1042 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1043 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1044 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1047 const size_t num_same_ids,
1048 const size_t tgt_num_rows,
1049 const size_t tgt_num_nonzeros,
1050 const int my_tgt_pid,
1051 const size_t bytes_per_value)
1054 using Kokkos::subview;
1055 using Kokkos::parallel_for;
1056 using Kokkos::MemoryUnmanaged;
1060 typedef typename DT::execution_space XS;
1061 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1062 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1063 typedef BufferDeviceType BDT;
1065 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
1067 const size_t N = tgt_num_rows;
1071 const int my_pid = my_tgt_pid;
1074 parallel_for(range_policy(0, N+1),
1075 KOKKOS_LAMBDA(
const size_t i) {
1081 parallel_for(range_policy(0, num_same_ids),
1082 KOKKOS_LAMBDA(
const size_t i) {
1083 const LO tgt_lid =
static_cast<LO
>(i);
1084 const LO src_lid =
static_cast<LO
>(i);
1085 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1086 - local_matrix.graph.row_map(src_lid);
1091 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1092 parallel_for(range_policy(0, num_permute_to_lids),
1093 KOKKOS_LAMBDA(
const size_t i) {
1094 const LO tgt_lid = permute_to_lids(i);
1095 const LO src_lid = permute_from_lids(i);
1096 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1097 - local_matrix.graph.row_map(src_lid);
1102 const size_type num_import_lids = import_lids.extent(0);
1103 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
1106 #ifdef HAVE_TPETRA_DEBUG
1108 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1109 const bool condition =
1110 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1111 TEUCHOS_TEST_FOR_EXCEPTION
1112 (condition, std::logic_error, prefix
1113 <<
"The final offset in bytes " << nth_offset_h
1114 <<
" != imports.size() = " << imports.extent(0)
1115 <<
". Please report this bug to the Tpetra developers.");
1117 #endif // HAVE_TPETRA_DEBUG
1121 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1122 import_lids, imports, num_packets_per_lid, offsets);
1123 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1124 <<
" Error transferring data to target row pointers. "
1125 "Please report this bug to the Tpetra developers.");
1129 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1132 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1135 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1136 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1138 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1139 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1140 local_matrix, local_col_map, my_pid);
1142 if (imports.extent(0) <= 0) {
1146 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1147 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1148 local_matrix, local_col_map, my_pid, bytes_per_value);
1149 TEUCHOS_TEST_FOR_EXCEPTION(
1150 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1151 "should never happen. Please report this bug to the Tpetra developers.");
1197 template<
typename ST,
typename LO,
typename GO,
typename Node>
1201 const Teuchos::ArrayView<const char>& imports,
1202 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1203 const Teuchos::ArrayView<const LO>& importLIDs,
1208 typedef typename Node::device_type device_type;
1210 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1211 "Node::device_type and LocalMatrix::device_type must be the same.");
1214 device_type outputDevice;
1219 auto num_packets_per_lid_d =
1221 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1223 auto import_lids_d =
1225 importLIDs.size(),
true,
"import_lids");
1229 imports.size(),
true,
"imports");
1232 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1243 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1244 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1245 import_lids_d, combineMode);
1249 template<
typename ST,
typename LO,
typename GO,
typename NT>
1251 unpackCrsMatrixAndCombineNew(
1253 Kokkos::DualView<
char*,
1255 Kokkos::DualView<
size_t*,
1257 const Kokkos::DualView<
const LO*,
1265 using device_type =
typename crs_matrix_type::device_type;
1266 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
1267 using buffer_device_type =
typename dist_object_type::buffer_device_type;
1270 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1271 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1272 "must be the same.");
1274 if (numPacketsPerLID.need_sync_device()) {
1275 numPacketsPerLID.sync_device ();
1277 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1279 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1280 auto import_lids_d = importLIDs.view_device ();
1282 if (imports.need_sync_device()) {
1283 imports.sync_device ();
1285 auto imports_d = imports.view_device ();
1288 auto local_col_map = sourceMatrix.
getColMap ()->getLocalMap ();
1289 typedef decltype (local_col_map) local_map_type;
1291 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1292 local_matrix_device_type,
1295 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1296 import_lids_d, combineMode);
1354 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1357 const
CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1358 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1359 const Teuchos::ArrayView<const
char> &imports,
1360 const Teuchos::ArrayView<const
size_t>& numPacketsPerLID,
1364 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1365 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1367 using Kokkos::MemoryUnmanaged;
1369 typedef typename Node::device_type DT;
1371 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
1373 TEUCHOS_TEST_FOR_EXCEPTION
1374 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1375 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size () <<
" != "
1376 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1379 const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1380 TEUCHOS_TEST_FOR_EXCEPTION
1381 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1382 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1383 TEUCHOS_TEST_FOR_EXCEPTION
1384 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1385 prefix <<
"importLIDs.size() = " << importLIDs.size () <<
" != "
1386 "numPacketsPerLID.size() = " << numPacketsPerLID.size () <<
".");
1388 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1389 auto permute_from_lids_d =
1391 permuteFromLIDs.getRawPtr (),
1392 permuteFromLIDs.size (),
true,
1393 "permute_from_lids");
1396 imports.getRawPtr (),
1397 imports.size (),
true,
1399 auto num_packets_per_lid_d =
1401 numPacketsPerLID.getRawPtr (),
1402 numPacketsPerLID.size (),
true,
1403 "num_packets_per_lid");
1406 local_matrix, permute_from_lids_d, imports_d,
1407 num_packets_per_lid_d, numSameIDs);
1424 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1428 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1429 const Teuchos::ArrayView<const char>& imports,
1430 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1433 const size_t numSameIDs,
1434 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1435 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1436 size_t TargetNumRows,
1437 size_t TargetNumNonzeros,
1438 const int MyTargetPID,
1439 const Teuchos::ArrayView<size_t>& CRS_rowptr,
1440 const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1442 const Teuchos::ArrayView<const int>& SourcePids,
1443 Teuchos::Array<int>& TargetPids)
1445 using execution_space =
typename Node::execution_space;
1451 using Teuchos::ArrayView;
1452 using Teuchos::outArg;
1453 using Teuchos::REDUCE_MAX;
1454 using Teuchos::reduceAll;
1456 typedef LocalOrdinal LO;
1458 typedef typename Node::device_type DT;
1461 typedef typename matrix_type::impl_scalar_type ST;
1462 typedef typename ArrayView<const LO>::size_type size_type;
1464 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1466 TEUCHOS_TEST_FOR_EXCEPTION(
1467 TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1468 std::invalid_argument, prefix <<
"CRS_rowptr.size() = " <<
1469 CRS_rowptr.size () <<
"!= TargetNumRows+1 = " << TargetNumRows+1 <<
".");
1471 TEUCHOS_TEST_FOR_EXCEPTION(
1472 permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1473 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size ()
1474 <<
"!= permuteFromLIDs.size() = " << permuteFromLIDs.size () <<
".");
1475 const size_type numImportLIDs = importLIDs.size ();
1477 TEUCHOS_TEST_FOR_EXCEPTION(
1478 numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1479 prefix <<
"importLIDs.size() = " << numImportLIDs <<
" != "
1480 "numPacketsPerLID.size() = " << numPacketsPerLID.size() <<
".");
1483 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1484 TargetPids.resize (TargetNumNonzeros);
1486 TargetPids.assign (TargetNumNonzeros, -1);
1490 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1494 auto import_lids_d =
1496 importLIDs.size(),
true,
"import_lids");
1500 imports.size(),
true,
"imports");
1502 auto num_packets_per_lid_d =
1504 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1506 auto permute_from_lids_d =
1508 permuteFromLIDs.size(),
true,
"permute_from_lids");
1510 auto permute_to_lids_d =
1512 permuteToLIDs.size(),
true,
"permute_to_lids");
1516 CRS_rowptr.size(),
true,
"crs_rowptr");
1520 CRS_colind.size(),
true,
"crs_colidx");
1522 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1523 static_assert (! std::is_same<
1524 typename std::remove_const<
1525 typename std::decay<
1529 std::complex<double> >::value,
1530 "CRS_vals::value_type is std::complex<double>; this should never happen"
1531 ", since std::complex does not work in Kokkos::View objects.");
1532 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1536 CRS_vals.size(),
true,
"crs_vals");
1538 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1539 static_assert (! std::is_same<
1540 typename decltype (crs_vals_d)::non_const_value_type,
1541 std::complex<double> >::value,
1542 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1543 "never happen, since std::complex does not work in Kokkos::View objects.");
1544 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1548 SourcePids.size(),
true,
"src_pids");
1552 TargetPids.size(),
true,
"tgt_pids");
1554 size_t bytes_per_value = 0;
1568 size_t bytes_per_value_l = 0;
1569 if (local_matrix.values.extent(0) > 0) {
1570 const ST& val = local_matrix.values(0);
1573 const ST& val = crs_vals_d(0);
1576 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1577 Teuchos::REDUCE_MAX,
1579 outArg(bytes_per_value));
1582 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1583 static_assert (! std::is_same<
1584 typename decltype (crs_vals_d)::non_const_value_type,
1585 std::complex<double> >::value,
1586 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1587 "never happen, since std::complex does not work in Kokkos::View objects.");
1588 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1591 local_matrix, local_col_map, import_lids_d, imports_d,
1592 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1593 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1594 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1598 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1599 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1601 deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1603 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1604 CRS_colind.getRawPtr(), CRS_colind.size());
1606 deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1608 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1609 CRS_vals.getRawPtr(), CRS_vals.size());
1611 deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1613 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1614 TargetPids.getRawPtr(), TargetPids.size());
1616 deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1622 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1624 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1625 const CrsMatrix<ST, LO, GO, NT>&, \
1626 const Teuchos::ArrayView<const char>&, \
1627 const Teuchos::ArrayView<const size_t>&, \
1628 const Teuchos::ArrayView<const LO>&, \
1632 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1633 const CrsMatrix<ST, LO, GO, NT>&, \
1634 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1635 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1636 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1638 const CombineMode); \
1640 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1641 const CrsMatrix<ST, LO, GO, NT> &, \
1642 const Teuchos::ArrayView<const LO>&, \
1643 const Teuchos::ArrayView<const char>&, \
1644 const Teuchos::ArrayView<const size_t>&, \
1646 const CombineMode, \
1648 const Teuchos::ArrayView<const LO>&, \
1649 const Teuchos::ArrayView<const LO>&, \
1653 const Teuchos::ArrayView<size_t>&, \
1654 const Teuchos::ArrayView<GO>&, \
1655 const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1656 const Teuchos::ArrayView<const int>&, \
1657 Teuchos::Array<int>&); \
1659 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1660 const CrsMatrix<ST, LO, GO, NT> &, \
1661 const Teuchos::ArrayView<const LO> &, \
1662 const Teuchos::ArrayView<const char> &, \
1663 const Teuchos::ArrayView<const size_t>&, \
1667 const Teuchos::ArrayView<const LO>&, \
1668 const Teuchos::ArrayView<const LO>&);
1670 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
"Local" part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
int error() const
Host function for getting the error.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
Unpacks and combines a single row of the CrsMatrix.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...