10 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
15 #include "TpetraCore_config.h"
16 #include "Kokkos_Core.hpp"
17 #include "Teuchos_Array.hpp"
18 #include "Teuchos_ArrayView.hpp"
19 #include "Teuchos_OrdinalTraits.hpp"
20 #include "Teuchos_TimeMonitor.hpp"
28 #include "Tpetra_Details_DefaultTypes.hpp"
59 namespace UnpackAndCombineCrsMatrixImpl {
// Unpacks one packed row from the byte buffer `imports`, writing column
// GIDs to gids_out, values to vals_out and, when pids_out is non-empty,
// process IDs to pids_out.
// Byte layout implied by the offset arithmetic below, starting at `offset`:
//   [ num_ent | gids | pids (optional) | vals ]
70 template <
class ST,
class LO,
class GO>
72 unpackRow(
const typename PackTraits<GO>::output_array_type& gids_out,
74 const typename PackTraits<ST>::output_array_type& vals_out,
79 const size_t bytes_per_value) {
// PIDs are read only when the caller passed a non-empty PID output view.
84 bool unpack_pids = pids_out.size() > 0;
// Section offsets: each section begins where the previous one ends.
86 const size_t num_ent_beg = offset;
89 const size_t gids_beg = num_ent_beg + num_ent_len;
90 const size_t gids_len =
93 const size_t pids_beg = gids_beg + gids_len;
// Values come after the GID section and the PID section (pids_len bytes).
96 const size_t vals_beg = gids_beg + gids_len + pids_len;
97 const size_t vals_len = num_ent * bytes_per_value;
// Raw pointers to the start of each section; pids_in is null when PIDs
// are not being unpacked.
99 const char*
const num_ent_in = imports + num_ent_beg;
100 const char*
const gids_in = imports + gids_beg;
101 const char*
const pids_in = unpack_pids ? imports + pids_beg :
nullptr;
102 const char*
const vals_in = imports + vals_beg;
// Running total of bytes consumed by the unpack steps.
104 size_t num_bytes_out = 0;
// Consistency check: num_ent_out (presumably the count read from
// num_ent_in -- confirm) must agree with the caller-supplied num_ent.
107 if (static_cast<size_t>(num_ent_out) != num_ent) {
112 Kokkos::pair<int, size_t> p;
// After each unpack step, p.second is added to the consumed-byte total.
117 num_bytes_out += p.second;
124 num_bytes_out += p.second;
131 num_bytes_out += p.second;
// The consumed total must equal the sum of all section lengths.
134 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
135 if (num_bytes_out != expected_num_bytes) {
// Team-parallel functor that unpacks packed matrix rows from `imports` and
// combines them into `local_matrix` (ADD or REPLACE). Each team handles one
// (row, batch) pair taken from `batch_info`. Failures are recorded in
// `error_code` (codes 21, 22, 23 and 31 are visible below).
151 template <
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
153 typedef LocalMatrix local_matrix_type;
156 typedef typename local_matrix_type::value_type ST;
160 typedef typename DT::execution_space XS;
// View types: the packed buffer, per-LID byte counts and import LIDs live
// on BufferDeviceType; offsets and the error flag live on DT.
162 typedef Kokkos::View<const size_t*, BufferDeviceType>
163 num_packets_per_lid_type;
164 typedef Kokkos::View<const size_t*, DT> offsets_type;
165 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
166 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
168 typedef Kokkos::View<int, DT> error_type;
169 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
// The map's local ordinal type must match the matrix's ordinal type.
171 static_assert(std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
172 "LocalMap::local_ordinal_type and "
173 "LocalMatrix::ordinal_type must be the same.");
// State captured by value for use inside the kernel.
175 local_matrix_type local_matrix;
177 input_buffer_type imports;
178 num_packets_per_lid_type num_packets_per_lid;
179 import_lids_type import_lids;
// batch_info(b, 0) is the index into the import arrays and batch_info(b, 1)
// is the batch number within that row (see operator() below).
180 Kokkos::View<const LO* [2], DT> batch_info;
181 offsets_type offsets;
184 size_t bytes_per_value;
186 error_type error_code;
// Constructor: stores the inputs and allocates the error flag View
// labelled "error".
189 const local_matrix_type& local_matrix_in,
191 const input_buffer_type& imports_in,
192 const num_packets_per_lid_type& num_packets_per_lid_in,
193 const import_lids_type& import_lids_in,
194 const Kokkos::View<
const LO* [2], DT>& batch_info_in,
195 const offsets_type& offsets_in,
197 const size_t batch_size_in,
198 const size_t bytes_per_value_in,
199 const bool atomic_in)
200 : local_matrix(local_matrix_in)
201 , local_col_map(local_col_map_in)
202 , imports(imports_in)
203 , num_packets_per_lid(num_packets_per_lid_in)
204 , import_lids(import_lids_in)
205 , batch_info(batch_info_in)
206 , offsets(offsets_in)
207 , combine_mode(combine_mode_in)
208 , batch_size(batch_size_in)
209 , bytes_per_value(bytes_per_value_in)
211 , error_code(
"error") {}
// Kernel body: one team per entry of batch_info.
213 KOKKOS_INLINE_FUNCTION
214 void operator()(member_type team_member)
const {
215 using Kokkos::MemoryUnmanaged;
216 using Kokkos::subview;
// Map the league rank to (import index, batch number).
219 const LO batch = team_member.league_rank();
220 const LO lid_no = batch_info(batch, 0);
221 const LO batch_no = batch_info(batch, 1);
223 const size_t num_bytes = num_packets_per_lid(lid_no);
230 const LO import_lid = import_lids(lid_no);
231 const size_t buf_size = imports.size();
232 const size_t offset = offsets(lid_no);
236 const char*
const in_buf = imports.data() + offset;
238 const size_t num_entries_in_row =
static_cast<size_t>(num_ent_LO);
// Error 21: the row would need more bytes than were received for it.
// NOTE(review): the message below says "!=" but the test is ">".
241 size_t expected_num_bytes = 0;
248 if (expected_num_bytes > num_bytes) {
250 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
251 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
252 (
int)lid_no, (
int)expected_num_bytes, (
int)num_bytes);
254 Kokkos::atomic_compare_exchange(error_code.data(), 0, 21);
// Error 22: the row's byte range does not fit inside the import buffer.
258 if (offset > buf_size || offset + num_bytes > buf_size) {
260 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
261 "At row %d, the offset (%d) > buffer size (%d)\n",
262 (
int)lid_no, (
int)offset, (
int)buf_size);
264 Kokkos::atomic_compare_exchange(error_code.data(), 0, 22);
// Entries handled by this batch: the whole row if it fits in one batch,
// a full batch if at least (batch_no + 1) * batch_size entries exist,
// otherwise the remainder of the row.
269 size_t num_entries_in_batch = 0;
270 if (num_entries_in_row <= batch_size)
271 num_entries_in_batch = num_entries_in_row;
272 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
273 num_entries_in_batch = batch_size;
275 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
// Byte ranges of the entry-count header, the GIDs and the values.
278 const size_t num_ent_start = offset;
279 const size_t num_ent_end = num_ent_start + bytes_per_lid;
282 const size_t gids_start = num_ent_end;
283 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
285 const size_t vals_start = gids_end;
// Skip the entries that belong to earlier batches of the same row.
287 const size_t shift = batch_no * batch_size;
288 const char*
const num_ent_in = imports.data() + num_ent_start;
289 const char*
const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
290 const char*
const vals_in = imports.data() + vals_start + shift * bytes_per_value;
// Error 23: the two entry counts obtained for this row disagree.
294 if (static_cast<size_t>(num_ent_out) != num_entries_in_row) {
296 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
297 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
298 (
int)lid_no, (
int)num_entries_in_row, (
int)num_ent_out);
300 Kokkos::atomic_compare_exchange(error_code.data(), 0, 23);
// Passed to sumIntoValues / replaceValues below.
303 constexpr
bool matrix_has_sorted_rows =
true;
// Threads of the team split the entries of this batch among themselves.
306 Kokkos::parallel_for(
307 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
308 [=, *
this](
const LO& j) {
// Byte distance of entry j within the GID and value sections.
312 distance = j * bytes_per_gid;
322 distance = j * bytes_per_value;
// ADD uses the functor's `atomic` setting; REPLACE never uses atomics.
325 if (combine_mode ==
ADD) {
329 const bool use_atomic_updates = atomic;
330 (void)local_matrix.sumIntoValues(
335 matrix_has_sorted_rows,
337 }
else if (combine_mode ==
REPLACE) {
341 const bool use_atomic_updates =
false;
342 (void)local_matrix.replaceValues(
347 matrix_has_sorted_rows,
// Error 31: reported with the "unknown error" message.
352 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
353 "At row %d, an unknown error occurred during unpack\n",
355 Kokkos::atomic_compare_exchange(error_code.data(), 0, 31);
359 team_member.team_barrier();
// Copy the error flag to host memory and return its value.
364 auto error_code_h = Kokkos::create_mirror_view_and_copy(
365 Kokkos::HostSpace(), error_code);
366 return error_code_h();
// Empty tag types used to select between the "maximum" and "total"
// operator() overloads that take them as their first parameter.
371 struct MaxNumEntTag {};
372 struct TotNumEntTag {};
// Reduction functor over the packed rows in `imports`. With MaxNumEntTag it
// reduces to the largest per-row entry count; with TotNumEntTag it sums the
// per-row entry counts.
382 template <
class LO,
class DT,
class BDT>
385 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
386 typedef Kokkos::View<const size_t*, DT> offsets_type;
387 typedef Kokkos::View<const char*, BDT> input_buffer_type;
// The reduction result is a size_t.
390 typedef size_t value_type;
393 num_packets_per_lid_type num_packets_per_lid;
394 offsets_type offsets;
395 input_buffer_type imports;
// Constructor: stores the three input Views.
399 const offsets_type& offsets_in,
400 const input_buffer_type& imports_in)
401 : num_packets_per_lid(num_packets_per_lid_in)
402 , offsets(offsets_in)
403 , imports(imports_in) {}
// Max reduction: keep the larger of the running value and row i's count.
405 KOKKOS_INLINE_FUNCTION
void
406 operator()(
const MaxNumEntTag,
const LO i, value_type& update)
const {
408 const size_t num_bytes = num_packets_per_lid(i);
411 const char*
const in_buf = imports.data() + offsets(i);
413 const size_t num_ent =
static_cast<size_t>(num_ent_LO);
415 update = (update < num_ent) ? num_ent : update;
// Join for the max reduction: dst becomes max(dst, src).
419 KOKKOS_INLINE_FUNCTION
void
420 join(
const MaxNumEntTag,
422 const value_type& src)
const {
423 if (dst < src) dst = src;
// Sum reduction: add row i's entry count to the running total.
426 KOKKOS_INLINE_FUNCTION
void
427 operator()(
const TotNumEntTag,
const LO i, value_type& tot_num_ent)
const {
429 const size_t num_bytes = num_packets_per_lid(i);
432 const char*
const in_buf = imports.data() + offsets(i);
434 tot_num_ent +=
static_cast<size_t>(num_ent_LO);
// Runs a parallel reduction over all rows to unpack (one per entry of
// num_packets_per_lid) and stores the result in max_num_ent.
446 template <
class LO,
class DT,
class BDT>
448 compute_maximum_num_entries(
449 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
450 const Kokkos::View<const size_t*, DT>& offsets,
451 const Kokkos::View<const char*, BDT>& imports) {
452 typedef typename DT::execution_space XS;
453 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
// The row count is narrowed from the View extent to LO.
459 const LO numRowsToUnpack =
460 static_cast<LO
>(num_packets_per_lid.extent(0));
461 size_t max_num_ent = 0;
462 Kokkos::parallel_reduce(
"Max num entries in CRS",
463 range_policy(0, numRowsToUnpack),
464 functor, max_num_ent);
// Runs a TotNumEntTag-tagged parallel reduction with NumEntriesFunctor over
// all rows to unpack and stores the result in tot_num_ent.
475 template <
class LO,
class DT,
class BDT>
477 compute_total_num_entries(
478 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
479 const Kokkos::View<const size_t*, DT>& offsets,
480 const Kokkos::View<const char*, BDT>& imports) {
481 typedef typename DT::execution_space XS;
482 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
483 size_t tot_num_ent = 0;
484 NumEntriesFunctor<LO, DT, BDT> functor(num_packets_per_lid, offsets,
// The row count is narrowed from the View extent to LO.
486 const LO numRowsToUnpack =
487 static_cast<LO
>(num_packets_per_lid.extent(0));
488 Kokkos::parallel_reduce(
"Total num entries in CRS to unpack",
489 range_policy(0, numRowsToUnpack),
490 functor, tot_num_ent);
// Reads the entry count of one packed row from imports + offset and
// returns it as size_t. Returns OrdinalTraits<size_t>::invalid() when the
// packed count would need more than num_bytes bytes.
495 KOKKOS_INLINE_FUNCTION
497 unpackRowCount(
const char imports[],
499 const size_t num_bytes) {
500 using PT = PackTraits<LO>;
// Size in bytes of a packed LO; it must fit in the bytes available.
504 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
505 if (p_num_bytes > num_bytes) {
506 return OrdinalTraits<size_t>::invalid();
// The return value of unpackValue is deliberately discarded.
508 const char*
const in_buf = imports + offset;
509 (void)PT::unpackValue(num_ent_LO, in_buf);
511 return static_cast<size_t>(num_ent_LO);
// Fills batch_info with one (row index, batch number) pair for each batch
// of each row: column 0 receives i, column 1 receives batch_no.
// Returns whether `batch` ends up equal to batch_info.extent(0).
// NOTE(review): `batch` is presumably a running counter incremented once
// per pair -- confirm against the full definition.
518 template <
class View1,
class View2>
521 const View1& batches_per_lid,
523 using LO =
typename View2::value_type;
525 for (
size_t i = 0; i < batches_per_lid.extent(0); i++) {
526 for (
size_t batch_no = 0; batch_no < batches_per_lid(i); batch_no++) {
527 batch_info(batch, 0) =
static_cast<LO
>(i);
528 batch_info(batch, 1) = batch_no;
532 return batch == batch_info.extent(0);
// Unpacks the packed rows in `imports` and combines them into
// `local_matrix` by launching UnpackCrsMatrixAndCombineFunctor over a team
// policy with one team per (row, batch) pair.
// Throws std::invalid_argument for the ABSMAX and INSERT combine modes, for
// any mode other than ADD or REPLACE, and when the import-LID and
// packets-per-LID extents differ.
542 template <
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
543 void unpackAndCombineIntoCrsMatrix(
544 const LocalMatrix& local_matrix,
545 const LocalMap& local_map,
546 const Kokkos::View<const char*, BufferDeviceType>& imports,
547 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
548 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
550 using ST =
typename LocalMatrix::value_type;
553 using XS =
typename DT::execution_space;
// Prefix prepended to every exception message raised here.
554 const char prefix[] =
555 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
556 "unpackAndCombineIntoCrsMatrix: ";
// Special-case an empty import list.
558 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
559 if (num_import_lids == 0) {
// Reject combine modes that this routine does not handle.
566 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
ABSMAX,
567 std::invalid_argument,
568 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
569 "static graph (i.e., was constructed with the CrsMatrix constructor "
570 "that takes a const CrsGraph pointer).");
572 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
573 std::invalid_argument,
574 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
575 "(i.e., was constructed with the CrsMatrix constructor that takes a "
576 "const CrsGraph pointer).");
579 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode ==
ADD || combine_mode ==
REPLACE),
580 std::invalid_argument,
581 prefix <<
"Invalid combine mode; should never get "
582 "here! Please report this bug to the Tpetra developers.");
// There must be exactly one byte count per import LID.
585 bool bad_num_import_lids =
586 num_import_lids !=
static_cast<size_t>(num_packets_per_lid.extent(0));
587 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
588 std::invalid_argument,
589 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != "
590 "numPacketsPerLID.size() ("
591 << num_packets_per_lid.extent(0) <<
").");
// One offset per row plus a trailing entry.
595 Kokkos::View<size_t*, DT> offsets(
"offsets", num_import_lids + 1);
// The batch size never exceeds the longest row being unpacked.
599 size_t max_num_ent = compute_maximum_num_entries<LO, DT>(num_packets_per_lid, offsets, imports);
601 const size_t batch_size = std::min(default_batch_size, max_num_ent);
// batch_info is created with zero batches and resized once the count is known.
604 size_t num_batches = 0;
605 Kokkos::View<LO* [2], DT> batch_info(
"", num_batches);
606 Kokkos::View<size_t*, DT> batches_per_lid(
"", num_import_lids);
// Count batches per row: 1 when the row fits in a single batch, otherwise
// num_entries_in_row / batch_size rounded up. The reduction sums them.
608 Kokkos::parallel_reduce(
609 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
610 KOKKOS_LAMBDA(
const size_t i,
size_t& batches) {
611 const size_t num_entries_in_row = unpackRowCount<LO>(
612 imports.data(), offsets(i), num_packets_per_lid(i));
614 (num_entries_in_row <= batch_size) ? 1 : num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
615 batches += batches_per_lid(i);
618 Kokkos::resize(batch_info, num_batches);
// The (row, batch) table is filled through host mirror views.
620 Kokkos::HostSpace host_space;
621 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
625 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
627 (void)compute_batch_info(batches_per_lid_h, batch_info_h);
// Atomic updates are requested unless the execution space reports a
// concurrency of exactly 1.
636 const bool atomic = XS().concurrency() != 1;
637 using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
// Non-GPU execution spaces, or an invalid team_size, let Kokkos choose the
// team size (Kokkos::AUTO); otherwise team_size is used explicitly.
651 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
653 if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid()) {
654 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
656 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
// Fetch the functor's error code and report it via an exception.
659 auto error_code = f.error();
660 TEUCHOS_TEST_FOR_EXCEPTION(
663 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code);
// Counts matrix entries from three sources: the first num_same_ids local
// rows, the local rows listed in permute_from_lids, and the packed rows in
// `imports` (via compute_total_num_entries).
// NOTE(review): how the three counts are combined is not visible in this
// excerpt -- confirm against the full definition.
666 template <
class LocalMatrix,
class BufferDeviceType>
669 const LocalMatrix& local_matrix,
670 const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
671 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
672 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
673 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
675 const Kokkos::View<const char*, BufferDeviceType>& imports,
676 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
678 const size_t num_same_ids) {
679 using Kokkos::parallel_reduce;
680 typedef typename LocalMatrix::ordinal_type LO;
681 typedef typename LocalMatrix::device_type device_type;
682 typedef typename device_type::execution_space XS;
683 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
684 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>> range_policy;
685 typedef BufferDeviceType BDT;
// Same IDs: sum the row lengths of local rows [0, num_same_ids).
691 num_items =
static_cast<LO
>(num_same_ids);
695 range_policy(0, num_items),
696 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
697 update +=
static_cast<size_t>(local_matrix.graph.row_map[lid + 1] - local_matrix.graph.row_map[lid]);
// Permuted IDs: sum the row lengths of the source rows in permute_from_lids.
704 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
708 range_policy(0, num_items),
709 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
710 const LO lid = permute_from_lids(i);
711 update +=
static_cast<size_t>(local_matrix.graph.row_map[lid + 1] - local_matrix.graph.row_map[lid]);
// Remote rows: an offsets View with one entry per packet plus one is
// allocated, then the entries in the import buffer are counted.
719 const size_type np = num_packets_per_lid.extent(0);
720 Kokkos::View<size_t*, device_type> offsets(
"offsets", np + 1);
723 compute_total_num_entries<LO, device_type, BDT>(num_packets_per_lid,
// For every packed remote row, reads its entry count from `imports` and
// atomically adds it to tgt_rowptr at the row's import LID. Returns an int
// (used as an error indicator by the caller).
731 template <
class LO,
class DT,
class BDT>
732 int setupRowPointersForRemotes(
733 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
735 const Kokkos::View<const char*, BDT>& imports,
736 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
737 const typename PackTraits<size_t>::input_array_type& offsets) {
738 using Kokkos::parallel_reduce;
739 typedef typename DT::execution_space XS;
740 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
741 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
// Sentinel that unpackRowCount returns when it cannot read the count.
743 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
744 const size_type N = num_packets_per_lid.extent(0);
748 "Setup row pointers for remotes",
750 KOKKOS_LAMBDA(
const size_t i,
int& k_error) {
// The increment is given the element type of tgt_rowptr.
751 typedef typename std::remove_reference<decltype(tgt_rowptr(0))>::type atomic_incr_type;
752 const size_t num_bytes = num_packets_per_lid(i);
753 const size_t offset = offsets(i);
754 const size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
755 if (num_ent == InvalidNum) {
// Atomic add, so concurrent updates to the same target row are safe.
758 Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
// Parallel scan over tgt_rowptr that overwrites each entry with the scan's
// running value and copies the result into new_start_row.
// NOTE(review): presumably tgt_rowptr holds per-row lengths on entry and
// row start offsets on exit -- confirm against the full definition.
766 void makeCrsRowPtrFromLengths(
767 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
768 const Kokkos::View<size_t*, DT>& new_start_row) {
769 using Kokkos::parallel_scan;
770 typedef typename DT::execution_space XS;
771 typedef typename Kokkos::View<size_t*, DT>::size_type size_type;
772 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
773 const size_type N = new_start_row.extent(0);
776 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
// Read the current entry before it is overwritten with the running value.
777 auto cur_val = tgt_rowptr(i);
779 tgt_rowptr(i) = update;
780 new_start_row(i) = tgt_rowptr(i);
// Copies the first num_same_ids rows of local_matrix into the target CRS
// arrays. Source and target rows share the same local index. Column
// indices are converted to global IDs through local_col_map, and a PID
// equal to my_pid is stored as -1.
786 template <
class LocalMatrix,
class LocalMap>
787 void copyDataFromSameIDs(
788 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
790 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
791 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
792 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
794 const LocalMatrix& local_matrix,
795 const LocalMap& local_col_map,
796 const size_t num_same_ids,
798 using Kokkos::parallel_for;
801 typedef typename DT::execution_space XS;
802 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>> range_policy;
805 range_policy(0, num_same_ids),
806 KOKKOS_LAMBDA(
const size_t i) {
807 typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
// Start of the source row in the source matrix's entry arrays.
809 const LO src_lid =
static_cast<LO
>(i);
810 size_t src_row = local_matrix.graph.row_map(src_lid);
// Start of the target row in the target arrays.
812 const LO tgt_lid =
static_cast<LO
>(i);
813 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Advance the target row's insertion position by the number of entries
// copied from this source row.
815 const size_t nsr = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
816 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Copy each entry; (j - src_row) is the position within the row.
818 for (
size_t j = local_matrix.graph.row_map(src_lid);
819 j < local_matrix.graph.row_map(src_lid + 1); ++j) {
820 LO src_col = local_matrix.graph.entries(j);
821 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
822 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
823 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Copies permuted rows of local_matrix into the target CRS arrays: row
// permute_from_lids(i) of the source goes to row permute_to_lids(i) of the
// target. Column indices are converted to global IDs through
// local_col_map, and a PID equal to my_pid is stored as -1.
828 template <
class LocalMatrix,
class LocalMap>
829 void copyDataFromPermuteIDs(
830 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
832 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
833 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
834 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
836 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
837 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
838 const LocalMatrix& local_matrix,
839 const LocalMap& local_col_map,
841 using Kokkos::parallel_for;
844 typedef typename DT::execution_space XS;
845 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
846 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
// One parallel iteration per permuted row.
848 const size_type num_permute_to_lids = permute_to_lids.extent(0);
851 range_policy(0, num_permute_to_lids),
852 KOKKOS_LAMBDA(
const size_t i) {
853 typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
// Start of the source row in the source matrix's entry arrays.
855 const LO src_lid = permute_from_lids(i);
856 const size_t src_row = local_matrix.graph.row_map(src_lid);
// Start of the target row in the target arrays.
858 const LO tgt_lid = permute_to_lids(i);
859 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Advance the target row's insertion position by the number of entries
// copied from this source row.
861 size_t nsr = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
862 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Copy each entry; (j - src_row) is the position within the row.
864 for (
size_t j = local_matrix.graph.row_map(src_lid);
865 j < local_matrix.graph.row_map(src_lid + 1); ++j) {
866 LO src_col = local_matrix.graph.entries(j);
867 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
868 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
869 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Unpacks every packed remote row from `imports` straight into the target
// CRS arrays. Each row reserves its destination slice by atomically
// advancing new_start_row for its local row, and PIDs equal to my_pid are
// rewritten to -1. Returns an int; the per-row results of unpackRow are
// accumulated into the reduction variable k_error.
874 template <
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
875 int unpackAndCombineIntoCrsArrays2(
876 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
878 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
879 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
880 const typename PackTraits<size_t>::input_array_type& offsets,
881 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
882 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
883 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
884 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
886 const Kokkos::View<const char*, BufferDeviceType>& imports,
887 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
892 const size_t bytes_per_value) {
893 using Kokkos::atomic_fetch_add;
894 using Kokkos::MemoryUnmanaged;
895 using Kokkos::parallel_reduce;
896 using Kokkos::subview;
902 typedef typename LocalMatrix::value_type ST;
903 typedef typename DT::execution_space XS;
904 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
905 typedef typename Kokkos::pair<size_type, size_type> slice;
906 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
// Unmanaged view types used for the per-row output slices.
908 typedef View<int*, DT, MemoryUnmanaged> pids_out_type;
909 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
910 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
// Sentinel that unpackRowCount returns when it cannot read the count.
912 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
915 const size_type num_import_lids = import_lids.size();
919 "Unpack and combine into CRS",
920 range_policy(0, num_import_lids),
921 KOKKOS_LAMBDA(
const size_t i,
int& k_error) {
922 typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
923 const size_t num_bytes = num_packets_per_lid(i);
924 const size_t offset = offsets(i);
// Rows with no bytes are handled separately.
925 if (num_bytes == 0) {
929 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
930 if (num_ent == InvalidNum) {
// Reserve [start_row, end_row) in the target arrays: atomic_fetch_add
// returns the old position and advances it by num_ent.
934 const LO lcl_row = import_lids(i);
935 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
936 const size_t end_row = start_row + num_ent;
// Views onto the reserved slice of each target array.
938 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
939 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
940 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
942 k_error += unpackRow<ST, LO, GO>(gids_out, pids_out, vals_out,
943 imports.data(), offset, num_bytes,
944 num_ent, bytes_per_value);
// Replace entries whose PID equals my_pid with -1.
947 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
948 const int pid = pids_out(j);
949 pids_out(j) = (pid != my_pid) ? pid : -1;
// Fills the target CRS arrays (tgt_rowptr, tgt_colind, tgt_vals, tgt_pids)
// from three sources: the same-ID rows and permuted rows of local_matrix,
// and the packed remote rows in `imports`.
// Visible steps: write row lengths into tgt_rowptr, add the remote row
// counts, build row offsets with makeCrsRowPtrFromLengths, copy the local
// rows, then unpack the remote rows.
// Throws std::logic_error if setting up the row pointers or unpacking
// reports an error.
957 template <
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
959 const LocalMatrix& local_matrix,
960 const LocalMap& local_col_map,
961 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
962 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
963 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
964 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
966 const Kokkos::View<const char*, BufferDeviceType>& imports,
967 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
969 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
970 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
971 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
972 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
973 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
976 const size_t num_same_ids,
977 const size_t tgt_num_rows,
978 const size_t tgt_num_nonzeros,
979 const int my_tgt_pid,
980 const size_t bytes_per_value) {
981 using Kokkos::MemoryUnmanaged;
982 using Kokkos::parallel_for;
983 using Kokkos::subview;
988 typedef typename DT::execution_space XS;
989 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
990 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>> range_policy;
991 typedef BufferDeviceType BDT;
// Prefix prepended to every exception message raised here.
993 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
995 const size_t N = tgt_num_rows;
999 const int my_pid = my_tgt_pid;
// First pass over all N + 1 row-pointer entries.
1003 range_policy(0, N + 1),
1004 KOKKOS_LAMBDA(
const size_t i) {
// Same IDs: store each source row's length at the same target index.
1010 range_policy(0, num_same_ids),
1011 KOKKOS_LAMBDA(
const size_t i) {
1012 const LO tgt_lid =
static_cast<LO
>(i);
1013 const LO src_lid =
static_cast<LO
>(i);
1014 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
// Permuted IDs: store each source row's length at its permuted target index.
1018 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1020 range_policy(0, num_permute_to_lids),
1021 KOKKOS_LAMBDA(
const size_t i) {
1022 const LO tgt_lid = permute_to_lids(i);
1023 const LO src_lid = permute_from_lids(i);
1024 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
// Offsets into `imports`: one per import LID plus a trailing entry.
1028 const size_type num_import_lids = import_lids.extent(0);
1029 View<size_t*, DT> offsets(
"offsets", num_import_lids + 1);
// Debug builds only: the last offset must equal the import buffer length.
1032 #ifdef HAVE_TPETRA_DEBUG
1034 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1035 const bool condition =
1036 nth_offset_h !=
static_cast<size_t>(imports.extent(0));
1037 TEUCHOS_TEST_FOR_EXCEPTION(condition, std::logic_error, prefix <<
"The final offset in bytes " << nth_offset_h <<
" != imports.size() = " << imports.extent(0) <<
". Please report this bug to the Tpetra developers.");
1039 #endif // HAVE_TPETRA_DEBUG
// Add the entry counts of the packed remote rows to tgt_rowptr.
1043 setupRowPointersForRemotes<LO, DT, BDT>(tgt_rowptr,
1044 import_lids, imports, num_packets_per_lid, offsets);
1045 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix <<
" Error transferring data to target row pointers. "
1046 "Please report this bug to the Tpetra developers.");
// new_start_row holds a per-row insertion position that the copy and
// unpack steps advance as they write entries.
1050 View<size_t*, DT> new_start_row(
"new_start_row", N + 1);
1053 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
// Copy locally owned data: same-ID rows first, then permuted rows.
1056 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1057 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1059 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1060 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1061 local_matrix, local_col_map, my_pid);
// Special-case an empty import buffer.
// NOTE(review): if extent() returns an unsigned type, `<= 0` is the same
// test as `== 0`.
1063 if (imports.extent(0) <= 0) {
// Unpack the remote rows into the target arrays.
1067 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1068 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1069 local_matrix, local_col_map, my_pid, bytes_per_value);
1070 TEUCHOS_TEST_FOR_EXCEPTION(
1071 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1072 "should never happen. Please report this bug to the Tpetra developers.");
// Wrapper taking Teuchos::ArrayView inputs: obtains the num_packets_per_lid,
// import_lids and imports data as *_d objects, takes the source matrix's
// local column map, and forwards to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix.
1118 template <
typename ST,
typename LO,
typename GO,
typename Node>
1121 const Teuchos::ArrayView<const char>& imports,
1122 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1123 const Teuchos::ArrayView<const LO>& importLIDs,
1127 typedef typename Node::device_type device_type;
// The node's device type must match the local matrix's device type.
1129 static_assert(std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1130 "Node::device_type and LocalMatrix::device_type must be the same.");
1133 device_type outputDevice;
// NOTE(review): the calls producing the *_d objects are not visible here;
// the `true` argument presumably requests a copy -- confirm.
1138 auto num_packets_per_lid_d =
1140 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1142 auto import_lids_d =
1144 importLIDs.size(),
true,
"import_lids");
1148 imports.size(),
true,
"imports");
1151 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1162 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1163 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1164 import_lids_d, combineMode);
// DualView-based entry point: syncs numPacketsPerLID and imports to the
// device if they need it, takes the device views, and forwards to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix.
// importLIDs must not need a device sync on entry.
1167 template <
typename ST,
typename LO,
typename GO,
typename NT>
1168 void unpackCrsMatrixAndCombineNew(
1170 Kokkos::DualView<
char*,
1173 Kokkos::DualView<
size_t*,
1176 const Kokkos::DualView<
const LO*,
1183 using device_type =
typename crs_matrix_type::device_type;
1184 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
1185 using buffer_device_type =
typename dist_object_type::buffer_device_type;
// The matrix's device type must match the local matrix's device type.
1187 static_assert(std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1188 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1189 "must be the same.");
// Sync the packet counts to the device only when needed.
1191 if (numPacketsPerLID.need_sync_device()) {
1192 numPacketsPerLID.sync_device();
1194 auto num_packets_per_lid_d = numPacketsPerLID.view_device();
// importLIDs is a const DualView and is asserted to need no device sync.
1196 TEUCHOS_ASSERT(!importLIDs.need_sync_device());
1197 auto import_lids_d = importLIDs.view_device();
// Sync the packed import buffer to the device only when needed.
1199 if (imports.need_sync_device()) {
1200 imports.sync_device();
1202 auto imports_d = imports.view_device();
1205 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1206 typedef decltype(local_col_map) local_map_type;
// Template arguments are spelled out explicitly for the implementation call.
1208 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1209 local_matrix_device_type,
1211 buffer_device_type>(local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1212 import_lids_d, combineMode);
// Host-facing wrapper: validates the inputs, obtains Kokkos Views for the
// ArrayView arguments, and passes them with the local matrix and numSameIDs
// to an implementation routine.
// Throws std::invalid_argument if permuteToLIDs and permuteFromLIDs differ
// in size, if sourceMatrix is not locally indexed, or if importLIDs and
// numPacketsPerLID differ in size.
1270 template <typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1273 const
CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& sourceMatrix,
1274 const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1275 const Teuchos::ArrayView<const
char>& imports,
1276 const Teuchos::ArrayView<const
size_t>& numPacketsPerLID,
1280 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1281 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs) {
1282 using Kokkos::MemoryUnmanaged;
1284 typedef typename Node::device_type DT;
// Prefix prepended to every exception message raised here.
1285 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
// The two permutation lists must have the same length.
1287 TEUCHOS_TEST_FOR_EXCEPTION(permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1288 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size() <<
" != "
1289 "permuteFromLIDs.size() = "
1290 << permuteFromLIDs.size() <<
".");
// The source matrix must be locally indexed.
1293 const bool locallyIndexed = sourceMatrix.isLocallyIndexed();
1294 TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix <<
"The input "
1295 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// There must be exactly one packet count per import LID.
1296 TEUCHOS_TEST_FOR_EXCEPTION(importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1297 prefix <<
"importLIDs.size() = " << importLIDs.size() <<
" != "
1298 "numPacketsPerLID.size() = "
1299 << numPacketsPerLID.size() <<
".");
1301 auto local_matrix = sourceMatrix.getLocalMatrixDevice();
// Views below use the node's execution space with the communication-buffer
// memory space.
1303 using kokkos_device_type = Kokkos::Device<
typename Node::device_type::execution_space,
1304 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
// The View type gains two extra `void` template arguments when
// KOKKOS_ENABLE_DEPRECATED_CODE_4 is defined.
1306 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1307 Kokkos::View<LocalOrdinal const*, kokkos_device_type, void, void> permute_from_lids_d =
1309 Kokkos::View<LocalOrdinal const*, kokkos_device_type> permute_from_lids_d =
1312 permuteFromLIDs.getRawPtr(),
1313 permuteFromLIDs.size(),
true,
1314 "permute_from_lids");
1316 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1317 Kokkos::View<const char*, kokkos_device_type, void, void> imports_d =
1319 Kokkos::View<const char*, kokkos_device_type> imports_d =
1322 imports.getRawPtr(),
1323 imports.size(),
true,
1326 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1327 Kokkos::View<const size_t*, kokkos_device_type, void, void> num_packets_per_lid_d =
1329 Kokkos::View<const size_t*, kokkos_device_type> num_packets_per_lid_d =
1332 numPacketsPerLID.getRawPtr(),
1333 numPacketsPerLID.size(),
true,
1334 "num_packets_per_lid");
// Arguments forwarded to the implementation routine.
1337 local_matrix, permute_from_lids_d, imports_d,
1338 num_packets_per_lid_d, numSameIDs);
/// Unpack packed CrsMatrix rows and combine them with sourceMatrix's rows into
/// CRS arrays stored in Kokkos::Views (crs_rowptr_d, crs_colind_d, crs_vals_d),
/// together with a per-entry process-id view (TargetPids).
///
/// Steps visible in this body:
///  1. Check that permute_to_lids_d / permute_from_lids_d and import_lids_d /
///     num_packets_per_lid_d have matching sizes and that the locallyIndexed
///     flag is set; each failed check throws std::invalid_argument.
///  2. Compute TargetNumNonzeros, then resize crs_rowptr_d to
///     TargetNumRows + 1 entries and crs_colind_d / crs_vals_d to
///     TargetNumNonzeros entries. TargetPids is resized only if its size differs.
///  3. Combine the per-process value size with REDUCE_MAX over
///     sourceMatrix.getComm() into bytes_per_value.
///  4. Run the unpack-and-combine step, then overwrite every TargetPids entry
///     that still equals -1 with MyTargetPID.
///
/// Timing code is compiled in only when HAVE_TPETRA_MMM_TIMINGS is defined.
///
/// NOTE(review): exception messages and timer labels use the prefix
/// "unpackAndCombineIntoCrsArrays_new"; confirm that this still matches the
/// function's actual name.
1356 template <
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1359 const Kokkos::View<LocalOrdinal
const*,
1360 Kokkos::Device<
typename Node::device_type::execution_space,
1361 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1362 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1368 const Kokkos::View<
const char*,
1369 Kokkos::Device<
typename Node::device_type::execution_space,
1370 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1371 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1377 const Kokkos::View<
const size_t*,
1378 Kokkos::Device<
typename Node::device_type::execution_space,
1379 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1380 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1385 num_packets_per_lid_d,
1386 const size_t numSameIDs,
1387 const Kokkos::View<LocalOrdinal
const*,
1388 Kokkos::Device<
typename Node::device_type::execution_space,
1389 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1390 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1396 const Kokkos::View<LocalOrdinal
const*,
1397 Kokkos::Device<
typename Node::device_type::execution_space,
1398 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1399 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1404 permute_from_lids_d,
1405 size_t TargetNumRows,
1406 const int MyTargetPID,
1407 Kokkos::View<size_t*, typename Node::device_type>& crs_rowptr_d,
1408 Kokkos::View<GlobalOrdinal*, typename Node::device_type>& crs_colind_d,
1410 const Teuchos::ArrayView<const int>& SourcePids,
1411 Kokkos::View<int*, typename Node::device_type>& TargetPids) {
1412 using execution_space =
typename Node::execution_space;
1418 using Teuchos::ArrayView;
1419 using Teuchos::outArg;
1420 using Teuchos::REDUCE_MAX;
1421 using Teuchos::reduceAll;
1423 typedef typename Node::device_type DT;
1426 typedef typename matrix_type::impl_scalar_type ST;
1428 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1429 #ifdef HAVE_TPETRA_MMM_TIMINGS
1430 using Teuchos::TimeMonitor;
1431 Teuchos::RCP<TimeMonitor> tm;
1434 using Kokkos::MemoryUnmanaged;
// Input validation: the two permutation lists must have the same length.
1436 TEUCHOS_TEST_FOR_EXCEPTION(permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1437 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size() <<
" != "
1438 "permute_from_lids_d.size() = "
1439 << permute_from_lids_d.size() <<
".");
// The source matrix is required to be locally indexed.
1443 TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix <<
"The input "
1444 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// There must be exactly one packet count per import LID.
1445 TEUCHOS_TEST_FOR_EXCEPTION(((
size_t)import_lids_d.size()) != num_packets_per_lid_d.size(), std::invalid_argument,
1446 prefix <<
"import_lids_d.size() = " << import_lids_d.size() <<
" != "
1447 "num_packets_per_lid_d.size() = "
1448 << num_packets_per_lid_d.size() <<
".");
// Count the nonzeros of the combined result; the count sizes the output views below.
1453 #ifdef HAVE_TPETRA_MMM_TIMINGS
1454 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
1456 size_t TargetNumNonzeros =
1458 local_matrix, permute_from_lids_d, imports_d,
1459 num_packets_per_lid_d, numSameIDs);
1460 #ifdef HAVE_TPETRA_MMM_TIMINGS
1464 #ifdef HAVE_TPETRA_MMM_TIMINGS
1465 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
// Row pointers need one more entry than there are rows.
1467 Kokkos::resize(crs_rowptr_d, TargetNumRows + 1);
1468 Kokkos::resize(crs_colind_d, TargetNumNonzeros);
1469 Kokkos::resize(crs_vals_d, TargetNumNonzeros);
1470 #ifdef HAVE_TPETRA_MMM_TIMINGS
// NOTE(review): this repeats the permute-size check made at the top of the
// function, and its message has no space before the "!=" separator.
1474 TEUCHOS_TEST_FOR_EXCEPTION(
1475 permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1476 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size()
1477 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
// Only reallocate TargetPids when its current size is wrong.
1479 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1480 Kokkos::resize(TargetPids, TargetNumNonzeros);
1485 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1487 #ifdef HAVE_TPETRA_MMM_TIMINGS
1488 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
1495 SourcePids.size(),
true,
"src_pids");
1497 #ifdef HAVE_TPETRA_MMM_TIMINGS
// bytes_per_value is agreed on across the communicator with a max-reduction.
// The local candidate reads a sample value from local_matrix.values when that
// view is non-empty; the crs_vals_d(0) read is presumably the fallback branch
// (its guarding lines are not shown here) -- TODO confirm.
1501 size_t bytes_per_value = 0;
1514 size_t bytes_per_value_l = 0;
1515 if (local_matrix.values.extent(0) > 0) {
1516 const ST& val = local_matrix.values(0);
1519 const ST& val = crs_vals_d(0);
1522 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1523 Teuchos::REDUCE_MAX,
1525 outArg(bytes_per_value));
// Main unpack-and-combine step; it writes into the CRS views and TargetPids.
1528 #ifdef HAVE_TPETRA_MMM_TIMINGS
1529 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
1532 local_matrix, local_col_map, import_lids_d, imports_d,
1533 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1534 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1535 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1537 #ifdef HAVE_TPETRA_MMM_TIMINGS
1542 #ifdef HAVE_TPETRA_MMM_TIMINGS
1543 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
// Any entry still holding -1 is replaced by the calling process's id
// (presumably -1 marks locally owned entries, per the kernel label -- verify).
1546 Kokkos::parallel_for(
1547 "setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0, TargetPids.size()), KOKKOS_LAMBDA(
const size_t i) {
1548 if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
/// Overload that writes the combined CRS arrays into host-side Teuchos
/// containers (CRS_rowptr, CRS_colind, CRS_vals, TargetPids).
///
/// Steps visible in this body:
///  1. Check that permute_to_lids_d / permute_from_lids_d and import_lids_d /
///     num_packets_per_lid_d have matching sizes and that the locallyIndexed
///     flag is set; each failed check throws std::invalid_argument.
///  2. Compute TargetNumNonzeros and resize CRS_rowptr (TargetNumRows + 1),
///     CRS_colind and CRS_vals (TargetNumNonzeros each); CRS_vals is also
///     reinterpreted as an array of the matrix's impl_scalar_type (ST).
///  3. Fill TargetPids with TargetNumNonzeros copies of -1.
///  4. Build the views crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d and
///     tgt_pids_d from the host arrays (the creating calls are only partly
///     visible here).
///  5. Combine the per-process value size with REDUCE_MAX over
///     sourceMatrix.getComm(), run the unpack-and-combine step, and deep_copy
///     the four result views back into the caller's host arrays.
///
/// Timing code is compiled in only when HAVE_TPETRA_MMM_TIMINGS is defined.
///
/// NOTE(review): exception messages and timer labels use the prefix
/// "unpackAndCombineIntoCrsArrays_new"; confirm that this still matches the
/// function's actual name.
1553 template <
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1556 const Kokkos::View<LocalOrdinal
const*,
1557 Kokkos::Device<
typename Node::device_type::execution_space,
1558 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1559 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1565 const Kokkos::View<
const char*,
1566 Kokkos::Device<
typename Node::device_type::execution_space,
1567 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1568 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1574 const Kokkos::View<
const size_t*,
1575 Kokkos::Device<
typename Node::device_type::execution_space,
1576 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1577 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1582 num_packets_per_lid_d,
1583 const size_t numSameIDs,
1584 const Kokkos::View<LocalOrdinal
const*,
1585 Kokkos::Device<
typename Node::device_type::execution_space,
1586 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1587 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1593 const Kokkos::View<LocalOrdinal
const*,
1594 Kokkos::Device<
typename Node::device_type::execution_space,
1595 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1596 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1601 permute_from_lids_d,
1602 size_t TargetNumRows,
1603 const int MyTargetPID,
1604 Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1605 Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1606 Teuchos::ArrayRCP<Scalar>& CRS_vals,
1607 const Teuchos::ArrayView<const int>& SourcePids,
1608 Teuchos::Array<int>& TargetPids) {
1609 using execution_space =
typename Node::execution_space;
1615 using Teuchos::ArrayView;
1616 using Teuchos::outArg;
1617 using Teuchos::REDUCE_MAX;
1618 using Teuchos::reduceAll;
1620 typedef typename Node::device_type DT;
1623 typedef typename matrix_type::impl_scalar_type ST;
1625 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1626 #ifdef HAVE_TPETRA_MMM_TIMINGS
1627 using Teuchos::TimeMonitor;
1628 Teuchos::RCP<TimeMonitor> tm;
1631 using Kokkos::MemoryUnmanaged;
// Input validation: the two permutation lists must have the same length.
1633 TEUCHOS_TEST_FOR_EXCEPTION(permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1634 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size() <<
" != "
1635 "permute_from_lids_d.size() = "
1636 << permute_from_lids_d.size() <<
".");
// The source matrix is required to be locally indexed.
1640 TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix <<
"The input "
1641 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// There must be exactly one packet count per import LID.
1642 TEUCHOS_TEST_FOR_EXCEPTION(((
size_t)import_lids_d.size()) != num_packets_per_lid_d.size(), std::invalid_argument,
1643 prefix <<
"import_lids_d.size() = " << import_lids_d.size() <<
" != "
1644 "num_packets_per_lid_d.size() = "
1645 << num_packets_per_lid_d.size() <<
".");
// Count the nonzeros of the combined result; the count sizes the output arrays below.
1650 #ifdef HAVE_TPETRA_MMM_TIMINGS
1651 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
1653 size_t TargetNumNonzeros =
1655 local_matrix, permute_from_lids_d, imports_d,
1656 num_packets_per_lid_d, numSameIDs);
1657 #ifdef HAVE_TPETRA_MMM_TIMINGS
1661 #ifdef HAVE_TPETRA_MMM_TIMINGS
1662 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
// Row pointers need one more entry than there are rows.
1664 CRS_rowptr.resize(TargetNumRows + 1);
1665 CRS_colind.resize(TargetNumNonzeros);
1666 CRS_vals.resize(TargetNumNonzeros);
// View the Scalar storage as the matrix's internal scalar type (ST); the views
// and copies below use this alias of CRS_vals rather than CRS_vals itself.
1667 Teuchos::ArrayRCP<ST>
const& CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1668 #ifdef HAVE_TPETRA_MMM_TIMINGS
// NOTE(review): this repeats the permute-size check made at the top of the
// function, and its message has no space before the "!=" separator.
1672 TEUCHOS_TEST_FOR_EXCEPTION(
1673 permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1674 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size()
1675 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
// Size TargetPids to the nonzero count, then set every entry to -1.
1678 if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1679 TargetPids.resize(TargetNumNonzeros);
1681 TargetPids.assign(TargetNumNonzeros, -1);
1684 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1686 #ifdef HAVE_TPETRA_MMM_TIMINGS
1687 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
// The argument tails below belong to the calls that create the *_d views from
// the host arrays; the third argument (true) is passed with each size/label pair.
1694 CRS_rowptr.size(),
true,
"crs_rowptr");
1698 CRS_colind.size(),
true,
"crs_colidx");
// Compile-time guard: the reinterpreted value type must not be std::complex<double>.
1699 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1700 static_assert(!std::is_same<
1701 typename std::remove_const<
1702 typename std::decay<
1703 decltype(CRS_vals_impl_scalar_type)>::type::value_type>::type,
1704 std::complex<double>>::value,
1705 "CRS_vals::value_type is std::complex<double>; this should never happen"
1706 ", since std::complex does not work in Kokkos::View objects.");
1707 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1711 CRS_vals_impl_scalar_type.size(),
true,
"crs_vals");
1713 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1714 static_assert(!std::is_same<
1715 typename decltype(crs_vals_d)::non_const_value_type,
1716 std::complex<double>>::value,
1717 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1718 "never happen, since std::complex does not work in Kokkos::View objects.");
1719 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1723 SourcePids.size(),
true,
"src_pids");
1727 TargetPids.size(),
true,
"tgt_pids");
1729 #ifdef HAVE_TPETRA_MMM_TIMINGS
// bytes_per_value is agreed on across the communicator with a max-reduction.
// The local candidate reads a sample value from local_matrix.values when that
// view is non-empty; the crs_vals_d(0) read is presumably the fallback branch
// (its guarding lines are not shown here) -- TODO confirm.
1733 size_t bytes_per_value = 0;
1746 size_t bytes_per_value_l = 0;
1747 if (local_matrix.values.extent(0) > 0) {
1748 const ST& val = local_matrix.values(0);
1751 const ST& val = crs_vals_d(0);
1754 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1755 Teuchos::REDUCE_MAX,
1757 outArg(bytes_per_value));
1760 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1761 static_assert(!std::is_same<
1762 typename decltype(crs_vals_d)::non_const_value_type,
1763 std::complex<double>>::value,
1764 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1765 "never happen, since std::complex does not work in Kokkos::View objects.");
1766 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
// Main unpack-and-combine step; it operates on the *_d views, not the host arrays.
1768 #ifdef HAVE_TPETRA_MMM_TIMINGS
1769 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
1772 local_matrix, local_col_map, import_lids_d, imports_d,
1773 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1774 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1775 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1777 #ifdef HAVE_TPETRA_MMM_TIMINGS
1782 #ifdef HAVE_TPETRA_MMM_TIMINGS
1783 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
// Copy results back: each host view is constructed from the caller's raw
// pointer and size, so the deep_copy writes straight into the output arrays.
// NOTE(review): these deep_copy calls take an execution-space instance;
// confirm a fence follows before the host arrays are read (not visible here).
1785 typename decltype(crs_rowptr_d)::host_mirror_type crs_rowptr_h(
1786 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1788 deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1790 typename decltype(crs_colind_d)::host_mirror_type crs_colind_h(
1791 CRS_colind.getRawPtr(), CRS_colind.size());
1793 deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1795 typename decltype(crs_vals_d)::host_mirror_type crs_vals_h(
1796 CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1798 deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1800 typename decltype(tgt_pids_d)::host_mirror_type tgt_pids_h(
1801 TargetPids.getRawPtr(), TargetPids.size());
1803 deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
// Instantiation macro for scalar/ordinal/node types (ST, LO, GO, NT), in the
// variant selected when KOKKOS_ENABLE_DEPRECATED_CODE_4 is defined.
// It lists, in order: unpackCrsMatrixAndCombine,
// unpackAndCombineWithOwningPIDsCount, unpackCrsMatrixAndCombineNew, and two
// overloads of unpackAndCombineIntoCrsArrays (one writing into Kokkos::View
// outputs, one writing into Teuchos::ArrayRCP / Teuchos::Array outputs).
// NOTE(review): in this variant each buffer Kokkos::View type carries further
// template arguments after the Kokkos::Device argument (those continuation
// lines are not shown here) -- keep it in step with the _OFF variant.
1810 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON(ST, LO, GO, NT) \
1812 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT>( \
1813 const CrsMatrix<ST, LO, GO, NT>&, \
1814 const Teuchos::ArrayView<const char>&, \
1815 const Teuchos::ArrayView<const size_t>&, \
1816 const Teuchos::ArrayView<const LO>&, \
1820 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT>( \
1821 const CrsMatrix<ST, LO, GO, NT>&, \
1822 const Teuchos::ArrayView<const LO>&, \
1823 const Teuchos::ArrayView<const char>&, \
1824 const Teuchos::ArrayView<const size_t>&, \
1828 const Teuchos::ArrayView<const LO>&, \
1829 const Teuchos::ArrayView<const LO>&); \
1831 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT>( \
1832 const CrsMatrix<ST, LO, GO, NT>&, \
1833 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1834 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1835 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1837 const CombineMode); \
1839 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1840 const CrsMatrix<ST, LO, GO, NT>&, \
1841 const Kokkos::View<LO const*, \
1842 Kokkos::Device<typename NT::device_type::execution_space, \
1843 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1845 const Kokkos::View<const char*, \
1846 Kokkos::Device<typename NT::device_type::execution_space, \
1847 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1849 const Kokkos::View<const size_t*, \
1850 Kokkos::Device<typename NT::device_type::execution_space, \
1851 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1854 const Kokkos::View<LO const*, \
1855 Kokkos::Device<typename NT::device_type::execution_space, \
1856 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1858 const Kokkos::View<LO const*, \
1859 Kokkos::Device<typename NT::device_type::execution_space, \
1860 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1864 Kokkos::View<size_t*, typename NT::device_type>&, \
1865 Kokkos::View<GO*, typename NT::device_type>&, \
1866 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*, typename NT::device_type>&, \
1867 const Teuchos::ArrayView<const int>&, \
1868 Kokkos::View<int*, typename NT::device_type>&); \
1870 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1871 const CrsMatrix<ST, LO, GO, NT>&, \
1872 const Kokkos::View<LO const*, \
1873 Kokkos::Device<typename NT::device_type::execution_space, \
1874 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1876 const Kokkos::View<const char*, \
1877 Kokkos::Device<typename NT::device_type::execution_space, \
1878 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1880 const Kokkos::View<const size_t*, \
1881 Kokkos::Device<typename NT::device_type::execution_space, \
1882 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1885 const Kokkos::View<LO const*, \
1886 Kokkos::Device<typename NT::device_type::execution_space, \
1887 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1889 const Kokkos::View<LO const*, \
1890 Kokkos::Device<typename NT::device_type::execution_space, \
1891 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1895 Teuchos::ArrayRCP<size_t>&, \
1896 Teuchos::ArrayRCP<GO>&, \
1897 Teuchos::ArrayRCP<ST>&, \
1898 const Teuchos::ArrayView<const int>&, \
1899 Teuchos::Array<int>&);
// Instantiation macro for scalar/ordinal/node types (ST, LO, GO, NT), in the
// variant selected when KOKKOS_ENABLE_DEPRECATED_CODE_4 is not defined.
// It lists the same five signatures as the _ON variant: unpackCrsMatrixAndCombine,
// unpackAndCombineWithOwningPIDsCount, unpackCrsMatrixAndCombineNew, and two
// overloads of unpackAndCombineIntoCrsArrays (Kokkos::View outputs and
// Teuchos::ArrayRCP / Teuchos::Array outputs).
// Here each buffer Kokkos::View type is closed directly after its
// Kokkos::Device argument, with no further template arguments.
1901 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF(ST, LO, GO, NT) \
1903 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT>( \
1904 const CrsMatrix<ST, LO, GO, NT>&, \
1905 const Teuchos::ArrayView<const char>&, \
1906 const Teuchos::ArrayView<const size_t>&, \
1907 const Teuchos::ArrayView<const LO>&, \
1911 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT>( \
1912 const CrsMatrix<ST, LO, GO, NT>&, \
1913 const Teuchos::ArrayView<const LO>&, \
1914 const Teuchos::ArrayView<const char>&, \
1915 const Teuchos::ArrayView<const size_t>&, \
1919 const Teuchos::ArrayView<const LO>&, \
1920 const Teuchos::ArrayView<const LO>&); \
1922 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT>( \
1923 const CrsMatrix<ST, LO, GO, NT>&, \
1924 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1925 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1926 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1928 const CombineMode); \
1930 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1931 const CrsMatrix<ST, LO, GO, NT>&, \
1932 const Kokkos::View<LO const*, \
1933 Kokkos::Device<typename NT::device_type::execution_space, \
1934 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1935 const Kokkos::View<const char*, \
1936 Kokkos::Device<typename NT::device_type::execution_space, \
1937 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1938 const Kokkos::View<const size_t*, \
1939 Kokkos::Device<typename NT::device_type::execution_space, \
1940 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1942 const Kokkos::View<LO const*, \
1943 Kokkos::Device<typename NT::device_type::execution_space, \
1944 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1945 const Kokkos::View<LO const*, \
1946 Kokkos::Device<typename NT::device_type::execution_space, \
1947 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1950 Kokkos::View<size_t*, typename NT::device_type>&, \
1951 Kokkos::View<GO*, typename NT::device_type>&, \
1952 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*, typename NT::device_type>&, \
1953 const Teuchos::ArrayView<const int>&, \
1954 Kokkos::View<int*, typename NT::device_type>&); \
1956 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1957 const CrsMatrix<ST, LO, GO, NT>&, \
1958 const Kokkos::View<LO const*, \
1959 Kokkos::Device<typename NT::device_type::execution_space, \
1960 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1961 const Kokkos::View<const char*, \
1962 Kokkos::Device<typename NT::device_type::execution_space, \
1963 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1964 const Kokkos::View<const size_t*, \
1965 Kokkos::Device<typename NT::device_type::execution_space, \
1966 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1968 const Kokkos::View<LO const*, \
1969 Kokkos::Device<typename NT::device_type::execution_space, \
1970 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1971 const Kokkos::View<LO const*, \
1972 Kokkos::Device<typename NT::device_type::execution_space, \
1973 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1976 Teuchos::ArrayRCP<size_t>&, \
1977 Teuchos::ArrayRCP<GO>&, \
1978 Teuchos::ArrayRCP<ST>&, \
1979 const Teuchos::ArrayView<const int>&, \
1980 Teuchos::Array<int>&);
// Public instantiation macro: forwards to the _ON variant when
// KOKKOS_ENABLE_DEPRECATED_CODE_4 is defined and to the _OFF variant otherwise,
// so callers always use the single name
// TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT(ST, LO, GO, NT).
1982 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1983 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT(ST, LO, GO, NT) \
1984 TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON(ST, LO, GO, NT)
1986 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT(ST, LO, GO, NT) \
1987 TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF(ST, LO, GO, NT)
1990 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given input buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
"Local" part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
int error() const
Host function for getting the error.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
Unpacks and combines a single row of the CrsMatrix.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...