40 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
45 #include "TpetraCore_config.h"
46 #include "Kokkos_Core.hpp"
47 #include "Teuchos_Array.hpp"
48 #include "Teuchos_ArrayView.hpp"
49 #include "Teuchos_OrdinalTraits.hpp"
50 #include "Teuchos_TimeMonitor.hpp"
58 #include "Tpetra_Details_DefaultTypes.hpp"
89 namespace UnpackAndCombineCrsMatrixImpl {
// Unpacks a single packed row from the raw byte buffer `imports` into the
// caller-provided output arrays (column GIDs, optional PIDs, values).
// The offsets computed below treat the packed row as back-to-back sections:
// entry count, GIDs, PIDs (only when requested), then values.
// NOTE(review): a caller in this file adds the result to an int error
// counter, so this presumably returns 0 on success - confirm.
100 template<
class ST,
class LO,
class GO>
102 unpackRow(
const typename PackTraits<GO>::output_array_type& gids_out,
104 const typename PackTraits<ST>::output_array_type& vals_out,
105 const char imports[],
108 const size_t num_ent,
109 const size_t bytes_per_value)
// PIDs are only handled when the caller passes a non-empty pids_out view.
115 bool unpack_pids = pids_out.size() > 0;
// Byte offsets of each section; each section begins where the previous ends.
117 const size_t num_ent_beg = offset;
120 const size_t gids_beg = num_ent_beg + num_ent_len;
121 const size_t gids_len =
124 const size_t pids_beg = gids_beg + gids_len;
125 const size_t pids_len = unpack_pids ?
129 const size_t vals_beg = gids_beg + gids_len + pids_len;
130 const size_t vals_len = num_ent * bytes_per_value;
// Raw pointers to the start of each section inside `imports`.
// pids_in is nullptr when PIDs are not being unpacked.
132 const char*
const num_ent_in = imports + num_ent_beg;
133 const char*
const gids_in = imports + gids_beg;
134 const char*
const pids_in = unpack_pids ? imports + pids_beg :
nullptr;
135 const char*
const vals_in = imports + vals_beg;
// Running total of bytes consumed; validated against the expected size below.
137 size_t num_bytes_out = 0;
// The entry count read from the buffer must match the count the caller passed.
140 if (static_cast<size_t> (num_ent_out) != num_ent) {
// p.second is added to the byte total after each (presumably unpack) step.
145 Kokkos::pair<int, size_t> p;
150 num_bytes_out += p.second;
157 num_bytes_out += p.second;
164 num_bytes_out += p.second;
// Final consistency check: bytes consumed must equal the sum of all sections.
167 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
168 if (num_bytes_out != expected_num_bytes) {
// Kokkos team functor that unpacks packed matrix rows from `imports` and
// combines them into `local_matrix`. Each league member processes one batch
// (a slice of one row's entries, described by batch_info). Failures are
// recorded in `error_code` and later read back through the error accessor
// at the end of this definition.
184 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
186 typedef LocalMatrix local_matrix_type;
189 typedef typename local_matrix_type::value_type ST;
193 typedef typename DT::execution_space XS;
// View types for the packed input. The communication-buffer views are
// parameterized on BufferDeviceType; offsets and the error flag use DT.
195 typedef Kokkos::View<const size_t*, BufferDeviceType>
196 num_packets_per_lid_type;
197 typedef Kokkos::View<const size_t*, DT> offsets_type;
198 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
199 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
// Rank-0 view holding a single int error code.
201 typedef Kokkos::View<int, DT> error_type;
202 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
// The map's and the matrix's local ordinal types must agree.
204 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
205 "LocalMap::local_ordinal_type and "
206 "LocalMatrix::ordinal_type must be the same.");
// State captured from the constructor arguments.
208 local_matrix_type local_matrix;
210 input_buffer_type imports;
211 num_packets_per_lid_type num_packets_per_lid;
212 import_lids_type import_lids;
// One row per batch: column 0 is the index into the import arrays,
// column 1 is the batch number within that row (see operator()).
213 Kokkos::View<const LO*[2], DT> batch_info;
// offsets(k) is the byte position in `imports` where import k starts.
214 offsets_type offsets;
217 size_t bytes_per_value;
// Only ever changed from 0 to a non-zero code (compare-exchange against 0),
// so it keeps the first error that was recorded.
219 error_type error_code;
// Constructor: stores each argument in the member of the same name.
222 const local_matrix_type& local_matrix_in,
224 const input_buffer_type& imports_in,
225 const num_packets_per_lid_type& num_packets_per_lid_in,
226 const import_lids_type& import_lids_in,
227 const Kokkos::View<
const LO*[2], DT>& batch_info_in,
228 const offsets_type& offsets_in,
230 const size_t batch_size_in,
231 const size_t bytes_per_value_in,
232 const bool atomic_in) :
233 local_matrix (local_matrix_in),
234 local_col_map (local_col_map_in),
235 imports (imports_in),
236 num_packets_per_lid (num_packets_per_lid_in),
237 import_lids (import_lids_in),
238 batch_info (batch_info_in),
239 offsets (offsets_in),
240 combine_mode (combine_mode_in),
241 batch_size (batch_size_in),
242 bytes_per_value (bytes_per_value_in),
// Team entry point: the league rank selects the batch to process.
247 KOKKOS_INLINE_FUNCTION
248 void operator()(member_type team_member)
const
251 using Kokkos::subview;
252 using Kokkos::MemoryUnmanaged;
// Decode which import (lid_no) and which batch of that import this team owns.
254 const LO batch = team_member.league_rank();
255 const LO lid_no = batch_info(batch, 0);
256 const LO batch_no = batch_info(batch, 1);
// Size in bytes of this import's packed data.
258 const size_t num_bytes = num_packets_per_lid(lid_no);
// Target local row, total buffer size, and start of this import's bytes.
265 const LO import_lid = import_lids(lid_no);
266 const size_t buf_size = imports.size();
267 const size_t offset = offsets(lid_no);
271 const char*
const in_buf = imports.data() + offset;
// presumably num_ent_LO was unpacked from in_buf - TODO confirm.
273 const size_t num_entries_in_row =
static_cast<size_t>(num_ent_LO);
// Error 21: the row needs more bytes than this import provides.
// NOTE(review): the message text says "!=" but the test is ">".
276 size_t expected_num_bytes = 0;
283 if (expected_num_bytes > num_bytes)
// Diagnostic output is compiled out when KOKKOS_ENABLE_SYCL is defined.
286 #ifndef KOKKOS_ENABLE_SYCL
288 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
289 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
290 (
int) lid_no, (
int) expected_num_bytes, (
int) num_bytes
293 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
// Error 22: this import's byte range would run past the end of the buffer.
297 if (offset > buf_size || offset + num_bytes > buf_size)
300 #ifndef KOKKOS_ENABLE_SYCL
302 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
303 "At row %d, the offset (%d) > buffer size (%d)\n",
304 (
int) lid_no, (
int) offset, (
int) buf_size
307 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
// Number of entries this batch handles: the whole row if it fits in one
// batch, a full batch if the row extends at least to the end of this batch,
// otherwise the remainder left after the preceding full batches.
312 size_t num_entries_in_batch = 0;
313 if (num_entries_in_row <= batch_size)
314 num_entries_in_batch = num_entries_in_row;
315 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
316 num_entries_in_batch = batch_size;
318 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
// Byte ranges of the packed sections for this row: the entry count first,
// then the GIDs, then the values.
321 const size_t num_ent_start = offset;
322 const size_t num_ent_end = num_ent_start + bytes_per_lid;
325 const size_t gids_start = num_ent_end;
326 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
328 const size_t vals_start = gids_end;
// `shift` skips the entries that belong to earlier batches of the same row.
330 const size_t shift = batch_no * batch_size;
331 const char*
const num_ent_in = imports.data() + num_ent_start;
332 const char*
const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
333 const char*
const vals_in = imports.data() + vals_start + shift * bytes_per_value;
// Error 23: the unpacked entry count disagrees with the count read earlier.
337 if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
340 #ifndef KOKKOS_ENABLE_SYCL
342 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
343 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
344 (
int) lid_no, (
int) num_entries_in_row, (
int) num_ent_out
347 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
// The combine calls below are always told that matrix rows are sorted.
350 constexpr
bool matrix_has_sorted_rows =
true;
// The threads of the team split the entries of this batch among themselves.
353 Kokkos::parallel_for(
354 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
// Byte distance of entry j within the GID section, then the value section.
360 distance = j * bytes_per_gid;
370 distance = j * bytes_per_value;
// ADD sums into existing values, using atomics when `atomic` is set.
373 if (combine_mode ==
ADD) {
377 const bool use_atomic_updates = atomic;
378 (void)local_matrix.sumIntoValues(
383 matrix_has_sorted_rows,
386 }
else if (combine_mode ==
REPLACE) {
// REPLACE overwrites existing values and never uses atomic updates.
390 const bool use_atomic_updates =
false;
391 (void)local_matrix.replaceValues(
396 matrix_has_sorted_rows,
// Error 31: reported with the "unknown error" message below.
402 #ifndef KOKKOS_ENABLE_SYCL
404 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
405 "At row %d, an unknown error occurred during unpack\n", (
int) lid_no
408 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
// Synchronize the threads of the team before operator() finishes.
413 team_member.team_barrier();
// Error accessor: copies the error code to host memory and returns it.
419 auto error_code_h = Kokkos::create_mirror_view_and_copy(
420 Kokkos::HostSpace(), error_code
422 return error_code_h();
// Empty dispatch tags. They select which tagged operator() of the entry-count
// functor a Kokkos range policy invokes: maximum entries per row
// (MaxNumEntTag) or total entries over all rows (TotNumEntTag).
427 struct MaxNumEntTag {};
428 struct TotNumEntTag {};
// Reduction functor over the packed imports. It reads the entry count of
// each packed row and reduces it either to a maximum (MaxNumEntTag) or to a
// sum (TotNumEntTag).
438 template<
class LO,
class DT,
class BDT>
441 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
442 typedef Kokkos::View<const size_t*, DT> offsets_type;
443 typedef Kokkos::View<const char*, BDT> input_buffer_type;
// Type of the reduction result.
446 typedef size_t value_type;
// Per-import byte counts, per-import byte offsets, and the packed buffer.
449 num_packets_per_lid_type num_packets_per_lid;
450 offsets_type offsets;
451 input_buffer_type imports;
// Constructor: stores the views passed in.
455 const offsets_type& offsets_in,
456 const input_buffer_type& imports_in) :
457 num_packets_per_lid (num_packets_per_lid_in),
458 offsets (offsets_in),
// Max reduction: keeps the larger of the running maximum and row i's count.
462 KOKKOS_INLINE_FUNCTION
void
463 operator() (
const MaxNumEntTag,
const LO i, value_type& update)
const {
465 const size_t num_bytes = num_packets_per_lid(i);
// Start of import i's bytes within the packed buffer.
468 const char*
const in_buf = imports.data () + offsets(i);
// presumably num_ent_LO was unpacked from in_buf - TODO confirm.
470 const size_t num_ent =
static_cast<size_t> (num_ent_LO);
472 update = (update < num_ent) ? num_ent : update;
// Join for the max reduction: dst becomes the larger of dst and src.
476 KOKKOS_INLINE_FUNCTION
void
477 join (
const MaxNumEntTag,
479 const value_type& src)
const
481 if (dst < src) dst = src;
// Sum reduction: adds row i's entry count to the running total.
484 KOKKOS_INLINE_FUNCTION
void
485 operator() (
const TotNumEntTag,
const LO i, value_type& tot_num_ent)
const {
487 const size_t num_bytes = num_packets_per_lid(i);
490 const char*
const in_buf = imports.data () + offsets(i);
// presumably num_ent_LO was unpacked from in_buf - TODO confirm.
492 tot_num_ent +=
static_cast<size_t> (num_ent_LO);
// Runs a parallel reduction tagged with MaxNumEntTag over all packed imports
// and accumulates the result in max_num_ent.
// num_packets_per_lid: byte count of each import; its extent sets the range.
// offsets: byte offset of each import inside `imports`.
// imports: the packed byte buffer.
504 template<
class LO,
class DT,
class BDT>
506 compute_maximum_num_entries (
507 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
508 const Kokkos::View<const size_t*, DT>& offsets,
509 const Kokkos::View<const char*, BDT>& imports)
511 typedef typename DT::execution_space XS;
// The MaxNumEntTag in the policy selects the functor's max-reduction operator.
512 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
513 MaxNumEntTag> range_policy;
// One reduction iteration per packed row.
517 const LO numRowsToUnpack =
518 static_cast<LO
> (num_packets_per_lid.extent (0));
519 size_t max_num_ent = 0;
520 Kokkos::parallel_reduce (
"Max num entries in CRS",
521 range_policy (0, numRowsToUnpack),
522 functor, max_num_ent);
// Runs a parallel reduction tagged with TotNumEntTag over all packed imports
// and accumulates the summed entry count in tot_num_ent.
// num_packets_per_lid: byte count of each import; its extent sets the range.
// offsets: byte offset of each import inside `imports`.
// imports: the packed byte buffer.
533 template<
class LO,
class DT,
class BDT>
535 compute_total_num_entries (
536 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
537 const Kokkos::View<const size_t*, DT>& offsets,
538 const Kokkos::View<const char*, BDT>& imports)
540 typedef typename DT::execution_space XS;
// The TotNumEntTag in the policy selects the functor's sum-reduction operator.
541 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
542 size_t tot_num_ent = 0;
543 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
// One reduction iteration per packed row.
545 const LO numRowsToUnpack =
546 static_cast<LO
> (num_packets_per_lid.extent (0));
547 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
548 range_policy (0, numRowsToUnpack),
549 functor, tot_num_ent);
// Reads the entry count of one packed row from `imports` at byte `offset`.
// Returns the count as size_t, or OrdinalTraits<size_t>::invalid() when the
// packed size of the count exceeds the num_bytes available for this row.
554 KOKKOS_INLINE_FUNCTION
556 unpackRowCount(
const char imports[],
558 const size_t num_bytes)
560 using PT = PackTraits<LO>;
// Bytes that a packed LO count occupies; the row must be at least that long.
564 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
565 if (p_num_bytes > num_bytes) {
566 return OrdinalTraits<size_t>::invalid();
// Unpack the count from the start of this row's bytes; the unpack call's
// return value is deliberately ignored.
568 const char*
const in_buf = imports + offset;
569 (void) PT::unpackValue(num_ent_LO, in_buf);
571 return static_cast<size_t>(num_ent_LO);
// Expands per-import batch counts into one batch_info row per batch:
// column 0 is the import index i, column 1 is the batch number within i.
// Returns true when the final value of `batch` equals batch_info.extent(0).
// presumably `batch` counts the rows written - TODO confirm.
578 template<
class View1,
class View2>
582 const View1& batches_per_lid,
586 using LO =
typename View2::value_type;
// Outer loop over imports, inner loop over that import's batches.
588 for (
size_t i=0; i<batches_per_lid.extent(0); i++)
590 for (
size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
592 batch_info(batch, 0) =
static_cast<LO
>(i);
593 batch_info(batch, 1) = batch_no;
597 return batch == batch_info.extent(0);
// Host-side driver: validates the arguments, splits the packed rows into
// batches, launches UnpackCrsMatrixAndCombineFunctor over those batches with
// a team policy, and raises an exception based on the functor's error code.
// Only the ADD and REPLACE combine modes are accepted; the checks below throw
// std::invalid_argument for any other mode and for mismatched array sizes.
607 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
609 unpackAndCombineIntoCrsMatrix(
610 const LocalMatrix& local_matrix,
611 const LocalMap& local_map,
612 const Kokkos::View<const char*, BufferDeviceType>& imports,
613 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
614 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
617 using ST =
typename LocalMatrix::value_type;
620 using XS =
typename DT::execution_space;
621 const char prefix[] =
622 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
623 "unpackAndCombineIntoCrsMatrix: ";
// Special case for an empty import list (presumably an early return - confirm).
625 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
626 if (num_import_lids == 0) {
// Reject combine modes that this code path does not support.
633 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
ABSMAX,
634 std::invalid_argument,
635 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
636 "static graph (i.e., was constructed with the CrsMatrix constructor "
637 "that takes a const CrsGraph pointer).");
639 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
640 std::invalid_argument,
641 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
642 "(i.e., was constructed with the CrsMatrix constructor that takes a "
643 "const CrsGraph pointer).");
646 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode ==
ADD || combine_mode ==
REPLACE),
647 std::invalid_argument,
648 prefix <<
"Invalid combine mode; should never get "
649 "here! Please report this bug to the Tpetra developers.");
// There must be exactly one packet count per import LID.
652 bool bad_num_import_lids =
653 num_import_lids !=
static_cast<size_t>(num_packets_per_lid.extent(0));
654 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
655 std::invalid_argument,
656 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != "
657 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
// Byte offset of each import in `imports`, with one extra trailing entry.
// presumably filled from num_packets_per_lid before use - TODO confirm.
661 Kokkos::View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
// A batch never needs to be larger than the longest packed row.
665 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
667 const size_t batch_size = std::min(default_batch_size, max_num_ent);
// Count the batches each import needs; the reduction sums them up.
670 size_t num_batches = 0;
671 Kokkos::View<LO*[2], DT> batch_info(
"", num_batches);
672 Kokkos::View<size_t*, DT> batches_per_lid(
"", num_import_lids);
674 Kokkos::parallel_reduce(
675 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
676 KOKKOS_LAMBDA(
const size_t i,
size_t& batches)
678 const size_t num_entries_in_row = unpackRowCount<LO>(
679 imports.data(), offsets(i), num_packets_per_lid(i)
// Rows no longer than one batch take the first branch of the conditional;
// longer rows get ceil(num_entries_in_row / batch_size) batches.
682 (num_entries_in_row <= batch_size) ?
684 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
685 batches += batches_per_lid(i);
// Resize batch_info to the total number of batches.
689 Kokkos::resize(batch_info, num_batches);
// compute_batch_info is called on host mirrors of the two views; its
// consistency result is ignored.
691 Kokkos::HostSpace host_space;
692 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
696 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
698 (void) compute_batch_info(batches_per_lid_h, batch_info_h);
// Atomic updates are requested unless the execution space's concurrency is 1.
707 const bool atomic = XS().concurrency() != 1;
708 using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
// One team per batch. Off-GPU, or when team_size is the invalid sentinel,
// Kokkos picks the team size (AUTO); otherwise team_size is used explicitly.
723 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
725 if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
727 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
731 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
// Read the functor's error code back; the exception message includes it.
734 auto error_code = f.error();
735 TEUCHOS_TEST_FOR_EXCEPTION(
738 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
// Counting helper: computes row lengths from the local matrix's row_map for
// (1) the first num_same_ids rows and (2) the rows listed in
// permute_from_lids, and counts the entries in the packed imports via
// compute_total_num_entries.
// NOTE(review): how the three partial counts are combined and returned is
// presumably a simple sum - confirm.
742 template<
class LocalMatrix,
class BufferDeviceType>
745 const LocalMatrix& local_matrix,
746 const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
747 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
748 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
749 const size_t num_same_ids)
751 using Kokkos::parallel_reduce;
752 typedef typename LocalMatrix::ordinal_type LO;
753 typedef typename LocalMatrix::device_type device_type;
754 typedef typename device_type::execution_space XS;
755 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
756 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
757 typedef BufferDeviceType BDT;
// Part 1: rows 0 .. num_same_ids-1; row length is row_map[lid+1]-row_map[lid].
763 num_items =
static_cast<LO
>(num_same_ids);
766 parallel_reduce(range_policy(0, num_items),
767 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
768 update +=
static_cast<size_t>(local_matrix.graph.row_map[lid+1]
769 -local_matrix.graph.row_map[lid]);
// Part 2: the source rows named by permute_from_lids.
775 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
778 parallel_reduce(range_policy(0, num_items),
779 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
780 const LO lid = permute_from_lids(i);
781 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
782 - local_matrix.graph.row_map[lid]);
// Part 3: entries in the packed imports; offsets has one slot per packet
// plus one extra.
789 const size_type np = num_packets_per_lid.extent(0);
790 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
793 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
// For every packed import i, reads the row's entry count and atomically adds
// it to tgt_rowptr at the target row import_lids(i). The reduction variable
// k_error is an int error indicator.
// tgt_rowptr: per-row counters updated in place.
// imports / num_packets_per_lid / offsets: packed buffer, per-import byte
// counts, and per-import byte offsets.
801 template<
class LO,
class DT,
class BDT>
803 setupRowPointersForRemotes(
804 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
806 const Kokkos::View<const char*, BDT>& imports,
807 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
808 const typename PackTraits<size_t>::input_array_type& offsets)
810 using Kokkos::parallel_reduce;
811 typedef typename DT::execution_space XS;
812 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
813 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// Sentinel returned by unpackRowCount when a row cannot be read.
815 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
816 const size_type N = num_packets_per_lid.extent(0);
819 parallel_reduce (
"Setup row pointers for remotes",
821 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
// Element type of tgt_rowptr, used to cast the increment for the atomic add.
822 typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
823 const size_t num_bytes = num_packets_per_lid(i);
824 const size_t offset = offsets(i);
825 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
// An unreadable count is handled separately (presumably flagged in k_error).
826 if (num_ent == InvalidNum) {
// Atomic add, so several imports may target the same row concurrently.
829 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
// Parallel scan over tgt_rowptr: each entry is overwritten with the running
// value `update`, and the same value is copied into new_start_row.
// NOTE(review): presumably an exclusive prefix sum that turns row lengths
// into CRS row offsets (update advanced by cur_val each step) - confirm.
837 makeCrsRowPtrFromLengths(
838 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
839 const Kokkos::View<size_t*,DT>& new_start_row)
841 using Kokkos::parallel_scan;
842 typedef typename DT::execution_space XS;
843 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
844 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// The scan covers every entry of new_start_row.
845 const size_type N = new_start_row.extent(0);
846 parallel_scan(range_policy(0, N),
847 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
// Save the original entry before it is overwritten below.
848 auto cur_val = tgt_rowptr(i);
850 tgt_rowptr(i) = update;
851 new_start_row(i) = tgt_rowptr(i);
// Copies the first num_same_ids rows of local_matrix into the target CRS
// arrays. For these rows the source and target local row index are the same.
// Column indices are converted to global IDs through local_col_map, and each
// entry's process ID is stored, with my_pid replaced by -1.
858 template<
class LocalMatrix,
class LocalMap>
861 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
863 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
864 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
865 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
867 const LocalMatrix& local_matrix,
868 const LocalMap& local_col_map,
869 const size_t num_same_ids,
872 using Kokkos::parallel_for;
875 typedef typename DT::execution_space XS;
876 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
// One iteration per row shared between source and target.
878 parallel_for(range_policy(0, num_same_ids),
879 KOKKOS_LAMBDA(
const size_t i) {
880 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
// Source row index and the offset where its entries start.
882 const LO src_lid =
static_cast<LO
>(i);
883 size_t src_row = local_matrix.graph.row_map(src_lid);
// Target row index (same as the source) and its start in the target arrays.
885 const LO tgt_lid =
static_cast<LO
>(i);
886 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Advance the row's next-free-slot marker by the number of entries copied.
888 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
889 - local_matrix.graph.row_map(src_lid);
890 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Entry j of the source row lands at tgt_row + (j - src_row) in the target.
892 for (
size_t j=local_matrix.graph.row_map(src_lid);
893 j<local_matrix.graph.row_map(src_lid+1); ++j) {
894 LO src_col = local_matrix.graph.entries(j);
895 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
896 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
// A PID equal to my_pid is stored as -1.
897 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Copies permuted rows of local_matrix into the target CRS arrays: source
// row permute_from_lids(i) is written to target row permute_to_lids(i).
// Column indices are converted to global IDs through local_col_map, and each
// entry's process ID is stored, with my_pid replaced by -1.
903 template<
class LocalMatrix,
class LocalMap>
905 copyDataFromPermuteIDs(
906 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
908 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
909 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
910 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
912 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
913 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
914 const LocalMatrix& local_matrix,
915 const LocalMap& local_col_map,
918 using Kokkos::parallel_for;
921 typedef typename DT::execution_space XS;
922 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
923 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// One iteration per (from, to) permutation pair.
925 const size_type num_permute_to_lids = permute_to_lids.extent(0);
927 parallel_for(range_policy(0, num_permute_to_lids),
928 KOKKOS_LAMBDA(
const size_t i) {
929 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
// Source row index and the offset where its entries start.
931 const LO src_lid = permute_from_lids(i);
932 const size_t src_row = local_matrix.graph.row_map(src_lid);
// Target row index and its start in the target arrays.
934 const LO tgt_lid = permute_to_lids(i);
935 const size_t tgt_row = tgt_rowptr(tgt_lid);
// Advance the row's next-free-slot marker by the number of entries copied.
937 size_t nsr = local_matrix.graph.row_map(src_lid+1)
938 - local_matrix.graph.row_map(src_lid);
939 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
// Entry j of the source row lands at tgt_row + (j - src_row) in the target.
941 for (
size_t j=local_matrix.graph.row_map(src_lid);
942 j<local_matrix.graph.row_map(src_lid+1); ++j) {
943 LO src_col = local_matrix.graph.entries(j);
944 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
945 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
// A PID equal to my_pid is stored as -1.
946 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
// Unpacks every packed import row into the target CRS arrays. For each
// import it reserves a slot range in the target row by atomically advancing
// new_start_row, unpacks GIDs, PIDs and values into that range with
// unpackRow, then rewrites PIDs equal to my_pid as -1.
// Errors are accumulated in the int reduction variable k_error.
952 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
954 unpackAndCombineIntoCrsArrays2(
955 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
957 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
958 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
959 const typename PackTraits<size_t>::input_array_type& offsets,
960 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
961 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
962 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
966 const size_t bytes_per_value)
969 using Kokkos::subview;
970 using Kokkos::MemoryUnmanaged;
971 using Kokkos::parallel_reduce;
972 using Kokkos::atomic_fetch_add;
977 typedef typename LocalMatrix::value_type ST;
978 typedef typename DT::execution_space XS;
979 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
980 typedef typename Kokkos::pair<size_type, size_type> slice;
981 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
// Unmanaged view types used for the per-row output slices.
983 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
984 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
985 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
// Sentinel returned by unpackRowCount when a row cannot be read.
987 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
990 const size_type num_import_lids = import_lids.size();
// One iteration per packed import row.
993 parallel_reduce (
"Unpack and combine into CRS",
994 range_policy (0, num_import_lids),
995 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
996 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
997 const size_t num_bytes = num_packets_per_lid(i);
998 const size_t offset = offsets(i);
// Imports with no bytes are special-cased (presumably skipped - confirm).
999 if (num_bytes == 0) {
1003 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
// An unreadable count is handled separately (presumably flagged in k_error).
1004 if (num_ent == InvalidNum) {
// Reserve [start_row, end_row) in the target arrays for this row's entries;
// the atomic add returns the slot position before the increment.
1008 const LO lcl_row = import_lids(i);
1009 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1010 const size_t end_row = start_row + num_ent;
// Output slices covering exactly the reserved range.
1012 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1013 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1014 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
// unpackRow's result is added to the error accumulator.
1016 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1017 imports.data(), offset, num_bytes,
1018 num_ent, bytes_per_value);
// A PID equal to my_pid is rewritten as -1.
1021 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1022 const int pid = pids_out(j);
1023 pids_out(j) = (pid != my_pid) ? pid : -1;
// Builds the target CRS arrays (tgt_rowptr, tgt_colind, tgt_vals, tgt_pids)
// from three sources: rows shared between source and target (the first
// num_same_ids rows), permuted rows, and packed remote rows in `imports`.
// Visible steps: write per-row lengths into tgt_rowptr, convert them with
// makeCrsRowPtrFromLengths, copy the local rows, then unpack the remote rows.
// Throws std::logic_error if setting up row pointers or unpacking fails.
1030 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1033 const LocalMatrix & local_matrix,
1034 const LocalMap & local_col_map,
1035 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
1036 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
1037 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
1038 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1039 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1040 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1041 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1042 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1045 const size_t num_same_ids,
1046 const size_t tgt_num_rows,
1047 const size_t tgt_num_nonzeros,
1048 const int my_tgt_pid,
1049 const size_t bytes_per_value)
1052 using Kokkos::subview;
1053 using Kokkos::parallel_for;
1054 using Kokkos::MemoryUnmanaged;
1058 typedef typename DT::execution_space XS;
1059 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1060 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1061 typedef BufferDeviceType BDT;
1063 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
1065 const size_t N = tgt_num_rows;
1069 const int my_pid = my_tgt_pid;
// Pass over all N+1 row-pointer slots (presumably zeroing tgt_rowptr - confirm).
1072 parallel_for(range_policy(0, N+1),
1073 KOKKOS_LAMBDA(
const size_t i) {
// Row lengths for rows shared between source and target (same local index).
1079 parallel_for(range_policy(0, num_same_ids),
1080 KOKKOS_LAMBDA(
const size_t i) {
1081 const LO tgt_lid =
static_cast<LO
>(i);
1082 const LO src_lid =
static_cast<LO
>(i);
1083 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1084 - local_matrix.graph.row_map(src_lid);
// Row lengths for permuted rows: source row length stored at the target row.
1089 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1090 parallel_for(range_policy(0, num_permute_to_lids),
1091 KOKKOS_LAMBDA(
const size_t i) {
1092 const LO tgt_lid = permute_to_lids(i);
1093 const LO src_lid = permute_from_lids(i);
1094 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1095 - local_matrix.graph.row_map(src_lid);
// Byte offsets of each import in `imports`, with one extra trailing entry.
1100 const size_type num_import_lids = import_lids.extent(0);
1101 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
// Debug-only sanity check: the last offset must equal the buffer length.
1104 #ifdef HAVE_TPETRA_DEBUG
1106 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1107 const bool condition =
1108 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1109 TEUCHOS_TEST_FOR_EXCEPTION
1110 (condition, std::logic_error, prefix
1111 <<
"The final offset in bytes " << nth_offset_h
1112 <<
" != imports.size() = " << imports.extent(0)
1113 <<
". Please report this bug to the Tpetra developers.");
1115 #endif // HAVE_TPETRA_DEBUG
// Add the entry counts of the packed remote rows to tgt_rowptr.
1119 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1120 import_lids, imports, num_packets_per_lid, offsets);
1121 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1122 <<
" Error transferring data to target row pointers. "
1123 "Please report this bug to the Tpetra developers.");
// Per-row write cursor, passed to the copy and unpack steps below.
1127 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
// Scan tgt_rowptr and fill new_start_row from it.
1130 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
// Copy the locally available rows (same IDs first, then permuted IDs).
1133 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1134 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1136 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1137 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1138 local_matrix, local_col_map, my_pid);
// An empty import buffer is special-cased (presumably an early return).
// extent() is unsigned, so "<= 0" is the same test as "== 0".
1140 if (imports.extent(0) <= 0) {
// Unpack the remote rows; any non-zero result is treated as an internal bug.
1144 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1145 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1146 local_matrix, local_col_map, my_pid, bytes_per_value);
1147 TEUCHOS_TEST_FOR_EXCEPTION(
1148 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1149 "should never happen. Please report this bug to the Tpetra developers.");
// Host-array entry point: takes the packed data as Teuchos::ArrayView
// arguments, builds the views num_packets_per_lid_d, import_lids_d and
// imports_d from them, and forwards everything to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix.
1195 template<
typename ST,
typename LO,
typename GO,
typename Node>
1199 const Teuchos::ArrayView<const char>& imports,
1200 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1201 const Teuchos::ArrayView<const LO>& importLIDs,
1206 typedef typename Node::device_type device_type;
// The matrix's local device type must match the Node's device type.
1208 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1209 "Node::device_type and LocalMatrix::device_type must be the same.");
1212 device_type outputDevice;
// Each view below is sized from the matching input array and given a label.
// NOTE(review): presumably these are device copies of the host data - confirm.
1217 auto num_packets_per_lid_d =
1219 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1221 auto import_lids_d =
1223 importLIDs.size(),
true,
"import_lids");
1227 imports.size(),
true,
"imports");
// Local (device-usable) form of the matrix's column map.
1230 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
// Hand off to the implementation with the views built above.
1241 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1242 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1243 import_lids_d, combineMode);
// DualView entry point: syncs the packed buffers to the device if needed,
// takes their device views, and forwards to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix.
// importLIDs must already be in sync on the device (checked by assertion).
1247 template<
typename ST,
typename LO,
typename GO,
typename NT>
1249 unpackCrsMatrixAndCombineNew(
1251 Kokkos::DualView<
char*,
1253 Kokkos::DualView<
size_t*,
1255 const Kokkos::DualView<
const LO*,
1263 using device_type =
typename crs_matrix_type::device_type;
1264 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
1265 using buffer_device_type =
typename dist_object_type::buffer_device_type;
// Compile-time check that the matrix's device types agree.
1268 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1269 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1270 "must be the same.");
// Sync the packet counts to the device only when the DualView reports it.
1272 if (numPacketsPerLID.need_sync_device()) {
1273 numPacketsPerLID.sync_device ();
1275 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
// importLIDs is const here, so it is asserted to be in sync, not synced.
1277 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1278 auto import_lids_d = importLIDs.view_device ();
// Sync the packed byte buffer to the device only when needed.
1280 if (imports.need_sync_device()) {
1281 imports.sync_device ();
1283 auto imports_d = imports.view_device ();
// Local (device-usable) form of the matrix's column map.
1286 auto local_col_map = sourceMatrix.
getColMap ()->getLocalMap ();
1287 typedef decltype (local_col_map) local_map_type;
// Hand off to the implementation with explicit template arguments.
1289 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1290 local_matrix_device_type,
1293 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1294 import_lids_d, combineMode);
// Public counting entry point: validates the host-side arguments, wraps the
// host arrays in Kokkos views on the communication-buffer device type, and
// passes them (with the local matrix and numSameIDs) to a counting routine.
// Throws std::invalid_argument when permuteToLIDs and permuteFromLIDs differ
// in size, when sourceMatrix is not locally indexed, or when importLIDs and
// numPacketsPerLID differ in size.
1352 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1355 const
CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1356 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1357 const Teuchos::ArrayView<const
char> &imports,
1358 const Teuchos::ArrayView<const
size_t>& numPacketsPerLID,
1362 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1363 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1365 using Kokkos::MemoryUnmanaged;
1367 typedef typename Node::device_type DT;
1368 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
// The permute-to and permute-from lists must pair up one-to-one.
1370 TEUCHOS_TEST_FOR_EXCEPTION
1371 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1372 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size () <<
" != "
1373 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
// The source matrix must be locally indexed.
1376 const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1377 TEUCHOS_TEST_FOR_EXCEPTION
1378 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1379 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// There must be exactly one packet count per import LID.
1380 TEUCHOS_TEST_FOR_EXCEPTION
1381 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1382 prefix <<
"importLIDs.size() = " << importLIDs.size () <<
" != "
1383 "numPacketsPerLID.size() = " << numPacketsPerLID.size () <<
".");
1385 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
// Device type pairing the Node's execution space with the communication
// buffer memory space.
1387 using kokkos_device_type = Kokkos::Device<
typename Node::device_type::execution_space,
1388 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
// Views over the three host arrays, each built from raw pointer plus size.
// NOTE(review): presumably these are device copies of the host data - confirm.
1390 Kokkos::View<LocalOrdinal const *, kokkos_device_type, void, void > permute_from_lids_d =
1392 permuteFromLIDs.getRawPtr (),
1393 permuteFromLIDs.size (),
true,
1394 "permute_from_lids");
1396 Kokkos::View<const char*, kokkos_device_type, void, void > imports_d =
1398 imports.getRawPtr (),
1399 imports.size (),
true,
1402 Kokkos::View<const size_t*, kokkos_device_type, void, void > num_packets_per_lid_d =
1404 numPacketsPerLID.getRawPtr (),
1405 numPacketsPerLID.size (),
true,
1406 "num_packets_per_lid");
// Arguments handed to the counting routine.
1409 local_matrix, permute_from_lids_d, imports_d,
1410 num_packets_per_lid_d, numSameIDs);
// Overload whose CRS outputs (crs_rowptr_d, crs_colind_d, crs_vals_d) and
// TargetPids are Kokkos::View objects on Node::device_type, passed by
// non-const reference and resized here.
// NOTE(review): the line carrying the function name is not part of this
// listing; the name is presumably unpackAndCombineIntoCrsArrays, going by the
// timer label and the instantiation macro at the end of the file -- confirm.
//
// Inputs visible in the signature:
//   import_lids_d, imports_d, num_packets_per_lid_d -- views whose memory
//     space is comm_buffer_memory_space<Node::device_type>.
//   numSameIDs, permute_to_lids_d, permute_from_lids_d -- forwarded to the
//     counting and unpacking calls below.
//   TargetNumRows -- crs_rowptr_d is resized to TargetNumRows+1.
//   MyTargetPID   -- written into every TargetPids entry that is still -1
//     at the end of the function.
//   SourcePids    -- host ArrayView; its size feeds the "src_pids" view.
//
// Throws std::invalid_argument (via TEUCHOS_TEST_FOR_EXCEPTION) when the size
// or local-indexing checks below fail.
1428 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1432 const Kokkos::View<LocalOrdinal
const *,
1433 Kokkos::Device<
typename Node::device_type::execution_space,
1434 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1435 void,
void > import_lids_d,
1436 const Kokkos::View<
const char*,
1437 Kokkos::Device<
typename Node::device_type::execution_space,
1438 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1439 void,
void > imports_d,
1440 const Kokkos::View<
const size_t*,
1441 Kokkos::Device<
typename Node::device_type::execution_space,
1442 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1443 void,
void > num_packets_per_lid_d,
1444 const size_t numSameIDs,
1445 const Kokkos::View<LocalOrdinal
const *,
1446 Kokkos::Device<
typename Node::device_type::execution_space,
1447 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1448 void,
void > permute_to_lids_d,
1449 const Kokkos::View<LocalOrdinal
const *,
1450 Kokkos::Device<
typename Node::device_type::execution_space,
1451 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1452 void,
void > permute_from_lids_d,
1453 size_t TargetNumRows,
1454 const int MyTargetPID,
1455 Kokkos::View<size_t*,typename Node::device_type> &crs_rowptr_d,
1456 Kokkos::View<GlobalOrdinal*,typename Node::device_type> &crs_colind_d,
1458 const Teuchos::ArrayView<const int>& SourcePids,
1459 Kokkos::View<int*,typename Node::device_type> &TargetPids)
1461 using execution_space =
typename Node::execution_space;
1467 using Teuchos::ArrayView;
1468 using Teuchos::outArg;
1469 using Teuchos::REDUCE_MAX;
1470 using Teuchos::reduceAll;
1472 typedef typename Node::device_type DT;
1475 typedef typename matrix_type::impl_scalar_type ST;
// Prefix used for every exception message and timer label in this function.
// NOTE(review): the text ends in "_new", which does not match the name used
// in the instantiation macro -- confirm whether that is intentional.
1477 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1478 # ifdef HAVE_TPETRA_MMM_TIMINGS
1479 using Teuchos::TimeMonitor;
1480 Teuchos::RCP<TimeMonitor> tm;
1483 using Kokkos::MemoryUnmanaged;
// Precondition: the two permutation index views must have equal length.
1485 TEUCHOS_TEST_FOR_EXCEPTION
1486 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1487 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size () <<
" != "
1488 "permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
// Precondition: the source matrix must be locally indexed. `locallyIndexed`
// is set on a line that is not part of this listing.
1492 TEUCHOS_TEST_FOR_EXCEPTION
1493 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1494 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// Precondition: one packet count per import LID.
1495 TEUCHOS_TEST_FOR_EXCEPTION
1496 (((
size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1497 prefix <<
"import_lids_d.size() = " << import_lids_d.size () <<
" != "
1498 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () <<
".");
// Counting pass: TargetNumNonzeros is what the output arrays are sized to
// below. The callee's name sits on a line missing from this listing
// (presumably unpackAndCombineWithOwningPIDsCount, per the timer label).
1503 # ifdef HAVE_TPETRA_MMM_TIMINGS
1504 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
1506 size_t TargetNumNonzeros =
1508 local_matrix, permute_from_lids_d, imports_d,
1509 num_packets_per_lid_d, numSameIDs);
1510 # ifdef HAVE_TPETRA_MMM_TIMINGS
1514 # ifdef HAVE_TPETRA_MMM_TIMINGS
1515 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
// Size the caller's outputs: TargetNumRows+1 row offsets, and one column
// index and one value per counted nonzero.
1517 Kokkos::resize(crs_rowptr_d,TargetNumRows+1);
1518 Kokkos::resize(crs_colind_d,TargetNumNonzeros);
1519 Kokkos::resize(crs_vals_d,TargetNumNonzeros);
1520 # ifdef HAVE_TPETRA_MMM_TIMINGS
// NOTE(review): this repeats the permute_to/permute_from size check made at
// the top of the function, so it can no longer fire. Its message also names
// "permuteToLIDs" rather than permute_to_lids_d and has no space before "!=".
1524 TEUCHOS_TEST_FOR_EXCEPTION(
1525 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1526 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size ()
1527 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size () <<
".");
// TargetPids is resized only when its current size differs from the count.
1529 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1530 Kokkos::resize(TargetPids,TargetNumNonzeros);
// Local (Kokkos-kernel-usable) view of the source matrix's column Map.
1535 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1537 # ifdef HAVE_TPETRA_MMM_TIMINGS
1538 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
1545 SourcePids.size(),
true,
"src_pids");
1547 # ifdef HAVE_TPETRA_MMM_TIMINGS
// bytes_per_value receives the result of a REDUCE_MAX all-reduce over the
// source matrix's communicator. The send argument is on a missing line;
// presumably it is bytes_per_value_l, derived from `val` -- confirm.
1551 size_t bytes_per_value = 0;
1565 size_t bytes_per_value_l = 0;
1566 if (local_matrix.values.extent(0) > 0) {
1567 const ST& val = local_matrix.values(0);
1570 const ST& val = crs_vals_d(0);
1573 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1574 Teuchos::REDUCE_MAX,
1576 outArg(bytes_per_value));
// Unpack pass. The callee's name is on a line missing from this listing;
// TargetPids is handed to it directly in this overload.
1579 # ifdef HAVE_TPETRA_MMM_TIMINGS
1580 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
1583 local_matrix, local_col_map, import_lids_d, imports_d,
1584 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1585 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1586 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1588 # ifdef HAVE_TPETRA_MMM_TIMINGS
// NOTE(review): the timer below is labelled "copy back to host", but the
// statement that follows it in this listing is a parallel_for over
// DT::execution_space -- confirm the label is still accurate.
1593 # ifdef HAVE_TPETRA_MMM_TIMINGS
1594 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
// Every TargetPids entry still equal to -1 is overwritten with MyTargetPID.
1597 Kokkos::parallel_for(
"setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0,TargetPids.size()), KOKKOS_LAMBDA (
const size_t i) {
1598 if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
// Overload whose CRS outputs are host-side Teuchos containers: CRS_rowptr,
// CRS_colind and CRS_vals are Teuchos::ArrayRCP, TargetPids is a
// Teuchos::Array<int>. The containers are resized here, and the results are
// copied into them at the end through views built over their raw pointers.
// NOTE(review): the line carrying the function name is not part of this
// listing; the name is presumably unpackAndCombineIntoCrsArrays, going by the
// timer label and the instantiation macro at the end of the file -- confirm.
//
// Inputs visible in the signature:
//   import_lids_d, imports_d, num_packets_per_lid_d -- views whose memory
//     space is comm_buffer_memory_space<Node::device_type>.
//   numSameIDs, permute_to_lids_d, permute_from_lids_d -- forwarded to the
//     counting and unpacking calls below.
//   TargetNumRows -- CRS_rowptr is resized to TargetNumRows+1.
//   MyTargetPID   -- forwarded to the unpacking call.
//   SourcePids    -- host ArrayView; its size feeds the "src_pids" view.
//
// Throws std::invalid_argument (via TEUCHOS_TEST_FOR_EXCEPTION) when the size
// or local-indexing checks below fail.
1603 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1607 const Kokkos::View<LocalOrdinal
const *,
1608 Kokkos::Device<
typename Node::device_type::execution_space,
1609 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1610 void,
void > import_lids_d,
1611 const Kokkos::View<
const char*,
1612 Kokkos::Device<
typename Node::device_type::execution_space,
1613 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1614 void,
void > imports_d,
1615 const Kokkos::View<
const size_t*,
1616 Kokkos::Device<
typename Node::device_type::execution_space,
1617 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1618 void,
void > num_packets_per_lid_d,
1619 const size_t numSameIDs,
1620 const Kokkos::View<LocalOrdinal
const *,
1621 Kokkos::Device<
typename Node::device_type::execution_space,
1622 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1623 void,
void > permute_to_lids_d,
1624 const Kokkos::View<LocalOrdinal
const *,
1625 Kokkos::Device<
typename Node::device_type::execution_space,
1626 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1627 void,
void > permute_from_lids_d,
1628 size_t TargetNumRows,
1629 const int MyTargetPID,
1630 Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1631 Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1632 Teuchos::ArrayRCP<Scalar>& CRS_vals,
1633 const Teuchos::ArrayView<const int>& SourcePids,
1634 Teuchos::Array<int>& TargetPids)
1636 using execution_space =
typename Node::execution_space;
1642 using Teuchos::ArrayView;
1643 using Teuchos::outArg;
1644 using Teuchos::REDUCE_MAX;
1645 using Teuchos::reduceAll;
1647 typedef typename Node::device_type DT;
1650 typedef typename matrix_type::impl_scalar_type ST;
// Prefix used for every exception message and timer label in this function.
// NOTE(review): the text ends in "_new", which does not match the name used
// in the instantiation macro -- confirm whether that is intentional.
1652 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1653 # ifdef HAVE_TPETRA_MMM_TIMINGS
1654 using Teuchos::TimeMonitor;
1655 Teuchos::RCP<TimeMonitor> tm;
1658 using Kokkos::MemoryUnmanaged;
// Precondition: the two permutation index views must have equal length.
1660 TEUCHOS_TEST_FOR_EXCEPTION
1661 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1662 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size () <<
" != "
1663 "permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
// Precondition: the source matrix must be locally indexed. `locallyIndexed`
// is set on a line that is not part of this listing.
1667 TEUCHOS_TEST_FOR_EXCEPTION
1668 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1669 "CrsMatrix 'sourceMatrix' must be locally indexed.");
// Precondition: one packet count per import LID.
1670 TEUCHOS_TEST_FOR_EXCEPTION
1671 (((
size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1672 prefix <<
"import_lids_d.size() = " << import_lids_d.size () <<
" != "
1673 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () <<
".");
// Counting pass: TargetNumNonzeros is what the output arrays are sized to
// below. The callee's name sits on a line missing from this listing
// (presumably unpackAndCombineWithOwningPIDsCount, per the timer label).
1678 # ifdef HAVE_TPETRA_MMM_TIMINGS
1679 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
1681 size_t TargetNumNonzeros =
1683 local_matrix, permute_from_lids_d, imports_d,
1684 num_packets_per_lid_d, numSameIDs);
1685 # ifdef HAVE_TPETRA_MMM_TIMINGS
1689 # ifdef HAVE_TPETRA_MMM_TIMINGS
1690 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
// Size the caller's host arrays: TargetNumRows+1 row offsets, and one column
// index and one value per counted nonzero.
1692 CRS_rowptr.resize (TargetNumRows+1);
1693 CRS_colind.resize(TargetNumNonzeros);
1694 CRS_vals.resize(TargetNumNonzeros);
// View the Scalar array as the matrix's impl_scalar_type (ST); the rest of
// the function works with this alias rather than with CRS_vals directly.
1695 Teuchos::ArrayRCP<ST>
const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1696 # ifdef HAVE_TPETRA_MMM_TIMINGS
// NOTE(review): this repeats the permute_to/permute_from size check made at
// the top of the function, so it can no longer fire. Its message also names
// "permuteToLIDs" rather than permute_to_lids_d and has no space before "!=".
1700 TEUCHOS_TEST_FOR_EXCEPTION(
1701 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1702 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size ()
1703 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size () <<
".");
// TargetPids ends up holding TargetNumNonzeros entries, all set to -1.
1706 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1707 TargetPids.resize (TargetNumNonzeros);
1709 TargetPids.assign (TargetNumNonzeros, -1);
// Local (Kokkos-kernel-usable) view of the source matrix's column Map.
1712 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
// Views over the host arrays, labelled "crs_rowptr", "crs_colidx",
// "crs_vals", "src_pids" and "tgt_pids". The creating calls sit on lines
// missing from this listing; only their trailing size/copy-flag/label
// arguments are visible here.
1714 # ifdef HAVE_TPETRA_MMM_TIMINGS
1715 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
1722 CRS_rowptr.size(),
true,
"crs_rowptr");
1726 CRS_colind.size(),
true,
"crs_colidx");
// Compile-time guard: the value type seen here must not be
// std::complex<double> (see the assertion message for the reason).
1727 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1728 static_assert (! std::is_same<
1729 typename std::remove_const<
1730 typename std::decay<
1731 decltype (CRS_vals_impl_scalar_type)
1734 std::complex<double> >::value,
1735 "CRS_vals::value_type is std::complex<double>; this should never happen"
1736 ", since std::complex does not work in Kokkos::View objects.");
1737 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1741 CRS_vals_impl_scalar_type.size(),
true,
"crs_vals");
1743 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1744 static_assert (! std::is_same<
1745 typename decltype (crs_vals_d)::non_const_value_type,
1746 std::complex<double> >::value,
1747 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1748 "never happen, since std::complex does not work in Kokkos::View objects.");
1749 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1753 SourcePids.size(),
true,
"src_pids");
1757 TargetPids.size(),
true,
"tgt_pids");
1759 # ifdef HAVE_TPETRA_MMM_TIMINGS
// bytes_per_value receives the result of a REDUCE_MAX all-reduce over the
// source matrix's communicator. The send argument is on a missing line;
// presumably it is bytes_per_value_l, derived from `val` -- confirm.
1763 size_t bytes_per_value = 0;
1777 size_t bytes_per_value_l = 0;
1778 if (local_matrix.values.extent(0) > 0) {
1779 const ST& val = local_matrix.values(0);
1782 const ST& val = crs_vals_d(0);
1785 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1786 Teuchos::REDUCE_MAX,
1788 outArg(bytes_per_value));
1791 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1792 static_assert (! std::is_same<
1793 typename decltype (crs_vals_d)::non_const_value_type,
1794 std::complex<double> >::value,
1795 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1796 "never happen, since std::complex does not work in Kokkos::View objects.");
1797 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
// Unpack pass. The callee's name is on a line missing from this listing; it
// is given the *_d views, including tgt_pids_d rather than TargetPids.
1799 # ifdef HAVE_TPETRA_MMM_TIMINGS
1800 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
1803 local_matrix, local_col_map, import_lids_d, imports_d,
1804 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1805 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1806 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1808 # ifdef HAVE_TPETRA_MMM_TIMINGS
// Copy-back: each *_h view is constructed directly over the caller's raw
// array pointer, so the deep_copy from the matching *_d view writes the
// results into CRS_rowptr, CRS_colind, CRS_vals and TargetPids.
1813 # ifdef HAVE_TPETRA_MMM_TIMINGS
1814 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
1816 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1817 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1819 deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1821 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1822 CRS_colind.getRawPtr(), CRS_colind.size());
1824 deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1826 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1827 CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1829 deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1831 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1832 TargetPids.getRawPtr(), TargetPids.size());
1834 deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
// Instantiation macro, parameterized on (ST, LO, GO, NT) = scalar, local
// ordinal, global ordinal and node types. It names these specializations:
//   - Details::unpackCrsMatrixAndCombine
//   - Details::unpackAndCombineWithOwningPIDsCount
//   - Details::unpackCrsMatrixAndCombineNew
//   - Details::unpackAndCombineIntoCrsArrays, Kokkos::View output overload
//   - Details::unpackAndCombineIntoCrsArrays, Teuchos::ArrayRCP /
//     Teuchos::Array output overload
// NOTE(review): the lines between the entries (presumably `template <return
// type>` introducers for explicit instantiation) and some parameter lines
// are not part of this listing -- confirm against the full file.
// Every continued line must keep its trailing backslash, so no comments are
// placed inside the macro body.
1842 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1844 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1845 const CrsMatrix<ST, LO, GO, NT>&, \
1846 const Teuchos::ArrayView<const char>&, \
1847 const Teuchos::ArrayView<const size_t>&, \
1848 const Teuchos::ArrayView<const LO>&, \
1852 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1853 const CrsMatrix<ST, LO, GO, NT> &, \
1854 const Teuchos::ArrayView<const LO> &, \
1855 const Teuchos::ArrayView<const char> &, \
1856 const Teuchos::ArrayView<const size_t>&, \
1860 const Teuchos::ArrayView<const LO>&, \
1861 const Teuchos::ArrayView<const LO>&); \
1863 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1864 const CrsMatrix<ST, LO, GO, NT>&, \
1865 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1866 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1867 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1869 const CombineMode); \
1871 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1872 const CrsMatrix<ST, LO, GO, NT> &, \
1873 const Kokkos::View<LO const *, \
1874 Kokkos::Device<typename NT::device_type::execution_space, \
1875 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1877 const Kokkos::View<const char*, \
1878 Kokkos::Device<typename NT::device_type::execution_space, \
1879 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1881 const Kokkos::View<const size_t*, \
1882 Kokkos::Device<typename NT::device_type::execution_space, \
1883 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1886 const Kokkos::View<LO const *, \
1887 Kokkos::Device<typename NT::device_type::execution_space, \
1888 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1890 const Kokkos::View<LO const *, \
1891 Kokkos::Device<typename NT::device_type::execution_space, \
1892 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1896 Kokkos::View<size_t*,typename NT::device_type>&, \
1897 Kokkos::View<GO*,typename NT::device_type>&, \
1898 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
1899 const Teuchos::ArrayView<const int>&, \
1900 Kokkos::View<int*,typename NT::device_type>&); \
1902 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1903 const CrsMatrix<ST, LO, GO, NT> &, \
1904 const Kokkos::View<LO const *, \
1905 Kokkos::Device<typename NT::device_type::execution_space, \
1906 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1908 const Kokkos::View<const char*, \
1909 Kokkos::Device<typename NT::device_type::execution_space, \
1910 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1912 const Kokkos::View<const size_t*, \
1913 Kokkos::Device<typename NT::device_type::execution_space, \
1914 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1917 const Kokkos::View<LO const *, \
1918 Kokkos::Device<typename NT::device_type::execution_space, \
1919 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1921 const Kokkos::View<LO const *, \
1922 Kokkos::Device<typename NT::device_type::execution_space, \
1923 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1927 Teuchos::ArrayRCP<size_t>&, \
1928 Teuchos::ArrayRCP<GO>&, \
1929 Teuchos::ArrayRCP<ST>&, \
1930 const Teuchos::ArrayView<const int>&, \
1931 Teuchos::Array<int>&);
1933 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given input buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
"Local" part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
int error() const
Host function for getting the error.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
Unpacks and combines a single row of the CrsMatrix.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...