10 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
15 #include "TpetraCore_config.h"
16 #include "Kokkos_Core.hpp"
17 #include "Teuchos_Array.hpp"
18 #include "Teuchos_ArrayView.hpp"
19 #include "Teuchos_OrdinalTraits.hpp"
20 #include "Teuchos_TimeMonitor.hpp"
28 #include "Tpetra_Details_DefaultTypes.hpp"
59 namespace UnpackAndCombineCrsMatrixImpl {
70 template<
class ST,
class LO,
class GO>
72 unpackRow(
const typename PackTraits<GO>::output_array_type& gids_out,
74 const typename PackTraits<ST>::output_array_type& vals_out,
79 const size_t bytes_per_value)
85 bool unpack_pids = pids_out.size() > 0;
87 const size_t num_ent_beg = offset;
90 const size_t gids_beg = num_ent_beg + num_ent_len;
91 const size_t gids_len =
94 const size_t pids_beg = gids_beg + gids_len;
95 const size_t pids_len = unpack_pids ?
99 const size_t vals_beg = gids_beg + gids_len + pids_len;
100 const size_t vals_len = num_ent * bytes_per_value;
102 const char*
const num_ent_in = imports + num_ent_beg;
103 const char*
const gids_in = imports + gids_beg;
104 const char*
const pids_in = unpack_pids ? imports + pids_beg :
nullptr;
105 const char*
const vals_in = imports + vals_beg;
107 size_t num_bytes_out = 0;
110 if (static_cast<size_t> (num_ent_out) != num_ent) {
115 Kokkos::pair<int, size_t> p;
120 num_bytes_out += p.second;
127 num_bytes_out += p.second;
134 num_bytes_out += p.second;
137 const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
138 if (num_bytes_out != expected_num_bytes) {
154 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
156 typedef LocalMatrix local_matrix_type;
159 typedef typename local_matrix_type::value_type ST;
163 typedef typename DT::execution_space XS;
165 typedef Kokkos::View<const size_t*, BufferDeviceType>
166 num_packets_per_lid_type;
167 typedef Kokkos::View<const size_t*, DT> offsets_type;
168 typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
169 typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
171 typedef Kokkos::View<int, DT> error_type;
172 using member_type =
typename Kokkos::TeamPolicy<XS>::member_type;
174 static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
175 "LocalMap::local_ordinal_type and "
176 "LocalMatrix::ordinal_type must be the same.");
178 local_matrix_type local_matrix;
180 input_buffer_type imports;
181 num_packets_per_lid_type num_packets_per_lid;
182 import_lids_type import_lids;
183 Kokkos::View<const LO*[2], DT> batch_info;
184 offsets_type offsets;
187 size_t bytes_per_value;
189 error_type error_code;
192 const local_matrix_type& local_matrix_in,
194 const input_buffer_type& imports_in,
195 const num_packets_per_lid_type& num_packets_per_lid_in,
196 const import_lids_type& import_lids_in,
197 const Kokkos::View<
const LO*[2], DT>& batch_info_in,
198 const offsets_type& offsets_in,
200 const size_t batch_size_in,
201 const size_t bytes_per_value_in,
202 const bool atomic_in) :
203 local_matrix (local_matrix_in),
204 local_col_map (local_col_map_in),
205 imports (imports_in),
206 num_packets_per_lid (num_packets_per_lid_in),
207 import_lids (import_lids_in),
208 batch_info (batch_info_in),
209 offsets (offsets_in),
210 combine_mode (combine_mode_in),
211 batch_size (batch_size_in),
212 bytes_per_value (bytes_per_value_in),
217 KOKKOS_INLINE_FUNCTION
218 void operator()(member_type team_member)
const
221 using Kokkos::subview;
222 using Kokkos::MemoryUnmanaged;
224 const LO batch = team_member.league_rank();
225 const LO lid_no = batch_info(batch, 0);
226 const LO batch_no = batch_info(batch, 1);
228 const size_t num_bytes = num_packets_per_lid(lid_no);
235 const LO import_lid = import_lids(lid_no);
236 const size_t buf_size = imports.size();
237 const size_t offset = offsets(lid_no);
241 const char*
const in_buf = imports.data() + offset;
243 const size_t num_entries_in_row =
static_cast<size_t>(num_ent_LO);
246 size_t expected_num_bytes = 0;
253 if (expected_num_bytes > num_bytes)
256 #ifndef KOKKOS_ENABLE_SYCL
258 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
259 "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
260 (
int) lid_no, (
int) expected_num_bytes, (
int) num_bytes
263 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
267 if (offset > buf_size || offset + num_bytes > buf_size)
270 #ifndef KOKKOS_ENABLE_SYCL
272 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
273 "At row %d, the offset (%d) > buffer size (%d)\n",
274 (
int) lid_no, (
int) offset, (
int) buf_size
277 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
282 size_t num_entries_in_batch = 0;
283 if (num_entries_in_row <= batch_size)
284 num_entries_in_batch = num_entries_in_row;
285 else if (num_entries_in_row >= (batch_no + 1) * batch_size)
286 num_entries_in_batch = batch_size;
288 num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
291 const size_t num_ent_start = offset;
292 const size_t num_ent_end = num_ent_start + bytes_per_lid;
295 const size_t gids_start = num_ent_end;
296 const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
298 const size_t vals_start = gids_end;
300 const size_t shift = batch_no * batch_size;
301 const char*
const num_ent_in = imports.data() + num_ent_start;
302 const char*
const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
303 const char*
const vals_in = imports.data() + vals_start + shift * bytes_per_value;
307 if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
310 #ifndef KOKKOS_ENABLE_SYCL
312 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
313 "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
314 (
int) lid_no, (
int) num_entries_in_row, (
int) num_ent_out
317 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
320 constexpr
bool matrix_has_sorted_rows =
true;
323 Kokkos::parallel_for(
324 Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
330 distance = j * bytes_per_gid;
340 distance = j * bytes_per_value;
343 if (combine_mode ==
ADD) {
347 const bool use_atomic_updates = atomic;
348 (void)local_matrix.sumIntoValues(
353 matrix_has_sorted_rows,
356 }
else if (combine_mode ==
REPLACE) {
360 const bool use_atomic_updates =
false;
361 (void)local_matrix.replaceValues(
366 matrix_has_sorted_rows,
372 #ifndef KOKKOS_ENABLE_SYCL
374 "*** Error: UnpackCrsMatrixAndCombineFunctor: "
375 "At row %d, an unknown error occurred during unpack\n", (
int) lid_no
378 Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
383 team_member.team_barrier();
389 auto error_code_h = Kokkos::create_mirror_view_and_copy(
390 Kokkos::HostSpace(), error_code
392 return error_code_h();
397 struct MaxNumEntTag {};
398 struct TotNumEntTag {};
408 template<
class LO,
class DT,
class BDT>
411 typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
412 typedef Kokkos::View<const size_t*, DT> offsets_type;
413 typedef Kokkos::View<const char*, BDT> input_buffer_type;
416 typedef size_t value_type;
419 num_packets_per_lid_type num_packets_per_lid;
420 offsets_type offsets;
421 input_buffer_type imports;
425 const offsets_type& offsets_in,
426 const input_buffer_type& imports_in) :
427 num_packets_per_lid (num_packets_per_lid_in),
428 offsets (offsets_in),
432 KOKKOS_INLINE_FUNCTION
void
433 operator() (
const MaxNumEntTag,
const LO i, value_type& update)
const {
435 const size_t num_bytes = num_packets_per_lid(i);
438 const char*
const in_buf = imports.data () + offsets(i);
440 const size_t num_ent =
static_cast<size_t> (num_ent_LO);
442 update = (update < num_ent) ? num_ent : update;
446 KOKKOS_INLINE_FUNCTION
void
447 join (
const MaxNumEntTag,
449 const value_type& src)
const
451 if (dst < src) dst = src;
454 KOKKOS_INLINE_FUNCTION
void
455 operator() (
const TotNumEntTag,
const LO i, value_type& tot_num_ent)
const {
457 const size_t num_bytes = num_packets_per_lid(i);
460 const char*
const in_buf = imports.data () + offsets(i);
462 tot_num_ent +=
static_cast<size_t> (num_ent_LO);
474 template<
class LO,
class DT,
class BDT>
476 compute_maximum_num_entries (
477 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
478 const Kokkos::View<const size_t*, DT>& offsets,
479 const Kokkos::View<const char*, BDT>& imports)
481 typedef typename DT::execution_space XS;
482 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
483 MaxNumEntTag> range_policy;
487 const LO numRowsToUnpack =
488 static_cast<LO
> (num_packets_per_lid.extent (0));
489 size_t max_num_ent = 0;
490 Kokkos::parallel_reduce (
"Max num entries in CRS",
491 range_policy (0, numRowsToUnpack),
492 functor, max_num_ent);
503 template<
class LO,
class DT,
class BDT>
505 compute_total_num_entries (
506 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
507 const Kokkos::View<const size_t*, DT>& offsets,
508 const Kokkos::View<const char*, BDT>& imports)
510 typedef typename DT::execution_space XS;
511 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
512 size_t tot_num_ent = 0;
513 NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
515 const LO numRowsToUnpack =
516 static_cast<LO
> (num_packets_per_lid.extent (0));
517 Kokkos::parallel_reduce (
"Total num entries in CRS to unpack",
518 range_policy (0, numRowsToUnpack),
519 functor, tot_num_ent);
524 KOKKOS_INLINE_FUNCTION
526 unpackRowCount(
const char imports[],
528 const size_t num_bytes)
530 using PT = PackTraits<LO>;
534 const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
535 if (p_num_bytes > num_bytes) {
536 return OrdinalTraits<size_t>::invalid();
538 const char*
const in_buf = imports + offset;
539 (void) PT::unpackValue(num_ent_LO, in_buf);
541 return static_cast<size_t>(num_ent_LO);
548 template<
class View1,
class View2>
552 const View1& batches_per_lid,
556 using LO =
typename View2::value_type;
558 for (
size_t i=0; i<batches_per_lid.extent(0); i++)
560 for (
size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
562 batch_info(batch, 0) =
static_cast<LO
>(i);
563 batch_info(batch, 1) = batch_no;
567 return batch == batch_info.extent(0);
577 template<
class LocalMatrix,
class LocalMap,
class BufferDeviceType>
579 unpackAndCombineIntoCrsMatrix(
580 const LocalMatrix& local_matrix,
581 const LocalMap& local_map,
582 const Kokkos::View<const char*, BufferDeviceType>& imports,
583 const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
584 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
587 using ST =
typename LocalMatrix::value_type;
590 using XS =
typename DT::execution_space;
591 const char prefix[] =
592 "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
593 "unpackAndCombineIntoCrsMatrix: ";
595 const size_t num_import_lids =
static_cast<size_t>(import_lids.extent(0));
596 if (num_import_lids == 0) {
603 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
ABSMAX,
604 std::invalid_argument,
605 prefix <<
"ABSMAX combine mode is not yet implemented for a matrix that has a "
606 "static graph (i.e., was constructed with the CrsMatrix constructor "
607 "that takes a const CrsGraph pointer).");
609 TEUCHOS_TEST_FOR_EXCEPTION(combine_mode ==
INSERT,
610 std::invalid_argument,
611 prefix <<
"INSERT combine mode is not allowed if the matrix has a static graph "
612 "(i.e., was constructed with the CrsMatrix constructor that takes a "
613 "const CrsGraph pointer).");
616 TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode ==
ADD || combine_mode ==
REPLACE),
617 std::invalid_argument,
618 prefix <<
"Invalid combine mode; should never get "
619 "here! Please report this bug to the Tpetra developers.");
622 bool bad_num_import_lids =
623 num_import_lids !=
static_cast<size_t>(num_packets_per_lid.extent(0));
624 TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
625 std::invalid_argument,
626 prefix <<
"importLIDs.size() (" << num_import_lids <<
") != "
627 "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) <<
").");
631 Kokkos::View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
635 size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
637 const size_t batch_size = std::min(default_batch_size, max_num_ent);
640 size_t num_batches = 0;
641 Kokkos::View<LO*[2], DT> batch_info(
"", num_batches);
642 Kokkos::View<size_t*, DT> batches_per_lid(
"", num_import_lids);
644 Kokkos::parallel_reduce(
645 Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
646 KOKKOS_LAMBDA(
const size_t i,
size_t& batches)
648 const size_t num_entries_in_row = unpackRowCount<LO>(
649 imports.data(), offsets(i), num_packets_per_lid(i)
652 (num_entries_in_row <= batch_size) ?
654 num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
655 batches += batches_per_lid(i);
659 Kokkos::resize(batch_info, num_batches);
661 Kokkos::HostSpace host_space;
662 auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
666 auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
668 (void) compute_batch_info(batches_per_lid_h, batch_info_h);
677 const bool atomic = XS().concurrency() != 1;
678 using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
693 using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
695 if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
697 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
701 Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
704 auto error_code = f.error();
705 TEUCHOS_TEST_FOR_EXCEPTION(
708 prefix <<
"UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
712 template<
class LocalMatrix,
class BufferDeviceType>
715 const LocalMatrix& local_matrix,
716 const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
717 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
718 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
719 const size_t num_same_ids)
721 using Kokkos::parallel_reduce;
722 typedef typename LocalMatrix::ordinal_type LO;
723 typedef typename LocalMatrix::device_type device_type;
724 typedef typename device_type::execution_space XS;
725 typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
726 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
727 typedef BufferDeviceType BDT;
733 num_items =
static_cast<LO
>(num_same_ids);
736 parallel_reduce(range_policy(0, num_items),
737 KOKKOS_LAMBDA(
const LO lid,
size_t& update) {
738 update +=
static_cast<size_t>(local_matrix.graph.row_map[lid+1]
739 -local_matrix.graph.row_map[lid]);
745 num_items =
static_cast<LO
>(permute_from_lids.extent(0));
748 parallel_reduce(range_policy(0, num_items),
749 KOKKOS_LAMBDA(
const LO i,
size_t& update) {
750 const LO lid = permute_from_lids(i);
751 update +=
static_cast<size_t> (local_matrix.graph.row_map[lid+1]
752 - local_matrix.graph.row_map[lid]);
759 const size_type np = num_packets_per_lid.extent(0);
760 Kokkos::View<size_t*, device_type> offsets(
"offsets", np+1);
763 compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
771 template<
class LO,
class DT,
class BDT>
773 setupRowPointersForRemotes(
774 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
776 const Kokkos::View<const char*, BDT>& imports,
777 const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
778 const typename PackTraits<size_t>::input_array_type& offsets)
780 using Kokkos::parallel_reduce;
781 typedef typename DT::execution_space XS;
782 typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
783 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
785 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
786 const size_type N = num_packets_per_lid.extent(0);
789 parallel_reduce (
"Setup row pointers for remotes",
791 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
792 typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
793 const size_t num_bytes = num_packets_per_lid(i);
794 const size_t offset = offsets(i);
795 const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
796 if (num_ent == InvalidNum) {
799 Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
807 makeCrsRowPtrFromLengths(
808 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
809 const Kokkos::View<size_t*,DT>& new_start_row)
811 using Kokkos::parallel_scan;
812 typedef typename DT::execution_space XS;
813 typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
814 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
815 const size_type N = new_start_row.extent(0);
816 parallel_scan(range_policy(0, N),
817 KOKKOS_LAMBDA(
const size_t& i,
size_t& update,
const bool&
final) {
818 auto cur_val = tgt_rowptr(i);
820 tgt_rowptr(i) = update;
821 new_start_row(i) = tgt_rowptr(i);
828 template<
class LocalMatrix,
class LocalMap>
831 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
833 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
834 const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
835 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
837 const LocalMatrix& local_matrix,
838 const LocalMap& local_col_map,
839 const size_t num_same_ids,
842 using Kokkos::parallel_for;
845 typedef typename DT::execution_space XS;
846 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
848 parallel_for(range_policy(0, num_same_ids),
849 KOKKOS_LAMBDA(
const size_t i) {
850 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
852 const LO src_lid =
static_cast<LO
>(i);
853 size_t src_row = local_matrix.graph.row_map(src_lid);
855 const LO tgt_lid =
static_cast<LO
>(i);
856 const size_t tgt_row = tgt_rowptr(tgt_lid);
858 const size_t nsr = local_matrix.graph.row_map(src_lid+1)
859 - local_matrix.graph.row_map(src_lid);
860 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
862 for (
size_t j=local_matrix.graph.row_map(src_lid);
863 j<local_matrix.graph.row_map(src_lid+1); ++j) {
864 LO src_col = local_matrix.graph.entries(j);
865 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
866 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
867 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
873 template<
class LocalMatrix,
class LocalMap>
875 copyDataFromPermuteIDs(
876 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
878 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
879 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
880 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
882 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
883 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
884 const LocalMatrix& local_matrix,
885 const LocalMap& local_col_map,
888 using Kokkos::parallel_for;
891 typedef typename DT::execution_space XS;
892 typedef typename PackTraits<LO>::input_array_type::size_type size_type;
893 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
895 const size_type num_permute_to_lids = permute_to_lids.extent(0);
897 parallel_for(range_policy(0, num_permute_to_lids),
898 KOKKOS_LAMBDA(
const size_t i) {
899 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
901 const LO src_lid = permute_from_lids(i);
902 const size_t src_row = local_matrix.graph.row_map(src_lid);
904 const LO tgt_lid = permute_to_lids(i);
905 const size_t tgt_row = tgt_rowptr(tgt_lid);
907 size_t nsr = local_matrix.graph.row_map(src_lid+1)
908 - local_matrix.graph.row_map(src_lid);
909 Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
911 for (
size_t j=local_matrix.graph.row_map(src_lid);
912 j<local_matrix.graph.row_map(src_lid+1); ++j) {
913 LO src_col = local_matrix.graph.entries(j);
914 tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
915 tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
916 tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
922 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
924 unpackAndCombineIntoCrsArrays2(
925 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
927 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
928 const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
929 const typename PackTraits<size_t>::input_array_type& offsets,
930 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
931 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
932 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
936 const size_t bytes_per_value)
939 using Kokkos::subview;
940 using Kokkos::MemoryUnmanaged;
941 using Kokkos::parallel_reduce;
942 using Kokkos::atomic_fetch_add;
947 typedef typename LocalMatrix::value_type ST;
948 typedef typename DT::execution_space XS;
949 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
950 typedef typename Kokkos::pair<size_type, size_type> slice;
951 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
953 typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
954 typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
955 typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
957 const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
960 const size_type num_import_lids = import_lids.size();
963 parallel_reduce (
"Unpack and combine into CRS",
964 range_policy (0, num_import_lids),
965 KOKKOS_LAMBDA (
const size_t i,
int& k_error) {
966 typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
967 const size_t num_bytes = num_packets_per_lid(i);
968 const size_t offset = offsets(i);
969 if (num_bytes == 0) {
973 size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
974 if (num_ent == InvalidNum) {
978 const LO lcl_row = import_lids(i);
979 const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
980 const size_t end_row = start_row + num_ent;
982 gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
983 vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
984 pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
986 k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
987 imports.data(), offset, num_bytes,
988 num_ent, bytes_per_value);
991 for (
size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
992 const int pid = pids_out(j);
993 pids_out(j) = (pid != my_pid) ? pid : -1;
1000 template<
typename LocalMatrix,
typename LocalMap,
typename BufferDeviceType>
1003 const LocalMatrix & local_matrix,
1004 const LocalMap & local_col_map,
1005 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
1006 const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
1007 const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
1008 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1009 const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1010 const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1011 const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1012 const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1015 const size_t num_same_ids,
1016 const size_t tgt_num_rows,
1017 const size_t tgt_num_nonzeros,
1018 const int my_tgt_pid,
1019 const size_t bytes_per_value)
1022 using Kokkos::subview;
1023 using Kokkos::parallel_for;
1024 using Kokkos::MemoryUnmanaged;
1028 typedef typename DT::execution_space XS;
1029 typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1030 typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1031 typedef BufferDeviceType BDT;
1033 const char prefix[] =
"unpackAndCombineIntoCrsArrays: ";
1035 const size_t N = tgt_num_rows;
1039 const int my_pid = my_tgt_pid;
1042 parallel_for(range_policy(0, N+1),
1043 KOKKOS_LAMBDA(
const size_t i) {
1049 parallel_for(range_policy(0, num_same_ids),
1050 KOKKOS_LAMBDA(
const size_t i) {
1051 const LO tgt_lid =
static_cast<LO
>(i);
1052 const LO src_lid =
static_cast<LO
>(i);
1053 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1054 - local_matrix.graph.row_map(src_lid);
1059 const size_type num_permute_to_lids = permute_to_lids.extent(0);
1060 parallel_for(range_policy(0, num_permute_to_lids),
1061 KOKKOS_LAMBDA(
const size_t i) {
1062 const LO tgt_lid = permute_to_lids(i);
1063 const LO src_lid = permute_from_lids(i);
1064 tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1065 - local_matrix.graph.row_map(src_lid);
1070 const size_type num_import_lids = import_lids.extent(0);
1071 View<size_t*, DT> offsets(
"offsets", num_import_lids+1);
1074 #ifdef HAVE_TPETRA_DEBUG
1076 auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1077 const bool condition =
1078 nth_offset_h !=
static_cast<size_t>(imports.extent (0));
1079 TEUCHOS_TEST_FOR_EXCEPTION
1080 (condition, std::logic_error, prefix
1081 <<
"The final offset in bytes " << nth_offset_h
1082 <<
" != imports.size() = " << imports.extent(0)
1083 <<
". Please report this bug to the Tpetra developers.");
1085 #endif // HAVE_TPETRA_DEBUG
1089 setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1090 import_lids, imports, num_packets_per_lid, offsets);
1091 TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1092 <<
" Error transferring data to target row pointers. "
1093 "Please report this bug to the Tpetra developers.");
1097 View<size_t*, DT> new_start_row (
"new_start_row", N+1);
1100 makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1103 copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1104 tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1106 copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1107 tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1108 local_matrix, local_col_map, my_pid);
1110 if (imports.extent(0) <= 0) {
1114 int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1115 tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1116 local_matrix, local_col_map, my_pid, bytes_per_value);
1117 TEUCHOS_TEST_FOR_EXCEPTION(
1118 unpack_err != 0, std::logic_error, prefix <<
"unpack loop failed. This "
1119 "should never happen. Please report this bug to the Tpetra developers.");
1165 template<
typename ST,
typename LO,
typename GO,
typename Node>
1169 const Teuchos::ArrayView<const char>& imports,
1170 const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1171 const Teuchos::ArrayView<const LO>& importLIDs,
1176 typedef typename Node::device_type device_type;
1178 static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1179 "Node::device_type and LocalMatrix::device_type must be the same.");
1182 device_type outputDevice;
1187 auto num_packets_per_lid_d =
1189 numPacketsPerLID.size(),
true,
"num_packets_per_lid");
1191 auto import_lids_d =
1193 importLIDs.size(),
true,
"import_lids");
1197 imports.size(),
true,
"imports");
1200 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1211 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1212 local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1213 import_lids_d, combineMode);
1217 template<
typename ST,
typename LO,
typename GO,
typename NT>
1219 unpackCrsMatrixAndCombineNew(
1221 Kokkos::DualView<
char*,
1223 Kokkos::DualView<
size_t*,
1225 const Kokkos::DualView<
const LO*,
1233 using device_type =
typename crs_matrix_type::device_type;
1234 using local_matrix_device_type =
typename crs_matrix_type::local_matrix_device_type;
1235 using buffer_device_type =
typename dist_object_type::buffer_device_type;
1238 (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1239 "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1240 "must be the same.");
1242 if (numPacketsPerLID.need_sync_device()) {
1243 numPacketsPerLID.sync_device ();
1245 auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1247 TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1248 auto import_lids_d = importLIDs.view_device ();
1250 if (imports.need_sync_device()) {
1251 imports.sync_device ();
1253 auto imports_d = imports.view_device ();
1256 auto local_col_map = sourceMatrix.
getColMap ()->getLocalMap ();
1257 typedef decltype (local_col_map) local_map_type;
1259 UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1260 local_matrix_device_type,
1263 > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1264 import_lids_d, combineMode);
1322 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1325 const
CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1326 const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1327 const Teuchos::ArrayView<const
char> &imports,
1328 const Teuchos::ArrayView<const
size_t>& numPacketsPerLID,
1332 const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1333 const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1335 using Kokkos::MemoryUnmanaged;
1337 typedef typename Node::device_type DT;
1338 const char prefix[] =
"unpackAndCombineWithOwningPIDsCount: ";
1340 TEUCHOS_TEST_FOR_EXCEPTION
1341 (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1342 prefix <<
"permuteToLIDs.size() = " << permuteToLIDs.size () <<
" != "
1343 "permuteFromLIDs.size() = " << permuteFromLIDs.size() <<
".");
1346 const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1347 TEUCHOS_TEST_FOR_EXCEPTION
1348 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1349 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1350 TEUCHOS_TEST_FOR_EXCEPTION
1351 (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1352 prefix <<
"importLIDs.size() = " << importLIDs.size () <<
" != "
1353 "numPacketsPerLID.size() = " << numPacketsPerLID.size () <<
".");
1355 auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1357 using kokkos_device_type = Kokkos::Device<
typename Node::device_type::execution_space,
1358 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
1360 Kokkos::View<LocalOrdinal const *, kokkos_device_type, void, void > permute_from_lids_d =
1362 permuteFromLIDs.getRawPtr (),
1363 permuteFromLIDs.size (),
true,
1364 "permute_from_lids");
1366 Kokkos::View<const char*, kokkos_device_type, void, void > imports_d =
1368 imports.getRawPtr (),
1369 imports.size (),
true,
1372 Kokkos::View<const size_t*, kokkos_device_type, void, void > num_packets_per_lid_d =
1374 numPacketsPerLID.getRawPtr (),
1375 numPacketsPerLID.size (),
true,
1376 "num_packets_per_lid");
1379 local_matrix, permute_from_lids_d, imports_d,
1380 num_packets_per_lid_d, numSameIDs);
1398 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1402 const Kokkos::View<LocalOrdinal
const *,
1403 Kokkos::Device<
typename Node::device_type::execution_space,
1404 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1405 void,
void > import_lids_d,
1406 const Kokkos::View<
const char*,
1407 Kokkos::Device<
typename Node::device_type::execution_space,
1408 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1409 void,
void > imports_d,
1410 const Kokkos::View<
const size_t*,
1411 Kokkos::Device<
typename Node::device_type::execution_space,
1412 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1413 void,
void > num_packets_per_lid_d,
1414 const size_t numSameIDs,
1415 const Kokkos::View<LocalOrdinal
const *,
1416 Kokkos::Device<
typename Node::device_type::execution_space,
1417 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1418 void,
void > permute_to_lids_d,
1419 const Kokkos::View<LocalOrdinal
const *,
1420 Kokkos::Device<
typename Node::device_type::execution_space,
1421 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1422 void,
void > permute_from_lids_d,
1423 size_t TargetNumRows,
1424 const int MyTargetPID,
1425 Kokkos::View<size_t*,typename Node::device_type> &crs_rowptr_d,
1426 Kokkos::View<GlobalOrdinal*,typename Node::device_type> &crs_colind_d,
1428 const Teuchos::ArrayView<const int>& SourcePids,
1429 Kokkos::View<int*,typename Node::device_type> &TargetPids)
1431 using execution_space =
typename Node::execution_space;
1437 using Teuchos::ArrayView;
1438 using Teuchos::outArg;
1439 using Teuchos::REDUCE_MAX;
1440 using Teuchos::reduceAll;
1442 typedef typename Node::device_type DT;
1445 typedef typename matrix_type::impl_scalar_type ST;
1447 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1448 # ifdef HAVE_TPETRA_MMM_TIMINGS
1449 using Teuchos::TimeMonitor;
1450 Teuchos::RCP<TimeMonitor> tm;
1453 using Kokkos::MemoryUnmanaged;
1455 TEUCHOS_TEST_FOR_EXCEPTION
1456 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1457 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size () <<
" != "
1458 "permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
1462 TEUCHOS_TEST_FOR_EXCEPTION
1463 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1464 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1465 TEUCHOS_TEST_FOR_EXCEPTION
1466 (((
size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1467 prefix <<
"import_lids_d.size() = " << import_lids_d.size () <<
" != "
1468 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () <<
".");
1473 # ifdef HAVE_TPETRA_MMM_TIMINGS
1474 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
1476 size_t TargetNumNonzeros =
1478 local_matrix, permute_from_lids_d, imports_d,
1479 num_packets_per_lid_d, numSameIDs);
1480 # ifdef HAVE_TPETRA_MMM_TIMINGS
1484 # ifdef HAVE_TPETRA_MMM_TIMINGS
1485 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
1487 Kokkos::resize(crs_rowptr_d,TargetNumRows+1);
1488 Kokkos::resize(crs_colind_d,TargetNumNonzeros);
1489 Kokkos::resize(crs_vals_d,TargetNumNonzeros);
1490 # ifdef HAVE_TPETRA_MMM_TIMINGS
1494 TEUCHOS_TEST_FOR_EXCEPTION(
1495 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1496 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size ()
1497 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size () <<
".");
1499 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1500 Kokkos::resize(TargetPids,TargetNumNonzeros);
1505 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1507 # ifdef HAVE_TPETRA_MMM_TIMINGS
1508 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
1515 SourcePids.size(),
true,
"src_pids");
1517 # ifdef HAVE_TPETRA_MMM_TIMINGS
1521 size_t bytes_per_value = 0;
1535 size_t bytes_per_value_l = 0;
1536 if (local_matrix.values.extent(0) > 0) {
1537 const ST& val = local_matrix.values(0);
1540 const ST& val = crs_vals_d(0);
1543 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1544 Teuchos::REDUCE_MAX,
1546 outArg(bytes_per_value));
1549 # ifdef HAVE_TPETRA_MMM_TIMINGS
1550 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
1553 local_matrix, local_col_map, import_lids_d, imports_d,
1554 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1555 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1556 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1558 # ifdef HAVE_TPETRA_MMM_TIMINGS
1563 # ifdef HAVE_TPETRA_MMM_TIMINGS
1564 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
1567 Kokkos::parallel_for(
"setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0,TargetPids.size()), KOKKOS_LAMBDA (
const size_t i) {
1568 if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
1573 template<
typename Scalar,
typename LocalOrdinal,
typename GlobalOrdinal,
typename Node>
1577 const Kokkos::View<LocalOrdinal
const *,
1578 Kokkos::Device<
typename Node::device_type::execution_space,
1579 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1580 void,
void > import_lids_d,
1581 const Kokkos::View<
const char*,
1582 Kokkos::Device<
typename Node::device_type::execution_space,
1583 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1584 void,
void > imports_d,
1585 const Kokkos::View<
const size_t*,
1586 Kokkos::Device<
typename Node::device_type::execution_space,
1587 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1588 void,
void > num_packets_per_lid_d,
1589 const size_t numSameIDs,
1590 const Kokkos::View<LocalOrdinal
const *,
1591 Kokkos::Device<
typename Node::device_type::execution_space,
1592 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1593 void,
void > permute_to_lids_d,
1594 const Kokkos::View<LocalOrdinal
const *,
1595 Kokkos::Device<
typename Node::device_type::execution_space,
1596 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1597 void,
void > permute_from_lids_d,
1598 size_t TargetNumRows,
1599 const int MyTargetPID,
1600 Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1601 Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1602 Teuchos::ArrayRCP<Scalar>& CRS_vals,
1603 const Teuchos::ArrayView<const int>& SourcePids,
1604 Teuchos::Array<int>& TargetPids)
1606 using execution_space =
typename Node::execution_space;
1612 using Teuchos::ArrayView;
1613 using Teuchos::outArg;
1614 using Teuchos::REDUCE_MAX;
1615 using Teuchos::reduceAll;
1617 typedef typename Node::device_type DT;
1620 typedef typename matrix_type::impl_scalar_type ST;
1622 const char prefix[] =
"Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1623 # ifdef HAVE_TPETRA_MMM_TIMINGS
1624 using Teuchos::TimeMonitor;
1625 Teuchos::RCP<TimeMonitor> tm;
1628 using Kokkos::MemoryUnmanaged;
1630 TEUCHOS_TEST_FOR_EXCEPTION
1631 (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1632 prefix <<
"permute_to_lids_d.size() = " << permute_to_lids_d.size () <<
" != "
1633 "permute_from_lids_d.size() = " << permute_from_lids_d.size() <<
".");
1637 TEUCHOS_TEST_FOR_EXCEPTION
1638 (! locallyIndexed, std::invalid_argument, prefix <<
"The input "
1639 "CrsMatrix 'sourceMatrix' must be locally indexed.");
1640 TEUCHOS_TEST_FOR_EXCEPTION
1641 (((
size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1642 prefix <<
"import_lids_d.size() = " << import_lids_d.size () <<
" != "
1643 "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () <<
".");
1648 # ifdef HAVE_TPETRA_MMM_TIMINGS
1649 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineWithOwningPIDsCount"))));
1651 size_t TargetNumNonzeros =
1653 local_matrix, permute_from_lids_d, imports_d,
1654 num_packets_per_lid_d, numSameIDs);
1655 # ifdef HAVE_TPETRA_MMM_TIMINGS
1659 # ifdef HAVE_TPETRA_MMM_TIMINGS
1660 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"resize CRS pointers"))));
1662 CRS_rowptr.resize (TargetNumRows+1);
1663 CRS_colind.resize(TargetNumNonzeros);
1664 CRS_vals.resize(TargetNumNonzeros);
1665 Teuchos::ArrayRCP<ST>
const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1666 # ifdef HAVE_TPETRA_MMM_TIMINGS
1670 TEUCHOS_TEST_FOR_EXCEPTION(
1671 permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1672 prefix <<
"permuteToLIDs.size() = " << permute_to_lids_d.size ()
1673 <<
"!= permute_from_lids_d.size() = " << permute_from_lids_d.size () <<
".");
1676 if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1677 TargetPids.resize (TargetNumNonzeros);
1679 TargetPids.assign (TargetNumNonzeros, -1);
1682 auto local_col_map = sourceMatrix.
getColMap()->getLocalMap();
1684 # ifdef HAVE_TPETRA_MMM_TIMINGS
1685 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"create mirror views from inputs"))));
1692 CRS_rowptr.size(),
true,
"crs_rowptr");
1696 CRS_colind.size(),
true,
"crs_colidx");
1697 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1698 static_assert (! std::is_same<
1699 typename std::remove_const<
1700 typename std::decay<
1701 decltype (CRS_vals_impl_scalar_type)
1704 std::complex<double> >::value,
1705 "CRS_vals::value_type is std::complex<double>; this should never happen"
1706 ", since std::complex does not work in Kokkos::View objects.");
1707 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1711 CRS_vals_impl_scalar_type.size(),
true,
"crs_vals");
1713 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1714 static_assert (! std::is_same<
1715 typename decltype (crs_vals_d)::non_const_value_type,
1716 std::complex<double> >::value,
1717 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1718 "never happen, since std::complex does not work in Kokkos::View objects.");
1719 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1723 SourcePids.size(),
true,
"src_pids");
1727 TargetPids.size(),
true,
"tgt_pids");
1729 # ifdef HAVE_TPETRA_MMM_TIMINGS
1733 size_t bytes_per_value = 0;
1747 size_t bytes_per_value_l = 0;
1748 if (local_matrix.values.extent(0) > 0) {
1749 const ST& val = local_matrix.values(0);
1752 const ST& val = crs_vals_d(0);
1755 Teuchos::reduceAll<int, size_t>(*(sourceMatrix.
getComm()),
1756 Teuchos::REDUCE_MAX,
1758 outArg(bytes_per_value));
1761 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1762 static_assert (! std::is_same<
1763 typename decltype (crs_vals_d)::non_const_value_type,
1764 std::complex<double> >::value,
1765 "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1766 "never happen, since std::complex does not work in Kokkos::View objects.");
1767 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1769 # ifdef HAVE_TPETRA_MMM_TIMINGS
1770 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"unpackAndCombineIntoCrsArrays"))));
1773 local_matrix, local_col_map, import_lids_d, imports_d,
1774 num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1775 crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1776 numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1778 # ifdef HAVE_TPETRA_MMM_TIMINGS
1783 # ifdef HAVE_TPETRA_MMM_TIMINGS
1784 tm = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string(
"copy back to host"))));
1786 typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1787 CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1789 deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1791 typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1792 CRS_colind.getRawPtr(), CRS_colind.size());
1794 deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1796 typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1797 CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1799 deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1801 typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1802 TargetPids.getRawPtr(), TargetPids.size());
1804 deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1812 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1814 Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1815 const CrsMatrix<ST, LO, GO, NT>&, \
1816 const Teuchos::ArrayView<const char>&, \
1817 const Teuchos::ArrayView<const size_t>&, \
1818 const Teuchos::ArrayView<const LO>&, \
1822 Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1823 const CrsMatrix<ST, LO, GO, NT> &, \
1824 const Teuchos::ArrayView<const LO> &, \
1825 const Teuchos::ArrayView<const char> &, \
1826 const Teuchos::ArrayView<const size_t>&, \
1830 const Teuchos::ArrayView<const LO>&, \
1831 const Teuchos::ArrayView<const LO>&); \
1833 Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1834 const CrsMatrix<ST, LO, GO, NT>&, \
1835 Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1836 Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1837 const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1839 const CombineMode); \
1841 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1842 const CrsMatrix<ST, LO, GO, NT> &, \
1843 const Kokkos::View<LO const *, \
1844 Kokkos::Device<typename NT::device_type::execution_space, \
1845 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1847 const Kokkos::View<const char*, \
1848 Kokkos::Device<typename NT::device_type::execution_space, \
1849 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1851 const Kokkos::View<const size_t*, \
1852 Kokkos::Device<typename NT::device_type::execution_space, \
1853 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1856 const Kokkos::View<LO const *, \
1857 Kokkos::Device<typename NT::device_type::execution_space, \
1858 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1860 const Kokkos::View<LO const *, \
1861 Kokkos::Device<typename NT::device_type::execution_space, \
1862 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1866 Kokkos::View<size_t*,typename NT::device_type>&, \
1867 Kokkos::View<GO*,typename NT::device_type>&, \
1868 Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
1869 const Teuchos::ArrayView<const int>&, \
1870 Kokkos::View<int*,typename NT::device_type>&); \
1872 Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1873 const CrsMatrix<ST, LO, GO, NT> &, \
1874 const Kokkos::View<LO const *, \
1875 Kokkos::Device<typename NT::device_type::execution_space, \
1876 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1878 const Kokkos::View<const char*, \
1879 Kokkos::Device<typename NT::device_type::execution_space, \
1880 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1882 const Kokkos::View<const size_t*, \
1883 Kokkos::Device<typename NT::device_type::execution_space, \
1884 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1887 const Kokkos::View<LO const *, \
1888 Kokkos::Device<typename NT::device_type::execution_space, \
1889 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1891 const Kokkos::View<LO const *, \
1892 Kokkos::Device<typename NT::device_type::execution_space, \
1893 Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1897 Teuchos::ArrayRCP<size_t>&, \
1898 Teuchos::ArrayRCP<GO>&, \
1899 Teuchos::ArrayRCP<ST>&, \
1900 const Teuchos::ArrayView<const int>&, \
1901 Teuchos::Array<int>&);
1903 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
"Local" part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
int error() const
Host function for getting the error.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
Unpacks and combines a single row of the CrsMatrix.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...