Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
12 
13 #include <memory>
14 #include <string>
15 #include "TpetraCore_config.h"
16 #include "Kokkos_Core.hpp"
17 #include "Teuchos_Array.hpp"
18 #include "Teuchos_ArrayView.hpp"
19 #include "Teuchos_OrdinalTraits.hpp"
20 #include "Teuchos_TimeMonitor.hpp"
28 #include "Tpetra_Details_DefaultTypes.hpp"
30 
51 
52 namespace Tpetra {
53 
54 //
55 // Users must never rely on anything in the Details namespace.
56 //
57 namespace Details {
58 
59 namespace UnpackAndCombineCrsMatrixImpl {
60 
70 template <class ST, class LO, class GO>
71 KOKKOS_FUNCTION int
72 unpackRow(const typename PackTraits<GO>::output_array_type& gids_out,
73  const typename PackTraits<int>::output_array_type& pids_out,
74  const typename PackTraits<ST>::output_array_type& vals_out,
75  const char imports[],
76  const size_t offset,
77  const size_t /* num_bytes */,
78  const size_t num_ent,
79  const size_t bytes_per_value) {
80  if (num_ent == 0) {
81  // Empty rows always take zero bytes, to ensure sparsity.
82  return 0;
83  }
84  bool unpack_pids = pids_out.size() > 0;
85 
86  const size_t num_ent_beg = offset;
87  const size_t num_ent_len = PackTraits<LO>::packValueCount(LO(0));
88 
89  const size_t gids_beg = num_ent_beg + num_ent_len;
90  const size_t gids_len =
91  num_ent * PackTraits<GO>::packValueCount(GO(0));
92 
93  const size_t pids_beg = gids_beg + gids_len;
94  const size_t pids_len = unpack_pids ? size_t(num_ent * PackTraits<int>::packValueCount(int(0))) : size_t(0);
95 
96  const size_t vals_beg = gids_beg + gids_len + pids_len;
97  const size_t vals_len = num_ent * bytes_per_value;
98 
99  const char* const num_ent_in = imports + num_ent_beg;
100  const char* const gids_in = imports + gids_beg;
101  const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
102  const char* const vals_in = imports + vals_beg;
103 
104  size_t num_bytes_out = 0;
105  LO num_ent_out;
106  num_bytes_out += PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
107  if (static_cast<size_t>(num_ent_out) != num_ent) {
108  return 20; // error code
109  }
110 
111  {
112  Kokkos::pair<int, size_t> p;
113  p = PackTraits<GO>::unpackArray(gids_out.data(), gids_in, num_ent);
114  if (p.first != 0) {
115  return 21; // error code
116  }
117  num_bytes_out += p.second;
118 
119  if (unpack_pids) {
120  p = PackTraits<int>::unpackArray(pids_out.data(), pids_in, num_ent);
121  if (p.first != 0) {
122  return 22; // error code
123  }
124  num_bytes_out += p.second;
125  }
126 
127  p = PackTraits<ST>::unpackArray(vals_out.data(), vals_in, num_ent);
128  if (p.first != 0) {
129  return 23; // error code
130  }
131  num_bytes_out += p.second;
132  }
133 
134  const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
135  if (num_bytes_out != expected_num_bytes) {
136  return 24; // error code
137  }
138  return 0; // no errors
139 } // unpackRow
140 
151 template <class LocalMatrix, class LocalMap, class BufferDeviceType>
153  typedef LocalMatrix local_matrix_type;
154  typedef LocalMap local_map_type;
155 
156  typedef typename local_matrix_type::value_type ST;
157  typedef typename local_map_type::local_ordinal_type LO;
158  typedef typename local_map_type::global_ordinal_type GO;
159  typedef typename local_map_type::device_type DT;
160  typedef typename DT::execution_space XS;
161 
162  typedef Kokkos::View<const size_t*, BufferDeviceType>
163  num_packets_per_lid_type;
164  typedef Kokkos::View<const size_t*, DT> offsets_type;
165  typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
166  typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
167 
168  typedef Kokkos::View<int, DT> error_type;
169  using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
170 
171  static_assert(std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
172  "LocalMap::local_ordinal_type and "
173  "LocalMatrix::ordinal_type must be the same.");
174 
175  local_matrix_type local_matrix;
176  local_map_type local_col_map;
177  input_buffer_type imports;
178  num_packets_per_lid_type num_packets_per_lid;
179  import_lids_type import_lids;
180  Kokkos::View<const LO* [2], DT> batch_info;
181  offsets_type offsets;
182  Tpetra::CombineMode combine_mode;
183  size_t batch_size;
184  size_t bytes_per_value;
185  bool atomic;
186  error_type error_code;
187 
189  const local_matrix_type& local_matrix_in,
190  const local_map_type& local_col_map_in,
191  const input_buffer_type& imports_in,
192  const num_packets_per_lid_type& num_packets_per_lid_in,
193  const import_lids_type& import_lids_in,
194  const Kokkos::View<const LO* [2], DT>& batch_info_in,
195  const offsets_type& offsets_in,
196  const Tpetra::CombineMode combine_mode_in,
197  const size_t batch_size_in,
198  const size_t bytes_per_value_in,
199  const bool atomic_in)
200  : local_matrix(local_matrix_in)
201  , local_col_map(local_col_map_in)
202  , imports(imports_in)
203  , num_packets_per_lid(num_packets_per_lid_in)
204  , import_lids(import_lids_in)
205  , batch_info(batch_info_in)
206  , offsets(offsets_in)
207  , combine_mode(combine_mode_in)
208  , batch_size(batch_size_in)
209  , bytes_per_value(bytes_per_value_in)
210  , atomic(atomic_in)
211  , error_code("error") {}
212 
213  KOKKOS_INLINE_FUNCTION
214  void operator()(member_type team_member) const {
215  using Kokkos::MemoryUnmanaged;
216  using Kokkos::subview;
217  using Kokkos::View;
218 
219  const LO batch = team_member.league_rank();
220  const LO lid_no = batch_info(batch, 0);
221  const LO batch_no = batch_info(batch, 1);
222 
223  const size_t num_bytes = num_packets_per_lid(lid_no);
224 
225  // Only unpack data if there is a nonzero number of bytes.
226  if (num_bytes == 0)
227  return;
228 
229  // there is actually something in the row
230  const LO import_lid = import_lids(lid_no);
231  const size_t buf_size = imports.size();
232  const size_t offset = offsets(lid_no);
233 
234  // Get the number of entries to expect in the received data for this row.
235  LO num_ent_LO = 0;
236  const char* const in_buf = imports.data() + offset;
237  (void)PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
238  const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
239 
240  // Count the number of bytes expected to unpack
241  size_t expected_num_bytes = 0;
242  {
243  expected_num_bytes += PackTraits<LO>::packValueCount(LO(0));
244  expected_num_bytes += num_entries_in_row * PackTraits<GO>::packValueCount(GO(0));
245  expected_num_bytes += num_entries_in_row * PackTraits<ST>::packValueCount(ST());
246  }
247 
248  if (expected_num_bytes > num_bytes) {
249  Kokkos::printf(
250  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
251  "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
252  (int)lid_no, (int)expected_num_bytes, (int)num_bytes);
253 
254  Kokkos::atomic_compare_exchange(error_code.data(), 0, 21);
255  return;
256  }
257 
258  if (offset > buf_size || offset + num_bytes > buf_size) {
259  Kokkos::printf(
260  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
261  "At row %d, the offset (%d) > buffer size (%d)\n",
262  (int)lid_no, (int)offset, (int)buf_size);
263 
264  Kokkos::atomic_compare_exchange(error_code.data(), 0, 22);
265  return;
266  }
267 
268  // Determine the number of entries to unpack in this batch
269  size_t num_entries_in_batch = 0;
270  if (num_entries_in_row <= batch_size)
271  num_entries_in_batch = num_entries_in_row;
272  else if (num_entries_in_row >= (batch_no + 1) * batch_size)
273  num_entries_in_batch = batch_size;
274  else
275  num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
276 
277  const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
278  const size_t num_ent_start = offset;
279  const size_t num_ent_end = num_ent_start + bytes_per_lid;
280 
281  const size_t bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
282  const size_t gids_start = num_ent_end;
283  const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
284 
285  const size_t vals_start = gids_end;
286 
287  const size_t shift = batch_no * batch_size;
288  const char* const num_ent_in = imports.data() + num_ent_start;
289  const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
290  const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
291 
292  LO num_ent_out;
293  (void)PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
294  if (static_cast<size_t>(num_ent_out) != num_entries_in_row) {
295  Kokkos::printf(
296  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
297  "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
298  (int)lid_no, (int)num_entries_in_row, (int)num_ent_out);
299 
300  Kokkos::atomic_compare_exchange(error_code.data(), 0, 23);
301  }
302 
303  constexpr bool matrix_has_sorted_rows = true; // see #6282
304  // Note BMK 6-22: this lambda must use capture-by-value [=] and not capture-by-ref [&].
305  // By ref triggers compiler bug in CUDA 10.
306  Kokkos::parallel_for(
307  Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
308  [=, *this](const LO& j) {
309  size_t distance = 0;
310 
311  GO gid_out;
312  distance = j * bytes_per_gid;
313  (void)PackTraits<GO>::unpackValue(gid_out, gids_in + distance);
314  auto lid_out = local_col_map.getLocalElement(gid_out);
315 
316  // Column indices come in as global indices, in case the
317  // source object's column Map differs from the target object's
318  // (this's) column Map, and must be converted local index values
319 
320  // assume that ST is default constructible
321  ST val_out;
322  distance = j * bytes_per_value;
323  (void)PackTraits<ST>::unpackValue(val_out, vals_in + distance);
324 
325  if (combine_mode == ADD) {
326  // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
327  // different threads don't touch the same row (i.e., no
328  // duplicates in incoming LIDs list).
329  const bool use_atomic_updates = atomic;
330  (void)local_matrix.sumIntoValues(
331  import_lid,
332  &lid_out,
333  1,
334  &val_out,
335  matrix_has_sorted_rows,
336  use_atomic_updates);
337  } else if (combine_mode == REPLACE) {
338  // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
339  // combine mode with multiple incoming rows that touch the same
340  // target matrix entries, so we never need atomic updates.
341  const bool use_atomic_updates = false;
342  (void)local_matrix.replaceValues(
343  import_lid,
344  &lid_out,
345  1,
346  &val_out,
347  matrix_has_sorted_rows,
348  use_atomic_updates);
349  } else {
350  // should never get here
351  Kokkos::printf(
352  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
353  "At row %d, an unknown error occurred during unpack\n",
354  (int)lid_no);
355  Kokkos::atomic_compare_exchange(error_code.data(), 0, 31);
356  }
357  });
358 
359  team_member.team_barrier();
360  }
361 
363  int error() const {
364  auto error_code_h = Kokkos::create_mirror_view_and_copy(
365  Kokkos::HostSpace(), error_code);
366  return error_code_h();
367  }
368 
369 }; // UnpackCrsMatrixAndCombineFunctor
370 
371 struct MaxNumEntTag {};
372 struct TotNumEntTag {};
373 
382 template <class LO, class DT, class BDT>
384  public:
385  typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
386  typedef Kokkos::View<const size_t*, DT> offsets_type;
387  typedef Kokkos::View<const char*, BDT> input_buffer_type;
388  // This needs to be public, since it appears in the argument list of
389  // public methods (see below). Otherwise, build errors may happen.
390  typedef size_t value_type;
391 
392  private:
393  num_packets_per_lid_type num_packets_per_lid;
394  offsets_type offsets;
395  input_buffer_type imports;
396 
397  public:
398  NumEntriesFunctor(const num_packets_per_lid_type num_packets_per_lid_in,
399  const offsets_type& offsets_in,
400  const input_buffer_type& imports_in)
401  : num_packets_per_lid(num_packets_per_lid_in)
402  , offsets(offsets_in)
403  , imports(imports_in) {}
404 
405  KOKKOS_INLINE_FUNCTION void
406  operator()(const MaxNumEntTag, const LO i, value_type& update) const {
407  // Get how many entries to expect in the received data for this row.
408  const size_t num_bytes = num_packets_per_lid(i);
409  if (num_bytes > 0) {
410  LO num_ent_LO = 0; // output argument of unpackValue
411  const char* const in_buf = imports.data() + offsets(i);
412  (void)PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
413  const size_t num_ent = static_cast<size_t>(num_ent_LO);
414 
415  update = (update < num_ent) ? num_ent : update;
416  }
417  }
418 
419  KOKKOS_INLINE_FUNCTION void
420  join(const MaxNumEntTag,
421  value_type& dst,
422  const value_type& src) const {
423  if (dst < src) dst = src;
424  }
425 
426  KOKKOS_INLINE_FUNCTION void
427  operator()(const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
428  // Get how many entries to expect in the received data for this row.
429  const size_t num_bytes = num_packets_per_lid(i);
430  if (num_bytes > 0) {
431  LO num_ent_LO = 0; // output argument of unpackValue
432  const char* const in_buf = imports.data() + offsets(i);
433  (void)PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
434  tot_num_ent += static_cast<size_t>(num_ent_LO);
435  }
436  }
437 }; // NumEntriesFunctor
438 
446 template <class LO, class DT, class BDT>
447 size_t
448 compute_maximum_num_entries(
449  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
450  const Kokkos::View<const size_t*, DT>& offsets,
451  const Kokkos::View<const char*, BDT>& imports) {
452  typedef typename DT::execution_space XS;
453  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
454  MaxNumEntTag>
455  range_policy;
456 
457  NumEntriesFunctor<LO, DT, BDT> functor(num_packets_per_lid, offsets,
458  imports);
459  const LO numRowsToUnpack =
460  static_cast<LO>(num_packets_per_lid.extent(0));
461  size_t max_num_ent = 0;
462  Kokkos::parallel_reduce("Max num entries in CRS",
463  range_policy(0, numRowsToUnpack),
464  functor, max_num_ent);
465  return max_num_ent;
466 }
467 
475 template <class LO, class DT, class BDT>
476 size_t
477 compute_total_num_entries(
478  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
479  const Kokkos::View<const size_t*, DT>& offsets,
480  const Kokkos::View<const char*, BDT>& imports) {
481  typedef typename DT::execution_space XS;
482  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
483  size_t tot_num_ent = 0;
484  NumEntriesFunctor<LO, DT, BDT> functor(num_packets_per_lid, offsets,
485  imports);
486  const LO numRowsToUnpack =
487  static_cast<LO>(num_packets_per_lid.extent(0));
488  Kokkos::parallel_reduce("Total num entries in CRS to unpack",
489  range_policy(0, numRowsToUnpack),
490  functor, tot_num_ent);
491  return tot_num_ent;
492 }
493 
494 template <class LO>
495 KOKKOS_INLINE_FUNCTION
496  size_t
497  unpackRowCount(const char imports[],
498  const size_t offset,
499  const size_t num_bytes) {
500  using PT = PackTraits<LO>;
501 
502  LO num_ent_LO = 0;
503  if (num_bytes > 0) {
504  const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
505  if (p_num_bytes > num_bytes) {
506  return OrdinalTraits<size_t>::invalid();
507  }
508  const char* const in_buf = imports + offset;
509  (void)PT::unpackValue(num_ent_LO, in_buf);
510  }
511  return static_cast<size_t>(num_ent_LO);
512 }
513 
518 template <class View1, class View2>
519 inline bool
520 compute_batch_info(
521  const View1& batches_per_lid,
522  View2& batch_info) {
523  using LO = typename View2::value_type;
524  size_t batch = 0;
525  for (size_t i = 0; i < batches_per_lid.extent(0); i++) {
526  for (size_t batch_no = 0; batch_no < batches_per_lid(i); batch_no++) {
527  batch_info(batch, 0) = static_cast<LO>(i);
528  batch_info(batch, 1) = batch_no;
529  batch++;
530  }
531  }
532  return batch == batch_info.extent(0);
533 }
534 
542 template <class LocalMatrix, class LocalMap, class BufferDeviceType>
543 void unpackAndCombineIntoCrsMatrix(
544  const LocalMatrix& local_matrix,
545  const LocalMap& local_map,
546  const Kokkos::View<const char*, BufferDeviceType>& imports,
547  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
548  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
549  const Tpetra::CombineMode combine_mode) {
550  using ST = typename LocalMatrix::value_type;
551  using LO = typename LocalMap::local_ordinal_type;
552  using DT = typename LocalMap::device_type;
553  using XS = typename DT::execution_space;
554  const char prefix[] =
555  "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
556  "unpackAndCombineIntoCrsMatrix: ";
557 
558  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
559  if (num_import_lids == 0) {
560  // Nothing to unpack
561  return;
562  }
563 
564  {
565  // Check for correct input
566  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
567  std::invalid_argument,
568  prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
569  "static graph (i.e., was constructed with the CrsMatrix constructor "
570  "that takes a const CrsGraph pointer).");
571 
572  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
573  std::invalid_argument,
574  prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
575  "(i.e., was constructed with the CrsMatrix constructor that takes a "
576  "const CrsGraph pointer).");
577 
578  // Unknown combine mode!
579  TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
580  std::invalid_argument,
581  prefix << "Invalid combine mode; should never get "
582  "here! Please report this bug to the Tpetra developers.");
583 
584  // Check that sizes of input objects are consistent.
585  bool bad_num_import_lids =
586  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
587  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
588  std::invalid_argument,
589  prefix << "importLIDs.size() (" << num_import_lids << ") != "
590  "numPacketsPerLID.size() ("
591  << num_packets_per_lid.extent(0) << ").");
592  } // end QA error checking
593 
594  // Get the offsets
595  Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids + 1);
596  computeOffsetsFromCounts(offsets, num_packets_per_lid);
597 
598  // Determine the sizes of the unpack batches
599  size_t max_num_ent = compute_maximum_num_entries<LO, DT>(num_packets_per_lid, offsets, imports);
600  const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize();
601  const size_t batch_size = std::min(default_batch_size, max_num_ent);
602 
603  // To achieve some balance amongst threads, unpack each row in equal size batches
604  size_t num_batches = 0;
605  Kokkos::View<LO* [2], DT> batch_info("", num_batches);
606  Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
607  // Compute meta data that allows batch unpacking
608  Kokkos::parallel_reduce(
609  Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
610  KOKKOS_LAMBDA(const size_t i, size_t& batches) {
611  const size_t num_entries_in_row = unpackRowCount<LO>(
612  imports.data(), offsets(i), num_packets_per_lid(i));
613  batches_per_lid(i) =
614  (num_entries_in_row <= batch_size) ? 1 : num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
615  batches += batches_per_lid(i);
616  },
617  num_batches);
618  Kokkos::resize(batch_info, num_batches);
619 
620  Kokkos::HostSpace host_space;
621  auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
622  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
623  Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
624 
625  auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
626 
627  (void)compute_batch_info(batches_per_lid_h, batch_info_h);
628  // DEEP_COPY REVIEW - HOSTMIRROR-TO-DEVICE
629  Kokkos::deep_copy(XS(), batch_info, batch_info_h);
630 
631  // FIXME (TJF SEP 2017)
632  // The scalar type is not necessarily default constructible
633  size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
634 
635  // Now do the actual unpack!
636  const bool atomic = XS().concurrency() != 1;
637  using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
638  functor f(
639  local_matrix,
640  local_map,
641  imports,
642  num_packets_per_lid,
643  import_lids,
644  batch_info,
645  offsets,
646  combine_mode,
647  batch_size,
648  bytes_per_value,
649  atomic);
650 
651  using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
653  if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid()) {
654  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
655  } else {
656  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
657  }
658 
659  auto error_code = f.error();
660  TEUCHOS_TEST_FOR_EXCEPTION(
661  error_code != 0,
662  std::runtime_error,
663  prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code);
664 } // unpackAndCombineIntoCrsMatrix (Kokkos version)
665 
666 template <class LocalMatrix, class BufferDeviceType>
667 size_t
669  const LocalMatrix& local_matrix,
670  const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
671 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
672  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
673  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
674 #else
675  const Kokkos::View<const char*, BufferDeviceType>& imports,
676  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
677 #endif
678  const size_t num_same_ids) {
679  using Kokkos::parallel_reduce;
680  typedef typename LocalMatrix::ordinal_type LO;
681  typedef typename LocalMatrix::device_type device_type;
682  typedef typename device_type::execution_space XS;
683  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
684  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>> range_policy;
685  typedef BufferDeviceType BDT;
686 
687  size_t count = 0;
688  LO num_items;
689 
690  // Number of matrix entries to unpack (returned by this function).
691  num_items = static_cast<LO>(num_same_ids);
692  if (num_items) {
693  size_t kcnt = 0;
694  parallel_reduce(
695  range_policy(0, num_items),
696  KOKKOS_LAMBDA(const LO lid, size_t& update) {
697  update += static_cast<size_t>(local_matrix.graph.row_map[lid + 1] - local_matrix.graph.row_map[lid]);
698  },
699  kcnt);
700  count += kcnt;
701  }
702 
703  // Count entries copied directly from the source matrix with permuting.
704  num_items = static_cast<LO>(permute_from_lids.extent(0));
705  if (num_items) {
706  size_t kcnt = 0;
707  parallel_reduce(
708  range_policy(0, num_items),
709  KOKKOS_LAMBDA(const LO i, size_t& update) {
710  const LO lid = permute_from_lids(i);
711  update += static_cast<size_t>(local_matrix.graph.row_map[lid + 1] - local_matrix.graph.row_map[lid]);
712  },
713  kcnt);
714  count += kcnt;
715  }
716 
717  {
718  // Count entries received from other MPI processes.
719  const size_type np = num_packets_per_lid.extent(0);
720  Kokkos::View<size_t*, device_type> offsets("offsets", np + 1);
721  computeOffsetsFromCounts(offsets, num_packets_per_lid);
722  count +=
723  compute_total_num_entries<LO, device_type, BDT>(num_packets_per_lid,
724  offsets, imports);
725  }
726 
727  return count;
728 } // unpackAndCombineWithOwningPIDsCount (Kokkos version)
729 
731 template <class LO, class DT, class BDT>
732 int setupRowPointersForRemotes(
733  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
734  const typename PackTraits<LO>::input_array_type& import_lids,
735  const Kokkos::View<const char*, BDT>& imports,
736  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
737  const typename PackTraits<size_t>::input_array_type& offsets) {
738  using Kokkos::parallel_reduce;
739  typedef typename DT::execution_space XS;
740  typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
741  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
742 
743  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
744  const size_type N = num_packets_per_lid.extent(0);
745 
746  int errors = 0;
747  parallel_reduce(
748  "Setup row pointers for remotes",
749  range_policy(0, N),
750  KOKKOS_LAMBDA(const size_t i, int& k_error) {
751  typedef typename std::remove_reference<decltype(tgt_rowptr(0))>::type atomic_incr_type;
752  const size_t num_bytes = num_packets_per_lid(i);
753  const size_t offset = offsets(i);
754  const size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
755  if (num_ent == InvalidNum) {
756  k_error += 1;
757  }
758  Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
759  },
760  errors);
761  return errors;
762 }
763 
764 // Convert array of row lengths to a CRS pointer array
765 template <class DT>
766 void makeCrsRowPtrFromLengths(
767  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
768  const Kokkos::View<size_t*, DT>& new_start_row) {
769  using Kokkos::parallel_scan;
770  typedef typename DT::execution_space XS;
771  typedef typename Kokkos::View<size_t*, DT>::size_type size_type;
772  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
773  const size_type N = new_start_row.extent(0);
774  parallel_scan(
775  range_policy(0, N),
776  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
777  auto cur_val = tgt_rowptr(i);
778  if (final) {
779  tgt_rowptr(i) = update;
780  new_start_row(i) = tgt_rowptr(i);
781  }
782  update += cur_val;
783  });
784 }
785 
786 template <class LocalMatrix, class LocalMap>
787 void copyDataFromSameIDs(
788  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
789  const typename PackTraits<int>::output_array_type& tgt_pids,
790  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
791  const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
792  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
793  const typename PackTraits<int>::input_array_type& src_pids,
794  const LocalMatrix& local_matrix,
795  const LocalMap& local_col_map,
796  const size_t num_same_ids,
797  const int my_pid) {
798  using Kokkos::parallel_for;
799  typedef typename LocalMap::device_type DT;
800  typedef typename LocalMap::local_ordinal_type LO;
801  typedef typename DT::execution_space XS;
802  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>> range_policy;
803 
804  parallel_for(
805  range_policy(0, num_same_ids),
806  KOKKOS_LAMBDA(const size_t i) {
807  typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
808 
809  const LO src_lid = static_cast<LO>(i);
810  size_t src_row = local_matrix.graph.row_map(src_lid);
811 
812  const LO tgt_lid = static_cast<LO>(i);
813  const size_t tgt_row = tgt_rowptr(tgt_lid);
814 
815  const size_t nsr = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
816  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
817 
818  for (size_t j = local_matrix.graph.row_map(src_lid);
819  j < local_matrix.graph.row_map(src_lid + 1); ++j) {
820  LO src_col = local_matrix.graph.entries(j);
821  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
822  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
823  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
824  }
825  });
826 }
827 
828 template <class LocalMatrix, class LocalMap>
829 void copyDataFromPermuteIDs(
830  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
831  const typename PackTraits<int>::output_array_type& tgt_pids,
832  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
833  const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
834  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
835  const typename PackTraits<int>::input_array_type& src_pids,
836  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
837  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
838  const LocalMatrix& local_matrix,
839  const LocalMap& local_col_map,
840  const int my_pid) {
841  using Kokkos::parallel_for;
842  typedef typename LocalMap::device_type DT;
843  typedef typename LocalMap::local_ordinal_type LO;
844  typedef typename DT::execution_space XS;
845  typedef typename PackTraits<LO>::input_array_type::size_type size_type;
846  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
847 
848  const size_type num_permute_to_lids = permute_to_lids.extent(0);
849 
850  parallel_for(
851  range_policy(0, num_permute_to_lids),
852  KOKKOS_LAMBDA(const size_t i) {
853  typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
854 
855  const LO src_lid = permute_from_lids(i);
856  const size_t src_row = local_matrix.graph.row_map(src_lid);
857 
858  const LO tgt_lid = permute_to_lids(i);
859  const size_t tgt_row = tgt_rowptr(tgt_lid);
860 
861  size_t nsr = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
862  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
863 
864  for (size_t j = local_matrix.graph.row_map(src_lid);
865  j < local_matrix.graph.row_map(src_lid + 1); ++j) {
866  LO src_col = local_matrix.graph.entries(j);
867  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
868  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
869  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
870  }
871  });
872 }
873 
874 template <typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
875 int unpackAndCombineIntoCrsArrays2(
876  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
877  const typename PackTraits<int>::output_array_type& tgt_pids,
878  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
879  const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
880  const typename PackTraits<size_t>::input_array_type& offsets,
881  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
882 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
883  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
884  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
885 #else
886  const Kokkos::View<const char*, BufferDeviceType>& imports,
887  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
888 #endif
889  const LocalMatrix& /* local_matrix */,
890  const LocalMap /*& local_col_map*/,
891  const int my_pid,
892  const size_t bytes_per_value) {
893  using Kokkos::atomic_fetch_add;
894  using Kokkos::MemoryUnmanaged;
895  using Kokkos::parallel_reduce;
896  using Kokkos::subview;
897  using Kokkos::View;
899  typedef typename LocalMap::device_type DT;
900  typedef typename LocalMap::local_ordinal_type LO;
901  typedef typename LocalMap::global_ordinal_type GO;
902  typedef typename LocalMatrix::value_type ST;
903  typedef typename DT::execution_space XS;
904  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
905  typedef typename Kokkos::pair<size_type, size_type> slice;
906  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type>> range_policy;
907 
908  typedef View<int*, DT, MemoryUnmanaged> pids_out_type;
909  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
910  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
911 
912  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
913 
914  int errors = 0;
915  const size_type num_import_lids = import_lids.size();
916 
917  // RemoteIDs: Loop structure following UnpackAndCombine
918  parallel_reduce(
919  "Unpack and combine into CRS",
920  range_policy(0, num_import_lids),
921  KOKKOS_LAMBDA(const size_t i, int& k_error) {
922  typedef typename std::remove_reference<decltype(new_start_row(0))>::type atomic_incr_type;
923  const size_t num_bytes = num_packets_per_lid(i);
924  const size_t offset = offsets(i);
925  if (num_bytes == 0) {
926  // Empty buffer means that the row is empty.
927  return;
928  }
929  size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
930  if (num_ent == InvalidNum) {
931  k_error += 1;
932  return;
933  }
934  const LO lcl_row = import_lids(i);
935  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
936  const size_t end_row = start_row + num_ent;
937 
938  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
939  vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
940  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
941 
942  k_error += unpackRow<ST, LO, GO>(gids_out, pids_out, vals_out,
943  imports.data(), offset, num_bytes,
944  num_ent, bytes_per_value);
945 
946  // Correct target PIDs.
947  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
948  const int pid = pids_out(j);
949  pids_out(j) = (pid != my_pid) ? pid : -1;
950  }
951  },
952  errors);
953 
954  return errors;
955 }
956 
957 template <typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
959  const LocalMatrix& local_matrix,
960  const LocalMap& local_col_map,
961  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
962 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
963  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
964  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
965 #else
966  const Kokkos::View<const char*, BufferDeviceType>& imports,
967  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
968 #endif
969  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
970  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
971  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
972  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
973  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
974  const typename PackTraits<int>::input_array_type& src_pids,
975  const typename PackTraits<int>::output_array_type& tgt_pids,
976  const size_t num_same_ids,
977  const size_t tgt_num_rows,
978  const size_t tgt_num_nonzeros,
979  const int my_tgt_pid,
980  const size_t bytes_per_value) {
981  using Kokkos::MemoryUnmanaged;
982  using Kokkos::parallel_for;
983  using Kokkos::subview;
984  using Kokkos::View;
986  typedef typename LocalMap::device_type DT;
987  typedef typename LocalMap::local_ordinal_type LO;
988  typedef typename DT::execution_space XS;
989  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
990  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>> range_policy;
991  typedef BufferDeviceType BDT;
992 
993  const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
994 
995  const size_t N = tgt_num_rows;
996 
997  // In the case of reduced communicators, the sourceMatrix won't have
998  // the right "my_pid", so thus we have to supply it.
999  const int my_pid = my_tgt_pid;
1000 
1001  // Zero the rowptr
1002  parallel_for(
1003  range_policy(0, N + 1),
1004  KOKKOS_LAMBDA(const size_t i) {
1005  tgt_rowptr(i) = 0;
1006  });
1007 
1008  // same IDs: Always first, always in the same place
1009  parallel_for(
1010  range_policy(0, num_same_ids),
1011  KOKKOS_LAMBDA(const size_t i) {
1012  const LO tgt_lid = static_cast<LO>(i);
1013  const LO src_lid = static_cast<LO>(i);
1014  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
1015  });
1016 
1017  // Permute IDs: Still local, but reordered
1018  const size_type num_permute_to_lids = permute_to_lids.extent(0);
1019  parallel_for(
1020  range_policy(0, num_permute_to_lids),
1021  KOKKOS_LAMBDA(const size_t i) {
1022  const LO tgt_lid = permute_to_lids(i);
1023  const LO src_lid = permute_from_lids(i);
1024  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid + 1) - local_matrix.graph.row_map(src_lid);
1025  });
1026 
1027  // Get the offsets from the number of packets per LID
1028  const size_type num_import_lids = import_lids.extent(0);
1029  View<size_t*, DT> offsets("offsets", num_import_lids + 1);
1030  computeOffsetsFromCounts(offsets, num_packets_per_lid);
1031 
1032 #ifdef HAVE_TPETRA_DEBUG
1033  {
1034  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1035  const bool condition =
1036  nth_offset_h != static_cast<size_t>(imports.extent(0));
1037  TEUCHOS_TEST_FOR_EXCEPTION(condition, std::logic_error, prefix << "The final offset in bytes " << nth_offset_h << " != imports.size() = " << imports.extent(0) << ". Please report this bug to the Tpetra developers.");
1038  }
1039 #endif // HAVE_TPETRA_DEBUG
1040 
1041  // Setup row pointers for remotes
1042  int k_error =
1043  setupRowPointersForRemotes<LO, DT, BDT>(tgt_rowptr,
1044  import_lids, imports, num_packets_per_lid, offsets);
1045  TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix << " Error transferring data to target row pointers. "
1046  "Please report this bug to the Tpetra developers.");
1047 
1048  // If multiple processes contribute to the same row, we may need to
1049  // update row offsets. This tracks that.
1050  View<size_t*, DT> new_start_row("new_start_row", N + 1);
1051 
1052  // Turn row length into a real CRS row pointer
1053  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1054 
1055  // SameIDs: Copy the data over
1056  copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1057  tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1058 
1059  copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1060  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1061  local_matrix, local_col_map, my_pid);
1062 
1063  if (imports.extent(0) <= 0) {
1064  return;
1065  }
1066 
1067  int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1068  tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1069  local_matrix, local_col_map, my_pid, bytes_per_value);
1070  TEUCHOS_TEST_FOR_EXCEPTION(
1071  unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1072  "should never happen. Please report this bug to the Tpetra developers.");
1073 
1074  return;
1075 }
1076 
1077 } // namespace UnpackAndCombineCrsMatrixImpl
1078 
1118 template <typename ST, typename LO, typename GO, typename Node>
1120  const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1121  const Teuchos::ArrayView<const char>& imports,
1122  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1123  const Teuchos::ArrayView<const LO>& importLIDs,
1124  size_t /* constantNumPackets */,
1125  CombineMode combineMode) {
1126  using Kokkos::View;
1127  typedef typename Node::device_type device_type;
1128  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
1129  static_assert(std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1130  "Node::device_type and LocalMatrix::device_type must be the same.");
1131 
1132  // Convert all Teuchos::Array to Kokkos::View.
1133  device_type outputDevice;
1134 
1135  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1136  // them to device. Since unpacking is done directly in to the local matrix
1137  // (lclMatrix), no copying needs to be performed after unpacking.
1138  auto num_packets_per_lid_d =
1139  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1140  numPacketsPerLID.size(), true, "num_packets_per_lid");
1141 
1142  auto import_lids_d =
1143  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1144  importLIDs.size(), true, "import_lids");
1145 
1146  auto imports_d =
1147  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1148  imports.size(), true, "imports");
1149 
1150  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1151  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1152 
1153  // KDDKDD This loop doesn't appear to do anything; what is it?
1154  // KDDKDD for (int i=0; i<importLIDs.size(); i++)
1155  // KDDKDD {
1156  // KDDKDD auto lclRow = importLIDs[i];
1157  // KDDKDD Teuchos::ArrayView<const LO> A_indices;
1158  // KDDKDD Teuchos::ArrayView<const ST> A_values;
1159  // KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1160  // KDDKDD }
1161  // Now do the actual unpack!
1162  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1163  local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1164  import_lids_d, combineMode);
1165 }
1166 
1167 template <typename ST, typename LO, typename GO, typename NT>
1168 void unpackCrsMatrixAndCombineNew(
1169  const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1170  Kokkos::DualView<char*,
1172  imports,
1173  Kokkos::DualView<size_t*,
1175  numPacketsPerLID,
1176  const Kokkos::DualView<const LO*,
1178  const size_t /* constantNumPackets */,
1179  const CombineMode combineMode) {
1180  using Kokkos::View;
1181  using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1182  using dist_object_type = DistObject<char, LO, GO, NT>;
1183  using device_type = typename crs_matrix_type::device_type;
1184  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1185  using buffer_device_type = typename dist_object_type::buffer_device_type;
1186 
1187  static_assert(std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1188  "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1189  "must be the same.");
1190 
1191  if (numPacketsPerLID.need_sync_device()) {
1192  numPacketsPerLID.sync_device();
1193  }
1194  auto num_packets_per_lid_d = numPacketsPerLID.view_device();
1195 
1196  TEUCHOS_ASSERT(!importLIDs.need_sync_device());
1197  auto import_lids_d = importLIDs.view_device();
1198 
1199  if (imports.need_sync_device()) {
1200  imports.sync_device();
1201  }
1202  auto imports_d = imports.view_device();
1203 
1204  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1205  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1206  typedef decltype(local_col_map) local_map_type;
1207 
1208  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1209  local_matrix_device_type,
1210  local_map_type,
1211  buffer_device_type>(local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1212  import_lids_d, combineMode);
1213 }
1214 
1261 //
1270 template <typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1271 size_t
1273  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& sourceMatrix,
1274  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1275  const Teuchos::ArrayView<const char>& imports,
1276  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1277  size_t /* constantNumPackets */,
1278  CombineMode /* combineMode */,
1279  size_t numSameIDs,
1280  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1281  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs) {
1282  using Kokkos::MemoryUnmanaged;
1283  using Kokkos::View;
1284  typedef typename Node::device_type DT;
1285  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1286 
1287  TEUCHOS_TEST_FOR_EXCEPTION(permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1288  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
1289  "permuteFromLIDs.size() = "
1290  << permuteFromLIDs.size() << ".");
1291  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1292  // process, then the matrix is neither locally nor globally indexed.
1293  const bool locallyIndexed = sourceMatrix.isLocallyIndexed();
1294  TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix << "The input "
1295  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1296  TEUCHOS_TEST_FOR_EXCEPTION(importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1297  prefix << "importLIDs.size() = " << importLIDs.size() << " != "
1298  "numPacketsPerLID.size() = "
1299  << numPacketsPerLID.size() << ".");
1300 
1301  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1302 
1303  using kokkos_device_type = Kokkos::Device<typename Node::device_type::execution_space,
1304  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
1305 
1306 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1307  Kokkos::View<LocalOrdinal const*, kokkos_device_type, void, void> permute_from_lids_d =
1308 #else
1309  Kokkos::View<LocalOrdinal const*, kokkos_device_type> permute_from_lids_d =
1310 #endif
1312  permuteFromLIDs.getRawPtr(),
1313  permuteFromLIDs.size(), true,
1314  "permute_from_lids");
1315 
1316 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1317  Kokkos::View<const char*, kokkos_device_type, void, void> imports_d =
1318 #else
1319  Kokkos::View<const char*, kokkos_device_type> imports_d =
1320 #endif
1322  imports.getRawPtr(),
1323  imports.size(), true,
1324  "imports");
1325 
1326 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1327  Kokkos::View<const size_t*, kokkos_device_type, void, void> num_packets_per_lid_d =
1328 #else
1329  Kokkos::View<const size_t*, kokkos_device_type> num_packets_per_lid_d =
1330 #endif
1332  numPacketsPerLID.getRawPtr(),
1333  numPacketsPerLID.size(), true,
1334  "num_packets_per_lid");
1335 
1337  local_matrix, permute_from_lids_d, imports_d,
1338  num_packets_per_lid_d, numSameIDs);
1339 } // unpackAndCombineWithOwningPIDsCount (Teuchos::Array version)
1340 
1355 
1356 template <typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1359  const Kokkos::View<LocalOrdinal const*,
1360  Kokkos::Device<typename Node::device_type::execution_space,
1361  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1362 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1363  ,
1364  void, void
1365 #endif
1366  >
1367  import_lids_d,
1368  const Kokkos::View<const char*,
1369  Kokkos::Device<typename Node::device_type::execution_space,
1370  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1371 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1372  ,
1373  void, void
1374 #endif
1375  >
1376  imports_d,
1377  const Kokkos::View<const size_t*,
1378  Kokkos::Device<typename Node::device_type::execution_space,
1379  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1380 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1381  ,
1382  void, void
1383 #endif
1384  >
1385  num_packets_per_lid_d,
1386  const size_t numSameIDs,
1387  const Kokkos::View<LocalOrdinal const*,
1388  Kokkos::Device<typename Node::device_type::execution_space,
1389  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1390 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1391  ,
1392  void, void
1393 #endif
1394  >
1395  permute_to_lids_d,
1396  const Kokkos::View<LocalOrdinal const*,
1397  Kokkos::Device<typename Node::device_type::execution_space,
1398  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1399 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1400  ,
1401  void, void
1402 #endif
1403  >
1404  permute_from_lids_d,
1405  size_t TargetNumRows,
1406  const int MyTargetPID,
1407  Kokkos::View<size_t*, typename Node::device_type>& crs_rowptr_d,
1408  Kokkos::View<GlobalOrdinal*, typename Node::device_type>& crs_colind_d,
1409  Kokkos::View<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*, typename Node::device_type>& crs_vals_d,
1410  const Teuchos::ArrayView<const int>& SourcePids,
1411  Kokkos::View<int*, typename Node::device_type>& TargetPids) {
1412  using execution_space = typename Node::execution_space;
1414 
1415  using Kokkos::deep_copy;
1416  using Kokkos::View;
1417 
1418  using Teuchos::ArrayView;
1419  using Teuchos::outArg;
1420  using Teuchos::REDUCE_MAX;
1421  using Teuchos::reduceAll;
1422 
1423  typedef typename Node::device_type DT;
1424 
1426  typedef typename matrix_type::impl_scalar_type ST;
1427 
1428  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1429 #ifdef HAVE_TPETRA_MMM_TIMINGS
1430  using Teuchos::TimeMonitor;
1431  Teuchos::RCP<TimeMonitor> tm;
1432 #endif
1433 
1434  using Kokkos::MemoryUnmanaged;
1435 
1436  TEUCHOS_TEST_FOR_EXCEPTION(permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1437  prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size() << " != "
1438  "permute_from_lids_d.size() = "
1439  << permute_from_lids_d.size() << ".");
1440  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1441  // process, then the matrix is neither locally nor globally indexed.
1442  const bool locallyIndexed = sourceMatrix.isLocallyIndexed();
1443  TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix << "The input "
1444  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1445  TEUCHOS_TEST_FOR_EXCEPTION(((size_t)import_lids_d.size()) != num_packets_per_lid_d.size(), std::invalid_argument,
1446  prefix << "import_lids_d.size() = " << import_lids_d.size() << " != "
1447  "num_packets_per_lid_d.size() = "
1448  << num_packets_per_lid_d.size() << ".");
1449 
1450  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1451 
1452  // TargetNumNonzeros is number of nonzeros in local matrix.
1453 #ifdef HAVE_TPETRA_MMM_TIMINGS
1454  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1455 #endif
1456  size_t TargetNumNonzeros =
1458  local_matrix, permute_from_lids_d, imports_d,
1459  num_packets_per_lid_d, numSameIDs);
1460 #ifdef HAVE_TPETRA_MMM_TIMINGS
1461  tm = Teuchos::null;
1462 #endif
1463 
1464 #ifdef HAVE_TPETRA_MMM_TIMINGS
1465  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1466 #endif
1467  Kokkos::resize(crs_rowptr_d, TargetNumRows + 1);
1468  Kokkos::resize(crs_colind_d, TargetNumNonzeros);
1469  Kokkos::resize(crs_vals_d, TargetNumNonzeros);
1470 #ifdef HAVE_TPETRA_MMM_TIMINGS
1471  tm = Teuchos::null;
1472 #endif
1473 
1474  TEUCHOS_TEST_FOR_EXCEPTION(
1475  permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1476  prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size()
1477  << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1478 
1479  if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1480  Kokkos::resize(TargetPids, TargetNumNonzeros);
1481  }
1482  Kokkos::deep_copy(execution_space(), TargetPids, -1);
1483 
1484  // Grab pointers for sourceMatrix
1485  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1486 
1487 #ifdef HAVE_TPETRA_MMM_TIMINGS
1488  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1489 #endif
1490  // Convert input arrays to Kokkos::Views
1491  DT outputDevice;
1492 
1493  auto src_pids_d =
1494  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1495  SourcePids.size(), true, "src_pids");
1496 
1497 #ifdef HAVE_TPETRA_MMM_TIMINGS
1498  tm = Teuchos::null;
1499 #endif
1500 
1501  size_t bytes_per_value = 0;
1503  // assume that ST is default constructible
1504  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1505  } else {
1506  // Since the packed data come from the source matrix, we can use the source
1507  // matrix to get the number of bytes per Scalar value stored in the matrix.
1508  // This assumes that all Scalar values in the source matrix require the same
1509  // number of bytes. If the source matrix has no entries on the calling
1510  // process, then we hope that some process does have some idea how big
1511  // a Scalar value is. Of course, if no processes have any entries, then no
1512  // values should be packed (though this does assume that in our packing
1513  // scheme, rows with zero entries take zero bytes).
1514  size_t bytes_per_value_l = 0;
1515  if (local_matrix.values.extent(0) > 0) {
1516  const ST& val = local_matrix.values(0);
1517  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1518  } else {
1519  const ST& val = crs_vals_d(0);
1520  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1521  }
1522  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1523  Teuchos::REDUCE_MAX,
1524  bytes_per_value_l,
1525  outArg(bytes_per_value));
1526  }
1527 
1528 #ifdef HAVE_TPETRA_MMM_TIMINGS
1529  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1530 #endif
1532  local_matrix, local_col_map, import_lids_d, imports_d,
1533  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1534  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1535  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1536  bytes_per_value);
1537 #ifdef HAVE_TPETRA_MMM_TIMINGS
1538  tm = Teuchos::null;
1539 #endif
1540 
1541  // Copy outputs back to host
1542 #ifdef HAVE_TPETRA_MMM_TIMINGS
1543  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1544 #endif
1545 
1546  Kokkos::parallel_for(
1547  "setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0, TargetPids.size()), KOKKOS_LAMBDA(const size_t i) {
1548  if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
1549  });
1550 
1551 } // unpackAndCombineIntoCrsArrays
1552 
1553 template <typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1556  const Kokkos::View<LocalOrdinal const*,
1557  Kokkos::Device<typename Node::device_type::execution_space,
1558  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1559 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1560  ,
1561  void, void
1562 #endif
1563  >
1564  import_lids_d,
1565  const Kokkos::View<const char*,
1566  Kokkos::Device<typename Node::device_type::execution_space,
1567  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1568 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1569  ,
1570  void, void
1571 #endif
1572  >
1573  imports_d,
1574  const Kokkos::View<const size_t*,
1575  Kokkos::Device<typename Node::device_type::execution_space,
1576  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1577 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1578  ,
1579  void, void
1580 #endif
1581  >
1582  num_packets_per_lid_d,
1583  const size_t numSameIDs,
1584  const Kokkos::View<LocalOrdinal const*,
1585  Kokkos::Device<typename Node::device_type::execution_space,
1586  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1587 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1588  ,
1589  void, void
1590 #endif
1591  >
1592  permute_to_lids_d,
1593  const Kokkos::View<LocalOrdinal const*,
1594  Kokkos::Device<typename Node::device_type::execution_space,
1595  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>
1596 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1597  ,
1598  void, void
1599 #endif
1600  >
1601  permute_from_lids_d,
1602  size_t TargetNumRows,
1603  const int MyTargetPID,
1604  Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1605  Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1606  Teuchos::ArrayRCP<Scalar>& CRS_vals,
1607  const Teuchos::ArrayView<const int>& SourcePids,
1608  Teuchos::Array<int>& TargetPids) {
1609  using execution_space = typename Node::execution_space;
1611 
1612  using Kokkos::deep_copy;
1613  using Kokkos::View;
1614 
1615  using Teuchos::ArrayView;
1616  using Teuchos::outArg;
1617  using Teuchos::REDUCE_MAX;
1618  using Teuchos::reduceAll;
1619 
1620  typedef typename Node::device_type DT;
1621 
1623  typedef typename matrix_type::impl_scalar_type ST;
1624 
1625  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1626 #ifdef HAVE_TPETRA_MMM_TIMINGS
1627  using Teuchos::TimeMonitor;
1628  Teuchos::RCP<TimeMonitor> tm;
1629 #endif
1630 
1631  using Kokkos::MemoryUnmanaged;
1632 
1633  TEUCHOS_TEST_FOR_EXCEPTION(permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1634  prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size() << " != "
1635  "permute_from_lids_d.size() = "
1636  << permute_from_lids_d.size() << ".");
1637  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1638  // process, then the matrix is neither locally nor globally indexed.
1639  const bool locallyIndexed = sourceMatrix.isLocallyIndexed();
1640  TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix << "The input "
1641  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1642  TEUCHOS_TEST_FOR_EXCEPTION(((size_t)import_lids_d.size()) != num_packets_per_lid_d.size(), std::invalid_argument,
1643  prefix << "import_lids_d.size() = " << import_lids_d.size() << " != "
1644  "num_packets_per_lid_d.size() = "
1645  << num_packets_per_lid_d.size() << ".");
1646 
1647  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1648 
1649  // TargetNumNonzeros is number of nonzeros in local matrix.
1650 #ifdef HAVE_TPETRA_MMM_TIMINGS
1651  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1652 #endif
1653  size_t TargetNumNonzeros =
1655  local_matrix, permute_from_lids_d, imports_d,
1656  num_packets_per_lid_d, numSameIDs);
1657 #ifdef HAVE_TPETRA_MMM_TIMINGS
1658  tm = Teuchos::null;
1659 #endif
1660 
1661 #ifdef HAVE_TPETRA_MMM_TIMINGS
1662  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1663 #endif
1664  CRS_rowptr.resize(TargetNumRows + 1);
1665  CRS_colind.resize(TargetNumNonzeros);
1666  CRS_vals.resize(TargetNumNonzeros);
1667  Teuchos::ArrayRCP<ST> const& CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1668 #ifdef HAVE_TPETRA_MMM_TIMINGS
1669  tm = Teuchos::null;
1670 #endif
1671 
1672  TEUCHOS_TEST_FOR_EXCEPTION(
1673  permute_to_lids_d.size() != permute_from_lids_d.size(), std::invalid_argument,
1674  prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size()
1675  << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1676 
1677  // Preseed TargetPids with -1 for local
1678  if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1679  TargetPids.resize(TargetNumNonzeros);
1680  }
1681  TargetPids.assign(TargetNumNonzeros, -1);
1682 
1683  // Grab pointers for sourceMatrix
1684  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1685 
1686 #ifdef HAVE_TPETRA_MMM_TIMINGS
1687  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1688 #endif
1689  // Convert input arrays to Kokkos::Views
1690  DT outputDevice;
1691 
1692  auto crs_rowptr_d =
1693  create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1694  CRS_rowptr.size(), true, "crs_rowptr");
1695 
1696  auto crs_colind_d =
1697  create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1698  CRS_colind.size(), true, "crs_colidx");
1699 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1700  static_assert(!std::is_same<
1701  typename std::remove_const<
1702  typename std::decay<
1703  decltype(CRS_vals_impl_scalar_type)>::type::value_type>::type,
1704  std::complex<double>>::value,
1705  "CRS_vals::value_type is std::complex<double>; this should never happen"
1706  ", since std::complex does not work in Kokkos::View objects.");
1707 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1708 
1709  auto crs_vals_d =
1710  create_mirror_view_from_raw_host_array(outputDevice, CRS_vals_impl_scalar_type.getRawPtr(),
1711  CRS_vals_impl_scalar_type.size(), true, "crs_vals");
1712 
1713 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1714  static_assert(!std::is_same<
1715  typename decltype(crs_vals_d)::non_const_value_type,
1716  std::complex<double>>::value,
1717  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1718  "never happen, since std::complex does not work in Kokkos::View objects.");
1719 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1720 
1721  auto src_pids_d =
1722  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1723  SourcePids.size(), true, "src_pids");
1724 
1725  auto tgt_pids_d =
1726  create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1727  TargetPids.size(), true, "tgt_pids");
1728 
1729 #ifdef HAVE_TPETRA_MMM_TIMINGS
1730  tm = Teuchos::null;
1731 #endif
1732 
1733  size_t bytes_per_value = 0;
1735  // assume that ST is default constructible
1736  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1737  } else {
1738  // Since the packed data come from the source matrix, we can use the source
1739  // matrix to get the number of bytes per Scalar value stored in the matrix.
1740  // This assumes that all Scalar values in the source matrix require the same
1741  // number of bytes. If the source matrix has no entries on the calling
1742  // process, then we hope that some process does have some idea how big
1743  // a Scalar value is. Of course, if no processes have any entries, then no
1744  // values should be packed (though this does assume that in our packing
1745  // scheme, rows with zero entries take zero bytes).
1746  size_t bytes_per_value_l = 0;
1747  if (local_matrix.values.extent(0) > 0) {
1748  const ST& val = local_matrix.values(0);
1749  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1750  } else {
1751  const ST& val = crs_vals_d(0);
1752  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1753  }
1754  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1755  Teuchos::REDUCE_MAX,
1756  bytes_per_value_l,
1757  outArg(bytes_per_value));
1758  }
1759 
1760 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1761  static_assert(!std::is_same<
1762  typename decltype(crs_vals_d)::non_const_value_type,
1763  std::complex<double>>::value,
1764  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1765  "never happen, since std::complex does not work in Kokkos::View objects.");
1766 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1767 
1768 #ifdef HAVE_TPETRA_MMM_TIMINGS
1769  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1770 #endif
1772  local_matrix, local_col_map, import_lids_d, imports_d,
1773  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1774  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1775  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1776  bytes_per_value);
1777 #ifdef HAVE_TPETRA_MMM_TIMINGS
1778  tm = Teuchos::null;
1779 #endif
1780 
1781  // Copy outputs back to host
1782 #ifdef HAVE_TPETRA_MMM_TIMINGS
1783  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1784 #endif
1785  typename decltype(crs_rowptr_d)::host_mirror_type crs_rowptr_h(
1786  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1787  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1788  deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1789 
1790  typename decltype(crs_colind_d)::host_mirror_type crs_colind_h(
1791  CRS_colind.getRawPtr(), CRS_colind.size());
1792  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1793  deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1794 
1795  typename decltype(crs_vals_d)::host_mirror_type crs_vals_h(
1796  CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1797  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1798  deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1799 
1800  typename decltype(tgt_pids_d)::host_mirror_type tgt_pids_h(
1801  TargetPids.getRawPtr(), TargetPids.size());
1802  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1803  deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1804 
1805 } // unpackAndCombineIntoCrsArrays
1806 
1807 } // namespace Details
1808 } // namespace Tpetra
1809 
1810 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON(ST, LO, GO, NT) \
1811  template void \
1812  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT>( \
1813  const CrsMatrix<ST, LO, GO, NT>&, \
1814  const Teuchos::ArrayView<const char>&, \
1815  const Teuchos::ArrayView<const size_t>&, \
1816  const Teuchos::ArrayView<const LO>&, \
1817  size_t, \
1818  CombineMode); \
1819  template size_t \
1820  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT>( \
1821  const CrsMatrix<ST, LO, GO, NT>&, \
1822  const Teuchos::ArrayView<const LO>&, \
1823  const Teuchos::ArrayView<const char>&, \
1824  const Teuchos::ArrayView<const size_t>&, \
1825  size_t, \
1826  CombineMode, \
1827  size_t, \
1828  const Teuchos::ArrayView<const LO>&, \
1829  const Teuchos::ArrayView<const LO>&); \
1830  template void \
1831  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT>( \
1832  const CrsMatrix<ST, LO, GO, NT>&, \
1833  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1834  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1835  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1836  const size_t, \
1837  const CombineMode); \
1838  template void \
1839  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1840  const CrsMatrix<ST, LO, GO, NT>&, \
1841  const Kokkos::View<LO const*, \
1842  Kokkos::Device<typename NT::device_type::execution_space, \
1843  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1844  void, void>, \
1845  const Kokkos::View<const char*, \
1846  Kokkos::Device<typename NT::device_type::execution_space, \
1847  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1848  void, void>, \
1849  const Kokkos::View<const size_t*, \
1850  Kokkos::Device<typename NT::device_type::execution_space, \
1851  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1852  void, void>, \
1853  const size_t, \
1854  const Kokkos::View<LO const*, \
1855  Kokkos::Device<typename NT::device_type::execution_space, \
1856  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1857  void, void>, \
1858  const Kokkos::View<LO const*, \
1859  Kokkos::Device<typename NT::device_type::execution_space, \
1860  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1861  void, void>, \
1862  size_t, \
1863  const int, \
1864  Kokkos::View<size_t*, typename NT::device_type>&, \
1865  Kokkos::View<GO*, typename NT::device_type>&, \
1866  Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*, typename NT::device_type>&, \
1867  const Teuchos::ArrayView<const int>&, \
1868  Kokkos::View<int*, typename NT::device_type>&); \
1869  template void \
1870  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1871  const CrsMatrix<ST, LO, GO, NT>&, \
1872  const Kokkos::View<LO const*, \
1873  Kokkos::Device<typename NT::device_type::execution_space, \
1874  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1875  void, void>, \
1876  const Kokkos::View<const char*, \
1877  Kokkos::Device<typename NT::device_type::execution_space, \
1878  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1879  void, void>, \
1880  const Kokkos::View<const size_t*, \
1881  Kokkos::Device<typename NT::device_type::execution_space, \
1882  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1883  void, void>, \
1884  const size_t, \
1885  const Kokkos::View<LO const*, \
1886  Kokkos::Device<typename NT::device_type::execution_space, \
1887  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1888  void, void>, \
1889  const Kokkos::View<LO const*, \
1890  Kokkos::Device<typename NT::device_type::execution_space, \
1891  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1892  void, void>, \
1893  size_t, \
1894  const int, \
1895  Teuchos::ArrayRCP<size_t>&, \
1896  Teuchos::ArrayRCP<GO>&, \
1897  Teuchos::ArrayRCP<ST>&, \
1898  const Teuchos::ArrayView<const int>&, \
1899  Teuchos::Array<int>&);
1900 
1901 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF(ST, LO, GO, NT) \
1902  template void \
1903  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT>( \
1904  const CrsMatrix<ST, LO, GO, NT>&, \
1905  const Teuchos::ArrayView<const char>&, \
1906  const Teuchos::ArrayView<const size_t>&, \
1907  const Teuchos::ArrayView<const LO>&, \
1908  size_t, \
1909  CombineMode); \
1910  template size_t \
1911  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT>( \
1912  const CrsMatrix<ST, LO, GO, NT>&, \
1913  const Teuchos::ArrayView<const LO>&, \
1914  const Teuchos::ArrayView<const char>&, \
1915  const Teuchos::ArrayView<const size_t>&, \
1916  size_t, \
1917  CombineMode, \
1918  size_t, \
1919  const Teuchos::ArrayView<const LO>&, \
1920  const Teuchos::ArrayView<const LO>&); \
1921  template void \
1922  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT>( \
1923  const CrsMatrix<ST, LO, GO, NT>&, \
1924  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1925  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1926  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1927  const size_t, \
1928  const CombineMode); \
1929  template void \
1930  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1931  const CrsMatrix<ST, LO, GO, NT>&, \
1932  const Kokkos::View<LO const*, \
1933  Kokkos::Device<typename NT::device_type::execution_space, \
1934  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1935  const Kokkos::View<const char*, \
1936  Kokkos::Device<typename NT::device_type::execution_space, \
1937  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1938  const Kokkos::View<const size_t*, \
1939  Kokkos::Device<typename NT::device_type::execution_space, \
1940  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1941  const size_t, \
1942  const Kokkos::View<LO const*, \
1943  Kokkos::Device<typename NT::device_type::execution_space, \
1944  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1945  const Kokkos::View<LO const*, \
1946  Kokkos::Device<typename NT::device_type::execution_space, \
1947  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1948  size_t, \
1949  const int, \
1950  Kokkos::View<size_t*, typename NT::device_type>&, \
1951  Kokkos::View<GO*, typename NT::device_type>&, \
1952  Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*, typename NT::device_type>&, \
1953  const Teuchos::ArrayView<const int>&, \
1954  Kokkos::View<int*, typename NT::device_type>&); \
1955  template void \
1956  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT>( \
1957  const CrsMatrix<ST, LO, GO, NT>&, \
1958  const Kokkos::View<LO const*, \
1959  Kokkos::Device<typename NT::device_type::execution_space, \
1960  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1961  const Kokkos::View<const char*, \
1962  Kokkos::Device<typename NT::device_type::execution_space, \
1963  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1964  const Kokkos::View<const size_t*, \
1965  Kokkos::Device<typename NT::device_type::execution_space, \
1966  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1967  const size_t, \
1968  const Kokkos::View<LO const*, \
1969  Kokkos::Device<typename NT::device_type::execution_space, \
1970  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1971  const Kokkos::View<LO const*, \
1972  Kokkos::Device<typename NT::device_type::execution_space, \
1973  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>>, \
1974  size_t, \
1975  const int, \
1976  Teuchos::ArrayRCP<size_t>&, \
1977  Teuchos::ArrayRCP<GO>&, \
1978  Teuchos::ArrayRCP<ST>&, \
1979  const Teuchos::ArrayView<const int>&, \
1980  Teuchos::Array<int>&);
1981 
1982 #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4
1983 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT(ST, LO, GO, NT) \
1984  TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_ON(ST, LO, GO, NT)
1985 #else
1986 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT(ST, LO, GO, NT) \
1987  TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT_KOKKOS_DEPRECATED_CODE_4_OFF(ST, LO, GO, NT)
1988 #endif
1989 
1990 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for &quot;invalid&quot; (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
&quot;Local&quot; part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don&#39;t currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...