Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
11 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
12 
13 #include <memory>
14 #include <string>
15 #include "TpetraCore_config.h"
16 #include "Kokkos_Core.hpp"
17 #include "Teuchos_Array.hpp"
18 #include "Teuchos_ArrayView.hpp"
19 #include "Teuchos_OrdinalTraits.hpp"
20 #include "Teuchos_TimeMonitor.hpp"
28 #include "Tpetra_Details_DefaultTypes.hpp"
30 
51 
52 namespace Tpetra {
53 
54 //
55 // Users must never rely on anything in the Details namespace.
56 //
57 namespace Details {
58 
59 namespace UnpackAndCombineCrsMatrixImpl {
60 
70 template<class ST, class LO, class GO>
71 KOKKOS_FUNCTION int
72 unpackRow(const typename PackTraits<GO>::output_array_type& gids_out,
73  const typename PackTraits<int>::output_array_type& pids_out,
74  const typename PackTraits<ST>::output_array_type& vals_out,
75  const char imports[],
76  const size_t offset,
77  const size_t /* num_bytes */,
78  const size_t num_ent,
79  const size_t bytes_per_value)
80 {
81  if (num_ent == 0) {
82  // Empty rows always take zero bytes, to ensure sparsity.
83  return 0;
84  }
85  bool unpack_pids = pids_out.size() > 0;
86 
87  const size_t num_ent_beg = offset;
88  const size_t num_ent_len = PackTraits<LO>::packValueCount (LO (0));
89 
90  const size_t gids_beg = num_ent_beg + num_ent_len;
91  const size_t gids_len =
92  num_ent * PackTraits<GO>::packValueCount (GO (0));
93 
94  const size_t pids_beg = gids_beg + gids_len;
95  const size_t pids_len = unpack_pids ?
96  size_t (num_ent * PackTraits<int>::packValueCount (int (0))) :
97  size_t (0);
98 
99  const size_t vals_beg = gids_beg + gids_len + pids_len;
100  const size_t vals_len = num_ent * bytes_per_value;
101 
102  const char* const num_ent_in = imports + num_ent_beg;
103  const char* const gids_in = imports + gids_beg;
104  const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
105  const char* const vals_in = imports + vals_beg;
106 
107  size_t num_bytes_out = 0;
108  LO num_ent_out;
109  num_bytes_out += PackTraits<LO>::unpackValue (num_ent_out, num_ent_in);
110  if (static_cast<size_t> (num_ent_out) != num_ent) {
111  return 20; // error code
112  }
113 
114  {
115  Kokkos::pair<int, size_t> p;
116  p = PackTraits<GO>::unpackArray (gids_out.data (), gids_in, num_ent);
117  if (p.first != 0) {
118  return 21; // error code
119  }
120  num_bytes_out += p.second;
121 
122  if (unpack_pids) {
123  p = PackTraits<int>::unpackArray (pids_out.data (), pids_in, num_ent);
124  if (p.first != 0) {
125  return 22; // error code
126  }
127  num_bytes_out += p.second;
128  }
129 
130  p = PackTraits<ST>::unpackArray (vals_out.data (), vals_in, num_ent);
131  if (p.first != 0) {
132  return 23; // error code
133  }
134  num_bytes_out += p.second;
135  }
136 
137  const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
138  if (num_bytes_out != expected_num_bytes) {
139  return 24; // error code
140  }
141  return 0; // no errors
142 } //unpackRow
143 
154 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
156  typedef LocalMatrix local_matrix_type;
157  typedef LocalMap local_map_type;
158 
159  typedef typename local_matrix_type::value_type ST;
160  typedef typename local_map_type::local_ordinal_type LO;
161  typedef typename local_map_type::global_ordinal_type GO;
162  typedef typename local_map_type::device_type DT;
163  typedef typename DT::execution_space XS;
164 
165  typedef Kokkos::View<const size_t*, BufferDeviceType>
166  num_packets_per_lid_type;
167  typedef Kokkos::View<const size_t*, DT> offsets_type;
168  typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
169  typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
170 
171  typedef Kokkos::View<int, DT> error_type;
172  using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
173 
174  static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
175  "LocalMap::local_ordinal_type and "
176  "LocalMatrix::ordinal_type must be the same.");
177 
178  local_matrix_type local_matrix;
179  local_map_type local_col_map;
180  input_buffer_type imports;
181  num_packets_per_lid_type num_packets_per_lid;
182  import_lids_type import_lids;
183  Kokkos::View<const LO*[2], DT> batch_info;
184  offsets_type offsets;
185  Tpetra::CombineMode combine_mode;
186  size_t batch_size;
187  size_t bytes_per_value;
188  bool atomic;
189  error_type error_code;
190 
192  const local_matrix_type& local_matrix_in,
193  const local_map_type& local_col_map_in,
194  const input_buffer_type& imports_in,
195  const num_packets_per_lid_type& num_packets_per_lid_in,
196  const import_lids_type& import_lids_in,
197  const Kokkos::View<const LO*[2], DT>& batch_info_in,
198  const offsets_type& offsets_in,
199  const Tpetra::CombineMode combine_mode_in,
200  const size_t batch_size_in,
201  const size_t bytes_per_value_in,
202  const bool atomic_in) :
203  local_matrix (local_matrix_in),
204  local_col_map (local_col_map_in),
205  imports (imports_in),
206  num_packets_per_lid (num_packets_per_lid_in),
207  import_lids (import_lids_in),
208  batch_info (batch_info_in),
209  offsets (offsets_in),
210  combine_mode (combine_mode_in),
211  batch_size (batch_size_in),
212  bytes_per_value (bytes_per_value_in),
213  atomic (atomic_in),
214  error_code("error")
215  {}
216 
217  KOKKOS_INLINE_FUNCTION
218  void operator()(member_type team_member) const
219  {
220  using Kokkos::View;
221  using Kokkos::subview;
222  using Kokkos::MemoryUnmanaged;
223 
224  const LO batch = team_member.league_rank();
225  const LO lid_no = batch_info(batch, 0);
226  const LO batch_no = batch_info(batch, 1);
227 
228  const size_t num_bytes = num_packets_per_lid(lid_no);
229 
230  // Only unpack data if there is a nonzero number of bytes.
231  if (num_bytes == 0)
232  return;
233 
234  // there is actually something in the row
235  const LO import_lid = import_lids(lid_no);
236  const size_t buf_size = imports.size();
237  const size_t offset = offsets(lid_no);
238 
239  // Get the number of entries to expect in the received data for this row.
240  LO num_ent_LO = 0;
241  const char* const in_buf = imports.data() + offset;
242  (void) PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
243  const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
244 
245  // Count the number of bytes expected to unpack
246  size_t expected_num_bytes = 0;
247  {
248  expected_num_bytes += PackTraits<LO>::packValueCount(LO(0));
249  expected_num_bytes += num_entries_in_row * PackTraits<GO>::packValueCount(GO(0));
250  expected_num_bytes += num_entries_in_row * PackTraits<ST>::packValueCount(ST());
251  }
252 
253  if (expected_num_bytes > num_bytes)
254  {
255 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
256 #ifndef KOKKOS_ENABLE_SYCL
257  printf(
258  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
259  "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
260  (int) lid_no, (int) expected_num_bytes, (int) num_bytes
261  );
262 #endif
263  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
264  return;
265  }
266 
267  if (offset > buf_size || offset + num_bytes > buf_size)
268  {
269 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
270 #ifndef KOKKOS_ENABLE_SYCL
271  printf(
272  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
273  "At row %d, the offset (%d) > buffer size (%d)\n",
274  (int) lid_no, (int) offset, (int) buf_size
275  );
276 #endif
277  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
278  return;
279  }
280 
281  // Determine the number of entries to unpack in this batch
282  size_t num_entries_in_batch = 0;
283  if (num_entries_in_row <= batch_size)
284  num_entries_in_batch = num_entries_in_row;
285  else if (num_entries_in_row >= (batch_no + 1) * batch_size)
286  num_entries_in_batch = batch_size;
287  else
288  num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
289 
290  const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
291  const size_t num_ent_start = offset;
292  const size_t num_ent_end = num_ent_start + bytes_per_lid;
293 
294  const size_t bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
295  const size_t gids_start = num_ent_end;
296  const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
297 
298  const size_t vals_start = gids_end;
299 
300  const size_t shift = batch_no * batch_size;
301  const char* const num_ent_in = imports.data() + num_ent_start;
302  const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
303  const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
304 
305  LO num_ent_out;
306  (void)PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
307  if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
308  {
309 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
310 #ifndef KOKKOS_ENABLE_SYCL
311  printf(
312  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
313  "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
314  (int) lid_no, (int) num_entries_in_row, (int) num_ent_out
315  );
316 #endif
317  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
318  }
319 
320  constexpr bool matrix_has_sorted_rows = true; // see #6282
321  //Note BMK 6-22: this lambda must use capture-by-value [=] and not capture-by-ref [&].
322  //By ref triggers compiler bug in CUDA 10.
323  Kokkos::parallel_for(
324  Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
325  [=](const LO& j)
326  {
327  size_t distance = 0;
328 
329  GO gid_out;
330  distance = j * bytes_per_gid;
331  (void) PackTraits<GO>::unpackValue(gid_out, gids_in + distance);
332  auto lid_out = local_col_map.getLocalElement(gid_out);
333 
334  // Column indices come in as global indices, in case the
335  // source object's column Map differs from the target object's
336  // (this's) column Map, and must be converted local index values
337 
338  // assume that ST is default constructible
339  ST val_out;
340  distance = j * bytes_per_value;
341  (void) PackTraits<ST>::unpackValue(val_out, vals_in + distance);
342 
343  if (combine_mode == ADD) {
344  // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
345  // different threads don't touch the same row (i.e., no
346  // duplicates in incoming LIDs list).
347  const bool use_atomic_updates = atomic;
348  (void)local_matrix.sumIntoValues(
349  import_lid,
350  &lid_out,
351  1,
352  &val_out,
353  matrix_has_sorted_rows,
354  use_atomic_updates
355  );
356  } else if (combine_mode == REPLACE) {
357  // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
358  // combine mode with multiple incoming rows that touch the same
359  // target matrix entries, so we never need atomic updates.
360  const bool use_atomic_updates = false;
361  (void)local_matrix.replaceValues(
362  import_lid,
363  &lid_out,
364  1,
365  &val_out,
366  matrix_has_sorted_rows,
367  use_atomic_updates
368  );
369  } else {
370  // should never get here
371 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
372 #ifndef KOKKOS_ENABLE_SYCL
373  printf(
374  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
375  "At row %d, an unknown error occurred during unpack\n", (int) lid_no
376  );
377 #endif
378  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
379  }
380  }
381  );
382 
383  team_member.team_barrier();
384 
385  }
386 
388  int error() const {
389  auto error_code_h = Kokkos::create_mirror_view_and_copy(
390  Kokkos::HostSpace(), error_code
391  );
392  return error_code_h();
393  }
394 
395 }; //UnpackCrsMatrixAndCombineFunctor
396 
397 struct MaxNumEntTag {};
398 struct TotNumEntTag {};
399 
408 template<class LO, class DT, class BDT>
410 public:
411  typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
412  typedef Kokkos::View<const size_t*, DT> offsets_type;
413  typedef Kokkos::View<const char*, BDT> input_buffer_type;
414  // This needs to be public, since it appears in the argument list of
415  // public methods (see below). Otherwise, build errors may happen.
416  typedef size_t value_type;
417 
418 private:
419  num_packets_per_lid_type num_packets_per_lid;
420  offsets_type offsets;
421  input_buffer_type imports;
422 
423 public:
424  NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
425  const offsets_type& offsets_in,
426  const input_buffer_type& imports_in) :
427  num_packets_per_lid (num_packets_per_lid_in),
428  offsets (offsets_in),
429  imports (imports_in)
430  {}
431 
432  KOKKOS_INLINE_FUNCTION void
433  operator() (const MaxNumEntTag, const LO i, value_type& update) const {
434  // Get how many entries to expect in the received data for this row.
435  const size_t num_bytes = num_packets_per_lid(i);
436  if (num_bytes > 0) {
437  LO num_ent_LO = 0; // output argument of unpackValue
438  const char* const in_buf = imports.data () + offsets(i);
439  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
440  const size_t num_ent = static_cast<size_t> (num_ent_LO);
441 
442  update = (update < num_ent) ? num_ent : update;
443  }
444  }
445 
446  KOKKOS_INLINE_FUNCTION void
447  join (const MaxNumEntTag,
448  value_type& dst,
449  const value_type& src) const
450  {
451  if (dst < src) dst = src;
452  }
453 
454  KOKKOS_INLINE_FUNCTION void
455  operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
456  // Get how many entries to expect in the received data for this row.
457  const size_t num_bytes = num_packets_per_lid(i);
458  if (num_bytes > 0) {
459  LO num_ent_LO = 0; // output argument of unpackValue
460  const char* const in_buf = imports.data () + offsets(i);
461  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
462  tot_num_ent += static_cast<size_t> (num_ent_LO);
463  }
464  }
465 }; //NumEntriesFunctor
466 
474 template<class LO, class DT, class BDT>
475 size_t
476 compute_maximum_num_entries (
477  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
478  const Kokkos::View<const size_t*, DT>& offsets,
479  const Kokkos::View<const char*, BDT>& imports)
480 {
481  typedef typename DT::execution_space XS;
482  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
483  MaxNumEntTag> range_policy;
484 
485  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
486  imports);
487  const LO numRowsToUnpack =
488  static_cast<LO> (num_packets_per_lid.extent (0));
489  size_t max_num_ent = 0;
490  Kokkos::parallel_reduce ("Max num entries in CRS",
491  range_policy (0, numRowsToUnpack),
492  functor, max_num_ent);
493  return max_num_ent;
494 }
495 
503 template<class LO, class DT, class BDT>
504 size_t
505 compute_total_num_entries (
506  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
507  const Kokkos::View<const size_t*, DT>& offsets,
508  const Kokkos::View<const char*, BDT>& imports)
509 {
510  typedef typename DT::execution_space XS;
511  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
512  size_t tot_num_ent = 0;
513  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
514  imports);
515  const LO numRowsToUnpack =
516  static_cast<LO> (num_packets_per_lid.extent (0));
517  Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
518  range_policy (0, numRowsToUnpack),
519  functor, tot_num_ent);
520  return tot_num_ent;
521 }
522 
523 template<class LO>
524 KOKKOS_INLINE_FUNCTION
525 size_t
526 unpackRowCount(const char imports[],
527  const size_t offset,
528  const size_t num_bytes)
529 {
530  using PT = PackTraits<LO>;
531 
532  LO num_ent_LO = 0;
533  if (num_bytes > 0) {
534  const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
535  if (p_num_bytes > num_bytes) {
536  return OrdinalTraits<size_t>::invalid();
537  }
538  const char* const in_buf = imports + offset;
539  (void) PT::unpackValue(num_ent_LO, in_buf);
540  }
541  return static_cast<size_t>(num_ent_LO);
542 }
543 
548 template<class View1, class View2>
549 inline
550 bool
551 compute_batch_info(
552  const View1& batches_per_lid,
553  View2& batch_info
554 )
555 {
556  using LO = typename View2::value_type;
557  size_t batch = 0;
558  for (size_t i=0; i<batches_per_lid.extent(0); i++)
559  {
560  for (size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
561  {
562  batch_info(batch, 0) = static_cast<LO>(i);
563  batch_info(batch, 1) = batch_no;
564  batch++;
565  }
566  }
567  return batch == batch_info.extent(0);
568 }
569 
577 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
578 void
579 unpackAndCombineIntoCrsMatrix(
580  const LocalMatrix& local_matrix,
581  const LocalMap& local_map,
582  const Kokkos::View<const char*, BufferDeviceType>& imports,
583  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
584  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
585  const Tpetra::CombineMode combine_mode)
586 {
587  using ST = typename LocalMatrix::value_type;
588  using LO = typename LocalMap::local_ordinal_type;
589  using DT = typename LocalMap::device_type;
590  using XS = typename DT::execution_space;
591  const char prefix[] =
592  "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
593  "unpackAndCombineIntoCrsMatrix: ";
594 
595  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
596  if (num_import_lids == 0) {
597  // Nothing to unpack
598  return;
599  }
600 
601  {
602  // Check for correct input
603  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
604  std::invalid_argument,
605  prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
606  "static graph (i.e., was constructed with the CrsMatrix constructor "
607  "that takes a const CrsGraph pointer).");
608 
609  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
610  std::invalid_argument,
611  prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
612  "(i.e., was constructed with the CrsMatrix constructor that takes a "
613  "const CrsGraph pointer).");
614 
615  // Unknown combine mode!
616  TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
617  std::invalid_argument,
618  prefix << "Invalid combine mode; should never get "
619  "here! Please report this bug to the Tpetra developers.");
620 
621  // Check that sizes of input objects are consistent.
622  bool bad_num_import_lids =
623  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
624  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
625  std::invalid_argument,
626  prefix << "importLIDs.size() (" << num_import_lids << ") != "
627  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
628  } // end QA error checking
629 
630  // Get the offsets
631  Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
632  computeOffsetsFromCounts(offsets, num_packets_per_lid);
633 
634  // Determine the sizes of the unpack batches
635  size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
636  const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize();
637  const size_t batch_size = std::min(default_batch_size, max_num_ent);
638 
639  // To achieve some balance amongst threads, unpack each row in equal size batches
640  size_t num_batches = 0;
641  Kokkos::View<LO*[2], DT> batch_info("", num_batches);
642  Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
643  // Compute meta data that allows batch unpacking
644  Kokkos::parallel_reduce(
645  Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
646  KOKKOS_LAMBDA(const size_t i, size_t& batches)
647  {
648  const size_t num_entries_in_row = unpackRowCount<LO>(
649  imports.data(), offsets(i), num_packets_per_lid(i)
650  );
651  batches_per_lid(i) =
652  (num_entries_in_row <= batch_size) ?
653  1 :
654  num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
655  batches += batches_per_lid(i);
656  },
657  num_batches
658  );
659  Kokkos::resize(batch_info, num_batches);
660 
661  Kokkos::HostSpace host_space;
662  auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
663  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
664  Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
665 
666  auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
667 
668  (void) compute_batch_info(batches_per_lid_h, batch_info_h);
669  // DEEP_COPY REVIEW - HOSTMIRROR-TO-DEVICE
670  Kokkos::deep_copy(XS(), batch_info, batch_info_h);
671 
672  // FIXME (TJF SEP 2017)
673  // The scalar type is not necessarily default constructible
674  size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
675 
676  // Now do the actual unpack!
677  const bool atomic = XS().concurrency() != 1;
678  using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
679  functor f(
680  local_matrix,
681  local_map,
682  imports,
683  num_packets_per_lid,
684  import_lids,
685  batch_info,
686  offsets,
687  combine_mode,
688  batch_size,
689  bytes_per_value,
690  atomic
691  );
692 
693  using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
695  if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
696  {
697  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
698  }
699  else
700  {
701  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
702  }
703 
704  auto error_code = f.error();
705  TEUCHOS_TEST_FOR_EXCEPTION(
706  error_code != 0,
707  std::runtime_error,
708  prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
709  );
710 } //unpackAndCombineIntoCrsMatrix (Kokkos version)
711 
712 template<class LocalMatrix, class BufferDeviceType>
713 size_t
715  const LocalMatrix& local_matrix,
716  const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
717  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
718  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
719  const size_t num_same_ids)
720 {
721  using Kokkos::parallel_reduce;
722  typedef typename LocalMatrix::ordinal_type LO;
723  typedef typename LocalMatrix::device_type device_type;
724  typedef typename device_type::execution_space XS;
725  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
726  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
727  typedef BufferDeviceType BDT;
728 
729  size_t count = 0;
730  LO num_items;
731 
732  // Number of matrix entries to unpack (returned by this function).
733  num_items = static_cast<LO>(num_same_ids);
734  if (num_items) {
735  size_t kcnt = 0;
736  parallel_reduce(range_policy(0, num_items),
737  KOKKOS_LAMBDA(const LO lid, size_t& update) {
738  update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
739  -local_matrix.graph.row_map[lid]);
740  }, kcnt);
741  count += kcnt;
742  }
743 
744  // Count entries copied directly from the source matrix with permuting.
745  num_items = static_cast<LO>(permute_from_lids.extent(0));
746  if (num_items) {
747  size_t kcnt = 0;
748  parallel_reduce(range_policy(0, num_items),
749  KOKKOS_LAMBDA(const LO i, size_t& update) {
750  const LO lid = permute_from_lids(i);
751  update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
752  - local_matrix.graph.row_map[lid]);
753  }, kcnt);
754  count += kcnt;
755  }
756 
757  {
758  // Count entries received from other MPI processes.
759  const size_type np = num_packets_per_lid.extent(0);
760  Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
761  computeOffsetsFromCounts(offsets, num_packets_per_lid);
762  count +=
763  compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
764  offsets, imports);
765  }
766 
767  return count;
768 } //unpackAndCombineWithOwningPIDsCount (Kokkos version)
769 
771 template<class LO, class DT, class BDT>
772 int
773 setupRowPointersForRemotes(
774  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
775  const typename PackTraits<LO>::input_array_type& import_lids,
776  const Kokkos::View<const char*, BDT>& imports,
777  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
778  const typename PackTraits<size_t>::input_array_type& offsets)
779 {
780  using Kokkos::parallel_reduce;
781  typedef typename DT::execution_space XS;
782  typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
783  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
784 
785  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
786  const size_type N = num_packets_per_lid.extent(0);
787 
788  int errors = 0;
789  parallel_reduce ("Setup row pointers for remotes",
790  range_policy (0, N),
791  KOKKOS_LAMBDA (const size_t i, int& k_error) {
792  typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
793  const size_t num_bytes = num_packets_per_lid(i);
794  const size_t offset = offsets(i);
795  const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
796  if (num_ent == InvalidNum) {
797  k_error += 1;
798  }
799  Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
800  }, errors);
801  return errors;
802 }
803 
804 // Convert array of row lengths to a CRS pointer array
805 template<class DT>
806 void
807 makeCrsRowPtrFromLengths(
808  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
809  const Kokkos::View<size_t*,DT>& new_start_row)
810 {
811  using Kokkos::parallel_scan;
812  typedef typename DT::execution_space XS;
813  typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
814  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
815  const size_type N = new_start_row.extent(0);
816  parallel_scan(range_policy(0, N),
817  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
818  auto cur_val = tgt_rowptr(i);
819  if (final) {
820  tgt_rowptr(i) = update;
821  new_start_row(i) = tgt_rowptr(i);
822  }
823  update += cur_val;
824  }
825  );
826 }
827 
828 template<class LocalMatrix, class LocalMap>
829 void
830 copyDataFromSameIDs(
831  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
832  const typename PackTraits<int>::output_array_type& tgt_pids,
833  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
834  const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
835  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
836  const typename PackTraits<int>::input_array_type& src_pids,
837  const LocalMatrix& local_matrix,
838  const LocalMap& local_col_map,
839  const size_t num_same_ids,
840  const int my_pid)
841 {
842  using Kokkos::parallel_for;
843  typedef typename LocalMap::device_type DT;
844  typedef typename LocalMap::local_ordinal_type LO;
845  typedef typename DT::execution_space XS;
846  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
847 
848  parallel_for(range_policy(0, num_same_ids),
849  KOKKOS_LAMBDA(const size_t i) {
850  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
851 
852  const LO src_lid = static_cast<LO>(i);
853  size_t src_row = local_matrix.graph.row_map(src_lid);
854 
855  const LO tgt_lid = static_cast<LO>(i);
856  const size_t tgt_row = tgt_rowptr(tgt_lid);
857 
858  const size_t nsr = local_matrix.graph.row_map(src_lid+1)
859  - local_matrix.graph.row_map(src_lid);
860  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
861 
862  for (size_t j=local_matrix.graph.row_map(src_lid);
863  j<local_matrix.graph.row_map(src_lid+1); ++j) {
864  LO src_col = local_matrix.graph.entries(j);
865  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
866  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
867  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
868  }
869  }
870  );
871 }
872 
873 template<class LocalMatrix, class LocalMap>
874 void
875 copyDataFromPermuteIDs(
876  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
877  const typename PackTraits<int>::output_array_type& tgt_pids,
878  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
879  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
880  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
881  const typename PackTraits<int>::input_array_type& src_pids,
882  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
883  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
884  const LocalMatrix& local_matrix,
885  const LocalMap& local_col_map,
886  const int my_pid)
887 {
888  using Kokkos::parallel_for;
889  typedef typename LocalMap::device_type DT;
890  typedef typename LocalMap::local_ordinal_type LO;
891  typedef typename DT::execution_space XS;
892  typedef typename PackTraits<LO>::input_array_type::size_type size_type;
893  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
894 
895  const size_type num_permute_to_lids = permute_to_lids.extent(0);
896 
897  parallel_for(range_policy(0, num_permute_to_lids),
898  KOKKOS_LAMBDA(const size_t i) {
899  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
900 
901  const LO src_lid = permute_from_lids(i);
902  const size_t src_row = local_matrix.graph.row_map(src_lid);
903 
904  const LO tgt_lid = permute_to_lids(i);
905  const size_t tgt_row = tgt_rowptr(tgt_lid);
906 
907  size_t nsr = local_matrix.graph.row_map(src_lid+1)
908  - local_matrix.graph.row_map(src_lid);
909  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
910 
911  for (size_t j=local_matrix.graph.row_map(src_lid);
912  j<local_matrix.graph.row_map(src_lid+1); ++j) {
913  LO src_col = local_matrix.graph.entries(j);
914  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
915  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
916  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
917  }
918  }
919  );
920 }
921 
922 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
923 int
924 unpackAndCombineIntoCrsArrays2(
925  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
926  const typename PackTraits<int>::output_array_type& tgt_pids,
927  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
928  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
929  const typename PackTraits<size_t>::input_array_type& offsets,
930  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
931  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
932  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
933  const LocalMatrix& /* local_matrix */,
934  const LocalMap /*& local_col_map*/,
935  const int my_pid,
936  const size_t bytes_per_value)
937 {
938  using Kokkos::View;
939  using Kokkos::subview;
940  using Kokkos::MemoryUnmanaged;
941  using Kokkos::parallel_reduce;
942  using Kokkos::atomic_fetch_add;
944  typedef typename LocalMap::device_type DT;
945  typedef typename LocalMap::local_ordinal_type LO;
946  typedef typename LocalMap::global_ordinal_type GO;
947  typedef typename LocalMatrix::value_type ST;
948  typedef typename DT::execution_space XS;
949  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
950  typedef typename Kokkos::pair<size_type, size_type> slice;
951  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
952 
953  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
954  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
955  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
956 
957  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
958 
959  int errors = 0;
960  const size_type num_import_lids = import_lids.size();
961 
962  // RemoteIDs: Loop structure following UnpackAndCombine
963  parallel_reduce ("Unpack and combine into CRS",
964  range_policy (0, num_import_lids),
965  KOKKOS_LAMBDA (const size_t i, int& k_error) {
966  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
967  const size_t num_bytes = num_packets_per_lid(i);
968  const size_t offset = offsets(i);
969  if (num_bytes == 0) {
970  // Empty buffer means that the row is empty.
971  return;
972  }
973  size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
974  if (num_ent == InvalidNum) {
975  k_error += 1;
976  return;
977  }
978  const LO lcl_row = import_lids(i);
979  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
980  const size_t end_row = start_row + num_ent;
981 
982  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
983  vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
984  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
985 
986  k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
987  imports.data(), offset, num_bytes,
988  num_ent, bytes_per_value);
989 
990  // Correct target PIDs.
991  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
992  const int pid = pids_out(j);
993  pids_out(j) = (pid != my_pid) ? pid : -1;
994  }
995  }, errors);
996 
997  return errors;
998 }
999 
1000 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
1001 void
1003  const LocalMatrix & local_matrix,
1004  const LocalMap & local_col_map,
1005  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
1006  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
1007  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
1008  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1009  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1010  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1011  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1012  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1013  const typename PackTraits<int>::input_array_type& src_pids,
1014  const typename PackTraits<int>::output_array_type& tgt_pids,
1015  const size_t num_same_ids,
1016  const size_t tgt_num_rows,
1017  const size_t tgt_num_nonzeros,
1018  const int my_tgt_pid,
1019  const size_t bytes_per_value)
1020 {
1021  using Kokkos::View;
1022  using Kokkos::subview;
1023  using Kokkos::parallel_for;
1024  using Kokkos::MemoryUnmanaged;
1026  typedef typename LocalMap::device_type DT;
1027  typedef typename LocalMap::local_ordinal_type LO;
1028  typedef typename DT::execution_space XS;
1029  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1030  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1031  typedef BufferDeviceType BDT;
1032 
1033  const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
1034 
1035  const size_t N = tgt_num_rows;
1036 
1037  // In the case of reduced communicators, the sourceMatrix won't have
1038  // the right "my_pid", so thus we have to supply it.
1039  const int my_pid = my_tgt_pid;
1040 
1041  // Zero the rowptr
1042  parallel_for(range_policy(0, N+1),
1043  KOKKOS_LAMBDA(const size_t i) {
1044  tgt_rowptr(i) = 0;
1045  }
1046  );
1047 
1048  // same IDs: Always first, always in the same place
1049  parallel_for(range_policy(0, num_same_ids),
1050  KOKKOS_LAMBDA(const size_t i) {
1051  const LO tgt_lid = static_cast<LO>(i);
1052  const LO src_lid = static_cast<LO>(i);
1053  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1054  - local_matrix.graph.row_map(src_lid);
1055  }
1056  );
1057 
1058  // Permute IDs: Still local, but reordered
1059  const size_type num_permute_to_lids = permute_to_lids.extent(0);
1060  parallel_for(range_policy(0, num_permute_to_lids),
1061  KOKKOS_LAMBDA(const size_t i) {
1062  const LO tgt_lid = permute_to_lids(i);
1063  const LO src_lid = permute_from_lids(i);
1064  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1065  - local_matrix.graph.row_map(src_lid);
1066  }
1067  );
1068 
1069  // Get the offsets from the number of packets per LID
1070  const size_type num_import_lids = import_lids.extent(0);
1071  View<size_t*, DT> offsets("offsets", num_import_lids+1);
1072  computeOffsetsFromCounts(offsets, num_packets_per_lid);
1073 
1074 #ifdef HAVE_TPETRA_DEBUG
1075  {
1076  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1077  const bool condition =
1078  nth_offset_h != static_cast<size_t>(imports.extent (0));
1079  TEUCHOS_TEST_FOR_EXCEPTION
1080  (condition, std::logic_error, prefix
1081  << "The final offset in bytes " << nth_offset_h
1082  << " != imports.size() = " << imports.extent(0)
1083  << ". Please report this bug to the Tpetra developers.");
1084  }
1085 #endif // HAVE_TPETRA_DEBUG
1086 
1087  // Setup row pointers for remotes
1088  int k_error =
1089  setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1090  import_lids, imports, num_packets_per_lid, offsets);
1091  TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1092  << " Error transferring data to target row pointers. "
1093  "Please report this bug to the Tpetra developers.");
1094 
1095  // If multiple processes contribute to the same row, we may need to
1096  // update row offsets. This tracks that.
1097  View<size_t*, DT> new_start_row ("new_start_row", N+1);
1098 
1099  // Turn row length into a real CRS row pointer
1100  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1101 
1102  // SameIDs: Copy the data over
1103  copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1104  tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1105 
1106  copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1107  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1108  local_matrix, local_col_map, my_pid);
1109 
1110  if (imports.extent(0) <= 0) {
1111  return;
1112  }
1113 
1114  int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1115  tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1116  local_matrix, local_col_map, my_pid, bytes_per_value);
1117  TEUCHOS_TEST_FOR_EXCEPTION(
1118  unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1119  "should never happen. Please report this bug to the Tpetra developers.");
1120 
1121  return;
1122 }
1123 
1124 } // namespace UnpackAndCombineCrsMatrixImpl
1125 
1165 template<typename ST, typename LO, typename GO, typename Node>
1166 void
1168  const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1169  const Teuchos::ArrayView<const char>& imports,
1170  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1171  const Teuchos::ArrayView<const LO>& importLIDs,
1172  size_t /* constantNumPackets */,
1173  CombineMode combineMode)
1174 {
1175  using Kokkos::View;
1176  typedef typename Node::device_type device_type;
1177  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
1178  static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1179  "Node::device_type and LocalMatrix::device_type must be the same.");
1180 
1181  // Convert all Teuchos::Array to Kokkos::View.
1182  device_type outputDevice;
1183 
1184  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1185  // them to device. Since unpacking is done directly in to the local matrix
1186  // (lclMatrix), no copying needs to be performed after unpacking.
1187  auto num_packets_per_lid_d =
1188  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1189  numPacketsPerLID.size(), true, "num_packets_per_lid");
1190 
1191  auto import_lids_d =
1192  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1193  importLIDs.size(), true, "import_lids");
1194 
1195  auto imports_d =
1196  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1197  imports.size(), true, "imports");
1198 
1199  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1200  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1201 
1202 //KDDKDD This loop doesn't appear to do anything; what is it?
1203 //KDDKDD for (int i=0; i<importLIDs.size(); i++)
1204 //KDDKDD {
1205 //KDDKDD auto lclRow = importLIDs[i];
1206 //KDDKDD Teuchos::ArrayView<const LO> A_indices;
1207 //KDDKDD Teuchos::ArrayView<const ST> A_values;
1208 //KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1209 //KDDKDD }
1210  // Now do the actual unpack!
1211  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1212  local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1213  import_lids_d, combineMode);
1214 
1215 }
1216 
1217 template<typename ST, typename LO, typename GO, typename NT>
1218 void
1219 unpackCrsMatrixAndCombineNew(
1220  const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1221  Kokkos::DualView<char*,
1223  Kokkos::DualView<size_t*,
1224  typename DistObject<char, LO, GO, NT>::buffer_device_type> numPacketsPerLID,
1225  const Kokkos::DualView<const LO*,
1227  const size_t /* constantNumPackets */,
1228  const CombineMode combineMode)
1229 {
1230  using Kokkos::View;
1231  using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1232  using dist_object_type = DistObject<char, LO, GO, NT>;
1233  using device_type = typename crs_matrix_type::device_type;
1234  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1235  using buffer_device_type = typename dist_object_type::buffer_device_type;
1236 
1237  static_assert
1238  (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1239  "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1240  "must be the same.");
1241 
1242  if (numPacketsPerLID.need_sync_device()) {
1243  numPacketsPerLID.sync_device ();
1244  }
1245  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1246 
1247  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1248  auto import_lids_d = importLIDs.view_device ();
1249 
1250  if (imports.need_sync_device()) {
1251  imports.sync_device ();
1252  }
1253  auto imports_d = imports.view_device ();
1254 
1255  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1256  auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1257  typedef decltype (local_col_map) local_map_type;
1258 
1259  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1260  local_matrix_device_type,
1261  local_map_type,
1262  buffer_device_type
1263  > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1264  import_lids_d, combineMode);
1265 }
1266 
1313 //
1322 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1323 size_t
1325  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1326  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1327  const Teuchos::ArrayView<const char> &imports,
1328  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1329  size_t /* constantNumPackets */,
1330  CombineMode /* combineMode */,
1331  size_t numSameIDs,
1332  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1333  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1334 {
1335  using Kokkos::MemoryUnmanaged;
1336  using Kokkos::View;
1337  typedef typename Node::device_type DT;
1338  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1339 
1340  TEUCHOS_TEST_FOR_EXCEPTION
1341  (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1342  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1343  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1344  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1345  // process, then the matrix is neither locally nor globally indexed.
1346  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1347  TEUCHOS_TEST_FOR_EXCEPTION
1348  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1349  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1350  TEUCHOS_TEST_FOR_EXCEPTION
1351  (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1352  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1353  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1354 
1355  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1356 
1357  using kokkos_device_type = Kokkos::Device<typename Node::device_type::execution_space,
1358  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
1359 
1360  Kokkos::View<LocalOrdinal const *, kokkos_device_type, void, void > permute_from_lids_d =
1362  permuteFromLIDs.getRawPtr (),
1363  permuteFromLIDs.size (), true,
1364  "permute_from_lids");
1365 
1366  Kokkos::View<const char*, kokkos_device_type, void, void > imports_d =
1368  imports.getRawPtr (),
1369  imports.size (), true,
1370  "imports");
1371 
1372  Kokkos::View<const size_t*, kokkos_device_type, void, void > num_packets_per_lid_d =
1374  numPacketsPerLID.getRawPtr (),
1375  numPacketsPerLID.size (), true,
1376  "num_packets_per_lid");
1377 
1379  local_matrix, permute_from_lids_d, imports_d,
1380  num_packets_per_lid_d, numSameIDs);
1381 } //unpackAndCombineWithOwningPIDsCount (Teuchos::Array version)
1382 
1397 
1398 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1399 void
1402  const Kokkos::View<LocalOrdinal const *,
1403  Kokkos::Device<typename Node::device_type::execution_space,
1404  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1405  void, void > import_lids_d,
1406  const Kokkos::View<const char*,
1407  Kokkos::Device<typename Node::device_type::execution_space,
1408  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1409  void, void > imports_d,
1410  const Kokkos::View<const size_t*,
1411  Kokkos::Device<typename Node::device_type::execution_space,
1412  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1413  void, void > num_packets_per_lid_d,
1414  const size_t numSameIDs,
1415  const Kokkos::View<LocalOrdinal const *,
1416  Kokkos::Device<typename Node::device_type::execution_space,
1417  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1418  void, void > permute_to_lids_d,
1419  const Kokkos::View<LocalOrdinal const *,
1420  Kokkos::Device<typename Node::device_type::execution_space,
1421  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1422  void, void > permute_from_lids_d,
1423  size_t TargetNumRows,
1424  const int MyTargetPID,
1425  Kokkos::View<size_t*,typename Node::device_type> &crs_rowptr_d,
1426  Kokkos::View<GlobalOrdinal*,typename Node::device_type> &crs_colind_d,
1427  Kokkos::View<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*,typename Node::device_type>& crs_vals_d,
1428  const Teuchos::ArrayView<const int>& SourcePids,
1429  Kokkos::View<int*,typename Node::device_type> &TargetPids)
1430 {
1431  using execution_space = typename Node::execution_space;
1433 
1434  using Kokkos::View;
1435  using Kokkos::deep_copy;
1436 
1437  using Teuchos::ArrayView;
1438  using Teuchos::outArg;
1439  using Teuchos::REDUCE_MAX;
1440  using Teuchos::reduceAll;
1441 
1442  typedef typename Node::device_type DT;
1443 
1445  typedef typename matrix_type::impl_scalar_type ST;
1446 
1447  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1448 # ifdef HAVE_TPETRA_MMM_TIMINGS
1449  using Teuchos::TimeMonitor;
1450  Teuchos::RCP<TimeMonitor> tm;
1451 # endif
1452 
1453  using Kokkos::MemoryUnmanaged;
1454 
1455  TEUCHOS_TEST_FOR_EXCEPTION
1456  (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1457  prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != "
1458  "permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1459  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1460  // process, then the matrix is neither locally nor globally indexed.
1461  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1462  TEUCHOS_TEST_FOR_EXCEPTION
1463  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1464  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1465  TEUCHOS_TEST_FOR_EXCEPTION
1466  (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1467  prefix << "import_lids_d.size() = " << import_lids_d.size () << " != "
1468  "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << ".");
1469 
1470  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1471 
1472  // TargetNumNonzeros is number of nonzeros in local matrix.
1473 # ifdef HAVE_TPETRA_MMM_TIMINGS
1474  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1475 # endif
1476  size_t TargetNumNonzeros =
1478  local_matrix, permute_from_lids_d, imports_d,
1479  num_packets_per_lid_d, numSameIDs);
1480 # ifdef HAVE_TPETRA_MMM_TIMINGS
1481  tm = Teuchos::null;
1482 # endif
1483 
1484 # ifdef HAVE_TPETRA_MMM_TIMINGS
1485  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1486 # endif
1487  Kokkos::resize(crs_rowptr_d,TargetNumRows+1);
1488  Kokkos::resize(crs_colind_d,TargetNumNonzeros);
1489  Kokkos::resize(crs_vals_d,TargetNumNonzeros);
1490 # ifdef HAVE_TPETRA_MMM_TIMINGS
1491  tm = Teuchos::null;
1492 # endif
1493 
1494  TEUCHOS_TEST_FOR_EXCEPTION(
1495  permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1496  prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size ()
1497  << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << ".");
1498 
1499  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1500  Kokkos::resize(TargetPids,TargetNumNonzeros);
1501  }
1502  Kokkos::deep_copy(execution_space(), TargetPids, -1);
1503 
1504  // Grab pointers for sourceMatrix
1505  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1506 
1507 # ifdef HAVE_TPETRA_MMM_TIMINGS
1508  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1509 # endif
1510  // Convert input arrays to Kokkos::Views
1511  DT outputDevice;
1512 
1513  auto src_pids_d =
1514  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1515  SourcePids.size(), true, "src_pids");
1516 
1517 # ifdef HAVE_TPETRA_MMM_TIMINGS
1518  tm = Teuchos::null;
1519 # endif
1520 
1521  size_t bytes_per_value = 0;
1523  // assume that ST is default constructible
1524  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1525  }
1526  else {
1527  // Since the packed data come from the source matrix, we can use the source
1528  // matrix to get the number of bytes per Scalar value stored in the matrix.
1529  // This assumes that all Scalar values in the source matrix require the same
1530  // number of bytes. If the source matrix has no entries on the calling
1531  // process, then we hope that some process does have some idea how big
1532  // a Scalar value is. Of course, if no processes have any entries, then no
1533  // values should be packed (though this does assume that in our packing
1534  // scheme, rows with zero entries take zero bytes).
1535  size_t bytes_per_value_l = 0;
1536  if (local_matrix.values.extent(0) > 0) {
1537  const ST& val = local_matrix.values(0);
1538  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1539  } else {
1540  const ST& val = crs_vals_d(0);
1541  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1542  }
1543  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1544  Teuchos::REDUCE_MAX,
1545  bytes_per_value_l,
1546  outArg(bytes_per_value));
1547  }
1548 
1549 # ifdef HAVE_TPETRA_MMM_TIMINGS
1550  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1551 # endif
1553  local_matrix, local_col_map, import_lids_d, imports_d,
1554  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1555  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1556  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1557  bytes_per_value);
1558 # ifdef HAVE_TPETRA_MMM_TIMINGS
1559  tm = Teuchos::null;
1560 # endif
1561 
1562  // Copy outputs back to host
1563 # ifdef HAVE_TPETRA_MMM_TIMINGS
1564  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1565 # endif
1566 
1567  Kokkos::parallel_for("setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0,TargetPids.size()), KOKKOS_LAMBDA (const size_t i) {
1568  if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
1569  });
1570 
1571 } //unpackAndCombineIntoCrsArrays
1572 
1573 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1574 void
1577  const Kokkos::View<LocalOrdinal const *,
1578  Kokkos::Device<typename Node::device_type::execution_space,
1579  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1580  void, void > import_lids_d,
1581  const Kokkos::View<const char*,
1582  Kokkos::Device<typename Node::device_type::execution_space,
1583  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1584  void, void > imports_d,
1585  const Kokkos::View<const size_t*,
1586  Kokkos::Device<typename Node::device_type::execution_space,
1587  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1588  void, void > num_packets_per_lid_d,
1589  const size_t numSameIDs,
1590  const Kokkos::View<LocalOrdinal const *,
1591  Kokkos::Device<typename Node::device_type::execution_space,
1592  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1593  void, void > permute_to_lids_d,
1594  const Kokkos::View<LocalOrdinal const *,
1595  Kokkos::Device<typename Node::device_type::execution_space,
1596  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1597  void, void > permute_from_lids_d,
1598  size_t TargetNumRows,
1599  const int MyTargetPID,
1600  Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1601  Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1602  Teuchos::ArrayRCP<Scalar>& CRS_vals,
1603  const Teuchos::ArrayView<const int>& SourcePids,
1604  Teuchos::Array<int>& TargetPids)
1605 {
1606  using execution_space = typename Node::execution_space;
1608 
1609  using Kokkos::View;
1610  using Kokkos::deep_copy;
1611 
1612  using Teuchos::ArrayView;
1613  using Teuchos::outArg;
1614  using Teuchos::REDUCE_MAX;
1615  using Teuchos::reduceAll;
1616 
1617  typedef typename Node::device_type DT;
1618 
1620  typedef typename matrix_type::impl_scalar_type ST;
1621 
1622  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1623 # ifdef HAVE_TPETRA_MMM_TIMINGS
1624  using Teuchos::TimeMonitor;
1625  Teuchos::RCP<TimeMonitor> tm;
1626 # endif
1627 
1628  using Kokkos::MemoryUnmanaged;
1629 
1630  TEUCHOS_TEST_FOR_EXCEPTION
1631  (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1632  prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != "
1633  "permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1634  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1635  // process, then the matrix is neither locally nor globally indexed.
1636  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1637  TEUCHOS_TEST_FOR_EXCEPTION
1638  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1639  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1640  TEUCHOS_TEST_FOR_EXCEPTION
1641  (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1642  prefix << "import_lids_d.size() = " << import_lids_d.size () << " != "
1643  "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << ".");
1644 
1645  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1646 
1647  // TargetNumNonzeros is number of nonzeros in local matrix.
1648 # ifdef HAVE_TPETRA_MMM_TIMINGS
1649  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1650 # endif
1651  size_t TargetNumNonzeros =
1653  local_matrix, permute_from_lids_d, imports_d,
1654  num_packets_per_lid_d, numSameIDs);
1655 # ifdef HAVE_TPETRA_MMM_TIMINGS
1656  tm = Teuchos::null;
1657 # endif
1658 
1659 # ifdef HAVE_TPETRA_MMM_TIMINGS
1660  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1661 # endif
1662  CRS_rowptr.resize (TargetNumRows+1);
1663  CRS_colind.resize(TargetNumNonzeros);
1664  CRS_vals.resize(TargetNumNonzeros);
1665  Teuchos::ArrayRCP<ST> const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1666 # ifdef HAVE_TPETRA_MMM_TIMINGS
1667  tm = Teuchos::null;
1668 # endif
1669 
1670  TEUCHOS_TEST_FOR_EXCEPTION(
1671  permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1672  prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size ()
1673  << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << ".");
1674 
1675  // Preseed TargetPids with -1 for local
1676  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1677  TargetPids.resize (TargetNumNonzeros);
1678  }
1679  TargetPids.assign (TargetNumNonzeros, -1);
1680 
1681  // Grab pointers for sourceMatrix
1682  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1683 
1684 # ifdef HAVE_TPETRA_MMM_TIMINGS
1685  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1686 # endif
1687  // Convert input arrays to Kokkos::Views
1688  DT outputDevice;
1689 
1690  auto crs_rowptr_d =
1691  create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1692  CRS_rowptr.size(), true, "crs_rowptr");
1693 
1694  auto crs_colind_d =
1695  create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1696  CRS_colind.size(), true, "crs_colidx");
1697 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1698  static_assert (! std::is_same<
1699  typename std::remove_const<
1700  typename std::decay<
1701  decltype (CRS_vals_impl_scalar_type)
1702  >::type::value_type
1703  >::type,
1704  std::complex<double> >::value,
1705  "CRS_vals::value_type is std::complex<double>; this should never happen"
1706  ", since std::complex does not work in Kokkos::View objects.");
1707 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1708 
1709  auto crs_vals_d =
1710  create_mirror_view_from_raw_host_array(outputDevice, CRS_vals_impl_scalar_type.getRawPtr(),
1711  CRS_vals_impl_scalar_type.size(), true, "crs_vals");
1712 
1713 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1714  static_assert (! std::is_same<
1715  typename decltype (crs_vals_d)::non_const_value_type,
1716  std::complex<double> >::value,
1717  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1718  "never happen, since std::complex does not work in Kokkos::View objects.");
1719 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1720 
1721  auto src_pids_d =
1722  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1723  SourcePids.size(), true, "src_pids");
1724 
1725  auto tgt_pids_d =
1726  create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1727  TargetPids.size(), true, "tgt_pids");
1728 
1729 # ifdef HAVE_TPETRA_MMM_TIMINGS
1730  tm = Teuchos::null;
1731 # endif
1732 
1733  size_t bytes_per_value = 0;
1735  // assume that ST is default constructible
1736  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1737  }
1738  else {
1739  // Since the packed data come from the source matrix, we can use the source
1740  // matrix to get the number of bytes per Scalar value stored in the matrix.
1741  // This assumes that all Scalar values in the source matrix require the same
1742  // number of bytes. If the source matrix has no entries on the calling
1743  // process, then we hope that some process does have some idea how big
1744  // a Scalar value is. Of course, if no processes have any entries, then no
1745  // values should be packed (though this does assume that in our packing
1746  // scheme, rows with zero entries take zero bytes).
1747  size_t bytes_per_value_l = 0;
1748  if (local_matrix.values.extent(0) > 0) {
1749  const ST& val = local_matrix.values(0);
1750  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1751  } else {
1752  const ST& val = crs_vals_d(0);
1753  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1754  }
1755  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1756  Teuchos::REDUCE_MAX,
1757  bytes_per_value_l,
1758  outArg(bytes_per_value));
1759  }
1760 
1761 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1762  static_assert (! std::is_same<
1763  typename decltype (crs_vals_d)::non_const_value_type,
1764  std::complex<double> >::value,
1765  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1766  "never happen, since std::complex does not work in Kokkos::View objects.");
1767 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1768 
1769 # ifdef HAVE_TPETRA_MMM_TIMINGS
1770  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1771 # endif
1773  local_matrix, local_col_map, import_lids_d, imports_d,
1774  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1775  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1776  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1777  bytes_per_value);
1778 # ifdef HAVE_TPETRA_MMM_TIMINGS
1779  tm = Teuchos::null;
1780 # endif
1781 
1782  // Copy outputs back to host
1783 # ifdef HAVE_TPETRA_MMM_TIMINGS
1784  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1785 # endif
1786  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1787  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1788  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1789  deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1790 
1791  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1792  CRS_colind.getRawPtr(), CRS_colind.size());
1793  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1794  deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1795 
1796  typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1797  CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1798  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1799  deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1800 
1801  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1802  TargetPids.getRawPtr(), TargetPids.size());
1803  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1804  deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1805 
1806 } //unpackAndCombineIntoCrsArrays
1807 
1808 
1809 } // namespace Details
1810 } // namespace Tpetra
1811 
1812 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1813  template void \
1814  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1815  const CrsMatrix<ST, LO, GO, NT>&, \
1816  const Teuchos::ArrayView<const char>&, \
1817  const Teuchos::ArrayView<const size_t>&, \
1818  const Teuchos::ArrayView<const LO>&, \
1819  size_t, \
1820  CombineMode); \
1821  template size_t \
1822  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1823  const CrsMatrix<ST, LO, GO, NT> &, \
1824  const Teuchos::ArrayView<const LO> &, \
1825  const Teuchos::ArrayView<const char> &, \
1826  const Teuchos::ArrayView<const size_t>&, \
1827  size_t, \
1828  CombineMode, \
1829  size_t, \
1830  const Teuchos::ArrayView<const LO>&, \
1831  const Teuchos::ArrayView<const LO>&); \
1832  template void \
1833  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1834  const CrsMatrix<ST, LO, GO, NT>&, \
1835  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1836  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1837  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1838  const size_t, \
1839  const CombineMode); \
1840  template void \
1841  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1842  const CrsMatrix<ST, LO, GO, NT> &, \
1843  const Kokkos::View<LO const *, \
1844  Kokkos::Device<typename NT::device_type::execution_space, \
1845  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1846  void, void >, \
1847  const Kokkos::View<const char*, \
1848  Kokkos::Device<typename NT::device_type::execution_space, \
1849  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1850  void, void >, \
1851  const Kokkos::View<const size_t*, \
1852  Kokkos::Device<typename NT::device_type::execution_space, \
1853  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1854  void, void >, \
1855  const size_t, \
1856  const Kokkos::View<LO const *, \
1857  Kokkos::Device<typename NT::device_type::execution_space, \
1858  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1859  void, void >, \
1860  const Kokkos::View<LO const *, \
1861  Kokkos::Device<typename NT::device_type::execution_space, \
1862  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1863  void, void >, \
1864  size_t, \
1865  const int, \
1866  Kokkos::View<size_t*,typename NT::device_type>&, \
1867  Kokkos::View<GO*,typename NT::device_type>&, \
1868  Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
1869  const Teuchos::ArrayView<const int>&, \
1870  Kokkos::View<int*,typename NT::device_type>&); \
1871  template void \
1872  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1873  const CrsMatrix<ST, LO, GO, NT> &, \
1874  const Kokkos::View<LO const *, \
1875  Kokkos::Device<typename NT::device_type::execution_space, \
1876  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1877  void, void >, \
1878  const Kokkos::View<const char*, \
1879  Kokkos::Device<typename NT::device_type::execution_space, \
1880  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1881  void, void >, \
1882  const Kokkos::View<const size_t*, \
1883  Kokkos::Device<typename NT::device_type::execution_space, \
1884  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1885  void, void >, \
1886  const size_t, \
1887  const Kokkos::View<LO const *, \
1888  Kokkos::Device<typename NT::device_type::execution_space, \
1889  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1890  void, void >, \
1891  const Kokkos::View<LO const *, \
1892  Kokkos::Device<typename NT::device_type::execution_space, \
1893  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1894  void, void >, \
1895  size_t, \
1896  const int, \
1897  Teuchos::ArrayRCP<size_t>&, \
1898  Teuchos::ArrayRCP<GO>&, \
1899  Teuchos::ArrayRCP<ST>&, \
1900  const Teuchos::ArrayView<const int>&, \
1901  Teuchos::Array<int>&);
1902 
1903 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for &quot;invalid&quot; (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
&quot;Local&quot; part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don&#39;t currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...