Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
43 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
44 
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
55 #include "Kokkos_Core.hpp"
56 #include <memory>
57 #include <string>
58 
79 
80 namespace Tpetra {
81 
82 #ifndef DOXYGEN_SHOULD_SKIP_THIS
83 // Forward declaration of Distributor
84 class Distributor;
85 #endif // DOXYGEN_SHOULD_SKIP_THIS
86 
87 //
88 // Users must never rely on anything in the Details namespace.
89 //
90 namespace Details {
91 
92 namespace UnpackAndCombineCrsMatrixImpl {
93 
106 template<class ST, class LO, class GO, class DT, class BDT>
107 KOKKOS_FUNCTION int
108 unpackRow(typename PackTraits<GO, DT>::output_array_type& gids_out,
109  typename PackTraits<int, DT>::output_array_type& pids_out,
110  typename PackTraits<ST, DT>::output_array_type& vals_out,
111  const Kokkos::View<const char*, BDT>& imports,
112  const size_t offset,
113  const size_t /* num_bytes */,
114  const size_t num_ent,
115  const size_t num_bytes_per_value)
116 {
117  if (num_ent == 0) {
118  // Empty rows always take zero bytes, to ensure sparsity.
119  return 0;
120  }
121  bool unpack_pids = pids_out.size() > 0;
122 
123  const size_t num_ent_beg = offset;
124  const size_t num_ent_len = PackTraits<LO, BDT>::packValueCount (LO (0));
125 
126  const size_t gids_beg = num_ent_beg + num_ent_len;
127  const size_t gids_len =
128  num_ent * PackTraits<GO, BDT>::packValueCount (GO (0));
129 
130  const size_t pids_beg = gids_beg + gids_len;
131  const size_t pids_len = unpack_pids ?
132  size_t (num_ent * PackTraits<int, BDT>::packValueCount (int (0))) :
133  size_t (0);
134 
135  const size_t vals_beg = gids_beg + gids_len + pids_len;
136  const size_t vals_len = num_ent * num_bytes_per_value;
137 
138  const char* const num_ent_in = imports.data () + num_ent_beg;
139  const char* const gids_in = imports.data () + gids_beg;
140  const char* const pids_in = unpack_pids ? imports.data () + pids_beg : NULL;
141  const char* const vals_in = imports.data () + vals_beg;
142 
143  size_t num_bytes_out = 0;
144  LO num_ent_out;
145  num_bytes_out += PackTraits<LO, BDT>::unpackValue (num_ent_out, num_ent_in);
146  if (static_cast<size_t> (num_ent_out) != num_ent) {
147  return 20; // error code
148  }
149 
150  {
151  Kokkos::pair<int, size_t> p;
152  p = PackTraits<GO, BDT>::unpackArray (gids_out.data (), gids_in, num_ent);
153  if (p.first != 0) {
154  return 21; // error code
155  }
156  num_bytes_out += p.second;
157 
158  if (unpack_pids) {
159  p = PackTraits<int, BDT>::unpackArray (pids_out.data (), pids_in, num_ent);
160  if (p.first != 0) {
161  return 22; // error code
162  }
163  num_bytes_out += p.second;
164  }
165 
166  p = PackTraits<ST, BDT>::unpackArray (vals_out.data (), vals_in, num_ent);
167  if (p.first != 0) {
168  return 23; // error code
169  }
170  num_bytes_out += p.second;
171  }
172 
173  const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
174  if (num_bytes_out != expected_num_bytes) {
175  return 24; // error code
176  }
177  return 0; // no errors
178 }
179 
190 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
192  typedef LocalMatrix local_matrix_type;
193  typedef LocalMap local_map_type;
194 
195  typedef typename local_matrix_type::value_type ST;
196  typedef typename local_map_type::local_ordinal_type LO;
197  typedef typename local_map_type::global_ordinal_type GO;
198  typedef typename local_map_type::device_type DT;
199  typedef typename DT::execution_space XS;
200 
201  typedef Kokkos::View<const size_t*, BufferDeviceType>
202  num_packets_per_lid_type;
203  typedef Kokkos::View<const size_t*, DT> offsets_type;
204  typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
205  typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
206 
207  typedef Kokkos::View<LO*, DT> lids_scratch_type;
208  typedef Kokkos::View<GO*, DT> gids_scratch_type;
209  typedef Kokkos::View<int*,DT> pids_scratch_type;
210  typedef Kokkos::View<ST*, DT> vals_scratch_type;
211 
212  typedef Kokkos::pair<int, LO> value_type;
213 
214  static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
215  "LocalMap::local_ordinal_type and "
216  "LocalMatrix::ordinal_type must be the same.");
217 
218  local_matrix_type local_matrix;
219  local_map_type local_col_map;
220  input_buffer_type imports;
221  num_packets_per_lid_type num_packets_per_lid;
222  import_lids_type import_lids;
223  offsets_type offsets;
224  Tpetra::CombineMode combine_mode;
225  size_t max_num_ent;
226  bool unpack_pids;
227  size_t num_bytes_per_value;
228  bool atomic;
229  Kokkos::Experimental::UniqueToken<XS, Kokkos::Experimental::UniqueTokenScope::Global> tokens;
230  lids_scratch_type lids_scratch;
231  gids_scratch_type gids_scratch;
232  pids_scratch_type pids_scratch;
233  vals_scratch_type vals_scratch;
234 
236  const local_matrix_type& local_matrix_in,
237  const local_map_type& local_col_map_in,
238  const input_buffer_type& imports_in,
239  const num_packets_per_lid_type& num_packets_per_lid_in,
240  const import_lids_type& import_lids_in,
241  const offsets_type& offsets_in,
242  const Tpetra::CombineMode combine_mode_in,
243  const size_t max_num_ent_in,
244  const bool unpack_pids_in,
245  const size_t num_bytes_per_value_in,
246  const bool atomic_in) :
247  local_matrix (local_matrix_in),
248  local_col_map (local_col_map_in),
249  imports (imports_in),
250  num_packets_per_lid (num_packets_per_lid_in),
251  import_lids (import_lids_in),
252  offsets (offsets_in),
253  combine_mode (combine_mode_in),
254  max_num_ent (max_num_ent_in),
255  unpack_pids (unpack_pids_in),
256  num_bytes_per_value (num_bytes_per_value_in),
257  atomic (atomic_in),
258  tokens (XS()),
259  lids_scratch (Kokkos::view_alloc("lids_scratch", Kokkos::WithoutInitializing), tokens.size() * max_num_ent),
260  gids_scratch (Kokkos::view_alloc("gids_scratch", Kokkos::WithoutInitializing), tokens.size() * max_num_ent),
261  pids_scratch (Kokkos::view_alloc("pids_scratch", Kokkos::WithoutInitializing), tokens.size() * max_num_ent),
262  vals_scratch (Kokkos::view_alloc("vals_scratch", Kokkos::WithoutInitializing), tokens.size() * max_num_ent)
263  {}
264 
265  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
266  {
267  using Tpetra::Details::OrdinalTraits;
268  dst = Kokkos::make_pair (0, OrdinalTraits<LO>::invalid ());
269  }
270 
271  KOKKOS_INLINE_FUNCTION void
272  join (volatile value_type& dst, const volatile value_type& src) const
273  {
274  // `dst` should reflect the first (least) bad index and
275  // all other associated error codes and data. Thus, we need only
276  // check if the `src` object shows an error and if its associated
277  // bad index is less than `dst`'s bad index.
278  using Tpetra::Details::OrdinalTraits;
279  if (src.second != OrdinalTraits<LO>::invalid ()) {
280  // An error in the src; check if
281  // 1. `dst` shows errors
282  // 2. If `dst` does show errors, if src's bad index is less than
283  // *this' bad index
284  if (dst.second == OrdinalTraits<LO>::invalid () ||
285  src.second < dst.second) {
286  dst = src;
287  }
288  }
289  }
290 
291  KOKKOS_INLINE_FUNCTION
292  void operator()(const LO i, value_type& dst) const
293  {
294  using Kokkos::View;
295  using Kokkos::subview;
296  using Kokkos::MemoryUnmanaged;
297  typedef typename XS::size_type size_type;
298  typedef typename Kokkos::pair<size_type, size_type> slice;
299  typedef BufferDeviceType BDT;
300 
301  typedef View<LO*, DT, MemoryUnmanaged> lids_out_type;
302  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
303  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
304  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
305 
306  const size_t num_bytes = num_packets_per_lid(i);
307 
308  // Only unpack data if there is a nonzero number of bytes.
309  if (num_bytes == 0) {
310  return;
311  }
312 
313  // there is actually something in the row
314  const LO import_lid = import_lids[i];
315  const size_t buf_size = imports.size();
316  const size_t offset = offsets(i);
317 
318  // Get the number of entries to expect in the received data for this row.
319  LO num_ent_LO = 0;
320  const char* const in_buf = imports.data () + offset;
321  (void) PackTraits<LO, BDT>::unpackValue (num_ent_LO, in_buf);
322  const size_t num_ent = static_cast<size_t> (num_ent_LO);
323 
324  // Count the number of bytes expected to unpack
325  size_t expected_num_bytes = 0;
326  {
327  expected_num_bytes += PackTraits<LO, BDT>::packValueCount (LO (0));
328  expected_num_bytes += num_ent * PackTraits<GO, BDT>::packValueCount (GO (0));
329  if (unpack_pids) {
330  expected_num_bytes += num_ent * PackTraits<int, BDT>::packValueCount (int (0));
331  }
332  expected_num_bytes += num_ent * PackTraits<ST, BDT>::packValueCount (ST ());
333  }
334 
335  if (expected_num_bytes > num_bytes) {
336  dst = Kokkos::make_pair (1, i); // wrong number of bytes
337  return;
338  }
339 
340  if (offset > buf_size || offset + num_bytes > buf_size) {
341  dst = Kokkos::make_pair (2, i); // out of bounds
342  return;
343  }
344 
345  // Get subviews in to the scratch arrays. The token returned from acquire
346  // is an integer in [0, tokens.size()). It is used to grab a unique (to
347  // this thread) subview of the scratch arrays.
348  const size_type token = tokens.acquire();
349  const size_t a = static_cast<size_t>(token) * max_num_ent;
350  const size_t b = a + num_ent;
351  lids_out_type lids_out = subview(lids_scratch, slice(a, b));
352  gids_out_type gids_out = subview(gids_scratch, slice(a, b));
353  pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
354  vals_out_type vals_out = subview(vals_scratch, slice(a, b));
355 
356  // Unpack this row!
357  int unpack_err =
358  unpackRow<ST,LO,GO,DT,BDT>(gids_out, pids_out, vals_out,
359  imports, offset, num_bytes,
360  num_ent, num_bytes_per_value);
361  if (unpack_err != 0) {
362  dst = Kokkos::make_pair (unpack_err, i); // unpack error
363  tokens.release (token);
364  return;
365  }
366 
367  // Column indices come in as global indices, in case the
368  // source object's column Map differs from the target object's
369  // (this's) column Map, and must be converted local index values
370  for (size_t k = 0; k < num_ent; ++k) {
371  lids_out(k) = local_col_map.getLocalElement (gids_out(k));
372  }
373 
374  // Combine the values according to the combine_mode
375  const LO* const lids_raw = const_cast<const LO*> (lids_out.data ());
376  const ST* const vals_raw = const_cast<const ST*> (vals_out.data ());
377  LO num_modified = 0;
378  if (combine_mode == ADD) {
379  num_modified +=
380  local_matrix.sumIntoValues (import_lid, lids_raw, num_ent,
381  vals_raw, false, atomic);
382  }
383  else if (combine_mode == REPLACE) {
384  num_modified +=
385  local_matrix.replaceValues (import_lid, lids_raw, num_ent,
386  vals_raw, false, atomic);
387  }
388  else {
389  dst = Kokkos::make_pair (4, i); // invalid combine mode
390  tokens.release (token);
391  return;
392  }
393 
394  tokens.release (token);
395  }
396 };
397 
398 struct MaxNumEntTag {};
399 struct TotNumEntTag {};
400 
409 template<class LO, class DT, class BDT>
411 public:
412  typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
413  typedef Kokkos::View<const size_t*, DT> offsets_type;
414  typedef Kokkos::View<const char*, BDT> input_buffer_type;
415  // This needs to be public, since it appears in the argument list of
416  // public methods (see below). Otherwise, build errors may happen.
417  typedef size_t value_type;
418 
419 private:
420  typedef Kokkos::pair<size_t,size_t> slice;
421 
422  num_packets_per_lid_type num_packets_per_lid;
423  offsets_type offsets;
424  input_buffer_type imports;
425 
426 public:
427  NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
428  const offsets_type& offsets_in,
429  const input_buffer_type& imports_in) :
430  num_packets_per_lid (num_packets_per_lid_in),
431  offsets (offsets_in),
432  imports (imports_in)
433  {}
434 
435  KOKKOS_INLINE_FUNCTION void
436  operator() (const MaxNumEntTag, const LO i, value_type& update) const {
437  // Get how many entries to expect in the received data for this row.
438  const size_t num_bytes = num_packets_per_lid(i);
439  if (num_bytes > 0) {
440  LO num_ent_LO = 0; // output argument of unpackValue
441  const char* const in_buf = imports.data () + offsets(i);
442  (void) PackTraits<LO, BDT>::unpackValue (num_ent_LO, in_buf);
443  const size_t num_ent = static_cast<size_t> (num_ent_LO);
444 
445  update = (update < num_ent) ? num_ent : update;
446  }
447  }
448 
449  KOKKOS_INLINE_FUNCTION void
450  join (const MaxNumEntTag,
451  volatile value_type& dst,
452  const volatile value_type& src) const
453  {
454  if (dst < src) dst = src;
455  }
456 
457  KOKKOS_INLINE_FUNCTION void
458  operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
459  // Get how many entries to expect in the received data for this row.
460  const size_t num_bytes = num_packets_per_lid(i);
461  if (num_bytes > 0) {
462  LO num_ent_LO = 0; // output argument of unpackValue
463  const char* const in_buf = imports.data () + offsets(i);
464  (void) PackTraits<LO, BDT>::unpackValue (num_ent_LO, in_buf);
465  tot_num_ent += static_cast<size_t> (num_ent_LO);
466  }
467  }
468 };
469 
477 template<class LO, class DT, class BDT>
478 size_t
479 compute_maximum_num_entries (
480  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
481  const Kokkos::View<const size_t*, DT>& offsets,
482  const Kokkos::View<const char*, BDT>& imports)
483 {
484  typedef typename DT::execution_space XS;
485  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
486  MaxNumEntTag> range_policy;
487 
488  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
489  imports);
490  const LO numRowsToUnpack =
491  static_cast<LO> (num_packets_per_lid.extent (0));
492  size_t max_num_ent = 0;
493  Kokkos::parallel_reduce ("Max num entries in CRS",
494  range_policy (0, numRowsToUnpack),
495  functor, max_num_ent);
496  return max_num_ent;
497 }
498 
506 template<class LO, class DT, class BDT>
507 size_t
508 compute_total_num_entries (
509  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
510  const Kokkos::View<const size_t*, DT>& offsets,
511  const Kokkos::View<const char*, BDT>& imports)
512 {
513  typedef typename DT::execution_space XS;
514  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
515  size_t tot_num_ent = 0;
516  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
517  imports);
518  const LO numRowsToUnpack =
519  static_cast<LO> (num_packets_per_lid.extent (0));
520  Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
521  range_policy (0, numRowsToUnpack),
522  functor, tot_num_ent);
523  return tot_num_ent;
524 }
525 
533 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
534 void
535 unpackAndCombineIntoCrsMatrix(
536  const LocalMatrix& local_matrix,
537  const LocalMap& local_map,
538  const Kokkos::View<const char*, BufferDeviceType>& imports,
539  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
540  const typename PackTraits<typename LocalMap::local_ordinal_type, BufferDeviceType>::input_array_type import_lids,
541  const Tpetra::CombineMode combine_mode,
542  const bool unpack_pids,
543  const bool atomic)
544 {
545  typedef typename LocalMatrix::value_type ST;
546  typedef typename LocalMap::local_ordinal_type LO;
547  typedef typename LocalMap::device_type DT;
548  typedef typename DT::execution_space XS;
549  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
550  typedef UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType> unpack_functor_type;
551 
552  const char prefix[] =
553  "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix: ";
554 
555  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
556  if (num_import_lids == 0) {
557  // Nothing to unpack
558  return;
559  }
560 
561  {
562  // Check for correct input
563  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
564  std::invalid_argument,
565  prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
566  "static graph (i.e., was constructed with the CrsMatrix constructor "
567  "that takes a const CrsGraph pointer).");
568 
569  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
570  std::invalid_argument,
571  prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
572  "(i.e., was constructed with the CrsMatrix constructor that takes a "
573  "const CrsGraph pointer).");
574 
575  // Unknown combine mode!
576  TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
577  std::invalid_argument,
578  prefix << "Invalid combine mode; should never get "
579  "here! Please report this bug to the Tpetra developers.");
580 
581  // Check that sizes of input objects are consistent.
582  bool bad_num_import_lids =
583  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
584  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
585  std::invalid_argument,
586  prefix << "importLIDs.size() (" << num_import_lids << ") != "
587  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
588  } // end QA error checking
589 
590  // Get the offsets
591  Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
592  computeOffsetsFromCounts(offsets, num_packets_per_lid);
593 
594  // Determine the maximum number of entries in any row in the matrix. The
595  // maximum number of entries is needed to allocate unpack buffers on the
596  // device.
597  size_t max_num_ent = compute_maximum_num_entries<LO,DT>(
598  num_packets_per_lid, offsets, imports);
599 
600  // FIXME (TJF SEP 2017)
601  // The scalar type is not necessarily default constructible
602  size_t num_bytes_per_value = PackTraits<ST, DT>::packValueCount(ST());
603 
604  // Now do the actual unpack!
605  unpack_functor_type f(local_matrix, local_map,
606  imports, num_packets_per_lid, import_lids, offsets, combine_mode,
607  max_num_ent, unpack_pids, num_bytes_per_value, atomic);
608 
609  typename unpack_functor_type::value_type x;
610  Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
611  auto x_h = x.to_std_pair();
612  TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
613  prefix << "UnpackCrsMatrixAndCombineFunctor reported error code "
614  << x_h.first << " for the first bad row " << x_h.second);
615 
616  return;
617 }
618 
619 template<class LocalMatrix, class BufferDeviceType>
620 size_t
622  const LocalMatrix& local_matrix,
623  const typename PackTraits<typename LocalMatrix::ordinal_type, typename LocalMatrix::device_type>::input_array_type permute_from_lids,
624  const Kokkos::View<const char*, BufferDeviceType>& imports,
625  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
626  const size_t num_same_ids)
627 {
628  using Kokkos::parallel_reduce;
629  typedef typename LocalMatrix::ordinal_type LO;
630  typedef typename LocalMatrix::device_type device_type;
631  typedef typename device_type::execution_space XS;
632  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
633  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
634  typedef BufferDeviceType BDT;
635 
636  size_t count = 0;
637  LO num_items;
638 
639  // Number of matrix entries to unpack (returned by this function).
640  num_items = static_cast<LO>(num_same_ids);
641  if (num_items) {
642  size_t kcnt = 0;
643  parallel_reduce(range_policy(0, num_items),
644  KOKKOS_LAMBDA(const LO lid, size_t& update) {
645  update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
646  -local_matrix.graph.row_map[lid]);
647  }, kcnt);
648  count += kcnt;
649  }
650 
651  // Count entries copied directly from the source matrix with permuting.
652  num_items = static_cast<LO>(permute_from_lids.extent(0));
653  if (num_items) {
654  size_t kcnt = 0;
655  parallel_reduce(range_policy(0, num_items),
656  KOKKOS_LAMBDA(const LO i, size_t& update) {
657  const LO lid = permute_from_lids(i);
658  update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
659  - local_matrix.graph.row_map[lid]);
660  }, kcnt);
661  count += kcnt;
662  }
663 
664  {
665  // Count entries received from other MPI processes.
666  const size_type np = num_packets_per_lid.extent(0);
667  Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
668  computeOffsetsFromCounts(offsets, num_packets_per_lid);
669  count +=
670  compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
671  offsets, imports);
672  }
673 
674  return count;
675 }
676 
677 template<class LO, class DT, class BDT>
678 KOKKOS_INLINE_FUNCTION
679 size_t
680 unpackRowCount(const Kokkos::View<const char*, BDT>& imports,
681  const size_t offset,
682  const size_t num_bytes)
683 {
684  LO num_ent_LO = 0;
685  if (num_bytes > 0) {
686  const size_t p_num_bytes = PackTraits<LO, DT>::packValueCount(num_ent_LO);
687  if (p_num_bytes > num_bytes) {
688  return OrdinalTraits<size_t>::invalid();
689  }
690  const char* const in_buf = imports.data () + offset;
691  (void) PackTraits<LO,DT>::unpackValue(num_ent_LO, in_buf);
692  }
693  return static_cast<size_t>(num_ent_LO);
694 }
695 
697 template<class LO, class DT, class BDT>
698 int
699 setupRowPointersForRemotes(
700  const typename PackTraits<size_t, DT>::output_array_type& tgt_rowptr,
701  const typename PackTraits<LO, DT>::input_array_type& import_lids,
702  const Kokkos::View<const char*, BDT>& imports,
703  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
704  const typename PackTraits<size_t, DT>::input_array_type& offsets)
705 {
706  using Kokkos::parallel_reduce;
707  typedef typename DT::execution_space XS;
708  typedef typename PackTraits<size_t,DT>::input_array_type::size_type size_type;
709  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
710 
711  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
712  const size_type N = num_packets_per_lid.extent(0);
713 
714  int errors = 0;
715  parallel_reduce ("Setup row pointers for remotes",
716  range_policy (0, N),
717  KOKKOS_LAMBDA (const size_t i, int& k_error) {
718  typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
719  const size_t num_bytes = num_packets_per_lid(i);
720  const size_t offset = offsets(i);
721  const size_t num_ent = unpackRowCount<LO, DT, BDT> (imports, offset, num_bytes);
722  if (num_ent == InvalidNum) {
723  k_error += 1;
724  }
725  Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
726  }, errors);
727  return errors;
728 }
729 
730 // Convert array of row lengths to a CRS pointer array
731 template<class DT>
732 void
733 makeCrsRowPtrFromLengths(
734  const typename PackTraits<size_t,DT>::output_array_type& tgt_rowptr,
735  const Kokkos::View<size_t*,DT>& new_start_row)
736 {
737  using Kokkos::parallel_scan;
738  typedef typename DT::execution_space XS;
739  typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
740  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
741  const size_type N = new_start_row.extent(0);
742  parallel_scan(range_policy(0, N),
743  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
744  auto cur_val = tgt_rowptr(i);
745  if (final) {
746  tgt_rowptr(i) = update;
747  new_start_row(i) = tgt_rowptr(i);
748  }
749  update += cur_val;
750  }
751  );
752 }
753 
754 template<class LocalMatrix, class LocalMap>
755 void
756 copyDataFromSameIDs(
757  const typename PackTraits<typename LocalMap::global_ordinal_type, typename LocalMap::device_type>::output_array_type& tgt_colind,
758  const typename PackTraits<int, typename LocalMap::device_type>::output_array_type& tgt_pids,
759  const typename PackTraits<typename LocalMatrix::value_type, typename LocalMap::device_type>::output_array_type& tgt_vals,
760  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
761  const typename PackTraits<size_t, typename LocalMap::device_type>::output_array_type& tgt_rowptr,
762  const typename PackTraits<int, typename LocalMap::device_type>::input_array_type& src_pids,
763  const LocalMatrix& local_matrix,
764  const LocalMap& local_col_map,
765  const size_t num_same_ids,
766  const int my_pid)
767 {
768  using Kokkos::parallel_for;
769  typedef typename LocalMap::device_type DT;
770  typedef typename LocalMap::local_ordinal_type LO;
771  typedef typename DT::execution_space XS;
772  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
773 
774  parallel_for(range_policy(0, num_same_ids),
775  KOKKOS_LAMBDA(const size_t i) {
776  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
777 
778  const LO src_lid = static_cast<LO>(i);
779  size_t src_row = local_matrix.graph.row_map(src_lid);
780 
781  const LO tgt_lid = static_cast<LO>(i);
782  const size_t tgt_row = tgt_rowptr(tgt_lid);
783 
784  const size_t nsr = local_matrix.graph.row_map(src_lid+1)
785  - local_matrix.graph.row_map(src_lid);
786  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
787 
788  for (size_t j=local_matrix.graph.row_map(src_lid);
789  j<local_matrix.graph.row_map(src_lid+1); ++j) {
790  LO src_col = local_matrix.graph.entries(j);
791  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
792  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
793  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
794  }
795  }
796  );
797 }
798 
799 template<class LocalMatrix, class LocalMap>
800 void
801 copyDataFromPermuteIDs(
802  const typename PackTraits<typename LocalMap::global_ordinal_type, typename LocalMap::device_type>::output_array_type& tgt_colind,
803  const typename PackTraits<int, typename LocalMap::device_type>::output_array_type& tgt_pids,
804  const typename PackTraits<typename LocalMatrix::value_type, typename LocalMap::device_type>::output_array_type& tgt_vals,
805  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
806  const typename PackTraits<size_t, typename LocalMap::device_type>::output_array_type& tgt_rowptr,
807  const typename PackTraits<int, typename LocalMap::device_type>::input_array_type& src_pids,
808  const typename PackTraits<typename LocalMap::local_ordinal_type, typename LocalMap::device_type>::input_array_type& permute_to_lids,
809  const typename PackTraits<typename LocalMap::local_ordinal_type, typename LocalMap::device_type>::input_array_type& permute_from_lids,
810  const LocalMatrix& local_matrix,
811  const LocalMap& local_col_map,
812  const int my_pid)
813 {
814  using Kokkos::parallel_for;
815  typedef typename LocalMap::device_type DT;
816  typedef typename LocalMap::local_ordinal_type LO;
817  typedef typename DT::execution_space XS;
818  typedef typename PackTraits<LO,DT>::input_array_type::size_type size_type;
819  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
820 
821  const size_type num_permute_to_lids = permute_to_lids.extent(0);
822 
823  parallel_for(range_policy(0, num_permute_to_lids),
824  KOKKOS_LAMBDA(const size_t i) {
825  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
826 
827  const LO src_lid = permute_from_lids(i);
828  const size_t src_row = local_matrix.graph.row_map(src_lid);
829 
830  const LO tgt_lid = permute_to_lids(i);
831  const size_t tgt_row = tgt_rowptr(tgt_lid);
832 
833  size_t nsr = local_matrix.graph.row_map(src_lid+1)
834  - local_matrix.graph.row_map(src_lid);
835  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
836 
837  for (size_t j=local_matrix.graph.row_map(src_lid);
838  j<local_matrix.graph.row_map(src_lid+1); ++j) {
839  LO src_col = local_matrix.graph.entries(j);
840  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
841  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
842  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
843  }
844  }
845  );
846 }
847 
848 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
849 int
850 unpackAndCombineIntoCrsArrays2(
851  const typename PackTraits<typename LocalMap::global_ordinal_type, typename LocalMap::device_type>::output_array_type& tgt_colind,
852  const typename PackTraits<int, typename LocalMap::device_type>::output_array_type& tgt_pids,
853  const typename PackTraits<typename LocalMatrix::value_type, typename LocalMap::device_type>::output_array_type& tgt_vals,
854  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
855  const typename PackTraits<size_t, typename LocalMap::device_type>::input_array_type& offsets,
856  const typename PackTraits<typename LocalMap::local_ordinal_type, typename LocalMap::device_type>::input_array_type& import_lids,
857  const Kokkos::View<const char*, BufferDeviceType>& imports,
858  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
859  const LocalMatrix& /* local_matrix */,
860  const LocalMap /*& local_col_map*/,
861  const int my_pid,
862  const size_t num_bytes_per_value)
863 {
864  using Kokkos::View;
865  using Kokkos::subview;
866  using Kokkos::MemoryUnmanaged;
867  using Kokkos::parallel_reduce;
868  using Kokkos::atomic_fetch_add;
870  typedef typename LocalMap::device_type DT;
871  typedef typename LocalMap::local_ordinal_type LO;
872  typedef typename LocalMap::global_ordinal_type GO;
873  typedef typename LocalMatrix::value_type ST;
874  typedef typename DT::execution_space XS;
875  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
876  typedef typename Kokkos::pair<size_type, size_type> slice;
877  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
878  typedef BufferDeviceType BDT;
879 
880  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
881  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
882  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
883 
884  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
885 
886  int errors = 0;
887  const size_type num_import_lids = import_lids.size();
888 
889  // RemoteIDs: Loop structure following UnpackAndCombine
890  parallel_reduce ("Unpack and combine into CRS",
891  range_policy (0, num_import_lids),
892  KOKKOS_LAMBDA (const size_t i, int& k_error) {
893  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
894  const size_t num_bytes = num_packets_per_lid(i);
895  const size_t offset = offsets(i);
896  if (num_bytes == 0) {
897  // Empty buffer means that the row is empty.
898  return;
899  }
900  size_t num_ent = unpackRowCount<LO,DT,BDT>(imports, offset, num_bytes);
901  if (num_ent == InvalidNum) {
902  k_error += 1;
903  return;
904  }
905  const LO lcl_row = import_lids(i);
906  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
907  const size_t end_row = start_row + num_ent;
908 
909  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
910  vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
911  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
912 
913  k_error += unpackRow<ST,LO,GO,DT,BDT>(gids_out, pids_out, vals_out,
914  imports, offset, num_bytes,
915  num_ent, num_bytes_per_value);
916 
917  // Correct target PIDs.
918  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
919  const int pid = pids_out(j);
920  pids_out(j) = (pid != my_pid) ? pid : -1;
921  }
922  }, errors);
923 
924  return errors;
925 }
926 
927 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
928 void
930  const LocalMatrix & local_matrix,
931  const LocalMap & local_col_map,
932  const typename PackTraits<typename LocalMap::local_ordinal_type, typename LocalMap::device_type>::input_array_type& import_lids,
933  const Kokkos::View<const char*, BufferDeviceType>& imports,
934  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
935  const typename PackTraits<typename LocalMap::local_ordinal_type, typename LocalMap::device_type>::input_array_type& permute_to_lids,
936  const typename PackTraits<typename LocalMap::local_ordinal_type, typename LocalMap::device_type>::input_array_type& permute_from_lids,
937  const typename PackTraits<size_t, typename LocalMap::device_type>::output_array_type& tgt_rowptr,
938  const typename PackTraits<typename LocalMap::global_ordinal_type, typename LocalMap::device_type>::output_array_type& tgt_colind,
939  const typename PackTraits<typename LocalMatrix::value_type, typename LocalMap::device_type>::output_array_type& tgt_vals,
940  const typename PackTraits<int, typename LocalMap::device_type>::input_array_type& src_pids,
941  const typename PackTraits<int, typename LocalMap::device_type>::output_array_type& tgt_pids,
942  const size_t num_same_ids,
943  const size_t tgt_num_rows,
944  const size_t tgt_num_nonzeros,
945  const int my_tgt_pid,
946  const size_t num_bytes_per_value)
947 {
948  using Kokkos::View;
949  using Kokkos::subview;
950  using Kokkos::parallel_for;
951  using Kokkos::MemoryUnmanaged;
953  typedef typename LocalMap::device_type DT;
954  typedef typename LocalMap::local_ordinal_type LO;
955  typedef typename DT::execution_space XS;
956  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
957  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
958  typedef BufferDeviceType BDT;
959 
960  const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
961 
962  const size_t N = tgt_num_rows;
963  const size_t mynnz = tgt_num_nonzeros;
964 
965  // In the case of reduced communicators, the sourceMatrix won't have
966  // the right "my_pid", so thus we have to supply it.
967  const int my_pid = my_tgt_pid;
968 
969  // Zero the rowptr
970  parallel_for(range_policy(0, N+1),
971  KOKKOS_LAMBDA(const size_t i) {
972  tgt_rowptr(i) = 0;
973  }
974  );
975 
976  // same IDs: Always first, always in the same place
977  parallel_for(range_policy(0, num_same_ids),
978  KOKKOS_LAMBDA(const size_t i) {
979  const LO tgt_lid = static_cast<LO>(i);
980  const LO src_lid = static_cast<LO>(i);
981  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
982  - local_matrix.graph.row_map(src_lid);
983  }
984  );
985 
986  // Permute IDs: Still local, but reordered
987  const size_type num_permute_to_lids = permute_to_lids.extent(0);
988  parallel_for(range_policy(0, num_permute_to_lids),
989  KOKKOS_LAMBDA(const size_t i) {
990  const LO tgt_lid = permute_to_lids(i);
991  const LO src_lid = permute_from_lids(i);
992  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
993  - local_matrix.graph.row_map(src_lid);
994  }
995  );
996 
997  // Get the offsets from the number of packets per LID
998  const size_type num_import_lids = import_lids.extent(0);
999  View<size_t*, DT> offsets("offsets", num_import_lids+1);
1000  computeOffsetsFromCounts(offsets, num_packets_per_lid);
1001 
1002 #ifdef HAVE_TPETRA_DEBUG
1003  {
1004  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1005  const bool condition =
1006  nth_offset_h != static_cast<size_t>(imports.extent (0));
1007  TEUCHOS_TEST_FOR_EXCEPTION
1008  (condition, std::logic_error, prefix
1009  << "The final offset in bytes " << nth_offset_h
1010  << " != imports.size() = " << imports.extent(0)
1011  << ". Please report this bug to the Tpetra developers.");
1012  }
1013 #endif // HAVE_TPETRA_DEBUG
1014 
1015  // Setup row pointers for remotes
1016  int k_error =
1017  setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1018  import_lids, imports, num_packets_per_lid, offsets);
1019  TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1020  << " Error transferring data to target row pointers. "
1021  "Please report this bug to the Tpetra developers.");
1022 
1023  // If multiple processes contribute to the same row, we may need to
1024  // update row offsets. This tracks that.
1025  View<size_t*, DT> new_start_row ("new_start_row", N+1);
1026 
1027  // Turn row length into a real CRS row pointer
1028  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1029  {
1030  auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
1031  bool condition = nth_tgt_rowptr_h != mynnz;
1032  TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
1033  prefix << "CRS_rowptr[last] = " <<
1034  nth_tgt_rowptr_h << "!= mynnz = " << mynnz << ".");
1035  }
1036 
1037  // SameIDs: Copy the data over
1038  copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1039  tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1040 
1041  copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1042  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1043  local_matrix, local_col_map, my_pid);
1044 
1045  if (imports.extent(0) <= 0) {
1046  return;
1047  }
1048 
1049  int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1050  tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1051  local_matrix, local_col_map, my_pid, num_bytes_per_value);
1052  TEUCHOS_TEST_FOR_EXCEPTION(
1053  unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1054  "should never happen. Please report this bug to the Tpetra developers.");
1055 
1056  return;
1057 }
1058 
1059 } // namespace UnpackAndCombineCrsMatrixImpl
1060 
1100 template<typename ST, typename LO, typename GO, typename Node>
1101 void
1103  const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1104  const Teuchos::ArrayView<const char>& imports,
1105  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1106  const Teuchos::ArrayView<const LO>& importLIDs,
1107  size_t /* constantNumPackets */,
1108  Distributor & /* distor */,
1109  CombineMode combineMode,
1110  const bool atomic)
1111 {
1112  using Kokkos::View;
1113  typedef typename Node::device_type device_type;
1114  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_type local_matrix_type;
1115  static_assert (std::is_same<device_type, typename local_matrix_type::device_type>::value,
1116  "Node::device_type and LocalMatrix::device_type must be the same.");
1117 
1118  // Execution space.
1119  typedef typename device_type::execution_space XS;
1120 
1121  // Convert all Teuchos::Array to Kokkos::View.
1122  typename XS::device_type outputDevice;
1123 
1124  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1125  // them to device. Since unpacking is done directly in to the local matrix
1126  // (lclMatrix), no copying needs to be performed after unpacking.
1127  auto num_packets_per_lid_d =
1128  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1129  numPacketsPerLID.size(), true, "num_packets_per_lid");
1130 
1131  auto import_lids_d =
1132  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1133  importLIDs.size(), true, "import_lids");
1134 
1135  auto imports_d =
1136  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1137  imports.size(), true, "imports");
1138 
1139  auto local_matrix = sourceMatrix.getLocalMatrix();
1140  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1141 
1142  // Now do the actual unpack!
1143  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1144  local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1145  import_lids_d, combineMode, false, atomic);
1146 
1147  return;
1148 }
1149 
1150 template<typename ST, typename LO, typename GO, typename NT>
1151 void
1152 unpackCrsMatrixAndCombineNew (const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1153  const Kokkos::DualView<const char*,
1155  const Kokkos::DualView<const size_t*,
1156  typename DistObject<char, LO, GO, NT>::buffer_device_type>& numPacketsPerLID,
1157  const Kokkos::DualView<const LO*,
1159  const size_t /* constantNumPackets */,
1160  Distributor& /* distor */,
1161  const CombineMode combineMode,
1162  const bool atomic)
1163 {
1165  using Kokkos::View;
1166  using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1167  using dist_object_type = DistObject<char, LO, GO, NT>;
1168  using device_type = typename crs_matrix_type::device_type;
1169  using local_matrix_type = typename crs_matrix_type::local_matrix_type;
1170  using buffer_device_type = typename dist_object_type::buffer_device_type;
1171 
1172  static_assert
1173  (std::is_same<device_type, typename local_matrix_type::device_type>::value,
1174  "crs_matrix_type::device_type and local_matrix_type::device_type "
1175  "must be the same.");
1176 
1177  {
1178  auto numPacketsPerLID_nc = castAwayConstDualView (numPacketsPerLID);
1179  numPacketsPerLID_nc.sync_device ();
1180  }
1181  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1182 
1183  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1184  auto import_lids_d = importLIDs.view_device ();
1185 
1186  {
1187  auto imports_nc = castAwayConstDualView (imports);
1188  imports_nc.sync_device ();
1189  }
1190  auto imports_d = imports.view_device ();
1191 
1192  auto local_matrix = sourceMatrix.getLocalMatrix ();
1193  auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1194  typedef decltype (local_col_map) local_map_type;
1195 
1196  // Now do the actual unpack!
1197  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1198  local_matrix_type,
1199  local_map_type,
1200  buffer_device_type
1201  > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1202  import_lids_d, combineMode, false, atomic);
1203 }
1204 
1251 //
1260 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1261 size_t
1263  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1264  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1265  const Teuchos::ArrayView<const char> &imports,
1266  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1267  size_t /* constantNumPackets */,
1268  Distributor &/* distor */,
1269  CombineMode /* combineMode */,
1270  size_t numSameIDs,
1271  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1272  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1273 {
1274  using Kokkos::MemoryUnmanaged;
1275  using Kokkos::View;
1276  typedef typename Node::device_type DT;
1278  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1279 
1280  TEUCHOS_TEST_FOR_EXCEPTION
1281  (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1282  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1283  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1284  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1285  // process, then the matrix is neither locally nor globally indexed.
1286  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1287  TEUCHOS_TEST_FOR_EXCEPTION
1288  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1289  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1290  TEUCHOS_TEST_FOR_EXCEPTION
1291  (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1292  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1293  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1294 
1295  auto local_matrix = sourceMatrix.getLocalMatrix ();
1296  auto permute_from_lids_d =
1298  permuteFromLIDs.getRawPtr (),
1299  permuteFromLIDs.size (), true,
1300  "permute_from_lids");
1301  auto imports_d =
1303  imports.getRawPtr (),
1304  imports.size (), true,
1305  "imports");
1306  auto num_packets_per_lid_d =
1308  numPacketsPerLID.getRawPtr (),
1309  numPacketsPerLID.size (), true,
1310  "num_packets_per_lid");
1311 
1313  local_matrix, permute_from_lids_d, imports_d,
1314  num_packets_per_lid_d, numSameIDs);
1315 }
1316 
1331 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1332 void
1335  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1336  const Teuchos::ArrayView<const char>& imports,
1337  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1338  const size_t /* constantNumPackets */,
1339  Distributor& /* distor */,
1340  const CombineMode /* combineMode */,
1341  const size_t numSameIDs,
1342  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1343  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1344  size_t TargetNumRows,
1345  size_t TargetNumNonzeros,
1346  const int MyTargetPID,
1347  const Teuchos::ArrayView<size_t>& CRS_rowptr,
1348  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1349  const Teuchos::ArrayView<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>& CRS_vals,
1350  const Teuchos::ArrayView<const int>& SourcePids,
1351  Teuchos::Array<int>& TargetPids)
1352 {
1354 
1355  using Kokkos::View;
1356  using Kokkos::deep_copy;
1357 
1358  using Teuchos::ArrayView;
1359  using Teuchos::outArg;
1360  using Teuchos::REDUCE_MAX;
1361  using Teuchos::reduceAll;
1362 
1363  typedef LocalOrdinal LO;
1364 
1365  typedef typename Node::device_type DT;
1366  typedef typename DT::execution_space XS;
1367 
1369  typedef typename matrix_type::impl_scalar_type ST;
1370  typedef typename ArrayView<const LO>::size_type size_type;
1371 
1372  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1373 
1374  TEUCHOS_TEST_FOR_EXCEPTION(
1375  TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1376  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1377  CRS_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1378 
1379  TEUCHOS_TEST_FOR_EXCEPTION(
1380  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1381  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
1382  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
1383  const size_type numImportLIDs = importLIDs.size ();
1384 
1385  TEUCHOS_TEST_FOR_EXCEPTION(
1386  numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1387  prefix << "importLIDs.size() = " << numImportLIDs << " != "
1388  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1389 
1390  // Preseed TargetPids with -1 for local
1391  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1392  TargetPids.resize (TargetNumNonzeros);
1393  }
1394  TargetPids.assign (TargetNumNonzeros, -1);
1395 
1396  // Grab pointers for sourceMatrix
1397  auto local_matrix = sourceMatrix.getLocalMatrix();
1398  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1399 
1400  // Convert input arrays to Kokkos::View
1401  typename XS::device_type outputDevice;
1402  auto import_lids_d =
1403  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1404  importLIDs.size(), true, "import_lids");
1405 
1406  auto imports_d =
1407  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1408  imports.size(), true, "imports");
1409 
1410  auto num_packets_per_lid_d =
1411  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1412  numPacketsPerLID.size(), true, "num_packets_per_lid");
1413 
1414  auto permute_from_lids_d =
1415  create_mirror_view_from_raw_host_array(outputDevice, permuteFromLIDs.getRawPtr(),
1416  permuteFromLIDs.size(), true, "permute_from_lids");
1417 
1418  auto permute_to_lids_d =
1419  create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(),
1420  permuteToLIDs.size(), true, "permute_to_lids");
1421 
1422  auto crs_rowptr_d =
1423  create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1424  CRS_rowptr.size(), true, "crs_rowptr");
1425 
1426  auto crs_colind_d =
1427  create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1428  CRS_colind.size(), true, "crs_colidx");
1429 
1430 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1431  static_assert (! std::is_same<
1432  typename std::remove_const<
1433  typename std::decay<
1434  decltype (CRS_vals)
1435  >::type::value_type
1436  >::type,
1437  std::complex<double> >::value,
1438  "CRS_vals::value_type is std::complex<double>; this should never happen"
1439  ", since std::complex does not work in Kokkos::View objects.");
1440 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1441 
1442  auto crs_vals_d =
1443  create_mirror_view_from_raw_host_array(outputDevice, CRS_vals.getRawPtr(),
1444  CRS_vals.size(), true, "crs_vals");
1445 
1446 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1447  static_assert (! std::is_same<
1448  typename decltype (crs_vals_d)::non_const_value_type,
1449  std::complex<double> >::value,
1450  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1451  "never happen, since std::complex does not work in Kokkos::View objects.");
1452 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1453 
1454  auto src_pids_d =
1455  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1456  SourcePids.size(), true, "src_pids");
1457 
1458  auto tgt_pids_d =
1459  create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1460  TargetPids.size(), true, "tgt_pids");
1461 
1462  size_t num_bytes_per_value = 0;
1464  // assume that ST is default constructible
1465  num_bytes_per_value = PackTraits<ST,DT>::packValueCount(ST());
1466  }
1467  else {
1468  // Since the packed data come from the source matrix, we can use the source
1469  // matrix to get the number of bytes per Scalar value stored in the matrix.
1470  // This assumes that all Scalar values in the source matrix require the same
1471  // number of bytes. If the source matrix has no entries on the calling
1472  // process, then we hope that some process does have some idea how big
1473  // a Scalar value is. Of course, if no processes have any entries, then no
1474  // values should be packed (though this does assume that in our packing
1475  // scheme, rows with zero entries take zero bytes).
1476  size_t num_bytes_per_value_l = 0;
1477  if (local_matrix.values.extent(0) > 0) {
1478  const ST& val = local_matrix.values(0);
1479  num_bytes_per_value_l = PackTraits<ST,DT>::packValueCount(val);
1480  } else {
1481  const ST& val = crs_vals_d(0);
1482  num_bytes_per_value_l = PackTraits<ST,DT>::packValueCount(val);
1483  }
1484  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1485  Teuchos::REDUCE_MAX,
1486  num_bytes_per_value_l,
1487  outArg(num_bytes_per_value));
1488  }
1489 
1490 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1491  static_assert (! std::is_same<
1492  typename decltype (crs_vals_d)::non_const_value_type,
1493  std::complex<double> >::value,
1494  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1495  "never happen, since std::complex does not work in Kokkos::View objects.");
1496 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1497 
1499  local_matrix, local_col_map, import_lids_d, imports_d,
1500  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1501  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1502  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1503  num_bytes_per_value);
1504 
1505  // Copy outputs back to host
1506  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1507  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1508  deep_copy(crs_rowptr_h, crs_rowptr_d);
1509 
1510  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1511  CRS_colind.getRawPtr(), CRS_colind.size());
1512  deep_copy(crs_colind_h, crs_colind_d);
1513 
1514  typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1515  CRS_vals.getRawPtr(), CRS_vals.size());
1516  deep_copy(crs_vals_h, crs_vals_d);
1517 
1518  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1519  TargetPids.getRawPtr(), TargetPids.size());
1520  deep_copy(tgt_pids_h, tgt_pids_d);
1521 
1522 }
1523 
1524 } // namespace Details
1525 } // namespace Tpetra
1526 
1527 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1528  template void \
1529  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1530  const CrsMatrix<ST, LO, GO, NT>&, \
1531  const Teuchos::ArrayView<const char>&, \
1532  const Teuchos::ArrayView<const size_t>&, \
1533  const Teuchos::ArrayView<const LO>&, \
1534  size_t, \
1535  Distributor&, \
1536  CombineMode, \
1537  const bool); \
1538  template void \
1539  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1540  const CrsMatrix<ST, LO, GO, NT>&, \
1541  const Kokkos::DualView<const char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1542  const Kokkos::DualView<const size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1543  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1544  const size_t, \
1545  Distributor&, \
1546  const CombineMode, \
1547  const bool); \
1548  template void \
1549  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1550  const CrsMatrix<ST, LO, GO, NT> &, \
1551  const Teuchos::ArrayView<const LO>&, \
1552  const Teuchos::ArrayView<const char>&, \
1553  const Teuchos::ArrayView<const size_t>&, \
1554  const size_t, \
1555  Distributor&, \
1556  const CombineMode, \
1557  const size_t, \
1558  const Teuchos::ArrayView<const LO>&, \
1559  const Teuchos::ArrayView<const LO>&, \
1560  size_t, \
1561  size_t, \
1562  const int, \
1563  const Teuchos::ArrayView<size_t>&, \
1564  const Teuchos::ArrayView<GO>&, \
1565  const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1566  const Teuchos::ArrayView<const int>&, \
1567  Teuchos::Array<int>&); \
1568  template size_t \
1569  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1570  const CrsMatrix<ST, LO, GO, NT> &, \
1571  const Teuchos::ArrayView<const LO> &, \
1572  const Teuchos::ArrayView<const char> &, \
1573  const Teuchos::ArrayView<const size_t>&, \
1574  size_t, \
1575  Distributor &, \
1576  CombineMode, \
1577  size_t, \
1578  const Teuchos::ArrayView<const LO>&, \
1579  const Teuchos::ArrayView<const LO>&);
1580 
1581 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for &quot;invalid&quot; (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index.
Traits class for packing / unpacking data of type T, using Kokkos data structures that live in the gi...
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
&quot;Local&quot; part of Map suitable for Kokkos kernels.
typename Kokkos::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don&#39;t currently exist.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Declare and define the function Tpetra::Details::computeOffsetsFromCounts, an implementation detail o...
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Replace old value with maximum of magnitudes of old and new values.
local_matrix_type getLocalMatrix() const
The local sparse matrix.
Replace existing values with new values.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Kokkos::View< value_type *, DT, Kokkos::MemoryUnmanaged > output_array_type
The type of an output array of value_type.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, const bool atomic)
Unpack the imported column indices and values, and combine into matrix.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...