Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
42 
43 #include <memory>
44 #include <string>
45 #include "TpetraCore_config.h"
46 #include "Kokkos_Core.hpp"
47 #include "Teuchos_Array.hpp"
48 #include "Teuchos_ArrayView.hpp"
49 #include "Teuchos_OrdinalTraits.hpp"
50 #include "Teuchos_TimeMonitor.hpp"
58 #include "Tpetra_Details_DefaultTypes.hpp"
60 
81 
82 namespace Tpetra {
83 
84 //
85 // Users must never rely on anything in the Details namespace.
86 //
87 namespace Details {
88 
89 namespace UnpackAndCombineCrsMatrixImpl {
90 
100 template<class ST, class LO, class GO>
101 KOKKOS_FUNCTION int
102 unpackRow(const typename PackTraits<GO>::output_array_type& gids_out,
103  const typename PackTraits<int>::output_array_type& pids_out,
104  const typename PackTraits<ST>::output_array_type& vals_out,
105  const char imports[],
106  const size_t offset,
107  const size_t /* num_bytes */,
108  const size_t num_ent,
109  const size_t bytes_per_value)
110 {
111  if (num_ent == 0) {
112  // Empty rows always take zero bytes, to ensure sparsity.
113  return 0;
114  }
115  bool unpack_pids = pids_out.size() > 0;
116 
117  const size_t num_ent_beg = offset;
118  const size_t num_ent_len = PackTraits<LO>::packValueCount (LO (0));
119 
120  const size_t gids_beg = num_ent_beg + num_ent_len;
121  const size_t gids_len =
122  num_ent * PackTraits<GO>::packValueCount (GO (0));
123 
124  const size_t pids_beg = gids_beg + gids_len;
125  const size_t pids_len = unpack_pids ?
126  size_t (num_ent * PackTraits<int>::packValueCount (int (0))) :
127  size_t (0);
128 
129  const size_t vals_beg = gids_beg + gids_len + pids_len;
130  const size_t vals_len = num_ent * bytes_per_value;
131 
132  const char* const num_ent_in = imports + num_ent_beg;
133  const char* const gids_in = imports + gids_beg;
134  const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
135  const char* const vals_in = imports + vals_beg;
136 
137  size_t num_bytes_out = 0;
138  LO num_ent_out;
139  num_bytes_out += PackTraits<LO>::unpackValue (num_ent_out, num_ent_in);
140  if (static_cast<size_t> (num_ent_out) != num_ent) {
141  return 20; // error code
142  }
143 
144  {
145  Kokkos::pair<int, size_t> p;
146  p = PackTraits<GO>::unpackArray (gids_out.data (), gids_in, num_ent);
147  if (p.first != 0) {
148  return 21; // error code
149  }
150  num_bytes_out += p.second;
151 
152  if (unpack_pids) {
153  p = PackTraits<int>::unpackArray (pids_out.data (), pids_in, num_ent);
154  if (p.first != 0) {
155  return 22; // error code
156  }
157  num_bytes_out += p.second;
158  }
159 
160  p = PackTraits<ST>::unpackArray (vals_out.data (), vals_in, num_ent);
161  if (p.first != 0) {
162  return 23; // error code
163  }
164  num_bytes_out += p.second;
165  }
166 
167  const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
168  if (num_bytes_out != expected_num_bytes) {
169  return 24; // error code
170  }
171  return 0; // no errors
172 } //unpackRow
173 
184 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
186  typedef LocalMatrix local_matrix_type;
187  typedef LocalMap local_map_type;
188 
189  typedef typename local_matrix_type::value_type ST;
190  typedef typename local_map_type::local_ordinal_type LO;
191  typedef typename local_map_type::global_ordinal_type GO;
192  typedef typename local_map_type::device_type DT;
193  typedef typename DT::execution_space XS;
194 
195  typedef Kokkos::View<const size_t*, BufferDeviceType>
196  num_packets_per_lid_type;
197  typedef Kokkos::View<const size_t*, DT> offsets_type;
198  typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
199  typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
200 
201  typedef Kokkos::View<int, DT> error_type;
202  using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
203 
204  static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
205  "LocalMap::local_ordinal_type and "
206  "LocalMatrix::ordinal_type must be the same.");
207 
208  local_matrix_type local_matrix;
209  local_map_type local_col_map;
210  input_buffer_type imports;
211  num_packets_per_lid_type num_packets_per_lid;
212  import_lids_type import_lids;
213  Kokkos::View<const LO*[2], DT> batch_info;
214  offsets_type offsets;
215  Tpetra::CombineMode combine_mode;
216  size_t batch_size;
217  size_t bytes_per_value;
218  bool atomic;
219  error_type error_code;
220 
222  const local_matrix_type& local_matrix_in,
223  const local_map_type& local_col_map_in,
224  const input_buffer_type& imports_in,
225  const num_packets_per_lid_type& num_packets_per_lid_in,
226  const import_lids_type& import_lids_in,
227  const Kokkos::View<const LO*[2], DT>& batch_info_in,
228  const offsets_type& offsets_in,
229  const Tpetra::CombineMode combine_mode_in,
230  const size_t batch_size_in,
231  const size_t bytes_per_value_in,
232  const bool atomic_in) :
233  local_matrix (local_matrix_in),
234  local_col_map (local_col_map_in),
235  imports (imports_in),
236  num_packets_per_lid (num_packets_per_lid_in),
237  import_lids (import_lids_in),
238  batch_info (batch_info_in),
239  offsets (offsets_in),
240  combine_mode (combine_mode_in),
241  batch_size (batch_size_in),
242  bytes_per_value (bytes_per_value_in),
243  atomic (atomic_in),
244  error_code("error")
245  {}
246 
247  KOKKOS_INLINE_FUNCTION
248  void operator()(member_type team_member) const
249  {
250  using Kokkos::View;
251  using Kokkos::subview;
252  using Kokkos::MemoryUnmanaged;
253 
254  const LO batch = team_member.league_rank();
255  const LO lid_no = batch_info(batch, 0);
256  const LO batch_no = batch_info(batch, 1);
257 
258  const size_t num_bytes = num_packets_per_lid(lid_no);
259 
260  // Only unpack data if there is a nonzero number of bytes.
261  if (num_bytes == 0)
262  return;
263 
264  // there is actually something in the row
265  const LO import_lid = import_lids(lid_no);
266  const size_t buf_size = imports.size();
267  const size_t offset = offsets(lid_no);
268 
269  // Get the number of entries to expect in the received data for this row.
270  LO num_ent_LO = 0;
271  const char* const in_buf = imports.data() + offset;
272  (void) PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
273  const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
274 
275  // Count the number of bytes expected to unpack
276  size_t expected_num_bytes = 0;
277  {
278  expected_num_bytes += PackTraits<LO>::packValueCount(LO(0));
279  expected_num_bytes += num_entries_in_row * PackTraits<GO>::packValueCount(GO(0));
280  expected_num_bytes += num_entries_in_row * PackTraits<ST>::packValueCount(ST());
281  }
282 
283  if (expected_num_bytes > num_bytes)
284  {
285 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
286 #ifndef KOKKOS_ENABLE_SYCL
287  printf(
288  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
289  "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
290  (int) lid_no, (int) expected_num_bytes, (int) num_bytes
291  );
292 #endif
293  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
294  return;
295  }
296 
297  if (offset > buf_size || offset + num_bytes > buf_size)
298  {
299 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
300 #ifndef KOKKOS_ENABLE_SYCL
301  printf(
302  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
303  "At row %d, the offset (%d) > buffer size (%d)\n",
304  (int) lid_no, (int) offset, (int) buf_size
305  );
306 #endif
307  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
308  return;
309  }
310 
311  // Determine the number of entries to unpack in this batch
312  size_t num_entries_in_batch = 0;
313  if (num_entries_in_row <= batch_size)
314  num_entries_in_batch = num_entries_in_row;
315  else if (num_entries_in_row >= (batch_no + 1) * batch_size)
316  num_entries_in_batch = batch_size;
317  else
318  num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
319 
320  const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
321  const size_t num_ent_start = offset;
322  const size_t num_ent_end = num_ent_start + bytes_per_lid;
323 
324  const size_t bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
325  const size_t gids_start = num_ent_end;
326  const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
327 
328  const size_t vals_start = gids_end;
329 
330  const size_t shift = batch_no * batch_size;
331  const char* const num_ent_in = imports.data() + num_ent_start;
332  const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
333  const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
334 
335  LO num_ent_out;
336  (void)PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
337  if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
338  {
339 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
340 #ifndef KOKKOS_ENABLE_SYCL
341  printf(
342  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
343  "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
344  (int) lid_no, (int) num_entries_in_row, (int) num_ent_out
345  );
346 #endif
347  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
348  }
349 
350  constexpr bool matrix_has_sorted_rows = true; // see #6282
351  //Note BMK 6-22: this lambda must use capture-by-value [=] and not capture-by-ref [&].
352  //By ref triggers compiler bug in CUDA 10.
353  Kokkos::parallel_for(
354  Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
355  [=](const LO& j)
356  {
357  size_t distance = 0;
358 
359  GO gid_out;
360  distance = j * bytes_per_gid;
361  (void) PackTraits<GO>::unpackValue(gid_out, gids_in + distance);
362  auto lid_out = local_col_map.getLocalElement(gid_out);
363 
364  // Column indices come in as global indices, in case the
365  // source object's column Map differs from the target object's
366  // (this's) column Map, and must be converted local index values
367 
368  // assume that ST is default constructible
369  ST val_out;
370  distance = j * bytes_per_value;
371  (void) PackTraits<ST>::unpackValue(val_out, vals_in + distance);
372 
373  if (combine_mode == ADD) {
374  // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
375  // different threads don't touch the same row (i.e., no
376  // duplicates in incoming LIDs list).
377  const bool use_atomic_updates = atomic;
378  (void)local_matrix.sumIntoValues(
379  import_lid,
380  &lid_out,
381  1,
382  &val_out,
383  matrix_has_sorted_rows,
384  use_atomic_updates
385  );
386  } else if (combine_mode == REPLACE) {
387  // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
388  // combine mode with multiple incoming rows that touch the same
389  // target matrix entries, so we never need atomic updates.
390  const bool use_atomic_updates = false;
391  (void)local_matrix.replaceValues(
392  import_lid,
393  &lid_out,
394  1,
395  &val_out,
396  matrix_has_sorted_rows,
397  use_atomic_updates
398  );
399  } else {
400  // should never get here
401 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
402 #ifndef KOKKOS_ENABLE_SYCL
403  printf(
404  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
405  "At row %d, an unknown error occurred during unpack\n", (int) lid_no
406  );
407 #endif
408  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
409  }
410  }
411  );
412 
413  team_member.team_barrier();
414 
415  }
416 
418  int error() const {
419  auto error_code_h = Kokkos::create_mirror_view_and_copy(
420  Kokkos::HostSpace(), error_code
421  );
422  return error_code_h();
423  }
424 
425 }; //UnpackCrsMatrixAndCombineFunctor
426 
427 struct MaxNumEntTag {};
428 struct TotNumEntTag {};
429 
438 template<class LO, class DT, class BDT>
440 public:
441  typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
442  typedef Kokkos::View<const size_t*, DT> offsets_type;
443  typedef Kokkos::View<const char*, BDT> input_buffer_type;
444  // This needs to be public, since it appears in the argument list of
445  // public methods (see below). Otherwise, build errors may happen.
446  typedef size_t value_type;
447 
448 private:
449  num_packets_per_lid_type num_packets_per_lid;
450  offsets_type offsets;
451  input_buffer_type imports;
452 
453 public:
454  NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
455  const offsets_type& offsets_in,
456  const input_buffer_type& imports_in) :
457  num_packets_per_lid (num_packets_per_lid_in),
458  offsets (offsets_in),
459  imports (imports_in)
460  {}
461 
462  KOKKOS_INLINE_FUNCTION void
463  operator() (const MaxNumEntTag, const LO i, value_type& update) const {
464  // Get how many entries to expect in the received data for this row.
465  const size_t num_bytes = num_packets_per_lid(i);
466  if (num_bytes > 0) {
467  LO num_ent_LO = 0; // output argument of unpackValue
468  const char* const in_buf = imports.data () + offsets(i);
469  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
470  const size_t num_ent = static_cast<size_t> (num_ent_LO);
471 
472  update = (update < num_ent) ? num_ent : update;
473  }
474  }
475 
476  KOKKOS_INLINE_FUNCTION void
477  join (const MaxNumEntTag,
478  value_type& dst,
479  const value_type& src) const
480  {
481  if (dst < src) dst = src;
482  }
483 
484  KOKKOS_INLINE_FUNCTION void
485  operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
486  // Get how many entries to expect in the received data for this row.
487  const size_t num_bytes = num_packets_per_lid(i);
488  if (num_bytes > 0) {
489  LO num_ent_LO = 0; // output argument of unpackValue
490  const char* const in_buf = imports.data () + offsets(i);
491  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
492  tot_num_ent += static_cast<size_t> (num_ent_LO);
493  }
494  }
495 }; //NumEntriesFunctor
496 
504 template<class LO, class DT, class BDT>
505 size_t
506 compute_maximum_num_entries (
507  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
508  const Kokkos::View<const size_t*, DT>& offsets,
509  const Kokkos::View<const char*, BDT>& imports)
510 {
511  typedef typename DT::execution_space XS;
512  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
513  MaxNumEntTag> range_policy;
514 
515  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
516  imports);
517  const LO numRowsToUnpack =
518  static_cast<LO> (num_packets_per_lid.extent (0));
519  size_t max_num_ent = 0;
520  Kokkos::parallel_reduce ("Max num entries in CRS",
521  range_policy (0, numRowsToUnpack),
522  functor, max_num_ent);
523  return max_num_ent;
524 }
525 
533 template<class LO, class DT, class BDT>
534 size_t
535 compute_total_num_entries (
536  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
537  const Kokkos::View<const size_t*, DT>& offsets,
538  const Kokkos::View<const char*, BDT>& imports)
539 {
540  typedef typename DT::execution_space XS;
541  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
542  size_t tot_num_ent = 0;
543  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
544  imports);
545  const LO numRowsToUnpack =
546  static_cast<LO> (num_packets_per_lid.extent (0));
547  Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
548  range_policy (0, numRowsToUnpack),
549  functor, tot_num_ent);
550  return tot_num_ent;
551 }
552 
553 template<class LO>
554 KOKKOS_INLINE_FUNCTION
555 size_t
556 unpackRowCount(const char imports[],
557  const size_t offset,
558  const size_t num_bytes)
559 {
560  using PT = PackTraits<LO>;
561 
562  LO num_ent_LO = 0;
563  if (num_bytes > 0) {
564  const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
565  if (p_num_bytes > num_bytes) {
566  return OrdinalTraits<size_t>::invalid();
567  }
568  const char* const in_buf = imports + offset;
569  (void) PT::unpackValue(num_ent_LO, in_buf);
570  }
571  return static_cast<size_t>(num_ent_LO);
572 }
573 
/// \brief Fill the (row index, batch number) table from per-row batch counts.
///
/// For every row \c i and every \c b in <tt>[0, batches_per_lid(i))</tt>, the
/// next free row of \c batch_info is set to <tt>(i, b)</tt>, in increasing
/// order of \c i and then \c b.
///
/// \param batches_per_lid [in] Rank-1 view-like object; entry \c i is the
///   number of batches for row \c i.
/// \param batch_info [out] Rank-2 view-like object with two columns and one
///   row per batch.
///
/// \return \c true if exactly <tt>batch_info.extent(0)</tt> batches were
///   written.  Returns \c false if \c batch_info has more rows than there
///   are batches, or if it is too small; in the latter case filling stops as
///   soon as the table is full, so nothing is written out of bounds.
template<class View1, class View2>
inline
bool
compute_batch_info(
  const View1& batches_per_lid,
  View2& batch_info
)
{
  using LO = typename View2::value_type;

  const size_t num_lids = batches_per_lid.extent(0);
  const size_t capacity = batch_info.extent(0);

  size_t batch = 0;
  for (size_t i = 0; i < num_lids; ++i)
  {
    const size_t num_batches_for_lid = batches_per_lid(i);
    for (size_t batch_no = 0; batch_no < num_batches_for_lid; ++batch_no)
    {
      // More batches requested than the table can hold: report the
      // mismatch instead of writing past the end of batch_info.
      if (batch >= capacity) {
        return false;
      }
      batch_info(batch, 0) = static_cast<LO>(i);
      batch_info(batch, 1) = static_cast<LO>(batch_no);
      ++batch;
    }
  }
  return batch == capacity;
}
599 
607 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
608 void
609 unpackAndCombineIntoCrsMatrix(
610  const LocalMatrix& local_matrix,
611  const LocalMap& local_map,
612  const Kokkos::View<const char*, BufferDeviceType>& imports,
613  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
614  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
615  const Tpetra::CombineMode combine_mode)
616 {
617  using ST = typename LocalMatrix::value_type;
618  using LO = typename LocalMap::local_ordinal_type;
619  using DT = typename LocalMap::device_type;
620  using XS = typename DT::execution_space;
621  const char prefix[] =
622  "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
623  "unpackAndCombineIntoCrsMatrix: ";
624 
625  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
626  if (num_import_lids == 0) {
627  // Nothing to unpack
628  return;
629  }
630 
631  {
632  // Check for correct input
633  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
634  std::invalid_argument,
635  prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
636  "static graph (i.e., was constructed with the CrsMatrix constructor "
637  "that takes a const CrsGraph pointer).");
638 
639  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
640  std::invalid_argument,
641  prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
642  "(i.e., was constructed with the CrsMatrix constructor that takes a "
643  "const CrsGraph pointer).");
644 
645  // Unknown combine mode!
646  TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
647  std::invalid_argument,
648  prefix << "Invalid combine mode; should never get "
649  "here! Please report this bug to the Tpetra developers.");
650 
651  // Check that sizes of input objects are consistent.
652  bool bad_num_import_lids =
653  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
654  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
655  std::invalid_argument,
656  prefix << "importLIDs.size() (" << num_import_lids << ") != "
657  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
658  } // end QA error checking
659 
660  // Get the offsets
661  Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
662  computeOffsetsFromCounts(offsets, num_packets_per_lid);
663 
664  // Determine the sizes of the unpack batches
665  size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
666  const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize();
667  const size_t batch_size = std::min(default_batch_size, max_num_ent);
668 
669  // To achieve some balance amongst threads, unpack each row in equal size batches
670  size_t num_batches = 0;
671  Kokkos::View<LO*[2], DT> batch_info("", num_batches);
672  Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
673  // Compute meta data that allows batch unpacking
674  Kokkos::parallel_reduce(
675  Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
676  KOKKOS_LAMBDA(const size_t i, size_t& batches)
677  {
678  const size_t num_entries_in_row = unpackRowCount<LO>(
679  imports.data(), offsets(i), num_packets_per_lid(i)
680  );
681  batches_per_lid(i) =
682  (num_entries_in_row <= batch_size) ?
683  1 :
684  num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
685  batches += batches_per_lid(i);
686  },
687  num_batches
688  );
689  Kokkos::resize(batch_info, num_batches);
690 
691  Kokkos::HostSpace host_space;
692  auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
693  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
694  Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
695 
696  auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
697 
698  (void) compute_batch_info(batches_per_lid_h, batch_info_h);
699  // DEEP_COPY REVIEW - HOSTMIRROR-TO-DEVICE
700  Kokkos::deep_copy(XS(), batch_info, batch_info_h);
701 
702  // FIXME (TJF SEP 2017)
703  // The scalar type is not necessarily default constructible
704  size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
705 
706  // Now do the actual unpack!
707  const bool atomic = XS().concurrency() != 1;
708  using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
709  functor f(
710  local_matrix,
711  local_map,
712  imports,
713  num_packets_per_lid,
714  import_lids,
715  batch_info,
716  offsets,
717  combine_mode,
718  batch_size,
719  bytes_per_value,
720  atomic
721  );
722 
723  using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
725  if (!Spaces::is_gpu_exec_space<XS>() || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
726  {
727  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
728  }
729  else
730  {
731  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
732  }
733 
734  auto error_code = f.error();
735  TEUCHOS_TEST_FOR_EXCEPTION(
736  error_code != 0,
737  std::runtime_error,
738  prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
739  );
740 } //unpackAndCombineIntoCrsMatrix (Kokkos version)
741 
742 template<class LocalMatrix, class BufferDeviceType>
743 size_t
745  const LocalMatrix& local_matrix,
746  const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
747  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
748  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
749  const size_t num_same_ids)
750 {
751  using Kokkos::parallel_reduce;
752  typedef typename LocalMatrix::ordinal_type LO;
753  typedef typename LocalMatrix::device_type device_type;
754  typedef typename device_type::execution_space XS;
755  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
756  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
757  typedef BufferDeviceType BDT;
758 
759  size_t count = 0;
760  LO num_items;
761 
762  // Number of matrix entries to unpack (returned by this function).
763  num_items = static_cast<LO>(num_same_ids);
764  if (num_items) {
765  size_t kcnt = 0;
766  parallel_reduce(range_policy(0, num_items),
767  KOKKOS_LAMBDA(const LO lid, size_t& update) {
768  update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
769  -local_matrix.graph.row_map[lid]);
770  }, kcnt);
771  count += kcnt;
772  }
773 
774  // Count entries copied directly from the source matrix with permuting.
775  num_items = static_cast<LO>(permute_from_lids.extent(0));
776  if (num_items) {
777  size_t kcnt = 0;
778  parallel_reduce(range_policy(0, num_items),
779  KOKKOS_LAMBDA(const LO i, size_t& update) {
780  const LO lid = permute_from_lids(i);
781  update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
782  - local_matrix.graph.row_map[lid]);
783  }, kcnt);
784  count += kcnt;
785  }
786 
787  {
788  // Count entries received from other MPI processes.
789  const size_type np = num_packets_per_lid.extent(0);
790  Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
791  computeOffsetsFromCounts(offsets, num_packets_per_lid);
792  count +=
793  compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
794  offsets, imports);
795  }
796 
797  return count;
798 } //unpackAndCombineWithOwningPIDsCount (Kokkos version)
799 
801 template<class LO, class DT, class BDT>
802 int
803 setupRowPointersForRemotes(
804  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
805  const typename PackTraits<LO>::input_array_type& import_lids,
806  const Kokkos::View<const char*, BDT>& imports,
807  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
808  const typename PackTraits<size_t>::input_array_type& offsets)
809 {
810  using Kokkos::parallel_reduce;
811  typedef typename DT::execution_space XS;
812  typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
813  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
814 
815  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
816  const size_type N = num_packets_per_lid.extent(0);
817 
818  int errors = 0;
819  parallel_reduce ("Setup row pointers for remotes",
820  range_policy (0, N),
821  KOKKOS_LAMBDA (const size_t i, int& k_error) {
822  typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
823  const size_t num_bytes = num_packets_per_lid(i);
824  const size_t offset = offsets(i);
825  const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
826  if (num_ent == InvalidNum) {
827  k_error += 1;
828  }
829  Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
830  }, errors);
831  return errors;
832 }
833 
834 // Convert array of row lengths to a CRS pointer array
835 template<class DT>
836 void
837 makeCrsRowPtrFromLengths(
838  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
839  const Kokkos::View<size_t*,DT>& new_start_row)
840 {
841  using Kokkos::parallel_scan;
842  typedef typename DT::execution_space XS;
843  typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
844  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
845  const size_type N = new_start_row.extent(0);
846  parallel_scan(range_policy(0, N),
847  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
848  auto cur_val = tgt_rowptr(i);
849  if (final) {
850  tgt_rowptr(i) = update;
851  new_start_row(i) = tgt_rowptr(i);
852  }
853  update += cur_val;
854  }
855  );
856 }
857 
858 template<class LocalMatrix, class LocalMap>
859 void
860 copyDataFromSameIDs(
861  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
862  const typename PackTraits<int>::output_array_type& tgt_pids,
863  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
864  const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
865  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
866  const typename PackTraits<int>::input_array_type& src_pids,
867  const LocalMatrix& local_matrix,
868  const LocalMap& local_col_map,
869  const size_t num_same_ids,
870  const int my_pid)
871 {
872  using Kokkos::parallel_for;
873  typedef typename LocalMap::device_type DT;
874  typedef typename LocalMap::local_ordinal_type LO;
875  typedef typename DT::execution_space XS;
876  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
877 
878  parallel_for(range_policy(0, num_same_ids),
879  KOKKOS_LAMBDA(const size_t i) {
880  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
881 
882  const LO src_lid = static_cast<LO>(i);
883  size_t src_row = local_matrix.graph.row_map(src_lid);
884 
885  const LO tgt_lid = static_cast<LO>(i);
886  const size_t tgt_row = tgt_rowptr(tgt_lid);
887 
888  const size_t nsr = local_matrix.graph.row_map(src_lid+1)
889  - local_matrix.graph.row_map(src_lid);
890  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
891 
892  for (size_t j=local_matrix.graph.row_map(src_lid);
893  j<local_matrix.graph.row_map(src_lid+1); ++j) {
894  LO src_col = local_matrix.graph.entries(j);
895  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
896  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
897  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
898  }
899  }
900  );
901 }
902 
903 template<class LocalMatrix, class LocalMap>
904 void
905 copyDataFromPermuteIDs(
906  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
907  const typename PackTraits<int>::output_array_type& tgt_pids,
908  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
909  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
910  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
911  const typename PackTraits<int>::input_array_type& src_pids,
912  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
913  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
914  const LocalMatrix& local_matrix,
915  const LocalMap& local_col_map,
916  const int my_pid)
917 {
918  using Kokkos::parallel_for;
919  typedef typename LocalMap::device_type DT;
920  typedef typename LocalMap::local_ordinal_type LO;
921  typedef typename DT::execution_space XS;
922  typedef typename PackTraits<LO>::input_array_type::size_type size_type;
923  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
924 
925  const size_type num_permute_to_lids = permute_to_lids.extent(0);
926 
927  parallel_for(range_policy(0, num_permute_to_lids),
928  KOKKOS_LAMBDA(const size_t i) {
929  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
930 
931  const LO src_lid = permute_from_lids(i);
932  const size_t src_row = local_matrix.graph.row_map(src_lid);
933 
934  const LO tgt_lid = permute_to_lids(i);
935  const size_t tgt_row = tgt_rowptr(tgt_lid);
936 
937  size_t nsr = local_matrix.graph.row_map(src_lid+1)
938  - local_matrix.graph.row_map(src_lid);
939  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
940 
941  for (size_t j=local_matrix.graph.row_map(src_lid);
942  j<local_matrix.graph.row_map(src_lid+1); ++j) {
943  LO src_col = local_matrix.graph.entries(j);
944  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
945  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
946  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
947  }
948  }
949  );
950 }
951 
// Unpack the rows received in the packed `imports` buffer into the target
// CRS column-index, value and owning-PID arrays.
//
// For import i the kernel reads num_packets_per_lid(i) bytes starting at
// offsets(i).  A byte count of zero is treated as an empty row.  Otherwise
// the entry count comes from unpackRowCount, the row's slice of the target
// arrays is reserved by atomically advancing new_start_row(import_lids(i)),
// and unpackRow fills that slice.  PIDs equal to my_pid are then set to -1.
//
// Returns the error count accumulated by the reduction: +1 for every row
// whose entry count equals OrdinalTraits<size_t>::invalid(), plus whatever
// unpackRow returns.  Zero means no error was reported.
//
// The local-matrix and local-column-map parameters are unnamed and unused.
//
// NOTE(review): this listing skips source line 973 (between the
// using-declarations and the typedefs) -- restore it from upstream.
952 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
953 int
954 unpackAndCombineIntoCrsArrays2(
955  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
956  const typename PackTraits<int>::output_array_type& tgt_pids,
957  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
958  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
959  const typename PackTraits<size_t>::input_array_type& offsets,
960  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
961  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
962  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
963  const LocalMatrix& /* local_matrix */,
964  const LocalMap /*& local_col_map*/,
965  const int my_pid,
966  const size_t bytes_per_value)
967 {
968  using Kokkos::View;
969  using Kokkos::subview;
970  using Kokkos::MemoryUnmanaged;
971  using Kokkos::parallel_reduce;
972  using Kokkos::atomic_fetch_add;
974  typedef typename LocalMap::device_type DT;
975  typedef typename LocalMap::local_ordinal_type LO;
976  typedef typename LocalMap::global_ordinal_type GO;
977  typedef typename LocalMatrix::value_type ST;
978  typedef typename DT::execution_space XS;
979  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
980  typedef typename Kokkos::pair<size_type, size_type> slice;
981  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
982 
// Unmanaged view types used for the per-row slices of the target arrays.
983  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
984  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
985  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
986 
// Sentinel compared against the result of unpackRowCount below.
987  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
988 
989  int errors = 0;
990  const size_type num_import_lids = import_lids.size();
991 
992  // RemoteIDs: Loop structure following UnpackAndCombine
993  parallel_reduce ("Unpack and combine into CRS",
994  range_policy (0, num_import_lids),
995  KOKKOS_LAMBDA (const size_t i, int& k_error) {
996  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
997  const size_t num_bytes = num_packets_per_lid(i);
998  const size_t offset = offsets(i);
999  if (num_bytes == 0) {
1000  // Empty buffer means that the row is empty.
1001  return;
1002  }
1003  size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
1004  if (num_ent == InvalidNum) {
// An invalid entry count is reported as one error; the row is skipped.
1005  k_error += 1;
1006  return;
1007  }
1008  const LO lcl_row = import_lids(i);
// Reserve this row's slice: the value returned by atomic_fetch_add is used
// as start_row, and new_start_row(lcl_row) is advanced by num_ent.
1009  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1010  const size_t end_row = start_row + num_ent;
1011 
// Subviews covering [start_row, end_row) of each target array.
1012  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1013  vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1014  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1015 
// The return value of unpackRow is added to the error count.
1016  k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1017  imports.data(), offset, num_bytes,
1018  num_ent, bytes_per_value);
1019 
1020  // Correct target PIDs.
// Entries whose PID equals my_pid are marked with -1.
1021  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1022  const int pid = pids_out(j);
1023  pids_out(j) = (pid != my_pid) ? pid : -1;
1024  }
1025  }, errors);
1026 
1027  return errors;
1028 }
1029 
// Build the target CRS arrays (row pointer, global column indices, values,
// owning PIDs) from three kinds of rows: rows kept in place ("same IDs"),
// locally reordered rows ("permute IDs"), and rows received in the packed
// `imports` buffer.
//
// Steps, in order:
//   1. zero tgt_rowptr(0..N), where N = tgt_num_rows;
//   2. store the length of every same-ID and permute-ID source row in
//      tgt_rowptr;
//   3. compute per-LID offsets into `imports` from num_packets_per_lid;
//   4. call setupRowPointersForRemotes on tgt_rowptr for the imported rows;
//   5. makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
//   6. copy the same-ID and permute-ID data;
//   7. if `imports` is non-empty, unpack the remote rows with
//      unpackAndCombineIntoCrsArrays2.
//
// Throws std::logic_error (via TEUCHOS_TEST_FOR_EXCEPTION) if step 4 or
// step 7 reports a nonzero error count, and, when HAVE_TPETRA_DEBUG is
// defined, if the last offset differs from imports.extent(0).
//
// NOTE(review): tgt_num_nonzeros is not referenced in the visible body.
// NOTE(review): this listing skips source lines 1032 (presumably the
// function name, cf. `prefix` below) and 1055 -- restore from upstream.
1030 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
1031 void
1033  const LocalMatrix & local_matrix,
1034  const LocalMap & local_col_map,
1035  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
1036  const Kokkos::View<const char*, BufferDeviceType, void, void>& imports,
1037  const Kokkos::View<const size_t*, BufferDeviceType, void, void>& num_packets_per_lid,
1038  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1039  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1040  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1041  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1042  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1043  const typename PackTraits<int>::input_array_type& src_pids,
1044  const typename PackTraits<int>::output_array_type& tgt_pids,
1045  const size_t num_same_ids,
1046  const size_t tgt_num_rows,
1047  const size_t tgt_num_nonzeros,
1048  const int my_tgt_pid,
1049  const size_t bytes_per_value)
1050 {
1051  using Kokkos::View;
1052  using Kokkos::subview;
1053  using Kokkos::parallel_for;
1054  using Kokkos::MemoryUnmanaged;
1056  typedef typename LocalMap::device_type DT;
1057  typedef typename LocalMap::local_ordinal_type LO;
1058  typedef typename DT::execution_space XS;
1059  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1060  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1061  typedef BufferDeviceType BDT;
1062 
// Prepended to every exception message raised from this function.
1063  const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
1064 
1065  const size_t N = tgt_num_rows;
1066 
1067  // In the case of reduced communicators, the sourceMatrix won't have
1068  // the right "my_pid", so thus we have to supply it.
1069  const int my_pid = my_tgt_pid;
1070 
1071  // Zero the rowptr
1072  parallel_for(range_policy(0, N+1),
1073  KOKKOS_LAMBDA(const size_t i) {
1074  tgt_rowptr(i) = 0;
1075  }
1076  );
1077 
1078  // same IDs: Always first, always in the same place
// At this stage tgt_rowptr holds row lengths, not offsets.
1079  parallel_for(range_policy(0, num_same_ids),
1080  KOKKOS_LAMBDA(const size_t i) {
1081  const LO tgt_lid = static_cast<LO>(i);
1082  const LO src_lid = static_cast<LO>(i);
1083  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1084  - local_matrix.graph.row_map(src_lid);
1085  }
1086  );
1087 
1088  // Permute IDs: Still local, but reordered
1089  const size_type num_permute_to_lids = permute_to_lids.extent(0);
1090  parallel_for(range_policy(0, num_permute_to_lids),
1091  KOKKOS_LAMBDA(const size_t i) {
1092  const LO tgt_lid = permute_to_lids(i);
1093  const LO src_lid = permute_from_lids(i);
1094  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1095  - local_matrix.graph.row_map(src_lid);
1096  }
1097  );
1098 
1099  // Get the offsets from the number of packets per LID
// `offsets` has one more entry than there are import LIDs.
1100  const size_type num_import_lids = import_lids.extent(0);
1101  View<size_t*, DT> offsets("offsets", num_import_lids+1);
1102  computeOffsetsFromCounts(offsets, num_packets_per_lid);
1103 
1104 #ifdef HAVE_TPETRA_DEBUG
1105  {
// Debug-only consistency check: the last offset must equal the buffer size.
1106  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1107  const bool condition =
1108  nth_offset_h != static_cast<size_t>(imports.extent (0));
1109  TEUCHOS_TEST_FOR_EXCEPTION
1110  (condition, std::logic_error, prefix
1111  << "The final offset in bytes " << nth_offset_h
1112  << " != imports.size() = " << imports.extent(0)
1113  << ". Please report this bug to the Tpetra developers.");
1114  }
1115 #endif // HAVE_TPETRA_DEBUG
1116 
1117  // Setup row pointers for remotes
1118  int k_error =
1119  setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1120  import_lids, imports, num_packets_per_lid, offsets);
1121  TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1122  << " Error transferring data to target row pointers. "
1123  "Please report this bug to the Tpetra developers.");
1124 
1125  // If multiple processes contribute to the same row, we may need to
1126  // update row offsets. This tracks that.
1127  View<size_t*, DT> new_start_row ("new_start_row", N+1);
1128 
1129  // Turn row length into a real CRS row pointer
1130  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1131 
1132  // SameIDs: Copy the data over
1133  copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1134  tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1135 
// PermuteIDs: copy the locally reordered rows.
1136  copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1137  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1138  local_matrix, local_col_map, my_pid);
1139 
// Nothing was received: skip the unpack kernel entirely.
1140  if (imports.extent(0) <= 0) {
1141  return;
1142  }
1143 
// RemoteIDs: unpack the received rows; a nonzero count is a logic error.
1144  int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1145  tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1146  local_matrix, local_col_map, my_pid, bytes_per_value);
1147  TEUCHOS_TEST_FOR_EXCEPTION(
1148  unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1149  "should never happen. Please report this bug to the Tpetra developers.");
1150 
1151  return;
1152 }
1153 
1154 } // namespace UnpackAndCombineCrsMatrixImpl
1155 
// Host-array entry point: unpack packed matrix rows and combine them into
// the local matrix of sourceMatrix.
//
// The three Teuchos::ArrayView inputs (imports, numPacketsPerLID,
// importLIDs) are turned into views for Node::device_type with
// create_mirror_view_from_raw_host_array.  The work is then delegated to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix together
// with the device local matrix, the local column map and combineMode.
// The constantNumPackets argument is unnamed and unused.
//
// NOTE(review): this listing skips source line 1197 (presumably the
// function name line) -- restore it from upstream.
1195 template<typename ST, typename LO, typename GO, typename Node>
1196 void
1198  const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1199  const Teuchos::ArrayView<const char>& imports,
1200  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1201  const Teuchos::ArrayView<const LO>& importLIDs,
1202  size_t /* constantNumPackets */,
1203  CombineMode combineMode)
1204 {
1205  using Kokkos::View;
1206  typedef typename Node::device_type device_type;
1207  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
1208  static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1209  "Node::device_type and LocalMatrix::device_type must be the same.");
1210 
1211  // Convert all Teuchos::Array to Kokkos::View.
1212  device_type outputDevice;
1213 
1214  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1215  // them to device. Since unpacking is done directly into the local matrix
1216  // (lclMatrix), no copying needs to be performed after unpacking.
1217  auto num_packets_per_lid_d =
1218  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1219  numPacketsPerLID.size(), true, "num_packets_per_lid");
1220 
1221  auto import_lids_d =
1222  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1223  importLIDs.size(), true, "import_lids");
1224 
1225  auto imports_d =
1226  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1227  imports.size(), true, "imports");
1228 
// Device-side local matrix and the local view of its column map.
1229  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1230  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1231 
1232 //KDDKDD This loop doesn't appear to do anything; what is it?
1233 //KDDKDD for (int i=0; i<importLIDs.size(); i++)
1234 //KDDKDD {
1235 //KDDKDD auto lclRow = importLIDs[i];
1236 //KDDKDD Teuchos::ArrayView<const LO> A_indices;
1237 //KDDKDD Teuchos::ArrayView<const ST> A_values;
1238 //KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1239 //KDDKDD }
1240  // Now do the actual unpack!
1241  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1242  local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1243  import_lids_d, combineMode);
1244 
1245 }
1246 
// DualView entry point: unpack packed matrix rows and combine them into the
// local matrix of sourceMatrix.
//
// numPacketsPerLID and imports are synced to device when need_sync_device()
// says so.  importLIDs is only asserted not to need a sync (it is never
// synced here).  The device views, the device local matrix and the local
// column map are then passed to
// UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix with
// explicit template arguments.  The constantNumPackets argument is unnamed
// and unused.
//
// NOTE(review): this listing skips source lines 1252 and 1256 (presumably
// the tails of the `imports` and `importLIDs` parameter declarations) --
// restore them from upstream.
1247 template<typename ST, typename LO, typename GO, typename NT>
1248 void
1249 unpackCrsMatrixAndCombineNew(
1250  const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1251  Kokkos::DualView<char*,
1253  Kokkos::DualView<size_t*,
1254  typename DistObject<char, LO, GO, NT>::buffer_device_type> numPacketsPerLID,
1255  const Kokkos::DualView<const LO*,
1257  const size_t /* constantNumPackets */,
1258  const CombineMode combineMode)
1259 {
1260  using Kokkos::View;
1261  using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1262  using dist_object_type = DistObject<char, LO, GO, NT>;
1263  using device_type = typename crs_matrix_type::device_type;
1264  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1265  using buffer_device_type = typename dist_object_type::buffer_device_type;
1266 
1267  static_assert
1268  (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1269  "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1270  "must be the same.");
1271 
// Bring the packet counts up to date on device before taking the view.
1272  if (numPacketsPerLID.need_sync_device()) {
1273  numPacketsPerLID.sync_device ();
1274  }
1275  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1276 
// importLIDs is const here, so it is only checked, never synced.
1277  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1278  auto import_lids_d = importLIDs.view_device ();
1279 
// Same treatment for the packed byte buffer.
1280  if (imports.need_sync_device()) {
1281  imports.sync_device ();
1282  }
1283  auto imports_d = imports.view_device ();
1284 
1285  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1286  auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1287  typedef decltype (local_col_map) local_map_type;
1288 
// Template arguments are spelled out rather than deduced.
1289  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1290  local_matrix_device_type,
1291  local_map_type,
1292  buffer_device_type
1293  > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1294  import_lids_d, combineMode);
1295 }
1296 
1343 //
// Teuchos::ArrayView front end of unpackAndCombineWithOwningPIDsCount (name
// taken from `prefix` and from the closing comment of this function).
//
// Checks, each raising std::invalid_argument via TEUCHOS_TEST_FOR_EXCEPTION:
//   - permuteToLIDs and permuteFromLIDs have the same size;
//   - sourceMatrix is locally indexed;
//   - importLIDs and numPacketsPerLID have the same size.
//
// The host arrays permuteFromLIDs, imports and numPacketsPerLID are then
// wrapped in views on `kokkos_device_type` and handed, together with the
// device local matrix and numSameIDs, to a callee whose name is not visible
// in this listing.  The constantNumPackets and combineMode arguments are
// unnamed and unused.
//
// NOTE(review): this listing skips source lines 1354 (presumably the
// function name), 1391/1397/1403 (presumably the calls that create the
// three views) and 1408 (presumably `return <Impl count function>(`) --
// restore them from upstream.
1352 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1353 size_t
1355  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1356  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1357  const Teuchos::ArrayView<const char> &imports,
1358  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1359  size_t /* constantNumPackets */,
1360  CombineMode /* combineMode */,
1361  size_t numSameIDs,
1362  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1363  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1364 {
1365  using Kokkos::MemoryUnmanaged;
1366  using Kokkos::View;
1367  typedef typename Node::device_type DT;
1368  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1369 
1370  TEUCHOS_TEST_FOR_EXCEPTION
1371  (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1372  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1373  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1374  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1375  // process, then the matrix is neither locally nor globally indexed.
1376  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1377  TEUCHOS_TEST_FOR_EXCEPTION
1378  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1379  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1380  TEUCHOS_TEST_FOR_EXCEPTION
1381  (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1382  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1383  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1384 
1385  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1386 
// Device type of the views below: Node's execution space paired with the
// communication-buffer memory space.
1387  using kokkos_device_type = Kokkos::Device<typename Node::device_type::execution_space,
1388  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>;
1389 
1390  Kokkos::View<LocalOrdinal const *, kokkos_device_type, void, void > permute_from_lids_d =
1392  permuteFromLIDs.getRawPtr (),
1393  permuteFromLIDs.size (), true,
1394  "permute_from_lids");
1395 
1396  Kokkos::View<const char*, kokkos_device_type, void, void > imports_d =
1398  imports.getRawPtr (),
1399  imports.size (), true,
1400  "imports");
1401 
1402  Kokkos::View<const size_t*, kokkos_device_type, void, void > num_packets_per_lid_d =
1404  numPacketsPerLID.getRawPtr (),
1405  numPacketsPerLID.size (), true,
1406  "num_packets_per_lid");
1407 
1409  local_matrix, permute_from_lids_d, imports_d,
1410  num_packets_per_lid_d, numSameIDs);
1411 } //unpackAndCombineWithOwningPIDsCount (Teuchos::Array version)
1412 
1427 
// Overload of unpackAndCombineIntoCrsArrays (name taken from the closing
// comment) whose outputs are Kokkos::View objects: crs_rowptr_d,
// crs_colind_d, crs_vals_d and TargetPids.
//
// Outline of the visible code:
//   1. validate the inputs (std::invalid_argument on mismatched permute-list
//      sizes, a non-locally-indexed sourceMatrix, or mismatched
//      import-LID / packet-count sizes);
//   2. obtain TargetNumNonzeros from a count routine (the callee name is
//      not visible in this listing);
//   3. resize crs_rowptr_d to TargetNumRows+1 and crs_colind_d, crs_vals_d
//      and TargetPids to TargetNumNonzeros; fill TargetPids with -1;
//   4. copy SourcePids into a view on Node::device_type;
//   5. determine bytes_per_value;
//   6. call the routine that fills the CRS arrays (the callee name is not
//      visible in this listing);
//   7. replace every remaining -1 in TargetPids by MyTargetPID.
//
// When HAVE_TPETRA_MMM_TIMINGS is defined, each phase is wrapped in a
// Teuchos::TimeMonitor.
//
// NOTE(review): this listing skips source lines 1430-1431 (presumably the
// function name and the sourceMatrix parameter), 1462, 1474 (presumably
// the matrix_type typedef), 1507, 1552 (the `if` that opens the
// bytes_per_value branch) and 1582 -- restore them from upstream.
1428 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1429 void
1432  const Kokkos::View<LocalOrdinal const *,
1433  Kokkos::Device<typename Node::device_type::execution_space,
1434  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1435  void, void > import_lids_d,
1436  const Kokkos::View<const char*,
1437  Kokkos::Device<typename Node::device_type::execution_space,
1438  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1439  void, void > imports_d,
1440  const Kokkos::View<const size_t*,
1441  Kokkos::Device<typename Node::device_type::execution_space,
1442  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1443  void, void > num_packets_per_lid_d,
1444  const size_t numSameIDs,
1445  const Kokkos::View<LocalOrdinal const *,
1446  Kokkos::Device<typename Node::device_type::execution_space,
1447  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1448  void, void > permute_to_lids_d,
1449  const Kokkos::View<LocalOrdinal const *,
1450  Kokkos::Device<typename Node::device_type::execution_space,
1451  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1452  void, void > permute_from_lids_d,
1453  size_t TargetNumRows,
1454  const int MyTargetPID,
1455  Kokkos::View<size_t*,typename Node::device_type> &crs_rowptr_d,
1456  Kokkos::View<GlobalOrdinal*,typename Node::device_type> &crs_colind_d,
1457  Kokkos::View<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type*,typename Node::device_type>& crs_vals_d,
1458  const Teuchos::ArrayView<const int>& SourcePids,
1459  Kokkos::View<int*,typename Node::device_type> &TargetPids)
1460 {
1461  using execution_space = typename Node::execution_space;
1463 
1464  using Kokkos::View;
1465  using Kokkos::deep_copy;
1466 
1467  using Teuchos::ArrayView;
1468  using Teuchos::outArg;
1469  using Teuchos::REDUCE_MAX;
1470  using Teuchos::reduceAll;
1471 
1472  typedef typename Node::device_type DT;
1473 
1475  typedef typename matrix_type::impl_scalar_type ST;
1476 
1477  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1478 # ifdef HAVE_TPETRA_MMM_TIMINGS
1479  using Teuchos::TimeMonitor;
1480  Teuchos::RCP<TimeMonitor> tm;
1481 # endif
1482 
1483  using Kokkos::MemoryUnmanaged;
1484 
1485  TEUCHOS_TEST_FOR_EXCEPTION
1486  (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1487  prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != "
1488  "permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1489  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1490  // process, then the matrix is neither locally nor globally indexed.
1491  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1492  TEUCHOS_TEST_FOR_EXCEPTION
1493  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1494  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1495  TEUCHOS_TEST_FOR_EXCEPTION
1496  (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1497  prefix << "import_lids_d.size() = " << import_lids_d.size () << " != "
1498  "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << ".");
1499 
1500  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1501 
1502  // TargetNumNonzeros is number of nonzeros in local matrix.
1503 # ifdef HAVE_TPETRA_MMM_TIMINGS
1504  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1505 # endif
1506  size_t TargetNumNonzeros =
1508  local_matrix, permute_from_lids_d, imports_d,
1509  num_packets_per_lid_d, numSameIDs);
1510 # ifdef HAVE_TPETRA_MMM_TIMINGS
1511  tm = Teuchos::null;
1512 # endif
1513 
1514 # ifdef HAVE_TPETRA_MMM_TIMINGS
1515  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1516 # endif
// Size the output views: one row-pointer entry per target row plus one, and
// one column index / value per target nonzero.
1517  Kokkos::resize(crs_rowptr_d,TargetNumRows+1);
1518  Kokkos::resize(crs_colind_d,TargetNumNonzeros);
1519  Kokkos::resize(crs_vals_d,TargetNumNonzeros);
1520 # ifdef HAVE_TPETRA_MMM_TIMINGS
1521  tm = Teuchos::null;
1522 # endif
1523 
// NOTE(review): this repeats the size check made at the top of the function.
1524  TEUCHOS_TEST_FOR_EXCEPTION(
1525  permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1526  prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size ()
1527  << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << ".");
1528 
// Preseed TargetPids with -1; entries still equal to -1 after unpacking are
// replaced by MyTargetPID at the end of this function.
1529  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1530  Kokkos::resize(TargetPids,TargetNumNonzeros);
1531  }
1532  Kokkos::deep_copy(execution_space(), TargetPids, -1);
1533 
1534  // Grab pointers for sourceMatrix
1535  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1536 
1537 # ifdef HAVE_TPETRA_MMM_TIMINGS
1538  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1539 # endif
1540  // Convert input arrays to Kokkos::Views
1541  DT outputDevice;
1542 
1543  auto src_pids_d =
1544  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1545  SourcePids.size(), true, "src_pids");
1546 
1547 # ifdef HAVE_TPETRA_MMM_TIMINGS
1548  tm = Teuchos::null;
1549 # endif
1550 
// bytes_per_value: the first branch asks PackTraits for the size of a
// default-constructed ST; the other branch measures an existing value and
// takes the maximum over all processes.
// NOTE(review): the condition selecting the first branch (source line 1552)
// is missing from this listing.
1551  size_t bytes_per_value = 0;
1553  // assume that ST is default constructible
1554  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1555  }
1556  else {
1557  // Since the packed data come from the source matrix, we can use the source
1558  // matrix to get the number of bytes per Scalar value stored in the matrix.
1559  // This assumes that all Scalar values in the source matrix require the same
1560  // number of bytes. If the source matrix has no entries on the calling
1561  // process, then we hope that some process does have some idea how big
1562  // a Scalar value is. Of course, if no processes have any entries, then no
1563  // values should be packed (though this does assume that in our packing
1564  // scheme, rows with zero entries take zero bytes).
1565  size_t bytes_per_value_l = 0;
// NOTE(review): both branches index a device view from host code, and
// crs_vals_d(0) is read without checking that crs_vals_d is non-empty --
// confirm this is safe for every memory space in use.
1566  if (local_matrix.values.extent(0) > 0) {
1567  const ST& val = local_matrix.values(0);
1568  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1569  } else {
1570  const ST& val = crs_vals_d(0);
1571  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1572  }
1573  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1574  Teuchos::REDUCE_MAX,
1575  bytes_per_value_l,
1576  outArg(bytes_per_value));
1577  }
1578 
1579 # ifdef HAVE_TPETRA_MMM_TIMINGS
1580  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1581 # endif
1583  local_matrix, local_col_map, import_lids_d, imports_d,
1584  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1585  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, TargetPids,
1586  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1587  bytes_per_value);
1588 # ifdef HAVE_TPETRA_MMM_TIMINGS
1589  tm = Teuchos::null;
1590 # endif
1591 
1592  // Finalize TargetPids (the timer below is still labeled "copy back to host")
1593 # ifdef HAVE_TPETRA_MMM_TIMINGS
1594  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1595 # endif
1596 
// No host copy is performed in this overload; every entry still marked -1
// is assigned MyTargetPID.
1597  Kokkos::parallel_for("setLocalEntriesToPID", Kokkos::RangePolicy<typename DT::execution_space>(0,TargetPids.size()), KOKKOS_LAMBDA (const size_t i) {
1598  if (TargetPids(i) == -1) TargetPids(i) = MyTargetPID;
1599  });
1600 
1601 } //unpackAndCombineIntoCrsArrays
1602 
1603 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1604 void
1607  const Kokkos::View<LocalOrdinal const *,
1608  Kokkos::Device<typename Node::device_type::execution_space,
1609  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1610  void, void > import_lids_d,
1611  const Kokkos::View<const char*,
1612  Kokkos::Device<typename Node::device_type::execution_space,
1613  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1614  void, void > imports_d,
1615  const Kokkos::View<const size_t*,
1616  Kokkos::Device<typename Node::device_type::execution_space,
1617  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1618  void, void > num_packets_per_lid_d,
1619  const size_t numSameIDs,
1620  const Kokkos::View<LocalOrdinal const *,
1621  Kokkos::Device<typename Node::device_type::execution_space,
1622  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1623  void, void > permute_to_lids_d,
1624  const Kokkos::View<LocalOrdinal const *,
1625  Kokkos::Device<typename Node::device_type::execution_space,
1626  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename Node::device_type>>,
1627  void, void > permute_from_lids_d,
1628  size_t TargetNumRows,
1629  const int MyTargetPID,
1630  Teuchos::ArrayRCP<size_t>& CRS_rowptr,
1631  Teuchos::ArrayRCP<GlobalOrdinal>& CRS_colind,
1632  Teuchos::ArrayRCP<Scalar>& CRS_vals,
1633  const Teuchos::ArrayView<const int>& SourcePids,
1634  Teuchos::Array<int>& TargetPids)
1635 {
1636  using execution_space = typename Node::execution_space;
1638 
1639  using Kokkos::View;
1640  using Kokkos::deep_copy;
1641 
1642  using Teuchos::ArrayView;
1643  using Teuchos::outArg;
1644  using Teuchos::REDUCE_MAX;
1645  using Teuchos::reduceAll;
1646 
1647  typedef typename Node::device_type DT;
1648 
1650  typedef typename matrix_type::impl_scalar_type ST;
1651 
1652  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays_new: ";
1653 # ifdef HAVE_TPETRA_MMM_TIMINGS
1654  using Teuchos::TimeMonitor;
1655  Teuchos::RCP<TimeMonitor> tm;
1656 # endif
1657 
1658  using Kokkos::MemoryUnmanaged;
1659 
1660  TEUCHOS_TEST_FOR_EXCEPTION
1661  (permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1662  prefix << "permute_to_lids_d.size() = " << permute_to_lids_d.size () << " != "
1663  "permute_from_lids_d.size() = " << permute_from_lids_d.size() << ".");
1664  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1665  // process, then the matrix is neither locally nor globally indexed.
1666  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1667  TEUCHOS_TEST_FOR_EXCEPTION
1668  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1669  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1670  TEUCHOS_TEST_FOR_EXCEPTION
1671  (((size_t)import_lids_d.size ()) != num_packets_per_lid_d.size (), std::invalid_argument,
1672  prefix << "import_lids_d.size() = " << import_lids_d.size () << " != "
1673  "num_packets_per_lid_d.size() = " << num_packets_per_lid_d.size () << ".");
1674 
1675  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1676 
1677  // TargetNumNonzeros is number of nonzeros in local matrix.
1678 # ifdef HAVE_TPETRA_MMM_TIMINGS
1679  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineWithOwningPIDsCount"))));
1680 # endif
1681  size_t TargetNumNonzeros =
1683  local_matrix, permute_from_lids_d, imports_d,
1684  num_packets_per_lid_d, numSameIDs);
1685 # ifdef HAVE_TPETRA_MMM_TIMINGS
1686  tm = Teuchos::null;
1687 # endif
1688 
1689 # ifdef HAVE_TPETRA_MMM_TIMINGS
1690  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("resize CRS pointers"))));
1691 # endif
1692  CRS_rowptr.resize (TargetNumRows+1);
1693  CRS_colind.resize(TargetNumNonzeros);
1694  CRS_vals.resize(TargetNumNonzeros);
1695  Teuchos::ArrayRCP<ST> const & CRS_vals_impl_scalar_type = Teuchos::arcp_reinterpret_cast<ST>(CRS_vals);
1696 # ifdef HAVE_TPETRA_MMM_TIMINGS
1697  tm = Teuchos::null;
1698 # endif
1699 
1700  TEUCHOS_TEST_FOR_EXCEPTION(
1701  permute_to_lids_d.size () != permute_from_lids_d.size (), std::invalid_argument,
1702  prefix << "permuteToLIDs.size() = " << permute_to_lids_d.size ()
1703  << "!= permute_from_lids_d.size() = " << permute_from_lids_d.size () << ".");
1704 
1705  // Preseed TargetPids with -1 for local
1706  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1707  TargetPids.resize (TargetNumNonzeros);
1708  }
1709  TargetPids.assign (TargetNumNonzeros, -1);
1710 
1711  // Grab pointers for sourceMatrix
1712  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1713 
1714 # ifdef HAVE_TPETRA_MMM_TIMINGS
1715  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("create mirror views from inputs"))));
1716 # endif
1717  // Convert input arrays to Kokkos::Views
1718  DT outputDevice;
1719 
1720  auto crs_rowptr_d =
1721  create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1722  CRS_rowptr.size(), true, "crs_rowptr");
1723 
1724  auto crs_colind_d =
1725  create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1726  CRS_colind.size(), true, "crs_colidx");
1727 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1728  static_assert (! std::is_same<
1729  typename std::remove_const<
1730  typename std::decay<
1731  decltype (CRS_vals_impl_scalar_type)
1732  >::type::value_type
1733  >::type,
1734  std::complex<double> >::value,
1735  "CRS_vals::value_type is std::complex<double>; this should never happen"
1736  ", since std::complex does not work in Kokkos::View objects.");
1737 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1738 
1739  auto crs_vals_d =
1740  create_mirror_view_from_raw_host_array(outputDevice, CRS_vals_impl_scalar_type.getRawPtr(),
1741  CRS_vals_impl_scalar_type.size(), true, "crs_vals");
1742 
1743 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1744  static_assert (! std::is_same<
1745  typename decltype (crs_vals_d)::non_const_value_type,
1746  std::complex<double> >::value,
1747  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1748  "never happen, since std::complex does not work in Kokkos::View objects.");
1749 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1750 
1751  auto src_pids_d =
1752  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1753  SourcePids.size(), true, "src_pids");
1754 
1755  auto tgt_pids_d =
1756  create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1757  TargetPids.size(), true, "tgt_pids");
1758 
1759 # ifdef HAVE_TPETRA_MMM_TIMINGS
1760  tm = Teuchos::null;
1761 # endif
1762 
1763  size_t bytes_per_value = 0;
1765  // assume that ST is default constructible
1766  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1767  }
1768  else {
1769  // Since the packed data come from the source matrix, we can use the source
1770  // matrix to get the number of bytes per Scalar value stored in the matrix.
1771  // This assumes that all Scalar values in the source matrix require the same
1772  // number of bytes. If the source matrix has no entries on the calling
1773  // process, then we hope that some process does have some idea how big
1774  // a Scalar value is. Of course, if no processes have any entries, then no
1775  // values should be packed (though this does assume that in our packing
1776  // scheme, rows with zero entries take zero bytes).
1777  size_t bytes_per_value_l = 0;
1778  if (local_matrix.values.extent(0) > 0) {
1779  const ST& val = local_matrix.values(0);
1780  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1781  } else {
1782  const ST& val = crs_vals_d(0);
1783  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1784  }
1785  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1786  Teuchos::REDUCE_MAX,
1787  bytes_per_value_l,
1788  outArg(bytes_per_value));
1789  }
1790 
1791 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1792  static_assert (! std::is_same<
1793  typename decltype (crs_vals_d)::non_const_value_type,
1794  std::complex<double> >::value,
1795  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1796  "never happen, since std::complex does not work in Kokkos::View objects.");
1797 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1798 
1799 # ifdef HAVE_TPETRA_MMM_TIMINGS
1800  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("unpackAndCombineIntoCrsArrays"))));
1801 # endif
1803  local_matrix, local_col_map, import_lids_d, imports_d,
1804  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1805  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1806  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1807  bytes_per_value);
1808 # ifdef HAVE_TPETRA_MMM_TIMINGS
1809  tm = Teuchos::null;
1810 # endif
1811 
1812  // Copy outputs back to host
1813 # ifdef HAVE_TPETRA_MMM_TIMINGS
1814  tm = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("copy back to host"))));
1815 # endif
1816  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1817  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1818  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1819  deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1820 
1821  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1822  CRS_colind.getRawPtr(), CRS_colind.size());
1823  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1824  deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1825 
1826  typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1827  CRS_vals_impl_scalar_type.getRawPtr(), CRS_vals_impl_scalar_type.size());
1828  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1829  deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1830 
1831  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1832  TargetPids.getRawPtr(), TargetPids.size());
1833  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1834  deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1835 
1836 } //unpackAndCombineIntoCrsArrays
1837 
1838 
1839 } // namespace Details
1840 } // namespace Tpetra
1841 
// Explicit-instantiation macro for the unpack-and-combine routines.
//
// For one (ST, LO, GO, NT) combination, the expansion emits explicit
// template instantiations of:
//   - Details::unpackCrsMatrixAndCombine (Teuchos::ArrayView arguments),
//   - Details::unpackAndCombineWithOwningPIDsCount,
//   - Details::unpackCrsMatrixAndCombineNew (Kokkos::DualView arguments),
//   - Details::unpackAndCombineIntoCrsArrays, twice: once whose trailing
//     array arguments are Kokkos::View objects, and once whose trailing
//     array arguments are Teuchos::ArrayRCP / Teuchos::Array objects.
//
// ST, LO, GO and NT are forwarded as the template arguments of
// CrsMatrix<ST, LO, GO, NT> (presumably Scalar, LocalOrdinal,
// GlobalOrdinal and Node -- confirm against CrsMatrix). NT must provide a
// nested device_type, since the signatures name NT::device_type.
//
// NOTE(review): Details, CrsMatrix, DistObject and CombineMode are written
// without a Tpetra:: qualifier, so the macro has to be expanded in a scope
// where those names resolve (presumably inside namespace Tpetra -- confirm
// at the call site). The expansion already ends with ';'.
//
// Do not place '//' comments inside the macro body: the trailing backslash
// would splice the next line into the comment.
1842 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1843  template void \
1844  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1845  const CrsMatrix<ST, LO, GO, NT>&, \
1846  const Teuchos::ArrayView<const char>&, \
1847  const Teuchos::ArrayView<const size_t>&, \
1848  const Teuchos::ArrayView<const LO>&, \
1849  size_t, \
1850  CombineMode); \
1851  template size_t \
1852  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1853  const CrsMatrix<ST, LO, GO, NT> &, \
1854  const Teuchos::ArrayView<const LO> &, \
1855  const Teuchos::ArrayView<const char> &, \
1856  const Teuchos::ArrayView<const size_t>&, \
1857  size_t, \
1858  CombineMode, \
1859  size_t, \
1860  const Teuchos::ArrayView<const LO>&, \
1861  const Teuchos::ArrayView<const LO>&); \
1862  template void \
1863  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1864  const CrsMatrix<ST, LO, GO, NT>&, \
1865  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1866  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1867  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1868  const size_t, \
1869  const CombineMode); \
1870  template void \
1871  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1872  const CrsMatrix<ST, LO, GO, NT> &, \
1873  const Kokkos::View<LO const *, \
1874  Kokkos::Device<typename NT::device_type::execution_space, \
1875  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1876  void, void >, \
1877  const Kokkos::View<const char*, \
1878  Kokkos::Device<typename NT::device_type::execution_space, \
1879  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1880  void, void >, \
1881  const Kokkos::View<const size_t*, \
1882  Kokkos::Device<typename NT::device_type::execution_space, \
1883  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1884  void, void >, \
1885  const size_t, \
1886  const Kokkos::View<LO const *, \
1887  Kokkos::Device<typename NT::device_type::execution_space, \
1888  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1889  void, void >, \
1890  const Kokkos::View<LO const *, \
1891  Kokkos::Device<typename NT::device_type::execution_space, \
1892  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1893  void, void >, \
1894  size_t, \
1895  const int, \
1896  Kokkos::View<size_t*,typename NT::device_type>&, \
1897  Kokkos::View<GO*,typename NT::device_type>&, \
1898  Kokkos::View<typename CrsMatrix<ST, LO, GO, NT>::impl_scalar_type*,typename NT::device_type>&, \
1899  const Teuchos::ArrayView<const int>&, \
1900  Kokkos::View<int*,typename NT::device_type>&); \
1901  template void \
1902  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1903  const CrsMatrix<ST, LO, GO, NT> &, \
1904  const Kokkos::View<LO const *, \
1905  Kokkos::Device<typename NT::device_type::execution_space, \
1906  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>,\
1907  void, void >, \
1908  const Kokkos::View<const char*, \
1909  Kokkos::Device<typename NT::device_type::execution_space, \
1910  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1911  void, void >, \
1912  const Kokkos::View<const size_t*, \
1913  Kokkos::Device<typename NT::device_type::execution_space, \
1914  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1915  void, void >, \
1916  const size_t, \
1917  const Kokkos::View<LO const *, \
1918  Kokkos::Device<typename NT::device_type::execution_space, \
1919  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1920  void, void >, \
1921  const Kokkos::View<LO const *, \
1922  Kokkos::Device<typename NT::device_type::execution_space, \
1923  Tpetra::Details::DefaultTypes::comm_buffer_memory_space<typename NT::device_type>>, \
1924  void, void >, \
1925  size_t, \
1926  const int, \
1927  Teuchos::ArrayRCP<size_t>&, \
1928  Teuchos::ArrayRCP<GO>&, \
1929  Teuchos::ArrayRCP<ST>&, \
1930  const Teuchos::ArrayView<const int>&, \
1931  Teuchos::Array<int>&);
1932 
1933 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given input buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
"Local" part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of KokkosSparse::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...