Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
42 
43 #include "TpetraCore_config.h"
44 #include "Teuchos_Array.hpp"
45 #include "Teuchos_ArrayView.hpp"
46 #include "Teuchos_OrdinalTraits.hpp"
54 #include "Kokkos_Core.hpp"
55 #include <memory>
56 #include <string>
57 
78 
79 namespace Tpetra {
80 
81 #ifndef DOXYGEN_SHOULD_SKIP_THIS
82 // Forward declaration of Distributor
83 class Distributor;
84 #endif // DOXYGEN_SHOULD_SKIP_THIS
85 
86 //
87 // Users must never rely on anything in the Details namespace.
88 //
89 namespace Details {
90 
91 namespace UnpackAndCombineCrsMatrixImpl {
92 
102 template<class ST, class LO, class GO>
103 KOKKOS_FUNCTION int
104 unpackRow(const typename PackTraits<GO>::output_array_type& gids_out,
105  const typename PackTraits<int>::output_array_type& pids_out,
106  const typename PackTraits<ST>::output_array_type& vals_out,
107  const char imports[],
108  const size_t offset,
109  const size_t /* num_bytes */,
110  const size_t num_ent,
111  const size_t bytes_per_value)
112 {
113  if (num_ent == 0) {
114  // Empty rows always take zero bytes, to ensure sparsity.
115  return 0;
116  }
117  bool unpack_pids = pids_out.size() > 0;
118 
119  const size_t num_ent_beg = offset;
120  const size_t num_ent_len = PackTraits<LO>::packValueCount (LO (0));
121 
122  const size_t gids_beg = num_ent_beg + num_ent_len;
123  const size_t gids_len =
124  num_ent * PackTraits<GO>::packValueCount (GO (0));
125 
126  const size_t pids_beg = gids_beg + gids_len;
127  const size_t pids_len = unpack_pids ?
128  size_t (num_ent * PackTraits<int>::packValueCount (int (0))) :
129  size_t (0);
130 
131  const size_t vals_beg = gids_beg + gids_len + pids_len;
132  const size_t vals_len = num_ent * bytes_per_value;
133 
134  const char* const num_ent_in = imports + num_ent_beg;
135  const char* const gids_in = imports + gids_beg;
136  const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
137  const char* const vals_in = imports + vals_beg;
138 
139  size_t num_bytes_out = 0;
140  LO num_ent_out;
141  num_bytes_out += PackTraits<LO>::unpackValue (num_ent_out, num_ent_in);
142  if (static_cast<size_t> (num_ent_out) != num_ent) {
143  return 20; // error code
144  }
145 
146  {
147  Kokkos::pair<int, size_t> p;
148  p = PackTraits<GO>::unpackArray (gids_out.data (), gids_in, num_ent);
149  if (p.first != 0) {
150  return 21; // error code
151  }
152  num_bytes_out += p.second;
153 
154  if (unpack_pids) {
155  p = PackTraits<int>::unpackArray (pids_out.data (), pids_in, num_ent);
156  if (p.first != 0) {
157  return 22; // error code
158  }
159  num_bytes_out += p.second;
160  }
161 
162  p = PackTraits<ST>::unpackArray (vals_out.data (), vals_in, num_ent);
163  if (p.first != 0) {
164  return 23; // error code
165  }
166  num_bytes_out += p.second;
167  }
168 
169  const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
170  if (num_bytes_out != expected_num_bytes) {
171  return 24; // error code
172  }
173  return 0; // no errors
174 }
175 
186 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
188  typedef LocalMatrix local_matrix_type;
189  typedef LocalMap local_map_type;
190 
191  typedef typename local_matrix_type::value_type ST;
192  typedef typename local_map_type::local_ordinal_type LO;
193  typedef typename local_map_type::global_ordinal_type GO;
194  typedef typename local_map_type::device_type DT;
195  typedef typename DT::execution_space XS;
196 
197  typedef Kokkos::View<const size_t*, BufferDeviceType>
198  num_packets_per_lid_type;
199  typedef Kokkos::View<const size_t*, DT> offsets_type;
200  typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
201  typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
202 
203  typedef Kokkos::View<int, DT> error_type;
204  using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
205 
206  static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
207  "LocalMap::local_ordinal_type and "
208  "LocalMatrix::ordinal_type must be the same.");
209 
210  local_matrix_type local_matrix;
211  local_map_type local_col_map;
212  input_buffer_type imports;
213  num_packets_per_lid_type num_packets_per_lid;
214  import_lids_type import_lids;
215  Kokkos::View<const LO*[2], DT> batch_info;
216  offsets_type offsets;
217  Tpetra::CombineMode combine_mode;
218  size_t batch_size;
219  size_t bytes_per_value;
220  bool atomic;
221  error_type error_code;
222 
224  const local_matrix_type& local_matrix_in,
225  const local_map_type& local_col_map_in,
226  const input_buffer_type& imports_in,
227  const num_packets_per_lid_type& num_packets_per_lid_in,
228  const import_lids_type& import_lids_in,
229  const Kokkos::View<const LO*[2], DT>& batch_info_in,
230  const offsets_type& offsets_in,
231  const Tpetra::CombineMode combine_mode_in,
232  const size_t batch_size_in,
233  const size_t bytes_per_value_in,
234  const bool atomic_in) :
235  local_matrix (local_matrix_in),
236  local_col_map (local_col_map_in),
237  imports (imports_in),
238  num_packets_per_lid (num_packets_per_lid_in),
239  import_lids (import_lids_in),
240  batch_info (batch_info_in),
241  offsets (offsets_in),
242  combine_mode (combine_mode_in),
243  batch_size (batch_size_in),
244  bytes_per_value (bytes_per_value_in),
245  atomic (atomic_in),
246  error_code("error")
247  {}
248 
249  KOKKOS_INLINE_FUNCTION
250  void operator()(member_type team_member) const
251  {
252  using Kokkos::View;
253  using Kokkos::subview;
254  using Kokkos::MemoryUnmanaged;
255 
256  const LO batch = team_member.league_rank();
257  const LO lid_no = batch_info(batch, 0);
258  const LO batch_no = batch_info(batch, 1);
259 
260  const size_t num_bytes = num_packets_per_lid(lid_no);
261 
262  // Only unpack data if there is a nonzero number of bytes.
263  if (num_bytes == 0)
264  return;
265 
266  // there is actually something in the row
267  const LO import_lid = import_lids(lid_no);
268  const size_t buf_size = imports.size();
269  const size_t offset = offsets(lid_no);
270 
271  // Get the number of entries to expect in the received data for this row.
272  LO num_ent_LO = 0;
273  const char* const in_buf = imports.data() + offset;
274  (void) PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
275  const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
276 
277  // Count the number of bytes expected to unpack
278  size_t expected_num_bytes = 0;
279  {
280  expected_num_bytes += PackTraits<LO>::packValueCount(LO(0));
281  expected_num_bytes += num_entries_in_row * PackTraits<GO>::packValueCount(GO(0));
282  expected_num_bytes += num_entries_in_row * PackTraits<ST>::packValueCount(ST());
283  }
284 
285  if (expected_num_bytes > num_bytes)
286  {
287  printf(
288  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
289  "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
290  (int) lid_no, (int) expected_num_bytes, (int) num_bytes
291  );
292  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
293  return;
294  }
295 
296  if (offset > buf_size || offset + num_bytes > buf_size)
297  {
298  printf(
299  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
300  "At row %d, the offset (%d) > buffer size (%d)\n",
301  (int) lid_no, (int) offset, (int) buf_size
302  );
303  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
304  return;
305  }
306 
307  // Determine the number of entries to unpack in this batch
308  size_t num_entries_in_batch = 0;
309  if (num_entries_in_row <= batch_size)
310  num_entries_in_batch = num_entries_in_row;
311  else if (num_entries_in_row >= (batch_no + 1) * batch_size)
312  num_entries_in_batch = batch_size;
313  else
314  num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
315 
316  const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
317  const size_t num_ent_start = offset;
318  const size_t num_ent_end = num_ent_start + bytes_per_lid;
319 
320  const size_t bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
321  const size_t gids_start = num_ent_end;
322  const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
323 
324  const size_t vals_start = gids_end;
325 
326  const size_t shift = batch_no * batch_size;
327  const char* const num_ent_in = imports.data() + num_ent_start;
328  const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
329  const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
330 
331  LO num_ent_out;
332  (void)PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
333  if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
334  {
335  printf(
336  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
337  "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
338  (int) lid_no, (int) num_entries_in_row, (int) num_ent_out
339  );
340  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
341  }
342 
343  constexpr bool matrix_has_sorted_rows = true; // see #6282
344  Kokkos::parallel_for(
345  Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
346  KOKKOS_LAMBDA(const LO& j)
347  {
348  size_t distance = 0;
349 
350  GO gid_out;
351  distance = j * bytes_per_gid;
352  (void) PackTraits<GO>::unpackValue(gid_out, gids_in + distance);
353  auto lid_out = local_col_map.getLocalElement(gid_out);
354 
355  // Column indices come in as global indices, in case the
356  // source object's column Map differs from the target object's
357  // (this's) column Map, and must be converted local index values
358 
359  // assume that ST is default constructible
360  ST val_out;
361  distance = j * bytes_per_value;
362  (void) PackTraits<ST>::unpackValue(val_out, vals_in + distance);
363 
364  if (combine_mode == ADD) {
365  // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
366  // different threads don't touch the same row (i.e., no
367  // duplicates in incoming LIDs list).
368  const bool use_atomic_updates = atomic;
369  (void)local_matrix.sumIntoValues(
370  import_lid,
371  &lid_out,
372  1,
373  &val_out,
374  matrix_has_sorted_rows,
375  use_atomic_updates
376  );
377  } else if (combine_mode == REPLACE) {
378  // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
379  // combine mode with multiple incoming rows that touch the same
380  // target matrix entries, so we never need atomic updates.
381  const bool use_atomic_updates = false;
382  (void)local_matrix.replaceValues(
383  import_lid,
384  &lid_out,
385  1,
386  &val_out,
387  matrix_has_sorted_rows,
388  use_atomic_updates
389  );
390  } else {
391  // should never get here
392  printf(
393  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
394  "At row %d, an unknown error occurred during unpack\n", (int) lid_no
395  );
396  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
397  }
398  }
399  );
400 
401  team_member.team_barrier();
402 
403  }
404 
406  int error() const {
407  auto error_code_h = Kokkos::create_mirror_view_and_copy(
408  Kokkos::HostSpace(), error_code
409  );
410  return error_code_h();
411  }
412 
413 };
414 
415 struct MaxNumEntTag {};
416 struct TotNumEntTag {};
417 
426 template<class LO, class DT, class BDT>
428 public:
429  typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
430  typedef Kokkos::View<const size_t*, DT> offsets_type;
431  typedef Kokkos::View<const char*, BDT> input_buffer_type;
432  // This needs to be public, since it appears in the argument list of
433  // public methods (see below). Otherwise, build errors may happen.
434  typedef size_t value_type;
435 
436 private:
437  num_packets_per_lid_type num_packets_per_lid;
438  offsets_type offsets;
439  input_buffer_type imports;
440 
441 public:
442  NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
443  const offsets_type& offsets_in,
444  const input_buffer_type& imports_in) :
445  num_packets_per_lid (num_packets_per_lid_in),
446  offsets (offsets_in),
447  imports (imports_in)
448  {}
449 
450  KOKKOS_INLINE_FUNCTION void
451  operator() (const MaxNumEntTag, const LO i, value_type& update) const {
452  // Get how many entries to expect in the received data for this row.
453  const size_t num_bytes = num_packets_per_lid(i);
454  if (num_bytes > 0) {
455  LO num_ent_LO = 0; // output argument of unpackValue
456  const char* const in_buf = imports.data () + offsets(i);
457  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
458  const size_t num_ent = static_cast<size_t> (num_ent_LO);
459 
460  update = (update < num_ent) ? num_ent : update;
461  }
462  }
463 
464  KOKKOS_INLINE_FUNCTION void
465  join (const MaxNumEntTag,
466  volatile value_type& dst,
467  const volatile value_type& src) const
468  {
469  if (dst < src) dst = src;
470  }
471 
472  KOKKOS_INLINE_FUNCTION void
473  operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
474  // Get how many entries to expect in the received data for this row.
475  const size_t num_bytes = num_packets_per_lid(i);
476  if (num_bytes > 0) {
477  LO num_ent_LO = 0; // output argument of unpackValue
478  const char* const in_buf = imports.data () + offsets(i);
479  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
480  tot_num_ent += static_cast<size_t> (num_ent_LO);
481  }
482  }
483 };
484 
492 template<class LO, class DT, class BDT>
493 size_t
494 compute_maximum_num_entries (
495  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
496  const Kokkos::View<const size_t*, DT>& offsets,
497  const Kokkos::View<const char*, BDT>& imports)
498 {
499  typedef typename DT::execution_space XS;
500  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
501  MaxNumEntTag> range_policy;
502 
503  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
504  imports);
505  const LO numRowsToUnpack =
506  static_cast<LO> (num_packets_per_lid.extent (0));
507  size_t max_num_ent = 0;
508  Kokkos::parallel_reduce ("Max num entries in CRS",
509  range_policy (0, numRowsToUnpack),
510  functor, max_num_ent);
511  return max_num_ent;
512 }
513 
521 template<class LO, class DT, class BDT>
522 size_t
523 compute_total_num_entries (
524  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
525  const Kokkos::View<const size_t*, DT>& offsets,
526  const Kokkos::View<const char*, BDT>& imports)
527 {
528  typedef typename DT::execution_space XS;
529  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
530  size_t tot_num_ent = 0;
531  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
532  imports);
533  const LO numRowsToUnpack =
534  static_cast<LO> (num_packets_per_lid.extent (0));
535  Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
536  range_policy (0, numRowsToUnpack),
537  functor, tot_num_ent);
538  return tot_num_ent;
539 }
540 
541 template<class LO>
542 KOKKOS_INLINE_FUNCTION
543 size_t
544 unpackRowCount(const char imports[],
545  const size_t offset,
546  const size_t num_bytes)
547 {
548  using PT = PackTraits<LO>;
549 
550  LO num_ent_LO = 0;
551  if (num_bytes > 0) {
552  const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
553  if (p_num_bytes > num_bytes) {
554  return OrdinalTraits<size_t>::invalid();
555  }
556  const char* const in_buf = imports + offset;
557  (void) PT::unpackValue(num_ent_LO, in_buf);
558  }
559  return static_cast<size_t>(num_ent_LO);
560 }
561 
566 template<class View1, class View2>
567 inline
568 bool
569 compute_batch_info(
570  const View1& batches_per_lid,
571  View2& batch_info
572 )
573 {
574  using LO = typename View2::value_type;
575  size_t batch = 0;
576  for (size_t i=0; i<batches_per_lid.extent(0); i++)
577  {
578  for (size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
579  {
580  batch_info(batch, 0) = static_cast<LO>(i);
581  batch_info(batch, 1) = batch_no;
582  batch++;
583  }
584  }
585  return batch == batch_info.extent(0);
586 }
587 
595 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
596 void
597 unpackAndCombineIntoCrsMatrix(
598  const LocalMatrix& local_matrix,
599  const LocalMap& local_map,
600  const Kokkos::View<const char*, BufferDeviceType>& imports,
601  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
602  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
603  const Tpetra::CombineMode combine_mode)
604 {
605  using ST = typename LocalMatrix::value_type;
606  using LO = typename LocalMap::local_ordinal_type;
607  using DT = typename LocalMap::device_type;
608  using XS = typename DT::execution_space;
609  const char prefix[] =
610  "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
611  "unpackAndCombineIntoCrsMatrix: ";
612 
613  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
614  if (num_import_lids == 0) {
615  // Nothing to unpack
616  return;
617  }
618 
619  {
620  // Check for correct input
621  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
622  std::invalid_argument,
623  prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
624  "static graph (i.e., was constructed with the CrsMatrix constructor "
625  "that takes a const CrsGraph pointer).");
626 
627  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
628  std::invalid_argument,
629  prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
630  "(i.e., was constructed with the CrsMatrix constructor that takes a "
631  "const CrsGraph pointer).");
632 
633  // Unknown combine mode!
634  TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
635  std::invalid_argument,
636  prefix << "Invalid combine mode; should never get "
637  "here! Please report this bug to the Tpetra developers.");
638 
639  // Check that sizes of input objects are consistent.
640  bool bad_num_import_lids =
641  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
642  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
643  std::invalid_argument,
644  prefix << "importLIDs.size() (" << num_import_lids << ") != "
645  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
646  } // end QA error checking
647 
648  // Get the offsets
649  Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
650  computeOffsetsFromCounts(offsets, num_packets_per_lid);
651 
652  // Determine the sizes of the unpack batches
653  size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
654  const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize();
655  const size_t batch_size = std::min(default_batch_size, max_num_ent);
656 
657  // To achieve some balance amongst threads, unpack each row in equal size batches
658  size_t num_batches = 0;
659  Kokkos::View<LO*[2], DT> batch_info("", num_batches);
660  Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
661  // Compute meta data that allows batch unpacking
662  Kokkos::parallel_reduce(
663  Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
664  KOKKOS_LAMBDA(const size_t i, size_t& batches)
665  {
666  const size_t num_entries_in_row = unpackRowCount<LO>(
667  imports.data(), offsets(i), num_packets_per_lid(i)
668  );
669  batches_per_lid(i) =
670  (num_entries_in_row <= batch_size) ?
671  1 :
672  num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
673  batches += batches_per_lid(i);
674  },
675  num_batches
676  );
677  Kokkos::resize(batch_info, num_batches);
678 
679  Kokkos::HostSpace host_space;
680  auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
681  Kokkos::deep_copy(batches_per_lid_h, batches_per_lid);
682 
683  auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
684 
685  (void) compute_batch_info(batches_per_lid_h, batch_info_h);
686  Kokkos::deep_copy(batch_info, batch_info_h);
687 
688  // FIXME (TJF SEP 2017)
689  // The scalar type is not necessarily default constructible
690  size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
691 
692  // Now do the actual unpack!
693  const bool atomic = XS::concurrency() != 1;
694  using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
695  functor f(
696  local_matrix,
697  local_map,
698  imports,
699  num_packets_per_lid,
700  import_lids,
701  batch_info,
702  offsets,
703  combine_mode,
704  batch_size,
705  bytes_per_value,
706  atomic
707  );
708 
709  using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
711 #if defined(KOKKOS_ENABLE_CUDA)
712  constexpr bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
713 #else
714  constexpr bool is_cuda = false;
715 #endif
716  if (!is_cuda || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
717  {
718  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
719  }
720  else
721  {
722  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
723  }
724 
725  auto error_code = f.error();
726  TEUCHOS_TEST_FOR_EXCEPTION(
727  error_code != 0,
728  std::runtime_error,
729  prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
730  );
731 }
732 
733 template<class LocalMatrix, class BufferDeviceType>
734 size_t
736  const LocalMatrix& local_matrix,
737  const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
738  const Kokkos::View<const char*, BufferDeviceType>& imports,
739  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
740  const size_t num_same_ids)
741 {
742  using Kokkos::parallel_reduce;
743  typedef typename LocalMatrix::ordinal_type LO;
744  typedef typename LocalMatrix::device_type device_type;
745  typedef typename device_type::execution_space XS;
746  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
747  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
748  typedef BufferDeviceType BDT;
749 
750  size_t count = 0;
751  LO num_items;
752 
753  // Number of matrix entries to unpack (returned by this function).
754  num_items = static_cast<LO>(num_same_ids);
755  if (num_items) {
756  size_t kcnt = 0;
757  parallel_reduce(range_policy(0, num_items),
758  KOKKOS_LAMBDA(const LO lid, size_t& update) {
759  update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
760  -local_matrix.graph.row_map[lid]);
761  }, kcnt);
762  count += kcnt;
763  }
764 
765  // Count entries copied directly from the source matrix with permuting.
766  num_items = static_cast<LO>(permute_from_lids.extent(0));
767  if (num_items) {
768  size_t kcnt = 0;
769  parallel_reduce(range_policy(0, num_items),
770  KOKKOS_LAMBDA(const LO i, size_t& update) {
771  const LO lid = permute_from_lids(i);
772  update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
773  - local_matrix.graph.row_map[lid]);
774  }, kcnt);
775  count += kcnt;
776  }
777 
778  {
779  // Count entries received from other MPI processes.
780  const size_type np = num_packets_per_lid.extent(0);
781  Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
782  computeOffsetsFromCounts(offsets, num_packets_per_lid);
783  count +=
784  compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
785  offsets, imports);
786  }
787 
788  return count;
789 }
790 
792 template<class LO, class DT, class BDT>
793 int
794 setupRowPointersForRemotes(
795  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
796  const typename PackTraits<LO>::input_array_type& import_lids,
797  const Kokkos::View<const char*, BDT>& imports,
798  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
799  const typename PackTraits<size_t>::input_array_type& offsets)
800 {
801  using Kokkos::parallel_reduce;
802  typedef typename DT::execution_space XS;
803  typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
804  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
805 
806  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
807  const size_type N = num_packets_per_lid.extent(0);
808 
809  int errors = 0;
810  parallel_reduce ("Setup row pointers for remotes",
811  range_policy (0, N),
812  KOKKOS_LAMBDA (const size_t i, int& k_error) {
813  typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
814  const size_t num_bytes = num_packets_per_lid(i);
815  const size_t offset = offsets(i);
816  const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
817  if (num_ent == InvalidNum) {
818  k_error += 1;
819  }
820  Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
821  }, errors);
822  return errors;
823 }
824 
825 // Convert array of row lengths to a CRS pointer array
826 template<class DT>
827 void
828 makeCrsRowPtrFromLengths(
829  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
830  const Kokkos::View<size_t*,DT>& new_start_row)
831 {
832  using Kokkos::parallel_scan;
833  typedef typename DT::execution_space XS;
834  typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
835  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
836  const size_type N = new_start_row.extent(0);
837  parallel_scan(range_policy(0, N),
838  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
839  auto cur_val = tgt_rowptr(i);
840  if (final) {
841  tgt_rowptr(i) = update;
842  new_start_row(i) = tgt_rowptr(i);
843  }
844  update += cur_val;
845  }
846  );
847 }
848 
849 template<class LocalMatrix, class LocalMap>
850 void
851 copyDataFromSameIDs(
852  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
853  const typename PackTraits<int>::output_array_type& tgt_pids,
854  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
855  const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
856  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
857  const typename PackTraits<int>::input_array_type& src_pids,
858  const LocalMatrix& local_matrix,
859  const LocalMap& local_col_map,
860  const size_t num_same_ids,
861  const int my_pid)
862 {
863  using Kokkos::parallel_for;
864  typedef typename LocalMap::device_type DT;
865  typedef typename LocalMap::local_ordinal_type LO;
866  typedef typename DT::execution_space XS;
867  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
868 
869  parallel_for(range_policy(0, num_same_ids),
870  KOKKOS_LAMBDA(const size_t i) {
871  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
872 
873  const LO src_lid = static_cast<LO>(i);
874  size_t src_row = local_matrix.graph.row_map(src_lid);
875 
876  const LO tgt_lid = static_cast<LO>(i);
877  const size_t tgt_row = tgt_rowptr(tgt_lid);
878 
879  const size_t nsr = local_matrix.graph.row_map(src_lid+1)
880  - local_matrix.graph.row_map(src_lid);
881  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
882 
883  for (size_t j=local_matrix.graph.row_map(src_lid);
884  j<local_matrix.graph.row_map(src_lid+1); ++j) {
885  LO src_col = local_matrix.graph.entries(j);
886  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
887  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
888  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
889  }
890  }
891  );
892 }
893 
894 template<class LocalMatrix, class LocalMap>
895 void
896 copyDataFromPermuteIDs(
897  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
898  const typename PackTraits<int>::output_array_type& tgt_pids,
899  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
900  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
901  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
902  const typename PackTraits<int>::input_array_type& src_pids,
903  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
904  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
905  const LocalMatrix& local_matrix,
906  const LocalMap& local_col_map,
907  const int my_pid)
908 {
909  using Kokkos::parallel_for;
910  typedef typename LocalMap::device_type DT;
911  typedef typename LocalMap::local_ordinal_type LO;
912  typedef typename DT::execution_space XS;
913  typedef typename PackTraits<LO>::input_array_type::size_type size_type;
914  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
915 
916  const size_type num_permute_to_lids = permute_to_lids.extent(0);
917 
918  parallel_for(range_policy(0, num_permute_to_lids),
919  KOKKOS_LAMBDA(const size_t i) {
920  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
921 
922  const LO src_lid = permute_from_lids(i);
923  const size_t src_row = local_matrix.graph.row_map(src_lid);
924 
925  const LO tgt_lid = permute_to_lids(i);
926  const size_t tgt_row = tgt_rowptr(tgt_lid);
927 
928  size_t nsr = local_matrix.graph.row_map(src_lid+1)
929  - local_matrix.graph.row_map(src_lid);
930  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
931 
932  for (size_t j=local_matrix.graph.row_map(src_lid);
933  j<local_matrix.graph.row_map(src_lid+1); ++j) {
934  LO src_col = local_matrix.graph.entries(j);
935  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
936  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
937  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
938  }
939  }
940  );
941 }
942 
943 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
944 int
945 unpackAndCombineIntoCrsArrays2(
946  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
947  const typename PackTraits<int>::output_array_type& tgt_pids,
948  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
949  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
950  const typename PackTraits<size_t>::input_array_type& offsets,
951  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
952  const Kokkos::View<const char*, BufferDeviceType>& imports,
953  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
954  const LocalMatrix& /* local_matrix */,
955  const LocalMap /*& local_col_map*/,
956  const int my_pid,
957  const size_t bytes_per_value)
958 {
959  using Kokkos::View;
960  using Kokkos::subview;
961  using Kokkos::MemoryUnmanaged;
962  using Kokkos::parallel_reduce;
963  using Kokkos::atomic_fetch_add;
965  typedef typename LocalMap::device_type DT;
966  typedef typename LocalMap::local_ordinal_type LO;
967  typedef typename LocalMap::global_ordinal_type GO;
968  typedef typename LocalMatrix::value_type ST;
969  typedef typename DT::execution_space XS;
970  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
971  typedef typename Kokkos::pair<size_type, size_type> slice;
972  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
973 
974  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
975  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
976  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
977 
978  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
979 
980  int errors = 0;
981  const size_type num_import_lids = import_lids.size();
982 
983  // RemoteIDs: Loop structure following UnpackAndCombine
984  parallel_reduce ("Unpack and combine into CRS",
985  range_policy (0, num_import_lids),
986  KOKKOS_LAMBDA (const size_t i, int& k_error) {
987  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
988  const size_t num_bytes = num_packets_per_lid(i);
989  const size_t offset = offsets(i);
990  if (num_bytes == 0) {
991  // Empty buffer means that the row is empty.
992  return;
993  }
994  size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
995  if (num_ent == InvalidNum) {
996  k_error += 1;
997  return;
998  }
999  const LO lcl_row = import_lids(i);
1000  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1001  const size_t end_row = start_row + num_ent;
1002 
1003  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1004  vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1005  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1006 
1007  k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1008  imports.data(), offset, num_bytes,
1009  num_ent, bytes_per_value);
1010 
1011  // Correct target PIDs.
1012  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1013  const int pid = pids_out(j);
1014  pids_out(j) = (pid != my_pid) ? pid : -1;
1015  }
1016  }, errors);
1017 
1018  return errors;
1019 }
1020 
1021 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
1022 void
1024  const LocalMatrix & local_matrix,
1025  const LocalMap & local_col_map,
1026  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
1027  const Kokkos::View<const char*, BufferDeviceType>& imports,
1028  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1029  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1030  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1031  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1032  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1033  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1034  const typename PackTraits<int>::input_array_type& src_pids,
1035  const typename PackTraits<int>::output_array_type& tgt_pids,
1036  const size_t num_same_ids,
1037  const size_t tgt_num_rows,
1038  const size_t tgt_num_nonzeros,
1039  const int my_tgt_pid,
1040  const size_t bytes_per_value)
1041 {
1042  using Kokkos::View;
1043  using Kokkos::subview;
1044  using Kokkos::parallel_for;
1045  using Kokkos::MemoryUnmanaged;
1047  typedef typename LocalMap::device_type DT;
1048  typedef typename LocalMap::local_ordinal_type LO;
1049  typedef typename DT::execution_space XS;
1050  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1051  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1052  typedef BufferDeviceType BDT;
1053 
1054  const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
1055 
1056  const size_t N = tgt_num_rows;
1057 
1058  // In the case of reduced communicators, the sourceMatrix won't have
1059  // the right "my_pid", so thus we have to supply it.
1060  const int my_pid = my_tgt_pid;
1061 
1062  // Zero the rowptr
1063  parallel_for(range_policy(0, N+1),
1064  KOKKOS_LAMBDA(const size_t i) {
1065  tgt_rowptr(i) = 0;
1066  }
1067  );
1068 
1069  // same IDs: Always first, always in the same place
1070  parallel_for(range_policy(0, num_same_ids),
1071  KOKKOS_LAMBDA(const size_t i) {
1072  const LO tgt_lid = static_cast<LO>(i);
1073  const LO src_lid = static_cast<LO>(i);
1074  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1075  - local_matrix.graph.row_map(src_lid);
1076  }
1077  );
1078 
1079  // Permute IDs: Still local, but reordered
1080  const size_type num_permute_to_lids = permute_to_lids.extent(0);
1081  parallel_for(range_policy(0, num_permute_to_lids),
1082  KOKKOS_LAMBDA(const size_t i) {
1083  const LO tgt_lid = permute_to_lids(i);
1084  const LO src_lid = permute_from_lids(i);
1085  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1086  - local_matrix.graph.row_map(src_lid);
1087  }
1088  );
1089 
1090  // Get the offsets from the number of packets per LID
1091  const size_type num_import_lids = import_lids.extent(0);
1092  View<size_t*, DT> offsets("offsets", num_import_lids+1);
1093  computeOffsetsFromCounts(offsets, num_packets_per_lid);
1094 
1095 #ifdef HAVE_TPETRA_DEBUG
1096  {
1097  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1098  const bool condition =
1099  nth_offset_h != static_cast<size_t>(imports.extent (0));
1100  TEUCHOS_TEST_FOR_EXCEPTION
1101  (condition, std::logic_error, prefix
1102  << "The final offset in bytes " << nth_offset_h
1103  << " != imports.size() = " << imports.extent(0)
1104  << ". Please report this bug to the Tpetra developers.");
1105  }
1106 #endif // HAVE_TPETRA_DEBUG
1107 
1108  // Setup row pointers for remotes
1109  int k_error =
1110  setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1111  import_lids, imports, num_packets_per_lid, offsets);
1112  TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1113  << " Error transferring data to target row pointers. "
1114  "Please report this bug to the Tpetra developers.");
1115 
1116  // If multiple processes contribute to the same row, we may need to
1117  // update row offsets. This tracks that.
1118  View<size_t*, DT> new_start_row ("new_start_row", N+1);
1119 
1120  // Turn row length into a real CRS row pointer
1121  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1122 
1123  // SameIDs: Copy the data over
1124  copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1125  tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1126 
1127  copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1128  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1129  local_matrix, local_col_map, my_pid);
1130 
1131  if (imports.extent(0) <= 0) {
1132  return;
1133  }
1134 
1135  int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1136  tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1137  local_matrix, local_col_map, my_pid, bytes_per_value);
1138  TEUCHOS_TEST_FOR_EXCEPTION(
1139  unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1140  "should never happen. Please report this bug to the Tpetra developers.");
1141 
1142  return;
1143 }
1144 
1145 } // namespace UnpackAndCombineCrsMatrixImpl
1146 
1186 template<typename ST, typename LO, typename GO, typename Node>
1187 void
1189  const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1190  const Teuchos::ArrayView<const char>& imports,
1191  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1192  const Teuchos::ArrayView<const LO>& importLIDs,
1193  size_t /* constantNumPackets */,
1194  Distributor & /* distor */,
1195  CombineMode combineMode)
1196 {
1197  using Kokkos::View;
1198  typedef typename Node::device_type device_type;
1199  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_type local_matrix_type;
1200  static_assert (std::is_same<device_type, typename local_matrix_type::device_type>::value,
1201  "Node::device_type and LocalMatrix::device_type must be the same.");
1202 
1203  // Execution space.
1204  typedef typename device_type::execution_space XS;
1205 
1206  // Convert all Teuchos::Array to Kokkos::View.
1207  typename XS::device_type outputDevice;
1208 
1209  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1210  // them to device. Since unpacking is done directly in to the local matrix
1211  // (lclMatrix), no copying needs to be performed after unpacking.
1212  auto num_packets_per_lid_d =
1213  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1214  numPacketsPerLID.size(), true, "num_packets_per_lid");
1215 
1216  auto import_lids_d =
1217  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1218  importLIDs.size(), true, "import_lids");
1219 
1220  auto imports_d =
1221  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1222  imports.size(), true, "imports");
1223 
1224  auto local_matrix = sourceMatrix.getLocalMatrix();
1225  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1226 
1227  for (int i=0; i<importLIDs.size(); i++)
1228  {
1229  auto lclRow = importLIDs[i];
1230  Teuchos::ArrayView<const LO> A_indices;
1231  Teuchos::ArrayView<const ST> A_values;
1232  sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1233  }
1234  // Now do the actual unpack!
1235  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1236  local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1237  import_lids_d, combineMode);
1238 
1239 }
1240 
1241 template<typename ST, typename LO, typename GO, typename NT>
1242 void
1243 unpackCrsMatrixAndCombineNew(
1244  const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1245  Kokkos::DualView<char*,
1247  Kokkos::DualView<size_t*,
1248  typename DistObject<char, LO, GO, NT>::buffer_device_type> numPacketsPerLID,
1249  const Kokkos::DualView<const LO*,
1251  const size_t /* constantNumPackets */,
1252  Distributor& /* distor */,
1253  const CombineMode combineMode)
1254 {
1255  using Kokkos::View;
1256  using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1257  using dist_object_type = DistObject<char, LO, GO, NT>;
1258  using device_type = typename crs_matrix_type::device_type;
1259  using local_matrix_type = typename crs_matrix_type::local_matrix_type;
1260  using buffer_device_type = typename dist_object_type::buffer_device_type;
1261 
1262  static_assert
1263  (std::is_same<device_type, typename local_matrix_type::device_type>::value,
1264  "crs_matrix_type::device_type and local_matrix_type::device_type "
1265  "must be the same.");
1266 
1267  if (numPacketsPerLID.need_sync_device()) {
1268  numPacketsPerLID.sync_device ();
1269  }
1270  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1271 
1272  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1273  auto import_lids_d = importLIDs.view_device ();
1274 
1275  if (imports.need_sync_device()) {
1276  imports.sync_device ();
1277  }
1278  auto imports_d = imports.view_device ();
1279 
1280  auto local_matrix = sourceMatrix.getLocalMatrix ();
1281  auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1282  typedef decltype (local_col_map) local_map_type;
1283 
1284  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1285  local_matrix_type,
1286  local_map_type,
1287  buffer_device_type
1288  > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1289  import_lids_d, combineMode);
1290 }
1291 
1338 //
1347 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1348 size_t
1350  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1351  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1352  const Teuchos::ArrayView<const char> &imports,
1353  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1354  size_t /* constantNumPackets */,
1355  Distributor &/* distor */,
1356  CombineMode /* combineMode */,
1357  size_t numSameIDs,
1358  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1359  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1360 {
1361  using Kokkos::MemoryUnmanaged;
1362  using Kokkos::View;
1363  typedef typename Node::device_type DT;
1365  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1366 
1367  TEUCHOS_TEST_FOR_EXCEPTION
1368  (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1369  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1370  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1371  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1372  // process, then the matrix is neither locally nor globally indexed.
1373  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1374  TEUCHOS_TEST_FOR_EXCEPTION
1375  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1376  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1377  TEUCHOS_TEST_FOR_EXCEPTION
1378  (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1379  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1380  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1381 
1382  auto local_matrix = sourceMatrix.getLocalMatrix ();
1383  auto permute_from_lids_d =
1385  permuteFromLIDs.getRawPtr (),
1386  permuteFromLIDs.size (), true,
1387  "permute_from_lids");
1388  auto imports_d =
1390  imports.getRawPtr (),
1391  imports.size (), true,
1392  "imports");
1393  auto num_packets_per_lid_d =
1395  numPacketsPerLID.getRawPtr (),
1396  numPacketsPerLID.size (), true,
1397  "num_packets_per_lid");
1398 
1400  local_matrix, permute_from_lids_d, imports_d,
1401  num_packets_per_lid_d, numSameIDs);
1402 }
1403 
1418 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1419 void
1422  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1423  const Teuchos::ArrayView<const char>& imports,
1424  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1425  const size_t /* constantNumPackets */,
1426  Distributor& /* distor */,
1427  const CombineMode /* combineMode */,
1428  const size_t numSameIDs,
1429  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1430  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1431  size_t TargetNumRows,
1432  size_t TargetNumNonzeros,
1433  const int MyTargetPID,
1434  const Teuchos::ArrayView<size_t>& CRS_rowptr,
1435  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1436  const Teuchos::ArrayView<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>& CRS_vals,
1437  const Teuchos::ArrayView<const int>& SourcePids,
1438  Teuchos::Array<int>& TargetPids)
1439 {
1441 
1442  using Kokkos::View;
1443  using Kokkos::deep_copy;
1444 
1445  using Teuchos::ArrayView;
1446  using Teuchos::outArg;
1447  using Teuchos::REDUCE_MAX;
1448  using Teuchos::reduceAll;
1449 
1450  typedef LocalOrdinal LO;
1451 
1452  typedef typename Node::device_type DT;
1453  typedef typename DT::execution_space XS;
1454 
1456  typedef typename matrix_type::impl_scalar_type ST;
1457  typedef typename ArrayView<const LO>::size_type size_type;
1458 
1459  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1460 
1461  TEUCHOS_TEST_FOR_EXCEPTION(
1462  TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1463  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1464  CRS_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1465 
1466  TEUCHOS_TEST_FOR_EXCEPTION(
1467  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1468  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
1469  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
1470  const size_type numImportLIDs = importLIDs.size ();
1471 
1472  TEUCHOS_TEST_FOR_EXCEPTION(
1473  numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1474  prefix << "importLIDs.size() = " << numImportLIDs << " != "
1475  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1476 
1477  // Preseed TargetPids with -1 for local
1478  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1479  TargetPids.resize (TargetNumNonzeros);
1480  }
1481  TargetPids.assign (TargetNumNonzeros, -1);
1482 
1483  // Grab pointers for sourceMatrix
1484  auto local_matrix = sourceMatrix.getLocalMatrix();
1485  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1486 
1487  // Convert input arrays to Kokkos::View
1488  typename XS::device_type outputDevice;
1489  auto import_lids_d =
1490  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1491  importLIDs.size(), true, "import_lids");
1492 
1493  auto imports_d =
1494  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1495  imports.size(), true, "imports");
1496 
1497  auto num_packets_per_lid_d =
1498  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1499  numPacketsPerLID.size(), true, "num_packets_per_lid");
1500 
1501  auto permute_from_lids_d =
1502  create_mirror_view_from_raw_host_array(outputDevice, permuteFromLIDs.getRawPtr(),
1503  permuteFromLIDs.size(), true, "permute_from_lids");
1504 
1505  auto permute_to_lids_d =
1506  create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(),
1507  permuteToLIDs.size(), true, "permute_to_lids");
1508 
1509  auto crs_rowptr_d =
1510  create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1511  CRS_rowptr.size(), true, "crs_rowptr");
1512 
1513  auto crs_colind_d =
1514  create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1515  CRS_colind.size(), true, "crs_colidx");
1516 
1517 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1518  static_assert (! std::is_same<
1519  typename std::remove_const<
1520  typename std::decay<
1521  decltype (CRS_vals)
1522  >::type::value_type
1523  >::type,
1524  std::complex<double> >::value,
1525  "CRS_vals::value_type is std::complex<double>; this should never happen"
1526  ", since std::complex does not work in Kokkos::View objects.");
1527 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1528 
1529  auto crs_vals_d =
1530  create_mirror_view_from_raw_host_array(outputDevice, CRS_vals.getRawPtr(),
1531  CRS_vals.size(), true, "crs_vals");
1532 
1533 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1534  static_assert (! std::is_same<
1535  typename decltype (crs_vals_d)::non_const_value_type,
1536  std::complex<double> >::value,
1537  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1538  "never happen, since std::complex does not work in Kokkos::View objects.");
1539 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1540 
1541  auto src_pids_d =
1542  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1543  SourcePids.size(), true, "src_pids");
1544 
1545  auto tgt_pids_d =
1546  create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1547  TargetPids.size(), true, "tgt_pids");
1548 
1549  size_t bytes_per_value = 0;
1551  // assume that ST is default constructible
1552  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1553  }
1554  else {
1555  // Since the packed data come from the source matrix, we can use the source
1556  // matrix to get the number of bytes per Scalar value stored in the matrix.
1557  // This assumes that all Scalar values in the source matrix require the same
1558  // number of bytes. If the source matrix has no entries on the calling
1559  // process, then we hope that some process does have some idea how big
1560  // a Scalar value is. Of course, if no processes have any entries, then no
1561  // values should be packed (though this does assume that in our packing
1562  // scheme, rows with zero entries take zero bytes).
1563  size_t bytes_per_value_l = 0;
1564  if (local_matrix.values.extent(0) > 0) {
1565  const ST& val = local_matrix.values(0);
1566  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1567  } else {
1568  const ST& val = crs_vals_d(0);
1569  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1570  }
1571  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1572  Teuchos::REDUCE_MAX,
1573  bytes_per_value_l,
1574  outArg(bytes_per_value));
1575  }
1576 
1577 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1578  static_assert (! std::is_same<
1579  typename decltype (crs_vals_d)::non_const_value_type,
1580  std::complex<double> >::value,
1581  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1582  "never happen, since std::complex does not work in Kokkos::View objects.");
1583 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1584 
1586  local_matrix, local_col_map, import_lids_d, imports_d,
1587  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1588  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1589  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1590  bytes_per_value);
1591 
1592  // Copy outputs back to host
1593  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1594  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1595  deep_copy(crs_rowptr_h, crs_rowptr_d);
1596 
1597  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1598  CRS_colind.getRawPtr(), CRS_colind.size());
1599  deep_copy(crs_colind_h, crs_colind_d);
1600 
1601  typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1602  CRS_vals.getRawPtr(), CRS_vals.size());
1603  deep_copy(crs_vals_h, crs_vals_d);
1604 
1605  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1606  TargetPids.getRawPtr(), TargetPids.size());
1607  deep_copy(tgt_pids_h, tgt_pids_d);
1608 
1609 }
1610 
1611 } // namespace Details
1612 } // namespace Tpetra
1613 
1614 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1615  template void \
1616  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1617  const CrsMatrix<ST, LO, GO, NT>&, \
1618  const Teuchos::ArrayView<const char>&, \
1619  const Teuchos::ArrayView<const size_t>&, \
1620  const Teuchos::ArrayView<const LO>&, \
1621  size_t, \
1622  Distributor&, \
1623  CombineMode); \
1624  template void \
1625  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1626  const CrsMatrix<ST, LO, GO, NT>&, \
1627  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1628  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1629  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1630  const size_t, \
1631  Distributor&, \
1632  const CombineMode); \
1633  template void \
1634  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1635  const CrsMatrix<ST, LO, GO, NT> &, \
1636  const Teuchos::ArrayView<const LO>&, \
1637  const Teuchos::ArrayView<const char>&, \
1638  const Teuchos::ArrayView<const size_t>&, \
1639  const size_t, \
1640  Distributor&, \
1641  const CombineMode, \
1642  const size_t, \
1643  const Teuchos::ArrayView<const LO>&, \
1644  const Teuchos::ArrayView<const LO>&, \
1645  size_t, \
1646  size_t, \
1647  const int, \
1648  const Teuchos::ArrayView<size_t>&, \
1649  const Teuchos::ArrayView<GO>&, \
1650  const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1651  const Teuchos::ArrayView<const int>&, \
1652  Teuchos::Array<int>&); \
1653  template size_t \
1654  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1655  const CrsMatrix<ST, LO, GO, NT> &, \
1656  const Teuchos::ArrayView<const LO> &, \
1657  const Teuchos::ArrayView<const char> &, \
1658  const Teuchos::ArrayView<const size_t>&, \
1659  size_t, \
1660  Distributor &, \
1661  CombineMode, \
1662  size_t, \
1663  const Teuchos::ArrayView<const LO>&, \
1664  const Teuchos::ArrayView<const LO>&);
1665 
1666 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for &quot;invalid&quot; (flag) values of integer types...
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index.
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
&quot;Local&quot; part of Map suitable for Kokkos kernels.
typename Kokkos::ArithTraits< Scalar >::val_type impl_scalar_type
The type used internally in place of Scalar.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don&#39;t currently exist.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Sets up and executes a communication plan for a Tpetra DistObject.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
local_matrix_type getLocalMatrix() const
The local sparse matrix.
void getLocalRowView(LocalOrdinal LocalRow, Teuchos::ArrayView< const LocalOrdinal > &indices, Teuchos::ArrayView< const Scalar > &values) const override
Get a constant, nonpersisting view of a row of this matrix, using local row and column indices...
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...