Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_unpackCrsMatrixAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
41 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
42 
43 #include "TpetraCore_config.h"
44 #include "Teuchos_Array.hpp"
45 #include "Teuchos_ArrayView.hpp"
46 #include "Teuchos_OrdinalTraits.hpp"
54 #include "Kokkos_Core.hpp"
55 #include <memory>
56 #include <string>
57 
78 
79 namespace Tpetra {
80 
81 //
82 // Users must never rely on anything in the Details namespace.
83 //
84 namespace Details {
85 
86 namespace UnpackAndCombineCrsMatrixImpl {
87 
97 template<class ST, class LO, class GO>
98 KOKKOS_FUNCTION int
99 unpackRow(const typename PackTraits<GO>::output_array_type& gids_out,
100  const typename PackTraits<int>::output_array_type& pids_out,
101  const typename PackTraits<ST>::output_array_type& vals_out,
102  const char imports[],
103  const size_t offset,
104  const size_t /* num_bytes */,
105  const size_t num_ent,
106  const size_t bytes_per_value)
107 {
108  if (num_ent == 0) {
109  // Empty rows always take zero bytes, to ensure sparsity.
110  return 0;
111  }
112  bool unpack_pids = pids_out.size() > 0;
113 
114  const size_t num_ent_beg = offset;
115  const size_t num_ent_len = PackTraits<LO>::packValueCount (LO (0));
116 
117  const size_t gids_beg = num_ent_beg + num_ent_len;
118  const size_t gids_len =
119  num_ent * PackTraits<GO>::packValueCount (GO (0));
120 
121  const size_t pids_beg = gids_beg + gids_len;
122  const size_t pids_len = unpack_pids ?
123  size_t (num_ent * PackTraits<int>::packValueCount (int (0))) :
124  size_t (0);
125 
126  const size_t vals_beg = gids_beg + gids_len + pids_len;
127  const size_t vals_len = num_ent * bytes_per_value;
128 
129  const char* const num_ent_in = imports + num_ent_beg;
130  const char* const gids_in = imports + gids_beg;
131  const char* const pids_in = unpack_pids ? imports + pids_beg : nullptr;
132  const char* const vals_in = imports + vals_beg;
133 
134  size_t num_bytes_out = 0;
135  LO num_ent_out;
136  num_bytes_out += PackTraits<LO>::unpackValue (num_ent_out, num_ent_in);
137  if (static_cast<size_t> (num_ent_out) != num_ent) {
138  return 20; // error code
139  }
140 
141  {
142  Kokkos::pair<int, size_t> p;
143  p = PackTraits<GO>::unpackArray (gids_out.data (), gids_in, num_ent);
144  if (p.first != 0) {
145  return 21; // error code
146  }
147  num_bytes_out += p.second;
148 
149  if (unpack_pids) {
150  p = PackTraits<int>::unpackArray (pids_out.data (), pids_in, num_ent);
151  if (p.first != 0) {
152  return 22; // error code
153  }
154  num_bytes_out += p.second;
155  }
156 
157  p = PackTraits<ST>::unpackArray (vals_out.data (), vals_in, num_ent);
158  if (p.first != 0) {
159  return 23; // error code
160  }
161  num_bytes_out += p.second;
162  }
163 
164  const size_t expected_num_bytes = num_ent_len + gids_len + pids_len + vals_len;
165  if (num_bytes_out != expected_num_bytes) {
166  return 24; // error code
167  }
168  return 0; // no errors
169 }
170 
181 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
183  typedef LocalMatrix local_matrix_type;
184  typedef LocalMap local_map_type;
185 
186  typedef typename local_matrix_type::value_type ST;
187  typedef typename local_map_type::local_ordinal_type LO;
188  typedef typename local_map_type::global_ordinal_type GO;
189  typedef typename local_map_type::device_type DT;
190  typedef typename DT::execution_space XS;
191 
192  typedef Kokkos::View<const size_t*, BufferDeviceType>
193  num_packets_per_lid_type;
194  typedef Kokkos::View<const size_t*, DT> offsets_type;
195  typedef Kokkos::View<const char*, BufferDeviceType> input_buffer_type;
196  typedef Kokkos::View<const LO*, BufferDeviceType> import_lids_type;
197 
198  typedef Kokkos::View<int, DT> error_type;
199  using member_type = typename Kokkos::TeamPolicy<XS>::member_type;
200 
201  static_assert (std::is_same<LO, typename local_matrix_type::ordinal_type>::value,
202  "LocalMap::local_ordinal_type and "
203  "LocalMatrix::ordinal_type must be the same.");
204 
205  local_matrix_type local_matrix;
206  local_map_type local_col_map;
207  input_buffer_type imports;
208  num_packets_per_lid_type num_packets_per_lid;
209  import_lids_type import_lids;
210  Kokkos::View<const LO*[2], DT> batch_info;
211  offsets_type offsets;
212  Tpetra::CombineMode combine_mode;
213  size_t batch_size;
214  size_t bytes_per_value;
215  bool atomic;
216  error_type error_code;
217 
219  const local_matrix_type& local_matrix_in,
220  const local_map_type& local_col_map_in,
221  const input_buffer_type& imports_in,
222  const num_packets_per_lid_type& num_packets_per_lid_in,
223  const import_lids_type& import_lids_in,
224  const Kokkos::View<const LO*[2], DT>& batch_info_in,
225  const offsets_type& offsets_in,
226  const Tpetra::CombineMode combine_mode_in,
227  const size_t batch_size_in,
228  const size_t bytes_per_value_in,
229  const bool atomic_in) :
230  local_matrix (local_matrix_in),
231  local_col_map (local_col_map_in),
232  imports (imports_in),
233  num_packets_per_lid (num_packets_per_lid_in),
234  import_lids (import_lids_in),
235  batch_info (batch_info_in),
236  offsets (offsets_in),
237  combine_mode (combine_mode_in),
238  batch_size (batch_size_in),
239  bytes_per_value (bytes_per_value_in),
240  atomic (atomic_in),
241  error_code("error")
242  {}
243 
244  KOKKOS_INLINE_FUNCTION
245  void operator()(member_type team_member) const
246  {
247  using Kokkos::View;
248  using Kokkos::subview;
249  using Kokkos::MemoryUnmanaged;
250 
251  const LO batch = team_member.league_rank();
252  const LO lid_no = batch_info(batch, 0);
253  const LO batch_no = batch_info(batch, 1);
254 
255  const size_t num_bytes = num_packets_per_lid(lid_no);
256 
257  // Only unpack data if there is a nonzero number of bytes.
258  if (num_bytes == 0)
259  return;
260 
261  // there is actually something in the row
262  const LO import_lid = import_lids(lid_no);
263  const size_t buf_size = imports.size();
264  const size_t offset = offsets(lid_no);
265 
266  // Get the number of entries to expect in the received data for this row.
267  LO num_ent_LO = 0;
268  const char* const in_buf = imports.data() + offset;
269  (void) PackTraits<LO>::unpackValue(num_ent_LO, in_buf);
270  const size_t num_entries_in_row = static_cast<size_t>(num_ent_LO);
271 
272  // Count the number of bytes expected to unpack
273  size_t expected_num_bytes = 0;
274  {
275  expected_num_bytes += PackTraits<LO>::packValueCount(LO(0));
276  expected_num_bytes += num_entries_in_row * PackTraits<GO>::packValueCount(GO(0));
277  expected_num_bytes += num_entries_in_row * PackTraits<ST>::packValueCount(ST());
278  }
279 
280  if (expected_num_bytes > num_bytes)
281  {
282 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
283 #ifndef KOKKOS_ENABLE_SYCL
284  printf(
285  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
286  "At row %d, the expected number of bytes (%d) != number of unpacked bytes (%d)\n",
287  (int) lid_no, (int) expected_num_bytes, (int) num_bytes
288  );
289 #endif
290  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 21);
291  return;
292  }
293 
294  if (offset > buf_size || offset + num_bytes > buf_size)
295  {
296 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
297 #ifndef KOKKOS_ENABLE_SYCL
298  printf(
299  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
300  "At row %d, the offset (%d) > buffer size (%d)\n",
301  (int) lid_no, (int) offset, (int) buf_size
302  );
303 #endif
304  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 22);
305  return;
306  }
307 
308  // Determine the number of entries to unpack in this batch
309  size_t num_entries_in_batch = 0;
310  if (num_entries_in_row <= batch_size)
311  num_entries_in_batch = num_entries_in_row;
312  else if (num_entries_in_row >= (batch_no + 1) * batch_size)
313  num_entries_in_batch = batch_size;
314  else
315  num_entries_in_batch = num_entries_in_row - batch_no * batch_size;
316 
317  const size_t bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
318  const size_t num_ent_start = offset;
319  const size_t num_ent_end = num_ent_start + bytes_per_lid;
320 
321  const size_t bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
322  const size_t gids_start = num_ent_end;
323  const size_t gids_end = gids_start + num_entries_in_row * bytes_per_gid;
324 
325  const size_t vals_start = gids_end;
326 
327  const size_t shift = batch_no * batch_size;
328  const char* const num_ent_in = imports.data() + num_ent_start;
329  const char* const gids_in = imports.data() + gids_start + shift * bytes_per_gid;
330  const char* const vals_in = imports.data() + vals_start + shift * bytes_per_value;
331 
332  LO num_ent_out;
333  (void)PackTraits<LO>::unpackValue(num_ent_out, num_ent_in);
334  if (static_cast<size_t>(num_ent_out) != num_entries_in_row)
335  {
336 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
337 #ifndef KOKKOS_ENABLE_SYCL
338  printf(
339  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
340  "At row %d, number of entries (%d) != number of entries unpacked (%d)\n",
341  (int) lid_no, (int) num_entries_in_row, (int) num_ent_out
342  );
343 #endif
344  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 23);
345  }
346 
347  constexpr bool matrix_has_sorted_rows = true; // see #6282
348  //Note BMK 6-22: this lambda must use capture-by-value [=] and not capture-by-ref [&].
349  //By ref triggers compiler bug in CUDA 10.
350  Kokkos::parallel_for(
351  Kokkos::TeamThreadRange(team_member, num_entries_in_batch),
352  [=](const LO& j)
353  {
354  size_t distance = 0;
355 
356  GO gid_out;
357  distance = j * bytes_per_gid;
358  (void) PackTraits<GO>::unpackValue(gid_out, gids_in + distance);
359  auto lid_out = local_col_map.getLocalElement(gid_out);
360 
361  // Column indices come in as global indices, in case the
362  // source object's column Map differs from the target object's
363  // (this's) column Map, and must be converted local index values
364 
365  // assume that ST is default constructible
366  ST val_out;
367  distance = j * bytes_per_value;
368  (void) PackTraits<ST>::unpackValue(val_out, vals_in + distance);
369 
370  if (combine_mode == ADD) {
371  // NOTE (mfh 20 Nov 2019) Must assume atomic is required, unless
372  // different threads don't touch the same row (i.e., no
373  // duplicates in incoming LIDs list).
374  const bool use_atomic_updates = atomic;
375  (void)local_matrix.sumIntoValues(
376  import_lid,
377  &lid_out,
378  1,
379  &val_out,
380  matrix_has_sorted_rows,
381  use_atomic_updates
382  );
383  } else if (combine_mode == REPLACE) {
384  // NOTE (mfh 20 Nov 2019): It's never correct to use REPLACE
385  // combine mode with multiple incoming rows that touch the same
386  // target matrix entries, so we never need atomic updates.
387  const bool use_atomic_updates = false;
388  (void)local_matrix.replaceValues(
389  import_lid,
390  &lid_out,
391  1,
392  &val_out,
393  matrix_has_sorted_rows,
394  use_atomic_updates
395  );
396  } else {
397  // should never get here
398 // FIXME_SYCL Enable again once a SYCL conforming printf implementation is available.
399 #ifndef KOKKOS_ENABLE_SYCL
400  printf(
401  "*** Error: UnpackCrsMatrixAndCombineFunctor: "
402  "At row %d, an unknown error occurred during unpack\n", (int) lid_no
403  );
404 #endif
405  Kokkos::atomic_compare_exchange_strong(error_code.data(), 0, 31);
406  }
407  }
408  );
409 
410  team_member.team_barrier();
411 
412  }
413 
415  int error() const {
416  auto error_code_h = Kokkos::create_mirror_view_and_copy(
417  Kokkos::HostSpace(), error_code
418  );
419  return error_code_h();
420  }
421 
422 };
423 
424 struct MaxNumEntTag {};
425 struct TotNumEntTag {};
426 
435 template<class LO, class DT, class BDT>
437 public:
438  typedef Kokkos::View<const size_t*, BDT> num_packets_per_lid_type;
439  typedef Kokkos::View<const size_t*, DT> offsets_type;
440  typedef Kokkos::View<const char*, BDT> input_buffer_type;
441  // This needs to be public, since it appears in the argument list of
442  // public methods (see below). Otherwise, build errors may happen.
443  typedef size_t value_type;
444 
445 private:
446  num_packets_per_lid_type num_packets_per_lid;
447  offsets_type offsets;
448  input_buffer_type imports;
449 
450 public:
451  NumEntriesFunctor (const num_packets_per_lid_type num_packets_per_lid_in,
452  const offsets_type& offsets_in,
453  const input_buffer_type& imports_in) :
454  num_packets_per_lid (num_packets_per_lid_in),
455  offsets (offsets_in),
456  imports (imports_in)
457  {}
458 
459  KOKKOS_INLINE_FUNCTION void
460  operator() (const MaxNumEntTag, const LO i, value_type& update) const {
461  // Get how many entries to expect in the received data for this row.
462  const size_t num_bytes = num_packets_per_lid(i);
463  if (num_bytes > 0) {
464  LO num_ent_LO = 0; // output argument of unpackValue
465  const char* const in_buf = imports.data () + offsets(i);
466  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
467  const size_t num_ent = static_cast<size_t> (num_ent_LO);
468 
469  update = (update < num_ent) ? num_ent : update;
470  }
471  }
472 
473  KOKKOS_INLINE_FUNCTION void
474  join (const MaxNumEntTag,
475  value_type& dst,
476  const value_type& src) const
477  {
478  if (dst < src) dst = src;
479  }
480 
481  KOKKOS_INLINE_FUNCTION void
482  operator() (const TotNumEntTag, const LO i, value_type& tot_num_ent) const {
483  // Get how many entries to expect in the received data for this row.
484  const size_t num_bytes = num_packets_per_lid(i);
485  if (num_bytes > 0) {
486  LO num_ent_LO = 0; // output argument of unpackValue
487  const char* const in_buf = imports.data () + offsets(i);
488  (void) PackTraits<LO>::unpackValue (num_ent_LO, in_buf);
489  tot_num_ent += static_cast<size_t> (num_ent_LO);
490  }
491  }
492 };
493 
501 template<class LO, class DT, class BDT>
502 size_t
503 compute_maximum_num_entries (
504  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
505  const Kokkos::View<const size_t*, DT>& offsets,
506  const Kokkos::View<const char*, BDT>& imports)
507 {
508  typedef typename DT::execution_space XS;
509  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>,
510  MaxNumEntTag> range_policy;
511 
512  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
513  imports);
514  const LO numRowsToUnpack =
515  static_cast<LO> (num_packets_per_lid.extent (0));
516  size_t max_num_ent = 0;
517  Kokkos::parallel_reduce ("Max num entries in CRS",
518  range_policy (0, numRowsToUnpack),
519  functor, max_num_ent);
520  return max_num_ent;
521 }
522 
530 template<class LO, class DT, class BDT>
531 size_t
532 compute_total_num_entries (
533  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
534  const Kokkos::View<const size_t*, DT>& offsets,
535  const Kokkos::View<const char*, BDT>& imports)
536 {
537  typedef typename DT::execution_space XS;
538  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO>, TotNumEntTag> range_policy;
539  size_t tot_num_ent = 0;
540  NumEntriesFunctor<LO, DT, BDT> functor (num_packets_per_lid, offsets,
541  imports);
542  const LO numRowsToUnpack =
543  static_cast<LO> (num_packets_per_lid.extent (0));
544  Kokkos::parallel_reduce ("Total num entries in CRS to unpack",
545  range_policy (0, numRowsToUnpack),
546  functor, tot_num_ent);
547  return tot_num_ent;
548 }
549 
550 template<class LO>
551 KOKKOS_INLINE_FUNCTION
552 size_t
553 unpackRowCount(const char imports[],
554  const size_t offset,
555  const size_t num_bytes)
556 {
557  using PT = PackTraits<LO>;
558 
559  LO num_ent_LO = 0;
560  if (num_bytes > 0) {
561  const size_t p_num_bytes = PT::packValueCount(num_ent_LO);
562  if (p_num_bytes > num_bytes) {
563  return OrdinalTraits<size_t>::invalid();
564  }
565  const char* const in_buf = imports + offset;
566  (void) PT::unpackValue(num_ent_LO, in_buf);
567  }
568  return static_cast<size_t>(num_ent_LO);
569 }
570 
575 template<class View1, class View2>
576 inline
577 bool
578 compute_batch_info(
579  const View1& batches_per_lid,
580  View2& batch_info
581 )
582 {
583  using LO = typename View2::value_type;
584  size_t batch = 0;
585  for (size_t i=0; i<batches_per_lid.extent(0); i++)
586  {
587  for (size_t batch_no=0; batch_no<batches_per_lid(i); batch_no++)
588  {
589  batch_info(batch, 0) = static_cast<LO>(i);
590  batch_info(batch, 1) = batch_no;
591  batch++;
592  }
593  }
594  return batch == batch_info.extent(0);
595 }
596 
604 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
605 void
606 unpackAndCombineIntoCrsMatrix(
607  const LocalMatrix& local_matrix,
608  const LocalMap& local_map,
609  const Kokkos::View<const char*, BufferDeviceType>& imports,
610  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
611  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type import_lids,
612  const Tpetra::CombineMode combine_mode)
613 {
614  using ST = typename LocalMatrix::value_type;
615  using LO = typename LocalMap::local_ordinal_type;
616  using DT = typename LocalMap::device_type;
617  using XS = typename DT::execution_space;
618  const char prefix[] =
619  "Tpetra::Details::UnpackAndCombineCrsMatrixImpl::"
620  "unpackAndCombineIntoCrsMatrix: ";
621 
622  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
623  if (num_import_lids == 0) {
624  // Nothing to unpack
625  return;
626  }
627 
628  {
629  // Check for correct input
630  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == ABSMAX,
631  std::invalid_argument,
632  prefix << "ABSMAX combine mode is not yet implemented for a matrix that has a "
633  "static graph (i.e., was constructed with the CrsMatrix constructor "
634  "that takes a const CrsGraph pointer).");
635 
636  TEUCHOS_TEST_FOR_EXCEPTION(combine_mode == INSERT,
637  std::invalid_argument,
638  prefix << "INSERT combine mode is not allowed if the matrix has a static graph "
639  "(i.e., was constructed with the CrsMatrix constructor that takes a "
640  "const CrsGraph pointer).");
641 
642  // Unknown combine mode!
643  TEUCHOS_TEST_FOR_EXCEPTION(!(combine_mode == ADD || combine_mode == REPLACE),
644  std::invalid_argument,
645  prefix << "Invalid combine mode; should never get "
646  "here! Please report this bug to the Tpetra developers.");
647 
648  // Check that sizes of input objects are consistent.
649  bool bad_num_import_lids =
650  num_import_lids != static_cast<size_t>(num_packets_per_lid.extent(0));
651  TEUCHOS_TEST_FOR_EXCEPTION(bad_num_import_lids,
652  std::invalid_argument,
653  prefix << "importLIDs.size() (" << num_import_lids << ") != "
654  "numPacketsPerLID.size() (" << num_packets_per_lid.extent(0) << ").");
655  } // end QA error checking
656 
657  // Get the offsets
658  Kokkos::View<size_t*, DT> offsets("offsets", num_import_lids+1);
659  computeOffsetsFromCounts(offsets, num_packets_per_lid);
660 
661  // Determine the sizes of the unpack batches
662  size_t max_num_ent = compute_maximum_num_entries<LO,DT>(num_packets_per_lid, offsets, imports);
663  const size_t default_batch_size = Tpetra::Details::Behavior::hierarchicalUnpackBatchSize();
664  const size_t batch_size = std::min(default_batch_size, max_num_ent);
665 
666  // To achieve some balance amongst threads, unpack each row in equal size batches
667  size_t num_batches = 0;
668  Kokkos::View<LO*[2], DT> batch_info("", num_batches);
669  Kokkos::View<size_t*, DT> batches_per_lid("", num_import_lids);
670  // Compute meta data that allows batch unpacking
671  Kokkos::parallel_reduce(
672  Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t>>(0, num_import_lids),
673  KOKKOS_LAMBDA(const size_t i, size_t& batches)
674  {
675  const size_t num_entries_in_row = unpackRowCount<LO>(
676  imports.data(), offsets(i), num_packets_per_lid(i)
677  );
678  batches_per_lid(i) =
679  (num_entries_in_row <= batch_size) ?
680  1 :
681  num_entries_in_row / batch_size + (num_entries_in_row % batch_size != 0);
682  batches += batches_per_lid(i);
683  },
684  num_batches
685  );
686  Kokkos::resize(batch_info, num_batches);
687 
688  Kokkos::HostSpace host_space;
689  auto batches_per_lid_h = Kokkos::create_mirror_view(host_space, batches_per_lid);
690  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
691  Kokkos::deep_copy(XS(), batches_per_lid_h, batches_per_lid);
692 
693  auto batch_info_h = Kokkos::create_mirror_view(host_space, batch_info);
694 
695  (void) compute_batch_info(batches_per_lid_h, batch_info_h);
696  // DEEP_COPY REVIEW - HOSTMIRROR-TO-DEVICE
697  Kokkos::deep_copy(XS(), batch_info, batch_info_h);
698 
699  // FIXME (TJF SEP 2017)
700  // The scalar type is not necessarily default constructible
701  size_t bytes_per_value = PackTraits<ST>::packValueCount(ST());
702 
703  // Now do the actual unpack!
704  const bool atomic = XS().concurrency() != 1;
705  using functor = UnpackCrsMatrixAndCombineFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
706  functor f(
707  local_matrix,
708  local_map,
709  imports,
710  num_packets_per_lid,
711  import_lids,
712  batch_info,
713  offsets,
714  combine_mode,
715  batch_size,
716  bytes_per_value,
717  atomic
718  );
719 
720  using policy = Kokkos::TeamPolicy<XS, Kokkos::IndexType<LO>>;
722 #if defined(KOKKOS_ENABLE_CUDA)
723  constexpr bool is_cuda = std::is_same<XS, Kokkos::Cuda>::value;
724 #else
725  constexpr bool is_cuda = false;
726 #endif
727  if (!is_cuda || team_size == Teuchos::OrdinalTraits<size_t>::invalid())
728  {
729  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), Kokkos::AUTO), f);
730  }
731  else
732  {
733  Kokkos::parallel_for(policy(static_cast<LO>(num_batches), static_cast<int>(team_size)), f);
734  }
735 
736  auto error_code = f.error();
737  TEUCHOS_TEST_FOR_EXCEPTION(
738  error_code != 0,
739  std::runtime_error,
740  prefix << "UnpackCrsMatrixAndCombineFunctor reported error code " << error_code
741  );
742 }
743 
744 template<class LocalMatrix, class BufferDeviceType>
745 size_t
747  const LocalMatrix& local_matrix,
748  const typename PackTraits<typename LocalMatrix::ordinal_type>::input_array_type permute_from_lids,
749  const Kokkos::View<const char*, BufferDeviceType>& imports,
750  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
751  const size_t num_same_ids)
752 {
753  using Kokkos::parallel_reduce;
754  typedef typename LocalMatrix::ordinal_type LO;
755  typedef typename LocalMatrix::device_type device_type;
756  typedef typename device_type::execution_space XS;
757  typedef typename Kokkos::View<LO*, device_type>::size_type size_type;
758  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<LO> > range_policy;
759  typedef BufferDeviceType BDT;
760 
761  size_t count = 0;
762  LO num_items;
763 
764  // Number of matrix entries to unpack (returned by this function).
765  num_items = static_cast<LO>(num_same_ids);
766  if (num_items) {
767  size_t kcnt = 0;
768  parallel_reduce(range_policy(0, num_items),
769  KOKKOS_LAMBDA(const LO lid, size_t& update) {
770  update += static_cast<size_t>(local_matrix.graph.row_map[lid+1]
771  -local_matrix.graph.row_map[lid]);
772  }, kcnt);
773  count += kcnt;
774  }
775 
776  // Count entries copied directly from the source matrix with permuting.
777  num_items = static_cast<LO>(permute_from_lids.extent(0));
778  if (num_items) {
779  size_t kcnt = 0;
780  parallel_reduce(range_policy(0, num_items),
781  KOKKOS_LAMBDA(const LO i, size_t& update) {
782  const LO lid = permute_from_lids(i);
783  update += static_cast<size_t> (local_matrix.graph.row_map[lid+1]
784  - local_matrix.graph.row_map[lid]);
785  }, kcnt);
786  count += kcnt;
787  }
788 
789  {
790  // Count entries received from other MPI processes.
791  const size_type np = num_packets_per_lid.extent(0);
792  Kokkos::View<size_t*, device_type> offsets("offsets", np+1);
793  computeOffsetsFromCounts(offsets, num_packets_per_lid);
794  count +=
795  compute_total_num_entries<LO, device_type, BDT> (num_packets_per_lid,
796  offsets, imports);
797  }
798 
799  return count;
800 }
801 
803 template<class LO, class DT, class BDT>
804 int
805 setupRowPointersForRemotes(
806  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
807  const typename PackTraits<LO>::input_array_type& import_lids,
808  const Kokkos::View<const char*, BDT>& imports,
809  const Kokkos::View<const size_t*, BDT>& num_packets_per_lid,
810  const typename PackTraits<size_t>::input_array_type& offsets)
811 {
812  using Kokkos::parallel_reduce;
813  typedef typename DT::execution_space XS;
814  typedef typename PackTraits<size_t>::input_array_type::size_type size_type;
815  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
816 
817  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
818  const size_type N = num_packets_per_lid.extent(0);
819 
820  int errors = 0;
821  parallel_reduce ("Setup row pointers for remotes",
822  range_policy (0, N),
823  KOKKOS_LAMBDA (const size_t i, int& k_error) {
824  typedef typename std::remove_reference< decltype( tgt_rowptr(0) ) >::type atomic_incr_type;
825  const size_t num_bytes = num_packets_per_lid(i);
826  const size_t offset = offsets(i);
827  const size_t num_ent = unpackRowCount<LO> (imports.data(), offset, num_bytes);
828  if (num_ent == InvalidNum) {
829  k_error += 1;
830  }
831  Kokkos::atomic_fetch_add (&tgt_rowptr (import_lids(i)), atomic_incr_type(num_ent));
832  }, errors);
833  return errors;
834 }
835 
836 // Convert array of row lengths to a CRS pointer array
837 template<class DT>
838 void
839 makeCrsRowPtrFromLengths(
840  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
841  const Kokkos::View<size_t*,DT>& new_start_row)
842 {
843  using Kokkos::parallel_scan;
844  typedef typename DT::execution_space XS;
845  typedef typename Kokkos::View<size_t*,DT>::size_type size_type;
846  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
847  const size_type N = new_start_row.extent(0);
848  parallel_scan(range_policy(0, N),
849  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
850  auto cur_val = tgt_rowptr(i);
851  if (final) {
852  tgt_rowptr(i) = update;
853  new_start_row(i) = tgt_rowptr(i);
854  }
855  update += cur_val;
856  }
857  );
858 }
859 
860 template<class LocalMatrix, class LocalMap>
861 void
862 copyDataFromSameIDs(
863  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
864  const typename PackTraits<int>::output_array_type& tgt_pids,
865  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
866  const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
867  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
868  const typename PackTraits<int>::input_array_type& src_pids,
869  const LocalMatrix& local_matrix,
870  const LocalMap& local_col_map,
871  const size_t num_same_ids,
872  const int my_pid)
873 {
874  using Kokkos::parallel_for;
875  typedef typename LocalMap::device_type DT;
876  typedef typename LocalMap::local_ordinal_type LO;
877  typedef typename DT::execution_space XS;
878  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
879 
880  parallel_for(range_policy(0, num_same_ids),
881  KOKKOS_LAMBDA(const size_t i) {
882  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
883 
884  const LO src_lid = static_cast<LO>(i);
885  size_t src_row = local_matrix.graph.row_map(src_lid);
886 
887  const LO tgt_lid = static_cast<LO>(i);
888  const size_t tgt_row = tgt_rowptr(tgt_lid);
889 
890  const size_t nsr = local_matrix.graph.row_map(src_lid+1)
891  - local_matrix.graph.row_map(src_lid);
892  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
893 
894  for (size_t j=local_matrix.graph.row_map(src_lid);
895  j<local_matrix.graph.row_map(src_lid+1); ++j) {
896  LO src_col = local_matrix.graph.entries(j);
897  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
898  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
899  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
900  }
901  }
902  );
903 }
904 
905 template<class LocalMatrix, class LocalMap>
906 void
907 copyDataFromPermuteIDs(
908  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
909  const typename PackTraits<int>::output_array_type& tgt_pids,
910  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
911  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
912  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
913  const typename PackTraits<int>::input_array_type& src_pids,
914  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
915  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
916  const LocalMatrix& local_matrix,
917  const LocalMap& local_col_map,
918  const int my_pid)
919 {
920  using Kokkos::parallel_for;
921  typedef typename LocalMap::device_type DT;
922  typedef typename LocalMap::local_ordinal_type LO;
923  typedef typename DT::execution_space XS;
924  typedef typename PackTraits<LO>::input_array_type::size_type size_type;
925  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
926 
927  const size_type num_permute_to_lids = permute_to_lids.extent(0);
928 
929  parallel_for(range_policy(0, num_permute_to_lids),
930  KOKKOS_LAMBDA(const size_t i) {
931  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
932 
933  const LO src_lid = permute_from_lids(i);
934  const size_t src_row = local_matrix.graph.row_map(src_lid);
935 
936  const LO tgt_lid = permute_to_lids(i);
937  const size_t tgt_row = tgt_rowptr(tgt_lid);
938 
939  size_t nsr = local_matrix.graph.row_map(src_lid+1)
940  - local_matrix.graph.row_map(src_lid);
941  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
942 
943  for (size_t j=local_matrix.graph.row_map(src_lid);
944  j<local_matrix.graph.row_map(src_lid+1); ++j) {
945  LO src_col = local_matrix.graph.entries(j);
946  tgt_vals(tgt_row + j - src_row) = local_matrix.values(j);
947  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
948  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
949  }
950  }
951  );
952 }
953 
954 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
955 int
956 unpackAndCombineIntoCrsArrays2(
957  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
958  const typename PackTraits<int>::output_array_type& tgt_pids,
959  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
960  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
961  const typename PackTraits<size_t>::input_array_type& offsets,
962  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
963  const Kokkos::View<const char*, BufferDeviceType>& imports,
964  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
965  const LocalMatrix& /* local_matrix */,
966  const LocalMap /*& local_col_map*/,
967  const int my_pid,
968  const size_t bytes_per_value)
969 {
970  using Kokkos::View;
971  using Kokkos::subview;
972  using Kokkos::MemoryUnmanaged;
973  using Kokkos::parallel_reduce;
974  using Kokkos::atomic_fetch_add;
976  typedef typename LocalMap::device_type DT;
977  typedef typename LocalMap::local_ordinal_type LO;
978  typedef typename LocalMap::global_ordinal_type GO;
979  typedef typename LocalMatrix::value_type ST;
980  typedef typename DT::execution_space XS;
981  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
982  typedef typename Kokkos::pair<size_type, size_type> slice;
983  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_type> > range_policy;
984 
985  typedef View<int*,DT, MemoryUnmanaged> pids_out_type;
986  typedef View<GO*, DT, MemoryUnmanaged> gids_out_type;
987  typedef View<ST*, DT, MemoryUnmanaged> vals_out_type;
988 
989  const size_t InvalidNum = OrdinalTraits<size_t>::invalid();
990 
991  int errors = 0;
992  const size_type num_import_lids = import_lids.size();
993 
994  // RemoteIDs: Loop structure following UnpackAndCombine
995  parallel_reduce ("Unpack and combine into CRS",
996  range_policy (0, num_import_lids),
997  KOKKOS_LAMBDA (const size_t i, int& k_error) {
998  typedef typename std::remove_reference< decltype( new_start_row(0) ) >::type atomic_incr_type;
999  const size_t num_bytes = num_packets_per_lid(i);
1000  const size_t offset = offsets(i);
1001  if (num_bytes == 0) {
1002  // Empty buffer means that the row is empty.
1003  return;
1004  }
1005  size_t num_ent = unpackRowCount<LO>(imports.data(), offset, num_bytes);
1006  if (num_ent == InvalidNum) {
1007  k_error += 1;
1008  return;
1009  }
1010  const LO lcl_row = import_lids(i);
1011  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
1012  const size_t end_row = start_row + num_ent;
1013 
1014  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
1015  vals_out_type vals_out = subview(tgt_vals, slice(start_row, end_row));
1016  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
1017 
1018  k_error += unpackRow<ST,LO,GO>(gids_out, pids_out, vals_out,
1019  imports.data(), offset, num_bytes,
1020  num_ent, bytes_per_value);
1021 
1022  // Correct target PIDs.
1023  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
1024  const int pid = pids_out(j);
1025  pids_out(j) = (pid != my_pid) ? pid : -1;
1026  }
1027  }, errors);
1028 
1029  return errors;
1030 }
1031 
1032 template<typename LocalMatrix, typename LocalMap, typename BufferDeviceType>
1033 void
1035  const LocalMatrix & local_matrix,
1036  const LocalMap & local_col_map,
1037  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& import_lids,
1038  const Kokkos::View<const char*, BufferDeviceType>& imports,
1039  const Kokkos::View<const size_t*, BufferDeviceType>& num_packets_per_lid,
1040  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_to_lids,
1041  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& permute_from_lids,
1042  const typename PackTraits<size_t>::output_array_type& tgt_rowptr,
1043  const typename PackTraits<typename LocalMap::global_ordinal_type>::output_array_type& tgt_colind,
1044  const typename PackTraits<typename LocalMatrix::value_type>::output_array_type& tgt_vals,
1045  const typename PackTraits<int>::input_array_type& src_pids,
1046  const typename PackTraits<int>::output_array_type& tgt_pids,
1047  const size_t num_same_ids,
1048  const size_t tgt_num_rows,
1049  const size_t tgt_num_nonzeros,
1050  const int my_tgt_pid,
1051  const size_t bytes_per_value)
1052 {
1053  using Kokkos::View;
1054  using Kokkos::subview;
1055  using Kokkos::parallel_for;
1056  using Kokkos::MemoryUnmanaged;
1058  typedef typename LocalMap::device_type DT;
1059  typedef typename LocalMap::local_ordinal_type LO;
1060  typedef typename DT::execution_space XS;
1061  typedef typename Kokkos::View<LO*, DT>::size_type size_type;
1062  typedef Kokkos::RangePolicy<XS, Kokkos::IndexType<size_t> > range_policy;
1063  typedef BufferDeviceType BDT;
1064 
1065  const char prefix[] = "unpackAndCombineIntoCrsArrays: ";
1066 
1067  const size_t N = tgt_num_rows;
1068 
1069  // In the case of reduced communicators, the sourceMatrix won't have
1070  // the right "my_pid", so thus we have to supply it.
1071  const int my_pid = my_tgt_pid;
1072 
1073  // Zero the rowptr
1074  parallel_for(range_policy(0, N+1),
1075  KOKKOS_LAMBDA(const size_t i) {
1076  tgt_rowptr(i) = 0;
1077  }
1078  );
1079 
1080  // same IDs: Always first, always in the same place
1081  parallel_for(range_policy(0, num_same_ids),
1082  KOKKOS_LAMBDA(const size_t i) {
1083  const LO tgt_lid = static_cast<LO>(i);
1084  const LO src_lid = static_cast<LO>(i);
1085  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1086  - local_matrix.graph.row_map(src_lid);
1087  }
1088  );
1089 
1090  // Permute IDs: Still local, but reordered
1091  const size_type num_permute_to_lids = permute_to_lids.extent(0);
1092  parallel_for(range_policy(0, num_permute_to_lids),
1093  KOKKOS_LAMBDA(const size_t i) {
1094  const LO tgt_lid = permute_to_lids(i);
1095  const LO src_lid = permute_from_lids(i);
1096  tgt_rowptr(tgt_lid) = local_matrix.graph.row_map(src_lid+1)
1097  - local_matrix.graph.row_map(src_lid);
1098  }
1099  );
1100 
1101  // Get the offsets from the number of packets per LID
1102  const size_type num_import_lids = import_lids.extent(0);
1103  View<size_t*, DT> offsets("offsets", num_import_lids+1);
1104  computeOffsetsFromCounts(offsets, num_packets_per_lid);
1105 
1106 #ifdef HAVE_TPETRA_DEBUG
1107  {
1108  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
1109  const bool condition =
1110  nth_offset_h != static_cast<size_t>(imports.extent (0));
1111  TEUCHOS_TEST_FOR_EXCEPTION
1112  (condition, std::logic_error, prefix
1113  << "The final offset in bytes " << nth_offset_h
1114  << " != imports.size() = " << imports.extent(0)
1115  << ". Please report this bug to the Tpetra developers.");
1116  }
1117 #endif // HAVE_TPETRA_DEBUG
1118 
1119  // Setup row pointers for remotes
1120  int k_error =
1121  setupRowPointersForRemotes<LO,DT,BDT>(tgt_rowptr,
1122  import_lids, imports, num_packets_per_lid, offsets);
1123  TEUCHOS_TEST_FOR_EXCEPTION(k_error != 0, std::logic_error, prefix
1124  << " Error transferring data to target row pointers. "
1125  "Please report this bug to the Tpetra developers.");
1126 
1127  // If multiple processes contribute to the same row, we may need to
1128  // update row offsets. This tracks that.
1129  View<size_t*, DT> new_start_row ("new_start_row", N+1);
1130 
1131  // Turn row length into a real CRS row pointer
1132  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
1133 
1134  // SameIDs: Copy the data over
1135  copyDataFromSameIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1136  tgt_rowptr, src_pids, local_matrix, local_col_map, num_same_ids, my_pid);
1137 
1138  copyDataFromPermuteIDs(tgt_colind, tgt_pids, tgt_vals, new_start_row,
1139  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
1140  local_matrix, local_col_map, my_pid);
1141 
1142  if (imports.extent(0) <= 0) {
1143  return;
1144  }
1145 
1146  int unpack_err = unpackAndCombineIntoCrsArrays2(tgt_colind, tgt_pids,
1147  tgt_vals, new_start_row, offsets, import_lids, imports, num_packets_per_lid,
1148  local_matrix, local_col_map, my_pid, bytes_per_value);
1149  TEUCHOS_TEST_FOR_EXCEPTION(
1150  unpack_err != 0, std::logic_error, prefix << "unpack loop failed. This "
1151  "should never happen. Please report this bug to the Tpetra developers.");
1152 
1153  return;
1154 }
1155 
1156 } // namespace UnpackAndCombineCrsMatrixImpl
1157 
1197 template<typename ST, typename LO, typename GO, typename Node>
1198 void
1200  const CrsMatrix<ST, LO, GO, Node>& sourceMatrix,
1201  const Teuchos::ArrayView<const char>& imports,
1202  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1203  const Teuchos::ArrayView<const LO>& importLIDs,
1204  size_t /* constantNumPackets */,
1205  CombineMode combineMode)
1206 {
1207  using Kokkos::View;
1208  typedef typename Node::device_type device_type;
1209  typedef typename CrsMatrix<ST, LO, GO, Node>::local_matrix_device_type local_matrix_device_type;
1210  static_assert (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1211  "Node::device_type and LocalMatrix::device_type must be the same.");
1212 
1213  // Convert all Teuchos::Array to Kokkos::View.
1214  device_type outputDevice;
1215 
1216  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
1217  // them to device. Since unpacking is done directly in to the local matrix
1218  // (lclMatrix), no copying needs to be performed after unpacking.
1219  auto num_packets_per_lid_d =
1220  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1221  numPacketsPerLID.size(), true, "num_packets_per_lid");
1222 
1223  auto import_lids_d =
1224  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1225  importLIDs.size(), true, "import_lids");
1226 
1227  auto imports_d =
1228  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1229  imports.size(), true, "imports");
1230 
1231  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1232  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1233 
1234 //KDDKDD This loop doesn't appear to do anything; what is it?
1235 //KDDKDD for (int i=0; i<importLIDs.size(); i++)
1236 //KDDKDD {
1237 //KDDKDD auto lclRow = importLIDs[i];
1238 //KDDKDD Teuchos::ArrayView<const LO> A_indices;
1239 //KDDKDD Teuchos::ArrayView<const ST> A_values;
1240 //KDDKDD sourceMatrix.getLocalRowView(lclRow, A_indices, A_values);
1241 //KDDKDD }
1242  // Now do the actual unpack!
1243  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix(
1244  local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1245  import_lids_d, combineMode);
1246 
1247 }
1248 
1249 template<typename ST, typename LO, typename GO, typename NT>
1250 void
1251 unpackCrsMatrixAndCombineNew(
1252  const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
1253  Kokkos::DualView<char*,
1255  Kokkos::DualView<size_t*,
1256  typename DistObject<char, LO, GO, NT>::buffer_device_type> numPacketsPerLID,
1257  const Kokkos::DualView<const LO*,
1259  const size_t /* constantNumPackets */,
1260  const CombineMode combineMode)
1261 {
1262  using Kokkos::View;
1263  using crs_matrix_type = CrsMatrix<ST, LO, GO, NT>;
1264  using dist_object_type = DistObject<char, LO, GO, NT>;
1265  using device_type = typename crs_matrix_type::device_type;
1266  using local_matrix_device_type = typename crs_matrix_type::local_matrix_device_type;
1267  using buffer_device_type = typename dist_object_type::buffer_device_type;
1268 
1269  static_assert
1270  (std::is_same<device_type, typename local_matrix_device_type::device_type>::value,
1271  "crs_matrix_type::device_type and local_matrix_device_type::device_type "
1272  "must be the same.");
1273 
1274  if (numPacketsPerLID.need_sync_device()) {
1275  numPacketsPerLID.sync_device ();
1276  }
1277  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1278 
1279  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1280  auto import_lids_d = importLIDs.view_device ();
1281 
1282  if (imports.need_sync_device()) {
1283  imports.sync_device ();
1284  }
1285  auto imports_d = imports.view_device ();
1286 
1287  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1288  auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
1289  typedef decltype (local_col_map) local_map_type;
1290 
1291  UnpackAndCombineCrsMatrixImpl::unpackAndCombineIntoCrsMatrix<
1292  local_matrix_device_type,
1293  local_map_type,
1294  buffer_device_type
1295  > (local_matrix, local_col_map, imports_d, num_packets_per_lid_d,
1296  import_lids_d, combineMode);
1297 }
1298 
1345 //
1354 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1355 size_t
1357  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> & sourceMatrix,
1358  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1359  const Teuchos::ArrayView<const char> &imports,
1360  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1361  size_t /* constantNumPackets */,
1362  CombineMode /* combineMode */,
1363  size_t numSameIDs,
1364  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1365  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1366 {
1367  using Kokkos::MemoryUnmanaged;
1368  using Kokkos::View;
1369  typedef typename Node::device_type DT;
1371  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1372 
1373  TEUCHOS_TEST_FOR_EXCEPTION
1374  (permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1375  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size () << " != "
1376  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1377  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1378  // process, then the matrix is neither locally nor globally indexed.
1379  const bool locallyIndexed = sourceMatrix.isLocallyIndexed ();
1380  TEUCHOS_TEST_FOR_EXCEPTION
1381  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1382  "CrsMatrix 'sourceMatrix' must be locally indexed.");
1383  TEUCHOS_TEST_FOR_EXCEPTION
1384  (importLIDs.size () != numPacketsPerLID.size (), std::invalid_argument,
1385  prefix << "importLIDs.size() = " << importLIDs.size () << " != "
1386  "numPacketsPerLID.size() = " << numPacketsPerLID.size () << ".");
1387 
1388  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
1389  auto permute_from_lids_d =
1391  permuteFromLIDs.getRawPtr (),
1392  permuteFromLIDs.size (), true,
1393  "permute_from_lids");
1394  auto imports_d =
1396  imports.getRawPtr (),
1397  imports.size (), true,
1398  "imports");
1399  auto num_packets_per_lid_d =
1401  numPacketsPerLID.getRawPtr (),
1402  numPacketsPerLID.size (), true,
1403  "num_packets_per_lid");
1404 
1406  local_matrix, permute_from_lids_d, imports_d,
1407  num_packets_per_lid_d, numSameIDs);
1408 }
1409 
1424 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
1425 void
1428  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1429  const Teuchos::ArrayView<const char>& imports,
1430  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1431  const size_t /* constantNumPackets */,
1432  const CombineMode /* combineMode */,
1433  const size_t numSameIDs,
1434  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1435  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1436  size_t TargetNumRows,
1437  size_t TargetNumNonzeros,
1438  const int MyTargetPID,
1439  const Teuchos::ArrayView<size_t>& CRS_rowptr,
1440  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1441  const Teuchos::ArrayView<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::impl_scalar_type>& CRS_vals,
1442  const Teuchos::ArrayView<const int>& SourcePids,
1443  Teuchos::Array<int>& TargetPids)
1444 {
1445  using execution_space = typename Node::execution_space;
1447 
1448  using Kokkos::View;
1449  using Kokkos::deep_copy;
1450 
1451  using Teuchos::ArrayView;
1452  using Teuchos::outArg;
1453  using Teuchos::REDUCE_MAX;
1454  using Teuchos::reduceAll;
1455 
1456  typedef LocalOrdinal LO;
1457 
1458  typedef typename Node::device_type DT;
1459 
1461  typedef typename matrix_type::impl_scalar_type ST;
1462  typedef typename ArrayView<const LO>::size_type size_type;
1463 
1464  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1465 
1466  TEUCHOS_TEST_FOR_EXCEPTION(
1467  TargetNumRows + 1 != static_cast<size_t> (CRS_rowptr.size ()),
1468  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1469  CRS_rowptr.size () << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1470 
1471  TEUCHOS_TEST_FOR_EXCEPTION(
1472  permuteToLIDs.size () != permuteFromLIDs.size (), std::invalid_argument,
1473  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size ()
1474  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size () << ".");
1475  const size_type numImportLIDs = importLIDs.size ();
1476 
1477  TEUCHOS_TEST_FOR_EXCEPTION(
1478  numImportLIDs != numPacketsPerLID.size (), std::invalid_argument,
1479  prefix << "importLIDs.size() = " << numImportLIDs << " != "
1480  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1481 
1482  // Preseed TargetPids with -1 for local
1483  if (static_cast<size_t> (TargetPids.size ()) != TargetNumNonzeros) {
1484  TargetPids.resize (TargetNumNonzeros);
1485  }
1486  TargetPids.assign (TargetNumNonzeros, -1);
1487 
1488  // Grab pointers for sourceMatrix
1489  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
1490  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
1491 
1492  // Convert input arrays to Kokkos::View
1493  DT outputDevice;
1494  auto import_lids_d =
1495  create_mirror_view_from_raw_host_array(outputDevice, importLIDs.getRawPtr(),
1496  importLIDs.size(), true, "import_lids");
1497 
1498  auto imports_d =
1499  create_mirror_view_from_raw_host_array(outputDevice, imports.getRawPtr(),
1500  imports.size(), true, "imports");
1501 
1502  auto num_packets_per_lid_d =
1503  create_mirror_view_from_raw_host_array(outputDevice, numPacketsPerLID.getRawPtr(),
1504  numPacketsPerLID.size(), true, "num_packets_per_lid");
1505 
1506  auto permute_from_lids_d =
1507  create_mirror_view_from_raw_host_array(outputDevice, permuteFromLIDs.getRawPtr(),
1508  permuteFromLIDs.size(), true, "permute_from_lids");
1509 
1510  auto permute_to_lids_d =
1511  create_mirror_view_from_raw_host_array(outputDevice, permuteToLIDs.getRawPtr(),
1512  permuteToLIDs.size(), true, "permute_to_lids");
1513 
1514  auto crs_rowptr_d =
1515  create_mirror_view_from_raw_host_array(outputDevice, CRS_rowptr.getRawPtr(),
1516  CRS_rowptr.size(), true, "crs_rowptr");
1517 
1518  auto crs_colind_d =
1519  create_mirror_view_from_raw_host_array(outputDevice, CRS_colind.getRawPtr(),
1520  CRS_colind.size(), true, "crs_colidx");
1521 
1522 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1523  static_assert (! std::is_same<
1524  typename std::remove_const<
1525  typename std::decay<
1526  decltype (CRS_vals)
1527  >::type::value_type
1528  >::type,
1529  std::complex<double> >::value,
1530  "CRS_vals::value_type is std::complex<double>; this should never happen"
1531  ", since std::complex does not work in Kokkos::View objects.");
1532 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1533 
1534  auto crs_vals_d =
1535  create_mirror_view_from_raw_host_array(outputDevice, CRS_vals.getRawPtr(),
1536  CRS_vals.size(), true, "crs_vals");
1537 
1538 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1539  static_assert (! std::is_same<
1540  typename decltype (crs_vals_d)::non_const_value_type,
1541  std::complex<double> >::value,
1542  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1543  "never happen, since std::complex does not work in Kokkos::View objects.");
1544 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1545 
1546  auto src_pids_d =
1547  create_mirror_view_from_raw_host_array(outputDevice, SourcePids.getRawPtr(),
1548  SourcePids.size(), true, "src_pids");
1549 
1550  auto tgt_pids_d =
1551  create_mirror_view_from_raw_host_array(outputDevice, TargetPids.getRawPtr(),
1552  TargetPids.size(), true, "tgt_pids");
1553 
1554  size_t bytes_per_value = 0;
1556  // assume that ST is default constructible
1557  bytes_per_value = PackTraits<ST>::packValueCount(ST());
1558  }
1559  else {
1560  // Since the packed data come from the source matrix, we can use the source
1561  // matrix to get the number of bytes per Scalar value stored in the matrix.
1562  // This assumes that all Scalar values in the source matrix require the same
1563  // number of bytes. If the source matrix has no entries on the calling
1564  // process, then we hope that some process does have some idea how big
1565  // a Scalar value is. Of course, if no processes have any entries, then no
1566  // values should be packed (though this does assume that in our packing
1567  // scheme, rows with zero entries take zero bytes).
1568  size_t bytes_per_value_l = 0;
1569  if (local_matrix.values.extent(0) > 0) {
1570  const ST& val = local_matrix.values(0);
1571  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1572  } else {
1573  const ST& val = crs_vals_d(0);
1574  bytes_per_value_l = PackTraits<ST>::packValueCount(val);
1575  }
1576  Teuchos::reduceAll<int, size_t>(*(sourceMatrix.getComm()),
1577  Teuchos::REDUCE_MAX,
1578  bytes_per_value_l,
1579  outArg(bytes_per_value));
1580  }
1581 
1582 #ifdef HAVE_TPETRA_INST_COMPLEX_DOUBLE
1583  static_assert (! std::is_same<
1584  typename decltype (crs_vals_d)::non_const_value_type,
1585  std::complex<double> >::value,
1586  "crs_vals_d::non_const_value_type is std::complex<double>; this should "
1587  "never happen, since std::complex does not work in Kokkos::View objects.");
1588 #endif // HAVE_TPETRA_INST_COMPLEX_DOUBLE
1589 
1591  local_matrix, local_col_map, import_lids_d, imports_d,
1592  num_packets_per_lid_d, permute_to_lids_d, permute_from_lids_d,
1593  crs_rowptr_d, crs_colind_d, crs_vals_d, src_pids_d, tgt_pids_d,
1594  numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID,
1595  bytes_per_value);
1596 
1597  // Copy outputs back to host
1598  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1599  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1600  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1601  deep_copy(execution_space(), crs_rowptr_h, crs_rowptr_d);
1602 
1603  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1604  CRS_colind.getRawPtr(), CRS_colind.size());
1605  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1606  deep_copy(execution_space(), crs_colind_h, crs_colind_d);
1607 
1608  typename decltype(crs_vals_d)::HostMirror crs_vals_h(
1609  CRS_vals.getRawPtr(), CRS_vals.size());
1610  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1611  deep_copy(execution_space(), crs_vals_h, crs_vals_d);
1612 
1613  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1614  TargetPids.getRawPtr(), TargetPids.size());
1615  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
1616  deep_copy(execution_space(), tgt_pids_h, tgt_pids_d);
1617 }
1618 
1619 } // namespace Details
1620 } // namespace Tpetra
1621 
1622 #define TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_INSTANT( ST, LO, GO, NT ) \
1623  template void \
1624  Details::unpackCrsMatrixAndCombine<ST, LO, GO, NT> ( \
1625  const CrsMatrix<ST, LO, GO, NT>&, \
1626  const Teuchos::ArrayView<const char>&, \
1627  const Teuchos::ArrayView<const size_t>&, \
1628  const Teuchos::ArrayView<const LO>&, \
1629  size_t, \
1630  CombineMode); \
1631  template void \
1632  Details::unpackCrsMatrixAndCombineNew<ST, LO, GO, NT> ( \
1633  const CrsMatrix<ST, LO, GO, NT>&, \
1634  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1635  Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>, \
1636  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1637  const size_t, \
1638  const CombineMode); \
1639  template void \
1640  Details::unpackAndCombineIntoCrsArrays<ST, LO, GO, NT> ( \
1641  const CrsMatrix<ST, LO, GO, NT> &, \
1642  const Teuchos::ArrayView<const LO>&, \
1643  const Teuchos::ArrayView<const char>&, \
1644  const Teuchos::ArrayView<const size_t>&, \
1645  const size_t, \
1646  const CombineMode, \
1647  const size_t, \
1648  const Teuchos::ArrayView<const LO>&, \
1649  const Teuchos::ArrayView<const LO>&, \
1650  size_t, \
1651  size_t, \
1652  const int, \
1653  const Teuchos::ArrayView<size_t>&, \
1654  const Teuchos::ArrayView<GO>&, \
1655  const Teuchos::ArrayView<CrsMatrix<ST, LO, GO, NT>::impl_scalar_type>&, \
1656  const Teuchos::ArrayView<const int>&, \
1657  Teuchos::Array<int>&); \
1658  template size_t \
1659  Details::unpackAndCombineWithOwningPIDsCount<ST, LO, GO, NT> ( \
1660  const CrsMatrix<ST, LO, GO, NT> &, \
1661  const Teuchos::ArrayView<const LO> &, \
1662  const Teuchos::ArrayView<const char> &, \
1663  const Teuchos::ArrayView<const size_t>&, \
1664  size_t, \
1665  CombineMode, \
1666  size_t, \
1667  const Teuchos::ArrayView<const LO>&, \
1668  const Teuchos::ArrayView<const LO>&);
1669 
1670 #endif // TPETRA_DETAILS_UNPACKCRSMATRIXANDCOMBINE_DEF_HPP
Kokkos::parallel_reduce functor to determine the number of entries (to unpack) in a KokkosSparse::Crs...
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for &quot;invalid&quot; (flag) values of integer types...
static KOKKOS_INLINE_FUNCTION size_t unpackValue(T &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
KOKKOS_INLINE_FUNCTION LocalOrdinal getLocalElement(const GlobalOrdinal globalIndex) const
Get the local index corresponding to the given global index. (device only)
Traits class for packing / unpacking data of type T.
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static size_t hierarchicalUnpackTeamSize()
Size of team for hierarchical unpacking.
void unpackCrsMatrixAndCombine(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, const Teuchos::ArrayView< const char > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, CombineMode combineMode)
Unpack the imported column indices and values, and combine into matrix.
&quot;Local&quot; part of Map suitable for Kokkos kernels.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don&#39;t currently exist.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
static size_t hierarchicalUnpackBatchSize()
Size of batch for hierarchical unpacking.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
Kokkos::View< value_type *, Kokkos::AnonymousSpace > output_array_type
The type of an output array of value_type.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
DeviceType device_type
The device type.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > unpackArray(value_type outBuf[], const char inBuf[], const size_t numEnt)
Unpack numEnt value_type entries from the given input buffer of bytes, to the given output buffer of ...
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Base class for distributed Tpetra objects that support data redistribution.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...