Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_packCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
11 #define TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
12 
13 #include "TpetraCore_config.h"
14 #include "Teuchos_Array.hpp"
15 #include "Teuchos_ArrayView.hpp"
24 #include <memory>
25 #include <sstream>
26 #include <stdexcept>
27 #include <string>
28 
51 
52 namespace Tpetra {
53 
54 //
55 // Users must never rely on anything in the Details namespace.
56 //
57 namespace Details {
58 
59 namespace PackCrsMatrixImpl {
67 template<class OutputOffsetsViewType,
68  class CountsViewType,
69  class InputOffsetsViewType,
70  class InputLocalRowIndicesViewType,
71  class InputLocalRowPidsViewType,
72  const bool debug =
73 #ifdef HAVE_TPETRA_DEBUG
74  true
75 #else
76  false
77 #endif // HAVE_TPETRA_DEBUG
78  >
80 public:
81  typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type;
82  typedef typename CountsViewType::non_const_value_type count_type;
83  typedef typename InputOffsetsViewType::non_const_value_type input_offset_type;
84  typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type;
85  typedef typename InputLocalRowPidsViewType::non_const_value_type local_row_pid_type;
86  // output Views drive where execution happens.
87  typedef typename OutputOffsetsViewType::device_type device_type;
88  static_assert (std::is_same<typename CountsViewType::device_type::execution_space,
89  typename device_type::execution_space>::value,
90  "OutputOffsetsViewType and CountsViewType must have the same execution space.");
91  static_assert (Kokkos::is_view<OutputOffsetsViewType>::value,
92  "OutputOffsetsViewType must be a Kokkos::View.");
93  static_assert (std::is_same<typename OutputOffsetsViewType::value_type, output_offset_type>::value,
94  "OutputOffsetsViewType must be a nonconst Kokkos::View.");
95  static_assert (std::is_integral<output_offset_type>::value,
96  "The type of each entry of OutputOffsetsViewType must be a built-in integer type.");
97  static_assert (Kokkos::is_view<CountsViewType>::value,
98  "CountsViewType must be a Kokkos::View.");
99  static_assert (std::is_same<typename CountsViewType::value_type, output_offset_type>::value,
100  "CountsViewType must be a nonconst Kokkos::View.");
101  static_assert (std::is_integral<count_type>::value,
102  "The type of each entry of CountsViewType must be a built-in integer type.");
103  static_assert (Kokkos::is_view<InputOffsetsViewType>::value,
104  "InputOffsetsViewType must be a Kokkos::View.");
105  static_assert (std::is_integral<input_offset_type>::value,
106  "The type of each entry of InputOffsetsViewType must be a built-in integer type.");
107  static_assert (Kokkos::is_view<InputLocalRowIndicesViewType>::value,
108  "InputLocalRowIndicesViewType must be a Kokkos::View.");
109  static_assert (std::is_integral<local_row_index_type>::value,
110  "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type.");
111 
112  NumPacketsAndOffsetsFunctor (const OutputOffsetsViewType& outputOffsets,
113  const CountsViewType& counts,
114  const InputOffsetsViewType& rowOffsets,
115  const InputLocalRowIndicesViewType& lclRowInds,
116  const InputLocalRowPidsViewType& lclRowPids,
117  const count_type sizeOfLclCount,
118  const count_type sizeOfGblColInd,
119  const count_type sizeOfPid,
120  const count_type sizeOfValue) :
121  outputOffsets_ (outputOffsets),
122  counts_ (counts),
123  rowOffsets_ (rowOffsets),
124  lclRowInds_ (lclRowInds),
125  lclRowPids_ (lclRowPids),
126  sizeOfLclCount_ (sizeOfLclCount),
127  sizeOfGblColInd_ (sizeOfGblColInd),
128  sizeOfPid_ (sizeOfPid),
129  sizeOfValue_ (sizeOfValue),
130  error_ ("error") // don't forget this, or you'll get segfaults!
131  {
132  if (debug) {
133  const size_t numRowsToPack = static_cast<size_t> (lclRowInds_.extent (0));
134 
135  if (numRowsToPack != static_cast<size_t> (counts_.extent (0))) {
136  std::ostringstream os;
137  os << "lclRowInds.extent(0) = " << numRowsToPack
138  << " != counts.extent(0) = " << counts_.extent (0)
139  << ".";
140  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
141  }
142  if (static_cast<size_t> (numRowsToPack + 1) !=
143  static_cast<size_t> (outputOffsets_.extent (0))) {
144  std::ostringstream os;
145  os << "lclRowInds.extent(0) + 1 = " << (numRowsToPack + 1)
146  << " != outputOffsets.extent(0) = " << outputOffsets_.extent (0)
147  << ".";
148  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
149  }
150  }
151  }
152 
153  KOKKOS_INLINE_FUNCTION void
154  operator() (const local_row_index_type& curInd,
155  output_offset_type& update,
156  const bool final) const
157  {
158  if (debug) {
159  if (curInd < static_cast<local_row_index_type> (0)) {
160  error_ () = 1;
161  return;
162  }
163  }
164 
165  if (final) {
166  if (debug) {
167  if (curInd >= static_cast<local_row_index_type> (outputOffsets_.extent (0))) {
168  error_ () = 2;
169  return;
170  }
171  }
172  outputOffsets_(curInd) = update;
173  }
174 
175  if (curInd < static_cast<local_row_index_type> (counts_.extent (0))) {
176  const auto lclRow = lclRowInds_(curInd);
177  if (static_cast<size_t> (lclRow + 1) >= static_cast<size_t> (rowOffsets_.extent (0)) ||
178  static_cast<local_row_index_type> (lclRow) < static_cast<local_row_index_type> (0)) {
179  error_ () = 3;
180  return;
181  }
182  // count_type could differ from the type of each row offset.
183  // For example, row offsets might each be 64 bits, but if their
184  // difference always fits in 32 bits, we may then safely use a
185  // 32-bit count_type.
186  const count_type count =
187  static_cast<count_type> (rowOffsets_(lclRow+1) - rowOffsets_(lclRow));
188 
189  // We pack first the number of entries in the row, then that
190  // many global column indices, then that many pids (if any),
191  // then that many values. However, if the number of entries in
192  // the row is zero, we pack nothing.
193  const count_type numBytes = (count == 0) ?
194  static_cast<count_type> (0) :
195  sizeOfLclCount_ + count * (sizeOfGblColInd_ +
196  (lclRowPids_.size() > 0 ? sizeOfPid_ : 0) +
197  sizeOfValue_);
198 
199  if (final) {
200  counts_(curInd) = numBytes;
201  }
202  update += numBytes;
203  }
204  }
205 
206  // mfh 31 May 2017: Don't need init or join. If you have join, MUST
207  // have join both with and without volatile! Otherwise intrawarp
208  // joins are really slow on GPUs.
209 
211  int getError () const {
212  auto error_h = Kokkos::create_mirror_view (error_);
213  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
214  // Note: In the UVM case, this would otherwise be a no-op
215  // and thus not fence, so the value might not be correct on return
216  // In the non-UVM case, create_mirror_view will block for the allocation
217  Kokkos::deep_copy (error_h, error_);
218  return error_h ();
219  }
220 
221 private:
222  OutputOffsetsViewType outputOffsets_;
223  CountsViewType counts_;
224  typename InputOffsetsViewType::const_type rowOffsets_;
225  typename InputLocalRowIndicesViewType::const_type lclRowInds_;
226  typename InputLocalRowPidsViewType::const_type lclRowPids_;
227  count_type sizeOfLclCount_;
228  count_type sizeOfGblColInd_;
229  count_type sizeOfPid_;
230  count_type sizeOfValue_;
231  Kokkos::View<int, device_type> error_;
232 };
233 
243 template<class OutputOffsetsViewType,
244  class CountsViewType,
245  class InputOffsetsViewType,
246  class InputLocalRowIndicesViewType,
247  class InputLocalRowPidsViewType>
248 typename CountsViewType::non_const_value_type
249 computeNumPacketsAndOffsets (const OutputOffsetsViewType& outputOffsets,
250  const CountsViewType& counts,
251  const InputOffsetsViewType& rowOffsets,
252  const InputLocalRowIndicesViewType& lclRowInds,
253  const InputLocalRowPidsViewType& lclRowPids,
254  const typename CountsViewType::non_const_value_type sizeOfLclCount,
255  const typename CountsViewType::non_const_value_type sizeOfGblColInd,
256  const typename CountsViewType::non_const_value_type sizeOfPid,
257  const typename CountsViewType::non_const_value_type sizeOfValue)
258 {
259  typedef NumPacketsAndOffsetsFunctor<OutputOffsetsViewType,
260  CountsViewType, typename InputOffsetsViewType::const_type,
261  typename InputLocalRowIndicesViewType::const_type,
262  typename InputLocalRowPidsViewType::const_type> functor_type;
263  typedef typename CountsViewType::non_const_value_type count_type;
264  typedef typename OutputOffsetsViewType::size_type size_type;
265  typedef typename OutputOffsetsViewType::execution_space execution_space;
266  typedef typename functor_type::local_row_index_type LO;
267  typedef Kokkos::RangePolicy<execution_space, LO> range_type;
268  const char prefix[] = "computeNumPacketsAndOffsets: ";
269 
270  count_type count = 0;
271  const count_type numRowsToPack = lclRowInds.extent (0);
272 
273  if (numRowsToPack == 0) {
274  return count;
275  }
276  else {
277  TEUCHOS_TEST_FOR_EXCEPTION
278  (rowOffsets.extent (0) <= static_cast<size_type> (1),
279  std::invalid_argument, prefix << "There is at least one row to pack, "
280  "but the matrix has no rows. lclRowInds.extent(0) = " <<
281  numRowsToPack << ", but rowOffsets.extent(0) = " <<
282  rowOffsets.extent (0) << " <= 1.");
283  TEUCHOS_TEST_FOR_EXCEPTION
284  (outputOffsets.extent (0) !=
285  static_cast<size_type> (numRowsToPack + 1), std::invalid_argument,
286  prefix << "Output dimension does not match number of rows to pack. "
287  << "outputOffsets.extent(0) = " << outputOffsets.extent (0)
288  << " != lclRowInds.extent(0) + 1 = "
289  << static_cast<size_type> (numRowsToPack + 1) << ".");
290  TEUCHOS_TEST_FOR_EXCEPTION
291  (counts.extent (0) != numRowsToPack, std::invalid_argument,
292  prefix << "counts.extent(0) = " << counts.extent (0)
293  << " != numRowsToPack = " << numRowsToPack << ".");
294 
295  functor_type f (outputOffsets, counts, rowOffsets,
296  lclRowInds, lclRowPids, sizeOfLclCount,
297  sizeOfGblColInd, sizeOfPid, sizeOfValue);
298  Kokkos::parallel_scan (range_type (0, numRowsToPack + 1), f);
299 
300  // At least in debug mode, this functor checks for errors.
301  const int errCode = f.getError ();
302  TEUCHOS_TEST_FOR_EXCEPTION
303  (errCode != 0, std::runtime_error, prefix << "parallel_scan error code "
304  << errCode << " != 0.");
305 
306 #if 0
307  size_t total = 0;
308  for (LO k = 0; k < numRowsToPack; ++k) {
309  total += counts[k];
310  }
311  if (outputOffsets(numRowsToPack) != total) {
312  if (errStr.get () == NULL) {
313  errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
314  }
315  std::ostringstream& os = *errStr;
316  os << prefix
317  << "outputOffsets(numRowsToPack=" << numRowsToPack << ") "
318  << outputOffsets(numRowsToPack) << " != sum of counts = "
319  << total << "." << std::endl;
320  if (numRowsToPack != 0) {
321  // Only print the array if it's not too long.
322  if (numRowsToPack < static_cast<LO> (10)) {
323  os << "outputOffsets: [";
324  for (LO i = 0; i <= numRowsToPack; ++i) {
325  os << outputOffsets(i);
326  if (static_cast<LO> (i + 1) <= numRowsToPack) {
327  os << ",";
328  }
329  }
330  os << "]" << std::endl;
331  os << "counts: [";
332  for (LO i = 0; i < numRowsToPack; ++i) {
333  os << counts(i);
334  if (static_cast<LO> (i + 1) < numRowsToPack) {
335  os << ",";
336  }
337  }
338  os << "]" << std::endl;
339  }
340  else {
341  os << "outputOffsets(" << (numRowsToPack-1) << ") = "
342  << outputOffsets(numRowsToPack-1) << "." << std::endl;
343  }
344  }
345  count = outputOffsets(numRowsToPack);
346  return {false, errStr};
347  }
348 #endif // HAVE_TPETRA_DEBUG
349 
350  // Get last entry of outputOffsets, which is the sum of the entries
351  // of counts. Don't assume UVM.
352  using Tpetra::Details::getEntryOnHost;
353  return static_cast<count_type> (getEntryOnHost (outputOffsets,
354  numRowsToPack));
355  }
356 }
357 
373 template<class ST, class ColumnMap, class BufferDeviceType>
374 KOKKOS_FUNCTION
375 Kokkos::pair<int, size_t>
376 packCrsMatrixRow (const ColumnMap& col_map,
377  const Kokkos::View<char*, BufferDeviceType>& exports,
378  const typename PackTraits<typename ColumnMap::local_ordinal_type>::input_array_type& lids_in,
379  const typename PackTraits<int>::input_array_type& pids_in,
380  const typename PackTraits<ST>::input_array_type& vals_in,
381  const size_t offset,
382  const size_t num_ent,
383  const size_t num_bytes_per_value,
384  const bool pack_pids)
385 {
386  using Kokkos::subview;
387  using LO = typename ColumnMap::local_ordinal_type;
388  using GO = typename ColumnMap::global_ordinal_type;
389  using return_type = Kokkos::pair<int, size_t>;
390 
391  if (num_ent == 0) {
392  // Empty rows always take zero bytes, to ensure sparsity.
393  return return_type (0, 0);
394  }
395 
396  const LO num_ent_LO = static_cast<LO> (num_ent); // packValueCount wants this
397  const size_t num_ent_beg = offset;
398  const size_t num_ent_len = PackTraits<LO>::packValueCount (num_ent_LO);
399 
400  const size_t gids_beg = num_ent_beg + num_ent_len;
401  const size_t gids_len = num_ent * PackTraits<GO>::packValueCount (GO (0));
402 
403  const size_t pids_beg = gids_beg + gids_len;
404  const size_t pids_len = pack_pids ?
405  num_ent * PackTraits<int>::packValueCount (int (0)) :
406  static_cast<size_t> (0);
407 
408  const size_t vals_beg = gids_beg + gids_len + pids_len;
409  const size_t vals_len = num_ent * num_bytes_per_value;
410 
411  char* const num_ent_out = exports.data () + num_ent_beg;
412  char* const gids_out = exports.data () + gids_beg;
413  char* const pids_out = pack_pids ? exports.data () + pids_beg : NULL;
414  char* const vals_out = exports.data () + vals_beg;
415 
416  size_t num_bytes_out = 0;
417  int error_code = 0;
418  num_bytes_out += PackTraits<LO>::packValue (num_ent_out, num_ent_LO);
419 
420  {
421  // Copy column indices one at a time, so that we don't need
422  // temporary storage.
423  for (size_t k = 0; k < num_ent; ++k) {
424  const LO lid = lids_in[k];
425  const GO gid = col_map.getGlobalElement (lid);
426  num_bytes_out += PackTraits<GO>::packValue (gids_out, k, gid);
427  }
428  // Copy PIDs one at a time, so that we don't need temporary storage.
429  if (pack_pids) {
430  for (size_t k = 0; k < num_ent; ++k) {
431  const LO lid = lids_in[k];
432  const int pid = pids_in[lid];
433  num_bytes_out += PackTraits<int>::packValue (pids_out, k, pid);
434  }
435  }
436  const auto p =
437  PackTraits<ST>::packArray (vals_out, vals_in.data (), num_ent);
438  error_code += p.first;
439  num_bytes_out += p.second;
440  }
441 
442  if (error_code != 0) {
443  return return_type (10, num_bytes_out);
444  }
445 
446  const size_t expected_num_bytes =
447  num_ent_len + gids_len + pids_len + vals_len;
448  if (num_bytes_out != expected_num_bytes) {
449  return return_type (11, num_bytes_out);
450  }
451  return return_type (0, num_bytes_out);
452 }
453 
454 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
455 struct PackCrsMatrixFunctor {
456  typedef LocalMatrix local_matrix_device_type;
457  typedef LocalMap local_map_type;
458  typedef typename local_matrix_device_type::value_type ST;
459  typedef typename local_map_type::local_ordinal_type LO;
460  typedef typename local_map_type::global_ordinal_type GO;
461  typedef typename local_matrix_device_type::device_type DT;
462 
463  typedef Kokkos::View<const size_t*, BufferDeviceType>
464  num_packets_per_lid_view_type;
465  typedef Kokkos::View<const size_t*, BufferDeviceType> offsets_view_type;
466  typedef Kokkos::View<char*, BufferDeviceType> exports_view_type;
467  using export_lids_view_type = typename PackTraits<LO>::input_array_type;
468  using source_pids_view_type = typename PackTraits<int>::input_array_type;
469 
470  typedef typename num_packets_per_lid_view_type::non_const_value_type
471  count_type;
472  typedef typename offsets_view_type::non_const_value_type
473  offset_type;
474  typedef Kokkos::pair<int, LO> value_type;
475 
476  static_assert (std::is_same<LO, typename local_matrix_device_type::ordinal_type>::value,
477  "local_map_type::local_ordinal_type and "
478  "local_matrix_device_type::ordinal_type must be the same.");
479 
480  local_matrix_device_type local_matrix;
481  local_map_type local_col_map;
482  exports_view_type exports;
483  num_packets_per_lid_view_type num_packets_per_lid;
484  export_lids_view_type export_lids;
485  source_pids_view_type source_pids;
486  offsets_view_type offsets;
487  size_t num_bytes_per_value;
488  bool pack_pids;
489 
490  PackCrsMatrixFunctor (const local_matrix_device_type& local_matrix_in,
491  const local_map_type& local_col_map_in,
492  const exports_view_type& exports_in,
493  const num_packets_per_lid_view_type& num_packets_per_lid_in,
494  const export_lids_view_type& export_lids_in,
495  const source_pids_view_type& source_pids_in,
496  const offsets_view_type& offsets_in,
497  const size_t num_bytes_per_value_in,
498  const bool pack_pids_in) :
499  local_matrix (local_matrix_in),
500  local_col_map (local_col_map_in),
501  exports (exports_in),
502  num_packets_per_lid (num_packets_per_lid_in),
503  export_lids (export_lids_in),
504  source_pids (source_pids_in),
505  offsets (offsets_in),
506  num_bytes_per_value (num_bytes_per_value_in),
507  pack_pids (pack_pids_in)
508  {
509  const LO numRows = local_matrix_in.numRows ();
510  const LO rowMapDim =
511  static_cast<LO> (local_matrix.graph.row_map.extent (0));
512  TEUCHOS_TEST_FOR_EXCEPTION
513  (numRows != 0 && rowMapDim != numRows + static_cast<LO> (1),
514  std::logic_error, "local_matrix.graph.row_map.extent(0) = "
515  << rowMapDim << " != numRows (= " << numRows << " ) + 1.");
516  }
517 
518  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const
519  {
520  using ::Tpetra::Details::OrdinalTraits;
521  dst = Kokkos::make_pair (0, OrdinalTraits<LO>::invalid ());
522  }
523 
524  KOKKOS_INLINE_FUNCTION void
525  join (value_type& dst, const value_type& src) const
526  {
527  // `dst` should reflect the first (least) bad index and all other
528  // associated error codes and data, so prefer keeping it.
529  if (src.first != 0 && dst.first == 0) {
530  dst = src;
531  }
532  }
533 
534  KOKKOS_INLINE_FUNCTION
535  void operator() (const LO i, value_type& dst) const
536  {
537  const size_t offset = offsets[i];
538  const LO export_lid = export_lids[i];
539  const size_t buf_size = exports.size();
540  const size_t num_bytes = num_packets_per_lid(i);
541  const size_t num_ent =
542  static_cast<size_t> (local_matrix.graph.row_map[export_lid+1]
543  - local_matrix.graph.row_map[export_lid]);
544 
545  // Only pack this row's data if it has a nonzero number of
546  // entries. We can do this because receiving processes get the
547  // number of packets, and will know that zero packets means zero
548  // entries.
549  if (num_ent == 0) {
550  return;
551  }
552 
553  if (export_lid >= local_matrix.numRows ()) {
554  if (dst.first != 0) { // keep only the first error
555  dst = Kokkos::make_pair (1, i); // invalid row
556  }
557  return;
558  }
559  else if ((offset > buf_size || offset + num_bytes > buf_size)) {
560  if (dst.first != 0) { // keep only the first error
561  dst = Kokkos::make_pair (2, i); // out of bounds
562  }
563  return;
564  }
565 
566  // We can now pack this row
567 
568  // Since the matrix is locally indexed on the calling process, we
569  // have to use its column Map (which it _must_ have in this case)
570  // to convert to global indices.
571  const auto row_beg = local_matrix.graph.row_map[export_lid];
572  const auto row_end = local_matrix.graph.row_map[export_lid + 1];
573  auto vals_in = subview (local_matrix.values,
574  Kokkos::make_pair (row_beg, row_end));
575  auto lids_in = subview (local_matrix.graph.entries,
576  Kokkos::make_pair (row_beg, row_end));
577  typedef local_map_type LMT;
578  typedef BufferDeviceType BDT;
579  auto p = packCrsMatrixRow<ST, LMT, BDT> (local_col_map, exports, lids_in,
580  source_pids, vals_in, offset,
581  num_ent, num_bytes_per_value,
582  pack_pids);
583  int error_code_this_row = p.first;
584  size_t num_bytes_packed_this_row = p.second;
585  if (error_code_this_row != 0) {
586  if (dst.first != 0) { // keep only the first error
587  dst = Kokkos::make_pair (error_code_this_row, i); // bad pack
588  }
589  }
590  else if (num_bytes_packed_this_row != num_bytes) {
591  if (dst.first != 0) { // keep only the first error
592  dst = Kokkos::make_pair (3, i);
593  }
594  }
595  }
596 };
597 
605 template<class LocalMatrix, class LocalMap, class BufferDeviceType>
606 void
607 do_pack (const LocalMatrix& local_matrix,
608  const LocalMap& local_map,
609  const Kokkos::View<char*, BufferDeviceType>& exports,
610  const typename PackTraits<size_t>::input_array_type& num_packets_per_lid,
611  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& export_lids,
612  const typename PackTraits<int>::input_array_type& source_pids,
613  const Kokkos::View<const size_t*, BufferDeviceType>& offsets,
614  const size_t num_bytes_per_value,
615  const bool pack_pids)
616 {
617  using LO = typename LocalMap::local_ordinal_type;
618  using DT = typename LocalMatrix::device_type;
619  using range_type = Kokkos::RangePolicy<typename DT::execution_space, LO>;
620  const char prefix[] = "Tpetra::Details::do_pack: ";
621 
622  if (export_lids.extent (0) != 0) {
623  TEUCHOS_TEST_FOR_EXCEPTION
624  (static_cast<size_t> (offsets.extent (0)) !=
625  static_cast<size_t> (export_lids.extent (0) + 1),
626  std::invalid_argument, prefix << "offsets.extent(0) = "
627  << offsets.extent (0) << " != export_lids.extent(0) (= "
628  << export_lids.extent (0) << ") + 1.");
629  TEUCHOS_TEST_FOR_EXCEPTION
630  (export_lids.extent (0) != num_packets_per_lid.extent (0),
631  std::invalid_argument, prefix << "export_lids.extent(0) = " <<
632  export_lids.extent (0) << " != num_packets_per_lid.extent(0) = "
633  << num_packets_per_lid.extent (0) << ".");
634  // If exports has nonzero length at this point, then the matrix
635  // has at least one entry to pack. Thus, if packing process
636  // ranks, we had better have at least one process rank to pack.
637  TEUCHOS_TEST_FOR_EXCEPTION
638  (pack_pids && exports.extent (0) != 0 &&
639  source_pids.extent (0) == 0, std::invalid_argument, prefix <<
640  "pack_pids is true, and exports.extent(0) = " <<
641  exports.extent (0) << " != 0, meaning that we need to pack at "
642  "least one matrix entry, but source_pids.extent(0) = 0.");
643  }
644 
645  using pack_functor_type =
646  PackCrsMatrixFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
647  pack_functor_type f (local_matrix, local_map, exports,
648  num_packets_per_lid, export_lids,
649  source_pids, offsets, num_bytes_per_value,
650  pack_pids);
651 
652  typename pack_functor_type::value_type result;
653  range_type range (0, num_packets_per_lid.extent (0));
654  Kokkos::parallel_reduce (range, f, result);
655 
656  if (result.first != 0) {
657  // We can't deep_copy from AnonymousSpace Views, so we can't print
658  // out any information from them in case of error.
659  TEUCHOS_TEST_FOR_EXCEPTION
660  (true, std::runtime_error, prefix << "PackCrsMatrixFunctor "
661  "reported error code " << result.first << " for the first "
662  "bad row " << result.second << ".");
663  }
664 }
665 
695 template<typename ST, typename LO, typename GO, typename NT, typename BufferDeviceType>
696 void
697 packCrsMatrix (const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
698  Kokkos::DualView<char*, BufferDeviceType>& exports,
699  const Kokkos::View<size_t*, BufferDeviceType>& num_packets_per_lid,
700  const Kokkos::View<const LO*, BufferDeviceType>& export_lids,
701  const Kokkos::View<const int*, typename NT::device_type>& export_pids,
702  size_t& constant_num_packets,
703  const bool pack_pids)
704 {
705  ::Tpetra::Details::ProfilingRegion region_pack_crs_matrix(
706  "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix",
707  "Import/Export"
708  );
709  using Kokkos::View;
710  typedef BufferDeviceType DT;
711  typedef Kokkos::DualView<char*, BufferDeviceType> exports_view_type;
712  const char prefix[] = "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix: ";
713  constexpr bool debug = false;
714 
715  auto local_matrix = sourceMatrix.getLocalMatrixDevice ();
716  auto local_col_map = sourceMatrix.getColMap ()->getLocalMap ();
717 
718  // Setting this to zero tells the caller to expect a possibly
719  // different ("nonconstant") number of packets per local index
720  // (i.e., a possibly different number of entries per row).
721  constant_num_packets = 0;
722 
723  const size_t num_export_lids =
724  static_cast<size_t> (export_lids.extent (0));
725  TEUCHOS_TEST_FOR_EXCEPTION
726  (num_export_lids !=
727  static_cast<size_t> (num_packets_per_lid.extent (0)),
728  std::invalid_argument, prefix << "num_export_lids.extent(0) = "
729  << num_export_lids << " != num_packets_per_lid.extent(0) = "
730  << num_packets_per_lid.extent (0) << ".");
731  if (num_export_lids != 0) {
732  TEUCHOS_TEST_FOR_EXCEPTION
733  (num_packets_per_lid.data () == NULL, std::invalid_argument,
734  prefix << "num_export_lids = "<< num_export_lids << " != 0, but "
735  "num_packets_per_lid.data() = "
736  << num_packets_per_lid.data () << " == NULL.");
737  }
738 
739  const size_t num_bytes_per_lid = PackTraits<LO>::packValueCount (LO (0));
740  const size_t num_bytes_per_gid = PackTraits<GO>::packValueCount (GO (0));
741  const size_t num_bytes_per_pid = PackTraits<int>::packValueCount (int (0));
742 
743  size_t num_bytes_per_value = 0;
744  if (PackTraits<ST>::compileTimeSize) {
745  // Assume ST is default constructible; packValueCount wants an instance.
746  num_bytes_per_value = PackTraits<ST>::packValueCount (ST ());
747  }
748  else {
749  // Since the packed data come from the source matrix, we can use
750  // the source matrix to get the number of bytes per Scalar value
751  // stored in the matrix. This assumes that all Scalar values in
752  // the source matrix require the same number of bytes. If the
753  // source matrix has no entries on the calling process, then we
754  // hope that some process does have some idea how big a Scalar
755  // value is. Of course, if no processes have any entries, then no
756  // values should be packed (though this does assume that in our
757  // packing scheme, rows with zero entries take zero bytes).
758  size_t num_bytes_per_value_l = 0;
759  if (local_matrix.values.extent(0) > 0) {
760  const ST& val = local_matrix.values(0);
761  num_bytes_per_value_l = PackTraits<ST>::packValueCount (val);
762  }
763  using Teuchos::reduceAll;
764  reduceAll<int, size_t> (* (sourceMatrix.getComm ()),
765  Teuchos::REDUCE_MAX,
766  num_bytes_per_value_l,
767  Teuchos::outArg (num_bytes_per_value));
768  }
769 
770  if (num_export_lids == 0) {
771  exports = exports_view_type ("exports", 0);
772  return;
773  }
774 
775  // Array of offsets into the pack buffer.
776  Kokkos::View<size_t*, DT> offsets ("offsets", num_export_lids + 1);
777 
778  // Compute number of packets per LID (row to send), as well as
779  // corresponding offsets (the prefix sum of the packet counts).
780  const size_t count =
781  computeNumPacketsAndOffsets (offsets, num_packets_per_lid,
782  local_matrix.graph.row_map, export_lids,
783  export_pids,
784  num_bytes_per_lid, num_bytes_per_gid,
785  num_bytes_per_pid, num_bytes_per_value);
786 
787  // Resize the output pack buffer if needed.
788  if (count > static_cast<size_t> (exports.extent (0))) {
789  exports = exports_view_type ("exports", count);
790  if (debug) {
791  std::ostringstream os;
792  os << "*** exports resized to " << count << std::endl;
793  std::cerr << os.str ();
794  }
795  }
796  if (debug) {
797  std::ostringstream os;
798  os << "*** count: " << count << ", exports.extent(0): "
799  << exports.extent (0) << std::endl;
800  std::cerr << os.str ();
801  }
802 
803  // If exports has nonzero length at this point, then the matrix has
804  // at least one entry to pack. Thus, if packing process ranks, we
805  // had better have at least one process rank to pack.
806  TEUCHOS_TEST_FOR_EXCEPTION
807  (pack_pids && exports.extent (0) != 0 &&
808  export_pids.extent (0) == 0, std::invalid_argument, prefix <<
809  "pack_pids is true, and exports.extent(0) = " <<
810  exports.extent (0) << " != 0, meaning that we need to pack at least "
811  "one matrix entry, but export_pids.extent(0) = 0.");
812 
813  typedef typename std::decay<decltype (local_matrix)>::type
814  local_matrix_device_type;
815  typedef typename std::decay<decltype (local_col_map)>::type
816  local_map_type;
817 
818  exports.modify_device ();
819  auto exports_d = exports.view_device ();
820  do_pack<local_matrix_device_type, local_map_type, DT>
821  (local_matrix, local_col_map, exports_d, num_packets_per_lid,
822  export_lids, export_pids, offsets, num_bytes_per_value,
823  pack_pids);
824  // If we got this far, we succeeded.
825 }
826 
827 } // namespace PackCrsMatrixImpl
828 
829 template<typename ST, typename LO, typename GO, typename NT>
830 void
832  Teuchos::Array<char>& exports,
833  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
834  const Teuchos::ArrayView<const LO>& exportLIDs,
835  size_t& constantNumPackets)
836 {
837 
839  using buffer_device_type = typename DistObject<char, LO, GO, NT>::buffer_device_type;
840  using host_exec_space = typename Kokkos::View<size_t*, device_type>::HostMirror::execution_space;
841  using device_exec_space = typename device_type::execution_space;
842  using host_dev_type = Kokkos::Device<host_exec_space, Kokkos::HostSpace>;
843 
844  // Convert all Teuchos::Array to Kokkos::View
845 
846  // This is an output array, so we don't have to copy to device here.
847  // However, we'll have to remember to copy back to host when done.
848  Kokkos::View<size_t*, buffer_device_type> num_packets_per_lid_d =
849  create_mirror_view_from_raw_host_array (buffer_device_type (),
850  numPacketsPerLID.getRawPtr (),
851  numPacketsPerLID.size (), false,
852  "num_packets_per_lid");
853  // FIXME (mfh 05 Feb 2019) We should just pass the exportLIDs
854  // DualView through here, instead of recreating a device View from a
855  // host ArrayView that itself came from a DualView.
856  //
857  // This is an input array, so we have to copy to device here.
858  // However, we never need to copy it back to host.
859  Kokkos::View<const LO*, buffer_device_type> export_lids_d =
860  create_mirror_view_from_raw_host_array (buffer_device_type (),
861  exportLIDs.getRawPtr (),
862  exportLIDs.size (), true,
863  "export_lids");
864 
865  Kokkos::View<int*, device_type> export_pids_d; // output arg
866  Kokkos::DualView<char*, buffer_device_type> exports_dv; // output arg
867  constexpr bool pack_pids = false;
868  PackCrsMatrixImpl::packCrsMatrix<ST, LO, GO, NT, buffer_device_type> (
869  sourceMatrix, exports_dv, num_packets_per_lid_d, export_lids_d,
870  export_pids_d, constantNumPackets, pack_pids);
871 
872  // The counts are an output of PackCrsMatrixImpl::packCrsMatrix, so we have to
873  // copy them back to host.
874  Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h
875  (numPacketsPerLID.getRawPtr (),
876  numPacketsPerLID.size ());
877  // DEEP_COPY REVIEW - DEVICE-TO-HOST
878  Kokkos::deep_copy (device_exec_space(), num_packets_per_lid_h, num_packets_per_lid_d);
879 
880  // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for
881  // exports_dv above, then we have two host copies for exports_h.
882 
883  // The exports are an output of PackCrsMatrixImpl::packCrsMatrix, so we have
884  // to copy them back to host.
885  if (static_cast<size_t> (exports.size ()) !=
886  static_cast<size_t> (exports_dv.extent (0))) {
887  exports.resize (exports_dv.extent (0));
888  }
889  Kokkos::View<char*, host_dev_type> exports_h (exports.getRawPtr (),
890  exports.size ());
891  // DEEP_COPY REVIEW - DEVICE-TO-HOST
892  Kokkos::deep_copy (device_exec_space(), exports_h, exports_dv.d_view);
893 }
894 
895 template<typename ST, typename LO, typename GO, typename NT>
896 void
898  const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
899  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exports,
900  const Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& numPacketsPerLID,
901  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exportLIDs,
902  size_t& constantNumPackets)
903 {
904  using device_type = typename CrsMatrix<ST, LO, GO, NT>::device_type;
905  using buffer_device_type = typename DistObject<char, LO, GO, NT>::buffer_device_type;
906 
907  // Create an empty array of PIDs, since the interface needs it.
908  Kokkos::View<int*, device_type> exportPIDs_d ("exportPIDs", 0);
909  constexpr bool pack_pids = false;
910 
911  // Write-only device access
912  auto numPacketsPerLID_nc = numPacketsPerLID; // const DV& -> DV
913  numPacketsPerLID_nc.clear_sync_state ();
914  numPacketsPerLID_nc.modify_device ();
915  auto numPacketsPerLID_d = numPacketsPerLID.view_device ();
916 
917  // Read-only device access
918  TEUCHOS_ASSERT( ! exportLIDs.need_sync_device () );
919  auto exportLIDs_d = exportLIDs.view_device ();
920 
921  ::Tpetra::Details::ProfilingRegion region_pack_crs_matrix_new(
922  "Tpetra::Details::packCrsMatrixNew",
923  "Import/Export"
924  );
925  PackCrsMatrixImpl::packCrsMatrix<ST,LO,GO,NT,buffer_device_type> (
926  sourceMatrix, exports, numPacketsPerLID_d, exportLIDs_d,
927  exportPIDs_d, constantNumPackets, pack_pids);
928 }
929 
930 template<typename ST, typename LO, typename GO, typename NT>
931 void
933  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exports_dv,
934  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
935  const Teuchos::ArrayView<const LO>& exportLIDs,
936  const Teuchos::ArrayView<const int>& sourcePIDs,
937  size_t& constantNumPackets)
938 {
939  typedef typename CrsMatrix<ST,LO,GO,NT>::local_matrix_device_type local_matrix_device_type;
940  typedef typename DistObject<char, LO, GO, NT>::buffer_device_type buffer_device_type;
941  typedef typename Kokkos::DualView<char*, buffer_device_type>::t_host::execution_space host_exec_space;
942  typedef Kokkos::Device<host_exec_space, Kokkos::HostSpace> host_dev_type;
943 
944  typename local_matrix_device_type::device_type outputDevice;
945  typedef typename NT::execution_space execution_space;
946 
947 
948  const bool verbose = ::Tpetra::Details::Behavior::verbose ();
949  std::unique_ptr<std::string> prefix;
950  if (verbose) {
951  const int myRank = [&] () {
952  auto map = sourceMatrix.getMap ();
953  if (map.get () == nullptr) {
954  return -1;
955  }
956  auto comm = map->getComm ();
957  if (comm.get () == nullptr) {
958  return -2;
959  }
960  return comm->getRank ();
961  } ();
962  std::ostringstream os;
963  os << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs: ";
964  prefix = std::unique_ptr<std::string> (new std::string (os.str ()));
965 
966  std::ostringstream os2;
967  os2 << *prefix << "start" << std::endl;
968  std::cerr << os2.str ();
969  }
970 
971  // Convert all Teuchos::Array to Kokkos::View
972 
973  // This is an output array, so we don't have to copy to device here.
974  // However, we'll have to remember to copy back to host when done.
975  auto num_packets_per_lid_d =
976  create_mirror_view_from_raw_host_array (buffer_device_type (),
977  numPacketsPerLID.getRawPtr (),
978  numPacketsPerLID.size (), false,
979  "num_packets_per_lid");
980 
981  // This is an input array, so we have to copy to device here.
982  // However, we never need to copy it back to host.
983  auto export_lids_d =
984  create_mirror_view_from_raw_host_array (buffer_device_type (),
985  exportLIDs.getRawPtr (),
986  exportLIDs.size (), true,
987  "export_lids");
988  // This is an input array, so we have to copy to device here.
989  // However, we never need to copy it back to host.
990  auto export_pids_d =
992  sourcePIDs.getRawPtr (),
993  sourcePIDs.size (), true,
994  "export_pids");
995  constexpr bool pack_pids = true;
996  try {
998  (sourceMatrix, exports_dv, num_packets_per_lid_d, export_lids_d,
999  export_pids_d, constantNumPackets, pack_pids);
1000  }
1001  catch (std::exception& e) {
1002  if (verbose) {
1003  std::ostringstream os;
1004  os << *prefix << "PackCrsMatrixImpl::packCrsMatrix threw: "
1005  << e.what () << std::endl;
1006  std::cerr << os.str ();
1007  }
1008  throw;
1009  }
1010  catch (...) {
1011  if (verbose) {
1012  std::ostringstream os;
1013  os << *prefix << "PackCrsMatrixImpl::packCrsMatrix threw an exception "
1014  "not a subclass of std::exception" << std::endl;
1015  std::cerr << os.str ();
1016  }
1017  throw;
1018  }
1019 
1020  if (numPacketsPerLID.size () != 0) {
1021  try {
1022  // The counts are an output of PackCrsMatrixImpl::packCrsMatrix,
1023  // so we have to copy them back to host.
1024  Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h
1025  (numPacketsPerLID.getRawPtr (), numPacketsPerLID.size ());
1026  // DEEP_COPY REVIEW - DEVICE-TO-HOST
1027  Kokkos::deep_copy (execution_space(), num_packets_per_lid_h, num_packets_per_lid_d);
1028  }
1029  catch (std::exception& e) {
1030  if (verbose) {
1031  std::ostringstream os;
1032  os << *prefix << "Kokkos::deep_copy threw: " << e.what () << std::endl;
1033  std::cerr << os.str ();
1034  }
1035  throw;
1036  }
1037  catch (...) {
1038  if (verbose) {
1039  std::ostringstream os;
1040  os << *prefix << "Kokkos::deep_copy threw an exception not a subclass "
1041  "of std::exception" << std::endl;
1042  std::cerr << os.str ();
1043  }
1044  throw;
1045  }
1046  }
1047 
1048  if (verbose) {
1049  std::ostringstream os;
1050  os << *prefix << "done" << std::endl;
1051  std::cerr << os.str ();
1052  }
1053 }
1054 
1055 } // namespace Details
1056 } // namespace Tpetra
1057 
1058 #define TPETRA_DETAILS_PACKCRSMATRIX_INSTANT( ST, LO, GO, NT ) \
1059  template void \
1060  Details::packCrsMatrix<ST, LO, GO, NT> (const CrsMatrix<ST, LO, GO, NT>&, \
1061  Teuchos::Array<char>&, \
1062  const Teuchos::ArrayView<size_t>&, \
1063  const Teuchos::ArrayView<const LO>&, \
1064  size_t&); \
1065  template void \
1066  Details::packCrsMatrixNew<ST, LO, GO, NT> (const CrsMatrix<ST, LO, GO, NT>&, \
1067  Kokkos::DualView<char*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1068  const Kokkos::DualView<size_t*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1069  const Kokkos::DualView<const LO*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1070  size_t&); \
1071  template void \
1072  Details::packCrsMatrixWithOwningPIDs<ST, LO, GO, NT> (const CrsMatrix<ST, LO, GO, NT>&, \
1073  Kokkos::DualView<char*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1074  const Teuchos::ArrayView<size_t>&, \
1075  const Teuchos::ArrayView<const LO>&, \
1076  const Teuchos::ArrayView<const int>&, \
1077  size_t&);
1078 
1079 #endif // TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for &quot;invalid&quot; (flag) values of integer types...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > packArray(char outBuf[], const value_type inBuf[], const size_t numEnt)
Pack the first numEnt entries of the given input buffer of value_type, into the output buffer of byte...
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const T &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Compute the number of packets and offsets for the pack procedure.
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for &quot;new&quot; DistObject inter...
static bool verbose()
Whether Tpetra is in verbose mode.
typename Node::device_type device_type
The Kokkos device type.
void packCrsMatrix(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Teuchos::Array< char > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
Base class for distributed Tpetra objects that support data redistribution.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.