Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_packCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
11 #define TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
12 
13 #include "TpetraCore_config.h"
14 #include "Teuchos_Array.hpp"
15 #include "Teuchos_ArrayView.hpp"
24 #include <memory>
25 #include <sstream>
26 #include <stdexcept>
27 #include <string>
28 
51 
52 namespace Tpetra {
53 
54 //
55 // Users must never rely on anything in the Details namespace.
56 //
57 namespace Details {
58 
59 namespace PackCrsMatrixImpl {
67 template <class OutputOffsetsViewType,
68  class CountsViewType,
69  class InputOffsetsViewType,
70  class InputLocalRowIndicesViewType,
71  class InputLocalRowPidsViewType,
72  const bool debug =
73 #ifdef HAVE_TPETRA_DEBUG
74  true
75 #else
76  false
77 #endif // HAVE_TPETRA_DEBUG
78  >
80  public:
81  typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type;
82  typedef typename CountsViewType::non_const_value_type count_type;
83  typedef typename InputOffsetsViewType::non_const_value_type input_offset_type;
84  typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type;
85  typedef typename InputLocalRowPidsViewType::non_const_value_type local_row_pid_type;
86  // output Views drive where execution happens.
87  typedef typename OutputOffsetsViewType::device_type device_type;
88  static_assert(std::is_same<typename CountsViewType::device_type::execution_space,
89  typename device_type::execution_space>::value,
90  "OutputOffsetsViewType and CountsViewType must have the same execution space.");
91  static_assert(Kokkos::is_view<OutputOffsetsViewType>::value,
92  "OutputOffsetsViewType must be a Kokkos::View.");
93  static_assert(std::is_same<typename OutputOffsetsViewType::value_type, output_offset_type>::value,
94  "OutputOffsetsViewType must be a nonconst Kokkos::View.");
95  static_assert(std::is_integral<output_offset_type>::value,
96  "The type of each entry of OutputOffsetsViewType must be a built-in integer type.");
97  static_assert(Kokkos::is_view<CountsViewType>::value,
98  "CountsViewType must be a Kokkos::View.");
99  static_assert(std::is_same<typename CountsViewType::value_type, output_offset_type>::value,
100  "CountsViewType must be a nonconst Kokkos::View.");
101  static_assert(std::is_integral<count_type>::value,
102  "The type of each entry of CountsViewType must be a built-in integer type.");
103  static_assert(Kokkos::is_view<InputOffsetsViewType>::value,
104  "InputOffsetsViewType must be a Kokkos::View.");
105  static_assert(std::is_integral<input_offset_type>::value,
106  "The type of each entry of InputOffsetsViewType must be a built-in integer type.");
107  static_assert(Kokkos::is_view<InputLocalRowIndicesViewType>::value,
108  "InputLocalRowIndicesViewType must be a Kokkos::View.");
109  static_assert(std::is_integral<local_row_index_type>::value,
110  "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type.");
111 
112  NumPacketsAndOffsetsFunctor(const OutputOffsetsViewType& outputOffsets,
113  const CountsViewType& counts,
114  const InputOffsetsViewType& rowOffsets,
115  const InputLocalRowIndicesViewType& lclRowInds,
116  const InputLocalRowPidsViewType& lclRowPids,
117  const count_type sizeOfLclCount,
118  const count_type sizeOfGblColInd,
119  const count_type sizeOfPid,
120  const count_type sizeOfValue)
121  : outputOffsets_(outputOffsets)
122  , counts_(counts)
123  , rowOffsets_(rowOffsets)
124  , lclRowInds_(lclRowInds)
125  , lclRowPids_(lclRowPids)
126  , sizeOfLclCount_(sizeOfLclCount)
127  , sizeOfGblColInd_(sizeOfGblColInd)
128  , sizeOfPid_(sizeOfPid)
129  , sizeOfValue_(sizeOfValue)
130  , error_("error") // don't forget this, or you'll get segfaults!
131  {
132  if (debug) {
133  const size_t numRowsToPack = static_cast<size_t>(lclRowInds_.extent(0));
134 
135  if (numRowsToPack != static_cast<size_t>(counts_.extent(0))) {
136  std::ostringstream os;
137  os << "lclRowInds.extent(0) = " << numRowsToPack
138  << " != counts.extent(0) = " << counts_.extent(0)
139  << ".";
140  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str());
141  }
142  if (static_cast<size_t>(numRowsToPack + 1) !=
143  static_cast<size_t>(outputOffsets_.extent(0))) {
144  std::ostringstream os;
145  os << "lclRowInds.extent(0) + 1 = " << (numRowsToPack + 1)
146  << " != outputOffsets.extent(0) = " << outputOffsets_.extent(0)
147  << ".";
148  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str());
149  }
150  }
151  }
152 
153  KOKKOS_INLINE_FUNCTION void
154  operator()(const local_row_index_type& curInd,
155  output_offset_type& update,
156  const bool final) const {
157  if (debug) {
158  if (curInd < static_cast<local_row_index_type>(0)) {
159  error_() = 1;
160  return;
161  }
162  }
163 
164  if (final) {
165  if (debug) {
166  if (curInd >= static_cast<local_row_index_type>(outputOffsets_.extent(0))) {
167  error_() = 2;
168  return;
169  }
170  }
171  outputOffsets_(curInd) = update;
172  }
173 
174  if (curInd < static_cast<local_row_index_type>(counts_.extent(0))) {
175  const auto lclRow = lclRowInds_(curInd);
176  if (static_cast<size_t>(lclRow + 1) >= static_cast<size_t>(rowOffsets_.extent(0)) ||
177  static_cast<local_row_index_type>(lclRow) < static_cast<local_row_index_type>(0)) {
178  error_() = 3;
179  return;
180  }
181  // count_type could differ from the type of each row offset.
182  // For example, row offsets might each be 64 bits, but if their
183  // difference always fits in 32 bits, we may then safely use a
184  // 32-bit count_type.
185  const count_type count =
186  static_cast<count_type>(rowOffsets_(lclRow + 1) - rowOffsets_(lclRow));
187 
188  // We pack first the number of entries in the row, then that
189  // many global column indices, then that many pids (if any),
190  // then that many values. However, if the number of entries in
191  // the row is zero, we pack nothing.
192  const count_type numBytes = (count == 0) ? static_cast<count_type>(0) : sizeOfLclCount_ + count * (sizeOfGblColInd_ + (lclRowPids_.size() > 0 ? sizeOfPid_ : 0) + sizeOfValue_);
193 
194  if (final) {
195  counts_(curInd) = numBytes;
196  }
197  update += numBytes;
198  }
199  }
200 
201  // mfh 31 May 2017: Don't need init or join. If you have join, MUST
202  // have join both with and without volatile! Otherwise intrawarp
203  // joins are really slow on GPUs.
204 
206  int getError() const {
207  auto error_h = Kokkos::create_mirror_view(error_);
208  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
209  // Note: In the UVM case, this would otherwise be a no-op
210  // and thus not fence, so the value might not be correct on return
211  // In the non-UVM case, create_mirror_view will block for the allocation
212  Kokkos::deep_copy(error_h, error_);
213  return error_h();
214  }
215 
216  private:
217  OutputOffsetsViewType outputOffsets_;
218  CountsViewType counts_;
219  typename InputOffsetsViewType::const_type rowOffsets_;
220  typename InputLocalRowIndicesViewType::const_type lclRowInds_;
221  typename InputLocalRowPidsViewType::const_type lclRowPids_;
222  count_type sizeOfLclCount_;
223  count_type sizeOfGblColInd_;
224  count_type sizeOfPid_;
225  count_type sizeOfValue_;
226  Kokkos::View<int, device_type> error_;
227 };
228 
238 template <class OutputOffsetsViewType,
239  class CountsViewType,
240  class InputOffsetsViewType,
241  class InputLocalRowIndicesViewType,
242  class InputLocalRowPidsViewType>
243 typename CountsViewType::non_const_value_type
244 computeNumPacketsAndOffsets(const OutputOffsetsViewType& outputOffsets,
245  const CountsViewType& counts,
246  const InputOffsetsViewType& rowOffsets,
247  const InputLocalRowIndicesViewType& lclRowInds,
248  const InputLocalRowPidsViewType& lclRowPids,
249  const typename CountsViewType::non_const_value_type sizeOfLclCount,
250  const typename CountsViewType::non_const_value_type sizeOfGblColInd,
251  const typename CountsViewType::non_const_value_type sizeOfPid,
252  const typename CountsViewType::non_const_value_type sizeOfValue) {
253  typedef NumPacketsAndOffsetsFunctor<OutputOffsetsViewType,
254  CountsViewType, typename InputOffsetsViewType::const_type,
255  typename InputLocalRowIndicesViewType::const_type,
256  typename InputLocalRowPidsViewType::const_type>
257  functor_type;
258  typedef typename CountsViewType::non_const_value_type count_type;
259  typedef typename OutputOffsetsViewType::size_type size_type;
260  typedef typename OutputOffsetsViewType::execution_space execution_space;
261  typedef typename functor_type::local_row_index_type LO;
262  typedef Kokkos::RangePolicy<execution_space, LO> range_type;
263  const char prefix[] = "computeNumPacketsAndOffsets: ";
264 
265  count_type count = 0;
266  const count_type numRowsToPack = lclRowInds.extent(0);
267 
268  if (numRowsToPack == 0) {
269  return count;
270  } else {
271  TEUCHOS_TEST_FOR_EXCEPTION(rowOffsets.extent(0) <= static_cast<size_type>(1),
272  std::invalid_argument, prefix << "There is at least one row to pack, "
273  "but the matrix has no rows. lclRowInds.extent(0) = "
274  << numRowsToPack << ", but rowOffsets.extent(0) = " << rowOffsets.extent(0) << " <= 1.");
275  TEUCHOS_TEST_FOR_EXCEPTION(outputOffsets.extent(0) !=
276  static_cast<size_type>(numRowsToPack + 1),
277  std::invalid_argument,
278  prefix << "Output dimension does not match number of rows to pack. "
279  << "outputOffsets.extent(0) = " << outputOffsets.extent(0)
280  << " != lclRowInds.extent(0) + 1 = "
281  << static_cast<size_type>(numRowsToPack + 1) << ".");
282  TEUCHOS_TEST_FOR_EXCEPTION(counts.extent(0) != numRowsToPack, std::invalid_argument,
283  prefix << "counts.extent(0) = " << counts.extent(0)
284  << " != numRowsToPack = " << numRowsToPack << ".");
285 
286  functor_type f(outputOffsets, counts, rowOffsets,
287  lclRowInds, lclRowPids, sizeOfLclCount,
288  sizeOfGblColInd, sizeOfPid, sizeOfValue);
289  Kokkos::parallel_scan(range_type(0, numRowsToPack + 1), f);
290 
291  // At least in debug mode, this functor checks for errors.
292  const int errCode = f.getError();
293  TEUCHOS_TEST_FOR_EXCEPTION(errCode != 0, std::runtime_error, prefix << "parallel_scan error code " << errCode << " != 0.");
294 
295 #if 0
296  size_t total = 0;
297  for (LO k = 0; k < numRowsToPack; ++k) {
298  total += counts[k];
299  }
300  if (outputOffsets(numRowsToPack) != total) {
301  if (errStr.get () == NULL) {
302  errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
303  }
304  std::ostringstream& os = *errStr;
305  os << prefix
306  << "outputOffsets(numRowsToPack=" << numRowsToPack << ") "
307  << outputOffsets(numRowsToPack) << " != sum of counts = "
308  << total << "." << std::endl;
309  if (numRowsToPack != 0) {
310  // Only print the array if it's not too long.
311  if (numRowsToPack < static_cast<LO> (10)) {
312  os << "outputOffsets: [";
313  for (LO i = 0; i <= numRowsToPack; ++i) {
314  os << outputOffsets(i);
315  if (static_cast<LO> (i + 1) <= numRowsToPack) {
316  os << ",";
317  }
318  }
319  os << "]" << std::endl;
320  os << "counts: [";
321  for (LO i = 0; i < numRowsToPack; ++i) {
322  os << counts(i);
323  if (static_cast<LO> (i + 1) < numRowsToPack) {
324  os << ",";
325  }
326  }
327  os << "]" << std::endl;
328  }
329  else {
330  os << "outputOffsets(" << (numRowsToPack-1) << ") = "
331  << outputOffsets(numRowsToPack-1) << "." << std::endl;
332  }
333  }
334  count = outputOffsets(numRowsToPack);
335  return {false, errStr};
336  }
337 #endif // HAVE_TPETRA_DEBUG
338 
339  // Get last entry of outputOffsets, which is the sum of the entries
340  // of counts. Don't assume UVM.
341  using Tpetra::Details::getEntryOnHost;
342  return static_cast<count_type>(getEntryOnHost(outputOffsets,
343  numRowsToPack));
344  }
345 }
346 
362 template <class ST, class ColumnMap, class BufferDeviceType>
363 KOKKOS_FUNCTION
364  Kokkos::pair<int, size_t>
365  packCrsMatrixRow(const ColumnMap& col_map,
366  const Kokkos::View<char*, BufferDeviceType>& exports,
367  const typename PackTraits<typename ColumnMap::local_ordinal_type>::input_array_type& lids_in,
368  const typename PackTraits<int>::input_array_type& pids_in,
369  const typename PackTraits<ST>::input_array_type& vals_in,
370  const size_t offset,
371  const size_t num_ent,
372  const size_t num_bytes_per_value,
373  const bool pack_pids) {
374  using Kokkos::subview;
375  using LO = typename ColumnMap::local_ordinal_type;
376  using GO = typename ColumnMap::global_ordinal_type;
377  using return_type = Kokkos::pair<int, size_t>;
378 
379  if (num_ent == 0) {
380  // Empty rows always take zero bytes, to ensure sparsity.
381  return return_type(0, 0);
382  }
383 
384  const LO num_ent_LO = static_cast<LO>(num_ent); // packValueCount wants this
385  const size_t num_ent_beg = offset;
386  const size_t num_ent_len = PackTraits<LO>::packValueCount(num_ent_LO);
387 
388  const size_t gids_beg = num_ent_beg + num_ent_len;
389  const size_t gids_len = num_ent * PackTraits<GO>::packValueCount(GO(0));
390 
391  const size_t pids_beg = gids_beg + gids_len;
392  const size_t pids_len = pack_pids ? num_ent * PackTraits<int>::packValueCount(int(0)) : static_cast<size_t>(0);
393 
394  const size_t vals_beg = gids_beg + gids_len + pids_len;
395  const size_t vals_len = num_ent * num_bytes_per_value;
396 
397  char* const num_ent_out = exports.data() + num_ent_beg;
398  char* const gids_out = exports.data() + gids_beg;
399  char* const pids_out = pack_pids ? exports.data() + pids_beg : NULL;
400  char* const vals_out = exports.data() + vals_beg;
401 
402  size_t num_bytes_out = 0;
403  int error_code = 0;
404  num_bytes_out += PackTraits<LO>::packValue(num_ent_out, num_ent_LO);
405 
406  {
407  // Copy column indices one at a time, so that we don't need
408  // temporary storage.
409  for (size_t k = 0; k < num_ent; ++k) {
410  const LO lid = lids_in[k];
411  const GO gid = col_map.getGlobalElement(lid);
412  num_bytes_out += PackTraits<GO>::packValue(gids_out, k, gid);
413  }
414  // Copy PIDs one at a time, so that we don't need temporary storage.
415  if (pack_pids) {
416  for (size_t k = 0; k < num_ent; ++k) {
417  const LO lid = lids_in[k];
418  const int pid = pids_in[lid];
419  num_bytes_out += PackTraits<int>::packValue(pids_out, k, pid);
420  }
421  }
422  const auto p =
423  PackTraits<ST>::packArray(vals_out, vals_in.data(), num_ent);
424  error_code += p.first;
425  num_bytes_out += p.second;
426  }
427 
428  if (error_code != 0) {
429  return return_type(10, num_bytes_out);
430  }
431 
432  const size_t expected_num_bytes =
433  num_ent_len + gids_len + pids_len + vals_len;
434  if (num_bytes_out != expected_num_bytes) {
435  return return_type(11, num_bytes_out);
436  }
437  return return_type(0, num_bytes_out);
438 }
439 
440 template <class LocalMatrix, class LocalMap, class BufferDeviceType>
441 struct PackCrsMatrixFunctor {
442  typedef LocalMatrix local_matrix_device_type;
443  typedef LocalMap local_map_type;
444  typedef typename local_matrix_device_type::value_type ST;
445  typedef typename local_map_type::local_ordinal_type LO;
446  typedef typename local_map_type::global_ordinal_type GO;
447  typedef typename local_matrix_device_type::device_type DT;
448 
449  typedef Kokkos::View<const size_t*, BufferDeviceType>
450  num_packets_per_lid_view_type;
451  typedef Kokkos::View<const size_t*, BufferDeviceType> offsets_view_type;
452  typedef Kokkos::View<char*, BufferDeviceType> exports_view_type;
453  using export_lids_view_type = typename PackTraits<LO>::input_array_type;
454  using source_pids_view_type = typename PackTraits<int>::input_array_type;
455 
456  typedef typename num_packets_per_lid_view_type::non_const_value_type
457  count_type;
458  typedef typename offsets_view_type::non_const_value_type
459  offset_type;
460  typedef Kokkos::pair<int, LO> value_type;
461 
462  static_assert(std::is_same<LO, typename local_matrix_device_type::ordinal_type>::value,
463  "local_map_type::local_ordinal_type and "
464  "local_matrix_device_type::ordinal_type must be the same.");
465 
466  local_matrix_device_type local_matrix;
467  local_map_type local_col_map;
468  exports_view_type exports;
469  num_packets_per_lid_view_type num_packets_per_lid;
470  export_lids_view_type export_lids;
471  source_pids_view_type source_pids;
472  offsets_view_type offsets;
473  size_t num_bytes_per_value;
474  bool pack_pids;
475 
476  PackCrsMatrixFunctor(const local_matrix_device_type& local_matrix_in,
477  const local_map_type& local_col_map_in,
478  const exports_view_type& exports_in,
479  const num_packets_per_lid_view_type& num_packets_per_lid_in,
480  const export_lids_view_type& export_lids_in,
481  const source_pids_view_type& source_pids_in,
482  const offsets_view_type& offsets_in,
483  const size_t num_bytes_per_value_in,
484  const bool pack_pids_in)
485  : local_matrix(local_matrix_in)
486  , local_col_map(local_col_map_in)
487  , exports(exports_in)
488  , num_packets_per_lid(num_packets_per_lid_in)
489  , export_lids(export_lids_in)
490  , source_pids(source_pids_in)
491  , offsets(offsets_in)
492  , num_bytes_per_value(num_bytes_per_value_in)
493  , pack_pids(pack_pids_in) {
494  const LO numRows = local_matrix_in.numRows();
495  const LO rowMapDim =
496  static_cast<LO>(local_matrix.graph.row_map.extent(0));
497  TEUCHOS_TEST_FOR_EXCEPTION(numRows != 0 && rowMapDim != numRows + static_cast<LO>(1),
498  std::logic_error, "local_matrix.graph.row_map.extent(0) = " << rowMapDim << " != numRows (= " << numRows << " ) + 1.");
499  }
500 
501  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const {
502  using ::Tpetra::Details::OrdinalTraits;
503  dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
504  }
505 
506  KOKKOS_INLINE_FUNCTION void
507  join(value_type& dst, const value_type& src) const {
508  // `dst` should reflect the first (least) bad index and all other
509  // associated error codes and data, so prefer keeping it.
510  if (src.first != 0 && dst.first == 0) {
511  dst = src;
512  }
513  }
514 
515  KOKKOS_INLINE_FUNCTION
516  void operator()(const LO i, value_type& dst) const {
517  const size_t offset = offsets[i];
518  const LO export_lid = export_lids[i];
519  const size_t buf_size = exports.size();
520  const size_t num_bytes = num_packets_per_lid(i);
521  const size_t num_ent =
522  static_cast<size_t>(local_matrix.graph.row_map[export_lid + 1] - local_matrix.graph.row_map[export_lid]);
523 
524  // Only pack this row's data if it has a nonzero number of
525  // entries. We can do this because receiving processes get the
526  // number of packets, and will know that zero packets means zero
527  // entries.
528  if (num_ent == 0) {
529  return;
530  }
531 
532  if (export_lid >= local_matrix.numRows()) {
533  if (dst.first != 0) { // keep only the first error
534  dst = Kokkos::make_pair(1, i); // invalid row
535  }
536  return;
537  } else if ((offset > buf_size || offset + num_bytes > buf_size)) {
538  if (dst.first != 0) { // keep only the first error
539  dst = Kokkos::make_pair(2, i); // out of bounds
540  }
541  return;
542  }
543 
544  // We can now pack this row
545 
546  // Since the matrix is locally indexed on the calling process, we
547  // have to use its column Map (which it _must_ have in this case)
548  // to convert to global indices.
549  const auto row_beg = local_matrix.graph.row_map[export_lid];
550  const auto row_end = local_matrix.graph.row_map[export_lid + 1];
551  auto vals_in = subview(local_matrix.values,
552  Kokkos::make_pair(row_beg, row_end));
553  auto lids_in = subview(local_matrix.graph.entries,
554  Kokkos::make_pair(row_beg, row_end));
555  typedef local_map_type LMT;
556  typedef BufferDeviceType BDT;
557  auto p = packCrsMatrixRow<ST, LMT, BDT>(local_col_map, exports, lids_in,
558  source_pids, vals_in, offset,
559  num_ent, num_bytes_per_value,
560  pack_pids);
561  int error_code_this_row = p.first;
562  size_t num_bytes_packed_this_row = p.second;
563  if (error_code_this_row != 0) {
564  if (dst.first != 0) { // keep only the first error
565  dst = Kokkos::make_pair(error_code_this_row, i); // bad pack
566  }
567  } else if (num_bytes_packed_this_row != num_bytes) {
568  if (dst.first != 0) { // keep only the first error
569  dst = Kokkos::make_pair(3, i);
570  }
571  }
572  }
573 };
574 
582 template <class LocalMatrix, class LocalMap, class BufferDeviceType>
583 void do_pack(const LocalMatrix& local_matrix,
584  const LocalMap& local_map,
585  const Kokkos::View<char*, BufferDeviceType>& exports,
586  const typename PackTraits<size_t>::input_array_type& num_packets_per_lid,
587  const typename PackTraits<typename LocalMap::local_ordinal_type>::input_array_type& export_lids,
588  const typename PackTraits<int>::input_array_type& source_pids,
589  const Kokkos::View<const size_t*, BufferDeviceType>& offsets,
590  const size_t num_bytes_per_value,
591  const bool pack_pids) {
592  using LO = typename LocalMap::local_ordinal_type;
593  using DT = typename LocalMatrix::device_type;
594  using range_type = Kokkos::RangePolicy<typename DT::execution_space, LO>;
595  const char prefix[] = "Tpetra::Details::do_pack: ";
596 
597  if (export_lids.extent(0) != 0) {
598  TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(offsets.extent(0)) !=
599  static_cast<size_t>(export_lids.extent(0) + 1),
600  std::invalid_argument, prefix << "offsets.extent(0) = " << offsets.extent(0) << " != export_lids.extent(0) (= " << export_lids.extent(0) << ") + 1.");
601  TEUCHOS_TEST_FOR_EXCEPTION(export_lids.extent(0) != num_packets_per_lid.extent(0),
602  std::invalid_argument, prefix << "export_lids.extent(0) = " << export_lids.extent(0) << " != num_packets_per_lid.extent(0) = " << num_packets_per_lid.extent(0) << ".");
603  // If exports has nonzero length at this point, then the matrix
604  // has at least one entry to pack. Thus, if packing process
605  // ranks, we had better have at least one process rank to pack.
606  TEUCHOS_TEST_FOR_EXCEPTION(pack_pids && exports.extent(0) != 0 &&
607  source_pids.extent(0) == 0,
608  std::invalid_argument, prefix << "pack_pids is true, and exports.extent(0) = " << exports.extent(0) << " != 0, meaning that we need to pack at "
609  "least one matrix entry, but source_pids.extent(0) = 0.");
610  }
611 
612  using pack_functor_type =
613  PackCrsMatrixFunctor<LocalMatrix, LocalMap, BufferDeviceType>;
614  pack_functor_type f(local_matrix, local_map, exports,
615  num_packets_per_lid, export_lids,
616  source_pids, offsets, num_bytes_per_value,
617  pack_pids);
618 
619  typename pack_functor_type::value_type result;
620  range_type range(0, num_packets_per_lid.extent(0));
621  Kokkos::parallel_reduce(range, f, result);
622 
623  if (result.first != 0) {
624  // We can't deep_copy from AnonymousSpace Views, so we can't print
625  // out any information from them in case of error.
626  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, prefix << "PackCrsMatrixFunctor "
627  "reported error code "
628  << result.first << " for the first "
629  "bad row "
630  << result.second << ".");
631  }
632 }
633 
663 template <typename ST, typename LO, typename GO, typename NT, typename BufferDeviceType>
664 void packCrsMatrix(const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
665  Kokkos::DualView<char*, BufferDeviceType>& exports,
666  const Kokkos::View<size_t*, BufferDeviceType>& num_packets_per_lid,
667  const Kokkos::View<const LO*, BufferDeviceType>& export_lids,
668  const Kokkos::View<const int*, typename NT::device_type>& export_pids,
669  size_t& constant_num_packets,
670  const bool pack_pids) {
671  ::Tpetra::Details::ProfilingRegion region_pack_crs_matrix(
672  "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix",
673  "Import/Export");
674  using Kokkos::View;
675  typedef BufferDeviceType DT;
676  typedef Kokkos::DualView<char*, BufferDeviceType> exports_view_type;
677  const char prefix[] = "Tpetra::Details::PackCrsMatrixImpl::packCrsMatrix: ";
678  constexpr bool debug = false;
679 
680  auto local_matrix = sourceMatrix.getLocalMatrixDevice();
681  auto local_col_map = sourceMatrix.getColMap()->getLocalMap();
682 
683  // Setting this to zero tells the caller to expect a possibly
684  // different ("nonconstant") number of packets per local index
685  // (i.e., a possibly different number of entries per row).
686  constant_num_packets = 0;
687 
688  const size_t num_export_lids =
689  static_cast<size_t>(export_lids.extent(0));
690  TEUCHOS_TEST_FOR_EXCEPTION(num_export_lids !=
691  static_cast<size_t>(num_packets_per_lid.extent(0)),
692  std::invalid_argument, prefix << "num_export_lids.extent(0) = " << num_export_lids << " != num_packets_per_lid.extent(0) = " << num_packets_per_lid.extent(0) << ".");
693  if (num_export_lids != 0) {
694  TEUCHOS_TEST_FOR_EXCEPTION(num_packets_per_lid.data() == NULL, std::invalid_argument,
695  prefix << "num_export_lids = " << num_export_lids << " != 0, but "
696  "num_packets_per_lid.data() = "
697  << num_packets_per_lid.data() << " == NULL.");
698  }
699 
700  const size_t num_bytes_per_lid = PackTraits<LO>::packValueCount(LO(0));
701  const size_t num_bytes_per_gid = PackTraits<GO>::packValueCount(GO(0));
702  const size_t num_bytes_per_pid = PackTraits<int>::packValueCount(int(0));
703 
704  size_t num_bytes_per_value = 0;
705  if (PackTraits<ST>::compileTimeSize) {
706  // Assume ST is default constructible; packValueCount wants an instance.
707  num_bytes_per_value = PackTraits<ST>::packValueCount(ST());
708  } else {
709  // Since the packed data come from the source matrix, we can use
710  // the source matrix to get the number of bytes per Scalar value
711  // stored in the matrix. This assumes that all Scalar values in
712  // the source matrix require the same number of bytes. If the
713  // source matrix has no entries on the calling process, then we
714  // hope that some process does have some idea how big a Scalar
715  // value is. Of course, if no processes have any entries, then no
716  // values should be packed (though this does assume that in our
717  // packing scheme, rows with zero entries take zero bytes).
718  size_t num_bytes_per_value_l = 0;
719  if (local_matrix.values.extent(0) > 0) {
720  const ST& val = local_matrix.values(0);
721  num_bytes_per_value_l = PackTraits<ST>::packValueCount(val);
722  }
723  using Teuchos::reduceAll;
724  reduceAll<int, size_t>(*(sourceMatrix.getComm()),
725  Teuchos::REDUCE_MAX,
726  num_bytes_per_value_l,
727  Teuchos::outArg(num_bytes_per_value));
728  }
729 
730  if (num_export_lids == 0) {
731  exports = exports_view_type("exports", 0);
732  return;
733  }
734 
735  // Array of offsets into the pack buffer.
736  Kokkos::View<size_t*, DT> offsets("offsets", num_export_lids + 1);
737 
738  // Compute number of packets per LID (row to send), as well as
739  // corresponding offsets (the prefix sum of the packet counts).
740  const size_t count =
741  computeNumPacketsAndOffsets(offsets, num_packets_per_lid,
742  local_matrix.graph.row_map, export_lids,
743  export_pids,
744  num_bytes_per_lid, num_bytes_per_gid,
745  num_bytes_per_pid, num_bytes_per_value);
746 
747  // Resize the output pack buffer if needed.
748  if (count > static_cast<size_t>(exports.extent(0))) {
749  exports = exports_view_type("exports", count);
750  if (debug) {
751  std::ostringstream os;
752  os << "*** exports resized to " << count << std::endl;
753  std::cerr << os.str();
754  }
755  }
756  if (debug) {
757  std::ostringstream os;
758  os << "*** count: " << count << ", exports.extent(0): "
759  << exports.extent(0) << std::endl;
760  std::cerr << os.str();
761  }
762 
763  // If exports has nonzero length at this point, then the matrix has
764  // at least one entry to pack. Thus, if packing process ranks, we
765  // had better have at least one process rank to pack.
766  TEUCHOS_TEST_FOR_EXCEPTION(pack_pids && exports.extent(0) != 0 &&
767  export_pids.extent(0) == 0,
768  std::invalid_argument, prefix << "pack_pids is true, and exports.extent(0) = " << exports.extent(0) << " != 0, meaning that we need to pack at least "
769  "one matrix entry, but export_pids.extent(0) = 0.");
770 
771  typedef typename std::decay<decltype(local_matrix)>::type
772  local_matrix_device_type;
773  typedef typename std::decay<decltype(local_col_map)>::type
774  local_map_type;
775 
776  exports.modify_device();
777  auto exports_d = exports.view_device();
778  do_pack<local_matrix_device_type, local_map_type, DT>(local_matrix, local_col_map, exports_d, num_packets_per_lid,
779  export_lids, export_pids, offsets, num_bytes_per_value,
780  pack_pids);
781  // If we got this far, we succeeded.
782 }
783 
784 } // namespace PackCrsMatrixImpl
785 
786 template <typename ST, typename LO, typename GO, typename NT>
787 void packCrsMatrix(const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
788  Teuchos::Array<char>& exports,
789  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
790  const Teuchos::ArrayView<const LO>& exportLIDs,
791  size_t& constantNumPackets) {
793  using buffer_device_type = typename DistObject<char, LO, GO, NT>::buffer_device_type;
794  using host_exec_space = typename Kokkos::View<size_t*, device_type>::host_mirror_type::execution_space;
795  using device_exec_space = typename device_type::execution_space;
796  using host_dev_type = Kokkos::Device<host_exec_space, Kokkos::HostSpace>;
797 
798  // Convert all Teuchos::Array to Kokkos::View
799 
800  // This is an output array, so we don't have to copy to device here.
801  // However, we'll have to remember to copy back to host when done.
802  Kokkos::View<size_t*, buffer_device_type> num_packets_per_lid_d =
803  create_mirror_view_from_raw_host_array(buffer_device_type(),
804  numPacketsPerLID.getRawPtr(),
805  numPacketsPerLID.size(), false,
806  "num_packets_per_lid");
807  // FIXME (mfh 05 Feb 2019) We should just pass the exportLIDs
808  // DualView through here, instead of recreating a device View from a
809  // host ArrayView that itself came from a DualView.
810  //
811  // This is an input array, so we have to copy to device here.
812  // However, we never need to copy it back to host.
813  Kokkos::View<const LO*, buffer_device_type> export_lids_d =
814  create_mirror_view_from_raw_host_array(buffer_device_type(),
815  exportLIDs.getRawPtr(),
816  exportLIDs.size(), true,
817  "export_lids");
818 
819  Kokkos::View<int*, device_type> export_pids_d; // output arg
820  Kokkos::DualView<char*, buffer_device_type> exports_dv; // output arg
821  constexpr bool pack_pids = false;
822  PackCrsMatrixImpl::packCrsMatrix<ST, LO, GO, NT, buffer_device_type>(
823  sourceMatrix, exports_dv, num_packets_per_lid_d, export_lids_d,
824  export_pids_d, constantNumPackets, pack_pids);
825 
826  // The counts are an output of PackCrsMatrixImpl::packCrsMatrix, so we have to
827  // copy them back to host.
828  Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h(numPacketsPerLID.getRawPtr(),
829  numPacketsPerLID.size());
830  // DEEP_COPY REVIEW - DEVICE-TO-HOST
831  Kokkos::deep_copy(device_exec_space(), num_packets_per_lid_h, num_packets_per_lid_d);
832 
833  // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for
834  // exports_dv above, then we have two host copies for exports_h.
835 
836  // The exports are an output of PackCrsMatrixImpl::packCrsMatrix, so we have
837  // to copy them back to host.
838  if (static_cast<size_t>(exports.size()) !=
839  static_cast<size_t>(exports_dv.extent(0))) {
840  exports.resize(exports_dv.extent(0));
841  }
842  Kokkos::View<char*, host_dev_type> exports_h(exports.getRawPtr(),
843  exports.size());
844  // DEEP_COPY REVIEW - DEVICE-TO-HOST
845  Kokkos::deep_copy(device_exec_space(), exports_h, exports_dv.view_device());
846 }
847 
848 template <typename ST, typename LO, typename GO, typename NT>
850  const CrsMatrix<ST, LO, GO, NT>& sourceMatrix,
851  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exports,
852  const Kokkos::DualView<size_t*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& numPacketsPerLID,
853  const Kokkos::DualView<const LO*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exportLIDs,
854  size_t& constantNumPackets) {
855  using device_type = typename CrsMatrix<ST, LO, GO, NT>::device_type;
856  using buffer_device_type = typename DistObject<char, LO, GO, NT>::buffer_device_type;
857 
858  // Create an empty array of PIDs, since the interface needs it.
859  Kokkos::View<int*, device_type> exportPIDs_d("exportPIDs", 0);
860  constexpr bool pack_pids = false;
861 
862  // Write-only device access
863  auto numPacketsPerLID_nc = numPacketsPerLID; // const DV& -> DV
864  numPacketsPerLID_nc.clear_sync_state();
865  numPacketsPerLID_nc.modify_device();
866  auto numPacketsPerLID_d = numPacketsPerLID.view_device();
867 
868  // Read-only device access
869  TEUCHOS_ASSERT(!exportLIDs.need_sync_device());
870  auto exportLIDs_d = exportLIDs.view_device();
871 
872  ::Tpetra::Details::ProfilingRegion region_pack_crs_matrix_new(
873  "Tpetra::Details::packCrsMatrixNew",
874  "Import/Export");
875  PackCrsMatrixImpl::packCrsMatrix<ST, LO, GO, NT, buffer_device_type>(
876  sourceMatrix, exports, numPacketsPerLID_d, exportLIDs_d,
877  exportPIDs_d, constantNumPackets, pack_pids);
878 }
879 
880 template <typename ST, typename LO, typename GO, typename NT>
882  Kokkos::DualView<char*, typename DistObject<char, LO, GO, NT>::buffer_device_type>& exports_dv,
883  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
884  const Teuchos::ArrayView<const LO>& exportLIDs,
885  const Teuchos::ArrayView<const int>& sourcePIDs,
886  size_t& constantNumPackets) {
887  typedef typename CrsMatrix<ST, LO, GO, NT>::local_matrix_device_type local_matrix_device_type;
888  typedef typename DistObject<char, LO, GO, NT>::buffer_device_type buffer_device_type;
889  typedef typename Kokkos::DualView<char*, buffer_device_type>::t_host::execution_space host_exec_space;
890  typedef Kokkos::Device<host_exec_space, Kokkos::HostSpace> host_dev_type;
891 
892  typename local_matrix_device_type::device_type outputDevice;
893  typedef typename NT::execution_space execution_space;
894 
895  const bool verbose = ::Tpetra::Details::Behavior::verbose();
896  std::unique_ptr<std::string> prefix;
897  if (verbose) {
898  const int myRank = [&]() {
899  auto map = sourceMatrix.getMap();
900  if (map.get() == nullptr) {
901  return -1;
902  }
903  auto comm = map->getComm();
904  if (comm.get() == nullptr) {
905  return -2;
906  }
907  return comm->getRank();
908  }();
909  std::ostringstream os;
910  os << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs: ";
911  prefix = std::unique_ptr<std::string>(new std::string(os.str()));
912 
913  std::ostringstream os2;
914  os2 << *prefix << "start" << std::endl;
915  std::cerr << os2.str();
916  }
917 
918  // Convert all Teuchos::Array to Kokkos::View
919 
920  // This is an output array, so we don't have to copy to device here.
921  // However, we'll have to remember to copy back to host when done.
922  auto num_packets_per_lid_d =
923  create_mirror_view_from_raw_host_array(buffer_device_type(),
924  numPacketsPerLID.getRawPtr(),
925  numPacketsPerLID.size(), false,
926  "num_packets_per_lid");
927 
928  // This is an input array, so we have to copy to device here.
929  // However, we never need to copy it back to host.
930  auto export_lids_d =
931  create_mirror_view_from_raw_host_array(buffer_device_type(),
932  exportLIDs.getRawPtr(),
933  exportLIDs.size(), true,
934  "export_lids");
935  // This is an input array, so we have to copy to device here.
936  // However, we never need to copy it back to host.
937  auto export_pids_d =
939  sourcePIDs.getRawPtr(),
940  sourcePIDs.size(), true,
941  "export_pids");
942  constexpr bool pack_pids = true;
943  try {
944  PackCrsMatrixImpl::packCrsMatrix(sourceMatrix, exports_dv, num_packets_per_lid_d, export_lids_d,
945  export_pids_d, constantNumPackets, pack_pids);
946  } catch (std::exception& e) {
947  if (verbose) {
948  std::ostringstream os;
949  os << *prefix << "PackCrsMatrixImpl::packCrsMatrix threw: "
950  << e.what() << std::endl;
951  std::cerr << os.str();
952  }
953  throw;
954  } catch (...) {
955  if (verbose) {
956  std::ostringstream os;
957  os << *prefix << "PackCrsMatrixImpl::packCrsMatrix threw an exception "
958  "not a subclass of std::exception"
959  << std::endl;
960  std::cerr << os.str();
961  }
962  throw;
963  }
964 
965  if (numPacketsPerLID.size() != 0) {
966  try {
967  // The counts are an output of PackCrsMatrixImpl::packCrsMatrix,
968  // so we have to copy them back to host.
969  Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h(numPacketsPerLID.getRawPtr(), numPacketsPerLID.size());
970  // DEEP_COPY REVIEW - DEVICE-TO-HOST
971  Kokkos::deep_copy(execution_space(), num_packets_per_lid_h, num_packets_per_lid_d);
972  } catch (std::exception& e) {
973  if (verbose) {
974  std::ostringstream os;
975  os << *prefix << "Kokkos::deep_copy threw: " << e.what() << std::endl;
976  std::cerr << os.str();
977  }
978  throw;
979  } catch (...) {
980  if (verbose) {
981  std::ostringstream os;
982  os << *prefix << "Kokkos::deep_copy threw an exception not a subclass "
983  "of std::exception"
984  << std::endl;
985  std::cerr << os.str();
986  }
987  throw;
988  }
989  }
990 
991  if (verbose) {
992  std::ostringstream os;
993  os << *prefix << "done" << std::endl;
994  std::cerr << os.str();
995  }
996 }
997 
998 } // namespace Details
999 } // namespace Tpetra
1000 
1001 #define TPETRA_DETAILS_PACKCRSMATRIX_INSTANT(ST, LO, GO, NT) \
1002  template void \
1003  Details::packCrsMatrix<ST, LO, GO, NT>(const CrsMatrix<ST, LO, GO, NT>&, \
1004  Teuchos::Array<char>&, \
1005  const Teuchos::ArrayView<size_t>&, \
1006  const Teuchos::ArrayView<const LO>&, \
1007  size_t&); \
1008  template void \
1009  Details::packCrsMatrixNew<ST, LO, GO, NT>(const CrsMatrix<ST, LO, GO, NT>&, \
1010  Kokkos::DualView<char*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1011  const Kokkos::DualView<size_t*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1012  const Kokkos::DualView<const LO*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1013  size_t&); \
1014  template void \
1015  Details::packCrsMatrixWithOwningPIDs<ST, LO, GO, NT>(const CrsMatrix<ST, LO, GO, NT>&, \
1016  Kokkos::DualView<char*, DistObject<char, LO, GO, NT>::buffer_device_type>&, \
1017  const Teuchos::ArrayView<size_t>&, \
1018  const Teuchos::ArrayView<const LO>&, \
1019  const Teuchos::ArrayView<const int>&, \
1020  size_t&);
1021 
1022 #endif // TPETRA_DETAILS_PACKCRSMATRIX_DEF_HPP
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Import KokkosSparse::OrdinalTraits, a traits class for &quot;invalid&quot; (flag) values of integer types...
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
static KOKKOS_INLINE_FUNCTION Kokkos::pair< int, size_t > packArray(char outBuf[], const value_type inBuf[], const size_t numEnt)
Pack the first numEnt entries of the given input buffer of value_type, into the output buffer of byte...
Declaration of the Tpetra::CrsMatrix class.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const T &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Compute the number of packets and offsets for the pack procedure.
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for &quot;new&quot; DistObject inter...
static bool verbose()
Whether Tpetra is in verbose mode.
typename Node::device_type device_type
The Kokkos device type.
void packCrsMatrix(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Teuchos::Array< char > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const T &)
Number of bytes required to pack or unpack the given value of type value_type.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
Base class for distributed Tpetra objects that support data redistribution.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.