Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_packCrsGraph_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
11 #define TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
12 
13 #include "TpetraCore_config.h"
14 #include "Teuchos_Array.hpp"
15 #include "Teuchos_ArrayView.hpp"
22 #include "Tpetra_CrsGraph_decl.hpp"
23 #include <memory>
24 #include <string>
25 
47 
48 namespace Tpetra {
49 
50 //
51 // Users must never rely on anything in the Details namespace.
52 //
53 namespace Details {
54 
55 namespace PackCrsGraphImpl {
63 template <class OutputOffsetsViewType,
64  class CountsViewType,
65  class InputOffsetsViewType,
66  class InputLocalRowIndicesViewType,
67  class InputLocalRowPidsViewType,
68  const bool debug =
69 #ifdef HAVE_TPETRA_DEBUG
70  true
71 #else
72  false
73 #endif // HAVE_TPETRA_DEBUG
74  >
76  public:
77  typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type;
78  typedef typename CountsViewType::non_const_value_type count_type;
79  typedef typename InputOffsetsViewType::non_const_value_type input_offset_type;
80  typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type;
81  typedef typename InputLocalRowPidsViewType::non_const_value_type local_row_pid_type;
82  // output Views drive where execution happens.
83  typedef typename OutputOffsetsViewType::device_type device_type;
84  static_assert(std::is_same<typename CountsViewType::device_type::execution_space,
85  typename device_type::execution_space>::value,
86  "OutputOffsetsViewType and CountsViewType must have the same execution space.");
87  static_assert(Kokkos::is_view<OutputOffsetsViewType>::value,
88  "OutputOffsetsViewType must be a Kokkos::View.");
89  static_assert(std::is_same<typename OutputOffsetsViewType::value_type, output_offset_type>::value,
90  "OutputOffsetsViewType must be a nonconst Kokkos::View.");
91  static_assert(std::is_integral<output_offset_type>::value,
92  "The type of each entry of OutputOffsetsViewType must be a built-in integer type.");
93  static_assert(Kokkos::is_view<CountsViewType>::value,
94  "CountsViewType must be a Kokkos::View.");
95  static_assert(std::is_same<typename CountsViewType::value_type, output_offset_type>::value,
96  "CountsViewType must be a nonconst Kokkos::View.");
97  static_assert(std::is_integral<count_type>::value,
98  "The type of each entry of CountsViewType must be a built-in integer type.");
99  static_assert(Kokkos::is_view<InputOffsetsViewType>::value,
100  "InputOffsetsViewType must be a Kokkos::View.");
101  static_assert(std::is_integral<input_offset_type>::value,
102  "The type of each entry of InputOffsetsViewType must be a built-in integer type.");
103  static_assert(Kokkos::is_view<InputLocalRowIndicesViewType>::value,
104  "InputLocalRowIndicesViewType must be a Kokkos::View.");
105  static_assert(std::is_integral<local_row_index_type>::value,
106  "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type.");
107 
108  NumPacketsAndOffsetsFunctor(const OutputOffsetsViewType& outputOffsets,
109  const CountsViewType& counts,
110  const InputOffsetsViewType& rowOffsets,
111  const InputLocalRowIndicesViewType& lclRowInds,
112  const InputLocalRowPidsViewType& lclRowPids)
113  : outputOffsets_(outputOffsets)
114  , counts_(counts)
115  , rowOffsets_(rowOffsets)
116  , lclRowInds_(lclRowInds)
117  , lclRowPids_(lclRowPids)
118  , error_("error") // don't forget this, or you'll get segfaults!
119  {
120  if (debug) {
121  const size_t numRowsToPack = static_cast<size_t>(lclRowInds_.extent(0));
122 
123  if (numRowsToPack != static_cast<size_t>(counts_.extent(0))) {
124  std::ostringstream os;
125  os << "lclRowInds.extent(0) = " << numRowsToPack
126  << " != counts.extent(0) = " << counts_.extent(0)
127  << ".";
128  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str());
129  }
130  if (static_cast<size_t>(numRowsToPack + 1) !=
131  static_cast<size_t>(outputOffsets_.extent(0))) {
132  std::ostringstream os;
133  os << "lclRowInds.extent(0) + 1 = " << (numRowsToPack + 1)
134  << " != outputOffsets.extent(0) = " << outputOffsets_.extent(0)
135  << ".";
136  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str());
137  }
138  }
139  }
140 
141  KOKKOS_INLINE_FUNCTION void
142  operator()(const local_row_index_type& curInd,
143  output_offset_type& update,
144  const bool final) const {
145  if (debug) {
146  if (curInd < static_cast<local_row_index_type>(0)) {
147  error_() = 1;
148  return;
149  }
150  }
151 
152  if (final) {
153  if (debug) {
154  if (curInd >= static_cast<local_row_index_type>(outputOffsets_.extent(0))) {
155  error_() = 2;
156  return;
157  }
158  }
159  outputOffsets_(curInd) = update;
160  }
161 
162  if (curInd < static_cast<local_row_index_type>(counts_.extent(0))) {
163  const auto lclRow = lclRowInds_(curInd);
164  if (static_cast<size_t>(lclRow + 1) >= static_cast<size_t>(rowOffsets_.extent(0)) ||
165  static_cast<local_row_index_type>(lclRow) < static_cast<local_row_index_type>(0)) {
166  error_() = 3;
167  return;
168  }
169  // count_type could differ from the type of each row offset.
170  // For example, row offsets might each be 64 bits, but if their
171  // difference always fits in 32 bits, we may then safely use a
172  // 32-bit count_type.
173  const count_type count =
174  static_cast<count_type>(rowOffsets_(lclRow + 1) - rowOffsets_(lclRow));
175 
176  // We pack first the global column indices and then pids (if any),
177  // However, if the number of entries in the row is zero, we pack nothing.
178  const count_type numEntToPack = (count == 0)
179  ? static_cast<count_type>(0)
180  : count * (1 + (lclRowPids_.size() > 0 ? 1 : 0));
181 
182  if (final) {
183  counts_(curInd) = numEntToPack;
184  }
185  update += numEntToPack;
186  }
187  }
188 
189  // mfh 31 May 2017: Don't need init or join. If you have join, MUST
190  // have join both with and without volatile! Otherwise intrawarp
191  // joins are really slow on GPUs.
192 
194  int getError() const {
195  auto error_h = Kokkos::create_mirror_view(error_);
196  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
197  // Note: In the UVM case, this would otherwise be a no-op
198  // and thus not fence, so the value might not be correct on return
199  // In the non-UVM case, create_mirror_view will block for the allocation
200  Kokkos::deep_copy(error_h, error_);
201 
202  return error_h();
203  }
204 
205  private:
206  OutputOffsetsViewType outputOffsets_;
207  CountsViewType counts_;
208  typename InputOffsetsViewType::const_type rowOffsets_;
209  typename InputLocalRowIndicesViewType::const_type lclRowInds_;
210  typename InputLocalRowPidsViewType::const_type lclRowPids_;
211  Kokkos::View<int, device_type> error_;
212 };
213 
223 template <class OutputOffsetsViewType,
224  class CountsViewType,
225  class InputOffsetsViewType,
226  class InputLocalRowIndicesViewType,
227  class InputLocalRowPidsViewType>
228 typename CountsViewType::non_const_value_type
229 computeNumPacketsAndOffsets(const OutputOffsetsViewType& outputOffsets,
230  const CountsViewType& counts,
231  const InputOffsetsViewType& rowOffsets,
232  const InputLocalRowIndicesViewType& lclRowInds,
233  const InputLocalRowPidsViewType& lclRowPids) {
234  typedef NumPacketsAndOffsetsFunctor<OutputOffsetsViewType,
235  CountsViewType, typename InputOffsetsViewType::const_type,
236  typename InputLocalRowIndicesViewType::const_type,
237  typename InputLocalRowPidsViewType::const_type>
238  functor_type;
239  typedef typename CountsViewType::non_const_value_type count_type;
240  typedef typename OutputOffsetsViewType::size_type size_type;
241  typedef typename OutputOffsetsViewType::execution_space execution_space;
242  typedef typename functor_type::local_row_index_type LO;
243  typedef Kokkos::RangePolicy<execution_space, LO> range_type;
244  const char prefix[] = "computeNumPacketsAndOffsets: ";
245 
246  count_type count = 0;
247  const count_type numRowsToPack = lclRowInds.extent(0);
248 
249  if (numRowsToPack == 0) {
250  return count;
251  } else {
252  TEUCHOS_TEST_FOR_EXCEPTION(rowOffsets.extent(0) <= static_cast<size_type>(1),
253  std::invalid_argument, prefix << "There is at least one row to pack, "
254  "but the graph has no rows. lclRowInds.extent(0) = "
255  << numRowsToPack << ", but rowOffsets.extent(0) = " << rowOffsets.extent(0) << " <= 1.");
256  TEUCHOS_TEST_FOR_EXCEPTION(outputOffsets.extent(0) !=
257  static_cast<size_type>(numRowsToPack + 1),
258  std::invalid_argument,
259  prefix << "Output dimension does not match number of rows to pack. "
260  << "outputOffsets.extent(0) = " << outputOffsets.extent(0)
261  << " != lclRowInds.extent(0) + 1 = "
262  << static_cast<size_type>(numRowsToPack + 1) << ".");
263  TEUCHOS_TEST_FOR_EXCEPTION(counts.extent(0) != numRowsToPack, std::invalid_argument,
264  prefix << "counts.extent(0) = " << counts.extent(0)
265  << " != numRowsToPack = " << numRowsToPack << ".");
266 
267  functor_type f(outputOffsets, counts, rowOffsets, lclRowInds, lclRowPids);
268  Kokkos::parallel_scan("Tpetra::Details::computeNumPacketsAndOffsets::scan", range_type(0, numRowsToPack + 1), f);
269 
270  // At least in debug mode, this functor checks for errors.
271  const int errCode = f.getError();
272  TEUCHOS_TEST_FOR_EXCEPTION(errCode != 0, std::runtime_error, prefix << "parallel_scan error code " << errCode << " != 0.");
273 
274 #if 0
275  size_t total = 0;
276  for (LO k = 0; k < numRowsToPack; ++k) {
277  total += counts[k];
278  }
279  if (outputOffsets(numRowsToPack) != total) {
280  if (errStr.get () == NULL) {
281  errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
282  }
283  std::ostringstream& os = *errStr;
284  os << prefix
285  << "outputOffsets(numRowsToPack=" << numRowsToPack << ") "
286  << outputOffsets(numRowsToPack) << " != sum of counts = "
287  << total << "." << std::endl;
288  if (numRowsToPack != 0) {
289  // Only print the array if it's not too long.
290  if (numRowsToPack < static_cast<LO> (10)) {
291  os << "outputOffsets: [";
292  for (LO i = 0; i <= numRowsToPack; ++i) {
293  os << outputOffsets(i);
294  if (static_cast<LO> (i + 1) <= numRowsToPack) {
295  os << ",";
296  }
297  }
298  os << "]" << std::endl;
299  os << "counts: [";
300  for (LO i = 0; i < numRowsToPack; ++i) {
301  os << counts(i);
302  if (static_cast<LO> (i + 1) < numRowsToPack) {
303  os << ",";
304  }
305  }
306  os << "]" << std::endl;
307  }
308  else {
309  os << "outputOffsets(" << (numRowsToPack-1) << ") = "
310  << outputOffsets(numRowsToPack-1) << "." << std::endl;
311  }
312  }
313  count = outputOffsets(numRowsToPack);
314  return {false, errStr};
315  }
316 #endif // HAVE_TPETRA_DEBUG
317 
318  // Get last entry of outputOffsets, which is the sum of the entries
319  // of counts. Don't assume UVM.
320  using Tpetra::Details::getEntryOnHost;
321  return static_cast<count_type>(getEntryOnHost(outputOffsets,
322  numRowsToPack));
323  }
324 }
325 
336 template <class Packet,
337  class LocalMapType,
338  class BufferDeviceType,
339  class InputLidsType,
340  class InputPidsType>
341 KOKKOS_FUNCTION
342  size_t
343  packRow(const LocalMapType& col_map,
344  const Kokkos::View<Packet*, BufferDeviceType>& exports,
345  const InputLidsType& lids_in,
346  const InputPidsType& pids_in,
347  const size_t offset,
348  const size_t num_ent,
349  const bool pack_pids) {
350  using LO = typename LocalMapType::local_ordinal_type;
351  using GO = typename LocalMapType::global_ordinal_type;
352 
353  if (num_ent == 0) {
354  // Empty rows always take zero bytes, to ensure sparsity.
355  return static_cast<size_t>(0);
356  }
357 
358  size_t num_ent_packed = num_ent;
359  if (pack_pids) {
360  num_ent_packed += num_ent;
361  }
362 
363  // Copy column indices one at a time, so that we don't need
364  // temporary storage.
365  for (size_t k = 0; k < num_ent; ++k) {
366  const LO lid = lids_in[k];
367  const GO gid = col_map.getGlobalElement(lid);
368  exports(offset + k) = gid;
369  }
370  // Copy PIDs one at a time, so that we don't need temporary storage.
371  if (pack_pids) {
372  for (size_t k = 0; k < num_ent; ++k) {
373  const LO lid = lids_in[k];
374  const int pid = pids_in[lid];
375  exports(offset + num_ent + k) = static_cast<GO>(pid);
376  }
377  }
378 
379  return num_ent_packed;
380 }
381 
382 template <class Packet,
383  class LocalGraph,
384  class LocalMap,
385  class BufferDeviceType>
386 struct PackCrsGraphFunctor {
387  using local_graph_type = LocalGraph;
388  using local_map_type = LocalMap;
389  using LO = typename local_map_type::local_ordinal_type;
390  using GO = typename local_map_type::global_ordinal_type;
391 
392  using num_packets_per_lid_view_type =
393  Kokkos::View<const size_t*, BufferDeviceType>;
394  using offsets_view_type = Kokkos::View<const size_t*, BufferDeviceType>;
395  using exports_view_type = Kokkos::View<Packet*, BufferDeviceType>;
396  using export_lids_view_type =
398  using source_pids_view_type =
400 
401  using count_type =
402  typename num_packets_per_lid_view_type::non_const_value_type;
403  using offset_type = typename offsets_view_type::non_const_value_type;
404  using value_type = Kokkos::pair<int, LO>;
405 
406  static_assert(std::is_same<LO, typename local_graph_type::data_type>::value,
407  "local_map_type::local_ordinal_type and "
408  "local_graph_type::data_type must be the same.");
409 
410  local_graph_type local_graph;
411  local_map_type local_col_map;
412  exports_view_type exports;
413  num_packets_per_lid_view_type num_packets_per_lid;
414  export_lids_view_type export_lids;
415  source_pids_view_type source_pids;
416  offsets_view_type offsets;
417  bool pack_pids;
418 
419  PackCrsGraphFunctor(const local_graph_type& local_graph_in,
420  const local_map_type& local_col_map_in,
421  const exports_view_type& exports_in,
422  const num_packets_per_lid_view_type& num_packets_per_lid_in,
423  const export_lids_view_type& export_lids_in,
424  const source_pids_view_type& source_pids_in,
425  const offsets_view_type& offsets_in,
426  const bool pack_pids_in)
427  : local_graph(local_graph_in)
428  , local_col_map(local_col_map_in)
429  , exports(exports_in)
430  , num_packets_per_lid(num_packets_per_lid_in)
431  , export_lids(export_lids_in)
432  , source_pids(source_pids_in)
433  , offsets(offsets_in)
434  , pack_pids(pack_pids_in) {
435  const LO numRows = local_graph_in.numRows();
436  const LO rowMapDim =
437  static_cast<LO>(local_graph.row_map.extent(0));
438  TEUCHOS_TEST_FOR_EXCEPTION(numRows != 0 && rowMapDim != numRows + static_cast<LO>(1),
439  std::logic_error, "local_graph.row_map.extent(0) = " << rowMapDim << " != numRows (= " << numRows << " ) + 1.");
440  }
441 
442  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const {
443  using ::Tpetra::Details::OrdinalTraits;
444  dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
445  }
446 
447  KOKKOS_INLINE_FUNCTION void
448  join(value_type& dst, const value_type& src) const {
449  // `dst` should reflect the first (least) bad index and all other
450  // associated error codes and data, so prefer keeping it.
451  if (src.first != 0 && dst.first == 0) {
452  dst = src;
453  }
454  }
455 
456  KOKKOS_INLINE_FUNCTION
457  void operator()(const LO i, value_type& dst) const {
458  const size_t offset = offsets[i];
459  const LO export_lid = export_lids[i];
460  const size_t buf_size = exports.size();
461  const size_t num_packets_this_lid = num_packets_per_lid(i);
462  const size_t num_ent =
463  static_cast<size_t>(local_graph.row_map[export_lid + 1] - local_graph.row_map[export_lid]);
464 
465  // Only pack this row's data if it has a nonzero number of
466  // entries. We can do this because receiving processes get the
467  // number of packets, and will know that zero packets means zero
468  // entries.
469  if (num_ent == 0) {
470  return;
471  }
472 
473  if (export_lid >= static_cast<LO>(local_graph.numRows())) {
474  if (dst.first != 0) { // keep only the first error
475  dst = Kokkos::make_pair(1, i); // invalid row
476  }
477  return;
478  } else if ((offset > buf_size || offset + num_packets_this_lid > buf_size)) {
479  if (dst.first != 0) { // keep only the first error
480  dst = Kokkos::make_pair(2, i); // out of bounds
481  }
482  return;
483  }
484 
485  // We can now pack this row
486 
487  // Since the graph is locally indexed on the calling process, we
488  // have to use its column Map (which it _must_ have in this case)
489  // to convert to global indices.
490  const auto row_beg = local_graph.row_map[export_lid];
491  const auto row_end = local_graph.row_map[export_lid + 1];
492  auto lids_in = Kokkos::subview(local_graph.entries,
493  Kokkos::make_pair(row_beg, row_end));
494  size_t num_ent_packed_this_row =
495  packRow(local_col_map, exports, lids_in,
496  source_pids, offset, num_ent, pack_pids);
497  if (num_ent_packed_this_row != num_packets_this_lid) {
498  if (dst.first != 0) { // keep only the first error
499  dst = Kokkos::make_pair(3, i);
500  }
501  }
502  }
503 };
504 
512 template <class Packet,
513  class LocalGraph,
514  class LocalMap,
515  class BufferDeviceType>
516 void do_pack(const LocalGraph& local_graph,
517  const LocalMap& local_map,
518  const Kokkos::View<Packet*, BufferDeviceType>& exports,
519  const typename PackTraits<
520  size_t>::input_array_type& num_packets_per_lid,
521  const typename PackTraits<
522  typename LocalMap::local_ordinal_type>::input_array_type& export_lids,
523  const typename PackTraits<
524  int>::input_array_type& source_pids,
525  const Kokkos::View<const size_t*, BufferDeviceType>& offsets,
526  const bool pack_pids) {
527  using LO = typename LocalMap::local_ordinal_type;
528  using execution_space = typename LocalGraph::device_type::execution_space;
529  using range_type = Kokkos::RangePolicy<execution_space, LO>;
530  const char prefix[] = "Tpetra::Details::PackCrsGraphImpl::do_pack: ";
531 
532  if (export_lids.extent(0) != 0) {
533  TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(offsets.extent(0)) !=
534  static_cast<size_t>(export_lids.extent(0) + 1),
535  std::invalid_argument, prefix << "offsets.extent(0) = " << offsets.extent(0) << " != export_lids.extent(0) (= " << export_lids.extent(0) << ") + 1.");
536  TEUCHOS_TEST_FOR_EXCEPTION(export_lids.extent(0) != num_packets_per_lid.extent(0),
537  std::invalid_argument, prefix << "export_lids.extent(0) = " << export_lids.extent(0) << " != num_packets_per_lid.extent(0) = " << num_packets_per_lid.extent(0) << ".");
538  // If exports has nonzero length at this point, then the graph
539  // has at least one entry to pack. Thus, if packing process
540  // ranks, we had better have at least one process rank to pack.
541  TEUCHOS_TEST_FOR_EXCEPTION(pack_pids && exports.extent(0) != 0 &&
542  source_pids.extent(0) == 0,
543  std::invalid_argument, prefix << "pack_pids is true, and exports.extent(0) = " << exports.extent(0) << " != 0, meaning that we need to pack at "
544  "least one graph entry, but source_pids.extent(0) = 0.");
545  }
546 
547  using pack_functor_type =
548  PackCrsGraphFunctor<Packet, LocalGraph, LocalMap,
549  BufferDeviceType>;
550  pack_functor_type f(local_graph, local_map, exports,
551  num_packets_per_lid, export_lids,
552  source_pids, offsets, pack_pids);
553 
554  typename pack_functor_type::value_type result;
555  range_type range(0, num_packets_per_lid.extent(0));
556  Kokkos::parallel_reduce("Tpetra::Details::computeNumPacketsAndOffsets::reduce", range, f, result);
557 
558  if (result.first != 0) {
559  // We can't deep_copy from AnonymousSpace Views, so we can't
560  // print out any information from them in case of error.
561  std::ostringstream os;
562  if (result.first == 1) { // invalid local row index
563  os << "invalid local row index";
564  } else if (result.first == 2) { // invalid offset
565  os << "invalid offset";
566  }
567  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, prefix << "PackCrsGraphFunctor "
568  "reported error code "
569  << result.first << " (" << os.str() << ") for the first bad row " << result.second << ".");
570  }
571 }
572 
599 template <typename LO, typename GO, typename NT>
600 void packCrsGraph(const CrsGraph<LO, GO, NT>& sourceGraph,
601  Kokkos::DualView<
602  typename CrsGraph<LO, GO, NT>::packet_type*,
603  typename CrsGraph<LO, GO, NT>::buffer_device_type>& exports,
604  const Kokkos::View<
605  size_t*,
606  typename CrsGraph<LO, GO, NT>::buffer_device_type>& num_packets_per_lid,
607  const Kokkos::View<
608  const LO*,
609  typename CrsGraph<LO, GO, NT>::buffer_device_type>& export_lids,
610  const Kokkos::View<
611  const int*,
612  typename CrsGraph<LO, GO, NT>::buffer_device_type>& export_pids,
613  size_t& constant_num_packets,
614  const bool pack_pids) {
615  using Kokkos::View;
616  using crs_graph_type = CrsGraph<LO, GO, NT>;
617  using packet_type = typename crs_graph_type::packet_type;
618  using buffer_device_type = typename crs_graph_type::buffer_device_type;
619  using exports_view_type = Kokkos::DualView<packet_type*, buffer_device_type>;
620  using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
621  using local_map_type = typename Tpetra::Map<LO, GO, NT>::local_map_type;
622  const char prefix[] = "Tpetra::Details::packCrsGraph: ";
623  constexpr bool debug = false;
624 
625  local_graph_device_type local_graph = sourceGraph.getLocalGraphDevice();
626  local_map_type local_col_map = sourceGraph.getColMap()->getLocalMap();
627 
628  // Setting this to zero tells the caller to expect a possibly
629  // different ("nonconstant") number of packets per local index
630  // (i.e., a possibly different number of entries per row).
631  constant_num_packets = 0;
632 
633  const size_t num_export_lids(export_lids.extent(0));
634  TEUCHOS_TEST_FOR_EXCEPTION(num_export_lids != size_t(num_packets_per_lid.extent(0)),
635  std::invalid_argument, prefix << "num_export_lids.extent(0) = " << num_export_lids << " != num_packets_per_lid.extent(0) = " << num_packets_per_lid.extent(0) << ".");
636  if (num_export_lids != 0) {
637  TEUCHOS_TEST_FOR_EXCEPTION(num_packets_per_lid.data() == nullptr, std::invalid_argument,
638  prefix << "num_export_lids = " << num_export_lids << " != 0, but "
639  "num_packets_per_lid.data() = "
640  << num_packets_per_lid.data() << " == NULL.");
641  }
642 
643  if (num_export_lids == 0) {
644  exports = exports_view_type("exports", 0);
645  return;
646  }
647 
648  // Array of offsets into the pack buffer.
649  View<size_t*, buffer_device_type> offsets("offsets", num_export_lids + 1);
650 
651  // Compute number of packets per LID (row to send), as well as
652  // corresponding offsets (the prefix sum of the packet counts).
653  const size_t count =
654  computeNumPacketsAndOffsets(offsets, num_packets_per_lid,
655  local_graph.row_map, export_lids, export_pids);
656 
657  // Resize the output pack buffer if needed.
658  if (count > size_t(exports.extent(0))) {
659  exports = exports_view_type("exports", count);
660  if (debug) {
661  std::ostringstream os;
662  os << "*** exports resized to " << count << std::endl;
663  std::cerr << os.str();
664  }
665  }
666  if (debug) {
667  std::ostringstream os;
668  os << "*** count: " << count << ", exports.extent(0): "
669  << exports.extent(0) << std::endl;
670  std::cerr << os.str();
671  }
672 
673  // If exports has nonzero length at this point, then the graph has
674  // at least one entry to pack. Thus, if packing process ranks, we
675  // had better have at least one process rank to pack.
676  TEUCHOS_TEST_FOR_EXCEPTION(pack_pids && exports.extent(0) != 0 &&
677  export_pids.extent(0) == 0,
678  std::invalid_argument, prefix << "pack_pids is true, and exports.extent(0) = " << exports.extent(0) << " != 0, meaning that we need to pack at least "
679  "one graph entry, but export_pids.extent(0) = 0.");
680 
681  exports.modify_device();
682  auto exports_d = exports.view_device();
683  do_pack<packet_type, local_graph_device_type, local_map_type, buffer_device_type>(local_graph, local_col_map, exports_d, num_packets_per_lid,
684  export_lids, export_pids, offsets, pack_pids);
685  // If we got this far, we succeeded.
686 }
687 
688 } // namespace PackCrsGraphImpl
689 
690 template <typename LO, typename GO, typename NT>
691 void packCrsGraph(const CrsGraph<LO, GO, NT>& sourceGraph,
692  Teuchos::Array<typename CrsGraph<LO, GO, NT>::packet_type>& exports,
693  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
694  const Teuchos::ArrayView<const LO>& exportLIDs,
695  size_t& constantNumPackets) {
696  using Kokkos::HostSpace;
697  using Kokkos::MemoryUnmanaged;
698  using Kokkos::View;
699  using crs_graph_type = CrsGraph<LO, GO, NT>;
700  using packet_type = typename crs_graph_type::packet_type;
701  using BDT = typename crs_graph_type::buffer_device_type;
702 
703  // Convert all Teuchos::Array to Kokkos::View
704 
705  // This is an output array, so we don't have to copy to device here.
706  // However, we'll have to remember to copy back to host when done.
707  BDT outputDevice;
708  View<size_t*, BDT> num_packets_per_lid_d =
710  numPacketsPerLID.getRawPtr(),
711  numPacketsPerLID.size(), false,
712  "num_packets_per_lid");
713  // This is an input array, so we have to copy to device here.
714  // However, we never need to copy it back to host.
715  View<const LO*, BDT> export_lids_d =
717  exportLIDs.getRawPtr(),
718  exportLIDs.size(), true,
719  "export_lids");
720  View<const int*, BDT> export_pids_d;
721  Kokkos::DualView<packet_type*, BDT> exports_dv;
722  constexpr bool pack_pids = false;
723 
724  static_assert(std::is_same<
725  typename decltype(num_packets_per_lid_d)::non_const_value_type,
726  size_t>::value,
727  "num_packets_per_lid_d's non_const_value_type should be size_t.");
728  static_assert(std::is_same<
729  typename decltype(num_packets_per_lid_d)::device_type,
730  BDT>::value,
731  "num_packets_per_lid_d's BDT should be size_t.");
732  static_assert(std::is_same<
733  typename decltype(export_lids_d)::device_type,
734  BDT>::value,
735  "export_lids_d's device_type should be BDT.");
736  static_assert(std::is_same<
737  typename decltype(export_pids_d)::non_const_value_type,
738  int>::value,
739  "export_pids_d's non_const_value_type should be int.");
740  static_assert(std::is_same<
741  typename decltype(export_pids_d)::device_type,
742  BDT>::value,
743  "export_pids_d's device_type should be BDT.");
744 
745  PackCrsGraphImpl::packCrsGraph(sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d,
746  export_pids_d, constantNumPackets, pack_pids);
747 
748  // The counts are an output of packCrsGraph, so we have to copy
749  // them back to host.
750  View<size_t*, HostSpace, MemoryUnmanaged>
751  num_packets_per_lid_h(numPacketsPerLID.getRawPtr(),
752  numPacketsPerLID.size());
753 
754  // DEEP_COPY REVIEW - DEVICE-TO-HOST
755  using execution_space = typename BDT::execution_space;
756  Kokkos::deep_copy(execution_space(), num_packets_per_lid_h, num_packets_per_lid_d);
757 
758  // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for
759  // exports_dv above, then we have two host copies for exports_h.
760 
761  // The exports are an output of packCrsGraph, so we have to
762  // copy them back to host.
763  if (static_cast<size_t>(exports.size()) !=
764  static_cast<size_t>(exports_dv.extent(0))) {
765  exports.resize(exports_dv.extent(0));
766  }
767  View<packet_type*, HostSpace, MemoryUnmanaged>
768  exports_h(exports.getRawPtr(), exports.size());
769  // DEEP_COPY REVIEW - DEVICE-TO-HOST
770  Kokkos::deep_copy(execution_space(), exports_h, exports_dv.view_device());
771  execution_space().fence();
772 }
773 
// Pack the requested rows of sourceGraph into `exports`, taking all
// buffers as Kokkos::DualView and working on their device views.
//
// Steps: validate that export_lids and num_packets_per_lid have the
// same length; return an empty `exports` if there are no rows; compute
// per-row packet counts and offsets; grow `exports` if it is too small;
// then call PackCrsGraphImpl::do_pack.
//
// constant_num_packets is always set to zero.
// Throws std::invalid_argument on inconsistent input (see the
// TEUCHOS_TEST_FOR_EXCEPTION checks below).
//
// NOTE(review): this listing is missing the lines that give the
// element type of `exports` and the closing part of the type of
// `num_packets_per_lid` (listing lines 785 and 789) -- confirm against
// the real header before editing the signature.
776 template <typename LO, typename GO, typename NT>
777 void packCrsGraphNew(const CrsGraph<LO, GO, NT>& sourceGraph,
778 const Kokkos::DualView<
779 const LO*,
780 typename CrsGraph<LO, GO, NT>::buffer_device_type>& export_lids,
781 const Kokkos::DualView<
782 const int*,
783 typename CrsGraph<LO, GO, NT>::buffer_device_type>& export_pids,
784 Kokkos::DualView<
786 typename CrsGraph<LO, GO, NT>::buffer_device_type>& exports,
787 Kokkos::DualView<
788 size_t*,
790 num_packets_per_lid,
791 size_t& constant_num_packets,
792 const bool pack_pids) {
793 using Kokkos::View;
794 using crs_graph_type = CrsGraph<LO, GO, NT>;
795 using BDT = typename crs_graph_type::buffer_device_type;
796 using PT = typename crs_graph_type::packet_type;
797 using exports_dual_view_type = Kokkos::DualView<PT*, BDT>;
798 using LGT = typename crs_graph_type::local_graph_device_type;
799 using LMT = typename crs_graph_type::map_type::local_map_type;
800 const char prefix[] = "Tpetra::Details::packCrsGraphNew: ";
801 
802 const LGT local_graph = sourceGraph.getLocalGraphDevice();
803 const LMT local_col_map = sourceGraph.getColMap()->getLocalMap();
804 
805 // Setting this to zero tells the caller to expect a possibly
806 // different ("nonconstant") number of packets per local index
807 // (i.e., a possibly different number of entries per row).
808 constant_num_packets = 0;
809 
// One count per row to export is required; a null device pointer is
// only acceptable when there are no rows.
810 const size_t num_export_lids =
811 static_cast<size_t>(export_lids.extent(0));
812 TEUCHOS_TEST_FOR_EXCEPTION(num_export_lids !=
813 static_cast<size_t>(num_packets_per_lid.extent(0)),
814 std::invalid_argument, prefix << "num_export_lids.extent(0) = " << num_export_lids << " != num_packets_per_lid.extent(0) = " << num_packets_per_lid.extent(0) << ".");
815 TEUCHOS_TEST_FOR_EXCEPTION(num_export_lids != 0 &&
816 num_packets_per_lid.view_device().data() == nullptr,
817 std::invalid_argument, prefix << "num_export_lids = " << num_export_lids << " != 0, but num_packets_per_lid.view_device().data() = nullptr.");
818 
// Nothing to pack: replace exports with a default-constructed DualView.
819 if (num_export_lids == 0) {
820 exports = exports_dual_view_type();
821 return;
822 }
823 
824 // Array of offsets into the pack buffer.
825 using offsets_type = Kokkos::View<size_t*, BDT>;
826 offsets_type offsets("offsets", num_export_lids + 1);
827 
828 // Compute number of packets per LID (row to send), as well as
829 // corresponding offsets (the prefix sum of the packet counts).
// The counts are about to be overwritten on device, so discard any
// previous sync state and mark the device view as modified.
830 num_packets_per_lid.clear_sync_state();
831 num_packets_per_lid.modify_device();
832 using PackCrsGraphImpl::computeNumPacketsAndOffsets;
833 const size_t count =
834 computeNumPacketsAndOffsets(offsets, num_packets_per_lid.view_device(),
835 local_graph.row_map,
836 export_lids.view_device(),
837 export_pids.view_device());
838 
839 // Resize the output pack buffer if needed.
840 if (count > static_cast<size_t>(exports.extent(0))) {
841 exports = exports_dual_view_type("exports", count);
842 }
843 
844 // If exports has nonzero length at this point, then the graph has
845 // at least one entry to pack. Thus, if packing process ranks, we
846 // had better have at least one process rank to pack.
847 TEUCHOS_TEST_FOR_EXCEPTION(pack_pids && exports.extent(0) != 0 &&
848 export_pids.extent(0) == 0,
849 std::invalid_argument, prefix << "pack_pids is true, and exports.extent(0) = " << exports.extent(0) << " != 0, meaning that we need to pack at least "
850 "one graph entry, but export_pids.extent(0) = 0.");
851 
// The packing kernel writes the device view of exports.
852 exports.modify_device();
853 using PackCrsGraphImpl::do_pack;
854 do_pack<PT, LGT, LMT, BDT>(local_graph, local_col_map,
855 exports.view_device(),
856 num_packets_per_lid.view_device(),
857 export_lids.view_device(),
858 export_pids.view_device(),
859 offsets, pack_pids);
860 }
861 
862 template <typename LO, typename GO, typename NT>
864  Kokkos::DualView<
866  typename CrsGraph<LO, GO, NT>::buffer_device_type>& exports_dv,
867  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
868  const Teuchos::ArrayView<const LO>& exportLIDs,
869  const Teuchos::ArrayView<const int>& sourcePIDs,
870  size_t& constantNumPackets) {
871  using Kokkos::HostSpace;
872  using Kokkos::MemoryUnmanaged;
873  using Kokkos::View;
874  using crs_graph_type = CrsGraph<LO, GO, NT>;
875  using buffer_device_type = typename crs_graph_type::buffer_device_type;
876 
877  // Convert all Teuchos::Array to Kokkos::View
878 
879  // This is an output array, so we don't have to copy to device here.
880  // However, we'll have to remember to copy back to host when done.
881  View<size_t*, buffer_device_type> num_packets_per_lid_d =
882  create_mirror_view_from_raw_host_array(buffer_device_type(),
883  numPacketsPerLID.getRawPtr(),
884  numPacketsPerLID.size(), false,
885  "num_packets_per_lid");
886 
887  // This is an input array, so we have to copy to device here.
888  // However, we never need to copy it back to host.
889  View<const LO*, buffer_device_type> export_lids_d =
890  create_mirror_view_from_raw_host_array(buffer_device_type(),
891  exportLIDs.getRawPtr(),
892  exportLIDs.size(), true,
893  "export_lids");
894  // This is an input array, so we have to copy to device here.
895  // However, we never need to copy it back to host.
896  View<const int*, buffer_device_type> export_pids_d =
897  create_mirror_view_from_raw_host_array(buffer_device_type(),
898  sourcePIDs.getRawPtr(),
899  sourcePIDs.size(), true,
900  "export_pids");
901  constexpr bool pack_pids = true;
902  PackCrsGraphImpl::packCrsGraph(sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d,
903  export_pids_d, constantNumPackets, pack_pids);
904 
905  // The counts are an output of packCrsGraph, so we
906  // have to copy them back to host.
907  View<size_t*, HostSpace, MemoryUnmanaged> num_packets_per_lid_h(numPacketsPerLID.getRawPtr(), numPacketsPerLID.size());
908  // DEEP_COPY REVIEW - DEVICE-TO-HOST
909  using execution_space = typename buffer_device_type::execution_space;
910  Kokkos::deep_copy(execution_space(),
911  num_packets_per_lid_h, num_packets_per_lid_d);
912  execution_space().fence();
913 }
914 
915 } // namespace Details
916 } // namespace Tpetra
917 
// Explicit-instantiation macro for the three pack entry points defined in
// this file: Details::packCrsGraph, Details::packCrsGraphNew, and
// Details::packCrsGraphWithOwningPIDs, for one (LO, GO, NT) combination.
//
// NOTE(review): the expansion names `CrsGraph` unqualified and the functions
// as `Details::...`, so it presumably has to be expanded where those names
// resolve (e.g. inside namespace Tpetra) -- confirm against the files that
// invoke it.
#define TPETRA_DETAILS_PACKCRSGRAPH_INSTANT(LO, GO, NT)                  \
  template void                                                          \
  Details::packCrsGraph<LO, GO, NT>(                                     \
      const CrsGraph<LO, GO, NT>&,                                       \
      Teuchos::Array<CrsGraph<LO, GO, NT>::packet_type>&,                \
      const Teuchos::ArrayView<size_t>&,                                 \
      const Teuchos::ArrayView<const LO>&,                               \
      size_t&);                                                          \
  template void                                                          \
  Details::packCrsGraphNew<LO, GO, NT>(                                  \
      const CrsGraph<LO, GO, NT>&,                                       \
      const Kokkos::DualView<                                            \
          const LO*,                                                     \
          CrsGraph<LO, GO, NT>::buffer_device_type>&,                    \
      const Kokkos::DualView<                                            \
          const int*,                                                    \
          CrsGraph<LO, GO, NT>::buffer_device_type>&,                    \
      Kokkos::DualView<                                                  \
          CrsGraph<LO, GO, NT>::packet_type*,                            \
          CrsGraph<LO, GO, NT>::buffer_device_type>&,                    \
      Kokkos::DualView<                                                  \
          size_t*,                                                       \
          CrsGraph<LO, GO, NT>::buffer_device_type>,                     \
      size_t&,                                                           \
      const bool);                                                       \
  template void                                                          \
  Details::packCrsGraphWithOwningPIDs<LO, GO, NT>(                       \
      const CrsGraph<LO, GO, NT>&,                                       \
      Kokkos::DualView<CrsGraph<LO, GO, NT>::packet_type*,               \
                       CrsGraph<LO, GO, NT>::buffer_device_type>&,       \
      const Teuchos::ArrayView<size_t>&,                                 \
      const Teuchos::ArrayView<const LO>&,                               \
      const Teuchos::ArrayView<const int>&,                              \
      size_t&);
951 
952 #endif // TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
void packCrsGraphWithOwningPIDs(const CrsGraph< LO, GO, NT > &sourceGraph, Kokkos::DualView< typename CrsGraph< LO, GO, NT >::packet_type *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse graph for communication.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
void packCrsGraph(const CrsGraph< LO, GO, NT > &sourceGraph, Teuchos::Array< typename CrsGraph< LO, GO, NT >::packet_type > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse graph for communication.
Declaration of the Tpetra::CrsGraph class.
"Local" part of Map suitable for Kokkos kernels.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Compute the number of packets and offsets for the pack procedure.
void packCrsGraphNew(const CrsGraph< LO, GO, NT > &sourceGraph, const Kokkos::DualView< const LO *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exportLIDs, const Kokkos::DualView< const int *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exportPIDs, Kokkos::DualView< typename CrsGraph< LO, GO, NT >::packet_type *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exports, Kokkos::DualView< size_t *, typename CrsGraph< LO, GO, NT >::buffer_device_type > numPacketsPerLID, size_t &constantNumPackets, const bool pack_pids)
Pack specified entries of the given local sparse graph for communication, for "new" DistObject interface.
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declaration and definition of Tpetra::Details::getEntryOnHost.
local_graph_device_type getLocalGraphDevice() const
Get the local graph.
global_ordinal_type packet_type
Type of each entry of the DistObject communication buffer.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.