Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_packCrsGraph_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
11 #define TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
12 
13 #include "TpetraCore_config.h"
14 #include "Teuchos_Array.hpp"
15 #include "Teuchos_ArrayView.hpp"
22 #include "Tpetra_CrsGraph_decl.hpp"
23 #include <memory>
24 #include <string>
25 
47 
48 namespace Tpetra {
49 
50 //
51 // Users must never rely on anything in the Details namespace.
52 //
53 namespace Details {
54 
55 namespace PackCrsGraphImpl {
63 template<class OutputOffsetsViewType,
64  class CountsViewType,
65  class InputOffsetsViewType,
66  class InputLocalRowIndicesViewType,
67  class InputLocalRowPidsViewType,
68  const bool debug =
69 #ifdef HAVE_TPETRA_DEBUG
70  true
71 #else
72  false
73 #endif // HAVE_TPETRA_DEBUG
74  >
76 public:
77  typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type;
78  typedef typename CountsViewType::non_const_value_type count_type;
79  typedef typename InputOffsetsViewType::non_const_value_type input_offset_type;
80  typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type;
81  typedef typename InputLocalRowPidsViewType::non_const_value_type local_row_pid_type;
82  // output Views drive where execution happens.
83  typedef typename OutputOffsetsViewType::device_type device_type;
84  static_assert (std::is_same<typename CountsViewType::device_type::execution_space,
85  typename device_type::execution_space>::value,
86  "OutputOffsetsViewType and CountsViewType must have the same execution space.");
87  static_assert (Kokkos::is_view<OutputOffsetsViewType>::value,
88  "OutputOffsetsViewType must be a Kokkos::View.");
89  static_assert (std::is_same<typename OutputOffsetsViewType::value_type, output_offset_type>::value,
90  "OutputOffsetsViewType must be a nonconst Kokkos::View.");
91  static_assert (std::is_integral<output_offset_type>::value,
92  "The type of each entry of OutputOffsetsViewType must be a built-in integer type.");
93  static_assert (Kokkos::is_view<CountsViewType>::value,
94  "CountsViewType must be a Kokkos::View.");
95  static_assert (std::is_same<typename CountsViewType::value_type, output_offset_type>::value,
96  "CountsViewType must be a nonconst Kokkos::View.");
97  static_assert (std::is_integral<count_type>::value,
98  "The type of each entry of CountsViewType must be a built-in integer type.");
99  static_assert (Kokkos::is_view<InputOffsetsViewType>::value,
100  "InputOffsetsViewType must be a Kokkos::View.");
101  static_assert (std::is_integral<input_offset_type>::value,
102  "The type of each entry of InputOffsetsViewType must be a built-in integer type.");
103  static_assert (Kokkos::is_view<InputLocalRowIndicesViewType>::value,
104  "InputLocalRowIndicesViewType must be a Kokkos::View.");
105  static_assert (std::is_integral<local_row_index_type>::value,
106  "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type.");
107 
108  NumPacketsAndOffsetsFunctor(const OutputOffsetsViewType& outputOffsets,
109  const CountsViewType& counts,
110  const InputOffsetsViewType& rowOffsets,
111  const InputLocalRowIndicesViewType& lclRowInds,
112  const InputLocalRowPidsViewType& lclRowPids) :
113  outputOffsets_ (outputOffsets),
114  counts_ (counts),
115  rowOffsets_ (rowOffsets),
116  lclRowInds_ (lclRowInds),
117  lclRowPids_ (lclRowPids),
118  error_ ("error") // don't forget this, or you'll get segfaults!
119  {
120  if (debug) {
121  const size_t numRowsToPack = static_cast<size_t> (lclRowInds_.extent (0));
122 
123  if (numRowsToPack != static_cast<size_t> (counts_.extent (0))) {
124  std::ostringstream os;
125  os << "lclRowInds.extent(0) = " << numRowsToPack
126  << " != counts.extent(0) = " << counts_.extent (0)
127  << ".";
128  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
129  }
130  if (static_cast<size_t> (numRowsToPack + 1) !=
131  static_cast<size_t> (outputOffsets_.extent (0))) {
132  std::ostringstream os;
133  os << "lclRowInds.extent(0) + 1 = " << (numRowsToPack + 1)
134  << " != outputOffsets.extent(0) = " << outputOffsets_.extent (0)
135  << ".";
136  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
137  }
138  }
139  }
140 
141  KOKKOS_INLINE_FUNCTION void
142  operator() (const local_row_index_type& curInd,
143  output_offset_type& update,
144  const bool final) const
145  {
146  if (debug) {
147  if (curInd < static_cast<local_row_index_type> (0)) {
148  error_ () = 1;
149  return;
150  }
151  }
152 
153  if (final) {
154  if (debug) {
155  if (curInd >= static_cast<local_row_index_type> (outputOffsets_.extent (0))) {
156  error_ () = 2;
157  return;
158  }
159  }
160  outputOffsets_(curInd) = update;
161  }
162 
163  if (curInd < static_cast<local_row_index_type> (counts_.extent (0))) {
164  const auto lclRow = lclRowInds_(curInd);
165  if (static_cast<size_t> (lclRow + 1) >= static_cast<size_t> (rowOffsets_.extent (0)) ||
166  static_cast<local_row_index_type> (lclRow) < static_cast<local_row_index_type> (0)) {
167  error_ () = 3;
168  return;
169  }
170  // count_type could differ from the type of each row offset.
171  // For example, row offsets might each be 64 bits, but if their
172  // difference always fits in 32 bits, we may then safely use a
173  // 32-bit count_type.
174  const count_type count =
175  static_cast<count_type> (rowOffsets_(lclRow+1) - rowOffsets_(lclRow));
176 
177  // We pack first the global column indices and then pids (if any),
178  // However, if the number of entries in the row is zero, we pack nothing.
179  const count_type numEntToPack = (count == 0)
180  ? static_cast<count_type>(0)
181  : count * (1 + (lclRowPids_.size() > 0 ? 1 : 0));
182 
183  if (final) {
184  counts_(curInd) = numEntToPack;
185  }
186  update += numEntToPack;
187  }
188  }
189 
190  // mfh 31 May 2017: Don't need init or join. If you have join, MUST
191  // have join both with and without volatile! Otherwise intrawarp
192  // joins are really slow on GPUs.
193 
195  int getError () const {
196  auto error_h = Kokkos::create_mirror_view (error_);
197  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
198  // Note: In the UVM case, this would otherwise be a no-op
199  // and thus not fence, so the value might not be correct on return
200  // In the non-UVM case, create_mirror_view will block for the allocation
201  Kokkos::deep_copy (error_h, error_);
202 
203  return error_h ();
204  }
205 
206 private:
207  OutputOffsetsViewType outputOffsets_;
208  CountsViewType counts_;
209  typename InputOffsetsViewType::const_type rowOffsets_;
210  typename InputLocalRowIndicesViewType::const_type lclRowInds_;
211  typename InputLocalRowPidsViewType::const_type lclRowPids_;
212  Kokkos::View<int, device_type> error_;
213 };
214 
224 template<class OutputOffsetsViewType,
225  class CountsViewType,
226  class InputOffsetsViewType,
227  class InputLocalRowIndicesViewType,
228  class InputLocalRowPidsViewType>
229 typename CountsViewType::non_const_value_type
230 computeNumPacketsAndOffsets(const OutputOffsetsViewType& outputOffsets,
231  const CountsViewType& counts,
232  const InputOffsetsViewType& rowOffsets,
233  const InputLocalRowIndicesViewType& lclRowInds,
234  const InputLocalRowPidsViewType& lclRowPids)
235 {
236  typedef NumPacketsAndOffsetsFunctor<OutputOffsetsViewType,
237  CountsViewType, typename InputOffsetsViewType::const_type,
238  typename InputLocalRowIndicesViewType::const_type,
239  typename InputLocalRowPidsViewType::const_type> functor_type;
240  typedef typename CountsViewType::non_const_value_type count_type;
241  typedef typename OutputOffsetsViewType::size_type size_type;
242  typedef typename OutputOffsetsViewType::execution_space execution_space;
243  typedef typename functor_type::local_row_index_type LO;
244  typedef Kokkos::RangePolicy<execution_space, LO> range_type;
245  const char prefix[] = "computeNumPacketsAndOffsets: ";
246 
247  count_type count = 0;
248  const count_type numRowsToPack = lclRowInds.extent (0);
249 
250  if (numRowsToPack == 0) {
251  return count;
252  }
253  else {
254  TEUCHOS_TEST_FOR_EXCEPTION
255  (rowOffsets.extent (0) <= static_cast<size_type> (1),
256  std::invalid_argument, prefix << "There is at least one row to pack, "
257  "but the graph has no rows. lclRowInds.extent(0) = " <<
258  numRowsToPack << ", but rowOffsets.extent(0) = " <<
259  rowOffsets.extent (0) << " <= 1.");
260  TEUCHOS_TEST_FOR_EXCEPTION
261  (outputOffsets.extent (0) !=
262  static_cast<size_type> (numRowsToPack + 1), std::invalid_argument,
263  prefix << "Output dimension does not match number of rows to pack. "
264  << "outputOffsets.extent(0) = " << outputOffsets.extent (0)
265  << " != lclRowInds.extent(0) + 1 = "
266  << static_cast<size_type> (numRowsToPack + 1) << ".");
267  TEUCHOS_TEST_FOR_EXCEPTION
268  (counts.extent (0) != numRowsToPack, std::invalid_argument,
269  prefix << "counts.extent(0) = " << counts.extent (0)
270  << " != numRowsToPack = " << numRowsToPack << ".");
271 
272  functor_type f (outputOffsets, counts, rowOffsets, lclRowInds, lclRowPids);
273  Kokkos::parallel_scan ("Tpetra::Details::computeNumPacketsAndOffsets::scan", range_type (0, numRowsToPack + 1), f);
274 
275  // At least in debug mode, this functor checks for errors.
276  const int errCode = f.getError ();
277  TEUCHOS_TEST_FOR_EXCEPTION
278  (errCode != 0, std::runtime_error, prefix << "parallel_scan error code "
279  << errCode << " != 0.");
280 
281 #if 0
282  size_t total = 0;
283  for (LO k = 0; k < numRowsToPack; ++k) {
284  total += counts[k];
285  }
286  if (outputOffsets(numRowsToPack) != total) {
287  if (errStr.get () == NULL) {
288  errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
289  }
290  std::ostringstream& os = *errStr;
291  os << prefix
292  << "outputOffsets(numRowsToPack=" << numRowsToPack << ") "
293  << outputOffsets(numRowsToPack) << " != sum of counts = "
294  << total << "." << std::endl;
295  if (numRowsToPack != 0) {
296  // Only print the array if it's not too long.
297  if (numRowsToPack < static_cast<LO> (10)) {
298  os << "outputOffsets: [";
299  for (LO i = 0; i <= numRowsToPack; ++i) {
300  os << outputOffsets(i);
301  if (static_cast<LO> (i + 1) <= numRowsToPack) {
302  os << ",";
303  }
304  }
305  os << "]" << std::endl;
306  os << "counts: [";
307  for (LO i = 0; i < numRowsToPack; ++i) {
308  os << counts(i);
309  if (static_cast<LO> (i + 1) < numRowsToPack) {
310  os << ",";
311  }
312  }
313  os << "]" << std::endl;
314  }
315  else {
316  os << "outputOffsets(" << (numRowsToPack-1) << ") = "
317  << outputOffsets(numRowsToPack-1) << "." << std::endl;
318  }
319  }
320  count = outputOffsets(numRowsToPack);
321  return {false, errStr};
322  }
323 #endif // HAVE_TPETRA_DEBUG
324 
325  // Get last entry of outputOffsets, which is the sum of the entries
326  // of counts. Don't assume UVM.
327  using Tpetra::Details::getEntryOnHost;
328  return static_cast<count_type> (getEntryOnHost (outputOffsets,
329  numRowsToPack));
330  }
331 }
332 
343 template<class Packet,
344  class LocalMapType,
345  class BufferDeviceType,
346  class InputLidsType,
347  class InputPidsType>
348 KOKKOS_FUNCTION
349 size_t
350 packRow(const LocalMapType& col_map,
351  const Kokkos::View<Packet*, BufferDeviceType>& exports,
352  const InputLidsType& lids_in,
353  const InputPidsType& pids_in,
354  const size_t offset,
355  const size_t num_ent,
356  const bool pack_pids)
357 {
358  using LO = typename LocalMapType::local_ordinal_type;
359  using GO = typename LocalMapType::global_ordinal_type;
360 
361  if (num_ent == 0) {
362  // Empty rows always take zero bytes, to ensure sparsity.
363  return static_cast<size_t>(0);
364  }
365 
366  size_t num_ent_packed = num_ent;
367  if (pack_pids) {
368  num_ent_packed += num_ent;
369  }
370 
371  // Copy column indices one at a time, so that we don't need
372  // temporary storage.
373  for (size_t k = 0; k < num_ent; ++k) {
374  const LO lid = lids_in[k];
375  const GO gid = col_map.getGlobalElement (lid);
376  exports(offset+k) = gid;
377  }
378  // Copy PIDs one at a time, so that we don't need temporary storage.
379  if (pack_pids) {
380  for (size_t k = 0; k < num_ent; ++k) {
381  const LO lid = lids_in[k];
382  const int pid = pids_in[lid];
383  exports(offset+num_ent+k) = static_cast<GO>(pid);
384  }
385  }
386 
387  return num_ent_packed;
388 }
389 
390 template<class Packet,
391  class LocalGraph,
392  class LocalMap,
393  class BufferDeviceType>
394 struct PackCrsGraphFunctor {
395  using local_graph_type = LocalGraph;
396  using local_map_type = LocalMap;
397  using LO = typename local_map_type::local_ordinal_type;
398  using GO = typename local_map_type::global_ordinal_type;
399 
400  using num_packets_per_lid_view_type =
401  Kokkos::View<const size_t*, BufferDeviceType>;
402  using offsets_view_type = Kokkos::View<const size_t*, BufferDeviceType>;
403  using exports_view_type = Kokkos::View<Packet*, BufferDeviceType>;
404  using export_lids_view_type =
406  using source_pids_view_type =
408 
409  using count_type =
410  typename num_packets_per_lid_view_type::non_const_value_type;
411  using offset_type = typename offsets_view_type::non_const_value_type;
412  using value_type = Kokkos::pair<int, LO>;
413 
414  static_assert (std::is_same<LO, typename local_graph_type::data_type>::value,
415  "local_map_type::local_ordinal_type and "
416  "local_graph_type::data_type must be the same.");
417 
418  local_graph_type local_graph;
419  local_map_type local_col_map;
420  exports_view_type exports;
421  num_packets_per_lid_view_type num_packets_per_lid;
422  export_lids_view_type export_lids;
423  source_pids_view_type source_pids;
424  offsets_view_type offsets;
425  bool pack_pids;
426 
427  PackCrsGraphFunctor(const local_graph_type& local_graph_in,
428  const local_map_type& local_col_map_in,
429  const exports_view_type& exports_in,
430  const num_packets_per_lid_view_type& num_packets_per_lid_in,
431  const export_lids_view_type& export_lids_in,
432  const source_pids_view_type& source_pids_in,
433  const offsets_view_type& offsets_in,
434  const bool pack_pids_in) :
435  local_graph (local_graph_in),
436  local_col_map (local_col_map_in),
437  exports (exports_in),
438  num_packets_per_lid (num_packets_per_lid_in),
439  export_lids (export_lids_in),
440  source_pids (source_pids_in),
441  offsets (offsets_in),
442  pack_pids (pack_pids_in)
443  {
444  const LO numRows = local_graph_in.numRows ();
445  const LO rowMapDim =
446  static_cast<LO> (local_graph.row_map.extent (0));
447  TEUCHOS_TEST_FOR_EXCEPTION
448  (numRows != 0 && rowMapDim != numRows + static_cast<LO> (1),
449  std::logic_error, "local_graph.row_map.extent(0) = "
450  << rowMapDim << " != numRows (= " << numRows << " ) + 1.");
451  }
452 
453  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const
454  {
455  using ::Tpetra::Details::OrdinalTraits;
456  dst = Kokkos::make_pair (0, OrdinalTraits<LO>::invalid ());
457  }
458 
459  KOKKOS_INLINE_FUNCTION void
460  join (value_type& dst, const value_type& src) const
461  {
462  // `dst` should reflect the first (least) bad index and all other
463  // associated error codes and data, so prefer keeping it.
464  if (src.first != 0 && dst.first == 0) {
465  dst = src;
466  }
467  }
468 
469  KOKKOS_INLINE_FUNCTION
470  void operator() (const LO i, value_type& dst) const
471  {
472  const size_t offset = offsets[i];
473  const LO export_lid = export_lids[i];
474  const size_t buf_size = exports.size();
475  const size_t num_packets_this_lid = num_packets_per_lid(i);
476  const size_t num_ent =
477  static_cast<size_t> (local_graph.row_map[export_lid+1]
478  - local_graph.row_map[export_lid]);
479 
480  // Only pack this row's data if it has a nonzero number of
481  // entries. We can do this because receiving processes get the
482  // number of packets, and will know that zero packets means zero
483  // entries.
484  if (num_ent == 0) {
485  return;
486  }
487 
488  if (export_lid >= static_cast<LO>(local_graph.numRows())) {
489  if (dst.first != 0) { // keep only the first error
490  dst = Kokkos::make_pair (1, i); // invalid row
491  }
492  return;
493  }
494  else if ((offset > buf_size || offset + num_packets_this_lid > buf_size)) {
495  if (dst.first != 0) { // keep only the first error
496  dst = Kokkos::make_pair (2, i); // out of bounds
497  }
498  return;
499  }
500 
501  // We can now pack this row
502 
503  // Since the graph is locally indexed on the calling process, we
504  // have to use its column Map (which it _must_ have in this case)
505  // to convert to global indices.
506  const auto row_beg = local_graph.row_map[export_lid];
507  const auto row_end = local_graph.row_map[export_lid + 1];
508  auto lids_in = Kokkos::subview (local_graph.entries,
509  Kokkos::make_pair (row_beg, row_end));
510  size_t num_ent_packed_this_row =
511  packRow (local_col_map, exports, lids_in,
512  source_pids, offset, num_ent, pack_pids);
513  if (num_ent_packed_this_row != num_packets_this_lid) {
514  if (dst.first != 0) { // keep only the first error
515  dst = Kokkos::make_pair (3, i);
516  }
517  }
518  }
519 };
520 
528 template<class Packet,
529  class LocalGraph,
530  class LocalMap,
531  class BufferDeviceType>
532 void
533 do_pack(const LocalGraph& local_graph,
534  const LocalMap& local_map,
535  const Kokkos::View<Packet*, BufferDeviceType>& exports,
536  const typename PackTraits<
537  size_t
538  >::input_array_type& num_packets_per_lid,
539  const typename PackTraits<
541  >::input_array_type& export_lids,
542  const typename PackTraits<
543  int
544  >::input_array_type& source_pids,
545  const Kokkos::View<const size_t*, BufferDeviceType>& offsets,
546  const bool pack_pids)
547 {
548  using LO = typename LocalMap::local_ordinal_type;
549  using execution_space = typename LocalGraph::device_type::execution_space;
550  using range_type = Kokkos::RangePolicy<execution_space, LO>;
551  const char prefix[] = "Tpetra::Details::PackCrsGraphImpl::do_pack: ";
552 
553  if (export_lids.extent (0) != 0) {
554  TEUCHOS_TEST_FOR_EXCEPTION
555  (static_cast<size_t> (offsets.extent (0)) !=
556  static_cast<size_t> (export_lids.extent (0) + 1),
557  std::invalid_argument, prefix << "offsets.extent(0) = "
558  << offsets.extent (0) << " != export_lids.extent(0) (= "
559  << export_lids.extent (0) << ") + 1.");
560  TEUCHOS_TEST_FOR_EXCEPTION
561  (export_lids.extent (0) != num_packets_per_lid.extent (0),
562  std::invalid_argument, prefix << "export_lids.extent(0) = " <<
563  export_lids.extent (0) << " != num_packets_per_lid.extent(0) = "
564  << num_packets_per_lid.extent (0) << ".");
565  // If exports has nonzero length at this point, then the graph
566  // has at least one entry to pack. Thus, if packing process
567  // ranks, we had better have at least one process rank to pack.
568  TEUCHOS_TEST_FOR_EXCEPTION
569  (pack_pids && exports.extent (0) != 0 &&
570  source_pids.extent (0) == 0, std::invalid_argument, prefix <<
571  "pack_pids is true, and exports.extent(0) = " <<
572  exports.extent (0) << " != 0, meaning that we need to pack at "
573  "least one graph entry, but source_pids.extent(0) = 0.");
574  }
575 
576  using pack_functor_type =
577  PackCrsGraphFunctor<Packet, LocalGraph, LocalMap,
578  BufferDeviceType>;
579  pack_functor_type f (local_graph, local_map, exports,
580  num_packets_per_lid, export_lids,
581  source_pids, offsets, pack_pids);
582 
583  typename pack_functor_type::value_type result;
584  range_type range (0, num_packets_per_lid.extent (0));
585  Kokkos::parallel_reduce ("Tpetra::Details::computeNumPacketsAndOffsets::reduce",range, f, result);
586 
587  if (result.first != 0) {
588  // We can't deep_copy from AnonymousSpace Views, so we can't
589  // print out any information from them in case of error.
590  std::ostringstream os;
591  if (result.first == 1) { // invalid local row index
592  os << "invalid local row index";
593  }
594  else if (result.first == 2) { // invalid offset
595  os << "invalid offset";
596  }
597  TEUCHOS_TEST_FOR_EXCEPTION
598  (true, std::runtime_error, prefix << "PackCrsGraphFunctor "
599  "reported error code " << result.first << " (" << os.str ()
600  << ") for the first bad row " << result.second << ".");
601  }
602 }
603 
630 template<typename LO, typename GO, typename NT>
631 void
633 (const CrsGraph<LO,GO,NT>& sourceGraph,
634  Kokkos::DualView<
635  typename CrsGraph<LO,GO,NT>::packet_type*,
636  typename CrsGraph<LO,GO,NT>::buffer_device_type
637  >& exports,
638  const Kokkos::View<
639  size_t*,
640  typename CrsGraph<LO,GO,NT>::buffer_device_type
641  >& num_packets_per_lid,
642  const Kokkos::View<
643  const LO*,
644  typename CrsGraph<LO, GO, NT>::buffer_device_type
645  >& export_lids,
646  const Kokkos::View<
647  const int*,
648  typename CrsGraph<LO, GO, NT>::buffer_device_type
649  >& export_pids,
650  size_t& constant_num_packets,
651  const bool pack_pids)
652 {
653  using Kokkos::View;
654  using crs_graph_type = CrsGraph<LO, GO, NT>;
655  using packet_type = typename crs_graph_type::packet_type;
656  using buffer_device_type = typename crs_graph_type::buffer_device_type;
657  using exports_view_type = Kokkos::DualView<packet_type*, buffer_device_type>;
658  using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
659  using local_map_type = typename Tpetra::Map<LO, GO, NT>::local_map_type;
660  const char prefix[] = "Tpetra::Details::packCrsGraph: ";
661  constexpr bool debug = false;
662 
663  local_graph_device_type local_graph = sourceGraph.getLocalGraphDevice ();
664  local_map_type local_col_map = sourceGraph.getColMap ()->getLocalMap ();
665 
666  // Setting this to zero tells the caller to expect a possibly
667  // different ("nonconstant") number of packets per local index
668  // (i.e., a possibly different number of entries per row).
669  constant_num_packets = 0;
670 
671  const size_t num_export_lids (export_lids.extent (0));
672  TEUCHOS_TEST_FOR_EXCEPTION
673  (num_export_lids != size_t (num_packets_per_lid.extent (0)),
674  std::invalid_argument, prefix << "num_export_lids.extent(0) = "
675  << num_export_lids << " != num_packets_per_lid.extent(0) = "
676  << num_packets_per_lid.extent (0) << ".");
677  if (num_export_lids != 0) {
678  TEUCHOS_TEST_FOR_EXCEPTION
679  (num_packets_per_lid.data () == nullptr, std::invalid_argument,
680  prefix << "num_export_lids = "<< num_export_lids << " != 0, but "
681  "num_packets_per_lid.data() = "
682  << num_packets_per_lid.data () << " == NULL.");
683  }
684 
685  if (num_export_lids == 0) {
686  exports = exports_view_type ("exports", 0);
687  return;
688  }
689 
690  // Array of offsets into the pack buffer.
691  View<size_t*, buffer_device_type> offsets ("offsets", num_export_lids + 1);
692 
693  // Compute number of packets per LID (row to send), as well as
694  // corresponding offsets (the prefix sum of the packet counts).
695  const size_t count =
696  computeNumPacketsAndOffsets(offsets, num_packets_per_lid,
697  local_graph.row_map, export_lids, export_pids);
698 
699  // Resize the output pack buffer if needed.
700  if (count > size_t (exports.extent (0))) {
701  exports = exports_view_type ("exports", count);
702  if (debug) {
703  std::ostringstream os;
704  os << "*** exports resized to " << count << std::endl;
705  std::cerr << os.str ();
706  }
707  }
708  if (debug) {
709  std::ostringstream os;
710  os << "*** count: " << count << ", exports.extent(0): "
711  << exports.extent (0) << std::endl;
712  std::cerr << os.str ();
713  }
714 
715  // If exports has nonzero length at this point, then the graph has
716  // at least one entry to pack. Thus, if packing process ranks, we
717  // had better have at least one process rank to pack.
718  TEUCHOS_TEST_FOR_EXCEPTION
719  (pack_pids && exports.extent (0) != 0 &&
720  export_pids.extent (0) == 0, std::invalid_argument, prefix <<
721  "pack_pids is true, and exports.extent(0) = " <<
722  exports.extent (0) << " != 0, meaning that we need to pack at least "
723  "one graph entry, but export_pids.extent(0) = 0.");
724 
725  exports.modify_device ();
726  auto exports_d = exports.view_device ();
727  do_pack<packet_type, local_graph_device_type, local_map_type, buffer_device_type>
728  (local_graph, local_col_map, exports_d, num_packets_per_lid,
729  export_lids, export_pids, offsets, pack_pids);
730  // If we got this far, we succeeded.
731 }
732 
733 } // namespace PackCrsGraphImpl
734 
735 template<typename LO, typename GO, typename NT>
736 void
737 packCrsGraph (const CrsGraph<LO, GO, NT>& sourceGraph,
738  Teuchos::Array<typename CrsGraph<LO,GO,NT>::packet_type>& exports,
739  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
740  const Teuchos::ArrayView<const LO>& exportLIDs,
741  size_t& constantNumPackets)
742 {
743  using Kokkos::HostSpace;
744  using Kokkos::MemoryUnmanaged;
745  using Kokkos::View;
746  using crs_graph_type = CrsGraph<LO, GO, NT>;
747  using packet_type = typename crs_graph_type::packet_type;
748  using BDT = typename crs_graph_type::buffer_device_type;
749 
750  // Convert all Teuchos::Array to Kokkos::View
751 
752  // This is an output array, so we don't have to copy to device here.
753  // However, we'll have to remember to copy back to host when done.
754  BDT outputDevice;
755  View<size_t*, BDT> num_packets_per_lid_d =
757  numPacketsPerLID.getRawPtr (),
758  numPacketsPerLID.size (), false,
759  "num_packets_per_lid");
760  // This is an input array, so we have to copy to device here.
761  // However, we never need to copy it back to host.
762  View<const LO*, BDT> export_lids_d =
764  exportLIDs.getRawPtr (),
765  exportLIDs.size (), true,
766  "export_lids");
767  View<const int*, BDT> export_pids_d;
768  Kokkos::DualView<packet_type*, BDT> exports_dv;
769  constexpr bool pack_pids = false;
770 
771  static_assert
772  (std::is_same<
773  typename decltype (num_packets_per_lid_d)::non_const_value_type,
774  size_t>::value,
775  "num_packets_per_lid_d's non_const_value_type should be size_t.");
776  static_assert
777  (std::is_same<
778  typename decltype (num_packets_per_lid_d)::device_type,
779  BDT>::value,
780  "num_packets_per_lid_d's BDT should be size_t.");
781  static_assert
782  (std::is_same<
783  typename decltype (export_lids_d)::device_type,
784  BDT>::value,
785  "export_lids_d's device_type should be BDT.");
786  static_assert
787  (std::is_same<
788  typename decltype (export_pids_d)::non_const_value_type,
789  int>::value,
790  "export_pids_d's non_const_value_type should be int.");
791  static_assert
792  (std::is_same<
793  typename decltype (export_pids_d)::device_type,
794  BDT>::value,
795  "export_pids_d's device_type should be BDT.");
796 
798  (sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d,
799  export_pids_d, constantNumPackets, pack_pids);
800 
801  // The counts are an output of packCrsGraph, so we have to copy
802  // them back to host.
803  View<size_t*, HostSpace, MemoryUnmanaged>
804  num_packets_per_lid_h (numPacketsPerLID.getRawPtr (),
805  numPacketsPerLID.size ());
806 
807  // DEEP_COPY REVIEW - DEVICE-TO-HOST
808  using execution_space = typename BDT::execution_space;
809  Kokkos::deep_copy (execution_space(), num_packets_per_lid_h, num_packets_per_lid_d);
810 
811  // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for
812  // exports_dv above, then we have two host copies for exports_h.
813 
814  // The exports are an output of packCrsGraph, so we have to
815  // copy them back to host.
816  if (static_cast<size_t> (exports.size ()) !=
817  static_cast<size_t> (exports_dv.extent (0))) {
818  exports.resize (exports_dv.extent (0));
819  }
820  View<packet_type*, HostSpace, MemoryUnmanaged>
821  exports_h (exports.getRawPtr (), exports.size ());
822  // DEEP_COPY REVIEW - DEVICE-TO-HOST
823  Kokkos::deep_copy (execution_space(), exports_h, exports_dv.d_view);
824  execution_space().fence();
825 }
826 
829 template<typename LO, typename GO, typename NT>
830 void
832  const Kokkos::DualView<
833  const LO*,
835  >& export_lids,
836  const Kokkos::DualView<
837  const int*,
839  >& export_pids,
840  Kokkos::DualView<
842  typename CrsGraph<LO,GO,NT>::buffer_device_type>& exports,
843  Kokkos::DualView<
844  size_t*,
846  > num_packets_per_lid,
847  size_t& constant_num_packets,
848  const bool pack_pids)
849 {
850  using Kokkos::View;
851  using crs_graph_type = CrsGraph<LO,GO,NT>;
852  using BDT = typename crs_graph_type::buffer_device_type;
853  using PT = typename crs_graph_type::packet_type;
854  using exports_dual_view_type = Kokkos::DualView<PT*, BDT>;
855  using LGT = typename crs_graph_type::local_graph_device_type;
856  using LMT = typename crs_graph_type::map_type::local_map_type;
857  const char prefix[] = "Tpetra::Details::packCrsGraphNew: ";
858 
859  const LGT local_graph = sourceGraph.getLocalGraphDevice ();
860  const LMT local_col_map = sourceGraph.getColMap ()->getLocalMap ();
861 
862  // Setting this to zero tells the caller to expect a possibly
863  // different ("nonconstant") number of packets per local index
864  // (i.e., a possibly different number of entries per row).
865  constant_num_packets = 0;
866 
867  const size_t num_export_lids =
868  static_cast<size_t> (export_lids.extent (0));
869  TEUCHOS_TEST_FOR_EXCEPTION
870  (num_export_lids !=
871  static_cast<size_t> (num_packets_per_lid.extent (0)),
872  std::invalid_argument, prefix << "num_export_lids.extent(0) = "
873  << num_export_lids << " != num_packets_per_lid.extent(0) = "
874  << num_packets_per_lid.extent (0) << ".");
875  TEUCHOS_TEST_FOR_EXCEPTION
876  (num_export_lids != 0 &&
877  num_packets_per_lid.view_device ().data () == nullptr,
878  std::invalid_argument, prefix << "num_export_lids = "<< num_export_lids
879  << " != 0, but num_packets_per_lid.view_device().data() = nullptr.");
880 
881  if (num_export_lids == 0) {
882  exports = exports_dual_view_type ();
883  return;
884  }
885 
886  // Array of offsets into the pack buffer.
887  using offsets_type = Kokkos::View<size_t*, BDT>;
888  offsets_type offsets ("offsets", num_export_lids + 1);
889 
890  // Compute number of packets per LID (row to send), as well as
891  // corresponding offsets (the prefix sum of the packet counts).
892  num_packets_per_lid.clear_sync_state ();
893  num_packets_per_lid.modify_device ();
894  using PackCrsGraphImpl::computeNumPacketsAndOffsets;
895  const size_t count =
896  computeNumPacketsAndOffsets (offsets, num_packets_per_lid.view_device (),
897  local_graph.row_map,
898  export_lids.view_device (),
899  export_pids.view_device ());
900 
901  // Resize the output pack buffer if needed.
902  if (count > static_cast<size_t> (exports.extent (0))) {
903  exports = exports_dual_view_type ("exports", count);
904  }
905 
906  // If exports has nonzero length at this point, then the graph has
907  // at least one entry to pack. Thus, if packing process ranks, we
908  // had better have at least one process rank to pack.
909  TEUCHOS_TEST_FOR_EXCEPTION
910  (pack_pids && exports.extent (0) != 0 &&
911  export_pids.extent (0) == 0, std::invalid_argument, prefix <<
912  "pack_pids is true, and exports.extent(0) = " <<
913  exports.extent (0) << " != 0, meaning that we need to pack at least "
914  "one graph entry, but export_pids.extent(0) = 0.");
915 
916  exports.modify_device ();
917  using PackCrsGraphImpl::do_pack;
918  do_pack<PT, LGT, LMT, BDT> (local_graph, local_col_map,
919  exports.view_device (),
920  num_packets_per_lid.view_device (),
921  export_lids.view_device (),
922  export_pids.view_device (),
923  offsets, pack_pids);
924 }
925 
// Host-array front end for packing a CrsGraph: wraps the Teuchos::ArrayView
// arguments as Kokkos::View objects on the graph's buffer_device_type,
// forwards them (with pack_pids fixed to true) to the underlying pack
// routine, and then copies the per-LID packet counts back into the caller's
// host array.
//
// NOTE(review): this listing appears to have lost several source lines (the
// embedded numbering skips 928, 931-932 and 970): the function name, the
// template arguments of the Kokkos::DualView parameter, and the name of the
// routine that receives the argument list ending in pack_pids. Judging from
// the explicit instantiations at the end of this file, this is presumably
// packCrsGraphWithOwningPIDs -- confirm against the original header.
//
// Parameters:
//   sourceGraph        - graph forwarded unchanged to the pack routine.
//   exports_dv         - DualView pack buffer, forwarded by reference.
//   numPacketsPerLID   - output host array; filled from the device-side
//                        counts after the pack call.
//   exportLIDs         - input host array of local indices; copied into a
//                        View on buffer_device_type.
//   sourcePIDs         - input host array of process ranks; copied into a
//                        View on buffer_device_type.
//   constantNumPackets - forwarded by reference; not read or written
//                        directly in this function.
926 template<typename LO, typename GO, typename NT>
927 void
929 (const CrsGraph<LO, GO, NT>& sourceGraph,
930  Kokkos::DualView<
933  >& exports_dv,
934  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
935  const Teuchos::ArrayView<const LO>& exportLIDs,
936  const Teuchos::ArrayView<const int>& sourcePIDs,
937  size_t& constantNumPackets)
938 {
939  using Kokkos::HostSpace;
940  using Kokkos::MemoryUnmanaged;
941  using Kokkos::View;
942  using crs_graph_type = CrsGraph<LO, GO, NT>;
943  using buffer_device_type = typename crs_graph_type::buffer_device_type;
944 
945  // Convert all Teuchos::Array to Kokkos::View
946 
947  // This is an output array, so we don't have to copy to device here.
948  // However, we'll have to remember to copy back to host when done.
// The fourth argument (false) is the helper's "copy" flag: the host
// contents are not copied into the resulting View.
949  View<size_t*, buffer_device_type> num_packets_per_lid_d =
950  create_mirror_view_from_raw_host_array (buffer_device_type (),
951  numPacketsPerLID.getRawPtr (),
952  numPacketsPerLID.size (), false,
953  "num_packets_per_lid");
954 
955  // This is an input array, so we have to copy to device here.
956  // However, we never need to copy it back to host.
957  View<const LO*, buffer_device_type> export_lids_d =
958  create_mirror_view_from_raw_host_array (buffer_device_type (),
959  exportLIDs.getRawPtr (),
960  exportLIDs.size (), true,
961  "export_lids");
962  // This is an input array, so we have to copy to device here.
963  // However, we never need to copy it back to host.
964  View<const int*, buffer_device_type> export_pids_d =
965  create_mirror_view_from_raw_host_array (buffer_device_type (),
966  sourcePIDs.getRawPtr (),
967  sourcePIDs.size (), true,
968  "export_pids");
// This entry point always requests that process ranks be packed.
969  constexpr bool pack_pids = true;
// NOTE(review): the callee name (listing line 970) is missing from this
// extract; the parenthesized list below is its argument list.
971  (sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d,
972  export_pids_d, constantNumPackets, pack_pids);
973 
974  // The counts are an output of packCrsGraph, so we
975  // have to copy them back to host.
// Wrap the caller's raw host pointer in an unmanaged View so deep_copy can
// write straight into the caller's array.
976  View<size_t*, HostSpace, MemoryUnmanaged> num_packets_per_lid_h
977  (numPacketsPerLID.getRawPtr (), numPacketsPerLID.size ());
978  // DEEP_COPY REVIEW - DEVICE-TO-HOST
979  using execution_space = typename buffer_device_type::execution_space;
980  Kokkos::deep_copy (execution_space(),
981  num_packets_per_lid_h, num_packets_per_lid_d);
// Fence the execution space after the copy -- presumably so the host array
// is fully written before this function returns; confirm against Kokkos
// deep_copy semantics for the execution-space overload.
982  execution_space().fence();
983 }
984 
985 } // namespace Details
986 } // namespace Tpetra
987 
// Explicit-instantiation macro. For one (LO, GO, NT) combination it expands
// to three explicit instantiations: Details::packCrsGraph,
// Details::packCrsGraphNew and Details::packCrsGraphWithOwningPIDs.
// The parameter lists spelled out below must stay in sync with the
// corresponding function templates' signatures.
// The names are qualified with Details:: only, so the macro is presumably
// meant to be expanded inside namespace Tpetra -- confirm at the use site.
988 #define TPETRA_DETAILS_PACKCRSGRAPH_INSTANT( LO, GO, NT ) \
989  template void \
990  Details::packCrsGraph<LO, GO, NT> ( \
991  const CrsGraph<LO, GO, NT>&, \
992  Teuchos::Array<CrsGraph<LO,GO,NT>::packet_type>&, \
993  const Teuchos::ArrayView<size_t>&, \
994  const Teuchos::ArrayView<const LO>&, \
995  size_t&); \
996  template void \
997  Details::packCrsGraphNew<LO, GO, NT> ( \
998  const CrsGraph<LO, GO, NT>&, \
999  const Kokkos::DualView< \
1000  const LO*, \
1001  CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1002  const Kokkos::DualView< \
1003  const int*, \
1004  CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1005  Kokkos::DualView< \
1006  CrsGraph<LO,GO,NT>::packet_type*, \
1007  CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1008  Kokkos::DualView< \
1009  size_t*, \
1010  CrsGraph<LO,GO,NT>::buffer_device_type>, \
1011  size_t&, \
1012  const bool); \
1013  template void \
1014  Details::packCrsGraphWithOwningPIDs<LO, GO, NT> ( \
1015  const CrsGraph<LO, GO, NT>&, \
1016  Kokkos::DualView<CrsGraph<LO,GO,NT>::packet_type*, CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1017  const Teuchos::ArrayView<size_t>&, \
1018  const Teuchos::ArrayView<const LO>&, \
1019  const Teuchos::ArrayView<const int>&, \
1020  size_t&);
1021 
1022 #endif // TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
void packCrsGraphWithOwningPIDs(const CrsGraph< LO, GO, NT > &sourceGraph, Kokkos::DualView< typename CrsGraph< LO, GO, NT >::packet_type *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse graph for communication.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
void packCrsGraph(const CrsGraph< LO, GO, NT > &sourceGraph, Teuchos::Array< typename CrsGraph< LO, GO, NT >::packet_type > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse graph for communication.
Declaration of the Tpetra::CrsGraph class.
"Local" part of Map suitable for Kokkos kernels.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Compute the number of packets and offsets for the pack procedure.
void packCrsGraphNew(const CrsGraph< LO, GO, NT > &sourceGraph, const Kokkos::DualView< const LO *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exportLIDs, const Kokkos::DualView< const int *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exportPIDs, Kokkos::DualView< typename CrsGraph< LO, GO, NT >::packet_type *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exports, Kokkos::DualView< size_t *, typename CrsGraph< LO, GO, NT >::buffer_device_type > numPacketsPerLID, size_t &constantNumPackets, const bool pack_pids)
Pack specified entries of the given local sparse graph for communication, for "new" DistObject interf...
Kokkos::View< const value_type *, Kokkos::AnonymousSpace > input_array_type
The type of an input array of value_type.
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Declaration and definition of Tpetra::Details::getEntryOnHost.
local_graph_device_type getLocalGraphDevice() const
Get the local graph.
global_ordinal_type packet_type
Type of each entry of the DistObject communication buffer.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.