Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Tpetra_Details_packCrsGraph_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
43 #define TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
44 
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
54 #include "Tpetra_CrsGraph_decl.hpp"
55 #include <memory>
56 #include <string>
57 
79 
80 namespace Tpetra {
81 
82 #ifndef DOXYGEN_SHOULD_SKIP_THIS
83 // Forward declaration of Distributor
84 class Distributor;
85 #endif // DOXYGEN_SHOULD_SKIP_THIS
86 
87 //
88 // Users must never rely on anything in the Details namespace.
89 //
90 namespace Details {
91 
92 namespace PackCrsGraphImpl {
100 template<class OutputOffsetsViewType,
101  class CountsViewType,
102  class InputOffsetsViewType,
103  class InputLocalRowIndicesViewType,
104  class InputLocalRowPidsViewType,
105  const bool debug =
106 #ifdef HAVE_TPETRA_DEBUG
107  true
108 #else
109  false
110 #endif // HAVE_TPETRA_DEBUG
111  >
113 public:
114  typedef typename OutputOffsetsViewType::non_const_value_type output_offset_type;
115  typedef typename CountsViewType::non_const_value_type count_type;
116  typedef typename InputOffsetsViewType::non_const_value_type input_offset_type;
117  typedef typename InputLocalRowIndicesViewType::non_const_value_type local_row_index_type;
118  typedef typename InputLocalRowPidsViewType::non_const_value_type local_row_pid_type;
119  // output Views drive where execution happens.
120  typedef typename OutputOffsetsViewType::device_type device_type;
121  static_assert (std::is_same<typename CountsViewType::device_type::execution_space,
122  typename device_type::execution_space>::value,
123  "OutputOffsetsViewType and CountsViewType must have the same execution space.");
124  static_assert (Kokkos::Impl::is_view<OutputOffsetsViewType>::value,
125  "OutputOffsetsViewType must be a Kokkos::View.");
126  static_assert (std::is_same<typename OutputOffsetsViewType::value_type, output_offset_type>::value,
127  "OutputOffsetsViewType must be a nonconst Kokkos::View.");
128  static_assert (std::is_integral<output_offset_type>::value,
129  "The type of each entry of OutputOffsetsViewType must be a built-in integer type.");
130  static_assert (Kokkos::Impl::is_view<CountsViewType>::value,
131  "CountsViewType must be a Kokkos::View.");
132  static_assert (std::is_same<typename CountsViewType::value_type, output_offset_type>::value,
133  "CountsViewType must be a nonconst Kokkos::View.");
134  static_assert (std::is_integral<count_type>::value,
135  "The type of each entry of CountsViewType must be a built-in integer type.");
136  static_assert (Kokkos::Impl::is_view<InputOffsetsViewType>::value,
137  "InputOffsetsViewType must be a Kokkos::View.");
138  static_assert (std::is_integral<input_offset_type>::value,
139  "The type of each entry of InputOffsetsViewType must be a built-in integer type.");
140  static_assert (Kokkos::Impl::is_view<InputLocalRowIndicesViewType>::value,
141  "InputLocalRowIndicesViewType must be a Kokkos::View.");
142  static_assert (std::is_integral<local_row_index_type>::value,
143  "The type of each entry of InputLocalRowIndicesViewType must be a built-in integer type.");
144 
145  NumPacketsAndOffsetsFunctor(const OutputOffsetsViewType& outputOffsets,
146  const CountsViewType& counts,
147  const InputOffsetsViewType& rowOffsets,
148  const InputLocalRowIndicesViewType& lclRowInds,
149  const InputLocalRowPidsViewType& lclRowPids) :
150  outputOffsets_ (outputOffsets),
151  counts_ (counts),
152  rowOffsets_ (rowOffsets),
153  lclRowInds_ (lclRowInds),
154  lclRowPids_ (lclRowPids),
155  error_ ("error") // don't forget this, or you'll get segfaults!
156  {
157  if (debug) {
158  const size_t numRowsToPack = static_cast<size_t> (lclRowInds_.extent (0));
159 
160  if (numRowsToPack != static_cast<size_t> (counts_.extent (0))) {
161  std::ostringstream os;
162  os << "lclRowInds.extent(0) = " << numRowsToPack
163  << " != counts.extent(0) = " << counts_.extent (0)
164  << ".";
165  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
166  }
167  if (static_cast<size_t> (numRowsToPack + 1) !=
168  static_cast<size_t> (outputOffsets_.extent (0))) {
169  std::ostringstream os;
170  os << "lclRowInds.extent(0) + 1 = " << (numRowsToPack + 1)
171  << " != outputOffsets.extent(0) = " << outputOffsets_.extent (0)
172  << ".";
173  TEUCHOS_TEST_FOR_EXCEPTION(true, std::invalid_argument, os.str ());
174  }
175  }
176  }
177 
178  KOKKOS_INLINE_FUNCTION void
179  operator() (const local_row_index_type& curInd,
180  output_offset_type& update,
181  const bool final) const
182  {
183  if (debug) {
184  if (curInd < static_cast<local_row_index_type> (0)) {
185  error_ () = 1;
186  return;
187  }
188  }
189 
190  if (final) {
191  if (debug) {
192  if (curInd >= static_cast<local_row_index_type> (outputOffsets_.extent (0))) {
193  error_ () = 2;
194  return;
195  }
196  }
197  outputOffsets_(curInd) = update;
198  }
199 
200  if (curInd < static_cast<local_row_index_type> (counts_.extent (0))) {
201  const auto lclRow = lclRowInds_(curInd);
202  if (static_cast<size_t> (lclRow + 1) >= static_cast<size_t> (rowOffsets_.extent (0)) ||
203  static_cast<local_row_index_type> (lclRow) < static_cast<local_row_index_type> (0)) {
204  error_ () = 3;
205  return;
206  }
207  // count_type could differ from the type of each row offset.
208  // For example, row offsets might each be 64 bits, but if their
209  // difference always fits in 32 bits, we may then safely use a
210  // 32-bit count_type.
211  const count_type count =
212  static_cast<count_type> (rowOffsets_(lclRow+1) - rowOffsets_(lclRow));
213 
214  // We pack first the global column indices and then pids (if any),
215  // However, if the number of entries in the row is zero, we pack nothing.
216  const count_type numEntToPack = (count == 0)
217  ? static_cast<count_type>(0)
218  : count * (1 + (lclRowPids_.size() > 0 ? 1 : 0));
219 
220  if (final) {
221  counts_(curInd) = numEntToPack;
222  }
223  update += numEntToPack;
224  }
225  }
226 
227  // mfh 31 May 2017: Don't need init or join. If you have join, MUST
228  // have join both with and without volatile! Otherwise intrawarp
229  // joins are really slow on GPUs.
230 
232  int getError () const {
233  auto error_h = Kokkos::create_mirror_view (error_);
234  Kokkos::deep_copy (error_h, error_);
235  return error_h ();
236  }
237 
238 private:
239  OutputOffsetsViewType outputOffsets_;
240  CountsViewType counts_;
241  typename InputOffsetsViewType::const_type rowOffsets_;
242  typename InputLocalRowIndicesViewType::const_type lclRowInds_;
243  typename InputLocalRowPidsViewType::const_type lclRowPids_;
244  Kokkos::View<int, device_type> error_;
245 };
246 
256 template<class OutputOffsetsViewType,
257  class CountsViewType,
258  class InputOffsetsViewType,
259  class InputLocalRowIndicesViewType,
260  class InputLocalRowPidsViewType>
261 typename CountsViewType::non_const_value_type
262 computeNumPacketsAndOffsets(const OutputOffsetsViewType& outputOffsets,
263  const CountsViewType& counts,
264  const InputOffsetsViewType& rowOffsets,
265  const InputLocalRowIndicesViewType& lclRowInds,
266  const InputLocalRowPidsViewType& lclRowPids)
267 {
268  typedef NumPacketsAndOffsetsFunctor<OutputOffsetsViewType,
269  CountsViewType, typename InputOffsetsViewType::const_type,
270  typename InputLocalRowIndicesViewType::const_type,
271  typename InputLocalRowPidsViewType::const_type> functor_type;
272  typedef typename CountsViewType::non_const_value_type count_type;
273  typedef typename OutputOffsetsViewType::size_type size_type;
274  typedef typename OutputOffsetsViewType::execution_space execution_space;
275  typedef typename functor_type::local_row_index_type LO;
276  typedef Kokkos::RangePolicy<execution_space, LO> range_type;
277  const char prefix[] = "computeNumPacketsAndOffsets: ";
278 
279  count_type count = 0;
280  const count_type numRowsToPack = lclRowInds.extent (0);
281 
282  if (numRowsToPack == 0) {
283  return count;
284  }
285  else {
286  TEUCHOS_TEST_FOR_EXCEPTION
287  (rowOffsets.extent (0) <= static_cast<size_type> (1),
288  std::invalid_argument, prefix << "There is at least one row to pack, "
289  "but the graph has no rows. lclRowInds.extent(0) = " <<
290  numRowsToPack << ", but rowOffsets.extent(0) = " <<
291  rowOffsets.extent (0) << " <= 1.");
292  TEUCHOS_TEST_FOR_EXCEPTION
293  (outputOffsets.extent (0) !=
294  static_cast<size_type> (numRowsToPack + 1), std::invalid_argument,
295  prefix << "Output dimension does not match number of rows to pack. "
296  << "outputOffsets.extent(0) = " << outputOffsets.extent (0)
297  << " != lclRowInds.extent(0) + 1 = "
298  << static_cast<size_type> (numRowsToPack + 1) << ".");
299  TEUCHOS_TEST_FOR_EXCEPTION
300  (counts.extent (0) != numRowsToPack, std::invalid_argument,
301  prefix << "counts.extent(0) = " << counts.extent (0)
302  << " != numRowsToPack = " << numRowsToPack << ".");
303 
304  functor_type f (outputOffsets, counts, rowOffsets, lclRowInds, lclRowPids);
305  Kokkos::parallel_scan (range_type (0, numRowsToPack + 1), f);
306 
307  // At least in debug mode, this functor checks for errors.
308  const int errCode = f.getError ();
309  TEUCHOS_TEST_FOR_EXCEPTION
310  (errCode != 0, std::runtime_error, prefix << "parallel_scan error code "
311  << errCode << " != 0.");
312 
313 #if 0
314  size_t total = 0;
315  for (LO k = 0; k < numRowsToPack; ++k) {
316  total += counts[k];
317  }
318  if (outputOffsets(numRowsToPack) != total) {
319  if (errStr.get () == NULL) {
320  errStr = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
321  }
322  std::ostringstream& os = *errStr;
323  os << prefix
324  << "outputOffsets(numRowsToPack=" << numRowsToPack << ") "
325  << outputOffsets(numRowsToPack) << " != sum of counts = "
326  << total << "." << std::endl;
327  if (numRowsToPack != 0) {
328  // Only print the array if it's not too long.
329  if (numRowsToPack < static_cast<LO> (10)) {
330  os << "outputOffsets: [";
331  for (LO i = 0; i <= numRowsToPack; ++i) {
332  os << outputOffsets(i);
333  if (static_cast<LO> (i + 1) <= numRowsToPack) {
334  os << ",";
335  }
336  }
337  os << "]" << std::endl;
338  os << "counts: [";
339  for (LO i = 0; i < numRowsToPack; ++i) {
340  os << counts(i);
341  if (static_cast<LO> (i + 1) < numRowsToPack) {
342  os << ",";
343  }
344  }
345  os << "]" << std::endl;
346  }
347  else {
348  os << "outputOffsets(" << (numRowsToPack-1) << ") = "
349  << outputOffsets(numRowsToPack-1) << "." << std::endl;
350  }
351  }
352  count = outputOffsets(numRowsToPack);
353  return {false, errStr};
354  }
355 #endif // HAVE_TPETRA_DEBUG
356 
357  // Get last entry of outputOffsets, which is the sum of the entries
358  // of counts. Don't assume UVM.
359  using Tpetra::Details::getEntryOnHost;
360  return static_cast<count_type> (getEntryOnHost (outputOffsets,
361  numRowsToPack));
362  }
363 }
364 
375 template<class Packet,
376  class ColumnMap,
377  class BufferDeviceType,
378  class OtherDeviceType = typename ColumnMap::device_type>
379 KOKKOS_FUNCTION
380 size_t
381 packRow(const ColumnMap& col_map,
382  const Kokkos::View<Packet*, BufferDeviceType>& exports,
383  const typename PackTraits<
384  typename ColumnMap::local_ordinal_type,
385  OtherDeviceType>::input_array_type& lids_in,
386  const typename PackTraits<
387  int,
388  OtherDeviceType>::input_array_type& pids_in,
389  const size_t offset,
390  const size_t num_ent,
391  const bool pack_pids)
392 {
393  using LO = typename ColumnMap::local_ordinal_type;
394  using GO = typename ColumnMap::global_ordinal_type;
395 
396  if (num_ent == 0) {
397  // Empty rows always take zero bytes, to ensure sparsity.
398  return static_cast<size_t>(0);
399  }
400 
401  size_t num_ent_packed = num_ent;
402  if (pack_pids) {
403  num_ent_packed += num_ent;
404  }
405 
406  // Copy column indices one at a time, so that we don't need
407  // temporary storage.
408  for (size_t k = 0; k < num_ent; ++k) {
409  const LO lid = lids_in[k];
410  const GO gid = col_map.getGlobalElement (lid);
411  exports(offset+k) = gid;
412  }
413  // Copy PIDs one at a time, so that we don't need temporary storage.
414  if (pack_pids) {
415  for (size_t k = 0; k < num_ent; ++k) {
416  const LO lid = lids_in[k];
417  const int pid = pids_in[lid];
418  exports(offset+num_ent+k) = static_cast<GO>(pid);
419  }
420  }
421 
422  return num_ent_packed;
423 }
424 
425 template<class Packet,
426  class LocalGraph,
427  class LocalMap,
428  class BufferDeviceType,
429  class OtherDeviceType = typename LocalGraph::device_type>
430 struct PackCrsGraphFunctor {
431  using local_graph_type = LocalGraph;
432  using local_map_type = LocalMap;
433  using LO = typename local_map_type::local_ordinal_type;
434  using GO = typename local_map_type::global_ordinal_type;
435 
436  using num_packets_per_lid_view_type =
437  Kokkos::View<const size_t*, BufferDeviceType>;
438  using offsets_view_type = Kokkos::View<const size_t*, BufferDeviceType>;
439  using exports_view_type = Kokkos::View<Packet*, BufferDeviceType>;
440  using export_lids_view_type =
442  using source_pids_view_type =
444 
445  using count_type =
446  typename num_packets_per_lid_view_type::non_const_value_type;
447  using offset_type = typename offsets_view_type::non_const_value_type;
448  using value_type = Kokkos::pair<int, LO>;
449 
450  static_assert (std::is_same<LO, typename local_graph_type::data_type>::value,
451  "local_map_type::local_ordinal_type and "
452  "local_graph_type::data_type must be the same.");
453 
454  local_graph_type local_graph;
455  local_map_type local_col_map;
456  exports_view_type exports;
457  num_packets_per_lid_view_type num_packets_per_lid;
458  export_lids_view_type export_lids;
459  source_pids_view_type source_pids;
460  offsets_view_type offsets;
461  bool pack_pids;
462 
463  PackCrsGraphFunctor(const local_graph_type& local_graph_in,
464  const local_map_type& local_col_map_in,
465  const exports_view_type& exports_in,
466  const num_packets_per_lid_view_type& num_packets_per_lid_in,
467  const export_lids_view_type& export_lids_in,
468  const source_pids_view_type& source_pids_in,
469  const offsets_view_type& offsets_in,
470  const bool pack_pids_in) :
471  local_graph (local_graph_in),
472  local_col_map (local_col_map_in),
473  exports (exports_in),
474  num_packets_per_lid (num_packets_per_lid_in),
475  export_lids (export_lids_in),
476  source_pids (source_pids_in),
477  offsets (offsets_in),
478  pack_pids (pack_pids_in)
479  {
480  const LO numRows = local_graph_in.numRows ();
481  const LO rowMapDim =
482  static_cast<LO> (local_graph.row_map.extent (0));
483  TEUCHOS_TEST_FOR_EXCEPTION
484  (numRows != 0 && rowMapDim != numRows + static_cast<LO> (1),
485  std::logic_error, "local_graph.row_map.extent(0) = "
486  << rowMapDim << " != numRows (= " << numRows << " ) + 1.");
487  }
488 
489  KOKKOS_INLINE_FUNCTION void init (value_type& dst) const
490  {
491  using ::Tpetra::Details::OrdinalTraits;
492  dst = Kokkos::make_pair (0, OrdinalTraits<LO>::invalid ());
493  }
494 
495  KOKKOS_INLINE_FUNCTION void
496  join (volatile value_type& dst, const volatile value_type& src) const
497  {
498  // `dst` should reflect the first (least) bad index and all other
499  // associated error codes and data, so prefer keeping it.
500  if (src.first != 0 && dst.first == 0) {
501  dst = src;
502  }
503  }
504 
505  KOKKOS_INLINE_FUNCTION
506  void operator() (const LO i, value_type& dst) const
507  {
508  const size_t offset = offsets[i];
509  const LO export_lid = export_lids[i];
510  const size_t buf_size = exports.size();
511  const size_t num_packets_this_lid = num_packets_per_lid(i);
512  const size_t num_ent =
513  static_cast<size_t> (local_graph.row_map[export_lid+1]
514  - local_graph.row_map[export_lid]);
515 
516  // Only pack this row's data if it has a nonzero number of
517  // entries. We can do this because receiving processes get the
518  // number of packets, and will know that zero packets means zero
519  // entries.
520  if (num_ent == 0) {
521  return;
522  }
523 
524  if (export_lid >= static_cast<LO>(local_graph.numRows())) {
525  if (dst.first != 0) { // keep only the first error
526  dst = Kokkos::make_pair (1, i); // invalid row
527  }
528  return;
529  }
530  else if ((offset > buf_size || offset + num_packets_this_lid > buf_size)) {
531  if (dst.first != 0) { // keep only the first error
532  dst = Kokkos::make_pair (2, i); // out of bounds
533  }
534  return;
535  }
536 
537  // We can now pack this row
538 
539  // Since the graph is locally indexed on the calling process, we
540  // have to use its column Map (which it _must_ have in this case)
541  // to convert to global indices.
542  const auto row_beg = local_graph.row_map[export_lid];
543  const auto row_end = local_graph.row_map[export_lid + 1];
544  auto lids_in = Kokkos::subview (local_graph.entries,
545  Kokkos::make_pair (row_beg, row_end));
546  using LMT = local_map_type;
547  using PT = Packet;
548  using BDT = BufferDeviceType;
549  using ODT = OtherDeviceType;
550  size_t num_ent_packed_this_row =
551  packRow<PT,LMT,BDT,ODT> (local_col_map, exports, lids_in,
552  source_pids, offset, num_ent, pack_pids);
553  if (num_ent_packed_this_row != num_packets_this_lid) {
554  if (dst.first != 0) { // keep only the first error
555  dst = Kokkos::make_pair (3, i);
556  }
557  }
558  }
559 };
560 
568 template<class Packet,
569  class LocalGraph,
570  class LocalMap,
571  class BufferDeviceType,
572  class OtherDeviceType>
573 void
574 do_pack(const LocalGraph& local_graph,
575  const LocalMap& local_map,
576  const Kokkos::View<Packet*, BufferDeviceType>& exports,
577  const typename PackTraits<
578  size_t,
579  BufferDeviceType
580  >::input_array_type& num_packets_per_lid,
581  const typename PackTraits<
582  typename LocalMap::local_ordinal_type,
583  OtherDeviceType
584  >::input_array_type& export_lids,
585  const typename PackTraits<
586  int,
587  OtherDeviceType
588  >::input_array_type& source_pids,
589  const Kokkos::View<const size_t*, BufferDeviceType>& offsets,
590  const bool pack_pids)
591 {
592  using LO = typename LocalMap::local_ordinal_type;
593  using execution_space = typename LocalGraph::device_type::execution_space;
594  using range_type = Kokkos::RangePolicy<execution_space, LO>;
595  const char prefix[] = "Tpetra::Details::PackCrsGraphImpl::do_pack: ";
596 
597  if (export_lids.extent (0) != 0) {
598  TEUCHOS_TEST_FOR_EXCEPTION
599  (static_cast<size_t> (offsets.extent (0)) !=
600  static_cast<size_t> (export_lids.extent (0) + 1),
601  std::invalid_argument, prefix << "offsets.extent(0) = "
602  << offsets.extent (0) << " != export_lids.extent(0) (= "
603  << export_lids.extent (0) << ") + 1.");
604  TEUCHOS_TEST_FOR_EXCEPTION
605  (export_lids.extent (0) != num_packets_per_lid.extent (0),
606  std::invalid_argument, prefix << "export_lids.extent(0) = " <<
607  export_lids.extent (0) << " != num_packets_per_lid.extent(0) = "
608  << num_packets_per_lid.extent (0) << ".");
609  // If exports has nonzero length at this point, then the graph
610  // has at least one entry to pack. Thus, if packing process
611  // ranks, we had better have at least one process rank to pack.
612  TEUCHOS_TEST_FOR_EXCEPTION
613  (pack_pids && exports.extent (0) != 0 &&
614  source_pids.extent (0) == 0, std::invalid_argument, prefix <<
615  "pack_pids is true, and exports.extent(0) = " <<
616  exports.extent (0) << " != 0, meaning that we need to pack at "
617  "least one graph entry, but source_pids.extent(0) = 0.");
618  }
619 
620  using pack_functor_type =
621  PackCrsGraphFunctor<Packet, LocalGraph, LocalMap,
622  BufferDeviceType, OtherDeviceType>;
623  pack_functor_type f (local_graph, local_map, exports,
624  num_packets_per_lid, export_lids,
625  source_pids, offsets, pack_pids);
626 
627  typename pack_functor_type::value_type result;
628  range_type range (0, num_packets_per_lid.extent (0));
629  Kokkos::parallel_reduce (range, f, result);
630 
631  if (result.first != 0) {
632  std::ostringstream os;
633 
634  if (result.first == 1) { // invalid local row index
635  auto export_lids_h = Kokkos::create_mirror_view (export_lids);
636  Kokkos::deep_copy (export_lids_h, export_lids);
637  const auto firstBadLid = export_lids_h(result.second);
638  os << "First bad export LID: export_lids(i=" << result.second << ") = "
639  << firstBadLid;
640  }
641  else if (result.first == 2) { // invalid offset
642  auto offsets_h = Kokkos::create_mirror_view (offsets);
643  Kokkos::deep_copy (offsets_h, offsets);
644  const auto firstBadOffset = offsets_h(result.second);
645 
646  auto num_packets_per_lid_h =
647  Kokkos::create_mirror_view (num_packets_per_lid);
648  Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid);
649  os << "First bad offset: offsets(i=" << result.second << ") = "
650  << firstBadOffset << ", num_packets_per_lid(i) = "
651  << num_packets_per_lid_h(result.second) << ", buf_size = "
652  << exports.size ();
653  }
654 
655  TEUCHOS_TEST_FOR_EXCEPTION
656  (true, std::runtime_error, prefix << "PackCrsGraphFunctor reported "
657  "error code " << result.first << " for the first bad row "
658  << result.second << ". " << os.str ());
659  }
660 }
661 
688 template<typename LO, typename GO, typename NT>
689 void
690 packCrsGraph(const CrsGraph<LO,GO,NT>& sourceGraph,
691  Kokkos::DualView<typename CrsGraph<LO,GO,NT>::packet_type*,
692  typename CrsGraph<LO,GO,NT>::buffer_device_type>& exports,
693  const Kokkos::View<size_t*,
694  typename CrsGraph<LO,GO,NT>::buffer_device_type>& num_packets_per_lid,
695  const Kokkos::View<const LO*, typename NT::device_type>& export_lids,
696  const Kokkos::View<const int*, typename NT::device_type>& export_pids,
697  size_t& constant_num_packets,
698  const bool pack_pids,
699  Distributor& /* dist */)
700 {
701  using Kokkos::View;
702  typedef typename CrsGraph<LO,GO,NT>::packet_type packet_type;
703  typedef typename CrsGraph<LO,GO,NT>::buffer_device_type buffer_device_type;
704  typedef typename buffer_device_type::execution_space execution_space;
705  typedef Kokkos::DualView<packet_type*,buffer_device_type> exports_view_type;
706  const char prefix[] = "Tpetra::Details::packCrsGraph: ";
707  constexpr bool debug = false;
708 
709  auto local_graph = sourceGraph.getLocalGraph ();
710  auto local_col_map = sourceGraph.getColMap ()->getLocalMap ();
711 
712  // Setting this to zero tells the caller to expect a possibly
713  // different ("nonconstant") number of packets per local index
714  // (i.e., a possibly different number of entries per row).
715  constant_num_packets = 0;
716 
717  const size_t num_export_lids =
718  static_cast<size_t> (export_lids.extent (0));
719  TEUCHOS_TEST_FOR_EXCEPTION
720  (num_export_lids !=
721  static_cast<size_t> (num_packets_per_lid.extent (0)),
722  std::invalid_argument, prefix << "num_export_lids.extent(0) = "
723  << num_export_lids << " != num_packets_per_lid.extent(0) = "
724  << num_packets_per_lid.extent (0) << ".");
725  if (num_export_lids != 0) {
726  TEUCHOS_TEST_FOR_EXCEPTION
727  (num_packets_per_lid.data () == NULL, std::invalid_argument,
728  prefix << "num_export_lids = "<< num_export_lids << " != 0, but "
729  "num_packets_per_lid.data() = "
730  << num_packets_per_lid.data () << " == NULL.");
731  }
732 
733  if (num_export_lids == 0) {
734  // FIXME (26 Apr 2016) Fences around (UVM) allocations only
735  // temporarily needed for #227 debugging. Should be able to
736  // remove them after that's fixed.
737  execution_space::fence ();
738  exports = exports_view_type ("exports", 0);
739  execution_space::fence ();
740  return;
741  }
742 
743  // Array of offsets into the pack buffer.
744  Kokkos::View<size_t*,buffer_device_type> offsets ("offsets", num_export_lids + 1);
745 
746  // Compute number of packets per LID (row to send), as well as
747  // corresponding offsets (the prefix sum of the packet counts).
748  const size_t count =
749  computeNumPacketsAndOffsets(offsets, num_packets_per_lid,
750  local_graph.row_map, export_lids, export_pids);
751 
752  // Resize the output pack buffer if needed.
753  if (count > static_cast<size_t> (exports.extent (0))) {
754  // FIXME (26 Apr 2016) Fences around (UVM) allocations only
755  // temporarily needed for #227 debugging. Should be able to
756  // remove them after that's fixed.
757  execution_space::fence ();
758  exports = exports_view_type ("exports", count);
759  if (debug) {
760  std::ostringstream os;
761  os << "*** exports resized to " << count << std::endl;
762  std::cerr << os.str ();
763  }
764  execution_space::fence ();
765  }
766  if (debug) {
767  std::ostringstream os;
768  os << "*** count: " << count << ", exports.extent(0): "
769  << exports.extent (0) << std::endl;
770  std::cerr << os.str ();
771  }
772 
773  // If exports has nonzero length at this point, then the graph has
774  // at least one entry to pack. Thus, if packing process ranks, we
775  // had better have at least one process rank to pack.
776  TEUCHOS_TEST_FOR_EXCEPTION
777  (pack_pids && exports.extent (0) != 0 &&
778  export_pids.extent (0) == 0, std::invalid_argument, prefix <<
779  "pack_pids is true, and exports.extent(0) = " <<
780  exports.extent (0) << " != 0, meaning that we need to pack at least "
781  "one graph entry, but export_pids.extent(0) = 0.");
782 
783  typedef typename std::decay<decltype (local_graph)>::type
784  local_graph_type;
785  typedef typename std::decay<decltype (local_col_map)>::type
786  local_map_type;
787  exports.modify_device ();
788  auto exports_d = exports.view_device ();
789  using other_device_type = typename NT::device_type;
790  do_pack<packet_type,local_graph_type,local_map_type,buffer_device_type,other_device_type>
791  (local_graph, local_col_map, exports_d, num_packets_per_lid,
792  export_lids, export_pids, offsets, pack_pids);
793  // If we got this far, we succeeded.
794 }
795 
796 } // namespace PackCrsGraphImpl
797 
// Host-array entry point for packing a CrsGraph.
//
// Wraps the Teuchos array arguments in Kokkos Views, calls
// PackCrsGraphImpl::packCrsGraph with pack_pids = false and an empty PID
// View, and then copies the per-row packet counts and the packed data
// back into the caller's host arrays.
//
// Parameters (as used in the body below):
//   exports            [out] resized to the packed length and filled.
//   numPacketsPerLID   [out] receives the number of packets for each row.
//   exportLIDs         [in]  local indices of the rows to pack.
//   constantNumPackets [out] forwarded to PackCrsGraphImpl::packCrsGraph.
//   distor             [in]  forwarded to PackCrsGraphImpl::packCrsGraph.
798 template<typename LO, typename GO, typename NT>
799 void
// NOTE(review): the line carrying the function name and its first
// parameter is absent from this listing (the numbering jumps from 799 to
// 801).  The body reads a `sourceGraph` that is not declared in the visible
// lines, so it is presumably that first parameter -- confirm against the
// real header.
801  Teuchos::Array<typename CrsGraph<LO,GO,NT>::packet_type>& exports,
802  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
803  const Teuchos::ArrayView<const LO>& exportLIDs,
804  size_t& constantNumPackets,
805  Distributor& distor)
806 {
807  typedef typename CrsGraph<LO,GO,NT>::packet_type packet_type;
808  typedef typename CrsGraph<LO,GO,NT>::local_graph_type local_graph_type;
809  typedef typename local_graph_type::device_type device_type;
// Host-side device type used for the unmanaged Views that wrap the
// caller's raw host arrays further down.
810  typedef typename Kokkos::View<size_t*, device_type>::HostMirror::execution_space host_exec_space;
811  typedef Kokkos::Device<host_exec_space, Kokkos::HostSpace> host_dev_type;
812 
813  // mfh 23 Aug 2017: Fix for #1088 requires pack / unpack buffers to
814  // have a possibly different memory space (CudaSpace) than the
815  // default CUDA memory space (currently CudaUVMSpace).
816  typedef typename device_type::execution_space buffer_exec_space;
817 #ifdef KOKKOS_ENABLE_CUDA
818  typedef typename std::conditional<
819  std::is_same<
820  buffer_exec_space, Kokkos::Cuda
821  >::value,
822  Kokkos::CudaSpace,
823  typename device_type::memory_space
824  >::type buffer_memory_space;
825 #else
826  typedef typename device_type::memory_space buffer_memory_space;
827 #endif // KOKKOS_ENABLE_CUDA
828  // @MFH: why not use CrsGraph<LO,GO,NT>::buffer_device_type???
829  typedef Kokkos::Device<buffer_exec_space,
830  buffer_memory_space> buffer_device_type;
831 
832  // Convert all Teuchos::Array to Kokkos::View
833 
834  // This is an output array, so we don't have to copy to device here.
835  // However, we'll have to remember to copy back to host when done.
836  typename local_graph_type::device_type outputDevice;
// NOTE(review): the head of the call that builds num_packets_per_lid_d is
// absent from this listing (line 838 is skipped); only its trailing
// arguments are visible.  The `false` argument presumably means "do not
// copy the host data" -- confirm against the real header.
837  auto num_packets_per_lid_d =
839  numPacketsPerLID.getRawPtr (),
840  numPacketsPerLID.size (), false,
841  "num_packets_per_lid");
842  // This is an input array, so we have to copy to device here.
843  // However, we never need to copy it back to host.
// NOTE(review): likewise, line 845 (the head of this call) is absent.
844  auto export_lids_d =
846  exportLIDs.getRawPtr (),
847  exportLIDs.size (), true,
848  "export_lids");
849  // Create an empty array of PIDs
850  Kokkos::View<int*, device_type> export_pids_d ("export_pids", 0);
851 
// Start with an empty pack buffer; no process ranks are packed by this
// overload (pack_pids is fixed to false).
852  Kokkos::DualView<packet_type*,buffer_device_type> exports_dv ("exports", 0);
853  constexpr bool pack_pids = false;
854  PackCrsGraphImpl::packCrsGraph<LO,GO,NT>(
855  sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d,
856  export_pids_d, constantNumPackets, pack_pids, distor);
857  // The counts are an output of packCrsGraph, so we have to copy
858  // them back to host.
// num_packets_per_lid_h wraps the caller's array directly, so the
// deep_copy writes straight into numPacketsPerLID.
859  Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h
860  (numPacketsPerLID.getRawPtr (),
861  numPacketsPerLID.size ());
862  Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid_d);
863 
864  // FIXME (mfh 23 Aug 2017) If we're forced to use a DualView for
865  // exports_dv above, then we have two host copies for exports_h.
866 
867  // The exports are an output of packCrsGraph, so we have to
868  // copy them back to host.
// Resize the caller's array to the packed length before wrapping it in an
// unmanaged host View and copying the device data into it.
869  if (static_cast<size_t> (exports.size ()) !=
870  static_cast<size_t> (exports_dv.extent (0))) {
871  exports.resize (exports_dv.extent (0));
872  }
873  Kokkos::View<packet_type*, host_dev_type> exports_h (exports.getRawPtr (),
874  exports.size ());
875  Kokkos::deep_copy (exports_h, exports_dv.d_view);
876 }
877 
880 template<typename LO, typename GO, typename NT>
881 void
883  const Kokkos::DualView<
884  const LO*,
886  >& export_lids,
887  const Kokkos::DualView<
888  const int*,
890  >& export_pids,
891  Kokkos::DualView<
893  typename CrsGraph<LO,GO,NT>::buffer_device_type>& exports,
894  Kokkos::DualView<
895  size_t*,
897  > num_packets_per_lid,
898  size_t& constant_num_packets,
899  const bool pack_pids,
900  Distributor& /* dist */)
901 {
902  using Kokkos::View;
903  using crs_graph_type = CrsGraph<LO,GO,NT>;
904  using BDT = typename crs_graph_type::buffer_device_type;
905  using PT = typename crs_graph_type::packet_type;
906  using exports_dual_view_type = Kokkos::DualView<PT*, BDT>;
907  using LGT = typename crs_graph_type::local_graph_type;
908  using LMT = typename crs_graph_type::map_type::local_map_type;
909  const char prefix[] = "Tpetra::Details::packCrsGraphNew: ";
910 
911  const LGT local_graph = sourceGraph.getLocalGraph ();
912  const LMT local_col_map = sourceGraph.getColMap ()->getLocalMap ();
913 
914  // Setting this to zero tells the caller to expect a possibly
915  // different ("nonconstant") number of packets per local index
916  // (i.e., a possibly different number of entries per row).
917  constant_num_packets = 0;
918 
919  const size_t num_export_lids =
920  static_cast<size_t> (export_lids.extent (0));
921  TEUCHOS_TEST_FOR_EXCEPTION
922  (num_export_lids !=
923  static_cast<size_t> (num_packets_per_lid.extent (0)),
924  std::invalid_argument, prefix << "num_export_lids.extent(0) = "
925  << num_export_lids << " != num_packets_per_lid.extent(0) = "
926  << num_packets_per_lid.extent (0) << ".");
927  TEUCHOS_TEST_FOR_EXCEPTION
928  (num_export_lids != 0 &&
929  num_packets_per_lid.view_device ().data () == nullptr,
930  std::invalid_argument, prefix << "num_export_lids = "<< num_export_lids
931  << " != 0, but num_packets_per_lid.view_device().data() = nullptr.");
932 
933  if (num_export_lids == 0) {
934  exports = exports_dual_view_type ();
935  return;
936  }
937 
938  // Array of offsets into the pack buffer.
939  using offsets_type = Kokkos::View<size_t*, BDT>;
940  offsets_type offsets ("offsets", num_export_lids + 1);
941 
942  // Compute number of packets per LID (row to send), as well as
943  // corresponding offsets (the prefix sum of the packet counts).
944  num_packets_per_lid.clear_sync_state ();
945  num_packets_per_lid.modify_device ();
946  using PackCrsGraphImpl::computeNumPacketsAndOffsets;
947  const size_t count =
948  computeNumPacketsAndOffsets (offsets, num_packets_per_lid.view_device (),
949  local_graph.row_map,
950  export_lids.view_device (),
951  export_pids.view_device ());
952 
953  // Resize the output pack buffer if needed.
954  if (count > static_cast<size_t> (exports.extent (0))) {
955  exports = exports_dual_view_type ("exports", count);
956  }
957 
958  // If exports has nonzero length at this point, then the graph has
959  // at least one entry to pack. Thus, if packing process ranks, we
960  // had better have at least one process rank to pack.
961  TEUCHOS_TEST_FOR_EXCEPTION
962  (pack_pids && exports.extent (0) != 0 &&
963  export_pids.extent (0) == 0, std::invalid_argument, prefix <<
964  "pack_pids is true, and exports.extent(0) = " <<
965  exports.extent (0) << " != 0, meaning that we need to pack at least "
966  "one graph entry, but export_pids.extent(0) = 0.");
967 
968  exports.modify_device ();
969  using PackCrsGraphImpl::do_pack;
970  do_pack<PT, LGT, LMT, BDT, BDT> (local_graph, local_col_map,
971  exports.view_device (),
972  num_packets_per_lid.view_device (),
973  export_lids.view_device (),
974  export_pids.view_device (),
975  offsets, pack_pids);
976 }
977 
/// \brief Pack rows of \c sourceGraph together with process ranks,
///   taking host Teuchos::ArrayView inputs.
///
/// Wraps the host arrays as views for \c buffer_device_type, calls
/// PackCrsGraphImpl::packCrsGraph with \c pack_pids set to true, and
/// then copies the per-LID packet counts back into \c numPacketsPerLID.
///
/// \param sourceGraph [in] Graph to pack.
/// \param exports_dv [in/out] Pack buffer; forwarded unchanged to
///   PackCrsGraphImpl::packCrsGraph.  This function does not copy it
///   to host.
/// \param numPacketsPerLID [out] Host array that receives the packet
///   counts after packing.
/// \param exportLIDs [in] Host array of local indices to pack.
/// \param sourcePIDs [in] Host array of process ranks to pack.
/// \param constantNumPackets [out] Forwarded to
///   PackCrsGraphImpl::packCrsGraph.
/// \param distor Forwarded to PackCrsGraphImpl::packCrsGraph.
978 template<typename LO, typename GO, typename NT>
979 void
981  Kokkos::DualView<typename CrsGraph<LO,GO,NT>::packet_type*,
983  exports_dv,
984  const Teuchos::ArrayView<size_t>& numPacketsPerLID,
985  const Teuchos::ArrayView<const LO>& exportLIDs,
986  const Teuchos::ArrayView<const int>& sourcePIDs,
987  size_t& constantNumPackets,
988  Distributor& distor)
989 {
990  typedef typename CrsGraph<LO,GO,NT>::local_graph_type local_graph_type;
991  typedef typename CrsGraph<LO,GO,NT>::packet_type packet_type;
992  typedef typename CrsGraph<LO,GO,NT>::buffer_device_type buffer_device_type;
// Host-side device type, used only for the unmanaged host view that
// receives the packet counts at the end.
993  typedef typename Kokkos::DualView<packet_type*, buffer_device_type>::t_host::execution_space host_exec_space;
994  typedef Kokkos::Device<host_exec_space, Kokkos::HostSpace> host_dev_type;
995 
// NOTE(review): outputDevice is not referenced in any line visible
// here -- confirm whether it is still needed.
996  typename local_graph_type::device_type outputDevice;
997 
998  // Convert all Teuchos::Array to Kokkos::View
999 
1000  // This is an output array, so we don't have to copy to device here.
1001  // However, we'll have to remember to copy back to host when done.
1002  auto num_packets_per_lid_d =
1003  create_mirror_view_from_raw_host_array (buffer_device_type (),
1004  numPacketsPerLID.getRawPtr (),
1005  numPacketsPerLID.size (), false,
1006  "num_packets_per_lid");
1007 
1008  // This is an input array, so we have to copy to device here.
1009  // However, we never need to copy it back to host.
1010  auto export_lids_d =
1012  exportLIDs.getRawPtr (),
1013  exportLIDs.size (), true,
1014  "export_lids");
1015  // This is an input array, so we have to copy to device here.
1016  // However, we never need to copy it back to host.
1017  auto export_pids_d =
1019  sourcePIDs.getRawPtr (),
1020  sourcePIDs.size (), true,
1021  "export_pids");
// Unlike the tail of packCrsGraph above, which passes false, this
// entry point always asks for process ranks to be packed.
1022  constexpr bool pack_pids = true;
1023  PackCrsGraphImpl::packCrsGraph<LO,GO,NT>(
1024  sourceGraph, exports_dv, num_packets_per_lid_d, export_lids_d,
1025  export_pids_d, constantNumPackets, pack_pids, distor);
1026 
1027  // The counts are an output of packCrsGraph, so we
1028  // have to copy them back to host.
// num_packets_per_lid_h wraps the caller's raw pointer, so the
// deep_copy writes directly into numPacketsPerLID.
1029  Kokkos::View<size_t*, host_dev_type> num_packets_per_lid_h
1030  (numPacketsPerLID.getRawPtr (), numPacketsPerLID.size ());
1031  Kokkos::deep_copy (num_packets_per_lid_h, num_packets_per_lid_d);
1032 }
1033 
1034 } // namespace Details
1035 } // namespace Tpetra
1036 
// Explicit-instantiation macro for one (LO, GO, NT) combination.
// It expands to three `template void` explicit instantiations:
// Details::packCrsGraph, Details::packCrsGraphNew, and
// Details::packCrsGraphWithOwningPIDs.
//
// The expansion already ends with a semicolon.  The function names
// are qualified only with `Details::`, so the macro is presumably
// meant to be expanded inside namespace Tpetra -- TODO confirm
// against the instantiation site.
1037 #define TPETRA_DETAILS_PACKCRSGRAPH_INSTANT( LO, GO, NT ) \
1038  template void \
1039  Details::packCrsGraph<LO, GO, NT> ( \
1040  const CrsGraph<LO, GO, NT>&, \
1041  Teuchos::Array<CrsGraph<LO,GO,NT>::packet_type>&, \
1042  const Teuchos::ArrayView<size_t>&, \
1043  const Teuchos::ArrayView<const LO>&, \
1044  size_t&, \
1045  Distributor&); \
1046  template void \
1047  Details::packCrsGraphNew<LO, GO, NT> ( \
1048  const CrsGraph<LO, GO, NT>&, \
1049  const Kokkos::DualView< \
1050  const LO*, \
1051  CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1052  const Kokkos::DualView< \
1053  const int*, \
1054  CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1055  Kokkos::DualView< \
1056  CrsGraph<LO,GO,NT>::packet_type*, \
1057  CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1058  Kokkos::DualView< \
1059  size_t*, \
1060  CrsGraph<LO,GO,NT>::buffer_device_type>, \
1061  size_t&, \
1062  const bool, \
1063  Distributor&); \
1064  template void \
1065  Details::packCrsGraphWithOwningPIDs<LO, GO, NT> ( \
1066  const CrsGraph<LO, GO, NT>&, \
1067  Kokkos::DualView<CrsGraph<LO,GO,NT>::packet_type*, CrsGraph<LO,GO,NT>::buffer_device_type>&, \
1068  const Teuchos::ArrayView<size_t>&, \
1069  const Teuchos::ArrayView<const LO>&, \
1070  const Teuchos::ArrayView<const int>&, \
1071  size_t&, \
1072  Distributor&);
1073 
1074 #endif // TPETRA_DETAILS_PACKCRSGRAPH_DEF_HPP
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Declaration of the Tpetra::CrsGraph class.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Compute the number of packets and offsets for the pack procedure.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
Sets up and executes a communication plan for a Tpetra DistObject.
Kokkos::View< const value_type *, OtherDeviceType, Kokkos::MemoryUnmanaged > input_array_type
The type of an input array of value_type.
void packCrsGraph(const CrsGraph< LO, GO, NT > &sourceGraph, Teuchos::Array< typename CrsGraph< LO, GO, NT >::packet_type > &exports, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, size_t &constantNumPackets, Distributor &distor)
Pack specified entries of the given local sparse graph for communication.
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
local_graph_type getLocalGraph() const
Get the local graph.
void packCrsGraphNew(const CrsGraph< LO, GO, NT > &sourceGraph, const Kokkos::DualView< const LO *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exportLIDs, const Kokkos::DualView< const int *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exportPIDs, Kokkos::DualView< typename CrsGraph< LO, GO, NT >::packet_type *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exports, Kokkos::DualView< size_t *, typename CrsGraph< LO, GO, NT >::buffer_device_type > numPacketsPerLID, size_t &constantNumPackets, const bool pack_pids, Distributor &distor)
Pack specified entries of the given local sparse graph for communication, for "new" DistObject interf...
Declaration and definition of Tpetra::Details::getEntryOnHost.
global_ordinal_type packet_type
Type of each entry of the DistObject communication buffer.
void packCrsGraphWithOwningPIDs(const CrsGraph< LO, GO, NT > &sourceGraph, Kokkos::DualView< typename CrsGraph< LO, GO, NT >::packet_type *, typename CrsGraph< LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets, Distributor &distor)
Pack specified entries of the given local sparse graph for communication.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.