Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
// @HEADER
// *****************************************************************************
// Tpetra: Templated Linear Algebra Services Package
//
// Copyright 2008 NTESS and the Tpetra contributors.
// SPDX-License-Identifier: BSD-3-Clause
// *****************************************************************************
// @HEADER

#ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP

#include "TpetraCore_config.h"
#include "Teuchos_Array.hpp"
#include "Teuchos_ArrayView.hpp"
#include "Tpetra_CrsGraph_decl.hpp"
#include "Kokkos_Core.hpp"
#include <memory>
#include <string>

namespace Tpetra {

//
// Users must never rely on anything in the Details namespace.
//
namespace Details {

namespace UnpackAndCombineCrsGraphImpl {

template <class Packet, class GO, class Device, class BufferDevice>
KOKKOS_FUNCTION int
unpackRow(const Kokkos::View<GO*, Device, Kokkos::MemoryUnmanaged>& gids_out,
          const Kokkos::View<int*, Device, Kokkos::MemoryUnmanaged>& pids_out,
          const Kokkos::View<const Packet*, BufferDevice>& imports,
          const size_t offset,
          const size_t num_ent) {
  using size_type = typename Kokkos::View<GO*, Device>::size_type;

  if (num_ent == 0) {
    // Empty rows always take zero bytes, to ensure sparsity.
    return 0;
  }

  // Unpack GIDs
  for (size_type k = 0; k < num_ent; k++)
    gids_out(k) = imports(offset + k);

  // Unpack PIDs
  if (pids_out.size() > 0) {
    for (size_type k = 0; k < num_ent; k++) {
      pids_out(k) = static_cast<int>(imports(offset + num_ent + k));
    }
  }

  return 0;
}

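// Packet layout assumed by unpackRow, illustrated for a row with entries
// {7, 42} owned by ranks {0, 3} (the PID block is only read when pids_out
// is nonempty):
//
//   imports(offset + 0) = 7    // GIDs come first ...
//   imports(offset + 1) = 42
//   imports(offset + 2) = 0    // ... followed by the owning PIDs
//   imports(offset + 3) = 3
//
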
template <class LocalOrdinal,
          class Packet,
          class RowView,
          class IndicesView,
          class BufferDevice>
class UnpackAndCombineFunctor {
  using LO = LocalOrdinal;
  using GO = typename IndicesView::value_type;
  using packet_type = Packet;
  using row_ptrs_type = RowView;
  using indices_type = IndicesView;
  using buffer_device_type = BufferDevice;

  using device_type = typename IndicesView::device_type;
  using execution_space = typename device_type::execution_space;

  using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
  using offsets_type = Kokkos::View<const size_t*, device_type>;
  using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
  using import_lids_type = Kokkos::View<const LO*, buffer_device_type>;

  using gids_scratch_type = Kokkos::View<GO*, device_type>;
  using pids_scratch_type = Kokkos::View<int*, device_type>;

  row_ptrs_type row_ptrs_beg;
  row_ptrs_type row_ptrs_end;
  indices_type indices;
  input_buffer_type imports;
  num_packets_per_lid_type num_packets_per_lid;
  import_lids_type import_lids;
  offsets_type offsets;
  size_t max_num_ent;
  bool unpack_pids;
  Kokkos::Experimental::UniqueToken<execution_space,
                                    Kokkos::Experimental::UniqueTokenScope::Global>
      tokens;
  gids_scratch_type gids_scratch;
  pids_scratch_type pids_scratch;

 public:
  using value_type = Kokkos::pair<int, LO>;

  UnpackAndCombineFunctor(
      const row_ptrs_type& row_ptrs_beg_in,
      const row_ptrs_type& row_ptrs_end_in,
      const indices_type& indices_in,
      const input_buffer_type& imports_in,
      const num_packets_per_lid_type& num_packets_per_lid_in,
      const import_lids_type& import_lids_in,
      const offsets_type& offsets_in,
      const size_t max_num_ent_in,
      const bool unpack_pids_in)
    : row_ptrs_beg(row_ptrs_beg_in)
    , row_ptrs_end(row_ptrs_end_in)
    , indices(indices_in)
    , imports(imports_in)
    , num_packets_per_lid(num_packets_per_lid_in)
    , import_lids(import_lids_in)
    , offsets(offsets_in)
    , max_num_ent(max_num_ent_in)
    , unpack_pids(unpack_pids_in)
    , tokens(execution_space())
    , gids_scratch("gids_scratch", tokens.size() * max_num_ent)
    , pids_scratch("pids_scratch", tokens.size() * max_num_ent) {}

  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const {
    using Tpetra::Details::OrdinalTraits;
    dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
  }

  KOKKOS_INLINE_FUNCTION void
  join(value_type& dst, const value_type& src) const {
    // `dst` should reflect the first (least) bad index and its associated
    // error code.  Thus, we need only check whether `src` shows an error
    // and, if so, whether its bad index is less than `dst`'s bad index.
    using Tpetra::Details::OrdinalTraits;
    if (src.second != OrdinalTraits<LO>::invalid()) {
      // `src` shows an error; take it if `dst` shows no error yet, or if
      // `src`'s bad index is less than `dst`'s bad index.
      if (dst.second == OrdinalTraits<LO>::invalid() ||
          src.second < dst.second) {
        dst = src;
      }
    }
  }

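  // Example of the reduction semantics: starting from the init value
  // (0, invalid), joining with (2, 5) yields (2, 5); joining that with
  // (3, 1) yields (3, 1), since row 1 precedes row 5.  The result thus
  // reports the error at the smallest offending local row.
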
  KOKKOS_INLINE_FUNCTION
  void operator()(const LO i, value_type& dst) const {
    using Kokkos::MemoryUnmanaged;
    using Kokkos::subview;
    using Kokkos::View;
    using size_type = typename execution_space::size_type;
    using slice = typename Kokkos::pair<size_type, size_type>;

    using pids_out_type = View<int*, device_type, MemoryUnmanaged>;
    using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;

    const size_t num_packets_this_lid = num_packets_per_lid(i);
    const size_t num_ent = (unpack_pids) ? num_packets_this_lid / 2
                                         : num_packets_this_lid;
    if (unpack_pids && num_packets_this_lid % 2 != 0) {
      // Attempting to unpack PIDs, but num_packets_this_lid is not even;
      // this should never happen.
      dst = Kokkos::make_pair(1, i);
      return;
    }

    // Only unpack data if there is a nonzero number of entries to unpack.
    if (num_ent == 0) {
      return;
    }

    // There is actually something in the row.
    const size_t buf_size = imports.size();
    const size_t offset = offsets(i);

    if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
      dst = Kokkos::make_pair(2, i);  // out of bounds
      return;
    }

    // Get subviews into the scratch arrays.  The token returned from
    // acquire is an integer in [0, tokens.size()).  It is used to grab a
    // unique (to this thread) subview of the scratch arrays.
    const size_type token = tokens.acquire();
    const size_t a = static_cast<size_t>(token) * max_num_ent;
    const size_t b = a + num_ent;
    gids_out_type gids_out = subview(gids_scratch, slice(a, b));
    pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));

    const int err = unpackRow(gids_out, pids_out, imports, offset, num_ent);

    if (err != 0) {
      dst = Kokkos::make_pair(3, i);
      tokens.release(token);
      return;
    }

    auto import_lid = import_lids(i);
    for (size_t k = 0; k < num_ent; ++k) {
      indices(row_ptrs_end(import_lid)) = gids_out(k);
      // This is OK; no atomic needed, since the LIDs to unpack have no repeats.
      row_ptrs_end(import_lid) += 1;
    }

    tokens.release(token);
  }
};

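// Error codes produced by UnpackAndCombineFunctor::operator():
//   1 = odd packet count for a row while unpacking PIDs,
//   2 = packet offset out of bounds in the imports buffer,
//   3 = unpackRow reported a failure.
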
template <class LocalOrdinal, class GlobalOrdinal, class Node,
          class RowView, class IndicesView, class BufferDevice>
void unpackAndCombine(const RowView& row_ptrs_beg,
                      const RowView& row_ptrs_end,
                      IndicesView& indices,
                      const Kokkos::View<const GlobalOrdinal*, BufferDevice,
                                         Kokkos::MemoryUnmanaged>& imports,
                      const Kokkos::View<const size_t*, BufferDevice,
                                         Kokkos::MemoryUnmanaged>& num_packets_per_lid,
                      const Kokkos::View<const LocalOrdinal*, BufferDevice,
                                         Kokkos::MemoryUnmanaged>& import_lids,
                      const typename CrsGraph<LocalOrdinal, GlobalOrdinal,
                                              Node>::padding_type& padding,
                      const bool unpack_pids,
                      const int myRank,
                      const bool verbose) {
  using LO = LocalOrdinal;
  using GO = GlobalOrdinal;
  using device_type = typename Node::device_type;
  using execution_space = typename BufferDevice::execution_space;
  using range_policy =
      Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
  using unpack_functor_type =
      UnpackAndCombineFunctor<LO, GO, RowView, IndicesView, BufferDevice>;

  const char prefix[] =
      "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";

  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
  if (num_import_lids == 0) {
    // Nothing to unpack
    return;
  }

  // Resize row pointers and indices to accommodate incoming data
  padCrsArrays(row_ptrs_beg, row_ptrs_end, indices, padding,
               myRank, verbose);

  // Get the offsets
  Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids + 1);
  computeOffsetsFromCounts(offsets, num_packets_per_lid);

  // Determine the maximum number of entries in any row in the graph.  The
  // maximum number of entries is needed to allocate unpack buffers on the
  // device.
  size_t max_num_ent;
  Kokkos::parallel_reduce(
      "MaxReduce",
      range_policy(0, LO(num_packets_per_lid.size())),
      KOKKOS_LAMBDA(const LO i, size_t& running_max_num_ent) {
        const size_t num_packets_this_lid = num_packets_per_lid(i);
        const size_t num_ent = (unpack_pids) ? num_packets_this_lid / 2 : num_packets_this_lid;
        if (num_ent > running_max_num_ent) {
          running_max_num_ent = num_ent;
        }
      },
      Kokkos::Max<size_t>(max_num_ent));

  // Now do the actual unpack!
  unpack_functor_type f(row_ptrs_beg, row_ptrs_end, indices, imports,
                        num_packets_per_lid, import_lids, offsets,
                        max_num_ent, unpack_pids);

  typename unpack_functor_type::value_type x;
  Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
  auto x_h = x.to_std_pair();
  TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
                             prefix << "UnpackAndCombineFunctor reported error code "
                                    << x_h.first << " for the first bad row " << x_h.second);
}

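// For example, computeOffsetsFromCounts turns num_packets_per_lid = [4, 2, 6]
// into offsets = [0, 4, 6, 12], so the packets of import LID i occupy
// imports[offsets(i) .. offsets(i+1)).
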
template <class Packet, class LocalGraph, class BufferDevice>
size_t
unpackAndCombineWithOwningPIDsCount(
    const LocalGraph& local_graph,
    const Kokkos::View<const typename LocalGraph::data_type*,
                       typename LocalGraph::device_type,
                       Kokkos::MemoryUnmanaged>
        permute_from_lids,
    const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
    const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
    const size_t num_same_ids) {
  using Kokkos::parallel_reduce;
  using local_graph_type = LocalGraph;
  using LO = typename local_graph_type::data_type;
  using device_type = typename local_graph_type::device_type;
  using execution_space = typename device_type::execution_space;
  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;

  // Number of graph entries to unpack (returned by this function).
  size_t count = 0;
  LO num_items;

  // Count entries copied directly from the source graph without permuting.
  num_items = static_cast<LO>(num_same_ids);
  if (num_items) {
    size_t kcnt = 0;
    parallel_reduce(
        range_policy(0, num_items),
        KOKKOS_LAMBDA(const LO lid, size_t& update) {
          update += static_cast<size_t>(local_graph.row_map[lid + 1] - local_graph.row_map[lid]);
        },
        kcnt);
    count += kcnt;
  }

  // Count entries copied directly from the source graph with permuting.
  num_items = static_cast<LO>(permute_from_lids.extent(0));
  if (num_items) {
    size_t kcnt = 0;
    parallel_reduce(
        range_policy(0, num_items),
        KOKKOS_LAMBDA(const LO i, size_t& update) {
          const LO lid = permute_from_lids(i);
          update += static_cast<size_t>(local_graph.row_map[lid + 1] - local_graph.row_map[lid]);
        },
        kcnt);
    count += kcnt;
  }

  {
    // Count entries received from other MPI processes.
    size_t tot_num_ent = 0;
    parallel_reduce(
        "SumReduce",
        range_policy(0, num_packets_per_lid.size()),
        KOKKOS_LAMBDA(const int& i, size_t& lsum) {
          lsum += num_packets_per_lid(i) / 2;
        },
        Kokkos::Sum<size_t>(tot_num_ent));
    count += tot_num_ent;
  }

  return count;
}

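// Worked example: with two "same" rows of lengths {3, 1}, one permuted row
// of length 2, and num_packets_per_lid = [4, 6] for the remotes (each
// remote entry occupies two packets, a GID and its owning PID), the
// function returns 3 + 1 + 2 + (4 + 6) / 2 = 11.
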
template <class Packet, class LO, class Device, class BufferDevice>
void setupRowPointersForRemotes(
    const Kokkos::View<size_t*, Device>& tgt_rowptr,
    const Kokkos::View<const LO*, BufferDevice>& import_lids,
    const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
    const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid) {
  using Kokkos::parallel_for;
  using device_type = Device;
  using execution_space = typename device_type::execution_space;
  using size_type = typename Kokkos::View<size_t*, device_type>::size_type;
  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;

  const size_type N = num_packets_per_lid.extent(0);
  parallel_for(
      "Setup row pointers for remotes",
      range_policy(0, N),
      KOKKOS_LAMBDA(const size_t i) {
        using atomic_incr_type = typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
        const size_t num_packets_this_lid = num_packets_per_lid(i);
        const size_t num_ent = num_packets_this_lid / 2;
        Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
      });
}

// Convert array of row lengths to a CRS pointer array
template <class Device>
void makeCrsRowPtrFromLengths(
    const Kokkos::View<size_t*, Device, Kokkos::MemoryUnmanaged>& tgt_rowptr,
    const Kokkos::View<size_t*, Device>& new_start_row) {
  using Kokkos::parallel_scan;
  using device_type = Device;
  using execution_space = typename device_type::execution_space;
  using size_type = typename Kokkos::View<size_t*, device_type>::size_type;
  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
  const size_type N = new_start_row.extent(0);
  parallel_scan(
      range_policy(0, N),
      KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
        auto cur_val = tgt_rowptr(i);
        if (final) {
          tgt_rowptr(i) = update;
          new_start_row(i) = tgt_rowptr(i);
        }
        update += cur_val;
      });
}

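// The parallel_scan above is an exclusive prefix sum.  For row lengths
// [2, 0, 3, 0] (the trailing 0 is the extra rowptr slot), tgt_rowptr
// becomes [0, 2, 2, 5], and new_start_row receives a copy of those values.
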
template <class LocalGraph, class LocalMap>
void copyDataFromSameIDs(
    const Kokkos::View<typename LocalMap::global_ordinal_type*,
                       typename LocalMap::device_type>& tgt_colind,
    const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
    const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
    const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
    const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
    const LocalGraph& local_graph,
    const LocalMap& local_col_map,
    const size_t num_same_ids,
    const int my_pid) {
  using Kokkos::parallel_for;
  using device_type = typename LocalMap::device_type;
  using LO = typename LocalMap::local_ordinal_type;
  using execution_space = typename device_type::execution_space;
  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;

  parallel_for(
      range_policy(0, num_same_ids),
      KOKKOS_LAMBDA(const size_t i) {
        using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;

        const LO src_lid = static_cast<LO>(i);
        size_t src_row = local_graph.row_map(src_lid);

        const LO tgt_lid = static_cast<LO>(i);
        const size_t tgt_row = tgt_rowptr(tgt_lid);

        const size_t nsr = local_graph.row_map(src_lid + 1) - local_graph.row_map(src_lid);
        Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));

        for (size_t j = local_graph.row_map(src_lid);
             j < local_graph.row_map(src_lid + 1); ++j) {
          LO src_col = local_graph.entries(j);
          tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
          tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
        }
      });
}

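// Entries whose source PID equals my_pid are stored as -1 in tgt_pids,
// i.e. -1 marks a locally owned column in the combined target arrays.
// copyDataFromPermuteIDs and unpackAndCombineIntoCrsArrays2 below follow
// the same convention.
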
template <class LocalGraph, class LocalMap, class BufferDevice>
void copyDataFromPermuteIDs(
    const Kokkos::View<typename LocalMap::global_ordinal_type*,
                       typename LocalMap::device_type>& tgt_colind,
    const Kokkos::View<int*,
                       typename LocalMap::device_type>& tgt_pids,
    const Kokkos::View<size_t*,
                       typename LocalMap::device_type>& new_start_row,
    const Kokkos::View<size_t*,
                       typename LocalMap::device_type>& tgt_rowptr,
    const Kokkos::View<const int*,
                       typename LocalMap::device_type>& src_pids,
    const Kokkos::View<const typename LocalMap::local_ordinal_type*,
                       BufferDevice, Kokkos::MemoryUnmanaged>& permute_to_lids,
    const Kokkos::View<const typename LocalMap::local_ordinal_type*,
                       BufferDevice, Kokkos::MemoryUnmanaged>& permute_from_lids,
    const LocalGraph& local_graph,
    const LocalMap& local_col_map,
    const int my_pid) {
  using Kokkos::parallel_for;
  using device_type = typename LocalMap::device_type;
  using LO = typename LocalMap::local_ordinal_type;
  using execution_space = typename device_type::execution_space;
  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;

  const size_type num_permute_to_lids = permute_to_lids.extent(0);

  parallel_for(
      range_policy(0, num_permute_to_lids),
      KOKKOS_LAMBDA(const size_t i) {
        using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;

        const LO src_lid = permute_from_lids(i);
        const size_t src_row = local_graph.row_map(src_lid);

        const LO tgt_lid = permute_to_lids(i);
        const size_t tgt_row = tgt_rowptr(tgt_lid);

        size_t nsr = local_graph.row_map(src_lid + 1) - local_graph.row_map(src_lid);
        Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));

        for (size_t j = local_graph.row_map(src_lid);
             j < local_graph.row_map(src_lid + 1); ++j) {
          LO src_col = local_graph.entries(j);
          tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
          tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
        }
      });
}

template <class Packet, class LocalGraph, class LocalMap, class BufferDevice>
void unpackAndCombineIntoCrsArrays2(
    const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
    const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
    const Kokkos::View<size_t*, typename LocalMap::device_type>& new_start_row,
    const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
    const Kokkos::View<
        const typename LocalMap::local_ordinal_type*,
        BufferDevice,
        Kokkos::MemoryUnmanaged>& import_lids,
    const Kokkos::View<const Packet*, BufferDevice>& imports,
    const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
    const LocalGraph& /* local_graph */,
    const LocalMap /*& local_col_map*/,
    const int my_pid) {
  using Kokkos::atomic_fetch_add;
  using Kokkos::MemoryUnmanaged;
  using Kokkos::parallel_reduce;
  using Kokkos::subview;
  using Kokkos::View;

  using device_type = typename LocalMap::device_type;
  using LO = typename LocalMap::local_ordinal_type;
  using GO = typename LocalMap::global_ordinal_type;
  using execution_space = typename device_type::execution_space;
  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
  using slice = typename Kokkos::pair<size_type, size_type>;
  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;

  using pids_out_type = View<int*, device_type, MemoryUnmanaged>;
  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;

  const size_type num_import_lids = import_lids.size();
  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";

  // RemoteIDs: Loop structure following UnpackAndCombine
  int gbl_err_count;
  parallel_reduce(
      "Unpack and combine into CRS",
      range_policy(0, num_import_lids),
      KOKKOS_LAMBDA(const size_t i, int& err) {
        using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
        const size_t num_packets_this_lid = num_packets_per_lid(i);
        const size_t num_ent = num_packets_this_lid / 2;
        const size_t offset = offsets(i);
        const LO lcl_row = import_lids(i);
        const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
        const size_t end_row = start_row + num_ent;

        gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
        pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));

        err += unpackRow(gids_out, pids_out, imports, offset, num_ent);

        // Correct target PIDs.
        for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
          const int pid = pids_out(j);
          pids_out(j) = (pid != my_pid) ? pid : -1;
        }
      },
      gbl_err_count);

  TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0, std::invalid_argument,
                             prefix << "Attempting to unpack PIDs, but num_ent is not even; this should never "
                                       "happen! Please report this bug to the Tpetra developers.");

  return;
}

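// In the kernel above, new_start_row(lcl_row) holds the next free slot of
// target row lcl_row.  The atomic_fetch_add reserves num_ent slots, so
// contributions from different import LIDs that land in the same target
// row cannot overwrite each other.
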
template <class Packet, class LocalGraph, class LocalMap, class BufferDevice>
void unpackAndCombineIntoCrsArrays(
    const LocalGraph& local_graph,
    const LocalMap& local_col_map,
    const Kokkos::View<const typename LocalMap::local_ordinal_type*,
                       BufferDevice,
                       Kokkos::MemoryUnmanaged>& import_lids,
    const Kokkos::View<const Packet*, BufferDevice>& imports,
    const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
    const Kokkos::View<const typename LocalMap::local_ordinal_type*,
                       BufferDevice,
                       Kokkos::MemoryUnmanaged>& permute_to_lids,
    const Kokkos::View<const typename LocalMap::local_ordinal_type*,
                       BufferDevice,
                       Kokkos::MemoryUnmanaged>& permute_from_lids,
    const Kokkos::View<size_t*,
                       typename LocalMap::device_type,
                       Kokkos::MemoryUnmanaged>& tgt_rowptr,
    const Kokkos::View<typename LocalMap::global_ordinal_type*,
                       typename LocalMap::device_type,
                       Kokkos::MemoryUnmanaged>& tgt_colind,
    const Kokkos::View<const int*,
                       typename LocalMap::device_type,
                       Kokkos::MemoryUnmanaged>& src_pids,
    const Kokkos::View<int*,
                       typename LocalMap::device_type,
                       Kokkos::MemoryUnmanaged>& tgt_pids,
    const size_t num_same_ids,
    const size_t tgt_num_rows,
    const size_t tgt_num_nonzeros,
    const int my_tgt_pid) {
  using Kokkos::MemoryUnmanaged;
  using Kokkos::parallel_for;
  using Kokkos::subview;
  using Kokkos::View;
  using packet_type = Packet;
  using local_map_type = LocalMap;
  using local_graph_type = LocalGraph;
  using buffer_device_type = BufferDevice;
  using device_type = typename LocalMap::device_type;
  using LO = typename LocalMap::local_ordinal_type;
  using execution_space = typename device_type::execution_space;
  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;

  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";

  const size_t N = tgt_num_rows;
  const size_t mynnz = tgt_num_nonzeros;

  // In the case of reduced communicators, the sourceGraph won't have the
  // right "my_pid", so we have to supply it.
  const int my_pid = my_tgt_pid;

  // FIXME (mfh 24 Jun 2019)
  //
  // 1. Only zero the entries of tgt_rowptr that actually need it.
  // 2. Consider merging these three kernels into one.

  // Zero the rowptr
  parallel_for(
      range_policy(0, N + 1),
      KOKKOS_LAMBDA(const size_t i) {
        tgt_rowptr(i) = 0;
      });

  // Same IDs: always first, always in the same place
  parallel_for(
      range_policy(0, num_same_ids),
      KOKKOS_LAMBDA(const size_t i) {
        const LO tgt_lid = static_cast<LO>(i);
        const LO src_lid = static_cast<LO>(i);
        tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid + 1) - local_graph.row_map(src_lid);
      });

  // Permute IDs: still local, but reordered
  const size_type num_permute_to_lids = permute_to_lids.extent(0);
  parallel_for(
      range_policy(0, num_permute_to_lids),
      KOKKOS_LAMBDA(const size_t i) {
        const LO tgt_lid = permute_to_lids(i);
        const LO src_lid = permute_from_lids(i);
        tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid + 1) - local_graph.row_map(src_lid);
      });

  // Get the offsets from the number of packets per LID
  const size_type num_import_lids = import_lids.extent(0);
  View<size_t*, device_type> offsets("offsets", num_import_lids + 1);
  computeOffsetsFromCounts(offsets, num_packets_per_lid);

#ifdef HAVE_TPETRA_DEBUG
  {
    auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
    const bool condition =
        nth_offset_h != static_cast<size_t>(imports.extent(0));
    TEUCHOS_TEST_FOR_EXCEPTION(condition, std::logic_error,
                               prefix << "The final offset in bytes " << nth_offset_h
                                      << " != imports.size() = " << imports.extent(0)
                                      << ".  Please report this bug to the Tpetra developers.");
  }
#endif  // HAVE_TPETRA_DEBUG

  // Set up row pointers for remotes
  setupRowPointersForRemotes<packet_type, LO, device_type, buffer_device_type>(
      tgt_rowptr, import_lids, imports, num_packets_per_lid);

  // If multiple processes contribute to the same row, we may need to
  // update row offsets.  This tracks that.
  View<size_t*, device_type> new_start_row("new_start_row", N + 1);

  // Turn the row lengths into a real CRS row pointer
  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
  {
    auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
    bool condition = nth_tgt_rowptr_h != mynnz;
    TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
                               prefix << "CRS_rowptr[last] = " << nth_tgt_rowptr_h
                                      << " != mynnz = " << mynnz << ".");
  }

  // SameIDs: copy the data over
  copyDataFromSameIDs<LocalGraph, LocalMap>(tgt_colind, tgt_pids, new_start_row,
                                            tgt_rowptr, src_pids, local_graph,
                                            local_col_map, num_same_ids, my_pid);

  copyDataFromPermuteIDs<LocalGraph, LocalMap>(tgt_colind, tgt_pids, new_start_row,
                                               tgt_rowptr, src_pids, permute_to_lids,
                                               permute_from_lids, local_graph,
                                               local_col_map, my_pid);

  if (imports.extent(0) <= 0) {
    return;
  }

  unpackAndCombineIntoCrsArrays2<
      packet_type, local_graph_type, local_map_type, buffer_device_type>(
      tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
      num_packets_per_lid, local_graph, local_col_map, my_pid);

  return;
}

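// Overall structure of the implementation above: zero tgt_rowptr, write row
// lengths for the "same" and permuted rows, add the remote row lengths
// atomically, convert the lengths to CRS offsets with an exclusive scan,
// then fill tgt_colind / tgt_pids from the local graph and finally from the
// imports buffer.
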
}  // namespace UnpackAndCombineCrsGraphImpl

template <class LocalOrdinal, class GlobalOrdinal, class Node>
size_t
unpackAndCombineWithOwningPIDsCount(
    const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>& sourceGraph,
    const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
    const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::packet_type>& imports,
    const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
    size_t /* constantNumPackets */,
    CombineMode /* combineMode */,
    size_t numSameIDs,
    const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
    const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs) {
  using Kokkos::MemoryUnmanaged;
  using Kokkos::View;
  using device_type = typename Node::device_type;
  using packet_type = typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::packet_type;
  using local_graph_device_type = typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::local_graph_device_type;
  using buffer_device_type = typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::buffer_device_type;
  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";

  TEUCHOS_TEST_FOR_EXCEPTION(permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
                             prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
                                    << " != permuteFromLIDs.size() = "
                                    << permuteFromLIDs.size() << ".");
  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
  // process, then the graph is neither locally nor globally indexed.
  const bool locallyIndexed = sourceGraph.isLocallyIndexed();
  TEUCHOS_TEST_FOR_EXCEPTION(!locallyIndexed, std::invalid_argument, prefix << "The input "
                             "CrsGraph 'sourceGraph' must be locally indexed.");
  TEUCHOS_TEST_FOR_EXCEPTION(importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
                             prefix << "importLIDs.size() = " << importLIDs.size()
                                    << " != numPacketsPerLID.size() = "
                                    << numPacketsPerLID.size() << ".");

  auto local_graph = sourceGraph.getLocalGraphDevice();
  auto permute_from_lids_d =
      create_mirror_view_from_raw_host_array(device_type(),
                                             permuteFromLIDs.getRawPtr(),
                                             permuteFromLIDs.size(), true,
                                             "permute_from_lids");
  auto imports_d =
      create_mirror_view_from_raw_host_array(buffer_device_type(),
                                             imports.getRawPtr(),
                                             imports.size(), true,
                                             "imports");
  auto num_packets_per_lid_d =
      create_mirror_view_from_raw_host_array(buffer_device_type(),
                                             numPacketsPerLID.getRawPtr(),
                                             numPacketsPerLID.size(), true,
                                             "num_packets_per_lid");

  return UnpackAndCombineCrsGraphImpl::unpackAndCombineWithOwningPIDsCount<
      packet_type, local_graph_device_type, buffer_device_type>(
      local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
}

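// The returned count is the total number of target graph entries: entries
// of the numSameIDs "same" rows, plus entries of the permuted rows, plus
// one entry per (GID, PID) packet pair in the imports buffer.  Callers
// presumably use it to size the CRS arrays passed to
// unpackAndCombineIntoCrsArrays below.
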
template <class LocalOrdinal, class GlobalOrdinal, class Node>
void
unpackAndCombineIntoCrsArrays(
    const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>& sourceGraph,
    const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
    const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal, GlobalOrdinal, Node>::packet_type>& imports,
    const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
    const size_t /* constantNumPackets */,
    const CombineMode /* combineMode */,
    const size_t numSameIDs,
    const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
    const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
    size_t TargetNumRows,
    size_t TargetNumNonzeros,
    const int MyTargetPID,
    const Teuchos::ArrayView<size_t>& CRS_rowptr,
    const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
    const Teuchos::ArrayView<const int>& SourcePids,
    Teuchos::Array<int>& TargetPids) {
  using Kokkos::deep_copy;
  using Kokkos::View;
  using Teuchos::outArg;
  using Teuchos::REDUCE_MAX;
  using Teuchos::reduceAll;
  using LO = LocalOrdinal;
  using GO = GlobalOrdinal;
  using crs_graph_type = CrsGraph<LO, GO, Node>;
  using packet_type = typename crs_graph_type::packet_type;
  using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
  using buffer_device_type = typename crs_graph_type::buffer_device_type;
  using device_type = typename Node::device_type;
  using size_type = typename Teuchos::ArrayView<const LO>::size_type;

  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";

  TEUCHOS_TEST_FOR_EXCEPTION(
      TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
      std::invalid_argument, prefix << "CRS_rowptr.size() = " << CRS_rowptr.size()
                                    << " != TargetNumRows+1 = " << TargetNumRows + 1 << ".");

  TEUCHOS_TEST_FOR_EXCEPTION(
      permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
      prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
             << " != permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
  const size_type numImportLIDs = importLIDs.size();

  TEUCHOS_TEST_FOR_EXCEPTION(
      numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
      prefix << "importLIDs.size() = " << numImportLIDs
             << " != numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");

  // Preseed TargetPids with -1 for local
  if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
    TargetPids.resize(TargetNumNonzeros);
  }
  TargetPids.assign(TargetNumNonzeros, -1);

  // Grab pointers for sourceGraph
  auto local_graph = sourceGraph.getLocalGraphDevice();
  auto local_col_map = sourceGraph.getColMap()->getLocalMap();

  // Convert input arrays to Kokkos::View
  device_type outputDevice;
  buffer_device_type bufferOutputDevice;

  Kokkos::View<const LO*, buffer_device_type> import_lids_d =
      create_mirror_view_from_raw_host_array(bufferOutputDevice, importLIDs.getRawPtr(),
                                             importLIDs.size(), true, "import_lids");

  Kokkos::View<const packet_type*, buffer_device_type> imports_d =
      create_mirror_view_from_raw_host_array(bufferOutputDevice, imports.getRawPtr(),
                                             imports.size(), true, "imports");

  Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_d =
      create_mirror_view_from_raw_host_array(bufferOutputDevice,
                                             numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
                                             true, "num_packets_per_lid");

  Kokkos::View<const LO*, buffer_device_type> permute_to_lids_d =
      create_mirror_view_from_raw_host_array(bufferOutputDevice,
                                             permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
                                             true, "permute_to_lids");

  Kokkos::View<const LO*, buffer_device_type> permute_from_lids_d =
      create_mirror_view_from_raw_host_array(bufferOutputDevice,
                                             permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
                                             true, "permute_from_lids");

  Kokkos::View<size_t*, device_type> crs_rowptr_d =
      create_mirror_view_from_raw_host_array(outputDevice,
                                             CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
                                             true, "crs_rowptr");

  Kokkos::View<GO*, device_type> crs_colind_d =
      create_mirror_view_from_raw_host_array(outputDevice,
                                             CRS_colind.getRawPtr(), CRS_colind.size(),
                                             true, "crs_colidx");

  Kokkos::View<const int*, device_type> src_pids_d =
      create_mirror_view_from_raw_host_array(outputDevice,
                                             SourcePids.getRawPtr(), SourcePids.size(),
                                             true, "src_pids");

  Kokkos::View<int*, device_type> tgt_pids_d =
      create_mirror_view_from_raw_host_array(outputDevice,
                                             TargetPids.getRawPtr(), TargetPids.size(),
                                             true, "tgt_pids");

  using local_map_type = decltype(local_col_map);
  UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays<
      packet_type, local_graph_device_type, local_map_type, buffer_device_type>(
      local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
      permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
      tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);

  // FIXME (mfh 25 Jun 2019) host_mirror_type of CudaUVMSpace is CudaUVMSpace!!!

  // Copy outputs back to host
  typename decltype(crs_rowptr_d)::host_mirror_type crs_rowptr_h(
      CRS_rowptr.getRawPtr(), CRS_rowptr.size());
  deep_copy(crs_rowptr_h, crs_rowptr_d);

  typename decltype(crs_colind_d)::host_mirror_type crs_colind_h(
      CRS_colind.getRawPtr(), CRS_colind.size());
  deep_copy(crs_colind_h, crs_colind_d);

  typename decltype(tgt_pids_d)::host_mirror_type tgt_pids_h(
      TargetPids.getRawPtr(), TargetPids.size());
  deep_copy(tgt_pids_h, tgt_pids_d);
}

}  // namespace Details
}  // namespace Tpetra

#define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT(LO, GO, NT) \
  template void \
  Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
      const CrsGraph<LO, GO, NT>&, \
      const Teuchos::ArrayView<const LO>&, \
      const Teuchos::ArrayView<const typename CrsGraph<LO, GO, NT>::packet_type>&, \
      const Teuchos::ArrayView<const size_t>&, \
      const size_t, \
      const CombineMode, \
      const size_t, \
      const Teuchos::ArrayView<const LO>&, \
      const Teuchos::ArrayView<const LO>&, \
      size_t, \
      size_t, \
      const int, \
      const Teuchos::ArrayView<size_t>&, \
      const Teuchos::ArrayView<GO>&, \
      const Teuchos::ArrayView<const int>&, \
      Teuchos::Array<int>&); \
  template size_t \
  Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
      const CrsGraph<LO, GO, NT>&, \
      const Teuchos::ArrayView<const LO>&, \
      const Teuchos::ArrayView<const typename CrsGraph<LO, GO, NT>::packet_type>&, \
      const Teuchos::ArrayView<const size_t>&, \
      size_t, \
      CombineMode, \
      size_t, \
      const Teuchos::ArrayView<const LO>&, \
      const Teuchos::ArrayView<const LO>&);

#endif  // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP