Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
11 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
12 
13 #include "TpetraCore_config.h"
14 #include "Teuchos_Array.hpp"
15 #include "Teuchos_ArrayView.hpp"
21 #include "Tpetra_CrsGraph_decl.hpp"
24 #include "Kokkos_Core.hpp"
25 #include <memory>
26 #include <string>
27 
46 
47 namespace Tpetra {
48 
49 //
50 // Users must never rely on anything in the Details namespace.
51 //
52 namespace Details {
53 
54 namespace UnpackAndCombineCrsGraphImpl {
55 
65 template<class Packet, class GO, class Device, class BufferDevice>
66 KOKKOS_FUNCTION int
67 unpackRow (const Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
68  const Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
69  const Kokkos::View<const Packet*,BufferDevice>& imports,
70  const size_t offset,
71  const size_t num_ent)
72 {
73  using size_type = typename Kokkos::View<GO*,Device>::size_type;
74 
75  if (num_ent == 0) {
76  // Empty rows always take zero bytes, to ensure sparsity.
77  return 0;
78  }
79 
80  // Unpack GIDs
81  for (size_type k=0; k<num_ent; k++)
82  gids_out(k) = imports(offset+k);
83 
84  // Unpack PIDs
85  if (pids_out.size() > 0) {
86  for (size_type k=0; k<num_ent; k++) {
87  pids_out(k) = static_cast<int>(imports(offset+num_ent+k));
88  }
89  }
90 
91  return 0;
92 }
93 
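unpackRow above reads one packed row out of the contiguous imports buffer: num_ent global indices first, optionally followed by num_ent owning process ranks. A minimal host-only sketch of that layout in plain C++ (unpackRowSketch and its parameter names are illustrative, not Tpetra API):

#include <cstddef>
#include <vector>

// Unpack one row packed as [gid_0 ... gid_{n-1}, pid_0 ... pid_{n-1}] starting at `offset`.
void unpackRowSketch(std::vector<long long>& gids_out,
                     std::vector<int>& pids_out,
                     const std::vector<long long>& imports,
                     const std::size_t offset,
                     const std::size_t num_ent,
                     const bool unpack_pids)
{
  gids_out.assign(imports.begin() + offset, imports.begin() + offset + num_ent);
  if (unpack_pids) {
    pids_out.resize(num_ent);
    for (std::size_t k = 0; k < num_ent; ++k) {
      // PIDs follow the GIDs of the same row in the buffer.
      pids_out[k] = static_cast<int>(imports[offset + num_ent + k]);
    }
  }
}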
104 template<class LocalOrdinal,
105  class Packet,
106  class RowView,
107  class IndicesView,
108  class BufferDevice>
109 class UnpackAndCombineFunctor {
110 
111  using LO = LocalOrdinal;
112  using GO = typename IndicesView::value_type;
113  using packet_type = Packet;
114  using row_ptrs_type = RowView;
115  using indices_type = IndicesView;
116  using buffer_device_type = BufferDevice;
117 
118  using device_type = typename IndicesView::device_type;
119  using execution_space = typename device_type::execution_space;
120 
121  using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
122  using offsets_type = Kokkos::View<const size_t*, device_type>;
123  using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
124  using import_lids_type = Kokkos::View<const LO*, buffer_device_type>;
125 
126  using gids_scratch_type = Kokkos::View<GO*, device_type>;
127  using pids_scratch_type = Kokkos::View<int*,device_type>;
128 
129  row_ptrs_type row_ptrs_beg;
130  row_ptrs_type row_ptrs_end;
131  indices_type indices;
132  input_buffer_type imports;
133  num_packets_per_lid_type num_packets_per_lid;
134  import_lids_type import_lids;
135  offsets_type offsets;
136  size_t max_num_ent;
137  bool unpack_pids;
138  Kokkos::Experimental::UniqueToken<execution_space,
139  Kokkos::Experimental::UniqueTokenScope::Global> tokens;
140  gids_scratch_type gids_scratch;
141  pids_scratch_type pids_scratch;
142 
143  public:
144  using value_type = Kokkos::pair<int, LO>;
145 
146  UnpackAndCombineFunctor(
147  const row_ptrs_type& row_ptrs_beg_in,
148  const row_ptrs_type& row_ptrs_end_in,
149  const indices_type& indices_in,
150  const input_buffer_type& imports_in,
151  const num_packets_per_lid_type& num_packets_per_lid_in,
152  const import_lids_type& import_lids_in,
153  const offsets_type& offsets_in,
154  const size_t max_num_ent_in,
155  const bool unpack_pids_in) :
156  row_ptrs_beg(row_ptrs_beg_in),
157  row_ptrs_end(row_ptrs_end_in),
158  indices(indices_in),
159  imports(imports_in),
160  num_packets_per_lid(num_packets_per_lid_in),
161  import_lids(import_lids_in),
162  offsets(offsets_in),
163  max_num_ent(max_num_ent_in),
164  unpack_pids(unpack_pids_in),
165  tokens(execution_space()),
166  gids_scratch("gids_scratch", tokens.size() * max_num_ent),
167  pids_scratch("pids_scratch", tokens.size() * max_num_ent)
168  {}
169 
170  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
171  {
172  using Tpetra::Details::OrdinalTraits;
173  dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
174  }
175 
176  KOKKOS_INLINE_FUNCTION void
177  join(value_type& dst, const value_type& src) const
178  {
179  // `dst` should reflect the first (least) bad index and
180  // all other associated error codes and data. Thus, we need only
181  // check if the `src` object shows an error and if its associated
182  // bad index is less than `dst`'s bad index.
183  using Tpetra::Details::OrdinalTraits;
184  if (src.second != OrdinalTraits<LO>::invalid()) {
185  // An error in src; take src if either
186  // 1. dst shows no error, or
187  // 2. dst shows an error but src's bad index is less than
188  // dst's bad index.
189  if (dst.second == OrdinalTraits<LO>::invalid() ||
190  src.second < dst.second) {
191  dst = src;
192  }
193  }
194  }
195 
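The init/join pair above implements a reduction that reports the error whose bad local row index is smallest, with OrdinalTraits<LO>::invalid() standing in for "no error". A host-only sketch of the same rule in standard C++ (ErrPair, kInvalid, and joinErr are illustrative names):

#include <limits>
#include <utility>

using ErrPair = std::pair<int, long>;  // (error code, bad local row index)
constexpr long kInvalid = std::numeric_limits<long>::max();  // "no error yet"

ErrPair joinErr(ErrPair dst, const ErrPair& src) {
  // Keep src only if it reports an error and that error's row index is
  // earlier than whatever dst currently holds.
  if (src.second != kInvalid &&
      (dst.second == kInvalid || src.second < dst.second)) {
    dst = src;
  }
  return dst;
}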
196  KOKKOS_INLINE_FUNCTION
197  void operator()(const LO i, value_type& dst) const
198  {
199  using Kokkos::View;
200  using Kokkos::subview;
201  using Kokkos::MemoryUnmanaged;
202  using size_type = typename execution_space::size_type;
203  using slice = typename Kokkos::pair<size_type, size_type>;
204 
205  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
206  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
207 
208  const size_t num_packets_this_lid = num_packets_per_lid(i);
209  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
210  : num_packets_this_lid;
211  if (unpack_pids && num_packets_this_lid%2 != 0) {
212  // Attempting to unpack PIDs, but num_packets_this_lid is not even; this
213  // should never happen!
214  dst = Kokkos::make_pair(1, i);
215  return;
216  }
217 
218  // Only unpack data if there is a nonzero number to unpack
219  if (num_ent == 0) {
220  return;
221  }
222 
223  // there is actually something in the row
224  const size_t buf_size = imports.size();
225  const size_t offset = offsets(i);
226 
227  if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
228  dst = Kokkos::make_pair(2, i); // out of bounds
229  return;
230  }
231 
232  // Get subviews in to the scratch arrays. The token returned from acquire
233  // is an integer in [0, tokens.size()). It is used to grab a unique (to
234  // this thread) subview of the scratch arrays.
235  const size_type token = tokens.acquire();
236  const size_t a = static_cast<size_t>(token) * max_num_ent;
237  const size_t b = a + num_ent;
238  gids_out_type gids_out = subview(gids_scratch, slice(a, b));
239  pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
240 
241  const int err = unpackRow (gids_out, pids_out, imports, offset, num_ent);
242 
243  if (err != 0) {
244  dst = Kokkos::make_pair(3, i);
245  tokens.release(token);
246  return;
247  }
248 
249  auto import_lid = import_lids(i);
250  for (size_t k = 0; k < num_ent; ++k) {
251  indices(row_ptrs_end(import_lid)) = gids_out(k);
252  // this is OK; no atomic needed, since import LIDs don't have repeats.
253  row_ptrs_end(import_lid) += 1;
254  }
255 
256  tokens.release(token);
257  }
258 
259 };
260 
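The functor uses Kokkos::Experimental::UniqueToken to hand each concurrently running thread a private slice of the gids_scratch/pids_scratch arrays. Here is that pattern in isolation, as a small self-contained Kokkos sketch (independent of Tpetra; max_per_thread and the view names are made up for illustration):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::ScopeGuard guard(argc, argv);
  using exec_space = Kokkos::DefaultExecutionSpace;
  using token_type = Kokkos::Experimental::UniqueToken<
      exec_space, Kokkos::Experimental::UniqueTokenScope::Global>;

  const int max_per_thread = 8;           // analogue of max_num_ent
  token_type tokens(exec_space());
  // One private slice of length max_per_thread per possible token value.
  Kokkos::View<double*> scratch("scratch", tokens.size() * max_per_thread);

  Kokkos::parallel_for("use_scratch", Kokkos::RangePolicy<exec_space>(0, 1000),
    KOKKOS_LAMBDA(const int i) {
      const auto t = tokens.acquire();    // unique among currently running threads
      const std::size_t a = static_cast<std::size_t>(t) * max_per_thread;
      for (int k = 0; k < max_per_thread; ++k) {
        scratch(a + k) = static_cast<double>(i + k);  // race-free private slice
      }
      tokens.release(t);                  // return the slice for reuse
    });
  Kokkos::fence();
  return 0;
}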
267 template<class LocalOrdinal, class GlobalOrdinal, class Node,
268  class RowView, class IndicesView, class BufferDevice>
269 void
270 unpackAndCombine
271 (const RowView& row_ptrs_beg,
272  const RowView& row_ptrs_end,
273  IndicesView& indices,
274  const Kokkos::View<const GlobalOrdinal*, BufferDevice,
275  Kokkos::MemoryUnmanaged>& imports,
276  const Kokkos::View<const size_t*, BufferDevice,
277  Kokkos::MemoryUnmanaged>& num_packets_per_lid,
278  const Kokkos::View<const LocalOrdinal*, BufferDevice,
279  Kokkos::MemoryUnmanaged>& import_lids,
280  const typename CrsGraph<LocalOrdinal, GlobalOrdinal,
281  Node>::padding_type& padding,
282  const bool unpack_pids,
283  const int myRank,
284  const bool verbose)
285 {
286  using LO = LocalOrdinal;
287  using GO = GlobalOrdinal;
288  using device_type = typename Node::device_type;
289  using execution_space = typename BufferDevice::execution_space;
290  using range_policy =
291  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
292  using unpack_functor_type =
293  UnpackAndCombineFunctor<LO, GO, RowView, IndicesView, BufferDevice>;
294 
295  const char prefix[] =
296  "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
297 
298  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
299  if (num_import_lids == 0) {
300  // Nothing to unpack
301  return;
302  }
303 
304  // Resize row pointers and indices to accommodate incoming data
305  padCrsArrays(row_ptrs_beg, row_ptrs_end, indices, padding,
306  myRank, verbose);
307 
308  // Get the offsets
309  Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids+1);
310  computeOffsetsFromCounts(offsets, num_packets_per_lid);
311 
312  // Determine the maximum number of entries in any row in the graph. The
313  // maximum number of entries is needed to allocate unpack buffers on the
314  // device.
315  size_t max_num_ent;
316  Kokkos::parallel_reduce
317  ("MaxReduce",
318  range_policy (0, LO (num_packets_per_lid.size ())),
319  KOKKOS_LAMBDA (const LO i, size_t& running_max_num_ent) {
320  const size_t num_packets_this_lid = num_packets_per_lid(i);
321  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2 :
322  num_packets_this_lid;
323  if (num_ent > running_max_num_ent) {
324  running_max_num_ent = num_ent;
325  }
326  }, Kokkos::Max<size_t> (max_num_ent));
327 
328  // Now do the actual unpack!
329  unpack_functor_type f (row_ptrs_beg, row_ptrs_end, indices, imports,
330  num_packets_per_lid, import_lids, offsets,
331  max_num_ent, unpack_pids);
332 
333  typename unpack_functor_type::value_type x;
334  Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
335  auto x_h = x.to_std_pair();
336  TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
337  prefix << "UnpackAndCombineFunctor reported error code "
338  << x_h.first << " for the first bad row " << x_h.second);
339 }
340 
341 template<class Packet, class LocalGraph, class BufferDevice>
342 size_t
343 unpackAndCombineWithOwningPIDsCount(
344  const LocalGraph& local_graph,
345  const Kokkos::View<const typename LocalGraph::data_type*,
346  typename LocalGraph::device_type,
347  Kokkos::MemoryUnmanaged> permute_from_lids,
348  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
349  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
350  const size_t num_same_ids)
351 {
352  using Kokkos::parallel_reduce;
353  using local_graph_type = LocalGraph;
354  using LO = typename local_graph_type::data_type;
355  using device_type = typename local_graph_type::device_type;
356  using execution_space = typename device_type::execution_space;
357  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
358 
359  size_t count = 0;
360  LO num_items;
361 
362  // Number of graph entries to unpack (returned by this function).
363  num_items = static_cast<LO>(num_same_ids);
364  if (num_items) {
365  size_t kcnt = 0;
366  parallel_reduce(
367  range_policy(0, num_items),
368  KOKKOS_LAMBDA(const LO lid, size_t& update) {
369  update += static_cast<size_t>(local_graph.row_map[lid+1]
370  -local_graph.row_map[lid]);
371  }, kcnt);
372  count += kcnt;
373  }
374 
375  // Count entries copied directly from the source graph with permuting.
376  num_items = static_cast<LO>(permute_from_lids.extent(0));
377  if (num_items) {
378  size_t kcnt = 0;
379  parallel_reduce(
380  range_policy(0, num_items),
381  KOKKOS_LAMBDA(const LO i, size_t& update) {
382  const LO lid = permute_from_lids(i);
383  update += static_cast<size_t>(local_graph.row_map[lid+1]
384  - local_graph.row_map[lid]);
385  }, kcnt);
386  count += kcnt;
387  }
388 
389  {
390  // Count entries received from other MPI processes.
391  size_t tot_num_ent = 0;
392  parallel_reduce("SumReduce",
393  range_policy(0,num_packets_per_lid.size()),
394  KOKKOS_LAMBDA(const int& i, size_t& lsum) {
395  lsum += num_packets_per_lid(i) / 2;
396  }, Kokkos::Sum<size_t>(tot_num_ent));
397  count += tot_num_ent;
398  }
399 
400  return count;
401 }
402 
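The three contributions to the entry count above, written out on the host in plain C++ (countTargetEntries and its parameters are illustrative names): rows copied as-is, rows copied through a permutation, and rows received from other processes, where each received entry occupies two packets (GID plus PID), hence the division by 2.

#include <cstddef>
#include <vector>

std::size_t countTargetEntries(const std::vector<std::size_t>& row_map,   // source CRS offsets
                               const std::vector<int>& permute_from_lids,
                               const std::vector<std::size_t>& num_packets_per_lid,
                               const std::size_t num_same_ids)
{
  std::size_t count = 0;
  for (std::size_t lid = 0; lid < num_same_ids; ++lid) {
    count += row_map[lid + 1] - row_map[lid];        // same IDs, same place
  }
  for (const int lid : permute_from_lids) {
    count += row_map[lid + 1] - row_map[lid];        // locally permuted rows
  }
  for (const std::size_t num_packets : num_packets_per_lid) {
    count += num_packets / 2;                        // remote rows: (GID, PID) pairs
  }
  return count;
}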
404 template<class Packet, class LO, class Device, class BufferDevice>
405 void
406 setupRowPointersForRemotes(
407  const Kokkos::View<size_t*, Device>& tgt_rowptr,
408  const Kokkos::View<const LO*, BufferDevice>& import_lids,
409  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
410  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
411 {
412  using Kokkos::parallel_reduce;
413  using device_type = Device;
414  using execution_space = typename device_type::execution_space;
415  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
416  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
417 
418  const size_type N = num_packets_per_lid.extent(0);
419  parallel_for("Setup row pointers for remotes",
420  range_policy(0, N),
421  KOKKOS_LAMBDA(const size_t i){
422  using atomic_incr_type = typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
423  const size_t num_packets_this_lid = num_packets_per_lid(i);
424  const size_t num_ent = num_packets_this_lid / 2;
425  Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
426  });
427 }
428 
429 // Convert array of row lengths to a CRS pointer array
430 template<class Device>
431 void
432 makeCrsRowPtrFromLengths(
433  const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
434  const Kokkos::View<size_t*,Device>& new_start_row)
435 {
436  using Kokkos::parallel_scan;
437  using device_type = Device;
438  using execution_space = typename device_type::execution_space;
439  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
440  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
441  const size_type N = new_start_row.extent(0);
442  parallel_scan(
443  range_policy(0, N),
444  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
445  auto cur_val = tgt_rowptr(i);
446  if (final) {
447  tgt_rowptr(i) = update;
448  new_start_row(i) = tgt_rowptr(i);
449  }
450  update += cur_val;
451  }
452  );
453 }
454 
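makeCrsRowPtrFromLengths above is an exclusive prefix sum: row lengths {2, 0, 3, 1} become offsets {0, 2, 2, 5}, and the running total (6) lands in the final, extra slot, completing a CRS row-pointer array. A serial host equivalent in plain C++ (lengthsToRowPtr is an illustrative name; the real code performs the scan in parallel and also seeds new_start_row):

#include <cstddef>
#include <vector>

std::vector<std::size_t> lengthsToRowPtr(const std::vector<std::size_t>& lengths)
{
  // lengths holds one count per row; the result has one extra entry whose
  // value is the total number of entries.
  std::vector<std::size_t> rowptr(lengths.size() + 1, 0);
  std::size_t running = 0;
  for (std::size_t i = 0; i < lengths.size(); ++i) {
    rowptr[i] = running;      // exclusive: offset of row i
    running += lengths[i];
  }
  rowptr[lengths.size()] = running;
  return rowptr;
}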
455 template<class LocalGraph, class LocalMap>
456 void
457 copyDataFromSameIDs(
458  const Kokkos::View<typename LocalMap::global_ordinal_type*,
459  typename LocalMap::device_type>& tgt_colind,
460  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
461  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
462  const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
463  const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
464  const LocalGraph& local_graph,
465  const LocalMap& local_col_map,
466  const size_t num_same_ids,
467  const int my_pid)
468 {
469  using Kokkos::parallel_for;
470  using device_type = typename LocalMap::device_type;
471  using LO = typename LocalMap::local_ordinal_type;
472  using execution_space = typename device_type::execution_space;
473  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
474 
475  parallel_for(
476  range_policy(0, num_same_ids),
477  KOKKOS_LAMBDA(const size_t i) {
478  using atomic_incr_type =typename std::remove_reference<decltype(new_start_row(0))>::type;
479 
480  const LO src_lid = static_cast<LO>(i);
481  size_t src_row = local_graph.row_map(src_lid);
482 
483  const LO tgt_lid = static_cast<LO>(i);
484  const size_t tgt_row = tgt_rowptr(tgt_lid);
485 
486  const size_t nsr = local_graph.row_map(src_lid+1)
487  - local_graph.row_map(src_lid);
488  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
489 
490  for (size_t j=local_graph.row_map(src_lid);
491  j<local_graph.row_map(src_lid+1); ++j) {
492  LO src_col = local_graph.entries(j);
493  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
494  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
495  }
496  }
497  );
498 }
499 
500 template<class LocalGraph, class LocalMap, class BufferDevice>
501 void
502 copyDataFromPermuteIDs(
503  const Kokkos::View<typename LocalMap::global_ordinal_type*,
504  typename LocalMap::device_type>& tgt_colind,
505  const Kokkos::View<int*,
506  typename LocalMap::device_type>& tgt_pids,
507  const Kokkos::View<size_t*,
508  typename LocalMap::device_type>& new_start_row,
509  const Kokkos::View<size_t*,
510  typename LocalMap::device_type>& tgt_rowptr,
511  const Kokkos::View<const int*,
512  typename LocalMap::device_type>& src_pids,
513  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
514  BufferDevice, Kokkos::MemoryUnmanaged>& permute_to_lids,
515  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
516  BufferDevice, Kokkos::MemoryUnmanaged>& permute_from_lids,
517  const LocalGraph& local_graph,
518  const LocalMap& local_col_map,
519  const int my_pid)
520 {
521  using Kokkos::parallel_for;
522  using device_type = typename LocalMap::device_type;
523  using LO = typename LocalMap::local_ordinal_type;
524  using execution_space = typename device_type::execution_space;
525  using size_type = typename Kokkos::View<LO*,device_type>::size_type;
526  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
527 
528  const size_type num_permute_to_lids = permute_to_lids.extent(0);
529 
530  parallel_for(
531  range_policy(0, num_permute_to_lids),
532  KOKKOS_LAMBDA(const size_t i) {
533  using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
534 
535  const LO src_lid = permute_from_lids(i);
536  const size_t src_row = local_graph.row_map(src_lid);
537 
538  const LO tgt_lid = permute_to_lids(i);
539  const size_t tgt_row = tgt_rowptr(tgt_lid);
540 
541  size_t nsr = local_graph.row_map(src_lid+1)
542  - local_graph.row_map(src_lid);
543  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
544 
545  for (size_t j=local_graph.row_map(src_lid);
546  j<local_graph.row_map(src_lid+1); ++j) {
547  LO src_col = local_graph.entries(j);
548  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
549  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
550  }
551  }
552  );
553 }
554 
555 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
556 void
557 unpackAndCombineIntoCrsArrays2(
558  const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
559  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
560  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
561  const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
562  const Kokkos::View<
563  const typename LocalMap::local_ordinal_type*,
564  BufferDevice,
565  Kokkos::MemoryUnmanaged>& import_lids,
566  const Kokkos::View<const Packet*, BufferDevice>& imports,
567  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
568  const LocalGraph& /* local_graph */,
569  const LocalMap /*& local_col_map*/,
570  const int my_pid)
571 {
572  using Kokkos::View;
573  using Kokkos::subview;
574  using Kokkos::MemoryUnmanaged;
575  using Kokkos::parallel_reduce;
576  using Kokkos::atomic_fetch_add;
577 
578  using device_type = typename LocalMap::device_type;
579  using LO = typename LocalMap::local_ordinal_type;
580  using GO = typename LocalMap::global_ordinal_type;
581  using execution_space = typename device_type::execution_space;
582  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
583  using slice = typename Kokkos::pair<size_type, size_type>;
584  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
585 
586  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
587  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
588 
589  const size_type num_import_lids = import_lids.size();
590  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
591 
592  // RemoteIDs: Loop structure following UnpackAndCombine
593  int gbl_err_count;
594  parallel_reduce("Unpack and combine into CRS",
595  range_policy(0, num_import_lids),
596  KOKKOS_LAMBDA(const size_t i, int& err) {
597  using atomic_incr_type = typename std::remove_reference< decltype( new_start_row(0) )>::type;
598  const size_t num_packets_this_lid = num_packets_per_lid(i);
599  const size_t num_ent = num_packets_this_lid / 2;
600  const size_t offset = offsets(i);
601  const LO lcl_row = import_lids(i);
602  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
603  const size_t end_row = start_row + num_ent;
604 
605  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
606  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
607 
608  err += unpackRow (gids_out, pids_out, imports, offset, num_ent);
609 
610  // Correct target PIDs.
611  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
612  const int pid = pids_out(j);
613  pids_out(j) = (pid != my_pid) ? pid : -1;
614  }
615  }, gbl_err_count);
616 
617  TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
618  std::invalid_argument, prefix <<
619  "Attempting to unpack PIDs, but num_ent is not even; this should never "
620  "happen! Please report this bug to the Tpetra developers.");
621 
622  return;
623 }
624 
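The atomic_fetch_add on new_start_row above is the standard slot-claiming idiom: the returned old value gives each contributor a private, contiguous range [start, start + num_ent) inside a row that several contributors may target. A small self-contained Kokkos sketch of just that idiom (claimSlotsExample, cursor, and out are illustrative names, not Tpetra API):

#include <Kokkos_Core.hpp>

void claimSlotsExample() {
  using exec_space = Kokkos::DefaultExecutionSpace;
  const int num_rows = 4;
  const int slots_per_row = 16;
  Kokkos::View<std::size_t*> cursor("cursor", num_rows);        // next free slot per row
  Kokkos::View<int*> out("out", num_rows * slots_per_row);      // shared output storage

  Kokkos::parallel_for("claim", Kokkos::RangePolicy<exec_space>(0, 16),
    KOKKOS_LAMBDA(const int i) {
      const int row = i % num_rows;
      const std::size_t num_ent = 1;                            // entries this contributor adds
      const std::size_t start =
        Kokkos::atomic_fetch_add(&cursor(row), num_ent);        // old value = my start slot
      out(row * slots_per_row + start) = i;                     // write without racing
    });
  Kokkos::fence();
}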
625 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
626 void
627 unpackAndCombineIntoCrsArrays(
628  const LocalGraph & local_graph,
629  const LocalMap & local_col_map,
630  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
631  BufferDevice,
632  Kokkos::MemoryUnmanaged>& import_lids,
633  const Kokkos::View<const Packet*, BufferDevice>& imports,
634  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
635  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
636  BufferDevice,
637  Kokkos::MemoryUnmanaged>& permute_to_lids,
638  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
639  BufferDevice,
640  Kokkos::MemoryUnmanaged>& permute_from_lids,
641  const Kokkos::View<size_t*,
642  typename LocalMap::device_type,
643  Kokkos::MemoryUnmanaged>& tgt_rowptr,
644  const Kokkos::View<typename LocalMap::global_ordinal_type*,
645  typename LocalMap::device_type,
646  Kokkos::MemoryUnmanaged>& tgt_colind,
647  const Kokkos::View<const int*,
648  typename LocalMap::device_type,
649  Kokkos::MemoryUnmanaged>& src_pids,
650  const Kokkos::View<int*,
651  typename LocalMap::device_type,
652  Kokkos::MemoryUnmanaged>& tgt_pids,
653  const size_t num_same_ids,
654  const size_t tgt_num_rows,
655  const size_t tgt_num_nonzeros,
656  const int my_tgt_pid)
657 {
658  using Kokkos::View;
659  using Kokkos::subview;
660  using Kokkos::parallel_for;
661  using Kokkos::MemoryUnmanaged;
662  using packet_type = Packet;
663  using local_map_type = LocalMap;
664  using local_graph_type = LocalGraph;
665  using buffer_device_type = BufferDevice;
666  using device_type = typename LocalMap::device_type;
667  using LO = typename LocalMap::local_ordinal_type;
668  using execution_space = typename device_type::execution_space;
669  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
670  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
671 
672  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
673 
674  const size_t N = tgt_num_rows;
675  const size_t mynnz = tgt_num_nonzeros;
676 
677  // In the case of reduced communicators, the sourceGraph won't have
678  // the right "my_pid", so thus we have to supply it.
679  const int my_pid = my_tgt_pid;
680 
681  // FIXME (mfh 24 Jun 2019)
682  //
683  // 1. Only zero the entries of tgt_rowptr that actually need it.
684  // 2. Consider merging these three kernels into one.
685 
686  // Zero the rowptr
687  parallel_for(
688  range_policy(0, N+1),
689  KOKKOS_LAMBDA(const size_t i) {
690  tgt_rowptr(i) = 0;
691  }
692  );
693 
694  // same IDs: Always first, always in the same place
695  parallel_for(
696  range_policy(0, num_same_ids),
697  KOKKOS_LAMBDA(const size_t i) {
698  const LO tgt_lid = static_cast<LO>(i);
699  const LO src_lid = static_cast<LO>(i);
700  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
701  - local_graph.row_map(src_lid);
702  }
703  );
704 
705  // Permute IDs: Still local, but reordered
706  const size_type num_permute_to_lids = permute_to_lids.extent(0);
707  parallel_for(
708  range_policy(0, num_permute_to_lids),
709  KOKKOS_LAMBDA(const size_t i) {
710  const LO tgt_lid = permute_to_lids(i);
711  const LO src_lid = permute_from_lids(i);
712  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
713  - local_graph.row_map(src_lid);
714  }
715  );
716 
717  // Get the offsets from the number of packets per LID
718  const size_type num_import_lids = import_lids.extent(0);
719  View<size_t*, device_type> offsets("offsets", num_import_lids+1);
720  computeOffsetsFromCounts(offsets, num_packets_per_lid);
721 
722 #ifdef HAVE_TPETRA_DEBUG
723  {
724  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
725  const bool condition =
726  nth_offset_h != static_cast<size_t>(imports.extent(0));
727  TEUCHOS_TEST_FOR_EXCEPTION
728  (condition, std::logic_error, prefix
729  << "The final offset in bytes " << nth_offset_h
730  << " != imports.size() = " << imports.extent(0)
731  << ". Please report this bug to the Tpetra developers.");
732  }
733 #endif // HAVE_TPETRA_DEBUG
734 
735  // Setup row pointers for remotes
736  setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
737  tgt_rowptr, import_lids, imports, num_packets_per_lid);
738 
739  // If multiple processes contribute to the same row, we may need to
740  // update row offsets. This tracks that.
741  View<size_t*, device_type> new_start_row("new_start_row", N+1);
742 
743  // Turn row length into a real CRS row pointer
744  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
745  {
746  auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
747  bool condition = nth_tgt_rowptr_h != mynnz;
748  TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
749  prefix << "CRS_rowptr[last] = " <<
750  nth_tgt_rowptr_h << "!= mynnz = " << mynnz << ".");
751  }
752 
753  // SameIDs: Copy the data over
754  copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
755  tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
756 
757  copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
758  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
759  local_graph, local_col_map, my_pid);
760 
761  if (imports.extent(0) <= 0) {
762  return;
763  }
764 
765  unpackAndCombineIntoCrsArrays2<
766  packet_type,local_graph_type,local_map_type,buffer_device_type>(
767  tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
768  num_packets_per_lid, local_graph, local_col_map, my_pid);
769 
770  return;
771 }
772 
773 } // namespace UnpackAndCombineCrsGraphImpl
774 
822 template<class LocalOrdinal, class GlobalOrdinal, class Node>
823 size_t
824 unpackAndCombineWithOwningPIDsCount(
825  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>& sourceGraph,
826  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
827  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type> &imports,
828  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
829  size_t /* constantNumPackets */,
830  CombineMode /* combineMode */,
831  size_t numSameIDs,
832  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
833  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
834 {
835  using Kokkos::MemoryUnmanaged;
836  using Kokkos::View;
837  using device_type = typename Node::device_type;
838  using packet_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type;
839  using local_graph_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_device_type;
840  using buffer_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type;
841  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
842 
843  TEUCHOS_TEST_FOR_EXCEPTION
844  (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
845  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
846  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
847  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
848  // process, then the graph is neither locally nor globally indexed.
849  const bool locallyIndexed = sourceGraph.isLocallyIndexed();
850  TEUCHOS_TEST_FOR_EXCEPTION
851  (! locallyIndexed, std::invalid_argument, prefix << "The input "
852  "CrsGraph 'sourceGraph' must be locally indexed.");
853  TEUCHOS_TEST_FOR_EXCEPTION
854  (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
855  prefix << "importLIDs.size() = " << importLIDs.size() << " != "
856  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
857 
858  auto local_graph = sourceGraph.getLocalGraphDevice();
859  auto permute_from_lids_d =
860  create_mirror_view_from_raw_host_array(buffer_device_type(),
861  permuteFromLIDs.getRawPtr(),
862  permuteFromLIDs.size(), true,
863  "permute_from_lids");
864  auto imports_d =
865  create_mirror_view_from_raw_host_array(buffer_device_type(),
866  imports.getRawPtr(),
867  imports.size(), true,
868  "imports");
869  auto num_packets_per_lid_d =
870  create_mirror_view_from_raw_host_array(buffer_device_type(),
871  numPacketsPerLID.getRawPtr(),
872  numPacketsPerLID.size(), true,
873  "num_packets_per_lid");
874 
875  return UnpackAndCombineCrsGraphImpl::unpackAndCombineWithOwningPIDsCount<
876  packet_type,local_graph_device_type,buffer_device_type>(
877  local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
878 }
879 
893 template<class LocalOrdinal, class GlobalOrdinal, class Node>
894 void
895 unpackAndCombineIntoCrsArrays(
896  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>& sourceGraph,
897  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
898  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type>& imports,
899  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
900  const size_t /* constantNumPackets */,
901  const CombineMode /* combineMode */,
902  const size_t numSameIDs,
903  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
904  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
905  size_t TargetNumRows,
906  size_t TargetNumNonzeros,
907  const int MyTargetPID,
908  const Teuchos::ArrayView<size_t>& CRS_rowptr,
909  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
910  const Teuchos::ArrayView<const int>& SourcePids,
911  Teuchos::Array<int>& TargetPids)
912 {
913  using Kokkos::View;
914  using Kokkos::deep_copy;
915  using Teuchos::outArg;
916  using Teuchos::REDUCE_MAX;
917  using Teuchos::reduceAll;
918  using LO = LocalOrdinal;
919  using GO = GlobalOrdinal;
920  using crs_graph_type = CrsGraph<LO, GO, Node>;
921  using packet_type = typename crs_graph_type::packet_type;
922  using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
923  using buffer_device_type = typename crs_graph_type::buffer_device_type;
924  using device_type = typename Node::device_type;
925  using size_type = typename Teuchos::ArrayView<const LO>::size_type;
926 
927  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
928 
929  TEUCHOS_TEST_FOR_EXCEPTION(
930  TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
931  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
932  CRS_rowptr.size() << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
933 
934  TEUCHOS_TEST_FOR_EXCEPTION(
935  permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
936  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
937  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
938  const size_type numImportLIDs = importLIDs.size();
939 
940  TEUCHOS_TEST_FOR_EXCEPTION(
941  numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
942  prefix << "importLIDs.size() = " << numImportLIDs << " != "
943  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
944 
945  // Preseed TargetPids with -1 for local
946  if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
947  TargetPids.resize(TargetNumNonzeros);
948  }
949  TargetPids.assign(TargetNumNonzeros, -1);
950 
951  // Grab pointers for sourceGraph
952  auto local_graph = sourceGraph.getLocalGraphDevice();
953  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
954 
955  // Convert input arrays to Kokkos::View
956  device_type outputDevice;
957  buffer_device_type bufferOutputDevice;
958 
959  Kokkos::View<const LO*, buffer_device_type> import_lids_d =
960  create_mirror_view_from_raw_host_array
961  (bufferOutputDevice, importLIDs.getRawPtr(),
962  importLIDs.size(), true, "import_lids");
963 
964  Kokkos::View<const packet_type*, buffer_device_type> imports_d =
965  create_mirror_view_from_raw_host_array
966  (bufferOutputDevice, imports.getRawPtr(),
967  imports.size(), true, "imports");
968 
969  Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_d =
970  create_mirror_view_from_raw_host_array(bufferOutputDevice,
971  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
972  true, "num_packets_per_lid");
973 
974  Kokkos::View<const LO*, buffer_device_type> permute_to_lids_d =
975  create_mirror_view_from_raw_host_array(bufferOutputDevice,
976  permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
977  true, "permute_to_lids");
978 
979  Kokkos::View<const LO*, buffer_device_type> permute_from_lids_d =
980  create_mirror_view_from_raw_host_array(bufferOutputDevice,
981  permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
982  true, "permute_from_lids");
983 
984  Kokkos::View<size_t*, device_type> crs_rowptr_d =
985  create_mirror_view_from_raw_host_array(outputDevice,
986  CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
987  true, "crs_rowptr");
988 
989  Kokkos::View<GO*, device_type> crs_colind_d =
990  create_mirror_view_from_raw_host_array(outputDevice,
991  CRS_colind.getRawPtr(), CRS_colind.size(),
992  true, "crs_colidx");
993 
994  Kokkos::View<const int*, device_type> src_pids_d =
995  create_mirror_view_from_raw_host_array(outputDevice,
996  SourcePids.getRawPtr(), SourcePids.size(),
997  true, "src_pids");
998 
999  Kokkos::View<int*, device_type> tgt_pids_d =
1000  create_mirror_view_from_raw_host_array(outputDevice,
1001  TargetPids.getRawPtr(), TargetPids.size(),
1002  true, "tgt_pids");
1003 
1004  using local_map_type = decltype(local_col_map);
1005  UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays<
1006  packet_type,local_graph_device_type,local_map_type,buffer_device_type>(
1007  local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1008  permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1009  tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
1010 
1011  // FIXME (mfh 25 Jun 2019) HostMirror of CudaUVMSpace is CudaUVMSpace!!!
1012 
1013  // Copy outputs back to host
1014  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1015  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1016  deep_copy(crs_rowptr_h, crs_rowptr_d);
1017 
1018  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1019  CRS_colind.getRawPtr(), CRS_colind.size());
1020  deep_copy(crs_colind_h, crs_colind_d);
1021 
1022  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1023  TargetPids.getRawPtr(), TargetPids.size());
1024  deep_copy(tgt_pids_h, tgt_pids_d);
1025 
1026 }
1027 
1028 } // namespace Details
1029 } // namespace Tpetra
1030 
1031 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
1032  template void \
1033  Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
1034  const CrsGraph<LO, GO, NT> &, \
1035  const Teuchos::ArrayView<const LO>&, \
1036  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1037  const Teuchos::ArrayView<const size_t>&, \
1038  const size_t, \
1039  const CombineMode, \
1040  const size_t, \
1041  const Teuchos::ArrayView<const LO>&, \
1042  const Teuchos::ArrayView<const LO>&, \
1043  size_t, \
1044  size_t, \
1045  const int, \
1046  const Teuchos::ArrayView<size_t>&, \
1047  const Teuchos::ArrayView<GO>&, \
1048  const Teuchos::ArrayView<const int>&, \
1049  Teuchos::Array<int>&); \
1050  template size_t \
1051  Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
1052  const CrsGraph<LO, GO, NT> &, \
1053  const Teuchos::ArrayView<const LO> &, \
1054  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
1055  const Teuchos::ArrayView<const size_t>&, \
1056  size_t, \
1057  CombineMode, \
1058  size_t, \
1059  const Teuchos::ArrayView<const LO>&, \
1060  const Teuchos::ArrayView<const LO>&);
1061 
1062 #endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP