Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
41 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
42 
43 #include "TpetraCore_config.h"
44 #include "Teuchos_Array.hpp"
45 #include "Teuchos_ArrayView.hpp"
51 #include "Tpetra_CrsGraph_decl.hpp"
54 #include "Kokkos_Core.hpp"
55 #include <memory>
56 #include <string>
57 
76 
77 namespace Tpetra {
78 
79 //
80 // Users must never rely on anything in the Details namespace.
81 //
82 namespace Details {
83 
84 namespace UnpackAndCombineCrsGraphImpl {
85 
95 template<class Packet, class GO, class Device, class BufferDevice>
96 KOKKOS_FUNCTION int
97 unpackRow (const Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
98  const Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
99  const Kokkos::View<const Packet*,BufferDevice>& imports,
100  const size_t offset,
101  const size_t num_ent)
102 {
103  using size_type = typename Kokkos::View<GO*,Device>::size_type;
104 
105  if (num_ent == 0) {
106  // Empty rows always take zero bytes, to ensure sparsity.
107  return 0;
108  }
109 
110  // Unpack GIDs
111  for (size_type k=0; k<num_ent; k++)
112  gids_out(k) = imports(offset+k);
113 
114  // Unpack PIDs
115  if (pids_out.size() > 0) {
116  for (size_type k=0; k<num_ent; k++) {
117  pids_out(k) = static_cast<int>(imports(offset+num_ent+k));
118  }
119  }
120 
121  return 0;
122 }
123 
/// Reduction functor state: the views and scratch storage needed to unpack
/// received graph rows into a target graph's row-pointer / index arrays.
///
/// The reduction result (\c value_type) is a pair of (error code, index of
/// the offending import row).
///
/// NOTE(review): this listing is missing the line that opens the class body
/// (listing line 139) and the constructor's name line (listing line 176).
/// Presumably both name UnpackAndCombineFunctor, the identifier used in the
/// error message emitted by unpackAndCombine -- confirm against the real
/// header before editing.
134 template<class LocalOrdinal,
135  class Packet,
136  class RowView,
137  class IndicesView,
138  class BufferDevice>
140 
141  using LO = LocalOrdinal;
142  using GO = typename IndicesView::value_type;
143  using packet_type = Packet;
144  using row_ptrs_type = RowView;
145  using indices_type = IndicesView;
146  using buffer_device_type = BufferDevice;
147 
148  using device_type = typename IndicesView::device_type;
149  using execution_space = typename device_type::execution_space;
150 
151  using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
152  using offsets_type = Kokkos::View<const size_t*, device_type>;
153  using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
154  using import_lids_type = Kokkos::View<const LO*, buffer_device_type>;
155 
156  using gids_scratch_type = Kokkos::View<GO*, device_type>;
157  using pids_scratch_type = Kokkos::View<int*,device_type>;
158 
 // Stored, but not read by the init/join/operator() methods of this functor.
159  row_ptrs_type row_ptrs_beg;
 // Per-row insertion position; operator() writes an index at
 // indices(row_ptrs_end(lid)) and then increments row_ptrs_end(lid).
160  row_ptrs_type row_ptrs_end;
 // Destination array for the unpacked global indices.
161  indices_type indices;
 // Packed input buffer.
162  input_buffer_type imports;
 // Packet count for each import row.
163  num_packets_per_lid_type num_packets_per_lid;
 // Target local row index for each import row.
164  import_lids_type import_lids;
 // Position in `imports` of each import row's first packet.
165  offsets_type offsets;
 // Length of the scratch segment reserved per token.
166  size_t max_num_ent;
 // If true, half of each row's packets are process ranks.
167  bool unpack_pids;
 // Source of the per-call token used to pick a private scratch segment.
168  Kokkos::Experimental::UniqueToken<execution_space,
169  Kokkos::Experimental::UniqueTokenScope::Global> tokens;
 // Scratch storage, tokens.size() * max_num_ent entries each.
170  gids_scratch_type gids_scratch;
171  pids_scratch_type pids_scratch;
172 
173  public:
 // Reduction result: (error code, index of the import row that failed).
174  using value_type = Kokkos::pair<int, LO>;
175 
 // Constructor parameter list: copies every view/flag into the matching
 // member and allocates both scratch arrays.
 // NOTE(review): the constructor's name line (listing line 176) is missing
 // from this listing.
177  const row_ptrs_type& row_ptrs_beg_in,
178  const row_ptrs_type& row_ptrs_end_in,
179  const indices_type& indices_in,
180  const input_buffer_type& imports_in,
181  const num_packets_per_lid_type& num_packets_per_lid_in,
182  const import_lids_type& import_lids_in,
183  const offsets_type& offsets_in,
184  const size_t max_num_ent_in,
185  const bool unpack_pids_in) :
186  row_ptrs_beg(row_ptrs_beg_in),
187  row_ptrs_end(row_ptrs_end_in),
188  indices(indices_in),
189  imports(imports_in),
190  num_packets_per_lid(num_packets_per_lid_in),
191  import_lids(import_lids_in),
192  offsets(offsets_in),
193  max_num_ent(max_num_ent_in),
194  unpack_pids(unpack_pids_in),
195  tokens(execution_space()),
 // One segment of max_num_ent entries per token.
196  gids_scratch("gids_scratch", tokens.size() * max_num_ent),
197  pids_scratch("pids_scratch", tokens.size() * max_num_ent)
198  {}
199 
200  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
201  {
202  using Tpetra::Details::OrdinalTraits;
203  dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
204  }
205 
206  KOKKOS_INLINE_FUNCTION void
207  join(value_type& dst, const value_type& src) const
208  {
209  // `dst` should reflect the first (least) bad index and
210  // all other associated error codes and data. Thus, we need only
211  // check if the `src` object shows an error and if its associated
212  // bad index is less than `dst`'s bad index.
213  using Tpetra::Details::OrdinalTraits;
214  if (src.second != OrdinalTraits<LO>::invalid()) {
215  // An error in the src; check if
216  // 1. `dst` shows errors
217  // 2. If `dst` does show errors, if src's bad index is less than
218  // *this' bad index
219  if (dst.second == OrdinalTraits<LO>::invalid() ||
220  src.second < dst.second) {
221  dst = src;
222  }
223  }
224  }
225 
226  KOKKOS_INLINE_FUNCTION
227  void operator()(const LO i, value_type& dst) const
228  {
229  using Kokkos::View;
230  using Kokkos::subview;
231  using Kokkos::MemoryUnmanaged;
232  using size_type = typename execution_space::size_type;
233  using slice = typename Kokkos::pair<size_type, size_type>;
234 
235  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
236  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
237 
238  const size_t num_packets_this_lid = num_packets_per_lid(i);
239  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
240  : num_packets_this_lid;
241  if (unpack_pids && num_packets_this_lid%2 != 0) {
242  // Attempting to unpack PIDs, but num_packets_this_lid is not even; this
243  // should never
244  dst = Kokkos::make_pair(1, i);
245  return;
246  }
247 
248  // Only unpack data if there is a nonzero number to unpack
249  if (num_ent == 0) {
250  return;
251  }
252 
253  // there is actually something in the row
254  const size_t buf_size = imports.size();
255  const size_t offset = offsets(i);
256 
257  if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
258  dst = Kokkos::make_pair(2, i); // out of bounds
259  return;
260  }
261 
262  // Get subviews in to the scratch arrays. The token returned from acquire
263  // is an integer in [0, tokens.size()). It is used to grab a unique (to
264  // this thread) subview of the scratch arrays.
265  const size_type token = tokens.acquire();
266  const size_t a = static_cast<size_t>(token) * max_num_ent;
267  const size_t b = a + num_ent;
268  gids_out_type gids_out = subview(gids_scratch, slice(a, b));
269  pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
270 
271  const int err = unpackRow (gids_out, pids_out, imports, offset, num_ent);
272 
273  if (err != 0) {
274  dst = Kokkos::make_pair(3, i);
275  tokens.release(token);
276  return;
277  }
278 
279  auto import_lid = import_lids(i);
280  for (size_t k = 0; k < num_ent; ++k) {
281  indices(row_ptrs_end(import_lid)) = gids_out(k);
282  // this is OK; don't need atomic, since LIDs to pack don't have repeats.
283  row_ptrs_end(import_lid) += 1;
284  }
285 
286  tokens.release(token);
287  }
288 
289 };
290 
/// \brief Unpack received graph rows into the target row-pointer and index
///   arrays.
///
/// Steps, in order: return immediately if there are no import LIDs; pad the
/// target arrays with padCrsArrays; turn the per-LID packet counts into
/// buffer offsets with computeOffsetsFromCounts; compute the largest row
/// length among the incoming rows; run the unpack functor as a
/// parallel_reduce.
///
/// \param row_ptrs_beg, row_ptrs_end [in/out] Row pointer arrays handed to
///   padCrsArrays and to the functor.
/// \param indices [in/out] Index array handed to padCrsArrays and the functor.
/// \param imports [in] Packed input buffer.
/// \param num_packets_per_lid [in] Packet count per import LID.
/// \param import_lids [in] Target local row index per import LID.
/// \param padding [in] Forwarded to padCrsArrays.
/// \param unpack_pids [in] If true, each row's entry count is half its
///   packet count.
/// \param myRank, verbose [in] Forwarded to padCrsArrays.
///
/// \throws std::runtime_error if the functor reports a nonzero error code.
297 template<class LocalOrdinal, class GlobalOrdinal, class Node,
298  class RowView, class IndicesView, class BufferDevice>
299 void
300 unpackAndCombine
301 (const RowView& row_ptrs_beg,
302  const RowView& row_ptrs_end,
303  IndicesView& indices,
304  const Kokkos::View<const GlobalOrdinal*, BufferDevice,
305  Kokkos::MemoryUnmanaged>& imports,
306  const Kokkos::View<const size_t*, BufferDevice,
307  Kokkos::MemoryUnmanaged>& num_packets_per_lid,
308  const Kokkos::View<const LocalOrdinal*, BufferDevice,
309  Kokkos::MemoryUnmanaged>& import_lids,
310  const typename CrsGraph<LocalOrdinal, GlobalOrdinal,
311  Node>::padding_type& padding,
312  const bool unpack_pids,
313  const int myRank,
314  const bool verbose)
315 {
316  using LO = LocalOrdinal;
317  using GO = GlobalOrdinal;
318  using device_type = typename Node::device_type;
319  using execution_space = typename BufferDevice::execution_space;
320  using range_policy =
321  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
 // NOTE(review): the right-hand side of this alias (listing line 323) is
 // missing from this listing; presumably it names the unpack functor
 // specialization -- confirm against the real header.
322  using unpack_functor_type =
324 
325  const char prefix[] =
326  "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
327 
328  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
329  if (num_import_lids == 0) {
330  // Nothing to unpack
331  return;
332  }
333 
334  // Resize row pointers and indices to accommodate incoming data
335  padCrsArrays(row_ptrs_beg, row_ptrs_end, indices, padding,
336  myRank, verbose);
337 
338  // Get the offsets
 // One more slot than there are import LIDs.
339  Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids+1);
340  computeOffsetsFromCounts(offsets, num_packets_per_lid);
341 
342  // Determine the maximum number of entries in any row in the graph. The
343  // maximum number of entries is needed to allocate unpack buffers on the
344  // device.
 // Left uninitialized here; it is passed to the Kokkos::Max reducer below.
345  size_t max_num_ent;
346  Kokkos::parallel_reduce
347  ("MaxReduce",
348  range_policy (0, LO (num_packets_per_lid.size ())),
349  KOKKOS_LAMBDA (const LO i, size_t& running_max_num_ent) {
350  const size_t num_packets_this_lid = num_packets_per_lid(i);
 // Same packets-to-entries conversion the unpack functor applies.
351  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2 :
352  num_packets_this_lid;
353  if (num_ent > running_max_num_ent) {
354  running_max_num_ent = num_ent;
355  }
356  }, Kokkos::Max<size_t> (max_num_ent));
357 
358  // Now do the actual unpack!
359  unpack_functor_type f (row_ptrs_beg, row_ptrs_end, indices, imports,
360  num_packets_per_lid, import_lids, offsets,
361  max_num_ent, unpack_pids);
362 
 // x receives (error code, failing row index) from the reduction.
363  typename unpack_functor_type::value_type x;
364  Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
365  auto x_h = x.to_std_pair();
366  TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
367  prefix << "UnpackAndCombineFunctor reported error code "
368  << x_h.first << " for the first bad row " << x_h.second);
369 }
370 
/// \brief Count the graph entries contributed by "same", "permuted" and
///   received rows.
///
/// The returned total is the sum of:
///   - the lengths of source rows [0, num_same_ids);
///   - the lengths of the source rows listed in \c permute_from_lids;
///   - half of each value in \c num_packets_per_lid.
///
/// NOTE(review): the function-name line (listing line 373) is missing from
/// this listing; presumably this is the Impl-level
/// unpackAndCombineWithOwningPIDsCount -- confirm against the real header.
/// NOTE(review): the halving presumably reflects one index packet plus one
/// rank packet per entry, as in unpackRow -- confirm.
371 template<class Packet, class LocalGraph, class BufferDevice>
372 size_t
374  const LocalGraph& local_graph,
375  const Kokkos::View<const typename LocalGraph::data_type*,
376  typename LocalGraph::device_type,
377  Kokkos::MemoryUnmanaged> permute_from_lids,
378  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
379  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
380  const size_t num_same_ids)
381 {
382  using Kokkos::parallel_reduce;
383  using local_graph_type = LocalGraph;
384  using LO = typename local_graph_type::data_type;
385  using device_type = typename local_graph_type::device_type;
386  using execution_space = typename device_type::execution_space;
387  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
388 
 // Running total returned to the caller.
389  size_t count = 0;
 // Reused as the iteration count of the first two reductions.
390  LO num_items;
391 
392  // Number of graph entries to unpack (returned by this function).
393  num_items = static_cast<LO>(num_same_ids);
394  if (num_items) {
395  size_t kcnt = 0;
396  parallel_reduce(
397  range_policy(0, num_items),
398  KOKKOS_LAMBDA(const LO lid, size_t& update) {
 // Length of source row `lid`, from consecutive row_map entries.
399  update += static_cast<size_t>(local_graph.row_map[lid+1]
400  -local_graph.row_map[lid]);
401  }, kcnt);
402  count += kcnt;
403  }
404 
405  // Count entries copied directly from the source graph with permuting.
406  num_items = static_cast<LO>(permute_from_lids.extent(0));
407  if (num_items) {
408  size_t kcnt = 0;
409  parallel_reduce(
410  range_policy(0, num_items),
411  KOKKOS_LAMBDA(const LO i, size_t& update) {
412  const LO lid = permute_from_lids(i);
413  update += static_cast<size_t>(local_graph.row_map[lid+1]
414  - local_graph.row_map[lid]);
415  }, kcnt);
416  count += kcnt;
417  }
418 
419  {
420  // Count entries received from other MPI processes.
421  size_t tot_num_ent = 0;
422  parallel_reduce("SumReduce",
423  range_policy(0,num_packets_per_lid.size()),
424  KOKKOS_LAMBDA(const int& i, size_t& lsum) {
 // Each received row contributes half of its packet count.
425  lsum += num_packets_per_lid(i) / 2;
426  }, Kokkos::Sum<size_t>(tot_num_ent));
427  count += tot_num_ent;
428  }
429 
430  return count;
431 }
432 
434 template<class Packet, class LO, class Device, class BufferDevice>
435 void
436 setupRowPointersForRemotes(
437  const Kokkos::View<size_t*, Device>& tgt_rowptr,
438  const Kokkos::View<const LO*, BufferDevice>& import_lids,
439  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
440  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
441 {
442  using Kokkos::parallel_reduce;
443  using device_type = Device;
444  using execution_space = typename device_type::execution_space;
445  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
446  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
447 
448  const size_type N = num_packets_per_lid.extent(0);
449  parallel_for("Setup row pointers for remotes",
450  range_policy(0, N),
451  KOKKOS_LAMBDA(const size_t i){
452  using atomic_incr_type = typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
453  const size_t num_packets_this_lid = num_packets_per_lid(i);
454  const size_t num_ent = num_packets_this_lid / 2;
455  Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
456  });
457 }
458 
459 // Convert array of row lengths to a CRS pointer array
460 template<class Device>
461 void
462 makeCrsRowPtrFromLengths(
463  const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
464  const Kokkos::View<size_t*,Device>& new_start_row)
465 {
466  using Kokkos::parallel_scan;
467  using device_type = Device;
468  using execution_space = typename device_type::execution_space;
469  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
470  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
471  const size_type N = new_start_row.extent(0);
472  parallel_scan(
473  range_policy(0, N),
474  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
475  auto cur_val = tgt_rowptr(i);
476  if (final) {
477  tgt_rowptr(i) = update;
478  new_start_row(i) = tgt_rowptr(i);
479  }
480  update += cur_val;
481  }
482  );
483 }
484 
485 template<class LocalGraph, class LocalMap>
486 void
487 copyDataFromSameIDs(
488  const Kokkos::View<typename LocalMap::global_ordinal_type*,
489  typename LocalMap::device_type>& tgt_colind,
490  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
491  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
492  const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
493  const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
494  const LocalGraph& local_graph,
495  const LocalMap& local_col_map,
496  const size_t num_same_ids,
497  const int my_pid)
498 {
499  using Kokkos::parallel_for;
500  using device_type = typename LocalMap::device_type;
501  using LO = typename LocalMap::local_ordinal_type;
502  using execution_space = typename device_type::execution_space;
503  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
504 
505  parallel_for(
506  range_policy(0, num_same_ids),
507  KOKKOS_LAMBDA(const size_t i) {
508  using atomic_incr_type =typename std::remove_reference<decltype(new_start_row(0))>::type;
509 
510  const LO src_lid = static_cast<LO>(i);
511  size_t src_row = local_graph.row_map(src_lid);
512 
513  const LO tgt_lid = static_cast<LO>(i);
514  const size_t tgt_row = tgt_rowptr(tgt_lid);
515 
516  const size_t nsr = local_graph.row_map(src_lid+1)
517  - local_graph.row_map(src_lid);
518  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
519 
520  for (size_t j=local_graph.row_map(src_lid);
521  j<local_graph.row_map(src_lid+1); ++j) {
522  LO src_col = local_graph.entries(j);
523  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
524  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
525  }
526  }
527  );
528 }
529 
530 template<class LocalGraph, class LocalMap, class BufferDevice>
531 void
532 copyDataFromPermuteIDs(
533  const Kokkos::View<typename LocalMap::global_ordinal_type*,
534  typename LocalMap::device_type>& tgt_colind,
535  const Kokkos::View<int*,
536  typename LocalMap::device_type>& tgt_pids,
537  const Kokkos::View<size_t*,
538  typename LocalMap::device_type>& new_start_row,
539  const Kokkos::View<size_t*,
540  typename LocalMap::device_type>& tgt_rowptr,
541  const Kokkos::View<const int*,
542  typename LocalMap::device_type>& src_pids,
543  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
544  BufferDevice, Kokkos::MemoryUnmanaged>& permute_to_lids,
545  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
546  BufferDevice, Kokkos::MemoryUnmanaged>& permute_from_lids,
547  const LocalGraph& local_graph,
548  const LocalMap& local_col_map,
549  const int my_pid)
550 {
551  using Kokkos::parallel_for;
552  using device_type = typename LocalMap::device_type;
553  using LO = typename LocalMap::local_ordinal_type;
554  using execution_space = typename device_type::execution_space;
555  using size_type = typename Kokkos::View<LO*,device_type>::size_type;
556  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
557 
558  const size_type num_permute_to_lids = permute_to_lids.extent(0);
559 
560  parallel_for(
561  range_policy(0, num_permute_to_lids),
562  KOKKOS_LAMBDA(const size_t i) {
563  using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
564 
565  const LO src_lid = permute_from_lids(i);
566  const size_t src_row = local_graph.row_map(src_lid);
567 
568  const LO tgt_lid = permute_to_lids(i);
569  const size_t tgt_row = tgt_rowptr(tgt_lid);
570 
571  size_t nsr = local_graph.row_map(src_lid+1)
572  - local_graph.row_map(src_lid);
573  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
574 
575  for (size_t j=local_graph.row_map(src_lid);
576  j<local_graph.row_map(src_lid+1); ++j) {
577  LO src_col = local_graph.entries(j);
578  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
579  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
580  }
581  }
582  );
583 }
584 
585 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
586 void
587 unpackAndCombineIntoCrsArrays2(
588  const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
589  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
590  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
591  const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
592  const Kokkos::View<
593  const typename LocalMap::local_ordinal_type*,
594  BufferDevice,
595  Kokkos::MemoryUnmanaged>& import_lids,
596  const Kokkos::View<const Packet*, BufferDevice>& imports,
597  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
598  const LocalGraph& /* local_graph */,
599  const LocalMap /*& local_col_map*/,
600  const int my_pid)
601 {
602  using Kokkos::View;
603  using Kokkos::subview;
604  using Kokkos::MemoryUnmanaged;
605  using Kokkos::parallel_reduce;
606  using Kokkos::atomic_fetch_add;
607 
608  using device_type = typename LocalMap::device_type;
609  using LO = typename LocalMap::local_ordinal_type;
610  using GO = typename LocalMap::global_ordinal_type;
611  using execution_space = typename device_type::execution_space;
612  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
613  using slice = typename Kokkos::pair<size_type, size_type>;
614  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
615 
616  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
617  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
618 
619  const size_type num_import_lids = import_lids.size();
620  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
621 
622  // RemoteIDs: Loop structure following UnpackAndCombine
623  int gbl_err_count;
624  parallel_reduce("Unpack and combine into CRS",
625  range_policy(0, num_import_lids),
626  KOKKOS_LAMBDA(const size_t i, int& err) {
627  using atomic_incr_type = typename std::remove_reference< decltype( new_start_row(0) )>::type;
628  const size_t num_packets_this_lid = num_packets_per_lid(i);
629  const size_t num_ent = num_packets_this_lid / 2;
630  const size_t offset = offsets(i);
631  const LO lcl_row = import_lids(i);
632  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
633  const size_t end_row = start_row + num_ent;
634 
635  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
636  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
637 
638  err += unpackRow (gids_out, pids_out, imports, offset, num_ent);
639 
640  // Correct target PIDs.
641  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
642  const int pid = pids_out(j);
643  pids_out(j) = (pid != my_pid) ? pid : -1;
644  }
645  }, gbl_err_count);
646 
647  TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
648  std::invalid_argument, prefix <<
649  "Attempting to unpack PIDs, but num_ent is not even; this should never "
650  "happen! Please report this bug to the Tpetra developers.");
651 
652  return;
653 }
654 
/// \brief Build the target CRS arrays (row pointers, global column indices,
///   owning ranks) from same, permuted and received rows.
///
/// Steps, in order:
///   1. zero \c tgt_rowptr and fill it with row lengths (same rows, permuted
///      rows, then received rows via setupRowPointersForRemotes);
///   2. convert the lengths to row offsets with makeCrsRowPtrFromLengths and
///      check that the last offset equals \c tgt_num_nonzeros;
///   3. copy same and permuted rows from \c local_graph;
///   4. if \c imports is non-empty, unpack the received rows with
///      unpackAndCombineIntoCrsArrays2.
///
/// \throws std::invalid_argument if the final row offset differs from
///   \c tgt_num_nonzeros.
/// \throws std::logic_error (HAVE_TPETRA_DEBUG builds only) if the last
///   computed offset differs from \c imports.extent(0).
///
/// NOTE(review): the function-name line (listing line 657) is missing from
/// this listing; the name is presumably the one in \c prefix below --
/// confirm against the real header.
655 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
656 void
658  const LocalGraph & local_graph,
659  const LocalMap & local_col_map,
660  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
661  BufferDevice,
662  Kokkos::MemoryUnmanaged>& import_lids,
663  const Kokkos::View<const Packet*, BufferDevice>& imports,
664  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
665  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
666  BufferDevice,
667  Kokkos::MemoryUnmanaged>& permute_to_lids,
668  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
669  BufferDevice,
670  Kokkos::MemoryUnmanaged>& permute_from_lids,
671  const Kokkos::View<size_t*,
672  typename LocalMap::device_type,
673  Kokkos::MemoryUnmanaged>& tgt_rowptr,
674  const Kokkos::View<typename LocalMap::global_ordinal_type*,
675  typename LocalMap::device_type,
676  Kokkos::MemoryUnmanaged>& tgt_colind,
677  const Kokkos::View<const int*,
678  typename LocalMap::device_type,
679  Kokkos::MemoryUnmanaged>& src_pids,
680  const Kokkos::View<int*,
681  typename LocalMap::device_type,
682  Kokkos::MemoryUnmanaged>& tgt_pids,
683  const size_t num_same_ids,
684  const size_t tgt_num_rows,
685  const size_t tgt_num_nonzeros,
686  const int my_tgt_pid)
687 {
688  using Kokkos::View;
689  using Kokkos::subview;
690  using Kokkos::parallel_for;
691  using Kokkos::MemoryUnmanaged;
692  using packet_type = Packet;
693  using local_map_type = LocalMap;
694  using local_graph_type = LocalGraph;
695  using buffer_device_type = BufferDevice;
696  using device_type = typename LocalMap::device_type;
697  using LO = typename LocalMap::local_ordinal_type;
698  using execution_space = typename device_type::execution_space;
699  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
700  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
701 
702  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
703 
704  const size_t N = tgt_num_rows;
705  const size_t mynnz = tgt_num_nonzeros;
706 
707  // In the case of reduced communicators, the sourceGraph won't have
708  // the right "my_pid", so thus we have to supply it.
709  const int my_pid = my_tgt_pid;
710 
711  // FIXME (mfh 24 Jun 2019)
712  //
713  // 1. Only zero the entries of tgt_rowptr that actually need it.
714  // 2. Consider merging these three kernels into one.
715 
716  // Zero the rowptr
 // Covers all N+1 entries, including the trailing one.
717  parallel_for(
718  range_policy(0, N+1),
719  KOKKOS_LAMBDA(const size_t i) {
720  tgt_rowptr(i) = 0;
721  }
722  );
723 
724  // same IDs: Always first, always in the same place
 // At this stage tgt_rowptr holds row lengths, not offsets.
725  parallel_for(
726  range_policy(0, num_same_ids),
727  KOKKOS_LAMBDA(const size_t i) {
728  const LO tgt_lid = static_cast<LO>(i);
729  const LO src_lid = static_cast<LO>(i);
730  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
731  - local_graph.row_map(src_lid);
732  }
733  );
734 
735  // Permute IDs: Still local, but reordered
736  const size_type num_permute_to_lids = permute_to_lids.extent(0);
737  parallel_for(
738  range_policy(0, num_permute_to_lids),
739  KOKKOS_LAMBDA(const size_t i) {
740  const LO tgt_lid = permute_to_lids(i);
741  const LO src_lid = permute_from_lids(i);
742  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
743  - local_graph.row_map(src_lid);
744  }
745  );
746 
747  // Get the offsets from the number of packets per LID
748  const size_type num_import_lids = import_lids.extent(0);
749  View<size_t*, device_type> offsets("offsets", num_import_lids+1);
750  computeOffsetsFromCounts(offsets, num_packets_per_lid);
751 
752 #ifdef HAVE_TPETRA_DEBUG
753  {
 // Debug-only check: the last offset must equal the buffer length.
754  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
755  const bool condition =
756  nth_offset_h != static_cast<size_t>(imports.extent(0));
757  TEUCHOS_TEST_FOR_EXCEPTION
758  (condition, std::logic_error, prefix
759  << "The final offset in bytes " << nth_offset_h
760  << " != imports.size() = " << imports.extent(0)
761  << ". Please report this bug to the Tpetra developers.");
762  }
763 #endif // HAVE_TPETRA_DEBUG
764 
765  // Setup row pointers for remotes
 // Adds the received rows' entry counts to the lengths set above.
766  setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
767  tgt_rowptr, import_lids, imports, num_packets_per_lid);
768 
769  // If multiple processes contribute to the same row, we may need to
770  // update row offsets. This tracks that.
771  View<size_t*, device_type> new_start_row("new_start_row", N+1);
772 
773  // Turn row length into a real CRS row pointer
774  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
775  {
 // The last row offset must match the entry count supplied by the caller.
776  auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
777  bool condition = nth_tgt_rowptr_h != mynnz;
778  TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
779  prefix << "CRS_rowptr[last] = " <<
780  nth_tgt_rowptr_h << "!= mynnz = " << mynnz << ".");
781  }
782 
783  // SameIDs: Copy the data over
784  copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
785  tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
786 
 // Permuted rows: same copy, with distinct source and target row indices.
787  copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
788  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
789  local_graph, local_col_map, my_pid);
790 
 // Nothing was received, so there is nothing left to unpack.
791  if (imports.extent(0) <= 0) {
792  return;
793  }
794 
795  unpackAndCombineIntoCrsArrays2<
796  packet_type,local_graph_type,local_map_type,buffer_device_type>(
797  tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
798  num_packets_per_lid, local_graph, local_col_map, my_pid);
799 
800  return;
801 }
802 
803 } // namespace UnpackAndCombineCrsGraphImpl
804 
/// \brief Host-side entry point that validates its arguments, wraps the host
///   arrays in views, and forwards to an Impl-level counting routine.
///
/// \throws std::invalid_argument if \c permuteToLIDs and \c permuteFromLIDs
///   differ in size, if \c sourceGraph is not locally indexed, or if
///   \c importLIDs and \c numPacketsPerLID differ in size.
///
/// The \c constantNumPackets and \c combineMode arguments are unused.
///
/// NOTE(review): several lines are missing from this listing: the function
/// name and first parameter (listing lines 854-855), the call that creates
/// \c permute_from_lids_d (line 890), and the start of the final forwarded
/// call (line 905). Presumably the name is the one in \c prefix below and the
/// first parameter is \c sourceGraph -- confirm against the real header.
852 template<class LocalOrdinal, class GlobalOrdinal, class Node>
853 size_t
856  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
857  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type> &imports,
858  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
859  size_t /* constantNumPackets */,
860  CombineMode /* combineMode */,
861  size_t numSameIDs,
862  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
863  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
864 {
865  using Kokkos::MemoryUnmanaged;
866  using Kokkos::View;
867  using device_type = typename Node::device_type;
868  using packet_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type;
869  using local_graph_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_device_type;
870  using buffer_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type;
871  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
872 
 // The two permutation lists are checked to have equal length.
873  TEUCHOS_TEST_FOR_EXCEPTION
874  (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
875  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
876  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
877  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
878  // process, then the graph is neither locally nor globally indexed.
879  const bool locallyIndexed = sourceGraph.isLocallyIndexed();
880  TEUCHOS_TEST_FOR_EXCEPTION
881  (! locallyIndexed, std::invalid_argument, prefix << "The input "
882  "CrsGraph 'sourceGraph' must be locally indexed.");
 // One packet count is required per import LID.
883  TEUCHOS_TEST_FOR_EXCEPTION
884  (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
885  prefix << "importLIDs.size() = " << importLIDs.size() << " != "
886  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
887 
888  auto local_graph = sourceGraph.getLocalGraphDevice();
 // NOTE(review): the call that consumes the next three argument lines
 // (listing line 890) is missing from this listing.
889  auto permute_from_lids_d =
891  permuteFromLIDs.getRawPtr(),
892  permuteFromLIDs.size(), true,
893  "permute_from_lids");
894  auto imports_d =
895  create_mirror_view_from_raw_host_array(buffer_device_type(),
896  imports.getRawPtr(),
897  imports.size(), true,
898  "imports");
899  auto num_packets_per_lid_d =
900  create_mirror_view_from_raw_host_array(buffer_device_type(),
901  numPacketsPerLID.getRawPtr(),
902  numPacketsPerLID.size(), true,
903  "num_packets_per_lid");
904 
 // NOTE(review): the start of the forwarded call (listing line 905) is
 // missing; presumably its result is returned -- confirm.
906  packet_type,local_graph_device_type,buffer_device_type>(
907  local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
908 }
909 
/// \brief Check the argument sizes, stage the host arrays as Kokkos views,
///   hand them to the implementation routine, and copy the output arrays
///   (CRS_rowptr, CRS_colind, TargetPids) back to the caller's host memory.
///
/// NOTE(review): in this listing, lines 925-926 (the function name and its
/// first parameter) are missing.  The error prefix in the body names the
/// function "Tpetra::Details::unpackAndCombineIntoCrsArrays" and the body
/// reads from an object called sourceGraph, so that is presumably the
/// missing first parameter -- confirm against the upstream file.
///
/// \param importLIDs [in] Staged as the view labeled "import_lids"; its size
///   must equal numPacketsPerLID.size().
/// \param imports [in] Staged as the view labeled "imports" and forwarded.
/// \param numPacketsPerLID [in] Must have the same size as importLIDs.
/// \param (unnamed size_t, unnamed CombineMode) Not used by this function;
///   their names (constantNumPackets, combineMode) are commented out.
/// \param numSameIDs [in] Forwarded unchanged to the implementation routine.
/// \param permuteToLIDs [in] Must have the same size as permuteFromLIDs.
/// \param permuteFromLIDs [in] Must have the same size as permuteToLIDs.
/// \param TargetNumRows [in] CRS_rowptr.size() must equal TargetNumRows + 1.
/// \param TargetNumNonzeros [in] TargetPids is sized to this many entries.
/// \param MyTargetPID [in] Forwarded unchanged to the implementation routine.
/// \param CRS_rowptr [in/out] Staged as "crs_rowptr"; overwritten from the
///   staged view after the call.
/// \param CRS_colind [in/out] Staged as "crs_colidx"; overwritten from the
///   staged view after the call.
/// \param SourcePids [in] Staged as "src_pids" and forwarded.
/// \param TargetPids [out] Resized to TargetNumNonzeros, filled with -1,
///   then overwritten from the staged view after the call.
///
/// \throws std::invalid_argument (through TEUCHOS_TEST_FOR_EXCEPTION) if any
///   of the three size checks at the top of the body fails.
923 template<class LocalOrdinal, class GlobalOrdinal, class Node>
924 void
927  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
928  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type>& imports,
929  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
930  const size_t /* constantNumPackets */,
931  const CombineMode /* combineMode */,
932  const size_t numSameIDs,
933  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
934  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
935  size_t TargetNumRows,
936  size_t TargetNumNonzeros,
937  const int MyTargetPID,
938  const Teuchos::ArrayView<size_t>& CRS_rowptr,
939  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
940  const Teuchos::ArrayView<const int>& SourcePids,
941  Teuchos::Array<int>& TargetPids)
942 {
  // NOTE(review): View, outArg, REDUCE_MAX and reduceAll are not referenced
  // by name in the visible part of this body (views are spelled
  // Kokkos::View) -- they may be leftovers; confirm before removing.
943  using Kokkos::View;
944  using Kokkos::deep_copy;
945  using Teuchos::outArg;
946  using Teuchos::REDUCE_MAX;
947  using Teuchos::reduceAll;
948  using LO = LocalOrdinal;
949  using GO = GlobalOrdinal;
950  using crs_graph_type = CrsGraph<LO, GO, Node>;
951  using packet_type = typename crs_graph_type::packet_type;
952  using local_graph_device_type = typename crs_graph_type::local_graph_device_type;
953  using buffer_device_type = typename crs_graph_type::buffer_device_type;
954  using device_type = typename Node::device_type;
955  using size_type = typename Teuchos::ArrayView<const LO>::size_type;
956 
957  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
958 
  // The row-pointer array needs one more entry than there are target rows.
  // NOTE(review): this message and the next one have no space before "!=",
  // unlike the third check below, so the two numbers run into the operator.
959  TEUCHOS_TEST_FOR_EXCEPTION(
960  TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
961  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
962  CRS_rowptr.size() << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
963 
  // The two permutation lists must have the same length.
964  TEUCHOS_TEST_FOR_EXCEPTION(
965  permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
966  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
967  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
968  const size_type numImportLIDs = importLIDs.size();
969 
  // There must be exactly one packet count per import LID.
970  TEUCHOS_TEST_FOR_EXCEPTION(
971  numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
972  prefix << "importLIDs.size() = " << numImportLIDs << " != "
973  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
974 
  // NOTE(review): the resize looks redundant if assign(n, value) already
  // sets the array length to n, as std::vector::assign does -- confirm for
  // Teuchos::Array.
975  // Preseed TargetPids with -1 for local
976  if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
977  TargetPids.resize(TargetNumNonzeros);
978  }
979  TargetPids.assign(TargetNumNonzeros, -1);
980 
981  // Grab pointers for sourceGraph
982  auto local_graph = sourceGraph.getLocalGraphDevice();
983  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
984 
  // Read-only inputs are staged on buffer_device_type; the three output
  // arrays and SourcePids are staged on device_type.  In the intact calls
  // the `true` argument sits in the position of the helper's `copy` flag.
  // NOTE(review): listing lines 990, 995, 1015, 1020, 1025 and 1030 are
  // missing from this extract.  Each is presumably the opening
  // `create_mirror_view_from_raw_host_array(...` of the statement it
  // belongs to, matching the intact statements below (with outputDevice
  // as the first argument for the device_type views) -- confirm upstream.
985  // Convert input arrays to Kokkos::View
986  device_type outputDevice;
987  buffer_device_type bufferOutputDevice;
988 
989  Kokkos::View<const LO*, buffer_device_type> import_lids_d =
991  (bufferOutputDevice, importLIDs.getRawPtr(),
992  importLIDs.size(), true, "import_lids");
993 
994  Kokkos::View<const packet_type*, buffer_device_type> imports_d =
996  (bufferOutputDevice, imports.getRawPtr(),
997  imports.size(), true, "imports");
998 
999  Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_d =
1000  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1001  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1002  true, "num_packets_per_lid");
1003 
1004  Kokkos::View<const LO*, buffer_device_type> permute_to_lids_d =
1005  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1006  permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1007  true, "permute_to_lids");
1008 
1009  Kokkos::View<const LO*, buffer_device_type> permute_from_lids_d =
1010  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1011  permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1012  true, "permute_from_lids");
1013 
1014  Kokkos::View<size_t*, device_type> crs_rowptr_d =
1016  CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1017  true, "crs_rowptr");
1018 
1019  Kokkos::View<GO*, device_type> crs_colind_d =
1021  CRS_colind.getRawPtr(), CRS_colind.size(),
1022  true, "crs_colidx");
1023 
1024  Kokkos::View<const int*, device_type> src_pids_d =
1026  SourcePids.getRawPtr(), SourcePids.size(),
1027  true, "src_pids");
1028 
1029  Kokkos::View<int*, device_type> tgt_pids_d =
1031  TargetPids.getRawPtr(), TargetPids.size(),
1032  true, "tgt_pids");
1033 
  // NOTE(review): listing line 1035, which names the routine being called
  // with these template and function arguments, is missing from this
  // extract -- restore it from the upstream file.
1034  using local_map_type = decltype(local_col_map);
1036  packet_type,local_graph_device_type,local_map_type,buffer_device_type>(
1037  local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1038  permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1039  tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
1040 
1041  // FIXME (mfh 25 Jun 2019) HostMirror of CudaUVMSpace is CudaUVMSpace!!!
1042 
  // Each HostMirror view below is built from the caller's raw pointer and
  // length, so the deep_copy targets the caller's own array (presumably as
  // an unmanaged view -- confirm against the Kokkos View documentation).
1043  // Copy outputs back to host
1044  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1045  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1046  deep_copy(crs_rowptr_h, crs_rowptr_d);
1047 
1048  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1049  CRS_colind.getRawPtr(), CRS_colind.size());
1050  deep_copy(crs_colind_h, crs_colind_d);
1051 
1052  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1053  TargetPids.getRawPtr(), TargetPids.size());
1054  deep_copy(tgt_pids_h, tgt_pids_d);
1055 
1056 }
1057 
1058 } // namespace Details
1059 } // namespace Tpetra
1060 
// Explicit-instantiation macro.  For one (LO, GO, NT) combination it emits
// explicit template instantiations of two functions:
//   - Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>        (returns void)
//   - Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>  (returns size_t)
// The parameter type lists spelled out below have to match the function
// definitions exactly, so any signature change must be mirrored here.
// NOTE(review): the body refers to Details:: and CrsGraph without further
// qualification, so it presumably has to be expanded inside namespace
// Tpetra -- confirm at the point of use.
// Keep comments outside the macro body: a // comment on a continued line
// would swallow the lines that follow it.
1061 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
1062  template void \
1063  Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
1064  const CrsGraph<LO, GO, NT> &, \
1065  const Teuchos::ArrayView<const LO>&, \
1066  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1067  const Teuchos::ArrayView<const size_t>&, \
1068  const size_t, \
1069  const CombineMode, \
1070  const size_t, \
1071  const Teuchos::ArrayView<const LO>&, \
1072  const Teuchos::ArrayView<const LO>&, \
1073  size_t, \
1074  size_t, \
1075  const int, \
1076  const Teuchos::ArrayView<size_t>&, \
1077  const Teuchos::ArrayView<GO>&, \
1078  const Teuchos::ArrayView<const int>&, \
1079  Teuchos::Array<int>&); \
1080  template size_t \
1081  Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
1082  const CrsGraph<LO, GO, NT> &, \
1083  const Teuchos::ArrayView<const LO> &, \
1084  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
1085  const Teuchos::ArrayView<const size_t>&, \
1086  size_t, \
1087  CombineMode, \
1088  size_t, \
1089  const Teuchos::ArrayView<const LO>&, \
1090  const Teuchos::ArrayView<const LO>&);
1091 
1092 #endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
GlobalOrdinal global_ordinal_type
The type of global indices.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries...
Declaration of the Tpetra::CrsGraph class.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Functions for manipulating CRS arrays.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
CombineMode
Rule for combining data in an Import or Export.
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
DeviceType device_type
The device type.
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Declaration and definition of Tpetra::Details::getEntryOnHost.
local_graph_device_type getLocalGraphDevice() const
Get the local graph.
global_ordinal_type packet_type
Type of each entry of the DistObject communication buffer.
LocalOrdinal local_ordinal_type
The type of local indices.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.