Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
43 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
44 
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
53 #include "Tpetra_CrsGraph_decl.hpp"
56 #include "Kokkos_Core.hpp"
57 #include <memory>
58 #include <string>
59 
78 
79 namespace Tpetra {
80 
81 #ifndef DOXYGEN_SHOULD_SKIP_THIS
82 // Forward declaration of Distributor
83 class Distributor;
84 #endif // DOXYGEN_SHOULD_SKIP_THIS
85 
86 //
87 // Users must never rely on anything in the Details namespace.
88 //
89 namespace Details {
90 
91 namespace UnpackAndCombineCrsGraphImpl {
92 
102 template<class Packet, class GO, class Device, class BufferDevice>
103 KOKKOS_FUNCTION int
104 unpackRow (const Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
105  const Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
106  const Kokkos::View<const Packet*,BufferDevice>& imports,
107  const size_t offset,
108  const size_t num_ent)
109 {
110  using size_type = typename Kokkos::View<GO*,Device>::size_type;
111 
112  if (num_ent == 0) {
113  // Empty rows always take zero bytes, to ensure sparsity.
114  return 0;
115  }
116 
117  // Unpack GIDs
118  for (size_type k=0; k<num_ent; k++)
119  gids_out(k) = imports(offset+k);
120 
121  // Unpack PIDs
122  if (pids_out.size() > 0) {
123  for (size_type k=0; k<num_ent; k++) {
124  pids_out(k) = static_cast<int>(imports(offset+num_ent+k));
125  }
126  }
127 
128  return 0;
129 }
130 
141 template<class LocalOrdinal,
142  class Packet,
143  class RowView,
144  class IndicesView,
145  class BufferDevice>
147 
148  using LO = LocalOrdinal;
149  using GO = typename IndicesView::value_type;
150  using packet_type = Packet;
151  using row_ptrs_type = RowView;
152  using indices_type = IndicesView;
153  using buffer_device_type = BufferDevice;
154 
155  using device_type = typename IndicesView::device_type;
156  using execution_space = typename device_type::execution_space;
157 
158  using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
159  using offsets_type = Kokkos::View<const size_t*, device_type>;
160  using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
161  using import_lids_type = Kokkos::View<const LO*, buffer_device_type>;
162 
163  using gids_scratch_type = Kokkos::View<GO*, device_type>;
164  using pids_scratch_type = Kokkos::View<int*,device_type>;
165 
166  row_ptrs_type row_ptrs_beg;
167  row_ptrs_type row_ptrs_end;
168  indices_type indices;
169  input_buffer_type imports;
170  num_packets_per_lid_type num_packets_per_lid;
171  import_lids_type import_lids;
172  offsets_type offsets;
173  size_t max_num_ent;
174  bool unpack_pids;
175  Kokkos::Experimental::UniqueToken<execution_space,
176  Kokkos::Experimental::UniqueTokenScope::Global> tokens;
177  gids_scratch_type gids_scratch;
178  pids_scratch_type pids_scratch;
179 
180  public:
181  using value_type = Kokkos::pair<int, LO>;
182 
184  const row_ptrs_type& row_ptrs_beg_in,
185  const row_ptrs_type& row_ptrs_end_in,
186  const indices_type& indices_in,
187  const input_buffer_type& imports_in,
188  const num_packets_per_lid_type& num_packets_per_lid_in,
189  const import_lids_type& import_lids_in,
190  const offsets_type& offsets_in,
191  const size_t max_num_ent_in,
192  const bool unpack_pids_in) :
193  row_ptrs_beg(row_ptrs_beg_in),
194  row_ptrs_end(row_ptrs_end_in),
195  indices(indices_in),
196  imports(imports_in),
197  num_packets_per_lid(num_packets_per_lid_in),
198  import_lids(import_lids_in),
199  offsets(offsets_in),
200  max_num_ent(max_num_ent_in),
201  unpack_pids(unpack_pids_in),
202  tokens(execution_space()),
203  gids_scratch("gids_scratch", tokens.size() * max_num_ent),
204  pids_scratch("pids_scratch", tokens.size() * max_num_ent)
205  {}
206 
207  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
208  {
209  using Tpetra::Details::OrdinalTraits;
210  dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
211  }
212 
213  KOKKOS_INLINE_FUNCTION void
214  join(volatile value_type& dst, const volatile value_type& src) const
215  {
216  // `dst` should reflect the first (least) bad index and
217  // all other associated error codes and data. Thus, we need only
218  // check if the `src` object shows an error and if its associated
219  // bad index is less than `dst`'s bad index.
220  using Tpetra::Details::OrdinalTraits;
221  if (src.second != OrdinalTraits<LO>::invalid()) {
222  // An error in the src; check if
223  // 1. `dst` shows errors
224  // 2. If `dst` does show errors, if src's bad index is less than
225  // *this' bad index
226  if (dst.second == OrdinalTraits<LO>::invalid() ||
227  src.second < dst.second) {
228  dst = src;
229  }
230  }
231  }
232 
233  KOKKOS_INLINE_FUNCTION
234  void operator()(const LO i, value_type& dst) const
235  {
236  using Kokkos::View;
237  using Kokkos::subview;
238  using Kokkos::MemoryUnmanaged;
239  using size_type = typename execution_space::size_type;
240  using slice = typename Kokkos::pair<size_type, size_type>;
241 
242  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
243  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
244 
245  const size_t num_packets_this_lid = num_packets_per_lid(i);
246  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
247  : num_packets_this_lid;
248  if (unpack_pids && num_packets_this_lid%2 != 0) {
249  // Attempting to unpack PIDs, but num_packets_this_lid is not even; this
250  // should never
251  dst = Kokkos::make_pair(1, i);
252  return;
253  }
254 
255  // Only unpack data if there is a nonzero number to unpack
256  if (num_ent == 0) {
257  return;
258  }
259 
260  // there is actually something in the row
261  const size_t buf_size = imports.size();
262  const size_t offset = offsets(i);
263 
264  if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
265  dst = Kokkos::make_pair(2, i); // out of bounds
266  return;
267  }
268 
269  // Get subviews in to the scratch arrays. The token returned from acquire
270  // is an integer in [0, tokens.size()). It is used to grab a unique (to
271  // this thread) subview of the scratch arrays.
272  const size_type token = tokens.acquire();
273  const size_t a = static_cast<size_t>(token) * max_num_ent;
274  const size_t b = a + num_ent;
275  gids_out_type gids_out = subview(gids_scratch, slice(a, b));
276  pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
277 
278  const int err = unpackRow (gids_out, pids_out, imports, offset, num_ent);
279 
280  if (err != 0) {
281  dst = Kokkos::make_pair(3, i);
282  tokens.release(token);
283  return;
284  }
285 
286  auto import_lid = import_lids(i);
287  for (size_t k = 0; k < num_ent; ++k) {
288  indices(row_ptrs_end(import_lid)) = gids_out(k);
289  // this is OK; don't need atomic, since LIDs to pack don't have repeats.
290  row_ptrs_end(import_lid) += 1;
291  }
292 
293  tokens.release(token);
294  }
295 
296 };
297 
298 template<class NumPackets, class ImportLids, class Device>
299 Kokkos::UnorderedMap<typename ImportLids::non_const_value_type,
300  typename NumPackets::non_const_value_type,
301  Device>
302 computeCrsPadding(const NumPackets& num_packets_per_lid,
303  const ImportLids& import_lids,
304  const bool unpack_pids)
305 {
306  // Create a mapping of {LID: extra space needed} to rapidly look up which LIDs
307  // need additional padding.
308  using key_type = typename ImportLids::non_const_value_type;
309  using val_type = typename NumPackets::non_const_value_type;
310  Kokkos::UnorderedMap<key_type, val_type, Device> padding(import_lids.size());
311  auto policy = Kokkos::RangePolicy<typename Device::execution_space>(0, import_lids.size());
312  Kokkos::parallel_for("Fill padding", policy,
313  KOKKOS_LAMBDA(typename ImportLids::size_type i) {
314  auto how_much_padding = (unpack_pids) ? num_packets_per_lid(i)/2
315  : num_packets_per_lid(i);
316  padding.insert(import_lids(i), how_much_padding);
317  }
318  );
319  TEUCHOS_TEST_FOR_EXCEPTION(padding.failed_insert(), std::runtime_error,
320  "computeCrsPadding: failed to insert one or more indices in to padding map");
321  return padding;
322 }
323 
330 template<class LocalOrdinal, class Packet, class RowView,
331  class IndicesView, class BufferDevice>
332 void
333 unpackAndCombine
334 (const RowView& row_ptrs_beg,
335  const RowView& row_ptrs_end,
336  IndicesView& indices,
337  const Kokkos::View<const Packet*, BufferDevice, Kokkos::MemoryUnmanaged>& imports,
338  const Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>& num_packets_per_lid,
339  const Kokkos::View<const LocalOrdinal*, BufferDevice, Kokkos::MemoryUnmanaged>& import_lids,
340  const bool unpack_pids)
341 {
342 
343  using ImportLidsView =
344  Kokkos::View<const LocalOrdinal*, BufferDevice, Kokkos::MemoryUnmanaged>;
345  using NumPacketsView =
346  Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>;
347  using LO = LocalOrdinal;
348  using execution_space = typename BufferDevice::execution_space;
349  using range_policy =
350  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
351  using unpack_functor_type =
352  UnpackAndCombineFunctor<LO, Packet, RowView, IndicesView, BufferDevice>;
353 
354  const char prefix[] =
355  "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
356 
357  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
358  if (num_import_lids == 0) {
359  // Nothing to unpack
360  return;
361  }
362 
363  using device_type = typename IndicesView::device_type;
364 
365  // Resize row pointers and indices to accommodate incoming data
366  auto padding =
367  computeCrsPadding<NumPacketsView, ImportLidsView, device_type>
368  (num_packets_per_lid, import_lids, unpack_pids);
369  padCrsArrays<RowView, IndicesView, decltype (padding) > (row_ptrs_beg, row_ptrs_end, indices, padding);
370 
371  // Get the offsets
372  Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids+1);
373  computeOffsetsFromCounts(offsets, num_packets_per_lid);
374 
375  // Determine the maximum number of entries in any row in the graph. The
376  // maximum number of entries is needed to allocate unpack buffers on the
377  // device.
378  size_t max_num_ent;
379  Kokkos::parallel_reduce
380  ("MaxReduce",
381  range_policy (0, LO (num_packets_per_lid.size ())),
382  KOKKOS_LAMBDA (const LO i, size_t& running_max_num_ent) {
383  const size_t num_packets_this_lid = num_packets_per_lid(i);
384  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2 :
385  num_packets_this_lid;
386  if (num_ent > running_max_num_ent) {
387  running_max_num_ent = num_ent;
388  }
389  }, Kokkos::Max<size_t> (max_num_ent));
390 
391  // Now do the actual unpack!
392  unpack_functor_type f (row_ptrs_beg, row_ptrs_end, indices, imports,
393  num_packets_per_lid, import_lids, offsets,
394  max_num_ent, unpack_pids);
395 
396  typename unpack_functor_type::value_type x;
397  Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
398  auto x_h = x.to_std_pair();
399  TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
400  prefix << "UnpackAndCombineFunctor reported error code "
401  << x_h.first << " for the first bad row " << x_h.second);
402 }
403 
404 template<class Packet, class LocalGraph, class BufferDevice>
405 size_t
407  const LocalGraph& local_graph,
408  const Kokkos::View<const typename LocalGraph::data_type*,
409  typename LocalGraph::device_type,
410  Kokkos::MemoryUnmanaged> permute_from_lids,
411  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
412  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
413  const size_t num_same_ids)
414 {
415  using Kokkos::parallel_reduce;
416  using local_graph_type = LocalGraph;
417  using LO = typename local_graph_type::data_type;
418  using device_type = typename local_graph_type::device_type;
419  using execution_space = typename device_type::execution_space;
420  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
421 
422  size_t count = 0;
423  LO num_items;
424 
425  // Number of graph entries to unpack (returned by this function).
426  num_items = static_cast<LO>(num_same_ids);
427  if (num_items) {
428  size_t kcnt = 0;
429  parallel_reduce(
430  range_policy(0, num_items),
431  KOKKOS_LAMBDA(const LO lid, size_t& update) {
432  update += static_cast<size_t>(local_graph.row_map[lid+1]
433  -local_graph.row_map[lid]);
434  }, kcnt);
435  count += kcnt;
436  }
437 
438  // Count entries copied directly from the source graph with permuting.
439  num_items = static_cast<LO>(permute_from_lids.extent(0));
440  if (num_items) {
441  size_t kcnt = 0;
442  parallel_reduce(
443  range_policy(0, num_items),
444  KOKKOS_LAMBDA(const LO i, size_t& update) {
445  const LO lid = permute_from_lids(i);
446  update += static_cast<size_t>(local_graph.row_map[lid+1]
447  - local_graph.row_map[lid]);
448  }, kcnt);
449  count += kcnt;
450  }
451 
452  {
453  // Count entries received from other MPI processes.
454  size_t tot_num_ent = 0;
455  parallel_reduce("SumReduce",
456  num_packets_per_lid.size(),
457  KOKKOS_LAMBDA(const int& i, size_t& lsum) {
458  lsum += num_packets_per_lid(i) / 2;
459  }, Kokkos::Sum<size_t>(tot_num_ent));
460  count += tot_num_ent;
461  }
462 
463  return count;
464 }
465 
467 template<class Packet, class LO, class Device, class BufferDevice>
468 void
469 setupRowPointersForRemotes(
470  const Kokkos::View<size_t*, Device>& tgt_rowptr,
471  const Kokkos::View<const LO*, BufferDevice>& import_lids,
472  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
473  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
474 {
475  using Kokkos::parallel_reduce;
476  using device_type = Device;
477  using execution_space = typename device_type::execution_space;
478  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
479  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
480 
481  const size_type N = num_packets_per_lid.extent(0);
482  parallel_for("Setup row pointers for remotes",
483  range_policy(0, N),
484  KOKKOS_LAMBDA(const size_t i){
485  using atomic_incr_type = typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
486  const size_t num_packets_this_lid = num_packets_per_lid(i);
487  const size_t num_ent = num_packets_this_lid / 2;
488  Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
489  });
490 }
491 
492 // Convert array of row lengths to a CRS pointer array
493 template<class Device>
494 void
495 makeCrsRowPtrFromLengths(
496  const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
497  const Kokkos::View<size_t*,Device>& new_start_row)
498 {
499  using Kokkos::parallel_scan;
500  using device_type = Device;
501  using execution_space = typename device_type::execution_space;
502  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
503  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
504  const size_type N = new_start_row.extent(0);
505  parallel_scan(
506  range_policy(0, N),
507  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
508  auto cur_val = tgt_rowptr(i);
509  if (final) {
510  tgt_rowptr(i) = update;
511  new_start_row(i) = tgt_rowptr(i);
512  }
513  update += cur_val;
514  }
515  );
516 }
517 
518 template<class LocalGraph, class LocalMap>
519 void
520 copyDataFromSameIDs(
521  const Kokkos::View<typename LocalMap::global_ordinal_type*,
522  typename LocalMap::device_type>& tgt_colind,
523  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
524  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
525  const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
526  const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
527  const LocalGraph& local_graph,
528  const LocalMap& local_col_map,
529  const size_t num_same_ids,
530  const int my_pid)
531 {
532  using Kokkos::parallel_for;
533  using device_type = typename LocalMap::device_type;
534  using LO = typename LocalMap::local_ordinal_type;
535  using execution_space = typename device_type::execution_space;
536  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
537 
538  parallel_for(
539  range_policy(0, num_same_ids),
540  KOKKOS_LAMBDA(const size_t i) {
541  using atomic_incr_type =typename std::remove_reference<decltype(new_start_row(0))>::type;
542 
543  const LO src_lid = static_cast<LO>(i);
544  size_t src_row = local_graph.row_map(src_lid);
545 
546  const LO tgt_lid = static_cast<LO>(i);
547  const size_t tgt_row = tgt_rowptr(tgt_lid);
548 
549  const size_t nsr = local_graph.row_map(src_lid+1)
550  - local_graph.row_map(src_lid);
551  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
552 
553  for (size_t j=local_graph.row_map(src_lid);
554  j<local_graph.row_map(src_lid+1); ++j) {
555  LO src_col = local_graph.entries(j);
556  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
557  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
558  }
559  }
560  );
561 }
562 
563 template<class LocalGraph, class LocalMap, class BufferDevice>
564 void
565 copyDataFromPermuteIDs(
566  const Kokkos::View<typename LocalMap::global_ordinal_type*,
567  typename LocalMap::device_type>& tgt_colind,
568  const Kokkos::View<int*,
569  typename LocalMap::device_type>& tgt_pids,
570  const Kokkos::View<size_t*,
571  typename LocalMap::device_type>& new_start_row,
572  const Kokkos::View<size_t*,
573  typename LocalMap::device_type>& tgt_rowptr,
574  const Kokkos::View<const int*,
575  typename LocalMap::device_type>& src_pids,
576  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
577  BufferDevice, Kokkos::MemoryUnmanaged>& permute_to_lids,
578  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
579  BufferDevice, Kokkos::MemoryUnmanaged>& permute_from_lids,
580  const LocalGraph& local_graph,
581  const LocalMap& local_col_map,
582  const int my_pid)
583 {
584  using Kokkos::parallel_for;
585  using device_type = typename LocalMap::device_type;
586  using LO = typename LocalMap::local_ordinal_type;
587  using execution_space = typename device_type::execution_space;
588  using size_type = typename Kokkos::View<LO*,device_type>::size_type;
589  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
590 
591  const size_type num_permute_to_lids = permute_to_lids.extent(0);
592 
593  parallel_for(
594  range_policy(0, num_permute_to_lids),
595  KOKKOS_LAMBDA(const size_t i) {
596  using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
597 
598  const LO src_lid = permute_from_lids(i);
599  const size_t src_row = local_graph.row_map(src_lid);
600 
601  const LO tgt_lid = permute_to_lids(i);
602  const size_t tgt_row = tgt_rowptr(tgt_lid);
603 
604  size_t nsr = local_graph.row_map(src_lid+1)
605  - local_graph.row_map(src_lid);
606  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
607 
608  for (size_t j=local_graph.row_map(src_lid);
609  j<local_graph.row_map(src_lid+1); ++j) {
610  LO src_col = local_graph.entries(j);
611  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
612  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
613  }
614  }
615  );
616 }
617 
618 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
619 void
620 unpackAndCombineIntoCrsArrays2(
621  const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
622  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
623  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
624  const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
625  const Kokkos::View<
626  const typename LocalMap::local_ordinal_type*,
627  BufferDevice,
628  Kokkos::MemoryUnmanaged>& import_lids,
629  const Kokkos::View<const Packet*, BufferDevice>& imports,
630  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
631  const LocalGraph& /* local_graph */,
632  const LocalMap /*& local_col_map*/,
633  const int my_pid)
634 {
635  using Kokkos::View;
636  using Kokkos::subview;
637  using Kokkos::MemoryUnmanaged;
638  using Kokkos::parallel_reduce;
639  using Kokkos::atomic_fetch_add;
640 
641  using device_type = typename LocalMap::device_type;
642  using LO = typename LocalMap::local_ordinal_type;
643  using GO = typename LocalMap::global_ordinal_type;
644  using execution_space = typename device_type::execution_space;
645  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
646  using slice = typename Kokkos::pair<size_type, size_type>;
647  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
648 
649  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
650  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
651 
652  const size_type num_import_lids = import_lids.size();
653  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
654 
655  // RemoteIDs: Loop structure following UnpackAndCombine
656  int gbl_err_count;
657  parallel_reduce("Unpack and combine into CRS",
658  range_policy(0, num_import_lids),
659  KOKKOS_LAMBDA(const size_t i, int& err) {
660  using atomic_incr_type = typename std::remove_reference< decltype( new_start_row(0) )>::type;
661  const size_t num_packets_this_lid = num_packets_per_lid(i);
662  const size_t num_ent = num_packets_this_lid / 2;
663  const size_t offset = offsets(i);
664  const LO lcl_row = import_lids(i);
665  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
666  const size_t end_row = start_row + num_ent;
667 
668  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
669  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
670 
671  err += unpackRow (gids_out, pids_out, imports, offset, num_ent);
672 
673  // Correct target PIDs.
674  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
675  const int pid = pids_out(j);
676  pids_out(j) = (pid != my_pid) ? pid : -1;
677  }
678  }, gbl_err_count);
679 
680  TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
681  std::invalid_argument, prefix <<
682  "Attempting to unpack PIDs, but num_ent is not even; this should never "
683  "happen! Please report this bug to the Tpetra developers.");
684 
685  return;
686 }
687 
688 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
689 void
691  const LocalGraph & local_graph,
692  const LocalMap & local_col_map,
693  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
694  BufferDevice,
695  Kokkos::MemoryUnmanaged>& import_lids,
696  const Kokkos::View<const Packet*, BufferDevice>& imports,
697  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
698  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
699  BufferDevice,
700  Kokkos::MemoryUnmanaged>& permute_to_lids,
701  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
702  BufferDevice,
703  Kokkos::MemoryUnmanaged>& permute_from_lids,
704  const Kokkos::View<size_t*,
705  typename LocalMap::device_type,
706  Kokkos::MemoryUnmanaged>& tgt_rowptr,
707  const Kokkos::View<typename LocalMap::global_ordinal_type*,
708  typename LocalMap::device_type,
709  Kokkos::MemoryUnmanaged>& tgt_colind,
710  const Kokkos::View<const int*,
711  typename LocalMap::device_type,
712  Kokkos::MemoryUnmanaged>& src_pids,
713  const Kokkos::View<int*,
714  typename LocalMap::device_type,
715  Kokkos::MemoryUnmanaged>& tgt_pids,
716  const size_t num_same_ids,
717  const size_t tgt_num_rows,
718  const size_t tgt_num_nonzeros,
719  const int my_tgt_pid)
720 {
721  using Kokkos::View;
722  using Kokkos::subview;
723  using Kokkos::parallel_for;
724  using Kokkos::MemoryUnmanaged;
725  using packet_type = Packet;
726  using local_map_type = LocalMap;
727  using local_graph_type = LocalGraph;
728  using buffer_device_type = BufferDevice;
729  using device_type = typename LocalMap::device_type;
730  using LO = typename LocalMap::local_ordinal_type;
731  using execution_space = typename device_type::execution_space;
732  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
733  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
734 
735  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
736 
737  const size_t N = tgt_num_rows;
738  const size_t mynnz = tgt_num_nonzeros;
739 
740  // In the case of reduced communicators, the sourceGraph won't have
741  // the right "my_pid", so thus we have to supply it.
742  const int my_pid = my_tgt_pid;
743 
744  // FIXME (mfh 24 Jun 2019)
745  //
746  // 1. Only zero the entries of tgt_rowptr that actually need it.
747  // 2. Consider merging these three kernels into one.
748 
749  // Zero the rowptr
750  parallel_for(
751  range_policy(0, N+1),
752  KOKKOS_LAMBDA(const size_t i) {
753  tgt_rowptr(i) = 0;
754  }
755  );
756 
757  // same IDs: Always first, always in the same place
758  parallel_for(
759  range_policy(0, num_same_ids),
760  KOKKOS_LAMBDA(const size_t i) {
761  const LO tgt_lid = static_cast<LO>(i);
762  const LO src_lid = static_cast<LO>(i);
763  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
764  - local_graph.row_map(src_lid);
765  }
766  );
767 
768  // Permute IDs: Still local, but reordered
769  const size_type num_permute_to_lids = permute_to_lids.extent(0);
770  parallel_for(
771  range_policy(0, num_permute_to_lids),
772  KOKKOS_LAMBDA(const size_t i) {
773  const LO tgt_lid = permute_to_lids(i);
774  const LO src_lid = permute_from_lids(i);
775  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
776  - local_graph.row_map(src_lid);
777  }
778  );
779 
780  // Get the offsets from the number of packets per LID
781  const size_type num_import_lids = import_lids.extent(0);
782  View<size_t*, device_type> offsets("offsets", num_import_lids+1);
783  computeOffsetsFromCounts(offsets, num_packets_per_lid);
784 
785 #ifdef HAVE_TPETRA_DEBUG
786  {
787  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
788  const bool condition =
789  nth_offset_h != static_cast<size_t>(imports.extent(0));
790  TEUCHOS_TEST_FOR_EXCEPTION
791  (condition, std::logic_error, prefix
792  << "The final offset in bytes " << nth_offset_h
793  << " != imports.size() = " << imports.extent(0)
794  << ". Please report this bug to the Tpetra developers.");
795  }
796 #endif // HAVE_TPETRA_DEBUG
797 
798  // Setup row pointers for remotes
799  setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
800  tgt_rowptr, import_lids, imports, num_packets_per_lid);
801 
802  // If multiple processes contribute to the same row, we may need to
803  // update row offsets. This tracks that.
804  View<size_t*, device_type> new_start_row("new_start_row", N+1);
805 
806  // Turn row length into a real CRS row pointer
807  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
808  {
809  auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
810  bool condition = nth_tgt_rowptr_h != mynnz;
811  TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
812  prefix << "CRS_rowptr[last] = " <<
813  nth_tgt_rowptr_h << "!= mynnz = " << mynnz << ".");
814  }
815 
816  // SameIDs: Copy the data over
817  copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
818  tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
819 
820  copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
821  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
822  local_graph, local_col_map, my_pid);
823 
824  if (imports.extent(0) <= 0) {
825  return;
826  }
827 
828  unpackAndCombineIntoCrsArrays2<
829  packet_type,local_graph_type,local_map_type,buffer_device_type>(
830  tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
831  num_packets_per_lid, local_graph, local_col_map, my_pid);
832 
833  return;
834 }
835 
836 } // namespace UnpackAndCombineCrsGraphImpl
837 
871 template<class LO, class GO, class Node>
872 void
874  CrsGraph<LO, GO, Node>& graph,
875  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,Node>::packet_type>& imports,
876  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
877  const Teuchos::ArrayView<const LO>& importLIDs,
878  size_t /* constantNumPackets */,
879  Distributor & /* distor */,
880  CombineMode /* combineMode */)
881 {
882 
883  TEUCHOS_TEST_FOR_EXCEPTION(!graph.isGloballyIndexed(), std::invalid_argument,
884  "Graph must be globally indexed!");
885 
886 
887  using Kokkos::View;
888  using UnpackAndCombineCrsGraphImpl::unpackAndCombine;
889  using graph_type = CrsGraph<LO,GO,Node>;
890  using device_type = typename Node::device_type;
891  using buffer_device_type = typename graph_type::buffer_device_type;
892  using execution_space = typename device_type::execution_space;
893  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
894  using row_ptrs_type = typename graph_type::local_graph_type::row_map_type::non_const_type;
895  using indices_type = typename graph_type::t_GlobalOrdinal_1D;
896 
897  // Convert all Teuchos::Array to Kokkos::View.
898 
899  buffer_device_type bufferOutputDevice;
900 
901  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
902  // them to device. Since unpacking is done directly in to the local graph
903  // (lclGraph), no copying needs to be performed after unpacking.
904  auto imports_d =
905  create_mirror_view_from_raw_host_array(bufferOutputDevice,
906  imports.getRawPtr(), imports.size(),
907  true, "imports");
908 
909  auto num_packets_per_lid_d =
910  create_mirror_view_from_raw_host_array(bufferOutputDevice,
911  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
912  true, "num_packets_per_lid");
913 
914  auto import_lids_d =
915  create_mirror_view_from_raw_host_array(bufferOutputDevice,
916  importLIDs.getRawPtr(), importLIDs.size(),
917  true, "import_lids");
918 
919  // We are OK using the protected data directly (k_*) because this function is
920  // a friend of CrsGraph
921  indices_type indices("indices", graph.k_gblInds1D_.extent(0));
922  Kokkos::deep_copy(indices, graph.k_gblInds1D_);
923 
924  row_ptrs_type row_ptrs_beg("row_ptrs_beg", graph.k_rowPtrs_.extent(0));
925  Kokkos::deep_copy(row_ptrs_beg, graph.k_rowPtrs_);
926 
927  const size_t N = (row_ptrs_beg.extent(0) == 0 ? 0 : row_ptrs_beg.extent(0) - 1);
928  row_ptrs_type row_ptrs_end("row_ptrs_end", N);
929 
930  bool refill_num_row_entries = false;
931  if (graph.k_numRowEntries_.extent(0) > 0) {
932  // Case 1: Packed storage
933  refill_num_row_entries = true;
934  auto num_row_entries = graph.k_numRowEntries_;
935  Kokkos::parallel_for("Fill end row pointers", range_policy(0, N),
936  KOKKOS_LAMBDA(const size_t i){
937  row_ptrs_end(i) = row_ptrs_beg(i) + num_row_entries(i);
938  });
939 
940  } else {
941  // mfh If packed storage, don't need row_ptrs_end to be separate allocation;
942  // could just have it alias row_ptrs_beg+1.
943 
944  // Case 2: Packed storage
945  Kokkos::parallel_for("Fill end row pointers",
946  range_policy(0, N), KOKKOS_LAMBDA(const size_t i){
947  row_ptrs_end(i) = row_ptrs_beg(i+1);
948  });
949  }
950 
951  // Now do the actual unpack!
952  unpackAndCombine<LO, GO, row_ptrs_type, indices_type, buffer_device_type>
953  (row_ptrs_beg, row_ptrs_end, indices, imports_d,
954  num_packets_per_lid_d, import_lids_d, false);
955 
956  // mfh Later, permit graph to be locally indexed, and check whether
957  // incoming column indices are in the column Map. If not, error.
958  if (refill_num_row_entries) {
959  Kokkos::parallel_for("Fill num entries",
960  range_policy(0, N), KOKKOS_LAMBDA(const size_t i){
961  graph.k_numRowEntries_(i) = row_ptrs_end(i) - row_ptrs_beg(i);
962  });
963  }
964  graph.k_rowPtrs_ = row_ptrs_beg;
965  graph.k_gblInds1D_ = indices;
966 
967  return;
968 }
969 
// Placeholder for a Kokkos::DualView-based variant of the unpack routine.
//
// As written, this function always throws std::logic_error with the message
// "METHOD NOT COMPLETE"; every parameter is unnamed (commented out) and
// unused.  The draft implementation below is wrapped in "#if 0" and is
// therefore never compiled.
970 template<class LO, class GO, class Node>
971 void
972 unpackCrsGraphAndCombineNew(
973  CrsGraph<LO, GO, Node>& /* sourceGraph */,
974  const Kokkos::DualView<const typename CrsGraph<LO,GO,Node>::packet_type*,
975  typename CrsGraph<LO,GO,Node>::buffer_device_type>& /* imports */,
976  const Kokkos::DualView<const size_t*,
977  typename CrsGraph<LO,GO,Node>::buffer_device_type>& /* numPacketsPerLID */,
978  const Kokkos::DualView<const LO*,
979  typename CrsGraph<LO,GO,Node>::buffer_device_type>& /* importLIDs */,
980  const size_t /* constantNumPackets */,
981  Distributor& /* distor */,
982  const CombineMode /* combineMode */)
983 {
// Unconditional failure: callers must not rely on this entry point yet.
984  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "METHOD NOT COMPLETE");
// Disabled draft.  NOTE(review): it refers to the parameters by name
// (numPacketsPerLID, importLIDs, imports), but the signature above leaves
// them unnamed, so the names must be restored before this can be enabled.
985 #if 0
986  using UnpackAndCombineCrsGraphImpl::unpackAndCombine;
988  using Kokkos::View;
989  using device_type = typename Node::device_type;
990  using graph_type = CrsGraph<LO, GO, Node>;
991  using packet_type = typename graph_type::packet_type;
992  using local_graph_type = typename graph_type::local_graph_type;
993  using buffer_device_type = typename graph_type::buffer_device_type;
994  using buffer_memory_space = typename buffer_device_type::memory_space;
995  using memory_space = typename device_type::memory_space;
996 
997  using row_ptrs_type = typename graph_type::local_graph_type::row_map_type::non_const_type;
998  using execution_space = typename device_type::execution_space;
999  using indices_type = Kokkos::View<GO*, execution_space>;
1000 
1001  static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
1002  "Node::device_type and LocalGraph::device_type must be "
1003  "the same.");
1004 
// Sync the packet counts to device through a non-const alias of the DualView.
1005  {
1006  auto numPacketsPerLID_nc = castAwayConstDualView(numPacketsPerLID);
1007  numPacketsPerLID_nc.sync_device ();
1008  }
1009  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
1010 
// importLIDs is asserted to be current on device already; it is not synced here.
1011  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1012  auto import_lids_d = importLIDs.view_device ();
1013 
// Same sync-through-non-const-alias step for the packed import buffer.
1014  {
1015  auto imports_nc = castAwayConstDualView(imports);
1016  imports_nc.sync_device ();
1017  }
1018  auto imports_d = imports.view_device ();
1019 
1020  // Now do the actual unpack!
1021  // TJF: Should be grabbed from the Graph
// The three views below are default-constructed here, not taken from the graph.
1022  indices_type indices;
1023  row_ptrs_type row_ptrs_beg;
1024  row_ptrs_type row_ptrs_end;
1025  unpackAndCombine<LO,packet_type,row_ptrs_type,indices_type,device_type,buffer_device_type>(
1026  row_ptrs_beg, row_ptrs_end, indices, imports_d,
1027  num_packets_per_lid_d, import_lids_d, false);
1028 #endif // 0
1029 }
1030 
// unpackAndCombineWithOwningPIDsCount (the name matches the error-message
// prefix below and the size_t-returning explicit instantiation at the end of
// this file; the line carrying the function name is absent from this listing).
//
// Validates the inputs, copies the host arrays permuteFromLIDs, imports, and
// numPacketsPerLID into views, and passes them together with the source
// graph's local graph and numSameIDs to an implementation routine.
// Returns size_t.
//
// Throws std::invalid_argument if
//   - permuteToLIDs.size() != permuteFromLIDs.size(),
//   - sourceGraph is not locally indexed, or
//   - importLIDs.size() != numPacketsPerLID.size().
//
// constantNumPackets, distor, and combineMode are unnamed and unused.
1080 template<class LocalOrdinal, class GlobalOrdinal, class Node>
1081 size_t
1083  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> & sourceGraph,
1084  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1085  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type> &imports,
1086  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1087  size_t /* constantNumPackets */,
1088  Distributor &/* distor */,
1089  CombineMode /* combineMode */,
1090  size_t numSameIDs,
1091  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1092  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1093 {
1094  using Kokkos::MemoryUnmanaged;
1095  using Kokkos::View;
1096  using device_type = typename Node::device_type;
1097  using packet_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type;
1098  using local_graph_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_type;
1099  using buffer_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type;
1100  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1101 
// permuteToLIDs is used only for this size check; it is not forwarded below.
1102  TEUCHOS_TEST_FOR_EXCEPTION
1103  (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1104  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
1105  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1106  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1107  // process, then the graph is neither locally nor globally indexed.
1108  const bool locallyIndexed = sourceGraph.isLocallyIndexed();
1109  TEUCHOS_TEST_FOR_EXCEPTION
1110  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1111  "CrsGraph 'sourceGraph' must be locally indexed.");
// importLIDs is likewise only size-checked here.
1112  TEUCHOS_TEST_FOR_EXCEPTION
1113  (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1114  prefix << "importLIDs.size() = " << importLIDs.size() << " != "
1115  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1116 
1117  auto local_graph = sourceGraph.getLocalGraph();
// NOTE(review): the call line that opens this initializer (listing line 1119)
// is missing; presumably create_mirror_view_from_raw_host_array(...) as for
// the two views that follow -- confirm against the real file.
1118  auto permute_from_lids_d =
1120  permuteFromLIDs.getRawPtr(),
1121  permuteFromLIDs.size(), true,
1122  "permute_from_lids");
1123  auto imports_d =
1124  create_mirror_view_from_raw_host_array(buffer_device_type(),
1125  imports.getRawPtr(),
1126  imports.size(), true,
1127  "imports");
1128  auto num_packets_per_lid_d =
1129  create_mirror_view_from_raw_host_array(buffer_device_type(),
1130  numPacketsPerLID.getRawPtr(),
1131  numPacketsPerLID.size(), true,
1132  "num_packets_per_lid");
1133 
// NOTE(review): the "return <callee><" line (listing line 1134) is missing;
// what remains is the callee's template-argument list and call arguments.
1135  packet_type,local_graph_type,buffer_device_type>(
1136  local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
1137 }
1138 
// unpackAndCombineIntoCrsArrays (name per the error-message prefix below and
// the matching explicit instantiation at the end of this file; the line
// carrying the function name is absent from this listing).
//
// Host-array front end.  It
//   1. checks that CRS_rowptr has TargetNumRows+1 entries, that the two
//      permute arrays have equal size, and that importLIDs and
//      numPacketsPerLID have equal size (std::invalid_argument otherwise);
//   2. sizes TargetPids to TargetNumNonzeros and fills it with -1;
//   3. copies every input/output host array into a Kokkos::View;
//   4. calls an implementation routine on those views together with the
//      source graph's local graph and local column map;
//   5. deep-copies the row pointer, column index, and target PID views back
//      into the caller's CRS_rowptr, CRS_colind, and TargetPids storage.
//
// constantNumPackets, distor, and combineMode are unnamed and unused.
1152 template<class LocalOrdinal, class GlobalOrdinal, class Node>
1153 void
1155  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> & sourceGraph,
1156  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1157  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type>& imports,
1158  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1159  const size_t /* constantNumPackets */,
1160  Distributor& /* distor */,
1161  const CombineMode /* combineMode */,
1162  const size_t numSameIDs,
1163  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1164  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1165  size_t TargetNumRows,
1166  size_t TargetNumNonzeros,
1167  const int MyTargetPID,
1168  const Teuchos::ArrayView<size_t>& CRS_rowptr,
1169  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1170  const Teuchos::ArrayView<const int>& SourcePids,
1171  Teuchos::Array<int>& TargetPids)
1172 {
1173  using Kokkos::View;
1174  using Kokkos::deep_copy;
1175  using Teuchos::outArg;
1176  using Teuchos::REDUCE_MAX;
1177  using Teuchos::reduceAll;
1178  using LO = LocalOrdinal;
1179  using GO = GlobalOrdinal;
1180  using crs_graph_type = CrsGraph<LO, GO, Node>;
1181  using packet_type = typename crs_graph_type::packet_type;
1182  using local_graph_type = typename crs_graph_type::local_graph_type;
1183  using buffer_device_type = typename crs_graph_type::buffer_device_type;
1184  using device_type = typename Node::device_type;
1185  using size_type = typename Teuchos::ArrayView<const LO>::size_type;
1186 
1187  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1188 
// --- Argument validation -------------------------------------------------
1189  TEUCHOS_TEST_FOR_EXCEPTION(
1190  TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
1191  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1192  CRS_rowptr.size() << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1193 
1194  TEUCHOS_TEST_FOR_EXCEPTION(
1195  permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1196  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
1197  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1198  const size_type numImportLIDs = importLIDs.size();
1199 
1200  TEUCHOS_TEST_FOR_EXCEPTION(
1201  numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
1202  prefix << "importLIDs.size() = " << numImportLIDs << " != "
1203  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1204 
1205  // Preseed TargetPids with -1 for local
// After assign(), TargetPids holds exactly TargetNumNonzeros entries, all -1.
1206  if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1207  TargetPids.resize(TargetNumNonzeros);
1208  }
1209  TargetPids.assign(TargetNumNonzeros, -1);
1210 
1211  // Grab pointers for sourceGraph
1212  auto local_graph = sourceGraph.getLocalGraph();
1213  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
1214 
1215  // Convert input arrays to Kokkos::View
// Two tag objects: views declared with buffer_device_type are built from
// bufferOutputDevice; the remaining views are declared with device_type.
1216  device_type outputDevice;
1217  buffer_device_type bufferOutputDevice;
1218 
// NOTE(review): for several of the views below, the line naming the function
// being called is missing from this listing (1220, 1225, 1245, 1250, 1255,
// 1260); presumably create_mirror_view_from_raw_host_array, as in the calls
// that are intact -- confirm against the real file.
1219  Kokkos::View<const LO*, buffer_device_type> import_lids_d =
1221  (bufferOutputDevice, importLIDs.getRawPtr(),
1222  importLIDs.size(), true, "import_lids");
1223 
1224  Kokkos::View<const packet_type*, buffer_device_type> imports_d =
1226  (bufferOutputDevice, imports.getRawPtr(),
1227  imports.size(), true, "imports");
1228 
1229  Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_d =
1230  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1231  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1232  true, "num_packets_per_lid");
1233 
1234  Kokkos::View<const LO*, buffer_device_type> permute_to_lids_d =
1235  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1236  permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1237  true, "permute_to_lids");
1238 
1239  Kokkos::View<const LO*, buffer_device_type> permute_from_lids_d =
1240  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1241  permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1242  true, "permute_from_lids");
1243 
// The next three views wrap caller-provided output storage (or SourcePids).
1244  Kokkos::View<size_t*, device_type> crs_rowptr_d =
1246  CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1247  true, "crs_rowptr");
1248 
1249  Kokkos::View<GO*, device_type> crs_colind_d =
1251  CRS_colind.getRawPtr(), CRS_colind.size(),
1252  true, "crs_colidx");
1253 
1254  Kokkos::View<const int*, device_type> src_pids_d =
1256  SourcePids.getRawPtr(), SourcePids.size(),
1257  true, "src_pids");
1258 
// tgt_pids_d is built from TargetPids after it was filled with -1 above.
1259  Kokkos::View<int*, device_type> tgt_pids_d =
1261  TargetPids.getRawPtr(), TargetPids.size(),
1262  true, "tgt_pids");
1263 
1264  using local_map_type = decltype(local_col_map);
// NOTE(review): the callee name (listing line 1265) is missing; what follows
// is its template-argument list and the call arguments.
1266  packet_type,local_graph_type,local_map_type,buffer_device_type>(
1267  local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1268  permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1269  tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
1270 
1271  // FIXME (mfh 25 Jun 2019) HostMirror of CudaUVMSpace is CudaUVMSpace!!!
1272 
1273  // Copy outputs back to host
// Each *_h view is constructed directly on the caller's raw pointer, so the
// deep_copy writes the results into CRS_rowptr / CRS_colind / TargetPids.
1274  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1275  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1276  deep_copy(crs_rowptr_h, crs_rowptr_d);
1277 
1278  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1279  CRS_colind.getRawPtr(), CRS_colind.size());
1280  deep_copy(crs_colind_h, crs_colind_d);
1281 
1282  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1283  TargetPids.getRawPtr(), TargetPids.size());
1284  deep_copy(tgt_pids_h, tgt_pids_d);
1285 
1286 }
1287 
1288 } // namespace Details
1289 } // namespace Tpetra
1290 
// Explicit-instantiation macro.  For one (LO, GO, NT) combination it emits
// "template" instantiation declarations for the four Details:: entry points:
//   - unpackCrsGraphAndCombine
//   - unpackCrsGraphAndCombineNew
//   - unpackAndCombineIntoCrsArrays
//   - unpackAndCombineWithOwningPIDsCount
// The parameter lists here must stay in step with the function definitions.
// The names are written as Details::..., so the macro is presumably expanded
// inside namespace Tpetra -- confirm at the expansion site.
// Keep comments out of the macro body: every line below ends in a
// line-continuation backslash.
1291 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
1292  template void \
1293  Details::unpackCrsGraphAndCombine<LO, GO, NT>( \
1294  CrsGraph<LO, GO, NT>&, \
1295  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1296  const Teuchos::ArrayView<const size_t>&, \
1297  const Teuchos::ArrayView<const LO>&, \
1298  size_t, \
1299  Distributor&, \
1300  CombineMode); \
1301  template void \
1302  Details::unpackCrsGraphAndCombineNew<LO, GO, NT>( \
1303  CrsGraph<LO, GO, NT>&, \
1304  const Kokkos::DualView<const CrsGraph<LO, GO, NT>::packet_type*, \
1305  CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1306  const Kokkos::DualView<const size_t*, \
1307  CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1308  const Kokkos::DualView<const LO*, \
1309  CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1310  const size_t, \
1311  Distributor&, \
1312  const CombineMode); \
1313  template void \
1314  Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
1315  const CrsGraph<LO, GO, NT> &, \
1316  const Teuchos::ArrayView<const LO>&, \
1317  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1318  const Teuchos::ArrayView<const size_t>&, \
1319  const size_t, \
1320  Distributor&, \
1321  const CombineMode, \
1322  const size_t, \
1323  const Teuchos::ArrayView<const LO>&, \
1324  const Teuchos::ArrayView<const LO>&, \
1325  size_t, \
1326  size_t, \
1327  const int, \
1328  const Teuchos::ArrayView<size_t>&, \
1329  const Teuchos::ArrayView<GO>&, \
1330  const Teuchos::ArrayView<const int>&, \
1331  Teuchos::Array<int>&); \
1332  template size_t \
1333  Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
1334  const CrsGraph<LO, GO, NT> &, \
1335  const Teuchos::ArrayView<const LO> &, \
1336  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
1337  const Teuchos::ArrayView<const size_t>&, \
1338  size_t, \
1339  Distributor &, \
1340  CombineMode, \
1341  size_t, \
1342  const Teuchos::ArrayView<const LO>&, \
1343  const Teuchos::ArrayView<const LO>&);
1344 
1345 #endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
t_GlobalOrdinal_1D k_gblInds1D_
Global column indices for all rows.
Declaration of the Tpetra::CrsGraph class.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
Functions for manipulating CRS arrays.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
Sets up and executes a communication plan for a Tpetra DistObject.
local_graph_type::row_map_type::const_type k_rowPtrs_
Row offsets for "1-D" storage.
CombineMode
Rule for combining data in an Import or Export.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
void unpackCrsGraphAndCombine(CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode)
Unpack the imported column indices and combine into graph.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
local_graph_type getLocalGraph() const
Get the local graph.
Declaration and definition of Tpetra::Details::getEntryOnHost.
global_ordinal_type packet_type
Type of each entry of the DistObject communication buffer.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.