Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
43 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
44 
45 #include "TpetraCore_config.h"
46 #include "Teuchos_Array.hpp"
47 #include "Teuchos_ArrayView.hpp"
53 #include "Tpetra_CrsGraph_decl.hpp"
56 #include "Kokkos_Core.hpp"
57 #include <memory>
58 #include <string>
59 
78 
79 namespace Tpetra {
80 
81 #ifndef DOXYGEN_SHOULD_SKIP_THIS
82 // Forward declaration of Distributor
83 class Distributor;
84 #endif // DOXYGEN_SHOULD_SKIP_THIS
85 
86 //
87 // Users must never rely on anything in the Details namespace.
88 //
89 namespace Details {
90 
91 namespace UnpackAndCombineCrsGraphImpl {
92 
102 template<class Packet, class GO, class Device, class BufferDevice>
103 KOKKOS_FUNCTION int
104 unpackRow(typename Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
105  typename Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
106  const Kokkos::View<const Packet*,BufferDevice>& imports,
107  const size_t offset,
108  const size_t num_ent)
109 {
110  using size_type = typename Kokkos::View<GO*,Device>::size_type;
111 
112  if (num_ent == 0) {
113  // Empty rows always take zero bytes, to ensure sparsity.
114  return 0;
115  }
116 
117  // Unpack GIDs
118  for (size_type k=0; k<num_ent; k++)
119  gids_out(k) = imports(offset+k);
120 
121  // Unpack PIDs
122  if (pids_out.size() > 0) {
123  for (size_type k=0; k<num_ent; k++)
124  pids_out(k) = static_cast<int>(imports(offset+num_ent+k));
125  }
126 
127  return 0;
128 }
129 
140 template<class LocalOrdinal, class Packet, class RowView,
141  class IndicesView, class Device, class BufferDevice>
143 
144  using LO = LocalOrdinal;
145  using GO = typename IndicesView::value_type;
146  using packet_type = Packet;
147  using row_ptrs_type = RowView;
148  using indices_type = IndicesView;
149  using buffer_device_type = BufferDevice;
150 
151  using device_type = Device;
152  using execution_space = typename device_type::execution_space;
153 
154  using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
155  using offsets_type = Kokkos::View<const size_t*, device_type>;
156  using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
157  using import_lids_type = Kokkos::View<const LO*, device_type>;
158 
159  using gids_scratch_type = Kokkos::View<GO*, device_type>;
160  using pids_scratch_type = Kokkos::View<int*,device_type>;
161 
162  row_ptrs_type row_ptrs_beg;
163  row_ptrs_type row_ptrs_end;
164  indices_type indices;
165  input_buffer_type imports;
166  num_packets_per_lid_type num_packets_per_lid;
167  import_lids_type import_lids;
168  offsets_type offsets;
169  size_t max_num_ent;
170  bool unpack_pids;
171  Kokkos::Experimental::UniqueToken<execution_space,
172  Kokkos::Experimental::UniqueTokenScope::Global> tokens;
173  gids_scratch_type gids_scratch;
174  pids_scratch_type pids_scratch;
175 
176  public:
177  using value_type = Kokkos::pair<int, LO>;
178 
180  const row_ptrs_type& row_ptrs_beg_in,
181  row_ptrs_type& row_ptrs_end_in,
182  indices_type& indices_in,
183  const input_buffer_type& imports_in,
184  const num_packets_per_lid_type& num_packets_per_lid_in,
185  const import_lids_type& import_lids_in,
186  const offsets_type& offsets_in,
187  const size_t max_num_ent_in,
188  const bool unpack_pids_in) :
189  row_ptrs_beg(row_ptrs_beg_in),
190  row_ptrs_end(row_ptrs_end_in),
191  indices(indices_in),
192  imports(imports_in),
193  num_packets_per_lid(num_packets_per_lid_in),
194  import_lids(import_lids_in),
195  offsets(offsets_in),
196  max_num_ent(max_num_ent_in),
197  unpack_pids(unpack_pids_in),
198  tokens(execution_space()),
199  gids_scratch("gids_scratch", tokens.size() * max_num_ent),
200  pids_scratch("pids_scratch", tokens.size() * max_num_ent)
201  {}
202 
203  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
204  {
205  using Tpetra::Details::OrdinalTraits;
206  dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
207  }
208 
209  KOKKOS_INLINE_FUNCTION void
210  join(volatile value_type& dst, const volatile value_type& src) const
211  {
212  // `dst` should reflect the first (least) bad index and
213  // all other associated error codes and data. Thus, we need only
214  // check if the `src` object shows an error and if its associated
215  // bad index is less than `dst`'s bad index.
216  using Tpetra::Details::OrdinalTraits;
217  if (src.second != OrdinalTraits<LO>::invalid()) {
218  // An error in the src; check if
219  // 1. `dst` shows errors
220  // 2. If `dst` does show errors, if src's bad index is less than
221  // *this' bad index
222  if (dst.second == OrdinalTraits<LO>::invalid() ||
223  src.second < dst.second) {
224  dst = src;
225  }
226  }
227  }
228 
229  KOKKOS_INLINE_FUNCTION
230  void operator()(const LO i, value_type& dst) const
231  {
232  using Kokkos::View;
233  using Kokkos::subview;
234  using Kokkos::MemoryUnmanaged;
235  using size_type = typename execution_space::size_type;
236  using slice = typename Kokkos::pair<size_type, size_type>;
237 
238  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
239  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
240 
241  const size_t num_packets_this_lid = num_packets_per_lid(i);
242  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
243  : num_packets_this_lid;
244  if (unpack_pids && num_packets_this_lid%2 != 0) {
245  // Attempting to unpack PIDs, but num_packets_this_lid is not even; this
246  // should never
247  dst = Kokkos::make_pair(1, i);
248  return;
249  }
250 
251  // Only unpack data if there is a nonzero number to unpack
252  if (num_ent == 0) {
253  return;
254  }
255 
256  // there is actually something in the row
257  const size_t buf_size = imports.size();
258  const size_t offset = offsets(i);
259 
260  if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
261  dst = Kokkos::make_pair(2, i); // out of bounds
262  return;
263  }
264 
265  // Get subviews in to the scratch arrays. The token returned from acquire
266  // is an integer in [0, tokens.size()). It is used to grab a unique (to
267  // this thread) subview of the scratch arrays.
268  const size_type token = tokens.acquire();
269  const size_t a = static_cast<size_t>(token) * max_num_ent;
270  const size_t b = a + num_ent;
271  gids_out_type gids_out = subview(gids_scratch, slice(a, b));
272  pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
273 
274  // Unpack this row!
275  int err = unpackRow<packet_type,GO,device_type,buffer_device_type>(
276  gids_out, pids_out, imports, offset, num_ent);
277 
278  if (err != 0) {
279  dst = Kokkos::make_pair(3, i);
280  tokens.release(token);
281  return;
282  }
283 
284  auto import_lid = import_lids(i);
285  for (size_t k = 0; k < num_ent; ++k) {
286  indices(row_ptrs_end(import_lid)) = gids_out(k);
287  // this is OK; don't need atomic, since LIDs to pack don't have repeats.
288  row_ptrs_end(import_lid) += 1;
289  }
290 
291  tokens.release(token);
292  }
293 
294 };
295 
296 template<class NumPackets, class ImportLids, class Device>
297 Kokkos::UnorderedMap<typename ImportLids::non_const_value_type,
298  typename NumPackets::non_const_value_type,
299  Device>
300 computeCrsPadding(const NumPackets& num_packets_per_lid,
301  const ImportLids& import_lids,
302  const bool unpack_pids)
303 {
304  // Create a mapping of {LID: extra space needed} to rapidly look up which LIDs
305  // need additional padding.
306  using key_type = typename ImportLids::non_const_value_type;
307  using val_type = typename NumPackets::non_const_value_type;
308  Kokkos::UnorderedMap<key_type, val_type, Device> padding(import_lids.size());
309  auto policy = Kokkos::RangePolicy<typename Device::execution_space>(0, import_lids.size());
310  Kokkos::parallel_for("Fill padding", policy,
311  KOKKOS_LAMBDA(typename ImportLids::size_type i) {
312  auto how_much_padding = (unpack_pids) ? num_packets_per_lid(i)/2
313  : num_packets_per_lid(i);
314  padding.insert(import_lids(i), how_much_padding);
315  }
316  );
317  TEUCHOS_TEST_FOR_EXCEPTION(padding.failed_insert(), std::runtime_error,
318  "computeCrsPadding: failed to insert one or more indices in to padding map");
319  return padding;
320 }
321 
328 template<class LocalOrdinal, class Packet, class RowView,
329  class IndicesView, class Device, class BufferDevice>
330 void
331 unpackAndCombine(
332  RowView& row_ptrs_beg,
333  RowView& row_ptrs_end,
334  IndicesView& indices,
335  const Kokkos::View<const Packet*, BufferDevice, Kokkos::MemoryUnmanaged>& imports,
336  const Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>& num_packets_per_lid,
337  const Kokkos::View<const LocalOrdinal*, Device, Kokkos::MemoryUnmanaged>& import_lids,
338  const bool unpack_pids)
339 {
340 
341  using ImportLidsView = Kokkos::View<const LocalOrdinal*, Device, Kokkos::MemoryUnmanaged>;
342  using NumPacketsView = Kokkos::View<const size_t*, BufferDevice, Kokkos::MemoryUnmanaged>;
343  using LO = LocalOrdinal;
344  using device_type = Device;
345  using execution_space = typename device_type::execution_space;
346  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
347  using unpack_functor_type = UnpackAndCombineFunctor<LO,Packet,RowView,IndicesView,Device,BufferDevice>;
348 
349  const char prefix[] =
350  "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
351 
352  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
353  if (num_import_lids == 0) {
354  // Nothing to unpack
355  return;
356  }
357 
358  // Resize row pointers and indices to accommodate incoming data
359  auto padding = computeCrsPadding<NumPacketsView,ImportLidsView,Device>(
360  num_packets_per_lid, import_lids, unpack_pids);
361  padCrsArrays(row_ptrs_beg, row_ptrs_end, indices, padding);
362 
363  // Get the offsets
364  Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids+1);
365  computeOffsetsFromCounts(offsets, num_packets_per_lid);
366 
367  // Determine the maximum number of entries in any row in the graph. The
368  // maximum number of entries is needed to allocate unpack buffers on the
369  // device.
370  size_t max_num_ent;
371  Kokkos::parallel_reduce("MaxReduce",
372  range_policy(0, static_cast<LO>(num_packets_per_lid.size())),
373  KOKKOS_LAMBDA(const LO& i, size_t& running_max_num_ent) {
374  size_t num_packets_this_lid = num_packets_per_lid(i);
375  size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
376  : num_packets_this_lid;
377  if (num_ent > running_max_num_ent) running_max_num_ent = num_ent;
378  }, Kokkos::Max<size_t>(max_num_ent));
379 
380  // Now do the actual unpack!
381  unpack_functor_type f(row_ptrs_beg, row_ptrs_end, indices,
382  imports, num_packets_per_lid, import_lids, offsets,
383  max_num_ent, unpack_pids);
384 
385  typename unpack_functor_type::value_type x;
386  Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
387  auto x_h = x.to_std_pair();
388  TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
389  prefix << "UnpackAndCombineFunctor reported error code "
390  << x_h.first << " for the first bad row " << x_h.second);
391 
392  return;
393 }
394 
// Counts graph entries from three sources and returns their sum:
//   1. rows [0, num_same_ids) of local_graph (row lengths taken from
//      consecutive row_map entries),
//   2. the rows of local_graph listed in permute_from_lids,
//   3. num_packets_per_lid(i) / 2 for every i.
// The imports argument is unnamed and not read.
395 template<class Packet, class LocalGraph, class BufferDevice>
396 size_t
// NOTE(review): listing line 397 -- presumably the function name and the
// opening parenthesis -- is absent from this view; confirm against the
// original header before relying on this block.
398  const LocalGraph& local_graph,
399  const Kokkos::View<const typename LocalGraph::data_type*,
400  typename LocalGraph::device_type,
401  Kokkos::MemoryUnmanaged> permute_from_lids,
402  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
403  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
404  const size_t num_same_ids)
405 {
406  using Kokkos::parallel_reduce;
407  using local_graph_type = LocalGraph;
408  using LO = typename local_graph_type::data_type;
409  using device_type = typename local_graph_type::device_type;
410  using execution_space = typename device_type::execution_space;
411  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
412 
413  size_t count = 0;
414  LO num_items;
415 
416  // Number of graph entries to unpack (returned by this function).
  // First contribution: rows whose index is the same in source and target.
417  num_items = static_cast<LO>(num_same_ids);
418  if (num_items) {
419  size_t kcnt = 0;
420  parallel_reduce(
421  range_policy(0, num_items),
422  KOKKOS_LAMBDA(const LO lid, size_t& update) {
423  update += static_cast<size_t>(local_graph.row_map[lid+1]
424  -local_graph.row_map[lid]);
425  }, kcnt);
426  count += kcnt;
427  }
428 
429  // Count entries copied directly from the source graph with permuting.
430  num_items = static_cast<LO>(permute_from_lids.extent(0));
431  if (num_items) {
432  size_t kcnt = 0;
433  parallel_reduce(
434  range_policy(0, num_items),
435  KOKKOS_LAMBDA(const LO i, size_t& update) {
436  const LO lid = permute_from_lids(i);
437  update += static_cast<size_t>(local_graph.row_map[lid+1]
438  - local_graph.row_map[lid]);
439  }, kcnt);
440  count += kcnt;
441  }
442 
443  {
444  // Count entries received from other MPI processes.
  // Each row's packet count is halved; presumably every entry travels as a
  // (column index, process rank) pair of packets -- TODO confirm with the
  // packing code.
445  size_t tot_num_ent = 0;
446  parallel_reduce("SumReduce",
447  num_packets_per_lid.size(),
448  KOKKOS_LAMBDA(const int& i, size_t& lsum) {
449  lsum += num_packets_per_lid(i) / 2;
450  }, Kokkos::Sum<size_t>(tot_num_ent));
451  count += tot_num_ent;
452  }
453 
454  return count;
455 }
456 
458 template<class Packet, class LO, class Device, class BufferDevice>
459 void
460 setupRowPointersForRemotes(
461  const Kokkos::View<size_t*, Device>& tgt_rowptr,
462  const Kokkos::View<const LO*, Device>& import_lids,
463  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
464  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
465 {
466  using Kokkos::parallel_reduce;
467  using device_type = Device;
468  using execution_space = typename device_type::execution_space;
469  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
470  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
471 
472  const size_type N = num_packets_per_lid.extent(0);
473  parallel_for("Setup row pointers for remotes",
474  range_policy(0, N),
475  KOKKOS_LAMBDA(const size_t i){
476  using atomic_incr_type = typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
477  const size_t num_packets_this_lid = num_packets_per_lid(i);
478  const size_t num_ent = num_packets_this_lid / 2;
479  Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
480  });
481 }
482 
483 // Convert array of row lengths to a CRS pointer array
484 template<class Device>
485 void
486 makeCrsRowPtrFromLengths(
487  const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
488  const Kokkos::View<size_t*,Device>& new_start_row)
489 {
490  using Kokkos::parallel_scan;
491  using device_type = Device;
492  using execution_space = typename device_type::execution_space;
493  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
494  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
495  const size_type N = new_start_row.extent(0);
496  parallel_scan(
497  range_policy(0, N),
498  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
499  auto cur_val = tgt_rowptr(i);
500  if (final) {
501  tgt_rowptr(i) = update;
502  new_start_row(i) = tgt_rowptr(i);
503  }
504  update += cur_val;
505  }
506  );
507 }
508 
509 template<class LocalGraph, class LocalMap>
510 void
511 copyDataFromSameIDs(
512  const Kokkos::View<typename LocalMap::global_ordinal_type*,
513  typename LocalMap::device_type>& tgt_colind,
514  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
515  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
516  const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
517  const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
518  const LocalGraph& local_graph,
519  const LocalMap& local_col_map,
520  const size_t num_same_ids,
521  const int my_pid)
522 {
523  using Kokkos::parallel_for;
524  using device_type = typename LocalMap::device_type;
525  using LO = typename LocalMap::local_ordinal_type;
526  using execution_space = typename device_type::execution_space;
527  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
528 
529  parallel_for(
530  range_policy(0, num_same_ids),
531  KOKKOS_LAMBDA(const size_t i) {
532  using atomic_incr_type =typename std::remove_reference<decltype(new_start_row(0))>::type;
533 
534  const LO src_lid = static_cast<LO>(i);
535  size_t src_row = local_graph.row_map(src_lid);
536 
537  const LO tgt_lid = static_cast<LO>(i);
538  const size_t tgt_row = tgt_rowptr(tgt_lid);
539 
540  const size_t nsr = local_graph.row_map(src_lid+1)
541  - local_graph.row_map(src_lid);
542  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
543 
544  for (size_t j=local_graph.row_map(src_lid);
545  j<local_graph.row_map(src_lid+1); ++j) {
546  LO src_col = local_graph.entries(j);
547  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
548  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
549  }
550  }
551  );
552 }
553 
554 template<class LocalGraph, class LocalMap>
555 void
556 copyDataFromPermuteIDs(
557  const Kokkos::View<typename LocalMap::global_ordinal_type*,
558  typename LocalMap::device_type>& tgt_colind,
559  const Kokkos::View<int*,
560  typename LocalMap::device_type>& tgt_pids,
561  const Kokkos::View<size_t*,
562  typename LocalMap::device_type>& new_start_row,
563  const Kokkos::View<size_t*,
564  typename LocalMap::device_type>& tgt_rowptr,
565  const Kokkos::View<const int*,
566  typename LocalMap::device_type>& src_pids,
567  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
568  typename LocalMap::device_type>& permute_to_lids,
569  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
570  typename LocalMap::device_type>& permute_from_lids,
571  const LocalGraph& local_graph,
572  const LocalMap& local_col_map,
573  const int my_pid)
574 {
575  using Kokkos::parallel_for;
576  using device_type = typename LocalMap::device_type;
577  using LO = typename LocalMap::local_ordinal_type;
578  using execution_space = typename device_type::execution_space;
579  using size_type = typename Kokkos::View<LO*,device_type>::size_type;
580  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
581 
582  const size_type num_permute_to_lids = permute_to_lids.extent(0);
583 
584  parallel_for(
585  range_policy(0, num_permute_to_lids),
586  KOKKOS_LAMBDA(const size_t i) {
587  using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
588 
589  const LO src_lid = permute_from_lids(i);
590  const size_t src_row = local_graph.row_map(src_lid);
591 
592  const LO tgt_lid = permute_to_lids(i);
593  const size_t tgt_row = tgt_rowptr(tgt_lid);
594 
595  size_t nsr = local_graph.row_map(src_lid+1)
596  - local_graph.row_map(src_lid);
597  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
598 
599  for (size_t j=local_graph.row_map(src_lid);
600  j<local_graph.row_map(src_lid+1); ++j) {
601  LO src_col = local_graph.entries(j);
602  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
603  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
604  }
605  }
606  );
607 }
608 
609 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
610 void
611 unpackAndCombineIntoCrsArrays2(
612  const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
613  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
614  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
615  const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
616  const Kokkos::View<const typename LocalMap::local_ordinal_type*, typename LocalMap::device_type>& import_lids,
617  const Kokkos::View<const Packet*, BufferDevice>& imports,
618  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
619  const LocalGraph& /* local_graph */,
620  const LocalMap /*& local_col_map*/,
621  const int my_pid)
622 {
623  using Kokkos::View;
624  using Kokkos::subview;
625  using Kokkos::MemoryUnmanaged;
626  using Kokkos::parallel_reduce;
627  using Kokkos::atomic_fetch_add;
628 
629  using packet_type = Packet;
630  using buffer_device_type = BufferDevice;
631  using device_type = typename LocalMap::device_type;
632  using LO = typename LocalMap::local_ordinal_type;
633  using GO = typename LocalMap::global_ordinal_type;
634  using execution_space = typename device_type::execution_space;
635  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
636  using slice = typename Kokkos::pair<size_type, size_type>;
637  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
638 
639  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
640  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
641 
642  const size_type num_import_lids = import_lids.size();
643  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
644 
645  // RemoteIDs: Loop structure following UnpackAndCombine
646  int gbl_err_count;
647  parallel_reduce("Unpack and combine into CRS",
648  range_policy(0, num_import_lids),
649  KOKKOS_LAMBDA(const size_t i, int& err) {
650  using atomic_incr_type = typename std::remove_reference< decltype( new_start_row(0) )>::type;
651  const size_t num_packets_this_lid = num_packets_per_lid(i);
652  const size_t num_ent = num_packets_this_lid / 2;
653  const size_t offset = offsets(i);
654  const LO lcl_row = import_lids(i);
655  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
656  const size_t end_row = start_row + num_ent;
657 
658  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
659  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
660 
661  err += unpackRow<packet_type,GO,device_type,buffer_device_type>(
662  gids_out, pids_out, imports, offset, num_ent);
663 
664  // Correct target PIDs.
665  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
666  const int pid = pids_out(j);
667  pids_out(j) = (pid != my_pid) ? pid : -1;
668  }
669  }, gbl_err_count);
670 
671  TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
672  std::invalid_argument, prefix <<
673  "Attempting to unpack PIDs, but num_ent is not even; this should never "
674  "happen! Please report this bug to the Tpetra developers.");
675 
676  return;
677 }
678 
679 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
680 void
682  const LocalGraph & local_graph,
683  const LocalMap & local_col_map,
684  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
685  typename LocalMap::device_type,
686  Kokkos::MemoryUnmanaged>& import_lids,
687  const Kokkos::View<const Packet*, BufferDevice>& imports,
688  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
689  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
690  typename LocalMap::device_type,
691  Kokkos::MemoryUnmanaged>& permute_to_lids,
692  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
693  typename LocalMap::device_type,
694  Kokkos::MemoryUnmanaged>& permute_from_lids,
695  const Kokkos::View<size_t*,
696  typename LocalMap::device_type,
697  Kokkos::MemoryUnmanaged>& tgt_rowptr,
698  const Kokkos::View<typename LocalMap::global_ordinal_type*,
699  typename LocalMap::device_type,
700  Kokkos::MemoryUnmanaged>& tgt_colind,
701  const Kokkos::View<const int*,
702  typename LocalMap::device_type,
703  Kokkos::MemoryUnmanaged>& src_pids,
704  const Kokkos::View<int*,
705  typename LocalMap::device_type,
706  Kokkos::MemoryUnmanaged>& tgt_pids,
707  const size_t num_same_ids,
708  const size_t tgt_num_rows,
709  const size_t tgt_num_nonzeros,
710  const int my_tgt_pid)
711 {
712  using Kokkos::View;
713  using Kokkos::subview;
714  using Kokkos::parallel_for;
715  using Kokkos::MemoryUnmanaged;
716  using packet_type = Packet;
717  using local_map_type = LocalMap;
718  using local_graph_type = LocalGraph;
719  using buffer_device_type = BufferDevice;
720  using device_type = typename LocalMap::device_type;
721  using LO = typename LocalMap::local_ordinal_type;
722  using execution_space = typename device_type::execution_space;
723  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
724  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
725 
726  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
727 
728  const size_t N = tgt_num_rows;
729  const size_t mynnz = tgt_num_nonzeros;
730 
731  // In the case of reduced communicators, the sourceGraph won't have
732  // the right "my_pid", so thus we have to supply it.
733  const int my_pid = my_tgt_pid;
734 
735  // Zero the rowptr
736  parallel_for(
737  range_policy(0, N+1),
738  KOKKOS_LAMBDA(const size_t i) {
739  tgt_rowptr(i) = 0;
740  }
741  );
742 
743  // same IDs: Always first, always in the same place
744  parallel_for(
745  range_policy(0, num_same_ids),
746  KOKKOS_LAMBDA(const size_t i) {
747  const LO tgt_lid = static_cast<LO>(i);
748  const LO src_lid = static_cast<LO>(i);
749  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
750  - local_graph.row_map(src_lid);
751  }
752  );
753 
754  // Permute IDs: Still local, but reordered
755  const size_type num_permute_to_lids = permute_to_lids.extent(0);
756  parallel_for(
757  range_policy(0, num_permute_to_lids),
758  KOKKOS_LAMBDA(const size_t i) {
759  const LO tgt_lid = permute_to_lids(i);
760  const LO src_lid = permute_from_lids(i);
761  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
762  - local_graph.row_map(src_lid);
763  }
764  );
765 
766  // Get the offsets from the number of packets per LID
767  const size_type num_import_lids = import_lids.extent(0);
768  View<size_t*, device_type> offsets("offsets", num_import_lids+1);
769  computeOffsetsFromCounts(offsets, num_packets_per_lid);
770 
771 #ifdef HAVE_TPETRA_DEBUG
772  {
773  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
774  const bool condition =
775  nth_offset_h != static_cast<size_t>(imports.extent(0));
776  TEUCHOS_TEST_FOR_EXCEPTION
777  (condition, std::logic_error, prefix
778  << "The final offset in bytes " << nth_offset_h
779  << " != imports.size() = " << imports.extent(0)
780  << ". Please report this bug to the Tpetra developers.");
781  }
782 #endif // HAVE_TPETRA_DEBUG
783 
784  // Setup row pointers for remotes
785  setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
786  tgt_rowptr, import_lids, imports, num_packets_per_lid);
787 
788  // If multiple processes contribute to the same row, we may need to
789  // update row offsets. This tracks that.
790  View<size_t*, device_type> new_start_row("new_start_row", N+1);
791 
792  // Turn row length into a real CRS row pointer
793  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
794  {
795  auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
796  bool condition = nth_tgt_rowptr_h != mynnz;
797  TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
798  prefix << "CRS_rowptr[last] = " <<
799  nth_tgt_rowptr_h << "!= mynnz = " << mynnz << ".");
800  }
801 
802  // SameIDs: Copy the data over
803  copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
804  tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
805 
806  copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
807  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
808  local_graph, local_col_map, my_pid);
809 
810  if (imports.extent(0) <= 0) {
811  return;
812  }
813 
814  unpackAndCombineIntoCrsArrays2<
815  packet_type,local_graph_type,local_map_type,buffer_device_type>(
816  tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
817  num_packets_per_lid, local_graph, local_col_map, my_pid);
818 
819  return;
820 }
821 
822 } // namespace UnpackAndCombineCrsGraphImpl
823 
// Unpack the imported column indices and combine them into `graph`.
//
// The graph must be globally indexed; otherwise std::invalid_argument is
// thrown. The three ArrayView inputs are turned into Kokkos views, the graph's
// row offsets (k_rowPtrs_) and global indices (k_gblInds1D_) are deep-copied
// into fresh views, UnpackAndCombineCrsGraphImpl::unpackAndCombine is run on
// those copies, and the copies are finally assigned back to the graph.
// constantNumPackets, distor and combineMode are accepted but unused.
//
// NOTE(review): the declarator line carrying the function name (Doxygen line
// 859, presumably unpackCrsGraphAndCombine) and line 904 are absent from this
// listing.
857 template<class LO, class GO, class Node>
858 void
860  CrsGraph<LO, GO, Node>& graph,
861  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,Node>::packet_type>& imports,
862  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
863  const Teuchos::ArrayView<const LO>& importLIDs,
864  size_t /* constantNumPackets */,
865  Distributor & /* distor */,
866  CombineMode /* combineMode */)
867 {
868 
869  TEUCHOS_TEST_FOR_EXCEPTION(!graph.isGloballyIndexed(), std::invalid_argument,
870  "Graph must be globally indexed!");
871 
872 
873  using Kokkos::View;
874  using UnpackAndCombineCrsGraphImpl::unpackAndCombine;
875  using graph_type = CrsGraph<LO,GO,Node>;
876  using device_type = typename Node::device_type;
877  using packet_type = typename graph_type::packet_type;
878  using buffer_device_type = typename graph_type::buffer_device_type;
879  using execution_space = typename device_type::execution_space;
880  typename execution_space::device_type outputDevice;
881  using buffer_execution_space = typename buffer_device_type::execution_space;
882  typename buffer_execution_space::device_type bufferOutputDevice;
883  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
884 
885  using row_ptrs_type = typename graph_type::local_graph_type::row_map_type::non_const_type;
886  using indices_type = typename graph_type::t_GlobalOrdinal_1D;
887 
888  // Convert all Teuchos::Array to Kokkos::View.
889 
890  // numPacketsPerLID, importLIDs, and imports are input, so we have to copy
891  // them to device. Since unpacking is done directly in to the local graph
892  // (lclGraph), no copying needs to be performed after unpacking.
893  auto imports_d =
894  create_mirror_view_from_raw_host_array(bufferOutputDevice,
895  imports.getRawPtr(), imports.size(),
896  true, "imports");
897 
898  auto num_packets_per_lid_d =
899  create_mirror_view_from_raw_host_array(bufferOutputDevice,
900  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
901  true, "num_packets_per_lid");
902 
903  auto import_lids_d =
905  importLIDs.getRawPtr(), importLIDs.size(),
906  true, "import_lids");
907 
908  // We are OK using the protected data directly (k_*) because this function is
909  // a friend of CrsGraph
910  indices_type indices("indices", graph.k_gblInds1D_.extent(0));
911  Kokkos::deep_copy(indices, graph.k_gblInds1D_);
912 
913  row_ptrs_type row_ptrs_beg("row_ptrs_beg", graph.k_rowPtrs_.extent(0));
914  Kokkos::deep_copy(row_ptrs_beg, graph.k_rowPtrs_);
915 
    // N is the number of rows: one fewer than the number of row offsets, or
    // zero when there are no offsets at all.
916  const size_t N = (row_ptrs_beg.extent(0) == 0 ? 0 : row_ptrs_beg.extent(0) - 1);
917  row_ptrs_type row_ptrs_end("row_ptrs_end", N);
918 
919  bool refill_num_row_entries = false;
920  if (graph.k_numRowEntries_.extent(0) > 0) {
921  // Case 1: Unpacked storage (each row's end is its start plus its entry count)
922  refill_num_row_entries = true;
923  auto num_row_entries = graph.k_numRowEntries_;
924  Kokkos::parallel_for("Fill end row pointers", range_policy(0, N),
925  KOKKOS_LAMBDA(const size_t i){
926  row_ptrs_end(i) = row_ptrs_beg(i) + num_row_entries(i);
927  });
928 
929  } else {
930  // mfh If packed storage, don't need row_ptrs_end to be separate allocation;
931  // could just have it alias row_ptrs_beg+1.
932 
933  // Case 2: Packed storage (row i ends exactly where row i+1 begins)
934  Kokkos::parallel_for("Fill end row pointers",
935  range_policy(0, N), KOKKOS_LAMBDA(const size_t i){
936  row_ptrs_end(i) = row_ptrs_beg(i+1);
937  });
938  }
939 
940  // Now do the actual unpack!
941  unpackAndCombine<LO,packet_type,row_ptrs_type,indices_type,device_type,buffer_device_type>(
942  row_ptrs_beg, row_ptrs_end, indices, imports_d,
943  num_packets_per_lid_d, import_lids_d, false);
944 
945  // mfh Later, permit graph to be locally indexed, and check whether
946  // incoming column indices are in the column Map. If not, error.
    // Only Case 1 keeps per-row counts, so only then are they recomputed from
    // the (possibly updated) begin/end offsets.
    // NOTE(review): unlike the Case 1 kernel above, this lambda reaches through
    // `graph` instead of a local copy of the view -- confirm this is safe for
    // device builds.
947  if (refill_num_row_entries) {
948  Kokkos::parallel_for("Fill num entries",
949  range_policy(0, N), KOKKOS_LAMBDA(const size_t i){
950  graph.k_numRowEntries_(i) = row_ptrs_end(i) - row_ptrs_beg(i);
951  });
952  }
    // Hand the working copies back to the graph.
953  graph.k_rowPtrs_ = row_ptrs_beg;
954  graph.k_gblInds1D_ = indices;
955 
956  return;
957 }
958 
// Placeholder for a Kokkos::DualView-based variant of the unpack routine.
//
// Every call throws std::logic_error with the message "METHOD NOT COMPLETE".
// All parameter names are commented out, so no argument is used. The draft
// implementation that follows the throw is compiled out with `#if 0`.
959 template<class LO, class GO, class Node>
960 void
961 unpackCrsGraphAndCombineNew(
962  CrsGraph<LO, GO, Node>& /* sourceGraph */,
963  const Kokkos::DualView<const typename CrsGraph<LO,GO,Node>::packet_type*,
964  typename CrsGraph<LO,GO,Node>::buffer_device_type>& /* imports */,
965  const Kokkos::DualView<const size_t*,
966  typename CrsGraph<LO,GO,Node>::buffer_device_type>& /* numPacketsPerLID */,
967  const Kokkos::DualView<const LO*,
968  typename CrsGraph<LO,GO,Node>::buffer_device_type>& /* importLIDs */,
969  const size_t /* constantNumPackets */,
970  Distributor& /* distor */,
971  const CombineMode /* combineMode */)
972 {
973  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, "METHOD NOT COMPLETE");
    // Disabled draft. It refers to imports, numPacketsPerLID and importLIDs by
    // name, so those parameters must be un-commented before it can be enabled.
974 #if 0
975  using UnpackAndCombineCrsGraphImpl::unpackAndCombine;
977  using Kokkos::View;
978  using device_type = typename Node::device_type;
979  using graph_type = CrsGraph<LO, GO, Node>;
980  using packet_type = typename graph_type::packet_type;
981  using local_graph_type = typename graph_type::local_graph_type;
982  using buffer_device_type = typename graph_type::buffer_device_type;
983  using buffer_memory_space = typename buffer_device_type::memory_space;
984  using memory_space = typename device_type::memory_space;
985 
986  using row_ptrs_type = typename graph_type::local_graph_type::row_map_type::non_const_type;
987  using execution_space = typename device_type::execution_space;
988  using indices_type = Kokkos::View<GO*, execution_space>;
989 
990  static_assert(std::is_same<device_type, typename local_graph_type::device_type>::value,
991  "Node::device_type and LocalGraph::device_type must be "
992  "the same.");
993 
994  {
995  auto numPacketsPerLID_nc = castAwayConstDualView(numPacketsPerLID);
996  numPacketsPerLID_nc.sync_device ();
997  }
998  auto num_packets_per_lid_d = numPacketsPerLID.view_device ();
999 
1000  TEUCHOS_ASSERT( ! importLIDs.need_sync_device () );
1001  auto import_lids_d = importLIDs.view_device ();
1002 
1003  {
1004  auto imports_nc = castAwayConstDualView(imports);
1005  imports_nc.sync_device ();
1006  }
1007  auto imports_d = imports.view_device ();
1008 
1009  // Now do the actual unpack!
1010  // TJF: Should be grabbed from the Graph
    // As written, the three views below are default-constructed rather than
    // taken from the graph.
1011  indices_type indices;
1012  row_ptrs_type row_ptrs_beg;
1013  row_ptrs_type row_ptrs_end;
1014  unpackAndCombine<LO,packet_type,row_ptrs_type,indices_type,device_type,buffer_device_type>(
1015  row_ptrs_beg, row_ptrs_end, indices, imports_d,
1016  num_packets_per_lid_d, import_lids_d, false);
1017 #endif // 0
1018 }
1019 
// Counting routine identified by its own error prefix as
// unpackAndCombineWithOwningPIDsCount; returns a size_t.
//
// Throws std::invalid_argument when permuteToLIDs and permuteFromLIDs differ
// in length, when sourceGraph is not locally indexed, or when importLIDs and
// numPacketsPerLID differ in length. permuteFromLIDs, imports and
// numPacketsPerLID are then turned into Kokkos views and passed, together with
// the local graph and numSameIDs, to an implementation routine.
// constantNumPackets, distor and combineMode are accepted but unused.
//
// NOTE(review): Doxygen lines 1071 (function name), 1108 (start of the first
// mirror call) and 1123 (start of the final call, presumably the `return`) are
// absent from this listing -- confirm against the real header.
1069 template<class LocalOrdinal, class GlobalOrdinal, class Node>
1070 size_t
1072  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> & sourceGraph,
1073  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
1074  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type> &imports,
1075  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1076  size_t /* constantNumPackets */,
1077  Distributor &/* distor */,
1078  CombineMode /* combineMode */,
1079  size_t numSameIDs,
1080  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1081  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
1082 {
1083  using Kokkos::MemoryUnmanaged;
1084  using Kokkos::View;
1085  using device_type = typename Node::device_type;
1086  using packet_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type;
1087  using local_graph_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_type;
1088  using buffer_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type;
1089  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
1090 
    // The permute lists are used pairwise, so they must have equal length.
1091  TEUCHOS_TEST_FOR_EXCEPTION
1092  (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1093  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
1094  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1095  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
1096  // process, then the graph is neither locally nor globally indexed.
1097  const bool locallyIndexed = sourceGraph.isLocallyIndexed();
1098  TEUCHOS_TEST_FOR_EXCEPTION
1099  (! locallyIndexed, std::invalid_argument, prefix << "The input "
1100  "CrsGraph 'sourceGraph' must be locally indexed.");
    // One packet count is expected per imported LID.
1101  TEUCHOS_TEST_FOR_EXCEPTION
1102  (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
1103  prefix << "importLIDs.size() = " << importLIDs.size() << " != "
1104  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1105 
1106  auto local_graph = sourceGraph.getLocalGraph();
1107  auto permute_from_lids_d =
1109  permuteFromLIDs.getRawPtr(),
1110  permuteFromLIDs.size(), true,
1111  "permute_from_lids");
1112  auto imports_d =
1113  create_mirror_view_from_raw_host_array(buffer_device_type(),
1114  imports.getRawPtr(),
1115  imports.size(), true,
1116  "imports");
1117  auto num_packets_per_lid_d =
1118  create_mirror_view_from_raw_host_array(buffer_device_type(),
1119  numPacketsPerLID.getRawPtr(),
1120  numPacketsPerLID.size(), true,
1121  "num_packets_per_lid");
1122 
1124  packet_type,local_graph_type,buffer_device_type>(
1125  local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
1126 }
1127 
// Fill caller-provided CRS arrays (CRS_rowptr, CRS_colind) and TargetPids;
// the error prefix names this routine
// Tpetra::Details::unpackAndCombineIntoCrsArrays.
//
// Throws std::invalid_argument when CRS_rowptr.size() != TargetNumRows + 1,
// when permuteToLIDs and permuteFromLIDs differ in length, or when importLIDs
// and numPacketsPerLID differ in length. TargetPids is sized to
// TargetNumNonzeros and filled with -1. All array arguments are then turned
// into Kokkos views, an implementation routine is invoked on them, and the
// row-pointer, column-index and target-PID views are deep-copied back into the
// caller's host arrays. constantNumPackets, distor and combineMode are
// accepted but unused.
//
// NOTE(review): Doxygen lines 1143 (function name) and 1246 (start of the
// implementation call) are absent from this listing.
1141 template<class LocalOrdinal, class GlobalOrdinal, class Node>
1142 void
1144  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> & sourceGraph,
1145  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
1146  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type>& imports,
1147  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
1148  const size_t /* constantNumPackets */,
1149  Distributor& /* distor */,
1150  const CombineMode /* combineMode */,
1151  const size_t numSameIDs,
1152  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
1153  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
1154  size_t TargetNumRows,
1155  size_t TargetNumNonzeros,
1156  const int MyTargetPID,
1157  const Teuchos::ArrayView<size_t>& CRS_rowptr,
1158  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
1159  const Teuchos::ArrayView<const int>& SourcePids,
1160  Teuchos::Array<int>& TargetPids)
1161 {
1162  using Kokkos::View;
1163  using Kokkos::deep_copy;
1164  using Teuchos::ArrayView;
1165  using Teuchos::outArg;
1166  using Teuchos::REDUCE_MAX;
1167  using Teuchos::reduceAll;
1168  using LO = LocalOrdinal;
1169  using packet_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type;
1170  using local_graph_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_type;
1171  using buffer_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type;
1172  using device_type = typename Node::device_type;
1173  using execution_space = typename device_type::execution_space;
1174  using buffer_execution_space = typename buffer_device_type::execution_space;
1175  using size_type = typename ArrayView<const LO>::size_type;
1176 
1177  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
1178 
    // The row-pointer array needs one more entry than there are rows.
1179  TEUCHOS_TEST_FOR_EXCEPTION(
1180  TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
1181  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
1182  CRS_rowptr.size() << "!= TargetNumRows+1 = " << TargetNumRows+1 << ".");
1183 
1184  TEUCHOS_TEST_FOR_EXCEPTION(
1185  permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
1186  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
1187  << "!= permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
1188  const size_type numImportLIDs = importLIDs.size();
1189 
1190  TEUCHOS_TEST_FOR_EXCEPTION(
1191  numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
1192  prefix << "importLIDs.size() = " << numImportLIDs << " != "
1193  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
1194 
1195  // Preseed TargetPids with -1 for local
    // NOTE(review): if Teuchos::Array::assign resizes the way
    // std::vector::assign does, the explicit resize is redundant -- confirm.
1196  if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
1197  TargetPids.resize(TargetNumNonzeros);
1198  }
1199  TargetPids.assign(TargetNumNonzeros, -1);
1200 
1201  // Grab pointers for sourceGraph
1202  auto local_graph = sourceGraph.getLocalGraph();
1203  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
1204 
1205  // Convert input arrays to Kokkos::View
    // imports and numPacketsPerLID use the buffer device; every other array
    // uses the graph's execution-space device.
1206  typename execution_space::device_type outputDevice;
1207  typename buffer_execution_space::device_type bufferOutputDevice;
1208 
1209  auto import_lids_d = create_mirror_view_from_raw_host_array(outputDevice,
1210  importLIDs.getRawPtr(), importLIDs.size(),
1211  true, "import_lids");
1212 
1213  auto imports_d = create_mirror_view_from_raw_host_array(bufferOutputDevice,
1214  imports.getRawPtr(), imports.size(),
1215  true, "imports");
1216 
1217  auto num_packets_per_lid_d = create_mirror_view_from_raw_host_array(bufferOutputDevice,
1218  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1219  true, "num_packets_per_lid");
1220 
1221  auto permute_from_lids_d = create_mirror_view_from_raw_host_array(outputDevice,
1222  permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1223  true, "permute_from_lids");
1224 
1225  auto permute_to_lids_d = create_mirror_view_from_raw_host_array(outputDevice,
1226  permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1227  true, "permute_to_lids");
1228 
1229  auto crs_rowptr_d = create_mirror_view_from_raw_host_array(outputDevice,
1230  CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1231  true, "crs_rowptr");
1232 
1233  auto crs_colind_d = create_mirror_view_from_raw_host_array(outputDevice,
1234  CRS_colind.getRawPtr(), CRS_colind.size(),
1235  true, "crs_colidx");
1236 
1237  auto src_pids_d = create_mirror_view_from_raw_host_array(outputDevice,
1238  SourcePids.getRawPtr(), SourcePids.size(),
1239  true, "src_pids");
1240 
1241  auto tgt_pids_d = create_mirror_view_from_raw_host_array(outputDevice,
1242  TargetPids.getRawPtr(), TargetPids.size(),
1243  true, "tgt_pids");
1244 
1245  using local_map_type = decltype(local_col_map);
1247  packet_type,local_graph_type,local_map_type,buffer_device_type>(
1248  local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1249  permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1250  tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
1251 
1252  // Copy outputs back to host
    // Each host view below is built directly on the caller's array pointer, so
    // the deep_copy writes into the caller's storage.
1253  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1254  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1255  deep_copy(crs_rowptr_h, crs_rowptr_d);
1256 
1257  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1258  CRS_colind.getRawPtr(), CRS_colind.size());
1259  deep_copy(crs_colind_h, crs_colind_d);
1260 
1261  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1262  TargetPids.getRawPtr(), TargetPids.size());
1263  deep_copy(tgt_pids_h, tgt_pids_d);
1264 
1265 }
1266 
1267 } // namespace Details
1268 } // namespace Tpetra
1269 
// Explicit-instantiation macro. For one (LO, GO, NT) triple it expands to
// explicit instantiations of the four Details functions named below:
// unpackCrsGraphAndCombine, unpackCrsGraphAndCombineNew,
// unpackAndCombineIntoCrsArrays and unpackAndCombineWithOwningPIDsCount.
// The parameter lists here must stay in step with the function signatures.
1270 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
1271  template void \
1272  Details::unpackCrsGraphAndCombine<LO, GO, NT>( \
1273  CrsGraph<LO, GO, NT>&, \
1274  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1275  const Teuchos::ArrayView<const size_t>&, \
1276  const Teuchos::ArrayView<const LO>&, \
1277  size_t, \
1278  Distributor&, \
1279  CombineMode); \
1280  template void \
1281  Details::unpackCrsGraphAndCombineNew<LO, GO, NT>( \
1282  CrsGraph<LO, GO, NT>&, \
1283  const Kokkos::DualView<const CrsGraph<LO, GO, NT>::packet_type*, \
1284  CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1285  const Kokkos::DualView<const size_t*, \
1286  CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1287  const Kokkos::DualView<const LO*, \
1288  CrsGraph<LO, GO, NT>::buffer_device_type>&, \
1289  const size_t, \
1290  Distributor&, \
1291  const CombineMode); \
1292  template void \
1293  Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
1294  const CrsGraph<LO, GO, NT> &, \
1295  const Teuchos::ArrayView<const LO>&, \
1296  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1297  const Teuchos::ArrayView<const size_t>&, \
1298  const size_t, \
1299  Distributor&, \
1300  const CombineMode, \
1301  const size_t, \
1302  const Teuchos::ArrayView<const LO>&, \
1303  const Teuchos::ArrayView<const LO>&, \
1304  size_t, \
1305  size_t, \
1306  const int, \
1307  const Teuchos::ArrayView<size_t>&, \
1308  const Teuchos::ArrayView<GO>&, \
1309  const Teuchos::ArrayView<const int>&, \
1310  Teuchos::Array<int>&); \
1311  template size_t \
1312  Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
1313  const CrsGraph<LO, GO, NT> &, \
1314  const Teuchos::ArrayView<const LO> &, \
1315  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
1316  const Teuchos::ArrayView<const size_t>&, \
1317  size_t, \
1318  Distributor &, \
1319  CombineMode, \
1320  size_t, \
1321  const Teuchos::ArrayView<const LO>&, \
1322  const Teuchos::ArrayView<const LO>&);
1323 
1324 #endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Import KokkosSparse::OrdinalTraits, a traits class for "invalid" (flag) values of integer types...
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
t_GlobalOrdinal_1D k_gblInds1D_
Global column indices for all rows.
Declaration of the Tpetra::CrsGraph class.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, Distributor &distor, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
Functions for manipulating CRS arrays.
Declare and define the function Tpetra::Details::computeOffsetsFromCounts, an implementation detail o...
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
Sets up and executes a communication plan for a Tpetra DistObject.
local_graph_type::row_map_type::const_type k_rowPtrs_
Row offsets for "1-D" storage.
CombineMode
Rule for combining data in an Import or Export.
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, Distributor &distor, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
void padCrsArrays(RowPtr &rowPtrBeg, RowPtr &rowPtrEnd, Indices &indices, const Padding &padding)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries...
typename dist_object_type::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
void unpackCrsGraphAndCombine(CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &importLIDs, size_t constantNumPackets, Distributor &distor, CombineMode combineMode)
Unpack the imported column indices and combine into graph.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
local_graph_type getLocalGraph() const
Get the local graph.
Declaration and definition of Tpetra::Details::getEntryOnHost.
global_ordinal_type packet_type
Type of each entry of the DistObject communication buffer.
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.