Tpetra_Details_unpackCrsGraphAndCombine_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
41 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP
42 
43 #include "TpetraCore_config.h"
44 #include "Teuchos_Array.hpp"
45 #include "Teuchos_ArrayView.hpp"
51 #include "Tpetra_CrsGraph_decl.hpp"
54 #include "Kokkos_Core.hpp"
55 #include <memory>
56 #include <string>
57 
76 
77 namespace Tpetra {
78 
79 #ifndef DOXYGEN_SHOULD_SKIP_THIS
80 // Forward declaration of Distributor
81 class Distributor;
82 #endif // DOXYGEN_SHOULD_SKIP_THIS
83 
84 //
85 // Users must never rely on anything in the Details namespace.
86 //
87 namespace Details {
88 
89 namespace UnpackAndCombineCrsGraphImpl {
90 
100 template<class Packet, class GO, class Device, class BufferDevice>
101 KOKKOS_FUNCTION int
102 unpackRow (const Kokkos::View<GO*,Device,Kokkos::MemoryUnmanaged>& gids_out,
103  const Kokkos::View<int*,Device,Kokkos::MemoryUnmanaged>& pids_out,
104  const Kokkos::View<const Packet*,BufferDevice>& imports,
105  const size_t offset,
106  const size_t num_ent)
107 {
108  using size_type = typename Kokkos::View<GO*,Device>::size_type;
109 
110  if (num_ent == 0) {
111  // Empty rows always take zero bytes, to ensure sparsity.
112  return 0;
113  }
114 
115  // Unpack GIDs
116  for (size_type k=0; k<num_ent; k++)
117  gids_out(k) = imports(offset+k);
118 
119  // Unpack PIDs
120  if (pids_out.size() > 0) {
121  for (size_type k=0; k<num_ent; k++) {
122  pids_out(k) = static_cast<int>(imports(offset+num_ent+k));
123  }
124  }
125 
126  return 0;
127 }
128 
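The packet layout assumed by unpackRow is: for each row, num_ent global column indices, followed by num_ent owning-process ranks when PIDs are packed. For reference, here is a minimal serial sketch of that copy logic in plain standard C++ (illustrative names only; the code above runs on device through Kokkos views):

#include <cstddef>
#include <vector>

// Serial sketch of unpackRow's layout handling (illustrative only).
// A row's packet is [gid_0 .. gid_{n-1}] or, with PIDs packed,
// [gid_0 .. gid_{n-1}, pid_0 .. pid_{n-1}].
void unpackRowSketch(std::vector<long long>& gids_out,   // pre-sized to num_ent
                     std::vector<int>& pids_out,         // empty => skip PIDs
                     const std::vector<long long>& imports,
                     const std::size_t offset,
                     const std::size_t num_ent)
{
  for (std::size_t k = 0; k < num_ent; ++k)
    gids_out[k] = imports[offset + k];                              // unpack GIDs
  if (!pids_out.empty())
    for (std::size_t k = 0; k < num_ent; ++k)
      pids_out[k] = static_cast<int>(imports[offset + num_ent + k]); // unpack PIDs
}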
139 template<class LocalOrdinal,
140  class Packet,
141  class RowView,
142  class IndicesView,
143  class BufferDevice>
144 class UnpackAndCombineCrsGraphFunctor {
145 
146  using LO = LocalOrdinal;
147  using GO = typename IndicesView::value_type;
148  using packet_type = Packet;
149  using row_ptrs_type = RowView;
150  using indices_type = IndicesView;
151  using buffer_device_type = BufferDevice;
152 
153  using device_type = typename IndicesView::device_type;
154  using execution_space = typename device_type::execution_space;
155 
156  using num_packets_per_lid_type = Kokkos::View<const size_t*, buffer_device_type>;
157  using offsets_type = Kokkos::View<const size_t*, device_type>;
158  using input_buffer_type = Kokkos::View<const packet_type*, buffer_device_type>;
159  using import_lids_type = Kokkos::View<const LO*, buffer_device_type>;
160 
161  using gids_scratch_type = Kokkos::View<GO*, device_type>;
162  using pids_scratch_type = Kokkos::View<int*,device_type>;
163 
164  row_ptrs_type row_ptrs_beg;
165  row_ptrs_type row_ptrs_end;
166  indices_type indices;
167  input_buffer_type imports;
168  num_packets_per_lid_type num_packets_per_lid;
169  import_lids_type import_lids;
170  offsets_type offsets;
171  size_t max_num_ent;
172  bool unpack_pids;
173  Kokkos::Experimental::UniqueToken<execution_space,
174  Kokkos::Experimental::UniqueTokenScope::Global> tokens;
175  gids_scratch_type gids_scratch;
176  pids_scratch_type pids_scratch;
177 
178  public:
179  using value_type = Kokkos::pair<int, LO>;
180 
181  UnpackAndCombineCrsGraphFunctor(
182  const row_ptrs_type& row_ptrs_beg_in,
183  const row_ptrs_type& row_ptrs_end_in,
184  const indices_type& indices_in,
185  const input_buffer_type& imports_in,
186  const num_packets_per_lid_type& num_packets_per_lid_in,
187  const import_lids_type& import_lids_in,
188  const offsets_type& offsets_in,
189  const size_t max_num_ent_in,
190  const bool unpack_pids_in) :
191  row_ptrs_beg(row_ptrs_beg_in),
192  row_ptrs_end(row_ptrs_end_in),
193  indices(indices_in),
194  imports(imports_in),
195  num_packets_per_lid(num_packets_per_lid_in),
196  import_lids(import_lids_in),
197  offsets(offsets_in),
198  max_num_ent(max_num_ent_in),
199  unpack_pids(unpack_pids_in),
200  tokens(execution_space()),
201  gids_scratch("gids_scratch", tokens.size() * max_num_ent),
202  pids_scratch("pids_scratch", tokens.size() * max_num_ent)
203  {}
204 
205  KOKKOS_INLINE_FUNCTION void init(value_type& dst) const
206  {
207  using Tpetra::Details::OrdinalTraits;
208  dst = Kokkos::make_pair(0, OrdinalTraits<LO>::invalid());
209  }
210 
211  KOKKOS_INLINE_FUNCTION void
212  join(volatile value_type& dst, const volatile value_type& src) const
213  {
214  // `dst` should reflect the first (least) bad index and
215  // all other associated error codes and data. Thus, we need only
216  // check if the `src` object shows an error and if its associated
217  // bad index is less than `dst`'s bad index.
218  using Tpetra::Details::OrdinalTraits;
219  if (src.second != OrdinalTraits<LO>::invalid()) {
220  // src shows an error. Overwrite dst if either
221  // 1. dst does not yet show an error, or
222  // 2. dst does show an error, but src's bad index is less
223  // than dst's bad index.
224  if (dst.second == OrdinalTraits<LO>::invalid() ||
225  src.second < dst.second) {
226  dst = src;
227  }
228  }
229  }
230 
231  KOKKOS_INLINE_FUNCTION
232  void operator()(const LO i, value_type& dst) const
233  {
234  using Kokkos::View;
235  using Kokkos::subview;
236  using Kokkos::MemoryUnmanaged;
237  using size_type = typename execution_space::size_type;
238  using slice = typename Kokkos::pair<size_type, size_type>;
239 
240  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
241  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
242 
243  const size_t num_packets_this_lid = num_packets_per_lid(i);
244  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2
245  : num_packets_this_lid;
246  if (unpack_pids && num_packets_this_lid%2 != 0) {
247  // Attempting to unpack PIDs, but num_packets_this_lid is not even; this
248  // should never happen.
249  dst = Kokkos::make_pair(1, i);
250  return;
251  }
252 
253  // Only unpack data if there is a nonzero number to unpack
254  if (num_ent == 0) {
255  return;
256  }
257 
258  // there is actually something in the row
259  const size_t buf_size = imports.size();
260  const size_t offset = offsets(i);
261 
262  if (offset > buf_size || offset + num_packets_this_lid > buf_size) {
263  dst = Kokkos::make_pair(2, i); // out of bounds
264  return;
265  }
266 
267  // Get subviews into the scratch arrays. The token returned from acquire
268  // is an integer in [0, tokens.size()). It is used to grab a unique (to
269  // this thread) subview of the scratch arrays.
270  const size_type token = tokens.acquire();
271  const size_t a = static_cast<size_t>(token) * max_num_ent;
272  const size_t b = a + num_ent;
273  gids_out_type gids_out = subview(gids_scratch, slice(a, b));
274  pids_out_type pids_out = subview(pids_scratch, slice(a, (unpack_pids ? b : a)));
275 
276  const int err = unpackRow (gids_out, pids_out, imports, offset, num_ent);
277 
278  if (err != 0) {
279  dst = Kokkos::make_pair(3, i);
280  tokens.release(token);
281  return;
282  }
283 
284  auto import_lid = import_lids(i);
285  for (size_t k = 0; k < num_ent; ++k) {
286  indices(row_ptrs_end(import_lid)) = gids_out(k);
287  // No atomic needed here, since import LIDs do not repeat.
288  row_ptrs_end(import_lid) += 1;
289  }
290 
291  tokens.release(token);
292  }
293 
294 };
295 
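The functor reduces over rows into a Kokkos::pair<int, LO> holding an error code and the first (smallest) local row at which an error occurred; init marks "no error" and join keeps the error with the least bad index. Below is a minimal serial sketch of that reduction, with an illustrative sentinel standing in for OrdinalTraits<LO>::invalid():

#include <utility>
#include <vector>

// Serial sketch of the (error code, first bad row) reduction; names are illustrative.
using ErrorPair = std::pair<int, int>;   // (error code, local row index)
constexpr int invalid_row = -1;          // stands in for OrdinalTraits<LO>::invalid()

ErrorPair joinErrors(ErrorPair dst, const ErrorPair& src)
{
  // Keep the error whose bad row index is smallest.
  if (src.second != invalid_row &&
      (dst.second == invalid_row || src.second < dst.second))
    dst = src;
  return dst;
}

ErrorPair reduceErrors(const std::vector<ErrorPair>& per_row_results)
{
  ErrorPair result(0, invalid_row);      // init(): no error seen yet
  for (const ErrorPair& e : per_row_results)
    result = joinErrors(result, e);
  return result;
}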
302 template<class LocalOrdinal, class GlobalOrdinal, class Node,
303  class RowView, class IndicesView, class BufferDevice>
304 void
305 unpackAndCombine
306 (const RowView& row_ptrs_beg,
307  const RowView& row_ptrs_end,
308  IndicesView& indices,
309  const Kokkos::View<const GlobalOrdinal*, BufferDevice,
310  Kokkos::MemoryUnmanaged>& imports,
311  const Kokkos::View<const size_t*, BufferDevice,
312  Kokkos::MemoryUnmanaged>& num_packets_per_lid,
313  const Kokkos::View<const LocalOrdinal*, BufferDevice,
314  Kokkos::MemoryUnmanaged>& import_lids,
315  const typename CrsGraph<LocalOrdinal, GlobalOrdinal,
316  Node>::padding_type& padding,
317  const bool unpack_pids,
318  const int myRank,
319  const bool verbose)
320 {
321  using LO = LocalOrdinal;
322  using GO = GlobalOrdinal;
323  using device_type = typename Node::device_type;
324  using execution_space = typename BufferDevice::execution_space;
325  using range_policy =
326  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
327  using unpack_functor_type =
328  UnpackAndCombineCrsGraphFunctor<LO, GO, RowView, IndicesView, BufferDevice>;
329 
330  const char prefix[] =
331  "Tpetra::Details::UnpackAndCombineCrsGraphImpl::unpackAndCombine: ";
332 
333  const size_t num_import_lids = static_cast<size_t>(import_lids.extent(0));
334  if (num_import_lids == 0) {
335  // Nothing to unpack
336  return;
337  }
338 
339  // Resize row pointers and indices to accommodate incoming data
340  padCrsArrays(row_ptrs_beg, row_ptrs_end, indices, padding,
341  myRank, verbose);
342 
343  // Get the offsets
344  Kokkos::View<size_t*, device_type> offsets("offsets", num_import_lids+1);
345  computeOffsetsFromCounts(offsets, num_packets_per_lid);
346 
347  // Determine the maximum number of entries in any row in the graph. The
348  // maximum number of entries is needed to allocate unpack buffers on the
349  // device.
350  size_t max_num_ent;
351  Kokkos::parallel_reduce
352  ("MaxReduce",
353  range_policy (0, LO (num_packets_per_lid.size ())),
354  KOKKOS_LAMBDA (const LO i, size_t& running_max_num_ent) {
355  const size_t num_packets_this_lid = num_packets_per_lid(i);
356  const size_t num_ent = (unpack_pids) ? num_packets_this_lid/2 :
357  num_packets_this_lid;
358  if (num_ent > running_max_num_ent) {
359  running_max_num_ent = num_ent;
360  }
361  }, Kokkos::Max<size_t> (max_num_ent));
362 
363  // Now do the actual unpack!
364  unpack_functor_type f (row_ptrs_beg, row_ptrs_end, indices, imports,
365  num_packets_per_lid, import_lids, offsets,
366  max_num_ent, unpack_pids);
367 
368  typename unpack_functor_type::value_type x;
369  Kokkos::parallel_reduce(range_policy(0, static_cast<LO>(num_import_lids)), f, x);
370  auto x_h = x.to_std_pair();
371  TEUCHOS_TEST_FOR_EXCEPTION(x_h.first != 0, std::runtime_error,
372  prefix << "UnpackAndCombineFunctor reported error code "
373  << x_h.first << " for the first bad row " << x_h.second);
374 }
375 
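Before launching the functor, unpackAndCombine needs two quantities: per-row offsets into imports, obtained from an exclusive prefix sum over num_packets_per_lid (computeOffsetsFromCounts), and the maximum entry count of any incoming row, where a row's entry count is half its packet count when PIDs are packed. A serial sketch of both computations, with illustrative names:

#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Serial sketch of the offset and max-entry computations used above
// (computeOffsetsFromCounts plus the "MaxReduce"); illustrative only.
void offsetsAndMaxNumEnt(const std::vector<std::size_t>& num_packets_per_lid,
                         const bool unpack_pids,
                         std::vector<std::size_t>& offsets,  // resized to counts.size()+1
                         std::size_t& max_num_ent)
{
  offsets.assign(num_packets_per_lid.size() + 1, 0);
  // Exclusive prefix sum: offsets[i] = total packets of rows 0..i-1.
  std::partial_sum(num_packets_per_lid.begin(), num_packets_per_lid.end(),
                   offsets.begin() + 1);
  max_num_ent = 0;
  for (const std::size_t p : num_packets_per_lid) {
    const std::size_t num_ent = unpack_pids ? p / 2 : p;  // GIDs (+ PIDs) per row
    max_num_ent = std::max(max_num_ent, num_ent);
  }
}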
376 template<class Packet, class LocalGraph, class BufferDevice>
377 size_t
378 unpackAndCombineWithOwningPIDsCount(
379  const LocalGraph& local_graph,
380  const Kokkos::View<const typename LocalGraph::data_type*,
381  typename LocalGraph::device_type,
382  Kokkos::MemoryUnmanaged> permute_from_lids,
383  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
384  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
385  const size_t num_same_ids)
386 {
387  using Kokkos::parallel_reduce;
388  using local_graph_type = LocalGraph;
389  using LO = typename local_graph_type::data_type;
390  using device_type = typename local_graph_type::device_type;
391  using execution_space = typename device_type::execution_space;
392  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
393 
394  size_t count = 0;
395  LO num_items;
396 
397  // Number of graph entries to unpack (returned by this function).
398  num_items = static_cast<LO>(num_same_ids);
399  if (num_items) {
400  size_t kcnt = 0;
401  parallel_reduce(
402  range_policy(0, num_items),
403  KOKKOS_LAMBDA(const LO lid, size_t& update) {
404  update += static_cast<size_t>(local_graph.row_map[lid+1]
405  -local_graph.row_map[lid]);
406  }, kcnt);
407  count += kcnt;
408  }
409 
410  // Count entries copied directly from the source graph with permuting.
411  num_items = static_cast<LO>(permute_from_lids.extent(0));
412  if (num_items) {
413  size_t kcnt = 0;
414  parallel_reduce(
415  range_policy(0, num_items),
416  KOKKOS_LAMBDA(const LO i, size_t& update) {
417  const LO lid = permute_from_lids(i);
418  update += static_cast<size_t>(local_graph.row_map[lid+1]
419  - local_graph.row_map[lid]);
420  }, kcnt);
421  count += kcnt;
422  }
423 
424  {
425  // Count entries received from other MPI processes.
426  size_t tot_num_ent = 0;
427  parallel_reduce("SumReduce",
428  num_packets_per_lid.size(),
429  KOKKOS_LAMBDA(const int& i, size_t& lsum) {
430  lsum += num_packets_per_lid(i) / 2;
431  }, Kokkos::Sum<size_t>(tot_num_ent));
432  count += tot_num_ent;
433  }
434 
435  return count;
436 }
437 
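The count returned above is the sum of three contributions: the entries of the first num_same_ids source rows, the entries of the permuted source rows, and half of all received packets (each remote entry travels as a GID/PID pair). A serial sketch of the same arithmetic, with illustrative names:

#include <cstddef>
#include <vector>

// Serial sketch of the entry count computed above; row_map is the source
// graph's CRS row pointer. Names are illustrative.
std::size_t countEntriesToUnpack(const std::vector<std::size_t>& row_map,
                                 const std::vector<int>& permute_from_lids,
                                 const std::vector<std::size_t>& num_packets_per_lid,
                                 const std::size_t num_same_ids)
{
  std::size_t count = 0;
  for (std::size_t lid = 0; lid < num_same_ids; ++lid)   // rows copied in place
    count += row_map[lid + 1] - row_map[lid];
  for (const int lid : permute_from_lids)                // rows copied with permutation
    count += row_map[lid + 1] - row_map[lid];
  for (const std::size_t p : num_packets_per_lid)        // rows received from other ranks
    count += p / 2;                                      // each entry is a (GID, PID) pair
  return count;
}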
439 template<class Packet, class LO, class Device, class BufferDevice>
440 void
441 setupRowPointersForRemotes(
442  const Kokkos::View<size_t*, Device>& tgt_rowptr,
443  const Kokkos::View<const LO*, BufferDevice>& import_lids,
444  const Kokkos::View<const Packet*, BufferDevice>& /* imports */,
445  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid)
446 {
447  using Kokkos::parallel_reduce;
448  using device_type = Device;
449  using execution_space = typename device_type::execution_space;
450  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
451  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
452 
453  const size_type N = num_packets_per_lid.extent(0);
454  parallel_for("Setup row pointers for remotes",
455  range_policy(0, N),
456  KOKKOS_LAMBDA(const size_t i){
457  using atomic_incr_type = typename std::remove_reference<decltype(tgt_rowptr(0))>::type;
458  const size_t num_packets_this_lid = num_packets_per_lid(i);
459  const size_t num_ent = num_packets_this_lid / 2;
460  Kokkos::atomic_fetch_add(&tgt_rowptr(import_lids(i)), atomic_incr_type(num_ent));
461  });
462 }
463 
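setupRowPointersForRemotes adds each import row's entry count (half its packet count) to the target row-length array; the update is atomic because several import LIDs may land in the same target row. A sketch of the same accumulation, using std::atomic in place of Kokkos::atomic_fetch_add (illustrative names):

#include <atomic>
#include <cstddef>
#include <vector>

// Sketch of setupRowPointersForRemotes: add each import row's entry count
// (half its packet count) to the target row-length array.
void addRemoteRowLengths(std::vector<std::atomic<std::size_t>>& tgt_row_lengths,
                         const std::vector<int>& import_lids,
                         const std::vector<std::size_t>& num_packets_per_lid)
{
  for (std::size_t i = 0; i < num_packets_per_lid.size(); ++i) {
    const std::size_t num_ent = num_packets_per_lid[i] / 2;  // GID/PID pairs
    tgt_row_lengths[import_lids[i]].fetch_add(num_ent);
  }
}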
464 // Convert array of row lengths to a CRS pointer array
465 template<class Device>
466 void
467 makeCrsRowPtrFromLengths(
468  const Kokkos::View<size_t*,Device,Kokkos::MemoryUnmanaged>& tgt_rowptr,
469  const Kokkos::View<size_t*,Device>& new_start_row)
470 {
471  using Kokkos::parallel_scan;
472  using device_type = Device;
473  using execution_space = typename device_type::execution_space;
474  using size_type = typename Kokkos::View<size_t*,device_type>::size_type;
475  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
476  const size_type N = new_start_row.extent(0);
477  parallel_scan(
478  range_policy(0, N),
479  KOKKOS_LAMBDA(const size_t& i, size_t& update, const bool& final) {
480  auto cur_val = tgt_rowptr(i);
481  if (final) {
482  tgt_rowptr(i) = update;
483  new_start_row(i) = tgt_rowptr(i);
484  }
485  update += cur_val;
486  }
487  );
488 }
489 
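makeCrsRowPtrFromLengths turns per-row lengths stored in tgt_rowptr into a CRS row-pointer array via an exclusive prefix sum, recording each row's start position in new_start_row along the way. A serial sketch using std::exclusive_scan (C++17), assuming the length array has N+1 slots with a trailing zero so the last entry becomes the total count:

#include <cstddef>
#include <numeric>
#include <vector>

// Serial sketch of makeCrsRowPtrFromLengths. On input, rowptr holds the
// length of each of the N rows followed by a trailing zero (N+1 slots);
// on output it holds CRS offsets, with rowptr[N] equal to the total count.
// new_start_row receives a copy of the offsets.
void lengthsToRowPtr(std::vector<std::size_t>& rowptr,
                     std::vector<std::size_t>& new_start_row)
{
  // Exclusive prefix sum, done in place (permitted for std::exclusive_scan).
  std::exclusive_scan(rowptr.begin(), rowptr.end(), rowptr.begin(),
                      std::size_t(0));
  new_start_row.assign(rowptr.begin(), rowptr.end());
}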
490 template<class LocalGraph, class LocalMap>
491 void
492 copyDataFromSameIDs(
493  const Kokkos::View<typename LocalMap::global_ordinal_type*,
494  typename LocalMap::device_type>& tgt_colind,
495  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
496  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
497  const Kokkos::View<size_t*, typename LocalMap::device_type>& tgt_rowptr,
498  const Kokkos::View<const int*, typename LocalMap::device_type>& src_pids,
499  const LocalGraph& local_graph,
500  const LocalMap& local_col_map,
501  const size_t num_same_ids,
502  const int my_pid)
503 {
504  using Kokkos::parallel_for;
505  using device_type = typename LocalMap::device_type;
506  using LO = typename LocalMap::local_ordinal_type;
507  using execution_space = typename device_type::execution_space;
508  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
509 
510  parallel_for(
511  range_policy(0, num_same_ids),
512  KOKKOS_LAMBDA(const size_t i) {
513  using atomic_incr_type =typename std::remove_reference<decltype(new_start_row(0))>::type;
514 
515  const LO src_lid = static_cast<LO>(i);
516  size_t src_row = local_graph.row_map(src_lid);
517 
518  const LO tgt_lid = static_cast<LO>(i);
519  const size_t tgt_row = tgt_rowptr(tgt_lid);
520 
521  const size_t nsr = local_graph.row_map(src_lid+1)
522  - local_graph.row_map(src_lid);
523  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
524 
525  for (size_t j=local_graph.row_map(src_lid);
526  j<local_graph.row_map(src_lid+1); ++j) {
527  LO src_col = local_graph.entries(j);
528  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
529  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
530  }
531  }
532  );
533 }
534 
535 template<class LocalGraph, class LocalMap, class BufferDevice>
536 void
537 copyDataFromPermuteIDs(
538  const Kokkos::View<typename LocalMap::global_ordinal_type*,
539  typename LocalMap::device_type>& tgt_colind,
540  const Kokkos::View<int*,
541  typename LocalMap::device_type>& tgt_pids,
542  const Kokkos::View<size_t*,
543  typename LocalMap::device_type>& new_start_row,
544  const Kokkos::View<size_t*,
545  typename LocalMap::device_type>& tgt_rowptr,
546  const Kokkos::View<const int*,
547  typename LocalMap::device_type>& src_pids,
548  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
549  BufferDevice, Kokkos::MemoryUnmanaged>& permute_to_lids,
550  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
551  BufferDevice, Kokkos::MemoryUnmanaged>& permute_from_lids,
552  const LocalGraph& local_graph,
553  const LocalMap& local_col_map,
554  const int my_pid)
555 {
556  using Kokkos::parallel_for;
557  using device_type = typename LocalMap::device_type;
558  using LO = typename LocalMap::local_ordinal_type;
559  using execution_space = typename device_type::execution_space;
560  using size_type = typename Kokkos::View<LO*,device_type>::size_type;
561  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
562 
563  const size_type num_permute_to_lids = permute_to_lids.extent(0);
564 
565  parallel_for(
566  range_policy(0, num_permute_to_lids),
567  KOKKOS_LAMBDA(const size_t i) {
568  using atomic_incr_type = typename std::remove_reference<decltype(new_start_row(0))>::type;
569 
570  const LO src_lid = permute_from_lids(i);
571  const size_t src_row = local_graph.row_map(src_lid);
572 
573  const LO tgt_lid = permute_to_lids(i);
574  const size_t tgt_row = tgt_rowptr(tgt_lid);
575 
576  size_t nsr = local_graph.row_map(src_lid+1)
577  - local_graph.row_map(src_lid);
578  Kokkos::atomic_fetch_add(&new_start_row(tgt_lid), atomic_incr_type(nsr));
579 
580  for (size_t j=local_graph.row_map(src_lid);
581  j<local_graph.row_map(src_lid+1); ++j) {
582  LO src_col = local_graph.entries(j);
583  tgt_colind(tgt_row + j - src_row) = local_col_map.getGlobalElement(src_col);
584  tgt_pids(tgt_row + j - src_row) = (src_pids(src_col) != my_pid) ? src_pids(src_col) : -1;
585  }
586  }
587  );
588 }
589 
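Both copy kernels above do the same per-row work: translate each local source column to a global index through the column map and record the owning rank, substituting -1 for columns this process owns. A serial sketch of that inner row copy, where localToGlobal stands in for local_col_map.getGlobalElement() and all names are illustrative:

#include <cstddef>
#include <vector>

// Serial sketch of the per-row copy in copyDataFromSameIDs and
// copyDataFromPermuteIDs.
void copyRow(const std::size_t src_lid, const std::size_t tgt_row,
             const std::vector<std::size_t>& src_row_map,
             const std::vector<int>& src_entries,        // local column indices
             const std::vector<int>& src_pids,           // owning rank per column
             const std::vector<long long>& localToGlobal,
             const int my_pid,
             std::vector<long long>& tgt_colind,
             std::vector<int>& tgt_pids)
{
  const std::size_t src_row = src_row_map[src_lid];
  for (std::size_t j = src_row; j < src_row_map[src_lid + 1]; ++j) {
    const int src_col = src_entries[j];
    tgt_colind[tgt_row + j - src_row] = localToGlobal[src_col];
    // Columns owned by this process are marked with -1.
    tgt_pids[tgt_row + j - src_row] =
      (src_pids[src_col] != my_pid) ? src_pids[src_col] : -1;
  }
}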
590 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
591 void
592 unpackAndCombineIntoCrsArrays2(
593  const Kokkos::View<typename LocalMap::global_ordinal_type*, typename LocalMap::device_type>& tgt_colind,
594  const Kokkos::View<int*, typename LocalMap::device_type>& tgt_pids,
595  const Kokkos::View<size_t*,typename LocalMap::device_type>& new_start_row,
596  const Kokkos::View<const size_t*, typename LocalMap::device_type>& offsets,
597  const Kokkos::View<
598  const typename LocalMap::local_ordinal_type*,
599  BufferDevice,
600  Kokkos::MemoryUnmanaged>& import_lids,
601  const Kokkos::View<const Packet*, BufferDevice>& imports,
602  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
603  const LocalGraph& /* local_graph */,
604  const LocalMap /*& local_col_map*/,
605  const int my_pid)
606 {
607  using Kokkos::View;
608  using Kokkos::subview;
609  using Kokkos::MemoryUnmanaged;
610  using Kokkos::parallel_reduce;
611  using Kokkos::atomic_fetch_add;
612 
613  using device_type = typename LocalMap::device_type;
614  using LO = typename LocalMap::local_ordinal_type;
615  using GO = typename LocalMap::global_ordinal_type;
616  using execution_space = typename device_type::execution_space;
617  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
618  using slice = typename Kokkos::pair<size_type, size_type>;
619  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_type>>;
620 
621  using pids_out_type = View<int*,device_type, MemoryUnmanaged>;
622  using gids_out_type = View<GO*, device_type, MemoryUnmanaged>;
623 
624  const size_type num_import_lids = import_lids.size();
625  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays2: ";
626 
627  // RemoteIDs: Loop structure following UnpackAndCombine
628  int gbl_err_count;
629  parallel_reduce("Unpack and combine into CRS",
630  range_policy(0, num_import_lids),
631  KOKKOS_LAMBDA(const size_t i, int& err) {
632  using atomic_incr_type = typename std::remove_reference< decltype( new_start_row(0) )>::type;
633  const size_t num_packets_this_lid = num_packets_per_lid(i);
634  const size_t num_ent = num_packets_this_lid / 2;
635  const size_t offset = offsets(i);
636  const LO lcl_row = import_lids(i);
637  const size_t start_row = atomic_fetch_add(&new_start_row(lcl_row), atomic_incr_type(num_ent));
638  const size_t end_row = start_row + num_ent;
639 
640  gids_out_type gids_out = subview(tgt_colind, slice(start_row, end_row));
641  pids_out_type pids_out = subview(tgt_pids, slice(start_row, end_row));
642 
643  err += unpackRow (gids_out, pids_out, imports, offset, num_ent);
644 
645  // Correct target PIDs.
646  for (size_t j = 0; j < static_cast<size_t>(num_ent); ++j) {
647  const int pid = pids_out(j);
648  pids_out(j) = (pid != my_pid) ? pid : -1;
649  }
650  }, gbl_err_count);
651 
652  TEUCHOS_TEST_FOR_EXCEPTION(gbl_err_count != 0,
653  std::invalid_argument, prefix <<
654  "Attempting to unpack PIDs, but the number of packets for a row is not even; this should never "
655  "happen! Please report this bug to the Tpetra developers.");
656 
657  return;
658 }
659 
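Each remote row claims a contiguous segment of the target arrays by atomically advancing its cursor in new_start_row by the row's entry count; unpackRow then fills exactly that segment. A sketch of the claim step, with std::atomic standing in for Kokkos' atomic_fetch_add (illustrative names):

#include <atomic>
#include <cstddef>

// Sketch of how a remote row claims its slot in the target CRS arrays: an
// atomic fetch-add on the row's write cursor reserves [start, start + num_ent).
std::size_t claimRowSegment(std::atomic<std::size_t>& row_cursor,
                            const std::size_t num_ent)
{
  // Reserve num_ent contiguous slots; the caller writes gids/pids into
  // positions [start, start + num_ent) of the target arrays.
  const std::size_t start = row_cursor.fetch_add(num_ent);
  return start;
}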
660 template<class Packet, class LocalGraph, class LocalMap, class BufferDevice>
661 void
662 unpackAndCombineIntoCrsArrays(
663  const LocalGraph & local_graph,
664  const LocalMap & local_col_map,
665  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
666  BufferDevice,
667  Kokkos::MemoryUnmanaged>& import_lids,
668  const Kokkos::View<const Packet*, BufferDevice>& imports,
669  const Kokkos::View<const size_t*, BufferDevice>& num_packets_per_lid,
670  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
671  BufferDevice,
672  Kokkos::MemoryUnmanaged>& permute_to_lids,
673  const Kokkos::View<const typename LocalMap::local_ordinal_type*,
674  BufferDevice,
675  Kokkos::MemoryUnmanaged>& permute_from_lids,
676  const Kokkos::View<size_t*,
677  typename LocalMap::device_type,
678  Kokkos::MemoryUnmanaged>& tgt_rowptr,
679  const Kokkos::View<typename LocalMap::global_ordinal_type*,
680  typename LocalMap::device_type,
681  Kokkos::MemoryUnmanaged>& tgt_colind,
682  const Kokkos::View<const int*,
683  typename LocalMap::device_type,
684  Kokkos::MemoryUnmanaged>& src_pids,
685  const Kokkos::View<int*,
686  typename LocalMap::device_type,
687  Kokkos::MemoryUnmanaged>& tgt_pids,
688  const size_t num_same_ids,
689  const size_t tgt_num_rows,
690  const size_t tgt_num_nonzeros,
691  const int my_tgt_pid)
692 {
693  using Kokkos::View;
694  using Kokkos::subview;
695  using Kokkos::parallel_for;
696  using Kokkos::MemoryUnmanaged;
697  using packet_type = Packet;
698  using local_map_type = LocalMap;
699  using local_graph_type = LocalGraph;
700  using buffer_device_type = BufferDevice;
701  using device_type = typename LocalMap::device_type;
702  using LO = typename LocalMap::local_ordinal_type;
703  using execution_space = typename device_type::execution_space;
704  using size_type = typename Kokkos::View<LO*, device_type>::size_type;
705  using range_policy = Kokkos::RangePolicy<execution_space, Kokkos::IndexType<size_t>>;
706 
707  const char prefix[] = "UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays: ";
708 
709  const size_t N = tgt_num_rows;
710  const size_t mynnz = tgt_num_nonzeros;
711 
712  // In the case of reduced communicators, the sourceGraph won't have
713  // the right "my_pid", so we have to supply it.
714  const int my_pid = my_tgt_pid;
715 
716  // FIXME (mfh 24 Jun 2019)
717  //
718  // 1. Only zero the entries of tgt_rowptr that actually need it.
719  // 2. Consider merging these three kernels into one.
720 
721  // Zero the rowptr
722  parallel_for(
723  range_policy(0, N+1),
724  KOKKOS_LAMBDA(const size_t i) {
725  tgt_rowptr(i) = 0;
726  }
727  );
728 
729  // same IDs: Always first, always in the same place
730  parallel_for(
731  range_policy(0, num_same_ids),
732  KOKKOS_LAMBDA(const size_t i) {
733  const LO tgt_lid = static_cast<LO>(i);
734  const LO src_lid = static_cast<LO>(i);
735  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
736  - local_graph.row_map(src_lid);
737  }
738  );
739 
740  // Permute IDs: Still local, but reordered
741  const size_type num_permute_to_lids = permute_to_lids.extent(0);
742  parallel_for(
743  range_policy(0, num_permute_to_lids),
744  KOKKOS_LAMBDA(const size_t i) {
745  const LO tgt_lid = permute_to_lids(i);
746  const LO src_lid = permute_from_lids(i);
747  tgt_rowptr(tgt_lid) = local_graph.row_map(src_lid+1)
748  - local_graph.row_map(src_lid);
749  }
750  );
751 
752  // Get the offsets from the number of packets per LID
753  const size_type num_import_lids = import_lids.extent(0);
754  View<size_t*, device_type> offsets("offsets", num_import_lids+1);
755  computeOffsetsFromCounts(offsets, num_packets_per_lid);
756 
757 #ifdef HAVE_TPETRA_DEBUG
758  {
759  auto nth_offset_h = getEntryOnHost(offsets, num_import_lids);
760  const bool condition =
761  nth_offset_h != static_cast<size_t>(imports.extent(0));
762  TEUCHOS_TEST_FOR_EXCEPTION
763  (condition, std::logic_error, prefix
764  << "The final offset in bytes " << nth_offset_h
765  << " != imports.size() = " << imports.extent(0)
766  << ". Please report this bug to the Tpetra developers.");
767  }
768 #endif // HAVE_TPETRA_DEBUG
769 
770  // Setup row pointers for remotes
771  setupRowPointersForRemotes<packet_type,LO,device_type,buffer_device_type>(
772  tgt_rowptr, import_lids, imports, num_packets_per_lid);
773 
774  // If multiple processes contribute to the same row, we may need to
775  // update row offsets. This tracks that.
776  View<size_t*, device_type> new_start_row("new_start_row", N+1);
777 
778  // Turn row length into a real CRS row pointer
779  makeCrsRowPtrFromLengths(tgt_rowptr, new_start_row);
780  {
781  auto nth_tgt_rowptr_h = getEntryOnHost(tgt_rowptr, N);
782  bool condition = nth_tgt_rowptr_h != mynnz;
783  TEUCHOS_TEST_FOR_EXCEPTION(condition, std::invalid_argument,
784  prefix << "CRS_rowptr[last] = " <<
785  nth_tgt_rowptr_h << " != mynnz = " << mynnz << ".");
786  }
787 
788  // SameIDs: Copy the data over
789  copyDataFromSameIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
790  tgt_rowptr, src_pids, local_graph, local_col_map, num_same_ids, my_pid);
791 
792  copyDataFromPermuteIDs<LocalGraph,LocalMap>(tgt_colind, tgt_pids, new_start_row,
793  tgt_rowptr, src_pids, permute_to_lids, permute_from_lids,
794  local_graph, local_col_map, my_pid);
795 
796  if (imports.extent(0) <= 0) {
797  return;
798  }
799 
800  unpackAndCombineIntoCrsArrays2<
801  packet_type,local_graph_type,local_map_type,buffer_device_type>(
802  tgt_colind, tgt_pids, new_start_row, offsets, import_lids, imports,
803  num_packets_per_lid, local_graph, local_col_map, my_pid);
804 
805  return;
806 }
807 
808 } // namespace UnpackAndCombineCrsGraphImpl
809 
859 template<class LocalOrdinal, class GlobalOrdinal, class Node>
860 size_t
861 unpackAndCombineWithOwningPIDsCount(
862  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>& sourceGraph,
863  const Teuchos::ArrayView<const LocalOrdinal> &importLIDs,
864  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type> &imports,
865  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
866  size_t /* constantNumPackets */,
867  Distributor &/* distor */,
868  CombineMode /* combineMode */,
869  size_t numSameIDs,
870  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
871  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs)
872 {
873  using Kokkos::MemoryUnmanaged;
874  using Kokkos::View;
875  using device_type = typename Node::device_type;
876  using packet_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type;
877  using local_graph_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::local_graph_type;
878  using buffer_device_type = typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::buffer_device_type;
879  const char prefix[] = "unpackAndCombineWithOwningPIDsCount: ";
880 
881  TEUCHOS_TEST_FOR_EXCEPTION
882  (permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
883  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size() << " != "
884  "permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
885  // FIXME (mfh 26 Jan 2015) If there are no entries on the calling
886  // process, then the graph is neither locally nor globally indexed.
887  const bool locallyIndexed = sourceGraph.isLocallyIndexed();
888  TEUCHOS_TEST_FOR_EXCEPTION
889  (! locallyIndexed, std::invalid_argument, prefix << "The input "
890  "CrsGraph 'sourceGraph' must be locally indexed.");
891  TEUCHOS_TEST_FOR_EXCEPTION
892  (importLIDs.size() != numPacketsPerLID.size(), std::invalid_argument,
893  prefix << "importLIDs.size() = " << importLIDs.size() << " != "
894  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
895 
896  auto local_graph = sourceGraph.getLocalGraph();
897  auto permute_from_lids_d =
898  create_mirror_view_from_raw_host_array(buffer_device_type(),
899  permuteFromLIDs.getRawPtr(),
900  permuteFromLIDs.size(), true,
901  "permute_from_lids");
902  auto imports_d =
903  create_mirror_view_from_raw_host_array(buffer_device_type(),
904  imports.getRawPtr(),
905  imports.size(), true,
906  "imports");
907  auto num_packets_per_lid_d =
908  create_mirror_view_from_raw_host_array(buffer_device_type(),
909  numPacketsPerLID.getRawPtr(),
910  numPacketsPerLID.size(), true,
911  "num_packets_per_lid");
912 
913  return UnpackAndCombineCrsGraphImpl::unpackAndCombineWithOwningPIDsCount<
914  packet_type,local_graph_type,buffer_device_type>(
915  local_graph, permute_from_lids_d, imports_d, num_packets_per_lid_d, numSameIDs);
916 }
917 
931 template<class LocalOrdinal, class GlobalOrdinal, class Node>
932 void
933 unpackAndCombineIntoCrsArrays(
934  const CrsGraph<LocalOrdinal, GlobalOrdinal, Node>& sourceGraph,
935  const Teuchos::ArrayView<const LocalOrdinal>& importLIDs,
936  const Teuchos::ArrayView<const typename CrsGraph<LocalOrdinal,GlobalOrdinal,Node>::packet_type>& imports,
937  const Teuchos::ArrayView<const size_t>& numPacketsPerLID,
938  const size_t /* constantNumPackets */,
939  Distributor& /* distor */,
940  const CombineMode /* combineMode */,
941  const size_t numSameIDs,
942  const Teuchos::ArrayView<const LocalOrdinal>& permuteToLIDs,
943  const Teuchos::ArrayView<const LocalOrdinal>& permuteFromLIDs,
944  size_t TargetNumRows,
945  size_t TargetNumNonzeros,
946  const int MyTargetPID,
947  const Teuchos::ArrayView<size_t>& CRS_rowptr,
948  const Teuchos::ArrayView<GlobalOrdinal>& CRS_colind,
949  const Teuchos::ArrayView<const int>& SourcePids,
950  Teuchos::Array<int>& TargetPids)
951 {
952  using Kokkos::View;
953  using Kokkos::deep_copy;
954  using Teuchos::outArg;
955  using Teuchos::REDUCE_MAX;
956  using Teuchos::reduceAll;
957  using LO = LocalOrdinal;
958  using GO = GlobalOrdinal;
959  using crs_graph_type = CrsGraph<LO, GO, Node>;
960  using packet_type = typename crs_graph_type::packet_type;
961  using local_graph_type = typename crs_graph_type::local_graph_type;
962  using buffer_device_type = typename crs_graph_type::buffer_device_type;
963  using device_type = typename Node::device_type;
964  using size_type = typename Teuchos::ArrayView<const LO>::size_type;
965 
966  const char prefix[] = "Tpetra::Details::unpackAndCombineIntoCrsArrays: ";
967 
968  TEUCHOS_TEST_FOR_EXCEPTION(
969  TargetNumRows + 1 != static_cast<size_t>(CRS_rowptr.size()),
970  std::invalid_argument, prefix << "CRS_rowptr.size() = " <<
971  CRS_rowptr.size() << " != TargetNumRows+1 = " << TargetNumRows+1 << ".");
972 
973  TEUCHOS_TEST_FOR_EXCEPTION(
974  permuteToLIDs.size() != permuteFromLIDs.size(), std::invalid_argument,
975  prefix << "permuteToLIDs.size() = " << permuteToLIDs.size()
976  << " != permuteFromLIDs.size() = " << permuteFromLIDs.size() << ".");
977  const size_type numImportLIDs = importLIDs.size();
978 
979  TEUCHOS_TEST_FOR_EXCEPTION(
980  numImportLIDs != numPacketsPerLID.size(), std::invalid_argument,
981  prefix << "importLIDs.size() = " << numImportLIDs << " != "
982  "numPacketsPerLID.size() = " << numPacketsPerLID.size() << ".");
983 
984  // Pre-seed TargetPids with -1, which marks locally owned entries.
985  if (static_cast<size_t>(TargetPids.size()) != TargetNumNonzeros) {
986  TargetPids.resize(TargetNumNonzeros);
987  }
988  TargetPids.assign(TargetNumNonzeros, -1);
989 
990  // Grab pointers for sourceGraph
991  auto local_graph = sourceGraph.getLocalGraph();
992  auto local_col_map = sourceGraph.getColMap()->getLocalMap();
993 
994  // Convert input arrays to Kokkos::View
995  device_type outputDevice;
996  buffer_device_type bufferOutputDevice;
997 
998  Kokkos::View<const LO*, buffer_device_type> import_lids_d =
999  create_mirror_view_from_raw_host_array
1000  (bufferOutputDevice, importLIDs.getRawPtr(),
1001  importLIDs.size(), true, "import_lids");
1002 
1003  Kokkos::View<const packet_type*, buffer_device_type> imports_d =
1004  create_mirror_view_from_raw_host_array
1005  (bufferOutputDevice, imports.getRawPtr(),
1006  imports.size(), true, "imports");
1007 
1008  Kokkos::View<const size_t*, buffer_device_type> num_packets_per_lid_d =
1009  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1010  numPacketsPerLID.getRawPtr(), numPacketsPerLID.size(),
1011  true, "num_packets_per_lid");
1012 
1013  Kokkos::View<const LO*, buffer_device_type> permute_to_lids_d =
1014  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1015  permuteToLIDs.getRawPtr(), permuteToLIDs.size(),
1016  true, "permute_to_lids");
1017 
1018  Kokkos::View<const LO*, buffer_device_type> permute_from_lids_d =
1019  create_mirror_view_from_raw_host_array(bufferOutputDevice,
1020  permuteFromLIDs.getRawPtr(), permuteFromLIDs.size(),
1021  true, "permute_from_lids");
1022 
1023  Kokkos::View<size_t*, device_type> crs_rowptr_d =
1024  create_mirror_view_from_raw_host_array(outputDevice,
1025  CRS_rowptr.getRawPtr(), CRS_rowptr.size(),
1026  true, "crs_rowptr");
1027 
1028  Kokkos::View<GO*, device_type> crs_colind_d =
1029  create_mirror_view_from_raw_host_array(outputDevice,
1030  CRS_colind.getRawPtr(), CRS_colind.size(),
1031  true, "crs_colidx");
1032 
1033  Kokkos::View<const int*, device_type> src_pids_d =
1034  create_mirror_view_from_raw_host_array(outputDevice,
1035  SourcePids.getRawPtr(), SourcePids.size(),
1036  true, "src_pids");
1037 
1038  Kokkos::View<int*, device_type> tgt_pids_d =
1039  create_mirror_view_from_raw_host_array(outputDevice,
1040  TargetPids.getRawPtr(), TargetPids.size(),
1041  true, "tgt_pids");
1042 
1043  using local_map_type = decltype(local_col_map);
1044  UnpackAndCombineCrsGraphImpl::unpackAndCombineIntoCrsArrays<
1045  packet_type,local_graph_type,local_map_type,buffer_device_type>(
1046  local_graph, local_col_map, import_lids_d, imports_d, num_packets_per_lid_d,
1047  permute_to_lids_d, permute_from_lids_d, crs_rowptr_d, crs_colind_d, src_pids_d,
1048  tgt_pids_d, numSameIDs, TargetNumRows, TargetNumNonzeros, MyTargetPID);
1049 
1050  // FIXME (mfh 25 Jun 2019) HostMirror of CudaUVMSpace is CudaUVMSpace!!!
1051 
1052  // Copy outputs back to host
1053  typename decltype(crs_rowptr_d)::HostMirror crs_rowptr_h(
1054  CRS_rowptr.getRawPtr(), CRS_rowptr.size());
1055  deep_copy(crs_rowptr_h, crs_rowptr_d);
1056 
1057  typename decltype(crs_colind_d)::HostMirror crs_colind_h(
1058  CRS_colind.getRawPtr(), CRS_colind.size());
1059  deep_copy(crs_colind_h, crs_colind_d);
1060 
1061  typename decltype(tgt_pids_d)::HostMirror tgt_pids_h(
1062  TargetPids.getRawPtr(), TargetPids.size());
1063  deep_copy(tgt_pids_h, tgt_pids_d);
1064 
1065 }
1066 
1067 } // namespace Details
1068 } // namespace Tpetra
1069 
1070 #define TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_INSTANT( LO, GO, NT ) \
1071  template void \
1072  Details::unpackAndCombineIntoCrsArrays<LO, GO, NT>( \
1073  const CrsGraph<LO, GO, NT> &, \
1074  const Teuchos::ArrayView<const LO>&, \
1075  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type>&, \
1076  const Teuchos::ArrayView<const size_t>&, \
1077  const size_t, \
1078  Distributor&, \
1079  const CombineMode, \
1080  const size_t, \
1081  const Teuchos::ArrayView<const LO>&, \
1082  const Teuchos::ArrayView<const LO>&, \
1083  size_t, \
1084  size_t, \
1085  const int, \
1086  const Teuchos::ArrayView<size_t>&, \
1087  const Teuchos::ArrayView<GO>&, \
1088  const Teuchos::ArrayView<const int>&, \
1089  Teuchos::Array<int>&); \
1090  template size_t \
1091  Details::unpackAndCombineWithOwningPIDsCount<LO, GO, NT>( \
1092  const CrsGraph<LO, GO, NT> &, \
1093  const Teuchos::ArrayView<const LO> &, \
1094  const Teuchos::ArrayView<const typename CrsGraph<LO,GO,NT>::packet_type> &, \
1095  const Teuchos::ArrayView<const size_t>&, \
1096  size_t, \
1097  Distributor &, \
1098  CombineMode, \
1099  size_t, \
1100  const Teuchos::ArrayView<const LO>&, \
1101  const Teuchos::ArrayView<const LO>&);
1102 
1103 #endif // TPETRA_DETAILS_UNPACKCRSGRAPHANDCOMBINE_DEF_HPP