Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
43 #define TPETRA_BLOCKCRSMATRIX_DEF_HPP
44 
47 
51 #include "Tpetra_BlockMultiVector.hpp"
52 #include "Tpetra_BlockView.hpp"
53 
54 #include "Teuchos_TimeMonitor.hpp"
55 #ifdef HAVE_TPETRA_DEBUG
56 # include <set>
57 #endif // HAVE_TPETRA_DEBUG
58 
59 //
60 // mfh 25 May 2016: Temporary fix for #393.
61 //
62 // Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
63 // 4.7.2 compiler bug ("internal compiler error") when compiling them.
64 // Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
65 // use them in that case, either.
66 //
67 // mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
68 // compiler error") too. Ditto for GCC 5.1. I'll just disable the
69 // thing for any GCC version.
70 //
71 #if defined(__CUDACC__)
72  // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
73 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
74 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
75 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
76 
77 #elif defined(__GNUC__)
78 
79 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
80 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
81 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
82 
83 #else // some other compiler
84 
85  // Optimistically assume that other compilers aren't broken.
86 # if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
87 # define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
88 # endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
89 #endif // defined(__CUDACC__), defined(__GNUC__)
90 
91 
92 namespace Tpetra {
93 
94 namespace Impl {
95 
96  template<typename T>
97  struct BlockCrsRowStruct {
98  T totalNumEntries, totalNumBytes, maxRowLength;
99  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct() = default;
100  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
101  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
102  : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
103  };
104 
105  template<typename T>
106  static
107  KOKKOS_INLINE_FUNCTION
108  void operator+=(volatile BlockCrsRowStruct<T> &a,
109  volatile const BlockCrsRowStruct<T> &b) {
110  a.totalNumEntries += b.totalNumEntries;
111  a.totalNumBytes += b.totalNumBytes;
112  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
113  }
114 
115  template<typename T>
116  static
117  KOKKOS_INLINE_FUNCTION
118  void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
119  a.totalNumEntries += b.totalNumEntries;
120  a.totalNumBytes += b.totalNumBytes;
121  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
122  }
123 
124  template<typename T, typename ExecSpace>
125  struct BlockCrsReducer {
126  typedef BlockCrsReducer reducer;
127  typedef T value_type;
128  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
129  value_type *value;
130 
131  KOKKOS_INLINE_FUNCTION
132  BlockCrsReducer(value_type &val) : value(&val) {}
133 
134  KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
135  KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const { dst += src; }
136  KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
137  KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
138  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
139  };
140 
141  template<class AlphaCoeffType,
142  class GraphType,
143  class MatrixValuesType,
144  class InVecType,
145  class BetaCoeffType,
146  class OutVecType,
147  bool IsBuiltInType>
148  class BcrsApplyNoTransFunctor {
149  private:
150  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
151  "MatrixValuesType must be a Kokkos::View.");
152  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
153  "OutVecType must be a Kokkos::View.");
154  static_assert (Kokkos::Impl::is_view<InVecType>::value,
155  "InVecType must be a Kokkos::View.");
156  static_assert (std::is_same<MatrixValuesType,
157  typename MatrixValuesType::const_type>::value,
158  "MatrixValuesType must be a const Kokkos::View.");
159  static_assert (std::is_same<OutVecType,
160  typename OutVecType::non_const_type>::value,
161  "OutVecType must be a nonconst Kokkos::View.");
162  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
163  "InVecType must be a const Kokkos::View.");
164  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
165  "MatrixValuesType must be a rank-1 Kokkos::View.");
166  static_assert (static_cast<int> (InVecType::rank) == 1,
167  "InVecType must be a rank-1 Kokkos::View.");
168  static_assert (static_cast<int> (OutVecType::rank) == 1,
169  "OutVecType must be a rank-1 Kokkos::View.");
170  typedef typename MatrixValuesType::non_const_value_type scalar_type;
171 
172  public:
173  typedef typename GraphType::device_type device_type;
174 
176  typedef typename std::remove_const<typename GraphType::data_type>::type
178 
185  void setX (const InVecType& X) { X_ = X; }
186 
193  void setY (const OutVecType& Y) { Y_ = Y; }
194 
195  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
196  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
197 
199  BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
200  const GraphType& graph,
201  const MatrixValuesType& val,
202  const local_ordinal_type blockSize,
203  const InVecType& X,
204  const beta_coeff_type& beta,
205  const OutVecType& Y) :
206  alpha_ (alpha),
207  ptr_ (graph.row_map),
208  ind_ (graph.entries),
209  val_ (val),
210  blockSize_ (blockSize),
211  X_ (X),
212  beta_ (beta),
213  Y_ (Y)
214  {}
215 
216  // Dummy team version
217  KOKKOS_INLINE_FUNCTION void
218  operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
219  {
220  Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
221  }
222 
223  // Range Policy for non built-in types
224  KOKKOS_INLINE_FUNCTION void
225  operator () (const local_ordinal_type& lclRow) const
226  {
231  using Kokkos::Details::ArithTraits;
232  // I'm not writing 'using Kokkos::make_pair;' here, because that
233  // may break builds for users who make the mistake of putting
234  // 'using namespace std;' in the global namespace. Please don't
235  // ever do that! But just in case you do, I'll take this
236  // precaution.
237  using Kokkos::parallel_for;
238  using Kokkos::subview;
239  typedef typename decltype (ptr_)::non_const_value_type offset_type;
240  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
241  Kokkos::LayoutRight,
242  device_type,
243  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
244  little_block_type;
245 
246  const offset_type Y_ptBeg = lclRow * blockSize_;
247  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
248  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
249 
250  // This version of the code does not use temporary storage.
251  // Each thread writes to its own block of the target vector.
252  if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
253  FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
254  }
255  else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
256  SCAL (beta_, Y_cur);
257  }
258 
259  if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
260  const offset_type blkBeg = ptr_[lclRow];
261  const offset_type blkEnd = ptr_[lclRow+1];
262  // Precompute to save integer math in the inner loop.
263  const offset_type bs2 = blockSize_ * blockSize_;
264  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
265  ++absBlkOff) {
266  little_block_type A_cur (val_.data () + absBlkOff * bs2,
267  blockSize_, blockSize_);
268  const offset_type X_blkCol = ind_[absBlkOff];
269  const offset_type X_ptBeg = X_blkCol * blockSize_;
270  const offset_type X_ptEnd = X_ptBeg + blockSize_;
271  auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
272 
273  GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
274  } // for each entry in current local block row of matrix
275  }
276  }
277 
278  private:
279  alpha_coeff_type alpha_;
280  typename GraphType::row_map_type::const_type ptr_;
281  typename GraphType::entries_type::const_type ind_;
282  MatrixValuesType val_;
283  local_ordinal_type blockSize_;
284  InVecType X_;
285  beta_coeff_type beta_;
286  OutVecType Y_;
287  };
288 
289  template<class AlphaCoeffType,
290  class GraphType,
291  class MatrixValuesType,
292  class InVecType,
293  class BetaCoeffType,
294  class OutVecType>
295  class BcrsApplyNoTransFunctor<AlphaCoeffType,
296  GraphType,
297  MatrixValuesType,
298  InVecType,
299  BetaCoeffType,
300  OutVecType,
301  true> {
302  private:
303  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
304  "MatrixValuesType must be a Kokkos::View.");
305  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
306  "OutVecType must be a Kokkos::View.");
307  static_assert (Kokkos::Impl::is_view<InVecType>::value,
308  "InVecType must be a Kokkos::View.");
309  static_assert (std::is_same<MatrixValuesType,
310  typename MatrixValuesType::const_type>::value,
311  "MatrixValuesType must be a const Kokkos::View.");
312  static_assert (std::is_same<OutVecType,
313  typename OutVecType::non_const_type>::value,
314  "OutVecType must be a nonconst Kokkos::View.");
315  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
316  "InVecType must be a const Kokkos::View.");
317  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
318  "MatrixValuesType must be a rank-1 Kokkos::View.");
319  static_assert (static_cast<int> (InVecType::rank) == 1,
320  "InVecType must be a rank-1 Kokkos::View.");
321  static_assert (static_cast<int> (OutVecType::rank) == 1,
322  "OutVecType must be a rank-1 Kokkos::View.");
323  typedef typename MatrixValuesType::non_const_value_type scalar_type;
324 
325  public:
326  typedef typename GraphType::device_type device_type;
327 
329  typedef typename std::remove_const<typename GraphType::data_type>::type
331 
338  void setX (const InVecType& X) { X_ = X; }
339 
346  void setY (const OutVecType& Y) { Y_ = Y; }
347 
348  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
349  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
350 
352  BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
353  const GraphType& graph,
354  const MatrixValuesType& val,
355  const local_ordinal_type blockSize,
356  const InVecType& X,
357  const beta_coeff_type& beta,
358  const OutVecType& Y) :
359  alpha_ (alpha),
360  ptr_ (graph.row_map),
361  ind_ (graph.entries),
362  val_ (val),
363  blockSize_ (blockSize),
364  X_ (X),
365  beta_ (beta),
366  Y_ (Y)
367  {}
368 
369  // dummy Range version
370  KOKKOS_INLINE_FUNCTION void
371  operator () (const local_ordinal_type& lclRow) const
372  {
373  Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
374  }
375 
376  // Team policy for built-in types
377  KOKKOS_INLINE_FUNCTION void
378  operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
379  {
380  const local_ordinal_type lclRow = member.league_rank();
381 
382  using Kokkos::Details::ArithTraits;
383  // I'm not writing 'using Kokkos::make_pair;' here, because that
384  // may break builds for users who make the mistake of putting
385  // 'using namespace std;' in the global namespace. Please don't
386  // ever do that! But just in case you do, I'll take this
387  // precaution.
388  using Kokkos::parallel_for;
389  using Kokkos::subview;
390  typedef typename decltype (ptr_)::non_const_value_type offset_type;
391  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
392  Kokkos::LayoutRight,
393  device_type,
394  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
395  little_block_type;
396 
397  const offset_type Y_ptBeg = lclRow * blockSize_;
398  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
399  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
400 
401  // This version of the code does not use temporary storage.
402  // Each thread writes to its own block of the target vector.
403  if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
404  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
405  [&](const local_ordinal_type &i) {
406  Y_cur(i) = ArithTraits<beta_coeff_type>::zero ();
407  });
408  }
409  else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
410  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
411  [&](const local_ordinal_type &i) {
412  Y_cur(i) *= beta_;
413  });
414  }
415  member.team_barrier();
416 
417  if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
418  const offset_type blkBeg = ptr_[lclRow];
419  const offset_type blkEnd = ptr_[lclRow+1];
420  // Precompute to save integer math in the inner loop.
421  const offset_type bs2 = blockSize_ * blockSize_;
422  little_block_type A_cur (val_.data (), blockSize_, blockSize_);
423  auto X_cur = subview (X_, ::Kokkos::make_pair (0, blockSize_));
424  Kokkos::parallel_for
425  (Kokkos::TeamThreadRange(member, blkBeg, blkEnd),
426  [&](const local_ordinal_type &absBlkOff) {
427  A_cur.assign_data(val_.data () + absBlkOff * bs2);
428  const offset_type X_blkCol = ind_[absBlkOff];
429  const offset_type X_ptBeg = X_blkCol * blockSize_;
430  X_cur.assign_data(&X_(X_ptBeg));
431 
432  Kokkos::parallel_for
433  (Kokkos::ThreadVectorRange(member, blockSize_),
434  [&](const local_ordinal_type &k0) {
435  scalar_type val(0);
436  for (local_ordinal_type k1=0;k1<blockSize_;++k1)
437  val += A_cur(k0,k1)*X_cur(k1);
438 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
439  // host space team size is always 1
440  Y_cur(k0) += alpha_*val;
441 #else
442  // cuda space team size can be larger than 1
443  // atomic is not allowed for sacado type;
444  // thus this needs to be specialized or
445  // sacado atomic should be supported.
446  Kokkos::atomic_add(&Y_cur(k0), alpha_*val);
447 #endif
448  });
449  }); // for each entry in current local block row of matrix
450  }
451  }
452 
453  private:
454  alpha_coeff_type alpha_;
455  typename GraphType::row_map_type::const_type ptr_;
456  typename GraphType::entries_type::const_type ind_;
457  MatrixValuesType val_;
458  local_ordinal_type blockSize_;
459  InVecType X_;
460  beta_coeff_type beta_;
461  OutVecType Y_;
462  };
463 
464 
465  template<class AlphaCoeffType,
466  class GraphType,
467  class MatrixValuesType,
468  class InMultiVecType,
469  class BetaCoeffType,
470  class OutMultiVecType>
471  void
472  bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
473  const GraphType& graph,
474  const MatrixValuesType& val,
475  const typename std::remove_const<typename GraphType::data_type>::type blockSize,
476  const InMultiVecType& X,
477  const BetaCoeffType& beta,
478  const OutMultiVecType& Y
479  )
480  {
481  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
482  "MatrixValuesType must be a Kokkos::View.");
483  static_assert (Kokkos::Impl::is_view<OutMultiVecType>::value,
484  "OutMultiVecType must be a Kokkos::View.");
485  static_assert (Kokkos::Impl::is_view<InMultiVecType>::value,
486  "InMultiVecType must be a Kokkos::View.");
487  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
488  "MatrixValuesType must be a rank-1 Kokkos::View.");
489  static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
490  "OutMultiVecType must be a rank-2 Kokkos::View.");
491  static_assert (static_cast<int> (InMultiVecType::rank) == 2,
492  "InMultiVecType must be a rank-2 Kokkos::View.");
493 
494  typedef typename GraphType::device_type::execution_space execution_space;
495  typedef typename MatrixValuesType::const_type matrix_values_type;
496  typedef typename OutMultiVecType::non_const_type out_multivec_type;
497  typedef typename InMultiVecType::const_type in_multivec_type;
498  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
499  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
500  typedef typename std::remove_const<typename GraphType::data_type>::type LO;
501 
502  constexpr bool is_builtin_type_enabled = std::is_arithmetic<typename InMultiVecType::non_const_value_type>::value;
503 
504  const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
505  static_cast<LO> (0) :
506  static_cast<LO> (graph.row_map.extent (0) - 1);
507  const LO numVecs = Y.extent (1);
508  if (numLocalMeshRows == 0 || numVecs == 0) {
509  return; // code below doesn't handle numVecs==0 correctly
510  }
511 
512  // These assignments avoid instantiating the functor extra times
513  // unnecessarily, e.g., for X const vs. nonconst. We only need the
514  // X const case, so only instantiate for that case.
515  in_multivec_type X_in = X;
516  out_multivec_type Y_out = Y;
517 
518  // The functor only knows how to handle one vector at a time, and it
519  // expects 1-D Views. Thus, we need to know the type of each column
520  // of X and Y.
521  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
522  typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
523  typedef BcrsApplyNoTransFunctor<alpha_type, GraphType,
524  matrix_values_type, in_vec_type, beta_type, out_vec_type,
525  is_builtin_type_enabled> functor_type;
526 
527  auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
528  auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
529 
530  // Compute the first column of Y.
531  if (is_builtin_type_enabled) {
532  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
533  // Built-in version uses atomic add which might not be supported from sacado or any user-defined types.
534  typedef Kokkos::TeamPolicy<execution_space> policy_type;
535  policy_type policy(1,1);
536 #if defined(KOKKOS_ENABLE_CUDA)
537  constexpr bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
538 #else
539  constexpr bool is_cuda = false;
540 #endif // defined(KOKKOS_ENABLE_CUDA)
541  if (is_cuda) {
542  LO team_size = 8, vector_size = 1;
543  if (blockSize <= 5) vector_size = 4;
544  else if (blockSize <= 9) vector_size = 8;
545  else if (blockSize <= 12) vector_size = 12;
546  else if (blockSize <= 20) vector_size = 20;
547  else vector_size = 20;
548  policy = policy_type(numLocalMeshRows, team_size, vector_size);
549  } else {
550  policy = policy_type(numLocalMeshRows, 1, 1);
551  }
552  Kokkos::parallel_for (policy, functor);
553 
554  // Compute the remaining columns of Y.
555  for (LO j = 1; j < numVecs; ++j) {
556  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
557  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
558  functor.setX (X_j);
559  functor.setY (Y_j);
560  Kokkos::parallel_for (policy, functor);
561  }
562  } else {
563  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
564  Kokkos::RangePolicy<execution_space,LO> policy(0, numLocalMeshRows);
565  Kokkos::parallel_for (policy, functor);
566  for (LO j = 1; j < numVecs; ++j) {
567  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
568  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
569  functor.setX (X_j);
570  functor.setY (Y_j);
571  Kokkos::parallel_for (policy, functor);
572  }
573  }
574  }
575 } // namespace Impl
576 
577 namespace { // (anonymous)
578 
579 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
580 // version that takes two Kokkos::View arguments).
581 template<class Scalar, class LO, class GO, class Node>
582 class GetLocalDiagCopy {
583 public:
584  typedef typename Node::device_type device_type;
585  typedef size_t diag_offset_type;
586  typedef Kokkos::View<const size_t*, device_type,
587  Kokkos::MemoryUnmanaged> diag_offsets_type;
588  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
589  typedef typename global_graph_type::local_graph_type local_graph_type;
590  typedef typename local_graph_type::row_map_type row_offsets_type;
591  typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
592  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
593  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
594 
595  // Constructor
596  GetLocalDiagCopy (const diag_type& diag,
597  const values_type& val,
598  const diag_offsets_type& diagOffsets,
599  const row_offsets_type& ptr,
600  const LO blockSize) :
601  diag_ (diag),
602  diagOffsets_ (diagOffsets),
603  ptr_ (ptr),
604  blockSize_ (blockSize),
605  offsetPerBlock_ (blockSize_*blockSize_),
606  val_(val)
607  {}
608 
609  KOKKOS_INLINE_FUNCTION void
610  operator() (const LO& lclRowInd) const
611  {
612  using Kokkos::ALL;
613 
614  // Get row offset
615  const size_t absOffset = ptr_[lclRowInd];
616 
617  // Get offset relative to start of row
618  const size_t relOffset = diagOffsets_[lclRowInd];
619 
620  // Get the total offset
621  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
622 
623  // Get a view of the block. BCRS currently uses LayoutRight
624  // regardless of the device.
625  typedef Kokkos::View<const IST**, Kokkos::LayoutRight,
626  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
627  const_little_block_type;
628  const_little_block_type D_in (val_.data () + pointOffset,
629  blockSize_, blockSize_);
630  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
631  COPY (D_in, D_out);
632  }
633 
634  private:
635  diag_type diag_;
636  diag_offsets_type diagOffsets_;
637  row_offsets_type ptr_;
638  LO blockSize_;
639  LO offsetPerBlock_;
640  values_type val_;
641  };
642 } // namespace (anonymous)
643 
644  template<class Scalar, class LO, class GO, class Node>
645  std::ostream&
646  BlockCrsMatrix<Scalar, LO, GO, Node>::
647  markLocalErrorAndGetStream ()
648  {
649  * (this->localError_) = true;
650  if ((*errs_).is_null ()) {
651  *errs_ = Teuchos::rcp (new std::ostringstream ());
652  }
653  return **errs_;
654  }
655 
656  template<class Scalar, class LO, class GO, class Node>
659  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
660  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
661  blockSize_ (static_cast<LO> (0)),
662  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
663  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
664  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
665  offsetPerBlock_ (0),
666  localError_ (new bool (false)),
667  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
668  {
669  }
670 
671  template<class Scalar, class LO, class GO, class Node>
674  const LO blockSize) :
675  dist_object_type (graph.getMap ()),
676  graph_ (graph),
677  rowMeshMap_ (* (graph.getRowMap ())),
678  blockSize_ (blockSize),
679  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
680  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
681  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
682  offsetPerBlock_ (blockSize * blockSize),
683  localError_ (new bool (false)),
684  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
685  {
686  TEUCHOS_TEST_FOR_EXCEPTION(
687  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
688  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
689  "rows (isSorted() is false). This class assumes sorted rows.");
690 
691  graphRCP_ = Teuchos::rcpFromRef(graph_);
692 
693  // Trick to test whether LO is nonpositive, without a compiler
694  // warning in case LO is unsigned (which is generally a bad idea
695  // anyway). I don't promise that the trick works, but it
696  // generally does with gcc at least, in my experience.
697  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
698  TEUCHOS_TEST_FOR_EXCEPTION(
699  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
700  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
701  " <= 0. The block size must be positive.");
702 
703  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
704  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
705 
706  {
707  // These are rcp
708  const auto domainPointMap = getDomainMap();
709  const auto colPointMap = Teuchos::rcp
710  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
711  *pointImporter_ = Teuchos::rcp
712  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
713  }
714  {
715  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
716  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
717 
718  row_map_type ptr_d = graph.getLocalGraph ().row_map;
719  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
720  Kokkos::deep_copy (ptr_h_nc, ptr_d);
721  ptrHost_ = ptr_h_nc;
722  }
723  {
724  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
725  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
726 
727  entries_type ind_d = graph.getLocalGraph ().entries;
728  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
729  Kokkos::deep_copy (ind_h_nc, ind_d);
730  indHost_ = ind_h_nc;
731  }
732 
733  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
734  val_ = decltype (val_) ("val", numValEnt);
735  }
736 
737  template<class Scalar, class LO, class GO, class Node>
740  const map_type& domainPointMap,
741  const map_type& rangePointMap,
742  const LO blockSize) :
743  dist_object_type (graph.getMap ()),
744  graph_ (graph),
745  rowMeshMap_ (* (graph.getRowMap ())),
746  domainPointMap_ (domainPointMap),
747  rangePointMap_ (rangePointMap),
748  blockSize_ (blockSize),
749  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
750  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
751  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
752  offsetPerBlock_ (blockSize * blockSize),
753  localError_ (new bool (false)),
754  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
755  {
756  TEUCHOS_TEST_FOR_EXCEPTION(
757  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
758  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
759  "rows (isSorted() is false). This class assumes sorted rows.");
760 
761  graphRCP_ = Teuchos::rcpFromRef(graph_);
762 
763  // Trick to test whether LO is nonpositive, without a compiler
764  // warning in case LO is unsigned (which is generally a bad idea
765  // anyway). I don't promise that the trick works, but it
766  // generally does with gcc at least, in my experience.
767  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
768  TEUCHOS_TEST_FOR_EXCEPTION(
769  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
770  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
771  " <= 0. The block size must be positive.");
772  {
773  // These are rcp
774  const auto rcpDomainPointMap = getDomainMap();
775  const auto colPointMap = Teuchos::rcp
776  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
777  *pointImporter_ = Teuchos::rcp
778  (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
779  }
780  {
781  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
782  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
783 
784  row_map_type ptr_d = graph.getLocalGraph ().row_map;
785  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
786  Kokkos::deep_copy (ptr_h_nc, ptr_d);
787  ptrHost_ = ptr_h_nc;
788  }
789  {
790  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
791  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
792 
793  entries_type ind_d = graph.getLocalGraph ().entries;
794  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
795  Kokkos::deep_copy (ind_h_nc, ind_d);
796  indHost_ = ind_h_nc;
797  }
798 
799  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
800  val_ = decltype (val_) ("val", numValEnt);
801  }
802 
803  template<class Scalar, class LO, class GO, class Node>
804  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
807  { // Copy constructor of map_type does a shallow copy.
808  // We're only returning by RCP for backwards compatibility.
809  return Teuchos::rcp (new map_type (domainPointMap_));
810  }
811 
812  template<class Scalar, class LO, class GO, class Node>
813  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
815  getRangeMap () const
816  { // Copy constructor of map_type does a shallow copy.
817  // We're only returning by RCP for backwards compatibility.
818  return Teuchos::rcp (new map_type (rangePointMap_));
819  }
820 
821  template<class Scalar, class LO, class GO, class Node>
822  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
824  getRowMap () const
825  {
826  return graph_.getRowMap();
827  }
828 
829  template<class Scalar, class LO, class GO, class Node>
830  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
832  getColMap () const
833  {
834  return graph_.getColMap();
835  }
836 
837  template<class Scalar, class LO, class GO, class Node>
841  {
842  return graph_.getGlobalNumRows();
843  }
844 
845  template<class Scalar, class LO, class GO, class Node>
846  size_t
849  {
850  return graph_.getNodeNumRows();
851  }
852 
853  template<class Scalar, class LO, class GO, class Node>
854  size_t
857  {
858  return graph_.getNodeMaxNumRowEntries();
859  }
860 
861  template<class Scalar, class LO, class GO, class Node>
862  void
864  apply (const mv_type& X,
865  mv_type& Y,
866  Teuchos::ETransp mode,
867  Scalar alpha,
868  Scalar beta) const
869  {
871  TEUCHOS_TEST_FOR_EXCEPTION(
872  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
873  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
874  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
875  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
876 
877  BMV X_view;
878  BMV Y_view;
879  const LO blockSize = getBlockSize ();
880  try {
881  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
882  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
883  }
884  catch (std::invalid_argument& e) {
885  TEUCHOS_TEST_FOR_EXCEPTION(
886  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
887  "apply: Either the input MultiVector X or the output MultiVector Y "
888  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
889  "graph. BlockMultiVector's constructor threw the following exception: "
890  << e.what ());
891  }
892 
893  try {
894  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
895  // either that or mark fields of this class as 'mutable'. The
896  // problem is that applyBlock wants to do lazy initialization of
897  // temporary block multivectors.
898  const_cast<this_type*> (this)->applyBlock (X_view, Y_view, mode, alpha, beta);
899  } catch (std::invalid_argument& e) {
900  TEUCHOS_TEST_FOR_EXCEPTION(
901  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
902  "apply: The implementation method applyBlock complained about having "
903  "an invalid argument. It reported the following: " << e.what ());
904  } catch (std::logic_error& e) {
905  TEUCHOS_TEST_FOR_EXCEPTION(
906  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
907  "apply: The implementation method applyBlock complained about a "
908  "possible bug in its implementation. It reported the following: "
909  << e.what ());
910  } catch (std::exception& e) {
911  TEUCHOS_TEST_FOR_EXCEPTION(
912  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
913  "apply: The implementation method applyBlock threw an exception which "
914  "is neither std::invalid_argument nor std::logic_error, but is a "
915  "subclass of std::exception. It reported the following: "
916  << e.what ());
917  } catch (...) {
918  TEUCHOS_TEST_FOR_EXCEPTION(
919  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
920  "apply: The implementation method applyBlock threw an exception which "
921  "is not an instance of a subclass of std::exception. This probably "
922  "indicates a bug. Please report this to the Tpetra developers.");
923  }
924  }
925 
926  template<class Scalar, class LO, class GO, class Node>
927  void
931  Teuchos::ETransp mode,
932  const Scalar alpha,
933  const Scalar beta)
934  {
935  TEUCHOS_TEST_FOR_EXCEPTION(
936  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
937  "Tpetra::BlockCrsMatrix::applyBlock: "
938  "X and Y have different block sizes. X.getBlockSize() = "
939  << X.getBlockSize () << " != Y.getBlockSize() = "
940  << Y.getBlockSize () << ".");
941 
942  if (mode == Teuchos::NO_TRANS) {
943  applyBlockNoTrans (X, Y, alpha, beta);
944  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
945  applyBlockTrans (X, Y, mode, alpha, beta);
946  } else {
947  TEUCHOS_TEST_FOR_EXCEPTION(
948  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
949  "applyBlock: Invalid 'mode' argument. Valid values are "
950  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
951  }
952  }
953 
954  template<class Scalar, class LO, class GO, class Node>
955  void
957  setAllToScalar (const Scalar& alpha)
958  {
959 #ifdef HAVE_TPETRA_DEBUG
960  const char prefix[] = "Tpetra::BlockCrsMatrix::setAllToScalar: ";
961 #endif // HAVE_TPETRA_DEBUG
962 
963  if (this->need_sync_device ()) {
964  // If we need to sync to device, then the data were last
965  // modified on host. In that case, we should again modify them
966  // on host.
967 #ifdef HAVE_TPETRA_DEBUG
968  TEUCHOS_TEST_FOR_EXCEPTION
969  (this->need_sync_host (), std::runtime_error,
970  prefix << "The matrix's values need sync on both device and host.");
971 #endif // HAVE_TPETRA_DEBUG
972  this->modify_host ();
973  Kokkos::deep_copy (getValuesHost (), alpha);
974  }
975  else {
976  // If we need to sync to host, then the data were last modified
977  // on device. In that case, we should again modify them on
978  // device. Also, prefer modifying on device if neither side is
979  // marked as modified.
980  this->modify_device ();
981  Kokkos::deep_copy (this->getValuesDevice (), alpha);
982  }
983  }
984 
985  template<class Scalar, class LO, class GO, class Node>
986  LO
988  replaceLocalValues (const LO localRowInd,
989  const LO colInds[],
990  const Scalar vals[],
991  const LO numColInds) const
992  {
993 #ifdef HAVE_TPETRA_DEBUG
994  const char prefix[] =
995  "Tpetra::BlockCrsMatrix::replaceLocalValues: ";
996 #endif // HAVE_TPETRA_DEBUG
997 
998  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
999  // We modified no values, because the input local row index is
1000  // invalid on the calling process. That may not be an error, if
1001  // numColInds is zero anyway; it doesn't matter. This is the
1002  // advantage of returning the number of valid indices.
1003  return static_cast<LO> (0);
1004  }
1005  const impl_scalar_type* const vIn =
1006  reinterpret_cast<const impl_scalar_type*> (vals);
1007  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1008  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1009  const LO perBlockSize = this->offsetPerBlock ();
1010  LO hint = 0; // Guess for the relative offset into the current row
1011  LO pointOffset = 0; // Current offset into input values
1012  LO validCount = 0; // number of valid column indices in colInds
1013 
1014 #ifdef HAVE_TPETRA_DEBUG
1015  TEUCHOS_TEST_FOR_EXCEPTION
1016  (this->need_sync_host (), std::runtime_error,
1017  prefix << "The matrix's data were last modified on device, but have "
1018  "not been sync'd to host. Please sync to host (by calling "
1019  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1020  "method.");
1021 #endif // HAVE_TPETRA_DEBUG
1022 
1023  auto vals_host_out = getValuesHost ();
1024  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1025 
1026  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1027  const LO relBlockOffset =
1028  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1029  if (relBlockOffset != LINV) {
1030  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1031  // are stored contiguously, with no padding. "Contiguously"
1032  // means that all memory between the first and last entries
1033  // belongs to the block (no striding). "No padding" means
1034  // that getBlockSize() * getBlockSize() is exactly the number
1035  // of entries that the block uses. For another place where
1036  // this assumption is encoded, see sumIntoLocalValues.
1037 
1038  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1039  // little_block_type A_old =
1040  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1041  impl_scalar_type* const A_old =
1042  vals_host_out_raw + absBlockOffset * perBlockSize;
1043  // const_little_block_type A_new =
1044  // getConstLocalBlockFromInput (vIn, pointOffset);
1045  const impl_scalar_type* const A_new = vIn + pointOffset;
1046  // COPY (A_new, A_old);
1047  for (LO i = 0; i < perBlockSize; ++i) {
1048  A_old[i] = A_new[i];
1049  }
1050  hint = relBlockOffset + 1;
1051  ++validCount;
1052  }
1053  }
1054  return validCount;
1055  }
1056 
1057  template <class Scalar, class LO, class GO, class Node>
1058  void
1060  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
1061  Kokkos::MemoryUnmanaged>& offsets) const
1062  {
1063  graph_.getLocalDiagOffsets (offsets);
1064  }
1065 
1066 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1067  template <class Scalar, class LO, class GO, class Node>
1068  void TPETRA_DEPRECATED
1070  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
1071  {
1072  // mfh 19 Mar 2016: We plan to deprecate the ArrayRCP version of
1073  // this method in CrsGraph too, so don't call it (otherwise build
1074  // warnings will show up and annoy users). Instead, copy results
1075  // in and out, if the memory space requires it.
1076 
1077  const size_t lclNumRows = graph_.getNodeNumRows ();
1078  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
1079  offsets.resize (lclNumRows);
1080  }
1081 
1082  // The input ArrayRCP must always be a host pointer. Thus, if
1083  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
1084  // to write to that allocation directly as a Kokkos::View.
1085  typedef typename device_type::memory_space memory_space;
1086  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
1087  // It is always syntactically correct to assign a raw host
1088  // pointer to a device View, so this code will compile correctly
1089  // even if this branch never runs.
1090  typedef Kokkos::View<size_t*, device_type,
1091  Kokkos::MemoryUnmanaged> output_type;
1092  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1093  graph_.getLocalDiagOffsets (offsetsOut);
1094  }
1095  else {
1096  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
1097  graph_.getLocalDiagOffsets (offsetsTmp);
1098  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
1099  Kokkos::MemoryUnmanaged> output_type;
1100  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1101  Kokkos::deep_copy (offsetsOut, offsetsTmp);
1102  }
1103  }
1104 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1105 
1106  template <class Scalar, class LO, class GO, class Node>
1107  void
1111  const Kokkos::View<impl_scalar_type***, device_type,
1112  Kokkos::MemoryUnmanaged>& D_inv,
1113  const Scalar& omega,
1114  const ESweepDirection direction) const
1115  {
1116  using Kokkos::ALL;
1117  const impl_scalar_type zero =
1118  Kokkos::Details::ArithTraits<impl_scalar_type>::zero ();
1119  const impl_scalar_type one =
1120  Kokkos::Details::ArithTraits<impl_scalar_type>::one ();
1121  const LO numLocalMeshRows =
1122  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1123  const LO numVecs = static_cast<LO> (X.getNumVectors ());
1124 
1125  // If using (new) Kokkos, replace localMem with thread-local
1126  // memory. Note that for larger block sizes, this will affect the
1127  // two-level parallelization. Look to Stokhos for best practice
1128  // on making this fast for GPUs.
1129  const LO blockSize = getBlockSize ();
1130  Teuchos::Array<impl_scalar_type> localMem (blockSize);
1131  Teuchos::Array<impl_scalar_type> localMat (blockSize*blockSize);
1132  little_vec_type X_lcl (localMem.getRawPtr (), blockSize);
1133 
1134  // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned.
1135  LO rowBegin = 0, rowEnd = 0, rowStride = 0;
1136  if (direction == Forward) {
1137  rowBegin = 1;
1138  rowEnd = numLocalMeshRows+1;
1139  rowStride = 1;
1140  }
1141  else if (direction == Backward) {
1142  rowBegin = numLocalMeshRows;
1143  rowEnd = 0;
1144  rowStride = -1;
1145  }
1146  else if (direction == Symmetric) {
1147  this->localGaussSeidel (B, X, D_inv, omega, Forward);
1148  this->localGaussSeidel (B, X, D_inv, omega, Backward);
1149  return;
1150  }
1151 
1152  const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
1153  const Scalar minus_omega = -omega;
1154 
1155  if (numVecs == 1) {
1156  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1157  const LO actlRow = lclRow - 1;
1158 
1159  little_vec_type B_cur = B.getLocalBlock (actlRow, 0);
1160  COPY (B_cur, X_lcl);
1161  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1162 
1163  const size_t meshBeg = ptrHost_[actlRow];
1164  const size_t meshEnd = ptrHost_[actlRow+1];
1165  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1166  const LO meshCol = indHost_[absBlkOff];
1167  const_little_block_type A_cur =
1168  getConstLocalBlockFromAbsOffset (absBlkOff);
1169  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
1170 
1171  // X_lcl += alpha*A_cur*X_cur
1172  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1173  //X_lcl.matvecUpdate (alpha, A_cur, X_cur);
1174  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1175  } // for each entry in the current local row of the matrix
1176 
1177  // NOTE (mfh 20 Jan 2016) The two input Views here are
1178  // unmanaged already, so we don't have to take unmanaged
1179  // subviews first.
1180  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1181  little_vec_type X_update = X.getLocalBlock (actlRow, 0);
1182  FILL (X_update, zero);
1183  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1184  } // for each local row of the matrix
1185  }
1186  else {
1187  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1188  for (LO j = 0; j < numVecs; ++j) {
1189  LO actlRow = lclRow-1;
1190 
1191  little_vec_type B_cur = B.getLocalBlock (actlRow, j);
1192  COPY (B_cur, X_lcl);
1193  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1194 
1195  const size_t meshBeg = ptrHost_[actlRow];
1196  const size_t meshEnd = ptrHost_[actlRow+1];
1197  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1198  const LO meshCol = indHost_[absBlkOff];
1199  const_little_block_type A_cur =
1200  getConstLocalBlockFromAbsOffset (absBlkOff);
1201  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
1202 
1203  // X_lcl += alpha*A_cur*X_cur
1204  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1205  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1206  } // for each entry in the current local row of the matrx
1207 
1208  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1209  auto X_update = X.getLocalBlock (actlRow, j);
1210  FILL (X_update, zero);
1211  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1212  } // for each entry in the current local row of the matrix
1213  } // for each local row of the matrix
1214  }
1215  }
1216 
1217  template <class Scalar, class LO, class GO, class Node>
1218  void
1221  const MultiVector<Scalar,LO,GO,Node> &/* B */,
1222  const MultiVector<Scalar,LO,GO,Node> &/* D */,
1223  const Scalar& /* dampingFactor */,
1224  const ESweepDirection /* direction */,
1225  const int /* numSweeps */,
1226  const bool /* zeroInitialGuess */) const
1227  {
1228  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1229  // interface for block Gauss-Seidel.
1230  TEUCHOS_TEST_FOR_EXCEPTION(
1231  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
1232  "gaussSeidelCopy: Not implemented.");
1233  }
1234 
1235  template <class Scalar, class LO, class GO, class Node>
1236  void
1239  const MultiVector<Scalar,LO,GO,Node>& /* B */,
1240  const MultiVector<Scalar,LO,GO,Node>& /* D */,
1241  const Teuchos::ArrayView<LO>& /* rowIndices */,
1242  const Scalar& /* dampingFactor */,
1243  const ESweepDirection /* direction */,
1244  const int /* numSweeps */,
1245  const bool /* zeroInitialGuess */) const
1246  {
1247  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1248  // interface for block Gauss-Seidel.
1249  TEUCHOS_TEST_FOR_EXCEPTION(
1250  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
1251  "reorderedGaussSeidelCopy: Not implemented.");
1252  }
1253 
1254 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
1255  template <class Scalar, class LO, class GO, class Node>
1256  void TPETRA_DEPRECATED
1259  const Teuchos::ArrayView<const size_t>& offsets) const
1260  {
1261  using Teuchos::ArrayRCP;
1262  using Teuchos::ArrayView;
1263  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
1264 
1265  const size_t myNumRows = rowMeshMap_.getNodeNumElements();
1266  const LO* columnIndices;
1267  Scalar* vals;
1268  LO numColumns;
1269  Teuchos::Array<LO> cols(1);
1270 
1271  // FIXME (mfh 12 Aug 2014) Should use a "little block" for this instead.
1272  Teuchos::Array<Scalar> zeroMat (blockSize_*blockSize_, ZERO);
1273  for (size_t i = 0; i < myNumRows; ++i) {
1274  cols[0] = i;
1275  if (offsets[i] == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1276  diag.replaceLocalValues (i, cols.getRawPtr (), zeroMat.getRawPtr (), 1);
1277  }
1278  else {
1279  getLocalRowView (i, columnIndices, vals, numColumns);
1280  diag.replaceLocalValues (i, cols.getRawPtr(), &vals[offsets[i]*blockSize_*blockSize_], 1);
1281  }
1282  }
1283  }
1284 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1285 
1286  template <class Scalar, class LO, class GO, class Node>
1287  void
1289  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1290  Kokkos::MemoryUnmanaged>& diag,
1291  const Kokkos::View<const size_t*, device_type,
1292  Kokkos::MemoryUnmanaged>& offsets) const
1293  {
1294  using Kokkos::parallel_for;
1295  typedef typename device_type::execution_space execution_space;
1296  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
1297 
1298  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1299  const LO blockSize = this->getBlockSize ();
1300  TEUCHOS_TEST_FOR_EXCEPTION
1301  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1302  static_cast<LO> (diag.extent (1)) < blockSize ||
1303  static_cast<LO> (diag.extent (2)) < blockSize,
1304  std::invalid_argument, prefix <<
1305  "The input Kokkos::View is not big enough to hold all the data.");
1306  TEUCHOS_TEST_FOR_EXCEPTION
1307  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
1308  prefix << "offsets.size() = " << offsets.size () << " < local number of "
1309  "diagonal blocks " << lclNumMeshRows << ".");
1310 
1311 #ifdef HAVE_TPETRA_DEBUG
1312  TEUCHOS_TEST_FOR_EXCEPTION
1313  (this->template need_sync<device_type> (), std::runtime_error,
1314  prefix << "The matrix's data were last modified on host, but have "
1315  "not been sync'd to device. Please sync to device (by calling "
1316  "sync<device_type>() on this matrix) before calling this method.");
1317 #endif // HAVE_TPETRA_DEBUG
1318 
1319  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
1320  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
1321 
1322  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
1323  // we reserve the right to do lazy allocation of device data. (We
1324  // don't plan to do lazy allocation for host data; the host
1325  // version of the data always exists.)
1327  auto vals_dev =
1328  const_cast<this_type*> (this)->template getValues<device_type> ();
1329 
1330  parallel_for (policy_type (0, lclNumMeshRows),
1331  functor_type (diag, vals_dev, offsets,
1332  graph_.getLocalGraph ().row_map, blockSize_));
1333  }
1334 
1335 
1336  template <class Scalar, class LO, class GO, class Node>
1337  void
1339  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1340  Kokkos::MemoryUnmanaged>& diag,
1341  const Teuchos::ArrayView<const size_t>& offsets) const
1342  {
1343  using Kokkos::ALL;
1344  using Kokkos::parallel_for;
1345  typedef typename Kokkos::View<impl_scalar_type***, device_type,
1346  Kokkos::MemoryUnmanaged>::HostMirror::execution_space host_exec_space;
1347 
1348  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1349  const LO blockSize = this->getBlockSize ();
1350  TEUCHOS_TEST_FOR_EXCEPTION
1351  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1352  static_cast<LO> (diag.extent (1)) < blockSize ||
1353  static_cast<LO> (diag.extent (2)) < blockSize,
1354  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1355  "The input Kokkos::View is not big enough to hold all the data.");
1356  TEUCHOS_TEST_FOR_EXCEPTION
1357  (static_cast<LO> (offsets.size ()) < lclNumMeshRows,
1358  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1359  "offsets.size() = " << offsets.size () << " < local number of diagonal "
1360  "blocks " << lclNumMeshRows << ".");
1361 
1362  // mfh 12 Dec 2015: Use the host execution space, since we haven't
1363  // quite made everything work with CUDA yet.
1364  typedef Kokkos::RangePolicy<host_exec_space, LO> policy_type;
1365  parallel_for (policy_type (0, lclNumMeshRows), [=] (const LO& lclMeshRow) {
1366  auto D_in = this->getConstLocalBlockFromRelOffset (lclMeshRow, offsets[lclMeshRow]);
1367  auto D_out = Kokkos::subview (diag, lclMeshRow, ALL (), ALL ());
1368  COPY (D_in, D_out);
1369  });
1370  }
1371 
1372 
1373  template<class Scalar, class LO, class GO, class Node>
1374  LO
1376  absMaxLocalValues (const LO localRowInd,
1377  const LO colInds[],
1378  const Scalar vals[],
1379  const LO numColInds) const
1380  {
1381  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1382  // We modified no values, because the input local row index is
1383  // invalid on the calling process. That may not be an error, if
1384  // numColInds is zero anyway; it doesn't matter. This is the
1385  // advantage of returning the number of valid indices.
1386  return static_cast<LO> (0);
1387  }
1388  const impl_scalar_type* const vIn =
1389  reinterpret_cast<const impl_scalar_type*> (vals);
1390  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1391  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1392  const LO perBlockSize = this->offsetPerBlock ();
1393  LO hint = 0; // Guess for the relative offset into the current row
1394  LO pointOffset = 0; // Current offset into input values
1395  LO validCount = 0; // number of valid column indices in colInds
1396 
1397  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1398  const LO relBlockOffset =
1399  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1400  if (relBlockOffset != LINV) {
1401  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1402  little_block_type A_old =
1403  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1404  const_little_block_type A_new =
1405  getConstLocalBlockFromInput (vIn, pointOffset);
1406 
1407  ::Tpetra::Impl::absMax (A_old, A_new);
1408  hint = relBlockOffset + 1;
1409  ++validCount;
1410  }
1411  }
1412  return validCount;
1413  }
1414 
1415 
1416  template<class Scalar, class LO, class GO, class Node>
1417  LO
1419  sumIntoLocalValues (const LO localRowInd,
1420  const LO colInds[],
1421  const Scalar vals[],
1422  const LO numColInds) const
1423  {
1424 #ifdef HAVE_TPETRA_DEBUG
1425  const char prefix[] =
1426  "Tpetra::BlockCrsMatrix::sumIntoLocalValues: ";
1427 #endif // HAVE_TPETRA_DEBUG
1428 
1429  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1430  // We modified no values, because the input local row index is
1431  // invalid on the calling process. That may not be an error, if
1432  // numColInds is zero anyway; it doesn't matter. This is the
1433  // advantage of returning the number of valid indices.
1434  return static_cast<LO> (0);
1435  }
1436  //const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1437  const impl_scalar_type* const vIn =
1438  reinterpret_cast<const impl_scalar_type*> (vals);
1439  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1440  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1441  const LO perBlockSize = this->offsetPerBlock ();
1442  LO hint = 0; // Guess for the relative offset into the current row
1443  LO pointOffset = 0; // Current offset into input values
1444  LO validCount = 0; // number of valid column indices in colInds
1445 
1446 #ifdef HAVE_TPETRA_DEBUG
1447  TEUCHOS_TEST_FOR_EXCEPTION
1448  (this->need_sync_host (), std::runtime_error,
1449  prefix << "The matrix's data were last modified on device, but have not "
1450  "been sync'd to host. Please sync to host (by calling "
1451  "sync<Kokkos::HostSpace>() on this matrix) before calling this method.");
1452 #endif // HAVE_TPETRA_DEBUG
1453 
1454  auto vals_host_out =
1455  getValuesHost ();
1456  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1457 
1458  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1459  const LO relBlockOffset =
1460  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1461  if (relBlockOffset != LINV) {
1462  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1463  // are stored contiguously, with no padding. "Contiguously"
1464  // means that all memory between the first and last entries
1465  // belongs to the block (no striding). "No padding" means
1466  // that getBlockSize() * getBlockSize() is exactly the number
1467  // of entries that the block uses. For another place where
1468  // this assumption is encoded, see replaceLocalValues.
1469 
1470  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1471  // little_block_type A_old =
1472  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1473  impl_scalar_type* const A_old =
1474  vals_host_out_raw + absBlockOffset * perBlockSize;
1475  // const_little_block_type A_new =
1476  // getConstLocalBlockFromInput (vIn, pointOffset);
1477  const impl_scalar_type* const A_new = vIn + pointOffset;
1478  // AXPY (ONE, A_new, A_old);
1479  for (LO i = 0; i < perBlockSize; ++i) {
1480  A_old[i] += A_new[i];
1481  }
1482  hint = relBlockOffset + 1;
1483  ++validCount;
1484  }
1485  }
1486  return validCount;
1487  }
1488 
1489  template<class Scalar, class LO, class GO, class Node>
1490  LO
1492  getLocalRowView (const LO localRowInd,
1493  const LO*& colInds,
1494  Scalar*& vals,
1495  LO& numInds) const
1496  {
1497 #ifdef HAVE_TPETRA_DEBUG
1498  const char prefix[] =
1499  "Tpetra::BlockCrsMatrix::getLocalRowView: ";
1500 #endif // HAVE_TPETRA_DEBUG
1501 
1502  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1503  colInds = NULL;
1504  vals = NULL;
1505  numInds = 0;
1506  return Teuchos::OrdinalTraits<LO>::invalid ();
1507  }
1508  else {
1509  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
1510  colInds = indHost_.data () + absBlockOffsetStart;
1511 
1512 #ifdef HAVE_TPETRA_DEBUG
1513  TEUCHOS_TEST_FOR_EXCEPTION
1514  (this->need_sync_host (), std::runtime_error,
1515  prefix << "The matrix's data were last modified on device, but have "
1516  "not been sync'd to host. Please sync to host (by calling "
1517  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1518  "method.");
1519 #endif // HAVE_TPETRA_DEBUG
1520 
1521  auto vals_host_out = getValuesHost ();
1522  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1523  impl_scalar_type* const vOut = vals_host_out_raw +
1524  absBlockOffsetStart * offsetPerBlock ();
1525  vals = reinterpret_cast<Scalar*> (vOut);
1526 
1527  numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
1528  return 0; // indicates no error
1529  }
1530  }
1531 
1532  template<class Scalar, class LO, class GO, class Node>
1533  void
1535  getLocalRowCopy (LO LocalRow,
1536  const Teuchos::ArrayView<LO>& Indices,
1537  const Teuchos::ArrayView<Scalar>& Values,
1538  size_t &NumEntries) const
1539  {
1540  const LO *colInds;
1541  Scalar *vals;
1542  LO numInds;
1543  getLocalRowView(LocalRow,colInds,vals,numInds);
1544  if (numInds > Indices.size() || numInds*blockSize_*blockSize_ > Values.size()) {
1545  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1546  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1547  << numInds << " row entries");
1548  }
1549  for (LO i=0; i<numInds; ++i) {
1550  Indices[i] = colInds[i];
1551  }
1552  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1553  Values[i] = vals[i];
1554  }
1555  NumEntries = numInds;
1556  }
1557 
1558  template<class Scalar, class LO, class GO, class Node>
1559  LO
1561  getLocalRowOffsets (const LO localRowInd,
1562  ptrdiff_t offsets[],
1563  const LO colInds[],
1564  const LO numColInds) const
1565  {
1566  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1567  // We got no offsets, because the input local row index is
1568  // invalid on the calling process. That may not be an error, if
1569  // numColInds is zero anyway; it doesn't matter. This is the
1570  // advantage of returning the number of valid indices.
1571  return static_cast<LO> (0);
1572  }
1573 
1574  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1575  LO hint = 0; // Guess for the relative offset into the current row
1576  LO validCount = 0; // number of valid column indices in colInds
1577 
1578  for (LO k = 0; k < numColInds; ++k) {
1579  const LO relBlockOffset =
1580  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1581  if (relBlockOffset != LINV) {
1582  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1583  hint = relBlockOffset + 1;
1584  ++validCount;
1585  }
1586  }
1587  return validCount;
1588  }
1589 
1590 
1591  template<class Scalar, class LO, class GO, class Node>
1592  LO
1594  replaceLocalValuesByOffsets (const LO localRowInd,
1595  const ptrdiff_t offsets[],
1596  const Scalar vals[],
1597  const LO numOffsets) const
1598  {
1599  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1600  // We modified no values, because the input local row index is
1601  // invalid on the calling process. That may not be an error, if
1602  // numColInds is zero anyway; it doesn't matter. This is the
1603  // advantage of returning the number of valid indices.
1604  return static_cast<LO> (0);
1605  }
1606  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1607 
1608  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1609  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1610  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1611  size_t pointOffset = 0; // Current offset into input values
1612  LO validCount = 0; // number of valid offsets
1613 
1614  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1615  const size_t relBlockOffset = offsets[k];
1616  if (relBlockOffset != STINV) {
1617  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1618  little_block_type A_old =
1619  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1620  const_little_block_type A_new =
1621  getConstLocalBlockFromInput (vIn, pointOffset);
1622  COPY (A_new, A_old);
1623  ++validCount;
1624  }
1625  }
1626  return validCount;
1627  }
1628 
1629 
1630  template<class Scalar, class LO, class GO, class Node>
1631  LO
1633  absMaxLocalValuesByOffsets (const LO localRowInd,
1634  const ptrdiff_t offsets[],
1635  const Scalar vals[],
1636  const LO numOffsets) const
1637  {
1638  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1639  // We modified no values, because the input local row index is
1640  // invalid on the calling process. That may not be an error, if
1641  // numColInds is zero anyway; it doesn't matter. This is the
1642  // advantage of returning the number of valid indices.
1643  return static_cast<LO> (0);
1644  }
1645  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1646 
1647  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1648  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1649  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1650  size_t pointOffset = 0; // Current offset into input values
1651  LO validCount = 0; // number of valid offsets
1652 
1653  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1654  const size_t relBlockOffset = offsets[k];
1655  if (relBlockOffset != STINV) {
1656  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1657  little_block_type A_old =
1658  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1659  const_little_block_type A_new =
1660  getConstLocalBlockFromInput (vIn, pointOffset);
1661  ::Tpetra::Impl::absMax (A_old, A_new);
1662  ++validCount;
1663  }
1664  }
1665  return validCount;
1666  }
1667 
1668 
1669  template<class Scalar, class LO, class GO, class Node>
1670  LO
1672  sumIntoLocalValuesByOffsets (const LO localRowInd,
1673  const ptrdiff_t offsets[],
1674  const Scalar vals[],
1675  const LO numOffsets) const
1676  {
1677  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1678  // We modified no values, because the input local row index is
1679  // invalid on the calling process. That may not be an error, if
1680  // numColInds is zero anyway; it doesn't matter. This is the
1681  // advantage of returning the number of valid indices.
1682  return static_cast<LO> (0);
1683  }
1684  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1685  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1686 
1687  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1688  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1689  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1690  size_t pointOffset = 0; // Current offset into input values
1691  LO validCount = 0; // number of valid offsets
1692 
1693  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1694  const size_t relBlockOffset = offsets[k];
1695  if (relBlockOffset != STINV) {
1696  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1697  little_block_type A_old =
1698  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1699  const_little_block_type A_new =
1700  getConstLocalBlockFromInput (vIn, pointOffset);
1701  //A_old.update (ONE, A_new);
1702  AXPY (ONE, A_new, A_old);
1703  ++validCount;
1704  }
1705  }
1706  return validCount;
1707  }
1708 
1709 
1710  template<class Scalar, class LO, class GO, class Node>
1711  size_t
1713  getNumEntriesInLocalRow (const LO localRowInd) const
1714  {
1715  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1716  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1717  return static_cast<LO> (0); // the calling process doesn't have that row
1718  } else {
1719  return static_cast<LO> (numEntInGraph);
1720  }
1721  }
1722 
1723  template<class Scalar, class LO, class GO, class Node>
1724  void
1728  const Teuchos::ETransp mode,
1729  const Scalar alpha,
1730  const Scalar beta)
1731  {
1732  (void) X;
1733  (void) Y;
1734  (void) mode;
1735  (void) alpha;
1736  (void) beta;
1737 
1738  TEUCHOS_TEST_FOR_EXCEPTION(
1739  true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
1740  "transpose and conjugate transpose modes are not yet implemented.");
1741  }
1742 
1743  template<class Scalar, class LO, class GO, class Node>
1744  void
1745  BlockCrsMatrix<Scalar, LO, GO, Node>::
1746  applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1747  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1748  const Scalar alpha,
1749  const Scalar beta)
1750  {
1751  using Teuchos::RCP;
1752  using Teuchos::rcp;
1753  typedef ::Tpetra::Import<LO, GO, Node> import_type;
1754  typedef ::Tpetra::Export<LO, GO, Node> export_type;
1755  const Scalar zero = STS::zero ();
1756  const Scalar one = STS::one ();
1757  RCP<const import_type> import = graph_.getImporter ();
1758  // "export" is a reserved C++ keyword, so we can't use it.
1759  RCP<const export_type> theExport = graph_.getExporter ();
1760  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1761 
1762  if (alpha == zero) {
1763  if (beta == zero) {
1764  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1765  }
1766  else if (beta != one) {
1767  Y.scale (beta);
1768  }
1769  }
1770  else { // alpha != 0
1771  const BMV* X_colMap = NULL;
1772  if (import.is_null ()) {
1773  try {
1774  X_colMap = &X;
1775  }
1776  catch (std::exception& e) {
1777  TEUCHOS_TEST_FOR_EXCEPTION
1778  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1779  "operator= threw an exception: " << std::endl << e.what ());
1780  }
1781  }
1782  else {
1783  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1784  // Y_rowMap_ below. This lets us do lazy initialization
1785  // correctly with view semantics of BlockCrsMatrix. All views
1786  // of this BlockCrsMatrix have the same outer pointer. That
1787  // way, we can set the inner pointer in one view, and all
1788  // other views will see it.
1789  if ((*X_colMap_).is_null () ||
1790  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1791  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1792  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1793  static_cast<LO> (X.getNumVectors ())));
1794  }
1795  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1796  **pointImporter_,
1798  try {
1799  X_colMap = &(**X_colMap_);
1800  }
1801  catch (std::exception& e) {
1802  TEUCHOS_TEST_FOR_EXCEPTION
1803  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1804  "operator= threw an exception: " << std::endl << e.what ());
1805  }
1806  }
1807 
1808  BMV* Y_rowMap = NULL;
1809  if (theExport.is_null ()) {
1810  Y_rowMap = &Y;
1811  }
1812  else if ((*Y_rowMap_).is_null () ||
1813  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1814  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1815  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1816  static_cast<LO> (X.getNumVectors ())));
1817  try {
1818  Y_rowMap = &(**Y_rowMap_);
1819  }
1820  catch (std::exception& e) {
1821  TEUCHOS_TEST_FOR_EXCEPTION(
1822  true, std::logic_error, prefix << "Tpetra::MultiVector::"
1823  "operator= threw an exception: " << std::endl << e.what ());
1824  }
1825  }
1826 
1827  try {
1828  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1829  }
1830  catch (std::exception& e) {
1831  TEUCHOS_TEST_FOR_EXCEPTION
1832  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1833  "an exception: " << e.what ());
1834  }
1835  catch (...) {
1836  TEUCHOS_TEST_FOR_EXCEPTION
1837  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1838  "an exception not a subclass of std::exception.");
1839  }
1840 
1841  if (! theExport.is_null ()) {
1842  Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1843  }
1844  }
1845  }
1846 
1847  template<class Scalar, class LO, class GO, class Node>
1848  void
1849  BlockCrsMatrix<Scalar, LO, GO, Node>::
1850  localApplyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1851  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1852  const Scalar alpha,
1853  const Scalar beta)
1854  {
1855  using ::Tpetra::Impl::bcrsLocalApplyNoTrans;
1856 
1857  const impl_scalar_type alpha_impl = alpha;
1858  const auto graph = this->graph_.getLocalGraph ();
1859  const impl_scalar_type beta_impl = beta;
1860  const LO blockSize = this->getBlockSize ();
1861 
1862  auto X_mv = X.getMultiVectorView ();
1863  auto Y_mv = Y.getMultiVectorView ();
1864  Y_mv.template modify<device_type> ();
1865 
1866  auto X_lcl = X_mv.template getLocalView<device_type> ();
1867  auto Y_lcl = Y_mv.template getLocalView<device_type> ();
1868  auto val = this->val_.template view<device_type> ();
1869 
1870  bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1871  beta_impl, Y_lcl);
1872  }
1873 
1874  template<class Scalar, class LO, class GO, class Node>
1875  LO
1876  BlockCrsMatrix<Scalar, LO, GO, Node>::
1877  findRelOffsetOfColumnIndex (const LO localRowIndex,
1878  const LO colIndexToFind,
1879  const LO hint) const
1880  {
1881  const size_t absStartOffset = ptrHost_[localRowIndex];
1882  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1883  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1884  // Amortize pointer arithmetic over the search loop.
1885  const LO* const curInd = indHost_.data () + absStartOffset;
1886 
1887  // If the hint was correct, then the hint is the offset to return.
1888  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1889  // Always return the offset relative to the current row.
1890  return hint;
1891  }
1892 
1893  // The hint was wrong, so we must search for the given column
1894  // index in the column indices for the given row.
1895  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1896 
1897  // We require that the graph have sorted rows. However, binary
1898  // search only pays if the current row is longer than a certain
1899  // amount. We set this to 32, but you might want to tune this.
1900  const LO maxNumEntriesForLinearSearch = 32;
1901  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1902  // Use binary search. It would probably be better for us to
1903  // roll this loop by hand. If we wrote it right, a smart
1904  // compiler could perhaps use conditional loads and avoid
1905  // branches (according to Jed Brown on May 2014).
1906  const LO* beg = curInd;
1907  const LO* end = curInd + numEntriesInRow;
1908  std::pair<const LO*, const LO*> p =
1909  std::equal_range (beg, end, colIndexToFind);
1910  if (p.first != p.second) {
1911  // offset is relative to the current row
1912  relOffset = static_cast<LO> (p.first - beg);
1913  }
1914  }
1915  else { // use linear search
1916  for (LO k = 0; k < numEntriesInRow; ++k) {
1917  if (colIndexToFind == curInd[k]) {
1918  relOffset = k; // offset is relative to the current row
1919  break;
1920  }
1921  }
1922  }
1923 
1924  return relOffset;
1925  }
1926 
1927  template<class Scalar, class LO, class GO, class Node>
1928  LO
1929  BlockCrsMatrix<Scalar, LO, GO, Node>::
1930  offsetPerBlock () const
1931  {
1932  return offsetPerBlock_;
1933  }
1934 
1935  template<class Scalar, class LO, class GO, class Node>
1936  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1937  BlockCrsMatrix<Scalar, LO, GO, Node>::
1938  getConstLocalBlockFromInput (const impl_scalar_type* val,
1939  const size_t pointOffset) const
1940  {
1941  // Row major blocks
1942  const LO rowStride = blockSize_;
1943  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1944  }
1945 
1946  template<class Scalar, class LO, class GO, class Node>
1947  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1948  BlockCrsMatrix<Scalar, LO, GO, Node>::
1949  getNonConstLocalBlockFromInput (impl_scalar_type* val,
1950  const size_t pointOffset) const
1951  {
1952  // Row major blocks
1953  const LO rowStride = blockSize_;
1954  return little_block_type (val + pointOffset, blockSize_, rowStride);
1955  }
1956 
1957  template<class Scalar, class LO, class GO, class Node>
1958  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1959  BlockCrsMatrix<Scalar, LO, GO, Node>::
1960  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
1961  {
1962 #ifdef HAVE_TPETRA_DEBUG
1963  const char prefix[] =
1964  "Tpetra::BlockCrsMatrix::getConstLocalBlockFromAbsOffset: ";
1965 #endif // HAVE_TPETRA_DEBUG
1966 
1967  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
1968  // An empty block signifies an error. We don't expect to see
1969  // this error in correct code, but it's helpful for avoiding
1970  // memory corruption in case there is a bug.
1971  return const_little_block_type ();
1972  }
1973  else {
1974 #ifdef HAVE_TPETRA_DEBUG
1975  TEUCHOS_TEST_FOR_EXCEPTION
1976  (this->need_sync_host (), std::runtime_error,
1977  prefix << "The matrix's data were last modified on device, but have "
1978  "not been sync'd to host. Please sync to host (by calling "
1979  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1980  "method.");
1981 #endif // HAVE_TPETRA_DEBUG
1982  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
1983 
1984  auto vals_host = getValuesHost ();
1985  const impl_scalar_type* vals_host_raw = vals_host.data ();
1986 
1987  return getConstLocalBlockFromInput (vals_host_raw, absPointOffset);
1988  }
1989  }
1990 
1991  template<class Scalar, class LO, class GO, class Node>
1992  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1993  BlockCrsMatrix<Scalar, LO, GO, Node>::
1994  getConstLocalBlockFromRelOffset (const LO lclMeshRow,
1995  const size_t relMeshOffset) const
1996  {
1997  typedef impl_scalar_type IST;
1998 
1999  const LO* lclColInds = NULL;
2000  Scalar* lclVals = NULL;
2001  LO numEnt = 0;
2002 
2003  LO err = this->getLocalRowView (lclMeshRow, lclColInds, lclVals, numEnt);
2004  if (err != 0) {
2005  // An empty block signifies an error. We don't expect to see
2006  // this error in correct code, but it's helpful for avoiding
2007  // memory corruption in case there is a bug.
2008  return const_little_block_type ();
2009  }
2010  else {
2011  const size_t relPointOffset = relMeshOffset * this->offsetPerBlock ();
2012  IST* lclValsImpl = reinterpret_cast<IST*> (lclVals);
2013  return this->getConstLocalBlockFromInput (const_cast<const IST*> (lclValsImpl),
2014  relPointOffset);
2015  }
2016  }
2017 
2018  template<class Scalar, class LO, class GO, class Node>
2019  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
2020  BlockCrsMatrix<Scalar, LO, GO, Node>::
2021  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
2022  {
2023 #ifdef HAVE_TPETRA_DEBUG
2024  const char prefix[] =
2025  "Tpetra::BlockCrsMatrix::getNonConstLocalBlockFromAbsOffset: ";
2026 #endif // HAVE_TPETRA_DEBUG
2027 
2028  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
2029  // An empty block signifies an error. We don't expect to see
2030  // this error in correct code, but it's helpful for avoiding
2031  // memory corruption in case there is a bug.
2032  return little_block_type ();
2033  }
2034  else {
2035  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
2036 #ifdef HAVE_TPETRA_DEBUG
2037  TEUCHOS_TEST_FOR_EXCEPTION
2038  (this->need_sync_host (), std::runtime_error,
2039  prefix << "The matrix's data were last modified on device, but have "
2040  "not been sync'd to host. Please sync to host (by calling "
2041  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
2042  "method.");
2043 #endif // HAVE_TPETRA_DEBUG
2044  auto vals_host = getValuesHost();
2045  impl_scalar_type* vals_host_raw = vals_host.data ();
2046  return getNonConstLocalBlockFromInput (vals_host_raw, absPointOffset);
2047  }
2048  }
2049 
2050  template<class Scalar, class LO, class GO, class Node>
2051  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
2052  BlockCrsMatrix<Scalar, LO, GO, Node>::
2053  getLocalBlock (const LO localRowInd, const LO localColInd) const
2054  {
2055  const size_t absRowBlockOffset = ptrHost_[localRowInd];
2056  const LO relBlockOffset =
2057  this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
2058 
2059  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
2060  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
2061  return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
2062  }
2063  else {
2064  return little_block_type ();
2065  }
2066  }
2067 
2068  // template<class Scalar, class LO, class GO, class Node>
2069  // void
2070  // BlockCrsMatrix<Scalar, LO, GO, Node>::
2071  // clearLocalErrorStateAndStream ()
2072  // {
2073  // typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2074  // * (const_cast<this_type*> (this)->localError_) = false;
2075  // *errs_ = Teuchos::null;
2076  // }
2077 
2078  template<class Scalar, class LO, class GO, class Node>
2079  bool
2080  BlockCrsMatrix<Scalar, LO, GO, Node>::
2081  checkSizes (const ::Tpetra::SrcDistObject& source)
2082  {
2083  using std::endl;
2084  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2085  const this_type* src = dynamic_cast<const this_type* > (&source);
2086 
2087  if (src == NULL) {
2088  std::ostream& err = markLocalErrorAndGetStream ();
2089  err << "checkSizes: The source object of the Import or Export "
2090  "must be a BlockCrsMatrix with the same template parameters as the "
2091  "target object." << endl;
2092  }
2093  else {
2094  // Use a string of ifs, not if-elseifs, because we want to know
2095  // all the errors.
2096  if (src->getBlockSize () != this->getBlockSize ()) {
2097  std::ostream& err = markLocalErrorAndGetStream ();
2098  err << "checkSizes: The source and target objects of the Import or "
2099  << "Export must have the same block sizes. The source's block "
2100  << "size = " << src->getBlockSize () << " != the target's block "
2101  << "size = " << this->getBlockSize () << "." << endl;
2102  }
2103  if (! src->graph_.isFillComplete ()) {
2104  std::ostream& err = markLocalErrorAndGetStream ();
2105  err << "checkSizes: The source object of the Import or Export is "
2106  "not fill complete. Both source and target objects must be fill "
2107  "complete." << endl;
2108  }
2109  if (! this->graph_.isFillComplete ()) {
2110  std::ostream& err = markLocalErrorAndGetStream ();
2111  err << "checkSizes: The target object of the Import or Export is "
2112  "not fill complete. Both source and target objects must be fill "
2113  "complete." << endl;
2114  }
2115  if (src->graph_.getColMap ().is_null ()) {
2116  std::ostream& err = markLocalErrorAndGetStream ();
2117  err << "checkSizes: The source object of the Import or Export does "
2118  "not have a column Map. Both source and target objects must have "
2119  "column Maps." << endl;
2120  }
2121  if (this->graph_.getColMap ().is_null ()) {
2122  std::ostream& err = markLocalErrorAndGetStream ();
2123  err << "checkSizes: The target object of the Import or Export does "
2124  "not have a column Map. Both source and target objects must have "
2125  "column Maps." << endl;
2126  }
2127  }
2128 
2129  return ! (* (this->localError_));
2130  }
2131 
2132  template<class Scalar, class LO, class GO, class Node>
2133  void
2134  BlockCrsMatrix<Scalar, LO, GO, Node>::
2135 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
2136  copyAndPermuteNew
2137 #else // TPETRA_ENABLE_DEPRECATED_CODE
2138  copyAndPermute
2139 #endif // TPETRA_ENABLE_DEPRECATED_CODE
2140  (const ::Tpetra::SrcDistObject& source,
2141  const size_t numSameIDs,
2142  const Kokkos::DualView<const local_ordinal_type*,
2143  buffer_device_type>& permuteToLIDs,
2144  const Kokkos::DualView<const local_ordinal_type*,
2145  buffer_device_type>& permuteFromLIDs)
2146  {
2147  using ::Tpetra::Details::Behavior;
2149  using ::Tpetra::Details::ProfilingRegion;
2150  using std::endl;
2151  using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
2152 
2153  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
2154  const bool debug = Behavior::debug();
2155  const bool verbose = Behavior::verbose();
2156 
2157  // Define this function prefix
2158  std::string prefix;
2159  {
2160  std::ostringstream os;
2161  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2162  os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
2163  prefix = os.str();
2164  }
2165 
2166  // check if this already includes a local error
2167  if (* (this->localError_)) {
2168  std::ostream& err = this->markLocalErrorAndGetStream ();
2169  err << prefix
2170  << "The target object of the Import or Export is already in an error state."
2171  << endl;
2172  return;
2173  }
2174 
2175  //
2176  // Verbose input dual view status
2177  //
2178  if (verbose) {
2179  std::ostringstream os;
2180  os << prefix << endl
2181  << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
2182  << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
2183  std::cerr << os.str ();
2184  }
2185 
2189  if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
2190  std::ostream& err = this->markLocalErrorAndGetStream ();
2191  err << prefix
2192  << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
2193  << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
2194  << "." << endl;
2195  return;
2196  }
2197  if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
2198  std::ostream& err = this->markLocalErrorAndGetStream ();
2199  err << prefix
2200  << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
2201  << endl;
2202  return;
2203  }
2204 
2205  const this_type* src = dynamic_cast<const this_type* > (&source);
2206  if (src == NULL) {
2207  std::ostream& err = this->markLocalErrorAndGetStream ();
2208  err << prefix
2209  << "The source (input) object of the Import or "
2210  "Export is either not a BlockCrsMatrix, or does not have the right "
2211  "template parameters. checkSizes() should have caught this. "
2212  "Please report this bug to the Tpetra developers." << endl;
2213  return;
2214  }
2215  else {
2216  // Kyungjoo: where is val_ modified ?
2217  // When we have dual view as a member variable,
2218  // which function should make sure the val_ is upto date ?
2219  // IMO, wherever it is used, the function should check its
2220  // availability.
2221  const_cast<this_type*>(src)->sync_host();
2222  }
2223  this->sync_host();
2224 
2225  bool lclErr = false;
2226 #ifdef HAVE_TPETRA_DEBUG
2227  std::set<LO> invalidSrcCopyRows;
2228  std::set<LO> invalidDstCopyRows;
2229  std::set<LO> invalidDstCopyCols;
2230  std::set<LO> invalidDstPermuteCols;
2231  std::set<LO> invalidPermuteFromRows;
2232 #endif // HAVE_TPETRA_DEBUG
2233 
2234  // Copy the initial sequence of rows that are the same.
2235  //
2236  // The two graphs might have different column Maps, so we need to
2237  // do this using global column indices. This is purely local, so
2238  // we only need to check for local sameness of the two column
2239  // Maps.
2240 
2241 #ifdef HAVE_TPETRA_DEBUG
2242  const map_type& srcRowMap = * (src->graph_.getRowMap ());
2243 #endif // HAVE_TPETRA_DEBUG
2244  const map_type& dstRowMap = * (this->graph_.getRowMap ());
2245  const map_type& srcColMap = * (src->graph_.getColMap ());
2246  const map_type& dstColMap = * (this->graph_.getColMap ());
2247  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
2248 
2249  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
2250  if (verbose) {
2251  std::ostringstream os;
2252  os << prefix
2253  << "canUseLocalColumnIndices: "
2254  << (canUseLocalColumnIndices ? "true" : "false")
2255  << ", numPermute: " << numPermute
2256  << endl;
2257  std::cerr << os.str ();
2258  }
2259 
2260  const auto permuteToLIDsHost = permuteToLIDs.view_host();
2261  const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
2262 
2263  if (canUseLocalColumnIndices) {
2264  // Copy local rows that are the "same" in both source and target.
2265  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2266 #ifdef HAVE_TPETRA_DEBUG
2267  if (! srcRowMap.isNodeLocalElement (localRow)) {
2268  lclErr = true;
2269  invalidSrcCopyRows.insert (localRow);
2270  continue; // skip invalid rows
2271  }
2272 #endif // HAVE_TPETRA_DEBUG
2273 
2274  const LO* lclSrcCols;
2275  Scalar* vals;
2276  LO numEntries;
2277  // If this call fails, that means the mesh row local index is
2278  // invalid. That means the Import or Export is invalid somehow.
2279  LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2280  if (err != 0) {
2281  lclErr = true;
2282 #ifdef HAVE_TPETRA_DEBUG
2283  (void) invalidSrcCopyRows.insert (localRow);
2284 #endif // HAVE_TPETRA_DEBUG
2285  }
2286  else {
2287  err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
2288  if (err != numEntries) {
2289  lclErr = true;
2290  if (! dstRowMap.isNodeLocalElement (localRow)) {
2291 #ifdef HAVE_TPETRA_DEBUG
2292  invalidDstCopyRows.insert (localRow);
2293 #endif // HAVE_TPETRA_DEBUG
2294  }
2295  else {
2296  // Once there's an error, there's no sense in saving
2297  // time, so we check whether the column indices were
2298  // invalid. However, only remember which ones were
2299  // invalid in a debug build, because that might take a
2300  // lot of space.
2301  for (LO k = 0; k < numEntries; ++k) {
2302  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
2303  lclErr = true;
2304 #ifdef HAVE_TPETRA_DEBUG
2305  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
2306 #endif // HAVE_TPETRA_DEBUG
2307  }
2308  }
2309  }
2310  }
2311  }
2312  } // for each "same" local row
2313 
2314  // Copy the "permute" local rows.
2315  for (size_t k = 0; k < numPermute; ++k) {
2316  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
2317  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
2318 
2319  const LO* lclSrcCols;
2320  Scalar* vals;
2321  LO numEntries;
2322  LO err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2323  if (err != 0) {
2324  lclErr = true;
2325 #ifdef HAVE_TPETRA_DEBUG
2326  invalidPermuteFromRows.insert (srcLclRow);
2327 #endif // HAVE_TPETRA_DEBUG
2328  }
2329  else {
2330  err = this->replaceLocalValues (dstLclRow, lclSrcCols, vals, numEntries);
2331  if (err != numEntries) {
2332  lclErr = true;
2333 #ifdef HAVE_TPETRA_DEBUG
2334  for (LO c = 0; c < numEntries; ++c) {
2335  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
2336  invalidDstPermuteCols.insert (lclSrcCols[c]);
2337  }
2338  }
2339 #endif // HAVE_TPETRA_DEBUG
2340  }
2341  }
2342  }
2343  }
2344  else { // must convert column indices to global
2345  // Reserve space to store the destination matrix's local column indices.
2346  const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
2347  Teuchos::Array<LO> lclDstCols (maxNumEnt);
2348 
2349  // Copy local rows that are the "same" in both source and target.
2350  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2351  const LO* lclSrcCols;
2352  Scalar* vals;
2353  LO numEntries;
2354  // If this call fails, that means the mesh row local index is
2355  // invalid. That means the Import or Export is invalid somehow.
2356  LO err = 0;
2357  try {
2358  err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2359  } catch (std::exception& e) {
2360  if (debug) {
2361  std::ostringstream os;
2362  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2363  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2364  << localRow << ", src->getLocalRowView() threw an exception: "
2365  << e.what ();
2366  std::cerr << os.str ();
2367  }
2368  throw e;
2369  }
2370 
2371  if (err != 0) {
2372  lclErr = true;
2373 #ifdef HAVE_TPETRA_DEBUG
2374  invalidSrcCopyRows.insert (localRow);
2375 #endif // HAVE_TPETRA_DEBUG
2376  }
2377  else {
2378  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2379  lclErr = true;
2380  if (debug) {
2381  std::ostringstream os;
2382  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2383  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2384  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
2385  << maxNumEnt << endl;
2386  std::cerr << os.str ();
2387  }
2388  }
2389  else {
2390  // Convert the source matrix's local column indices to the
2391  // destination matrix's local column indices.
2392  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2393  for (LO j = 0; j < numEntries; ++j) {
2394  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2395  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2396  lclErr = true;
2397 #ifdef HAVE_TPETRA_DEBUG
2398  invalidDstCopyCols.insert (lclDstColsView[j]);
2399 #endif // HAVE_TPETRA_DEBUG
2400  }
2401  }
2402  try {
2403  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
2404  } catch (std::exception& e) {
2405  if (debug) {
2406  std::ostringstream os;
2407  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2408  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2409  << localRow << ", this->replaceLocalValues() threw an exception: "
2410  << e.what ();
2411  std::cerr << os.str ();
2412  }
2413  throw e;
2414  }
2415  if (err != numEntries) {
2416  lclErr = true;
2417  if (debug) {
2418  std::ostringstream os;
2419  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2420  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
2421  "localRow " << localRow << ", this->replaceLocalValues "
2422  "returned " << err << " instead of numEntries = "
2423  << numEntries << endl;
2424  std::cerr << os.str ();
2425  }
2426  }
2427  }
2428  }
2429  }
2430 
2431  // Copy the "permute" local rows.
2432  for (size_t k = 0; k < numPermute; ++k) {
2433  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
2434  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
2435 
2436  const LO* lclSrcCols;
2437  Scalar* vals;
2438  LO numEntries;
2439  LO err = 0;
2440  try {
2441  err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2442  } catch (std::exception& e) {
2443  if (debug) {
2444  std::ostringstream os;
2445  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2446  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
2447  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
2448  << ", src->getLocalRowView() threw an exception: " << e.what ();
2449  std::cerr << os.str ();
2450  }
2451  throw e;
2452  }
2453 
2454  if (err != 0) {
2455  lclErr = true;
2456 #ifdef HAVE_TPETRA_DEBUG
2457  invalidPermuteFromRows.insert (srcLclRow);
2458 #endif // HAVE_TPETRA_DEBUG
2459  }
2460  else {
2461  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2462  lclErr = true;
2463  }
2464  else {
2465  // Convert the source matrix's local column indices to the
2466  // destination matrix's local column indices.
2467  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2468  for (LO j = 0; j < numEntries; ++j) {
2469  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2470  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2471  lclErr = true;
2472 #ifdef HAVE_TPETRA_DEBUG
2473  invalidDstPermuteCols.insert (lclDstColsView[j]);
2474 #endif // HAVE_TPETRA_DEBUG
2475  }
2476  }
2477  err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), vals, numEntries);
2478  if (err != numEntries) {
2479  lclErr = true;
2480  }
2481  }
2482  }
2483  }
2484  }
2485 
2486  if (lclErr) {
2487  std::ostream& err = this->markLocalErrorAndGetStream ();
2488 #ifdef HAVE_TPETRA_DEBUG
2489  err << "copyAndPermute: The graph structure of the source of the "
2490  "Import or Export must be a subset of the graph structure of the "
2491  "target. ";
2492  err << "invalidSrcCopyRows = [";
2493  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2494  it != invalidSrcCopyRows.end (); ++it) {
2495  err << *it;
2496  typename std::set<LO>::const_iterator itp1 = it;
2497  itp1++;
2498  if (itp1 != invalidSrcCopyRows.end ()) {
2499  err << ",";
2500  }
2501  }
2502  err << "], invalidDstCopyRows = [";
2503  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2504  it != invalidDstCopyRows.end (); ++it) {
2505  err << *it;
2506  typename std::set<LO>::const_iterator itp1 = it;
2507  itp1++;
2508  if (itp1 != invalidDstCopyRows.end ()) {
2509  err << ",";
2510  }
2511  }
2512  err << "], invalidDstCopyCols = [";
2513  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2514  it != invalidDstCopyCols.end (); ++it) {
2515  err << *it;
2516  typename std::set<LO>::const_iterator itp1 = it;
2517  itp1++;
2518  if (itp1 != invalidDstCopyCols.end ()) {
2519  err << ",";
2520  }
2521  }
2522  err << "], invalidDstPermuteCols = [";
2523  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2524  it != invalidDstPermuteCols.end (); ++it) {
2525  err << *it;
2526  typename std::set<LO>::const_iterator itp1 = it;
2527  itp1++;
2528  if (itp1 != invalidDstPermuteCols.end ()) {
2529  err << ",";
2530  }
2531  }
2532  err << "], invalidPermuteFromRows = [";
2533  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2534  it != invalidPermuteFromRows.end (); ++it) {
2535  err << *it;
2536  typename std::set<LO>::const_iterator itp1 = it;
2537  itp1++;
2538  if (itp1 != invalidPermuteFromRows.end ()) {
2539  err << ",";
2540  }
2541  }
2542  err << "]" << endl;
2543 #else
2544  err << "copyAndPermute: The graph structure of the source of the "
2545  "Import or Export must be a subset of the graph structure of the "
2546  "target." << endl;
2547 #endif // HAVE_TPETRA_DEBUG
2548  }
2549 
2550  if (debug) {
2551  std::ostringstream os;
2552  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2553  const bool lclSuccess = ! (* (this->localError_));
2554  os << "*** Proc " << myRank << ": copyAndPermute "
2555  << (lclSuccess ? "succeeded" : "FAILED");
2556  if (lclSuccess) {
2557  os << endl;
2558  } else {
2559  os << ": error messages: " << this->errorMessages (); // comes w/ endl
2560  }
2561  std::cerr << os.str ();
2562  }
2563  }
2564 
2565  namespace { // (anonymous)
2566 
2585  template<class LO, class GO, class D>
2586  size_t
2587  packRowCount (const size_t numEnt,
2588  const size_t numBytesPerValue,
2589  const size_t blkSize)
2590  {
2591  using ::Tpetra::Details::PackTraits;
2592 
2593  if (numEnt == 0) {
2594  // Empty rows always take zero bytes, to ensure sparsity.
2595  return 0;
2596  }
2597  else {
2598  // We store the number of entries as a local index (LO).
2599  LO numEntLO = 0; // packValueCount wants this.
2600  GO gid;
2601  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2602  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2603  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2604  return numEntLen + gidsLen + valsLen;
2605  }
2606  }
2607 
2618  template<class ST, class LO, class GO, class D>
2619  size_t
2620  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
2621  const size_t offset,
2622  const size_t numBytes,
2623  const size_t /* numBytesPerValue */)
2624  {
2625  using Kokkos::subview;
2626  using ::Tpetra::Details::PackTraits;
2627 
2628  if (numBytes == 0) {
2629  // Empty rows always take zero bytes, to ensure sparsity.
2630  return static_cast<size_t> (0);
2631  }
2632  else {
2633  LO numEntLO = 0;
2634  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
2635  TEUCHOS_TEST_FOR_EXCEPTION
2636  (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2637  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
2638  << ".");
2639  const char* const inBuf = imports.data () + offset;
2640  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
2641  TEUCHOS_TEST_FOR_EXCEPTION
2642  (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2643  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
2644  << ".");
2645  return static_cast<size_t> (numEntLO);
2646  }
2647  }
2648 
2652  template<class ST, class LO, class GO, class D>
2653  size_t
2654  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO, D>::output_buffer_type exports,
2655  const size_t offset,
2656  const size_t numEnt,
2657  const typename ::Tpetra::Details::PackTraits<GO, D>::input_array_type& gidsIn,
2658  const typename ::Tpetra::Details::PackTraits<ST, D>::input_array_type& valsIn,
2659  const size_t numBytesPerValue,
2660  const size_t blockSize)
2661  {
2662  using Kokkos::subview;
2663  using ::Tpetra::Details::PackTraits;
2664 
2665  if (numEnt == 0) {
2666  // Empty rows always take zero bytes, to ensure sparsity.
2667  return 0;
2668  }
2669  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2670 
2671  const GO gid = 0; // packValueCount wants this
2672  const LO numEntLO = static_cast<size_t> (numEnt);
2673 
2674  const size_t numEntBeg = offset;
2675  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2676  const size_t gidsBeg = numEntBeg + numEntLen;
2677  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2678  const size_t valsBeg = gidsBeg + gidsLen;
2679  const size_t valsLen = numScalarEnt * numBytesPerValue;
2680 
2681  char* const numEntOut = exports.data () + numEntBeg;
2682  char* const gidsOut = exports.data () + gidsBeg;
2683  char* const valsOut = exports.data () + valsBeg;
2684 
2685  size_t numBytesOut = 0;
2686  int errorCode = 0;
2687  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
2688 
2689  {
2690  Kokkos::pair<int, size_t> p;
2691  p = PackTraits<GO, D>::packArray (gidsOut, gidsIn.data (), numEnt);
2692  errorCode += p.first;
2693  numBytesOut += p.second;
2694 
2695  p = PackTraits<ST, D>::packArray (valsOut, valsIn.data (), numScalarEnt);
2696  errorCode += p.first;
2697  numBytesOut += p.second;
2698  }
2699 
2700  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2701  TEUCHOS_TEST_FOR_EXCEPTION
2702  (numBytesOut != expectedNumBytes, std::logic_error,
2703  "packRowForBlockCrs: numBytesOut = " << numBytesOut
2704  << " != expectedNumBytes = " << expectedNumBytes << ".");
2705 
2706  TEUCHOS_TEST_FOR_EXCEPTION
2707  (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
2708  "PackTraits::packArray returned a nonzero error code");
2709 
2710  return numBytesOut;
2711  }
2712 
2713  // Return the number of bytes actually read / used.
2714  template<class ST, class LO, class GO, class D>
2715  size_t
2716  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
2717  const typename ::Tpetra::Details::PackTraits<ST, D>::output_array_type& valsOut,
2718  const typename ::Tpetra::Details::PackTraits<int, D>::input_buffer_type& imports,
2719  const size_t offset,
2720  const size_t numBytes,
2721  const size_t numEnt,
2722  const size_t numBytesPerValue,
2723  const size_t blockSize)
2724  {
2725  using ::Tpetra::Details::PackTraits;
2726 
2727  if (numBytes == 0) {
2728  // Rows with zero bytes always have zero entries.
2729  return 0;
2730  }
2731  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2732  TEUCHOS_TEST_FOR_EXCEPTION
2733  (static_cast<size_t> (imports.extent (0)) <= offset,
2734  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2735  << imports.extent (0) << " <= offset = " << offset << ".");
2736  TEUCHOS_TEST_FOR_EXCEPTION
2737  (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
2738  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2739  << imports.extent (0) << " < offset + numBytes = "
2740  << (offset + numBytes) << ".");
2741 
2742  const GO gid = 0; // packValueCount wants this
2743  const LO lid = 0; // packValueCount wants this
2744 
2745  const size_t numEntBeg = offset;
2746  const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
2747  const size_t gidsBeg = numEntBeg + numEntLen;
2748  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2749  const size_t valsBeg = gidsBeg + gidsLen;
2750  const size_t valsLen = numScalarEnt * numBytesPerValue;
2751 
2752  const char* const numEntIn = imports.data () + numEntBeg;
2753  const char* const gidsIn = imports.data () + gidsBeg;
2754  const char* const valsIn = imports.data () + valsBeg;
2755 
2756  size_t numBytesOut = 0;
2757  int errorCode = 0;
2758  LO numEntOut;
2759  numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
2760  TEUCHOS_TEST_FOR_EXCEPTION
2761  (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2762  "unpackRowForBlockCrs: Expected number of entries " << numEnt
2763  << " != actual number of entries " << numEntOut << ".");
2764 
2765  {
2766  Kokkos::pair<int, size_t> p;
2767  p = PackTraits<GO, D>::unpackArray (gidsOut.data (), gidsIn, numEnt);
2768  errorCode += p.first;
2769  numBytesOut += p.second;
2770 
2771  p = PackTraits<ST, D>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
2772  errorCode += p.first;
2773  numBytesOut += p.second;
2774  }
2775 
2776  TEUCHOS_TEST_FOR_EXCEPTION
2777  (numBytesOut != numBytes, std::logic_error,
2778  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2779  << " != numBytes = " << numBytes << ".");
2780 
2781  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2782  TEUCHOS_TEST_FOR_EXCEPTION
2783  (numBytesOut != expectedNumBytes, std::logic_error,
2784  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2785  << " != expectedNumBytes = " << expectedNumBytes << ".");
2786 
2787  TEUCHOS_TEST_FOR_EXCEPTION
2788  (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
2789  "PackTraits::unpackArray returned a nonzero error code");
2790 
2791  return numBytesOut;
2792  }
2793  } // namespace (anonymous)
2794 
2795  template<class Scalar, class LO, class GO, class Node>
2796  void
2797  BlockCrsMatrix<Scalar, LO, GO, Node>::
2798 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
2799  packAndPrepareNew
2800 #else // TPETRA_ENABLE_DEPRECATED_CODE
2801  packAndPrepare
2802 #endif // TPETRA_ENABLE_DEPRECATED_CODE
2803  (const ::Tpetra::SrcDistObject& source,
2804  const Kokkos::DualView<const local_ordinal_type*,
2805  buffer_device_type>& exportLIDs,
2806  Kokkos::DualView<packet_type*,
2807  buffer_device_type>& exports, // output
2808  Kokkos::DualView<size_t*,
2809  buffer_device_type> numPacketsPerLID, // output
2810  size_t& constantNumPackets,
2811  Distributor& /* distor */)
2812  {
2813  using ::Tpetra::Details::Behavior;
2815  using ::Tpetra::Details::ProfilingRegion;
2816  using ::Tpetra::Details::PackTraits;
2817 
2818  typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
2819 
2820  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2821 
2822  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
2823 
2824  const bool debug = Behavior::debug();
2825  const bool verbose = Behavior::verbose();
2826 
2827  // Define this function prefix
2828  std::string prefix;
2829  {
2830  std::ostringstream os;
2831  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2832  os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
2833  prefix = os.str();
2834  }
2835 
2836  // check if this already includes a local error
2837  if (* (this->localError_)) {
2838  std::ostream& err = this->markLocalErrorAndGetStream ();
2839  err << prefix
2840  << "The target object of the Import or Export is already in an error state."
2841  << std::endl;
2842  return;
2843  }
2844 
2845  //
2846  // Verbose input dual view status
2847  //
2848  if (verbose) {
2849  std::ostringstream os;
2850  os << prefix << std::endl
2851  << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
2852  << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
2853  << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
2854  std::cerr << os.str ();
2855  }
2856 
2860  if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2861  std::ostream& err = this->markLocalErrorAndGetStream ();
2862  err << prefix
2863  << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
2864  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2865  << "." << std::endl;
2866  return;
2867  }
2868  if (exportLIDs.need_sync_host ()) {
2869  std::ostream& err = this->markLocalErrorAndGetStream ();
2870  err << prefix << "exportLIDs be sync'd to host." << std::endl;
2871  return;
2872  }
2873 
2874  const this_type* src = dynamic_cast<const this_type* > (&source);
2875  if (src == NULL) {
2876  std::ostream& err = this->markLocalErrorAndGetStream ();
2877  err << prefix
2878  << "The source (input) object of the Import or "
2879  "Export is either not a BlockCrsMatrix, or does not have the right "
2880  "template parameters. checkSizes() should have caught this. "
2881  "Please report this bug to the Tpetra developers." << std::endl;
2882  return;
2883  }
2884 
2885  // Graphs and matrices are allowed to have a variable number of
2886  // entries per row. We could test whether all rows have the same
2887  // number of entries, but DistObject can only use this
2888  // optimization if all rows on _all_ processes have the same
2889  // number of entries. Rather than do the all-reduce necessary to
2890  // test for this unlikely case, we tell DistObject (by setting
2891  // constantNumPackets to zero) to assume that different rows may
2892  // have different numbers of entries.
2893  constantNumPackets = 0;
2894 
2895  // const values
2896  const crs_graph_type& srcGraph = src->graph_;
2897  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2898  const size_t numExportLIDs = exportLIDs.extent (0);
2899  const size_t numBytesPerValue =
2900  PackTraits<impl_scalar_type, host_exec>
2901  ::packValueCount(this->val_.extent(0) ? this->val_.view_host()(0) : impl_scalar_type());
2902 
2903  // Compute the number of bytes ("packets") per row to pack. While
2904  // we're at it, compute the total # of block entries to send, and
2905  // the max # of block entries in any of the rows we're sending.
2906 
2907  Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2908 
2909  // Graph information is on host; let's do this on host parallel reduce
2910  auto exportLIDsHost = exportLIDs.view_host();
2911  auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2912  numPacketsPerLID.modify_host ();
2913  {
2914  using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
2915  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs);
2916  Kokkos::parallel_reduce
2917  (policy,
2918  [=](const size_t &i, typename reducer_type::value_type &update) {
2919  const LO lclRow = exportLIDsHost(i);
2920  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2921  numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2922 
2923  const size_t numBytes =
2924  packRowCount<LO, GO, host_exec> (numEnt, numBytesPerValue, blockSize);
2925  numPacketsPerLIDHost(i) = numBytes;
2926  update += typename reducer_type::value_type(numEnt, numBytes, numEnt);
2927  }, rowReducerStruct);
2928  }
2929 
2930  // Compute the number of bytes ("packets") per row to pack. While
2931  // we're at it, compute the total # of block entries to send, and
2932  // the max # of block entries in any of the rows we're sending.
2933  const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2934  const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2935  const size_t maxRowLength = rowReducerStruct.maxRowLength;
2936 
2937  if (verbose) {
2938  std::ostringstream os;
2939  os << prefix
2940  << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2941  << std::endl;
2942  std::cerr << os.str ();
2943  }
2944 
2945  // We use a "struct of arrays" approach to packing each row's
2946  // entries. All the column indices (as global indices) go first,
2947  // then all their owning process ranks, and then the values.
2948  if(exports.extent(0) != totalNumBytes)
2949  {
2950  const std::string oldLabel = exports.d_view.label ();
2951  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2952  exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2953  }
2954  if (totalNumEntries > 0) {
2955  // Current position (in bytes) in the 'exports' output array.
2956  Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2957  {
2958  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2959  Kokkos::parallel_scan
2960  (policy,
2961  [=](const size_t &i, size_t &update, const bool &final) {
2962  if (final) offset(i) = update;
2963  update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2964  });
2965  }
2966  if (offset(numExportLIDs) != totalNumBytes) {
2967  std::ostream& err = this->markLocalErrorAndGetStream ();
2968  err << prefix
2969  << "At end of method, the final offset (in bytes) "
2970  << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2971  << totalNumBytes << ". "
2972  << "Please report this bug to the Tpetra developers." << std::endl;
2973  return;
2974  }
2975 
2976  // For each block row of the matrix owned by the calling
2977  // process, pack that block row's column indices and values into
2978  // the exports array.
2979 
2980  // Source matrix's column Map. We verified in checkSizes() that
2981  // the column Map exists (is not null).
2982  const map_type& srcColMap = * (srcGraph.getColMap ());
2983 
2984  // Pack the data for each row to send, into the 'exports' buffer.
2985  // exports will be modified on host.
2986  exports.modify_host();
2987  {
2988  typedef Kokkos::TeamPolicy<host_exec> policy_type;
2989  const auto policy =
2990  policy_type(numExportLIDs, 1, 1)
2991  .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2992  Kokkos::parallel_for
2993  (policy,
2994  [=](const typename policy_type::member_type &member) {
2995  const size_t i = member.league_rank();
2996  Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2997  gblColInds(member.team_scratch(0), maxRowLength);
2998 
2999  const LO lclRowInd = exportLIDsHost(i);
3000  const LO* lclColIndsRaw;
3001  Scalar* valsRaw;
3002  LO numEntLO;
3003  // It's OK to ignore the return value, since if the calling
3004  // process doesn't own that local row, then the number of
3005  // entries in that row on the calling process is zero.
3006  (void) src->getLocalRowView (lclRowInd, lclColIndsRaw, valsRaw, numEntLO);
3007 
3008  const size_t numEnt = static_cast<size_t> (numEntLO);
3009  Kokkos::View<const LO*,host_exec> lclColInds (lclColIndsRaw, numEnt);
3010 
3011  // Convert column indices from local to global.
3012  for (size_t j = 0; j < numEnt; ++j)
3013  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
3014 
3015  // Kyungjoo: additional wrapping scratch view is necessary
3016  // the following function interface need the same execution space
3017  // host scratch space somehow is not considered same as the host_exec
3018  // Copy the row's data into the current spot in the exports array.
3019  const size_t numBytes = packRowForBlockCrs<impl_scalar_type,LO,GO,host_exec>
3020  (exports.view_host(),
3021  offset(i),
3022  numEnt,
3023  Kokkos::View<const GO*,host_exec>(gblColInds.data(), maxRowLength),
3024  Kokkos::View<const impl_scalar_type*,host_exec>(reinterpret_cast<const impl_scalar_type*>(valsRaw), numEnt*blockSize*blockSize),
3025  numBytesPerValue,
3026  blockSize);
3027 
3028  // numBytes should be same as the difference between offsets
3029  if (debug) {
3030  const size_t offsetDiff = offset(i+1) - offset(i);
3031  if (numBytes != offsetDiff) {
3032  std::ostringstream os;
3033  os << prefix
3034  << "numBytes computed from packRowForBlockCrs is different from "
3035  << "precomputed offset values, LID = " << i << std::endl;
3036  std::cerr << os.str ();
3037  }
3038  }
3039  }); // for each LID (of a row) to send
3040  }
3041  } // if totalNumEntries > 0
3042 
3043  if (debug) {
3044  std::ostringstream os;
3045  const bool lclSuccess = ! (* (this->localError_));
3046  os << prefix
3047  << (lclSuccess ? "succeeded" : "FAILED")
3048  << std::endl;
3049  std::cerr << os.str ();
3050  }
3051  }
3052 
3053  template<class Scalar, class LO, class GO, class Node>
3054  void
3055  BlockCrsMatrix<Scalar, LO, GO, Node>::
3056 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
3057  unpackAndCombineNew
3058 #else // TPETRA_ENABLE_DEPRECATED_CODE
3059  unpackAndCombine
3060 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3061  (const Kokkos::DualView<const local_ordinal_type*,
3062  buffer_device_type>& importLIDs,
3063  Kokkos::DualView<packet_type*,
3064  buffer_device_type> imports,
3065  Kokkos::DualView<size_t*,
3066  buffer_device_type> numPacketsPerLID,
3067  const size_t /* constantNumPackets */,
3068  Distributor& /* distor */,
3069  const CombineMode combineMode)
3070  {
3071  using ::Tpetra::Details::Behavior;
3073  using ::Tpetra::Details::ProfilingRegion;
3074  using ::Tpetra::Details::PackTraits;
3075  using std::endl;
3076  using host_exec =
3077  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
3078 
3079  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
3080  const bool verbose = Behavior::verbose ();
3081 
3082  // Define this function prefix
3083  std::string prefix;
3084  {
3085  std::ostringstream os;
3086  auto map = this->graph_.getRowMap ();
3087  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
3088  const int myRank = comm.is_null () ? -1 : comm->getRank ();
3089  os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
3090  prefix = os.str ();
3091  if (verbose) {
3092  os << "Start" << endl;
3093  std::cerr << os.str ();
3094  }
3095  }
3096 
3097  // check if this already includes a local error
3098  if (* (this->localError_)) {
3099  std::ostream& err = this->markLocalErrorAndGetStream ();
3100  std::ostringstream os;
3101  os << prefix << "{Im/Ex}port target is already in error." << endl;
3102  if (verbose) {
3103  std::cerr << os.str ();
3104  }
3105  err << os.str ();
3106  return;
3107  }
3108 
3112  if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
3113  std::ostream& err = this->markLocalErrorAndGetStream ();
3114  std::ostringstream os;
3115  os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
3116  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
3117  << "." << endl;
3118  if (verbose) {
3119  std::cerr << os.str ();
3120  }
3121  err << os.str ();
3122  return;
3123  }
3124 
3125  if (combineMode != ADD && combineMode != INSERT &&
3126  combineMode != REPLACE && combineMode != ABSMAX &&
3127  combineMode != ZERO) {
3128  std::ostream& err = this->markLocalErrorAndGetStream ();
3129  std::ostringstream os;
3130  os << prefix
3131  << "Invalid CombineMode value " << combineMode << ". Valid "
3132  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
3133  << std::endl;
3134  if (verbose) {
3135  std::cerr << os.str ();
3136  }
3137  err << os.str ();
3138  return;
3139  }
3140 
3141  if (this->graph_.getColMap ().is_null ()) {
3142  std::ostream& err = this->markLocalErrorAndGetStream ();
3143  std::ostringstream os;
3144  os << prefix << "Target matrix's column Map is null." << endl;
3145  if (verbose) {
3146  std::cerr << os.str ();
3147  }
3148  err << os.str ();
3149  return;
3150  }
3151 
3152  // Target matrix's column Map. Use to convert the global column
3153  // indices in the receive buffer to local indices. We verified in
3154  // checkSizes() that the column Map exists (is not null).
3155  const map_type& tgtColMap = * (this->graph_.getColMap ());
3156 
3157  // Const values
3158  const size_t blockSize = this->getBlockSize ();
3159  const size_t numImportLIDs = importLIDs.extent(0);
3160  // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
3161  // default-constructed instance could have a different size than
3162  // other instances. (We assume all nominally constructed
3163  // instances have the same size; that's not the issue here.) This
3164  // could be bad if the calling process has no entries, but other
3165  // processes have entries that they want to send to us.
3166  const size_t numBytesPerValue =
3167  PackTraits<impl_scalar_type, host_exec>::packValueCount
3168  (this->val_.extent (0) ? this->val_.view_host () (0) : impl_scalar_type ());
3169  const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
3170  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
3171 
3172  if (verbose) {
3173  std::ostringstream os;
3174  os << prefix << "combineMode: "
3175  << ::Tpetra::combineModeToString (combineMode)
3176  << ", blockSize: " << blockSize
3177  << ", numImportLIDs: " << numImportLIDs
3178  << ", numBytesPerValue: " << numBytesPerValue
3179  << ", maxRowNumEnt: " << maxRowNumEnt
3180  << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
3181  std::cerr << os.str ();
3182  }
3183 
3184  if (combineMode == ZERO || numImportLIDs == 0) {
3185  if (verbose) {
3186  std::ostringstream os;
3187  os << prefix << "Nothing to unpack. Done!" << std::endl;
3188  std::cerr << os.str ();
3189  }
3190  return;
3191  }
3192 
3193  // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
3194  // we can remove this sync.
3195  if (imports.need_sync_host ()) {
3196  imports.sync_host ();
3197  }
3198 
3199  // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
3200  // sync'd numPacketsPerLID to host, since it needs to do that in
3201  // order to post MPI_Irecv messages with the correct lengths. A
3202  // hypothetical device-specific boundary exchange implementation
3203  // could possibly receive data without sync'ing lengths to host,
3204  // but we don't need to design for that nonexistent thing yet.
3205 
3206  if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
3207  importLIDs.need_sync_host ()) {
3208  std::ostream& err = this->markLocalErrorAndGetStream ();
3209  std::ostringstream os;
3210  os << prefix << "All input DualViews must be sync'd to host by now. "
3211  << "imports_nc.need_sync_host()="
3212  << (imports.need_sync_host () ? "true" : "false")
3213  << ", numPacketsPerLID.need_sync_host()="
3214  << (numPacketsPerLID.need_sync_host () ? "true" : "false")
3215  << ", importLIDs.need_sync_host()="
3216  << (importLIDs.need_sync_host () ? "true" : "false")
3217  << "." << endl;
3218  if (verbose) {
3219  std::cerr << os.str ();
3220  }
3221  err << os.str ();
3222  return;
3223  }
3224 
3225  const auto importLIDsHost = importLIDs.view_host ();
3226  const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
3227 
3228  // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
3229  // loop, by only unpacking on final==true (when we know the
3230  // current offset's value).
3231 
3232  Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
3233  {
3234  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
3235  Kokkos::parallel_scan
3236  ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
3237  [=] (const size_t &i, size_t &update, const bool &final) {
3238  if (final) offset(i) = update;
3239  update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
3240  });
3241  }
3242 
3243  // this variable does not matter with a race condition (just error flag)
3244  //
3245  // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
3246  // or atomics on bool, so we use int instead. (I know we're not
3247  // launching a CUDA loop here, but we could in the future, and I'd
3248  // like to avoid potential trouble.)
3249  Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
3250  errorDuringUnpack ("errorDuringUnpack");
3251  errorDuringUnpack () = 0;
3252  {
3253  using policy_type = Kokkos::TeamPolicy<host_exec>;
3254  const auto policy = policy_type (numImportLIDs, 1, 1)
3255  .set_scratch_size (0, Kokkos::PerTeam (sizeof (GO) * maxRowNumEnt +
3256  sizeof (LO) * maxRowNumEnt +
3257  numBytesPerValue * maxRowNumScalarEnt));
3258  using host_scratch_space = typename host_exec::scratch_memory_space;
3259  using pair_type = Kokkos::pair<size_t, size_t>;
3260  Kokkos::parallel_for
3261  ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
3262  [=] (const typename policy_type::member_type& member) {
3263  const size_t i = member.league_rank();
3264 
3265  Kokkos::View<GO*, host_scratch_space> gblColInds
3266  (member.team_scratch (0), maxRowNumEnt);
3267  Kokkos::View<LO*, host_scratch_space> lclColInds
3268  (member.team_scratch (0), maxRowNumEnt);
3269  Kokkos::View<impl_scalar_type*, host_scratch_space> vals
3270  (member.team_scratch (0), maxRowNumScalarEnt);
3271 
3272  const size_t offval = offset(i);
3273  const LO lclRow = importLIDsHost(i);
3274  const size_t numBytes = numPacketsPerLIDHost(i);
3275  const size_t numEnt =
3276  unpackRowCount<impl_scalar_type, LO, GO, host_exec>
3277  (imports.view_host (), offval, numBytes, numBytesPerValue);
3278 
3279  if (numBytes > 0) {
3280  if (numEnt > maxRowNumEnt) {
3281  errorDuringUnpack() = 1;
3282  if (verbose) {
3283  std::ostringstream os;
3284  os << prefix
3285  << "At i = " << i << ", numEnt = " << numEnt
3286  << " > maxRowNumEnt = " << maxRowNumEnt
3287  << std::endl;
3288  std::cerr << os.str ();
3289  }
3290  return;
3291  }
3292  }
3293  const size_t numScalarEnt = numEnt*blockSize*blockSize;
3294  auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
3295  auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
3296  auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
3297 
3298  // Kyungjoo: additional wrapping scratch view is necessary
3299  // the following function interface need the same execution space
3300  // host scratch space somehow is not considered same as the host_exec
3301  size_t numBytesOut = 0;
3302  try {
3303  numBytesOut =
3304  unpackRowForBlockCrs<impl_scalar_type, LO, GO, host_exec>
3305  (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
3306  Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
3307  imports.view_host(),
3308  offval, numBytes, numEnt,
3309  numBytesPerValue, blockSize);
3310  }
3311  catch (std::exception& e) {
3312  errorDuringUnpack() = 1;
3313  if (verbose) {
3314  std::ostringstream os;
3315  os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
3316  << e.what () << endl;
3317  std::cerr << os.str ();
3318  }
3319  return;
3320  }
3321 
3322  if (numBytes != numBytesOut) {
3323  errorDuringUnpack() = 1;
3324  if (verbose) {
3325  std::ostringstream os;
3326  os << prefix
3327  << "At i = " << i << ", numBytes = " << numBytes
3328  << " != numBytesOut = " << numBytesOut << "."
3329  << std::endl;
3330  std::cerr << os.str();
3331  }
3332  return;
3333  }
3334 
3335  // Convert incoming global indices to local indices.
3336  for (size_t k = 0; k < numEnt; ++k) {
3337  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
3338  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
3339  if (verbose) {
3340  std::ostringstream os;
3341  os << prefix
3342  << "At i = " << i << ", GID " << gidsOut(k)
3343  << " is not owned by the calling process."
3344  << std::endl;
3345  std::cerr << os.str();
3346  }
3347  return;
3348  }
3349  }
3350 
3351  // Combine the incoming data with the matrix's current data.
3352  LO numCombd = 0;
3353  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
3354  const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
3355  (const_cast<const impl_scalar_type*> (valsOut.data ()));
3356  if (combineMode == ADD) {
3357  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3358  } else if (combineMode == INSERT || combineMode == REPLACE) {
3359  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3360  } else if (combineMode == ABSMAX) {
3361  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3362  }
3363 
3364  if (static_cast<LO> (numEnt) != numCombd) {
3365  errorDuringUnpack() = 1;
3366  if (verbose) {
3367  std::ostringstream os;
3368  os << prefix << "At i = " << i << ", numEnt = " << numEnt
3369  << " != numCombd = " << numCombd << "."
3370  << endl;
3371  std::cerr << os.str ();
3372  }
3373  return;
3374  }
3375  }); // for each import LID i
3376  }
3377 
3378  if (errorDuringUnpack () != 0) {
3379  std::ostream& err = this->markLocalErrorAndGetStream ();
3380  err << prefix << "Unpacking failed.";
3381  if (! verbose) {
3382  err << " Please run again with the environment variable setting "
3383  "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
3384  }
3385  err << endl;
3386  }
3387 
3388  if (verbose) {
3389  std::ostringstream os;
3390  const bool lclSuccess = ! (* (this->localError_));
3391  os << prefix
3392  << (lclSuccess ? "succeeded" : "FAILED")
3393  << std::endl;
3394  std::cerr << os.str ();
3395  }
3396  }
3397 
3398  template<class Scalar, class LO, class GO, class Node>
3399  std::string
3401  {
3402  using Teuchos::TypeNameTraits;
3403  std::ostringstream os;
3404  os << "\"Tpetra::BlockCrsMatrix\": { "
3405  << "Template parameters: { "
3406  << "Scalar: " << TypeNameTraits<Scalar>::name ()
3407  << "LO: " << TypeNameTraits<LO>::name ()
3408  << "GO: " << TypeNameTraits<GO>::name ()
3409  << "Node: " << TypeNameTraits<Node>::name ()
3410  << " }"
3411  << ", Label: \"" << this->getObjectLabel () << "\""
3412  << ", Global dimensions: ["
3413  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3414  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
3415  << ", Block size: " << getBlockSize ()
3416  << " }";
3417  return os.str ();
3418  }
3419 
3420 
3421  template<class Scalar, class LO, class GO, class Node>
3422  void
3424  describe (Teuchos::FancyOStream& out,
3425  const Teuchos::EVerbosityLevel verbLevel) const
3426  {
3427  using Teuchos::ArrayRCP;
3428  using Teuchos::CommRequest;
3429  using Teuchos::FancyOStream;
3430  using Teuchos::getFancyOStream;
3431  using Teuchos::ireceive;
3432  using Teuchos::isend;
3433  using Teuchos::outArg;
3434  using Teuchos::TypeNameTraits;
3435  using Teuchos::VERB_DEFAULT;
3436  using Teuchos::VERB_NONE;
3437  using Teuchos::VERB_LOW;
3438  using Teuchos::VERB_MEDIUM;
3439  // using Teuchos::VERB_HIGH;
3440  using Teuchos::VERB_EXTREME;
3441  using Teuchos::RCP;
3442  using Teuchos::wait;
3443  using std::endl;
3444 #ifdef HAVE_TPETRA_DEBUG
3445  const char prefix[] = "Tpetra::BlockCrsMatrix::describe: ";
3446 #endif // HAVE_TPETRA_DEBUG
3447 
3448  // Set default verbosity if applicable.
3449  const Teuchos::EVerbosityLevel vl =
3450  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3451 
3452  if (vl == VERB_NONE) {
3453  return; // print nothing
3454  }
3455 
3456  // describe() always starts with a tab before it prints anything.
3457  Teuchos::OSTab tab0 (out);
3458 
3459  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
3460  Teuchos::OSTab tab1 (out);
3461 
3462  out << "Template parameters:" << endl;
3463  {
3464  Teuchos::OSTab tab2 (out);
3465  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
3466  << "LO: " << TypeNameTraits<LO>::name () << endl
3467  << "GO: " << TypeNameTraits<GO>::name () << endl
3468  << "Node: " << TypeNameTraits<Node>::name () << endl;
3469  }
3470  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
3471  << "Global dimensions: ["
3472  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3473  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
3474 
3475  const LO blockSize = getBlockSize ();
3476  out << "Block size: " << blockSize << endl;
3477 
3478  // constituent objects
3479  if (vl >= VERB_MEDIUM) {
3480  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3481  const int myRank = comm.getRank ();
3482  if (myRank == 0) {
3483  out << "Row Map:" << endl;
3484  }
3485  getRowMap()->describe (out, vl);
3486 
3487  if (! getColMap ().is_null ()) {
3488  if (getColMap() == getRowMap()) {
3489  if (myRank == 0) {
3490  out << "Column Map: same as row Map" << endl;
3491  }
3492  }
3493  else {
3494  if (myRank == 0) {
3495  out << "Column Map:" << endl;
3496  }
3497  getColMap ()->describe (out, vl);
3498  }
3499  }
3500  if (! getDomainMap ().is_null ()) {
3501  if (getDomainMap () == getRowMap ()) {
3502  if (myRank == 0) {
3503  out << "Domain Map: same as row Map" << endl;
3504  }
3505  }
3506  else if (getDomainMap () == getColMap ()) {
3507  if (myRank == 0) {
3508  out << "Domain Map: same as column Map" << endl;
3509  }
3510  }
3511  else {
3512  if (myRank == 0) {
3513  out << "Domain Map:" << endl;
3514  }
3515  getDomainMap ()->describe (out, vl);
3516  }
3517  }
3518  if (! getRangeMap ().is_null ()) {
3519  if (getRangeMap () == getDomainMap ()) {
3520  if (myRank == 0) {
3521  out << "Range Map: same as domain Map" << endl;
3522  }
3523  }
3524  else if (getRangeMap () == getRowMap ()) {
3525  if (myRank == 0) {
3526  out << "Range Map: same as row Map" << endl;
3527  }
3528  }
3529  else {
3530  if (myRank == 0) {
3531  out << "Range Map: " << endl;
3532  }
3533  getRangeMap ()->describe (out, vl);
3534  }
3535  }
3536  }
3537 
3538  if (vl >= VERB_EXTREME) {
3539  // FIXME (mfh 26 May 2016) It's not nice for this method to sync
3540  // to host, since it's supposed to be const. However, that's
3541  // the easiest and least memory-intensive way to implement this
3542  // method.
3543  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
3544  const_cast<this_type*> (this)->sync_host ();
3545 
3546 #ifdef HAVE_TPETRA_DEBUG
3547  TEUCHOS_TEST_FOR_EXCEPTION
3548  (this->need_sync_host (), std::logic_error,
3549  prefix << "Right after sync to host, the matrix claims that it needs "
3550  "sync to host. Please report this bug to the Tpetra developers.");
3551 #endif // HAVE_TPETRA_DEBUG
3552 
3553  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3554  const int myRank = comm.getRank ();
3555  const int numProcs = comm.getSize ();
3556 
3557  // Print the calling process' data to the given output stream.
3558  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
3559  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3560  FancyOStream& os = *osPtr;
3561  os << "Process " << myRank << ":" << endl;
3562  Teuchos::OSTab tab2 (os);
3563 
3564  const map_type& meshRowMap = * (graph_.getRowMap ());
3565  const map_type& meshColMap = * (graph_.getColMap ());
3566  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
3567  meshLclRow <= meshRowMap.getMaxLocalIndex ();
3568  ++meshLclRow) {
3569  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
3570  os << "Row " << meshGblRow << ": {";
3571 
3572  const LO* lclColInds = NULL;
3573  Scalar* vals = NULL;
3574  LO numInds = 0;
3575  this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
3576 
3577  for (LO k = 0; k < numInds; ++k) {
3578  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
3579 
3580  os << "Col " << gblCol << ": [";
3581  for (LO i = 0; i < blockSize; ++i) {
3582  for (LO j = 0; j < blockSize; ++j) {
3583  os << vals[blockSize*blockSize*k + i*blockSize + j];
3584  if (j + 1 < blockSize) {
3585  os << ", ";
3586  }
3587  }
3588  if (i + 1 < blockSize) {
3589  os << "; ";
3590  }
3591  }
3592  os << "]";
3593  if (k + 1 < numInds) {
3594  os << ", ";
3595  }
3596  }
3597  os << "}" << endl;
3598  }
3599 
3600  // Print data on Process 0. This will automatically respect the
3601  // current indentation level.
3602  if (myRank == 0) {
3603  out << lclOutStrPtr->str ();
3604  lclOutStrPtr = Teuchos::null; // clear it to save space
3605  }
3606 
3607  const int sizeTag = 1337;
3608  const int dataTag = 1338;
3609 
3610  ArrayRCP<char> recvDataBuf; // only used on Process 0
3611 
3612  // Send string sizes and data from each process in turn to
3613  // Process 0, and print on that process.
3614  for (int p = 1; p < numProcs; ++p) {
3615  if (myRank == 0) {
3616  // Receive the incoming string's length.
3617  ArrayRCP<size_t> recvSize (1);
3618  recvSize[0] = 0;
3619  RCP<CommRequest<int> > recvSizeReq =
3620  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3621  wait<int> (comm, outArg (recvSizeReq));
3622  const size_t numCharsToRecv = recvSize[0];
3623 
3624  // Allocate space for the string to receive. Reuse receive
3625  // buffer space if possible. We can do this because in the
3626  // current implementation, we only have one receive in
3627  // flight at a time. Leave space for the '\0' at the end,
3628  // in case the sender doesn't send it.
3629  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3630  recvDataBuf.resize (numCharsToRecv + 1);
3631  }
3632  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3633  // Post the receive of the actual string data.
3634  RCP<CommRequest<int> > recvDataReq =
3635  ireceive<int, char> (recvData, p, dataTag, comm);
3636  wait<int> (comm, outArg (recvDataReq));
3637 
3638  // Print the received data. This will respect the current
3639  // indentation level. Make sure that the string is
3640  // null-terminated.
3641  recvDataBuf[numCharsToRecv] = '\0';
3642  out << recvDataBuf.getRawPtr ();
3643  }
3644  else if (myRank == p) { // if I am not Process 0, and my rank is p
3645  // This deep-copies the string at most twice, depending on
3646  // whether std::string reference counts internally (it
3647  // generally does, so this won't deep-copy at all).
3648  const std::string stringToSend = lclOutStrPtr->str ();
3649  lclOutStrPtr = Teuchos::null; // clear original to save space
3650 
3651  // Send the string's length to Process 0.
3652  const size_t numCharsToSend = stringToSend.size ();
3653  ArrayRCP<size_t> sendSize (1);
3654  sendSize[0] = numCharsToSend;
3655  RCP<CommRequest<int> > sendSizeReq =
3656  isend<int, size_t> (sendSize, 0, sizeTag, comm);
3657  wait<int> (comm, outArg (sendSizeReq));
3658 
3659  // Send the actual string to Process 0. We know that the
3660  // string has length > 0, so it's save to take the address
3661  // of the first entry. Make a nonowning ArrayRCP to hold
3662  // the string. Process 0 will add a null termination
3663  // character at the end of the string, after it receives the
3664  // message.
3665  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
3666  RCP<CommRequest<int> > sendDataReq =
3667  isend<int, char> (sendData, 0, dataTag, comm);
3668  wait<int> (comm, outArg (sendDataReq));
3669  }
3670  } // for each process rank p other than 0
3671  } // extreme verbosity level (print the whole matrix)
3672  }
3673 
3674  template<class Scalar, class LO, class GO, class Node>
3675  Teuchos::RCP<const Teuchos::Comm<int> >
3677  getComm() const
3678  {
3679  return graph_.getComm();
3680  }
3681 
3682 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
3683  template<class Scalar, class LO, class GO, class Node>
3684  TPETRA_DEPRECATED
3685  Teuchos::RCP<Node>
3687  getNode() const
3688  {
3689  return Teuchos::null;
3690 
3691  }
3692 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3693 
3694  template<class Scalar, class LO, class GO, class Node>
3698  {
3699  return graph_.getGlobalNumCols();
3700  }
3701 
3702  template<class Scalar, class LO, class GO, class Node>
3703  size_t
3706  {
3707  return graph_.getNodeNumCols();
3708  }
3709 
3710  template<class Scalar, class LO, class GO, class Node>
3711  GO
3714  {
3715  return graph_.getIndexBase();
3716  }
3717 
3718  template<class Scalar, class LO, class GO, class Node>
3722  {
3723  return graph_.getGlobalNumEntries();
3724  }
3725 
3726  template<class Scalar, class LO, class GO, class Node>
3727  size_t
3730  {
3731  return graph_.getNodeNumEntries();
3732  }
3733 
3734  template<class Scalar, class LO, class GO, class Node>
3735  size_t
3737  getNumEntriesInGlobalRow (GO globalRow) const
3738  {
3739  return graph_.getNumEntriesInGlobalRow(globalRow);
3740  }
3741 
3742 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
3743  template<class Scalar, class LO, class GO, class Node>
3744  global_size_t TPETRA_DEPRECATED
3746  getGlobalNumDiags () const
3747  {
3748  using HDM = Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3749  return dynamic_cast<const HDM&> (this->graph_).getGlobalNumDiagsImpl ();
3750  }
3751 
3752  template<class Scalar, class LO, class GO, class Node>
3753  size_t TPETRA_DEPRECATED
3754  BlockCrsMatrix<Scalar, LO, GO, Node>::
3755  getNodeNumDiags () const
3756  {
3757  using HDM = Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3758  return dynamic_cast<const HDM&> (this->graph_).getNodeNumDiagsImpl ();
3759  }
3760 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3761 
3762  template<class Scalar, class LO, class GO, class Node>
3763  size_t
3766  {
3767  return graph_.getGlobalMaxNumRowEntries();
3768  }
3769 
3770  template<class Scalar, class LO, class GO, class Node>
3771  bool
3773  hasColMap() const
3774  {
3775  return graph_.hasColMap();
3776  }
3777 
3778 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
3779  template<class Scalar, class LO, class GO, class Node>
3780  bool TPETRA_DEPRECATED
3782  isLowerTriangular () const
3783  {
3784  using HDM = ::Tpetra::Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3785  return dynamic_cast<const HDM&> (this->graph_).isLowerTriangularImpl ();
3786  }
3787 
3788  template<class Scalar, class LO, class GO, class Node>
3789  bool TPETRA_DEPRECATED
3790  BlockCrsMatrix<Scalar, LO, GO, Node>::
3791  isUpperTriangular () const
3792  {
3793  using HDM = ::Tpetra::Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3794  return dynamic_cast<const HDM&> (this->graph_).isUpperTriangularImpl ();
3795  }
3796 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3797 
3798  template<class Scalar, class LO, class GO, class Node>
3799  bool
3802  {
3803  return graph_.isLocallyIndexed();
3804  }
3805 
3806  template<class Scalar, class LO, class GO, class Node>
3807  bool
3810  {
3811  return graph_.isGloballyIndexed();
3812  }
3813 
3814  template<class Scalar, class LO, class GO, class Node>
3815  bool
3818  {
3819  return graph_.isFillComplete ();
3820  }
3821 
3822  template<class Scalar, class LO, class GO, class Node>
3823  bool
3826  {
3827  return false;
3828  }
3829 
3830 
3831  template<class Scalar, class LO, class GO, class Node>
3832  void
3834  getGlobalRowCopy (GO /* GlobalRow */,
3835  const Teuchos::ArrayView<GO> &/* Indices */,
3836  const Teuchos::ArrayView<Scalar> &/* Values */,
3837  size_t &/* NumEntries */) const
3838  {
3839  TEUCHOS_TEST_FOR_EXCEPTION(
3840  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
3841  "This class doesn't support global matrix indexing.");
3842 
3843  }
3844 
3845  template<class Scalar, class LO, class GO, class Node>
3846  void
3848  getGlobalRowView (GO /* GlobalRow */,
3849  Teuchos::ArrayView<const GO> &/* indices */,
3850  Teuchos::ArrayView<const Scalar> &/* values */) const
3851  {
3852  TEUCHOS_TEST_FOR_EXCEPTION(
3853  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
3854  "This class doesn't support global matrix indexing.");
3855 
3856  }
3857 
3858  template<class Scalar, class LO, class GO, class Node>
3859  void
3861  getLocalRowView (LO /* LocalRow */,
3862  Teuchos::ArrayView<const LO>& /* indices */,
3863  Teuchos::ArrayView<const Scalar>& /* values */) const
3864  {
3865  TEUCHOS_TEST_FOR_EXCEPTION(
3866  true, std::logic_error, "Tpetra::BlockCrsMatrix::getLocalRowView: "
3867  "This class doesn't support local matrix indexing.");
3868  }
3869 
3870  template<class Scalar, class LO, class GO, class Node>
3871  void
3874  {
3875 #ifdef HAVE_TPETRA_DEBUG
3876  const char prefix[] =
3877  "Tpetra::BlockCrsMatrix::getLocalDiagCopy: ";
3878 #endif // HAVE_TPETRA_DEBUG
3879 
3880  const size_t lclNumMeshRows = graph_.getNodeNumRows ();
3881 
3882  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3883  graph_.getLocalDiagOffsets (diagOffsets);
3884 
3885  // The code below works on host, so use a host View.
3886  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3887  Kokkos::deep_copy (diagOffsetsHost, diagOffsets);
3888  // We're filling diag on host for now.
3889  diag.template modify<typename decltype (diagOffsetsHost)::memory_space> ();
3890 
3891 #ifdef HAVE_TPETRA_DEBUG
3892  TEUCHOS_TEST_FOR_EXCEPTION
3893  (this->need_sync_host (), std::runtime_error,
3894  prefix << "The matrix's data were last modified on device, but have "
3895  "not been sync'd to host. Please sync to host (by calling "
3896  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
3897  "method.");
3898 #endif // HAVE_TPETRA_DEBUG
3899 
3900  auto vals_host_out = getValuesHost ();
3901  Scalar* vals_host_out_raw =
3902  reinterpret_cast<Scalar*> (vals_host_out.data ());
3903 
3904  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3905  size_t rowOffset = 0;
3906  size_t offset = 0;
3907  LO bs = getBlockSize();
3908  for(size_t r=0; r<getNodeNumRows(); r++)
3909  {
3910  // move pointer to start of diagonal block
3911  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3912  for(int b=0; b<bs; b++)
3913  {
3914  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3915  }
3916  // move pointer to start of next block row
3917  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3918  }
3919 
3920  diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3921  }
3922 
3923  template<class Scalar, class LO, class GO, class Node>
3924  void
3926  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3927  {
3928  TEUCHOS_TEST_FOR_EXCEPTION(
3929  true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3930  "not implemented.");
3931 
3932  }
3933 
3934  template<class Scalar, class LO, class GO, class Node>
3935  void
3937  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3938  {
3939  TEUCHOS_TEST_FOR_EXCEPTION(
3940  true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3941  "not implemented.");
3942 
3943  }
3944 
3945  template<class Scalar, class LO, class GO, class Node>
3946  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3948  getGraph() const
3949  {
3950  return graphRCP_;
3951  }
3952 
3953  template<class Scalar, class LO, class GO, class Node>
3954  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3957  {
3958  TEUCHOS_TEST_FOR_EXCEPTION(
3959  true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3960  "not implemented.");
3961  }
3962 
3963 } // namespace Tpetra
3964 
3965 //
3966 // Explicit instantiation macro
3967 //
3968 // Must be expanded from within the Tpetra namespace!
3969 //
3970 #define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3971  template class BlockCrsMatrix< S, LO, GO, NODE >;
3972 
3973 #endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const
The current number of entries on the calling process in the specified global row. ...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
little_vec_type::HostMirror getLocalBlock(const LO localRowIndex, const LO colIndex) const
Get a host view of the degrees of freedom at the given mesh point.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
void gaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of gaussSeidel(), with fewer requirements on X.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
size_t getNodeNumRows() const
get the local number of block rows
virtual bool isGloballyIndexed() const
Whether matrix indices are globally indexed.
size_t getNumEntriesInLocalRow(const LO localRowInd) const
Return the number of entries in the given row on the calling process.
virtual void getGlobalRowView(GO GlobalRow, Teuchos::ArrayView< const GO > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const
The Frobenius norm of the matrix.
virtual global_size_t getGlobalNumCols() const
The global number of columns of this matrix.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
One or more distributed dense vectors.
virtual bool isLocallyIndexed() const
Whether matrix indices are locally indexed.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Linear algebra kernels for small dense matrices and vectors.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the left with the given Vector x.
virtual bool hasColMap() const
Whether this matrix has a well-defined column Map.
LocalOrdinal getMinLocalIndex() const
The minimum local index.
Kokkos::View< impl_scalar_type *, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_vec_type
The type used to access nonconst vector blocks.
virtual size_t getNodeNumEntries() const
The local number of stored (structurally nonzero) entries.
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
virtual size_t getNodeNumCols() const
The number of columns needed to apply the forward operator on this node.
int local_ordinal_type
Default value of Scalar template parameter.
MultiVector for multiple degrees of freedom per mesh point.
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, and the small dense vectors in BlockMultiVector and BlockVector.
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
size_t global_size_t
Global size_t object.
size_t getNodeNumEntries() const override
The local number of entries in the graph.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
virtual bool isFillComplete() const
Whether fillComplete() has been called.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Like sumIntoLocalValues, but for the ABSMAX combine mode.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
virtual size_t getGlobalMaxNumRowEntries() const
The maximum number of entries in any row over all processes in the matrix&#39;s communicator.
Insert new values that don&#39;t currently exist.
size_t getNodeMaxNumRowEntries() const
Maximum number of entries in any row of the matrix, on this process.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the right with the given Vector x.
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
Teuchos::RCP< const map_type > getRangeMap() const
Get the (point) range Map of this matrix.
Teuchos::RCP< const map_type > getRowMap() const
get the (mesh) map for the rows of this block matrix.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
virtual global_size_t getGlobalNumEntries() const
The global number of stored (structurally nonzero) entries.
Sets up and executes a communication plan for a Tpetra DistObject.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
GlobalOrdinal getGlobalElement(LocalOrdinal localIndex) const
The global index corresponding to the given local index.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
global_size_t getGlobalNumRows() const
get the global number of block rows
CrsGraphType::global_ordinal_type getGlobalNumDiags(const CrsGraphType &G)
Number of populated diagonal entries in the given sparse graph, over all processes in the graph&#39;s (MP...
virtual void getGlobalRowCopy(GO GlobalRow, const Teuchos::ArrayView< GO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Get a copy of the given global row&#39;s entries.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
Replace old value with maximum of magnitudes of old and new values.
Replace existing values with new values.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
Replace old values with zero.
virtual bool supportsRowViews() const
Whether this object implements getLocalRowView() and getGlobalRowView().
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
LO absMaxLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValuesByOffsets, but for the ABSMAX combine mode.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which this matrix is distributed.
LO getLocalRowView(const LO localRowInd, const LO *&colInds, Scalar *&vals, LO &numInds) const
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
LO getNumVectors() const
Get the number of columns (vectors) in the BlockMultiVector.
void localGaussSeidel(const BlockMultiVector< Scalar, LO, GO, Node > &Residual, BlockMultiVector< Scalar, LO, GO, Node > &Solution, const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &D_inv, const Scalar &omega, const ESweepDirection direction) const
Local Gauss-Seidel solve, given a factorized diagonal.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const
Print a description of this object to the given output stream.
A distributed dense vector.
Teuchos::RCP< const map_type > getDomainMap() const
Get the (point) domain Map of this matrix.
void getLocalRowCopy(LO LocalRow, const Teuchos::ArrayView< LO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Not implemented.
local_graph_type getLocalGraph() const
Get the local graph.
Teuchos::RCP< const map_type > getColMap() const
get the (mesh) map for the columns of this block matrix.
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const
Get the (mesh) graph.
void reorderedGaussSeidelCopy(::Tpetra::MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Teuchos::ArrayView< LO > &rowIndices, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of reorderedGaussSeidel(), with fewer requirements on X.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
LocalOrdinal getMaxLocalIndex() const
The maximum local index on the calling process.
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
std::string description() const
One-line description of this object.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value) const
Replace current value at the specified location with specified values.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
virtual GO getIndexBase() const
The index base for global indices in this matrix.