Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 
40 #ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
41 #define TPETRA_BLOCKCRSMATRIX_DEF_HPP
42 
45 
49 #include "Tpetra_BlockMultiVector.hpp"
50 #include "Tpetra_BlockView.hpp"
51 
52 #include "Teuchos_TimeMonitor.hpp"
53 #ifdef HAVE_TPETRA_DEBUG
54 # include <set>
55 #endif // HAVE_TPETRA_DEBUG
56 
57 //
58 // mfh 25 May 2016: Temporary fix for #393.
59 //
60 // Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
61 // 4.7.2 compiler bug ("internal compiler error") when compiling them.
62 // Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
63 // use them in that case, either.
64 //
65 // mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
66 // compiler error") too. Ditto for GCC 5.1. I'll just disable the
67 // thing for any GCC version.
68 //
69 #if defined(__CUDACC__)
70  // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
71 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
72 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
73 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
74 
75 #elif defined(__GNUC__)
76 
77 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
78 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
79 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
80 
81 #else // some other compiler
82 
83  // Optimistically assume that other compilers aren't broken.
84 # if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
85 # define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
86 # endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
87 #endif // defined(__CUDACC__), defined(__GNUC__)
88 
89 
90 namespace Tpetra {
91 
92 namespace Impl {
93 
94  template<typename T>
95  struct BlockCrsRowStruct {
96  T totalNumEntries, totalNumBytes, maxRowLength;
97  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
98  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
99  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
100  : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
101  };
102 
103  template<typename T>
104  static
105  KOKKOS_INLINE_FUNCTION
106  void operator+=(volatile BlockCrsRowStruct<T> &a,
107  volatile const BlockCrsRowStruct<T> &b) {
108  a.totalNumEntries += b.totalNumEntries;
109  a.totalNumBytes += b.totalNumBytes;
110  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
111  }
112 
113  template<typename T>
114  static
115  KOKKOS_INLINE_FUNCTION
116  void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
117  a.totalNumEntries += b.totalNumEntries;
118  a.totalNumBytes += b.totalNumBytes;
119  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
120  }
121 
122  template<typename T, typename ExecSpace>
123  struct BlockCrsReducer {
124  typedef BlockCrsReducer reducer;
125  typedef T value_type;
126  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
127  value_type *value;
128 
129  KOKKOS_INLINE_FUNCTION
130  BlockCrsReducer(value_type &val) : value(&val) {}
131 
132  KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
133  KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const { dst += src; }
134  KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
135  KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
136  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
137  };
138 
139  template<class AlphaCoeffType,
140  class GraphType,
141  class MatrixValuesType,
142  class InVecType,
143  class BetaCoeffType,
144  class OutVecType,
145  bool IsBuiltInType>
146  class BcrsApplyNoTransFunctor {
147  private:
148  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
149  "MatrixValuesType must be a Kokkos::View.");
150  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
151  "OutVecType must be a Kokkos::View.");
152  static_assert (Kokkos::Impl::is_view<InVecType>::value,
153  "InVecType must be a Kokkos::View.");
154  static_assert (std::is_same<MatrixValuesType,
155  typename MatrixValuesType::const_type>::value,
156  "MatrixValuesType must be a const Kokkos::View.");
157  static_assert (std::is_same<OutVecType,
158  typename OutVecType::non_const_type>::value,
159  "OutVecType must be a nonconst Kokkos::View.");
160  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
161  "InVecType must be a const Kokkos::View.");
162  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
163  "MatrixValuesType must be a rank-1 Kokkos::View.");
164  static_assert (static_cast<int> (InVecType::rank) == 1,
165  "InVecType must be a rank-1 Kokkos::View.");
166  static_assert (static_cast<int> (OutVecType::rank) == 1,
167  "OutVecType must be a rank-1 Kokkos::View.");
168  typedef typename MatrixValuesType::non_const_value_type scalar_type;
169 
170  public:
171  typedef typename GraphType::device_type device_type;
172 
174  typedef typename std::remove_const<typename GraphType::data_type>::type
176 
183  void setX (const InVecType& X) { X_ = X; }
184 
191  void setY (const OutVecType& Y) { Y_ = Y; }
192 
193  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
194  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
195 
197  BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
198  const GraphType& graph,
199  const MatrixValuesType& val,
200  const local_ordinal_type blockSize,
201  const InVecType& X,
202  const beta_coeff_type& beta,
203  const OutVecType& Y) :
204  alpha_ (alpha),
205  ptr_ (graph.row_map),
206  ind_ (graph.entries),
207  val_ (val),
208  blockSize_ (blockSize),
209  X_ (X),
210  beta_ (beta),
211  Y_ (Y)
212  {}
213 
214  // Dummy team version
215  KOKKOS_INLINE_FUNCTION void
216  operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
217  {
218  Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
219  }
220 
221  // Range Policy for non built-in types
222  KOKKOS_INLINE_FUNCTION void
223  operator () (const local_ordinal_type& lclRow) const
224  {
229  using Kokkos::Details::ArithTraits;
230  // I'm not writing 'using Kokkos::make_pair;' here, because that
231  // may break builds for users who make the mistake of putting
232  // 'using namespace std;' in the global namespace. Please don't
233  // ever do that! But just in case you do, I'll take this
234  // precaution.
235  using Kokkos::parallel_for;
236  using Kokkos::subview;
237  typedef typename decltype (ptr_)::non_const_value_type offset_type;
238  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
239  Kokkos::LayoutRight,
240  device_type,
241  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
242  little_block_type;
243 
244  const offset_type Y_ptBeg = lclRow * blockSize_;
245  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
246  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
247 
248  // This version of the code does not use temporary storage.
249  // Each thread writes to its own block of the target vector.
250  if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
251  FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
252  }
253  else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
254  SCAL (beta_, Y_cur);
255  }
256 
257  if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
258  const offset_type blkBeg = ptr_[lclRow];
259  const offset_type blkEnd = ptr_[lclRow+1];
260  // Precompute to save integer math in the inner loop.
261  const offset_type bs2 = blockSize_ * blockSize_;
262  for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
263  ++absBlkOff) {
264  little_block_type A_cur (val_.data () + absBlkOff * bs2,
265  blockSize_, blockSize_);
266  const offset_type X_blkCol = ind_[absBlkOff];
267  const offset_type X_ptBeg = X_blkCol * blockSize_;
268  const offset_type X_ptEnd = X_ptBeg + blockSize_;
269  auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
270 
271  GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
272  } // for each entry in current local block row of matrix
273  }
274  }
275 
276  private:
277  alpha_coeff_type alpha_;
278  typename GraphType::row_map_type::const_type ptr_;
279  typename GraphType::entries_type::const_type ind_;
280  MatrixValuesType val_;
281  local_ordinal_type blockSize_;
282  InVecType X_;
283  beta_coeff_type beta_;
284  OutVecType Y_;
285  };
286 
287  template<class AlphaCoeffType,
288  class GraphType,
289  class MatrixValuesType,
290  class InVecType,
291  class BetaCoeffType,
292  class OutVecType>
293  class BcrsApplyNoTransFunctor<AlphaCoeffType,
294  GraphType,
295  MatrixValuesType,
296  InVecType,
297  BetaCoeffType,
298  OutVecType,
299  true> {
300  private:
301  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
302  "MatrixValuesType must be a Kokkos::View.");
303  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
304  "OutVecType must be a Kokkos::View.");
305  static_assert (Kokkos::Impl::is_view<InVecType>::value,
306  "InVecType must be a Kokkos::View.");
307  static_assert (std::is_same<MatrixValuesType,
308  typename MatrixValuesType::const_type>::value,
309  "MatrixValuesType must be a const Kokkos::View.");
310  static_assert (std::is_same<OutVecType,
311  typename OutVecType::non_const_type>::value,
312  "OutVecType must be a nonconst Kokkos::View.");
313  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
314  "InVecType must be a const Kokkos::View.");
315  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
316  "MatrixValuesType must be a rank-1 Kokkos::View.");
317  static_assert (static_cast<int> (InVecType::rank) == 1,
318  "InVecType must be a rank-1 Kokkos::View.");
319  static_assert (static_cast<int> (OutVecType::rank) == 1,
320  "OutVecType must be a rank-1 Kokkos::View.");
321  typedef typename MatrixValuesType::non_const_value_type scalar_type;
322 
323  public:
324  typedef typename GraphType::device_type device_type;
325 
327  typedef typename std::remove_const<typename GraphType::data_type>::type
329 
336  void setX (const InVecType& X) { X_ = X; }
337 
344  void setY (const OutVecType& Y) { Y_ = Y; }
345 
346  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
347  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;
348 
350  BcrsApplyNoTransFunctor (const alpha_coeff_type& alpha,
351  const GraphType& graph,
352  const MatrixValuesType& val,
353  const local_ordinal_type blockSize,
354  const InVecType& X,
355  const beta_coeff_type& beta,
356  const OutVecType& Y) :
357  alpha_ (alpha),
358  ptr_ (graph.row_map),
359  ind_ (graph.entries),
360  val_ (val),
361  blockSize_ (blockSize),
362  X_ (X),
363  beta_ (beta),
364  Y_ (Y)
365  {}
366 
367  // dummy Range version
368  KOKKOS_INLINE_FUNCTION void
369  operator () (const local_ordinal_type& lclRow) const
370  {
371  Kokkos::abort("Tpetra::BcrsApplyNoTransFunctor:: this should not be called");
372  }
373 
374  // Team policy for built-in types
375  KOKKOS_INLINE_FUNCTION void
376  operator () (const typename Kokkos::TeamPolicy<typename device_type::execution_space>::member_type & member) const
377  {
378  const local_ordinal_type lclRow = member.league_rank();
379 
380  using Kokkos::Details::ArithTraits;
381  // I'm not writing 'using Kokkos::make_pair;' here, because that
382  // may break builds for users who make the mistake of putting
383  // 'using namespace std;' in the global namespace. Please don't
384  // ever do that! But just in case you do, I'll take this
385  // precaution.
386  using Kokkos::parallel_for;
387  using Kokkos::subview;
388  typedef typename decltype (ptr_)::non_const_value_type offset_type;
389  typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
390  Kokkos::LayoutRight,
391  device_type,
392  Kokkos::MemoryTraits<Kokkos::Unmanaged> >
393  little_block_type;
394 
395  const offset_type Y_ptBeg = lclRow * blockSize_;
396  const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
397  auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
398 
399  // This version of the code does not use temporary storage.
400  // Each thread writes to its own block of the target vector.
401  if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
402  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
403  [&](const local_ordinal_type &i) {
404  Y_cur(i) = ArithTraits<beta_coeff_type>::zero ();
405  });
406  }
407  else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
408  Kokkos::parallel_for(Kokkos::TeamVectorRange(member, blockSize_),
409  [&](const local_ordinal_type &i) {
410  Y_cur(i) *= beta_;
411  });
412  }
413  member.team_barrier();
414 
415  if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
416  const offset_type blkBeg = ptr_[lclRow];
417  const offset_type blkEnd = ptr_[lclRow+1];
418  // Precompute to save integer math in the inner loop.
419  const offset_type bs2 = blockSize_ * blockSize_;
420  little_block_type A_cur (val_.data (), blockSize_, blockSize_);
421  auto X_cur = subview (X_, ::Kokkos::make_pair (0, blockSize_));
422  Kokkos::parallel_for
423  (Kokkos::TeamThreadRange(member, blkBeg, blkEnd),
424  [&](const local_ordinal_type &absBlkOff) {
425  A_cur.assign_data(val_.data () + absBlkOff * bs2);
426  const offset_type X_blkCol = ind_[absBlkOff];
427  const offset_type X_ptBeg = X_blkCol * blockSize_;
428  X_cur.assign_data(&X_(X_ptBeg));
429 
430  Kokkos::parallel_for
431  (Kokkos::ThreadVectorRange(member, blockSize_),
432  [&](const local_ordinal_type &k0) {
433  scalar_type val(0);
434  for (local_ordinal_type k1=0;k1<blockSize_;++k1)
435  val += A_cur(k0,k1)*X_cur(k1);
436 #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
437  // host space team size is always 1
438  Y_cur(k0) += alpha_*val;
439 #else
440  // cuda space team size can be larger than 1
441  // atomic is not allowed for sacado type;
442  // thus this needs to be specialized or
443  // sacado atomic should be supported.
444  Kokkos::atomic_add(&Y_cur(k0), alpha_*val);
445 #endif
446  });
447  }); // for each entry in current local block row of matrix
448  }
449  }
450 
451  private:
452  alpha_coeff_type alpha_;
453  typename GraphType::row_map_type::const_type ptr_;
454  typename GraphType::entries_type::const_type ind_;
455  MatrixValuesType val_;
456  local_ordinal_type blockSize_;
457  InVecType X_;
458  beta_coeff_type beta_;
459  OutVecType Y_;
460  };
461 
462 
463  template<class AlphaCoeffType,
464  class GraphType,
465  class MatrixValuesType,
466  class InMultiVecType,
467  class BetaCoeffType,
468  class OutMultiVecType>
469  void
470  bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
471  const GraphType& graph,
472  const MatrixValuesType& val,
473  const typename std::remove_const<typename GraphType::data_type>::type blockSize,
474  const InMultiVecType& X,
475  const BetaCoeffType& beta,
476  const OutMultiVecType& Y
477  )
478  {
479  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
480  "MatrixValuesType must be a Kokkos::View.");
481  static_assert (Kokkos::Impl::is_view<OutMultiVecType>::value,
482  "OutMultiVecType must be a Kokkos::View.");
483  static_assert (Kokkos::Impl::is_view<InMultiVecType>::value,
484  "InMultiVecType must be a Kokkos::View.");
485  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
486  "MatrixValuesType must be a rank-1 Kokkos::View.");
487  static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
488  "OutMultiVecType must be a rank-2 Kokkos::View.");
489  static_assert (static_cast<int> (InMultiVecType::rank) == 2,
490  "InMultiVecType must be a rank-2 Kokkos::View.");
491 
492  typedef typename GraphType::device_type::execution_space execution_space;
493  typedef typename MatrixValuesType::const_type matrix_values_type;
494  typedef typename OutMultiVecType::non_const_type out_multivec_type;
495  typedef typename InMultiVecType::const_type in_multivec_type;
496  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
497  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
498  typedef typename std::remove_const<typename GraphType::data_type>::type LO;
499 
500  constexpr bool is_builtin_type_enabled = std::is_arithmetic<typename InMultiVecType::non_const_value_type>::value;
501 
502  const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
503  static_cast<LO> (0) :
504  static_cast<LO> (graph.row_map.extent (0) - 1);
505  const LO numVecs = Y.extent (1);
506  if (numLocalMeshRows == 0 || numVecs == 0) {
507  return; // code below doesn't handle numVecs==0 correctly
508  }
509 
510  // These assignments avoid instantiating the functor extra times
511  // unnecessarily, e.g., for X const vs. nonconst. We only need the
512  // X const case, so only instantiate for that case.
513  in_multivec_type X_in = X;
514  out_multivec_type Y_out = Y;
515 
516  // The functor only knows how to handle one vector at a time, and it
517  // expects 1-D Views. Thus, we need to know the type of each column
518  // of X and Y.
519  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
520  typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
521  typedef BcrsApplyNoTransFunctor<alpha_type, GraphType,
522  matrix_values_type, in_vec_type, beta_type, out_vec_type,
523  is_builtin_type_enabled> functor_type;
524 
525  auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
526  auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
527 
528  // Compute the first column of Y.
529  if (is_builtin_type_enabled) {
530  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
531  // Built-in version uses atomic add which might not be supported from sacado or any user-defined types.
532  typedef Kokkos::TeamPolicy<execution_space> policy_type;
533  policy_type policy(1,1);
534 #if defined(KOKKOS_ENABLE_CUDA)
535  constexpr bool is_cuda = std::is_same<execution_space, Kokkos::Cuda>::value;
536 #else
537  constexpr bool is_cuda = false;
538 #endif // defined(KOKKOS_ENABLE_CUDA)
539  if (is_cuda) {
540  LO team_size = 8, vector_size = 1;
541  if (blockSize <= 5) vector_size = 4;
542  else if (blockSize <= 9) vector_size = 8;
543  else if (blockSize <= 12) vector_size = 12;
544  else if (blockSize <= 20) vector_size = 20;
545  else vector_size = 20;
546  policy = policy_type(numLocalMeshRows, team_size, vector_size);
547  } else {
548  policy = policy_type(numLocalMeshRows, 1, 1);
549  }
550  Kokkos::parallel_for (policy, functor);
551 
552  // Compute the remaining columns of Y.
553  for (LO j = 1; j < numVecs; ++j) {
554  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
555  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
556  functor.setX (X_j);
557  functor.setY (Y_j);
558  Kokkos::parallel_for (policy, functor);
559  }
560  } else {
561  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
562  Kokkos::RangePolicy<execution_space,LO> policy(0, numLocalMeshRows);
563  Kokkos::parallel_for (policy, functor);
564  for (LO j = 1; j < numVecs; ++j) {
565  auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
566  auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
567  functor.setX (X_j);
568  functor.setY (Y_j);
569  Kokkos::parallel_for (policy, functor);
570  }
571  }
572  }
573 } // namespace Impl
574 
575 namespace { // (anonymous)
576 
577 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
578 // version that takes two Kokkos::View arguments).
579 template<class Scalar, class LO, class GO, class Node>
580 class GetLocalDiagCopy {
581 public:
582  typedef typename Node::device_type device_type;
583  typedef size_t diag_offset_type;
584  typedef Kokkos::View<const size_t*, device_type,
585  Kokkos::MemoryUnmanaged> diag_offsets_type;
586  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
587  typedef typename global_graph_type::local_graph_type local_graph_type;
588  typedef typename local_graph_type::row_map_type row_offsets_type;
589  typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
590  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
591  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
592 
593  // Constructor
594  GetLocalDiagCopy (const diag_type& diag,
595  const values_type& val,
596  const diag_offsets_type& diagOffsets,
597  const row_offsets_type& ptr,
598  const LO blockSize) :
599  diag_ (diag),
600  diagOffsets_ (diagOffsets),
601  ptr_ (ptr),
602  blockSize_ (blockSize),
603  offsetPerBlock_ (blockSize_*blockSize_),
604  val_(val)
605  {}
606 
607  KOKKOS_INLINE_FUNCTION void
608  operator() (const LO& lclRowInd) const
609  {
610  using Kokkos::ALL;
611 
612  // Get row offset
613  const size_t absOffset = ptr_[lclRowInd];
614 
615  // Get offset relative to start of row
616  const size_t relOffset = diagOffsets_[lclRowInd];
617 
618  // Get the total offset
619  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
620 
621  // Get a view of the block. BCRS currently uses LayoutRight
622  // regardless of the device.
623  typedef Kokkos::View<const IST**, Kokkos::LayoutRight,
624  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
625  const_little_block_type;
626  const_little_block_type D_in (val_.data () + pointOffset,
627  blockSize_, blockSize_);
628  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
629  COPY (D_in, D_out);
630  }
631 
632  private:
633  diag_type diag_;
634  diag_offsets_type diagOffsets_;
635  row_offsets_type ptr_;
636  LO blockSize_;
637  LO offsetPerBlock_;
638  values_type val_;
639  };
640 } // namespace (anonymous)
641 
642  template<class Scalar, class LO, class GO, class Node>
643  std::ostream&
644  BlockCrsMatrix<Scalar, LO, GO, Node>::
645  markLocalErrorAndGetStream ()
646  {
647  * (this->localError_) = true;
648  if ((*errs_).is_null ()) {
649  *errs_ = Teuchos::rcp (new std::ostringstream ());
650  }
651  return **errs_;
652  }
653 
654  template<class Scalar, class LO, class GO, class Node>
657  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
658  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
659  blockSize_ (static_cast<LO> (0)),
660  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
661  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
662  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
663  offsetPerBlock_ (0),
664  localError_ (new bool (false)),
665  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
666  {
667  }
668 
669  template<class Scalar, class LO, class GO, class Node>
672  const LO blockSize) :
673  dist_object_type (graph.getMap ()),
674  graph_ (graph),
675  rowMeshMap_ (* (graph.getRowMap ())),
676  blockSize_ (blockSize),
677  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
678  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
679  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
680  offsetPerBlock_ (blockSize * blockSize),
681  localError_ (new bool (false)),
682  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
683  {
684  TEUCHOS_TEST_FOR_EXCEPTION(
685  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
686  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
687  "rows (isSorted() is false). This class assumes sorted rows.");
688 
689  graphRCP_ = Teuchos::rcpFromRef(graph_);
690 
691  // Trick to test whether LO is nonpositive, without a compiler
692  // warning in case LO is unsigned (which is generally a bad idea
693  // anyway). I don't promise that the trick works, but it
694  // generally does with gcc at least, in my experience.
695  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
696  TEUCHOS_TEST_FOR_EXCEPTION(
697  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
698  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
699  " <= 0. The block size must be positive.");
700 
701  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
702  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
703 
704  {
705  // These are rcp
706  const auto domainPointMap = getDomainMap();
707  const auto colPointMap = Teuchos::rcp
708  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
709  *pointImporter_ = Teuchos::rcp
710  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
711  }
712  {
713  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
714  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
715 
716  row_map_type ptr_d = graph.getLocalGraph ().row_map;
717  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
718  Kokkos::deep_copy (ptr_h_nc, ptr_d);
719  ptrHost_ = ptr_h_nc;
720  }
721  {
722  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
723  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
724 
725  entries_type ind_d = graph.getLocalGraph ().entries;
726  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
727  Kokkos::deep_copy (ind_h_nc, ind_d);
728  indHost_ = ind_h_nc;
729  }
730 
731  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
732  val_ = decltype (val_) ("val", numValEnt);
733  }
734 
735  template<class Scalar, class LO, class GO, class Node>
738  const map_type& domainPointMap,
739  const map_type& rangePointMap,
740  const LO blockSize) :
741  dist_object_type (graph.getMap ()),
742  graph_ (graph),
743  rowMeshMap_ (* (graph.getRowMap ())),
744  domainPointMap_ (domainPointMap),
745  rangePointMap_ (rangePointMap),
746  blockSize_ (blockSize),
747  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
748  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
749  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
750  offsetPerBlock_ (blockSize * blockSize),
751  localError_ (new bool (false)),
752  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
753  {
754  TEUCHOS_TEST_FOR_EXCEPTION(
755  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
756  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
757  "rows (isSorted() is false). This class assumes sorted rows.");
758 
759  graphRCP_ = Teuchos::rcpFromRef(graph_);
760 
761  // Trick to test whether LO is nonpositive, without a compiler
762  // warning in case LO is unsigned (which is generally a bad idea
763  // anyway). I don't promise that the trick works, but it
764  // generally does with gcc at least, in my experience.
765  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
766  TEUCHOS_TEST_FOR_EXCEPTION(
767  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
768  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
769  " <= 0. The block size must be positive.");
770  {
771  // These are rcp
772  const auto rcpDomainPointMap = getDomainMap();
773  const auto colPointMap = Teuchos::rcp
774  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
775  *pointImporter_ = Teuchos::rcp
776  (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
777  }
778  {
779  typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
780  typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;
781 
782  row_map_type ptr_d = graph.getLocalGraph ().row_map;
783  nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
784  Kokkos::deep_copy (ptr_h_nc, ptr_d);
785  ptrHost_ = ptr_h_nc;
786  }
787  {
788  typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
789  typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;
790 
791  entries_type ind_d = graph.getLocalGraph ().entries;
792  nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
793  Kokkos::deep_copy (ind_h_nc, ind_d);
794  indHost_ = ind_h_nc;
795  }
796 
797  const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
798  val_ = decltype (val_) ("val", numValEnt);
799  }
800 
801  template<class Scalar, class LO, class GO, class Node>
802  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
805  { // Copy constructor of map_type does a shallow copy.
806  // We're only returning by RCP for backwards compatibility.
807  return Teuchos::rcp (new map_type (domainPointMap_));
808  }
809 
810  template<class Scalar, class LO, class GO, class Node>
811  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
813  getRangeMap () const
814  { // Copy constructor of map_type does a shallow copy.
815  // We're only returning by RCP for backwards compatibility.
816  return Teuchos::rcp (new map_type (rangePointMap_));
817  }
818 
819  template<class Scalar, class LO, class GO, class Node>
820  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
822  getRowMap () const
823  {
824  return graph_.getRowMap();
825  }
826 
827  template<class Scalar, class LO, class GO, class Node>
828  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
830  getColMap () const
831  {
832  return graph_.getColMap();
833  }
834 
835  template<class Scalar, class LO, class GO, class Node>
839  {
840  return graph_.getGlobalNumRows();
841  }
842 
843  template<class Scalar, class LO, class GO, class Node>
844  size_t
847  {
848  return graph_.getNodeNumRows();
849  }
850 
851  template<class Scalar, class LO, class GO, class Node>
852  size_t
855  {
856  return graph_.getNodeMaxNumRowEntries();
857  }
858 
859  template<class Scalar, class LO, class GO, class Node>
860  void
862  apply (const mv_type& X,
863  mv_type& Y,
864  Teuchos::ETransp mode,
865  Scalar alpha,
866  Scalar beta) const
867  {
869  TEUCHOS_TEST_FOR_EXCEPTION(
870  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
871  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
872  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
873  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
874 
875  BMV X_view;
876  BMV Y_view;
877  const LO blockSize = getBlockSize ();
878  try {
879  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
880  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
881  }
882  catch (std::invalid_argument& e) {
883  TEUCHOS_TEST_FOR_EXCEPTION(
884  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
885  "apply: Either the input MultiVector X or the output MultiVector Y "
886  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
887  "graph. BlockMultiVector's constructor threw the following exception: "
888  << e.what ());
889  }
890 
891  try {
892  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
893  // either that or mark fields of this class as 'mutable'. The
894  // problem is that applyBlock wants to do lazy initialization of
895  // temporary block multivectors.
896  const_cast<this_type&> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
897  } catch (std::invalid_argument& e) {
898  TEUCHOS_TEST_FOR_EXCEPTION(
899  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
900  "apply: The implementation method applyBlock complained about having "
901  "an invalid argument. It reported the following: " << e.what ());
902  } catch (std::logic_error& e) {
903  TEUCHOS_TEST_FOR_EXCEPTION(
904  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
905  "apply: The implementation method applyBlock complained about a "
906  "possible bug in its implementation. It reported the following: "
907  << e.what ());
908  } catch (std::exception& e) {
909  TEUCHOS_TEST_FOR_EXCEPTION(
910  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
911  "apply: The implementation method applyBlock threw an exception which "
912  "is neither std::invalid_argument nor std::logic_error, but is a "
913  "subclass of std::exception. It reported the following: "
914  << e.what ());
915  } catch (...) {
916  TEUCHOS_TEST_FOR_EXCEPTION(
917  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
918  "apply: The implementation method applyBlock threw an exception which "
919  "is not an instance of a subclass of std::exception. This probably "
920  "indicates a bug. Please report this to the Tpetra developers.");
921  }
922  }
923 
924  template<class Scalar, class LO, class GO, class Node>
925  void
929  Teuchos::ETransp mode,
930  const Scalar alpha,
931  const Scalar beta)
932  {
933  TEUCHOS_TEST_FOR_EXCEPTION(
934  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
935  "Tpetra::BlockCrsMatrix::applyBlock: "
936  "X and Y have different block sizes. X.getBlockSize() = "
937  << X.getBlockSize () << " != Y.getBlockSize() = "
938  << Y.getBlockSize () << ".");
939 
940  if (mode == Teuchos::NO_TRANS) {
941  applyBlockNoTrans (X, Y, alpha, beta);
942  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
943  applyBlockTrans (X, Y, mode, alpha, beta);
944  } else {
945  TEUCHOS_TEST_FOR_EXCEPTION(
946  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
947  "applyBlock: Invalid 'mode' argument. Valid values are "
948  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
949  }
950  }
951 
952  template<class Scalar, class LO, class GO, class Node>
953  void
955  setAllToScalar (const Scalar& alpha)
956  {
957 #ifdef HAVE_TPETRA_DEBUG
958  const char prefix[] = "Tpetra::BlockCrsMatrix::setAllToScalar: ";
959 #endif // HAVE_TPETRA_DEBUG
960 
961  if (this->need_sync_device ()) {
962  // If we need to sync to device, then the data were last
963  // modified on host. In that case, we should again modify them
964  // on host.
965 #ifdef HAVE_TPETRA_DEBUG
966  TEUCHOS_TEST_FOR_EXCEPTION
967  (this->need_sync_host (), std::runtime_error,
968  prefix << "The matrix's values need sync on both device and host.");
969 #endif // HAVE_TPETRA_DEBUG
970  this->modify_host ();
971  Kokkos::deep_copy (getValuesHost (), alpha);
972  }
973  else {
974  // If we need to sync to host, then the data were last modified
975  // on device. In that case, we should again modify them on
976  // device. Also, prefer modifying on device if neither side is
977  // marked as modified.
978  this->modify_device ();
979  Kokkos::deep_copy (this->getValuesDevice (), alpha);
980  }
981  }
982 
983  template<class Scalar, class LO, class GO, class Node>
984  LO
986  replaceLocalValues (const LO localRowInd,
987  const LO colInds[],
988  const Scalar vals[],
989  const LO numColInds) const
990  {
991 #ifdef HAVE_TPETRA_DEBUG
992  const char prefix[] =
993  "Tpetra::BlockCrsMatrix::replaceLocalValues: ";
994 #endif // HAVE_TPETRA_DEBUG
995 
996  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
997  // We modified no values, because the input local row index is
998  // invalid on the calling process. That may not be an error, if
999  // numColInds is zero anyway; it doesn't matter. This is the
1000  // advantage of returning the number of valid indices.
1001  return static_cast<LO> (0);
1002  }
1003  const impl_scalar_type* const vIn =
1004  reinterpret_cast<const impl_scalar_type*> (vals);
1005  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1006  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1007  const LO perBlockSize = this->offsetPerBlock ();
1008  LO hint = 0; // Guess for the relative offset into the current row
1009  LO pointOffset = 0; // Current offset into input values
1010  LO validCount = 0; // number of valid column indices in colInds
1011 
1012 #ifdef HAVE_TPETRA_DEBUG
1013  TEUCHOS_TEST_FOR_EXCEPTION
1014  (this->need_sync_host (), std::runtime_error,
1015  prefix << "The matrix's data were last modified on device, but have "
1016  "not been sync'd to host. Please sync to host (by calling "
1017  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1018  "method.");
1019 #endif // HAVE_TPETRA_DEBUG
1020 
1021  auto vals_host_out = getValuesHost ();
1022  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1023 
1024  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1025  const LO relBlockOffset =
1026  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1027  if (relBlockOffset != LINV) {
1028  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1029  // are stored contiguously, with no padding. "Contiguously"
1030  // means that all memory between the first and last entries
1031  // belongs to the block (no striding). "No padding" means
1032  // that getBlockSize() * getBlockSize() is exactly the number
1033  // of entries that the block uses. For another place where
1034  // this assumption is encoded, see sumIntoLocalValues.
1035 
1036  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1037  // little_block_type A_old =
1038  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1039  impl_scalar_type* const A_old =
1040  vals_host_out_raw + absBlockOffset * perBlockSize;
1041  // const_little_block_type A_new =
1042  // getConstLocalBlockFromInput (vIn, pointOffset);
1043  const impl_scalar_type* const A_new = vIn + pointOffset;
1044  // COPY (A_new, A_old);
1045  for (LO i = 0; i < perBlockSize; ++i) {
1046  A_old[i] = A_new[i];
1047  }
1048  hint = relBlockOffset + 1;
1049  ++validCount;
1050  }
1051  }
1052  return validCount;
1053  }
1054 
1055  template <class Scalar, class LO, class GO, class Node>
1056  void
1058  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
1059  Kokkos::MemoryUnmanaged>& offsets) const
1060  {
1061  graph_.getLocalDiagOffsets (offsets);
1062  }
1063 
1064 
1065  template <class Scalar, class LO, class GO, class Node>
1066  void
1070  const Kokkos::View<impl_scalar_type***, device_type,
1071  Kokkos::MemoryUnmanaged>& D_inv,
1072  const Scalar& omega,
1073  const ESweepDirection direction) const
1074  {
1075  using Kokkos::ALL;
1076  const impl_scalar_type zero =
1077  Kokkos::Details::ArithTraits<impl_scalar_type>::zero ();
1078  const impl_scalar_type one =
1079  Kokkos::Details::ArithTraits<impl_scalar_type>::one ();
1080  const LO numLocalMeshRows =
1081  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1082  const LO numVecs = static_cast<LO> (X.getNumVectors ());
1083 
1084  // If using (new) Kokkos, replace localMem with thread-local
1085  // memory. Note that for larger block sizes, this will affect the
1086  // two-level parallelization. Look to Stokhos for best practice
1087  // on making this fast for GPUs.
1088  const LO blockSize = getBlockSize ();
1089  Teuchos::Array<impl_scalar_type> localMem (blockSize);
1090  Teuchos::Array<impl_scalar_type> localMat (blockSize*blockSize);
1091  little_vec_type X_lcl (localMem.getRawPtr (), blockSize);
1092 
1093  // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned.
1094  LO rowBegin = 0, rowEnd = 0, rowStride = 0;
1095  if (direction == Forward) {
1096  rowBegin = 1;
1097  rowEnd = numLocalMeshRows+1;
1098  rowStride = 1;
1099  }
1100  else if (direction == Backward) {
1101  rowBegin = numLocalMeshRows;
1102  rowEnd = 0;
1103  rowStride = -1;
1104  }
1105  else if (direction == Symmetric) {
1106  this->localGaussSeidel (B, X, D_inv, omega, Forward);
1107  this->localGaussSeidel (B, X, D_inv, omega, Backward);
1108  return;
1109  }
1110 
1111  const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
1112  const Scalar minus_omega = -omega;
1113 
1114  if (numVecs == 1) {
1115  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1116  const LO actlRow = lclRow - 1;
1117 
1118  little_vec_type B_cur = B.getLocalBlock (actlRow, 0);
1119  COPY (B_cur, X_lcl);
1120  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1121 
1122  const size_t meshBeg = ptrHost_[actlRow];
1123  const size_t meshEnd = ptrHost_[actlRow+1];
1124  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1125  const LO meshCol = indHost_[absBlkOff];
1126  const_little_block_type A_cur =
1127  getConstLocalBlockFromAbsOffset (absBlkOff);
1128  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
1129 
1130  // X_lcl += alpha*A_cur*X_cur
1131  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1132  //X_lcl.matvecUpdate (alpha, A_cur, X_cur);
1133  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1134  } // for each entry in the current local row of the matrix
1135 
1136  // NOTE (mfh 20 Jan 2016) The two input Views here are
1137  // unmanaged already, so we don't have to take unmanaged
1138  // subviews first.
1139  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1140  little_vec_type X_update = X.getLocalBlock (actlRow, 0);
1141  FILL (X_update, zero);
1142  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1143  } // for each local row of the matrix
1144  }
1145  else {
1146  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1147  for (LO j = 0; j < numVecs; ++j) {
1148  LO actlRow = lclRow-1;
1149 
1150  little_vec_type B_cur = B.getLocalBlock (actlRow, j);
1151  COPY (B_cur, X_lcl);
1152  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1153 
1154  const size_t meshBeg = ptrHost_[actlRow];
1155  const size_t meshEnd = ptrHost_[actlRow+1];
1156  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1157  const LO meshCol = indHost_[absBlkOff];
1158  const_little_block_type A_cur =
1159  getConstLocalBlockFromAbsOffset (absBlkOff);
1160  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
1161 
1162  // X_lcl += alpha*A_cur*X_cur
1163  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1164  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1165  } // for each entry in the current local row of the matrx
1166 
1167  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1168  auto X_update = X.getLocalBlock (actlRow, j);
1169  FILL (X_update, zero);
1170  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1171  } // for each entry in the current local row of the matrix
1172  } // for each local row of the matrix
1173  }
1174  }
1175 
1176  template <class Scalar, class LO, class GO, class Node>
1177  void
1180  const MultiVector<Scalar,LO,GO,Node> &/* B */,
1181  const MultiVector<Scalar,LO,GO,Node> &/* D */,
1182  const Scalar& /* dampingFactor */,
1183  const ESweepDirection /* direction */,
1184  const int /* numSweeps */,
1185  const bool /* zeroInitialGuess */) const
1186  {
1187  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1188  // interface for block Gauss-Seidel.
1189  TEUCHOS_TEST_FOR_EXCEPTION(
1190  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
1191  "gaussSeidelCopy: Not implemented.");
1192  }
1193 
1194  template <class Scalar, class LO, class GO, class Node>
1195  void
1198  const MultiVector<Scalar,LO,GO,Node>& /* B */,
1199  const MultiVector<Scalar,LO,GO,Node>& /* D */,
1200  const Teuchos::ArrayView<LO>& /* rowIndices */,
1201  const Scalar& /* dampingFactor */,
1202  const ESweepDirection /* direction */,
1203  const int /* numSweeps */,
1204  const bool /* zeroInitialGuess */) const
1205  {
1206  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1207  // interface for block Gauss-Seidel.
1208  TEUCHOS_TEST_FOR_EXCEPTION(
1209  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
1210  "reorderedGaussSeidelCopy: Not implemented.");
1211  }
1212 
1213 
1214  template <class Scalar, class LO, class GO, class Node>
1215  void
1217  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1218  Kokkos::MemoryUnmanaged>& diag,
1219  const Kokkos::View<const size_t*, device_type,
1220  Kokkos::MemoryUnmanaged>& offsets) const
1221  {
1222  using Kokkos::parallel_for;
1223  typedef typename device_type::execution_space execution_space;
1224  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
1225 
1226  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1227  const LO blockSize = this->getBlockSize ();
1228  TEUCHOS_TEST_FOR_EXCEPTION
1229  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1230  static_cast<LO> (diag.extent (1)) < blockSize ||
1231  static_cast<LO> (diag.extent (2)) < blockSize,
1232  std::invalid_argument, prefix <<
1233  "The input Kokkos::View is not big enough to hold all the data.");
1234  TEUCHOS_TEST_FOR_EXCEPTION
1235  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
1236  prefix << "offsets.size() = " << offsets.size () << " < local number of "
1237  "diagonal blocks " << lclNumMeshRows << ".");
1238 
1239 #ifdef HAVE_TPETRA_DEBUG
1240  TEUCHOS_TEST_FOR_EXCEPTION
1241  (this->template need_sync<device_type> (), std::runtime_error,
1242  prefix << "The matrix's data were last modified on host, but have "
1243  "not been sync'd to device. Please sync to device (by calling "
1244  "sync<device_type>() on this matrix) before calling this method.");
1245 #endif // HAVE_TPETRA_DEBUG
1246 
1247  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
1248  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
1249 
1250  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
1251  // we reserve the right to do lazy allocation of device data. (We
1252  // don't plan to do lazy allocation for host data; the host
1253  // version of the data always exists.)
1255  auto vals_dev =
1256  const_cast<this_type*> (this)->template getValues<device_type> ();
1257 
1258  parallel_for (policy_type (0, lclNumMeshRows),
1259  functor_type (diag, vals_dev, offsets,
1260  graph_.getLocalGraph ().row_map, blockSize_));
1261  }
1262 
1263 
1264  template <class Scalar, class LO, class GO, class Node>
1265  void
1267  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1268  Kokkos::MemoryUnmanaged>& diag,
1269  const Teuchos::ArrayView<const size_t>& offsets) const
1270  {
1271  using Kokkos::ALL;
1272  using Kokkos::parallel_for;
1273  typedef typename Kokkos::View<impl_scalar_type***, device_type,
1274  Kokkos::MemoryUnmanaged>::HostMirror::execution_space host_exec_space;
1275 
1276  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1277  const LO blockSize = this->getBlockSize ();
1278  TEUCHOS_TEST_FOR_EXCEPTION
1279  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1280  static_cast<LO> (diag.extent (1)) < blockSize ||
1281  static_cast<LO> (diag.extent (2)) < blockSize,
1282  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1283  "The input Kokkos::View is not big enough to hold all the data.");
1284  TEUCHOS_TEST_FOR_EXCEPTION
1285  (static_cast<LO> (offsets.size ()) < lclNumMeshRows,
1286  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1287  "offsets.size() = " << offsets.size () << " < local number of diagonal "
1288  "blocks " << lclNumMeshRows << ".");
1289 
1290  // mfh 12 Dec 2015: Use the host execution space, since we haven't
1291  // quite made everything work with CUDA yet.
1292  typedef Kokkos::RangePolicy<host_exec_space, LO> policy_type;
1293  parallel_for (policy_type (0, lclNumMeshRows), [=] (const LO& lclMeshRow) {
1294  auto D_in = this->getConstLocalBlockFromRelOffset (lclMeshRow, offsets[lclMeshRow]);
1295  auto D_out = Kokkos::subview (diag, lclMeshRow, ALL (), ALL ());
1296  COPY (D_in, D_out);
1297  });
1298  }
1299 
1300 
1301  template<class Scalar, class LO, class GO, class Node>
1302  LO
1304  absMaxLocalValues (const LO localRowInd,
1305  const LO colInds[],
1306  const Scalar vals[],
1307  const LO numColInds) const
1308  {
1309  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1310  // We modified no values, because the input local row index is
1311  // invalid on the calling process. That may not be an error, if
1312  // numColInds is zero anyway; it doesn't matter. This is the
1313  // advantage of returning the number of valid indices.
1314  return static_cast<LO> (0);
1315  }
1316  const impl_scalar_type* const vIn =
1317  reinterpret_cast<const impl_scalar_type*> (vals);
1318  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1319  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1320  const LO perBlockSize = this->offsetPerBlock ();
1321  LO hint = 0; // Guess for the relative offset into the current row
1322  LO pointOffset = 0; // Current offset into input values
1323  LO validCount = 0; // number of valid column indices in colInds
1324 
1325  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1326  const LO relBlockOffset =
1327  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1328  if (relBlockOffset != LINV) {
1329  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1330  little_block_type A_old =
1331  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1332  const_little_block_type A_new =
1333  getConstLocalBlockFromInput (vIn, pointOffset);
1334 
1335  ::Tpetra::Impl::absMax (A_old, A_new);
1336  hint = relBlockOffset + 1;
1337  ++validCount;
1338  }
1339  }
1340  return validCount;
1341  }
1342 
1343 
1344  template<class Scalar, class LO, class GO, class Node>
1345  LO
1347  sumIntoLocalValues (const LO localRowInd,
1348  const LO colInds[],
1349  const Scalar vals[],
1350  const LO numColInds) const
1351  {
1352 #ifdef HAVE_TPETRA_DEBUG
1353  const char prefix[] =
1354  "Tpetra::BlockCrsMatrix::sumIntoLocalValues: ";
1355 #endif // HAVE_TPETRA_DEBUG
1356 
1357  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1358  // We modified no values, because the input local row index is
1359  // invalid on the calling process. That may not be an error, if
1360  // numColInds is zero anyway; it doesn't matter. This is the
1361  // advantage of returning the number of valid indices.
1362  return static_cast<LO> (0);
1363  }
1364  //const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1365  const impl_scalar_type* const vIn =
1366  reinterpret_cast<const impl_scalar_type*> (vals);
1367  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1368  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1369  const LO perBlockSize = this->offsetPerBlock ();
1370  LO hint = 0; // Guess for the relative offset into the current row
1371  LO pointOffset = 0; // Current offset into input values
1372  LO validCount = 0; // number of valid column indices in colInds
1373 
1374 #ifdef HAVE_TPETRA_DEBUG
1375  TEUCHOS_TEST_FOR_EXCEPTION
1376  (this->need_sync_host (), std::runtime_error,
1377  prefix << "The matrix's data were last modified on device, but have not "
1378  "been sync'd to host. Please sync to host (by calling "
1379  "sync<Kokkos::HostSpace>() on this matrix) before calling this method.");
1380 #endif // HAVE_TPETRA_DEBUG
1381 
1382  auto vals_host_out =
1383  getValuesHost ();
1384  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1385 
1386  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1387  const LO relBlockOffset =
1388  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1389  if (relBlockOffset != LINV) {
1390  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1391  // are stored contiguously, with no padding. "Contiguously"
1392  // means that all memory between the first and last entries
1393  // belongs to the block (no striding). "No padding" means
1394  // that getBlockSize() * getBlockSize() is exactly the number
1395  // of entries that the block uses. For another place where
1396  // this assumption is encoded, see replaceLocalValues.
1397 
1398  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1399  // little_block_type A_old =
1400  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1401  impl_scalar_type* const A_old =
1402  vals_host_out_raw + absBlockOffset * perBlockSize;
1403  // const_little_block_type A_new =
1404  // getConstLocalBlockFromInput (vIn, pointOffset);
1405  const impl_scalar_type* const A_new = vIn + pointOffset;
1406  // AXPY (ONE, A_new, A_old);
1407  for (LO i = 0; i < perBlockSize; ++i) {
1408  A_old[i] += A_new[i];
1409  }
1410  hint = relBlockOffset + 1;
1411  ++validCount;
1412  }
1413  }
1414  return validCount;
1415  }
1416 
1417  template<class Scalar, class LO, class GO, class Node>
1418  LO
1420  getLocalRowView (const LO localRowInd,
1421  const LO*& colInds,
1422  Scalar*& vals,
1423  LO& numInds) const
1424  {
1425 #ifdef HAVE_TPETRA_DEBUG
1426  const char prefix[] =
1427  "Tpetra::BlockCrsMatrix::getLocalRowView: ";
1428 #endif // HAVE_TPETRA_DEBUG
1429 
1430  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1431  colInds = NULL;
1432  vals = NULL;
1433  numInds = 0;
1434  return Teuchos::OrdinalTraits<LO>::invalid ();
1435  }
1436  else {
1437  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
1438  colInds = indHost_.data () + absBlockOffsetStart;
1439 
1440 #ifdef HAVE_TPETRA_DEBUG
1441  TEUCHOS_TEST_FOR_EXCEPTION
1442  (this->need_sync_host (), std::runtime_error,
1443  prefix << "The matrix's data were last modified on device, but have "
1444  "not been sync'd to host. Please sync to host (by calling "
1445  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1446  "method.");
1447 #endif // HAVE_TPETRA_DEBUG
1448 
1449  auto vals_host_out = getValuesHost ();
1450  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1451  impl_scalar_type* const vOut = vals_host_out_raw +
1452  absBlockOffsetStart * offsetPerBlock ();
1453  vals = reinterpret_cast<Scalar*> (vOut);
1454 
1455  numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
1456  return 0; // indicates no error
1457  }
1458  }
1459 
1460  template<class Scalar, class LO, class GO, class Node>
1461  void
1463  getLocalRowCopy (LO LocalRow,
1464  const Teuchos::ArrayView<LO>& Indices,
1465  const Teuchos::ArrayView<Scalar>& Values,
1466  size_t &NumEntries) const
1467  {
1468  const LO *colInds;
1469  Scalar *vals;
1470  LO numInds;
1471  getLocalRowView(LocalRow,colInds,vals,numInds);
1472  if (numInds > Indices.size() || numInds*blockSize_*blockSize_ > Values.size()) {
1473  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1474  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1475  << numInds << " row entries");
1476  }
1477  for (LO i=0; i<numInds; ++i) {
1478  Indices[i] = colInds[i];
1479  }
1480  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1481  Values[i] = vals[i];
1482  }
1483  NumEntries = numInds;
1484  }
1485 
1486  template<class Scalar, class LO, class GO, class Node>
1487  LO
1489  getLocalRowOffsets (const LO localRowInd,
1490  ptrdiff_t offsets[],
1491  const LO colInds[],
1492  const LO numColInds) const
1493  {
1494  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1495  // We got no offsets, because the input local row index is
1496  // invalid on the calling process. That may not be an error, if
1497  // numColInds is zero anyway; it doesn't matter. This is the
1498  // advantage of returning the number of valid indices.
1499  return static_cast<LO> (0);
1500  }
1501 
1502  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1503  LO hint = 0; // Guess for the relative offset into the current row
1504  LO validCount = 0; // number of valid column indices in colInds
1505 
1506  for (LO k = 0; k < numColInds; ++k) {
1507  const LO relBlockOffset =
1508  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1509  if (relBlockOffset != LINV) {
1510  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1511  hint = relBlockOffset + 1;
1512  ++validCount;
1513  }
1514  }
1515  return validCount;
1516  }
1517 
1518 
1519  template<class Scalar, class LO, class GO, class Node>
1520  LO
1522  replaceLocalValuesByOffsets (const LO localRowInd,
1523  const ptrdiff_t offsets[],
1524  const Scalar vals[],
1525  const LO numOffsets) const
1526  {
1527  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1528  // We modified no values, because the input local row index is
1529  // invalid on the calling process. That may not be an error, if
1530  // numColInds is zero anyway; it doesn't matter. This is the
1531  // advantage of returning the number of valid indices.
1532  return static_cast<LO> (0);
1533  }
1534  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1535 
1536  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1537  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1538  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1539  size_t pointOffset = 0; // Current offset into input values
1540  LO validCount = 0; // number of valid offsets
1541 
1542  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1543  const size_t relBlockOffset = offsets[k];
1544  if (relBlockOffset != STINV) {
1545  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1546  little_block_type A_old =
1547  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1548  const_little_block_type A_new =
1549  getConstLocalBlockFromInput (vIn, pointOffset);
1550  COPY (A_new, A_old);
1551  ++validCount;
1552  }
1553  }
1554  return validCount;
1555  }
1556 
1557 
1558  template<class Scalar, class LO, class GO, class Node>
1559  LO
1561  absMaxLocalValuesByOffsets (const LO localRowInd,
1562  const ptrdiff_t offsets[],
1563  const Scalar vals[],
1564  const LO numOffsets) const
1565  {
1566  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1567  // We modified no values, because the input local row index is
1568  // invalid on the calling process. That may not be an error, if
1569  // numColInds is zero anyway; it doesn't matter. This is the
1570  // advantage of returning the number of valid indices.
1571  return static_cast<LO> (0);
1572  }
1573  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1574 
1575  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1576  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1577  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1578  size_t pointOffset = 0; // Current offset into input values
1579  LO validCount = 0; // number of valid offsets
1580 
1581  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1582  const size_t relBlockOffset = offsets[k];
1583  if (relBlockOffset != STINV) {
1584  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1585  little_block_type A_old =
1586  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1587  const_little_block_type A_new =
1588  getConstLocalBlockFromInput (vIn, pointOffset);
1589  ::Tpetra::Impl::absMax (A_old, A_new);
1590  ++validCount;
1591  }
1592  }
1593  return validCount;
1594  }
1595 
1596 
1597  template<class Scalar, class LO, class GO, class Node>
1598  LO
1600  sumIntoLocalValuesByOffsets (const LO localRowInd,
1601  const ptrdiff_t offsets[],
1602  const Scalar vals[],
1603  const LO numOffsets) const
1604  {
1605  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1606  // We modified no values, because the input local row index is
1607  // invalid on the calling process. That may not be an error, if
1608  // numColInds is zero anyway; it doesn't matter. This is the
1609  // advantage of returning the number of valid indices.
1610  return static_cast<LO> (0);
1611  }
1612  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1613  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1614 
1615  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1616  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1617  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1618  size_t pointOffset = 0; // Current offset into input values
1619  LO validCount = 0; // number of valid offsets
1620 
1621  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1622  const size_t relBlockOffset = offsets[k];
1623  if (relBlockOffset != STINV) {
1624  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1625  little_block_type A_old =
1626  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1627  const_little_block_type A_new =
1628  getConstLocalBlockFromInput (vIn, pointOffset);
1629  //A_old.update (ONE, A_new);
1630  AXPY (ONE, A_new, A_old);
1631  ++validCount;
1632  }
1633  }
1634  return validCount;
1635  }
1636 
1637 
1638  template<class Scalar, class LO, class GO, class Node>
1639  size_t
1641  getNumEntriesInLocalRow (const LO localRowInd) const
1642  {
1643  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1644  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1645  return static_cast<LO> (0); // the calling process doesn't have that row
1646  } else {
1647  return static_cast<LO> (numEntInGraph);
1648  }
1649  }
1650 
1651  template<class Scalar, class LO, class GO, class Node>
1652  void
1656  const Teuchos::ETransp mode,
1657  const Scalar alpha,
1658  const Scalar beta)
1659  {
1660  (void) X;
1661  (void) Y;
1662  (void) mode;
1663  (void) alpha;
1664  (void) beta;
1665 
1666  TEUCHOS_TEST_FOR_EXCEPTION(
1667  true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
1668  "transpose and conjugate transpose modes are not yet implemented.");
1669  }
1670 
1671  template<class Scalar, class LO, class GO, class Node>
1672  void
1673  BlockCrsMatrix<Scalar, LO, GO, Node>::
1674  applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1675  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1676  const Scalar alpha,
1677  const Scalar beta)
1678  {
1679  using Teuchos::RCP;
1680  using Teuchos::rcp;
1681  typedef ::Tpetra::Import<LO, GO, Node> import_type;
1682  typedef ::Tpetra::Export<LO, GO, Node> export_type;
1683  const Scalar zero = STS::zero ();
1684  const Scalar one = STS::one ();
1685  RCP<const import_type> import = graph_.getImporter ();
1686  // "export" is a reserved C++ keyword, so we can't use it.
1687  RCP<const export_type> theExport = graph_.getExporter ();
1688  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1689 
1690  if (alpha == zero) {
1691  if (beta == zero) {
1692  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1693  }
1694  else if (beta != one) {
1695  Y.scale (beta);
1696  }
1697  }
1698  else { // alpha != 0
1699  const BMV* X_colMap = NULL;
1700  if (import.is_null ()) {
1701  try {
1702  X_colMap = &X;
1703  }
1704  catch (std::exception& e) {
1705  TEUCHOS_TEST_FOR_EXCEPTION
1706  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1707  "operator= threw an exception: " << std::endl << e.what ());
1708  }
1709  }
1710  else {
1711  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1712  // Y_rowMap_ below. This lets us do lazy initialization
1713  // correctly with view semantics of BlockCrsMatrix. All views
1714  // of this BlockCrsMatrix have the same outer pointer. That
1715  // way, we can set the inner pointer in one view, and all
1716  // other views will see it.
1717  if ((*X_colMap_).is_null () ||
1718  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1719  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1720  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1721  static_cast<LO> (X.getNumVectors ())));
1722  }
1723  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1724  **pointImporter_,
1726  try {
1727  X_colMap = &(**X_colMap_);
1728  }
1729  catch (std::exception& e) {
1730  TEUCHOS_TEST_FOR_EXCEPTION
1731  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1732  "operator= threw an exception: " << std::endl << e.what ());
1733  }
1734  }
1735 
1736  BMV* Y_rowMap = NULL;
1737  if (theExport.is_null ()) {
1738  Y_rowMap = &Y;
1739  }
1740  else if ((*Y_rowMap_).is_null () ||
1741  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1742  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1743  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1744  static_cast<LO> (X.getNumVectors ())));
1745  try {
1746  Y_rowMap = &(**Y_rowMap_);
1747  }
1748  catch (std::exception& e) {
1749  TEUCHOS_TEST_FOR_EXCEPTION(
1750  true, std::logic_error, prefix << "Tpetra::MultiVector::"
1751  "operator= threw an exception: " << std::endl << e.what ());
1752  }
1753  }
1754 
1755  try {
1756  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1757  }
1758  catch (std::exception& e) {
1759  TEUCHOS_TEST_FOR_EXCEPTION
1760  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1761  "an exception: " << e.what ());
1762  }
1763  catch (...) {
1764  TEUCHOS_TEST_FOR_EXCEPTION
1765  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1766  "an exception not a subclass of std::exception.");
1767  }
1768 
1769  if (! theExport.is_null ()) {
1770  Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1771  }
1772  }
1773  }
1774 
1775  template<class Scalar, class LO, class GO, class Node>
1776  void
1777  BlockCrsMatrix<Scalar, LO, GO, Node>::
1778  localApplyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1779  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1780  const Scalar alpha,
1781  const Scalar beta)
1782  {
1783  using ::Tpetra::Impl::bcrsLocalApplyNoTrans;
1784 
1785  const impl_scalar_type alpha_impl = alpha;
1786  const auto graph = this->graph_.getLocalGraph ();
1787  const impl_scalar_type beta_impl = beta;
1788  const LO blockSize = this->getBlockSize ();
1789 
1790  auto X_mv = X.getMultiVectorView ();
1791  auto Y_mv = Y.getMultiVectorView ();
1792  Y_mv.template modify<device_type> ();
1793 
1794  auto X_lcl = X_mv.template getLocalView<device_type> ();
1795  auto Y_lcl = Y_mv.template getLocalView<device_type> ();
1796  auto val = this->val_.template view<device_type> ();
1797 
1798  bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1799  beta_impl, Y_lcl);
1800  }
1801 
1802  template<class Scalar, class LO, class GO, class Node>
1803  LO
1804  BlockCrsMatrix<Scalar, LO, GO, Node>::
1805  findRelOffsetOfColumnIndex (const LO localRowIndex,
1806  const LO colIndexToFind,
1807  const LO hint) const
1808  {
1809  const size_t absStartOffset = ptrHost_[localRowIndex];
1810  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1811  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1812  // Amortize pointer arithmetic over the search loop.
1813  const LO* const curInd = indHost_.data () + absStartOffset;
1814 
1815  // If the hint was correct, then the hint is the offset to return.
1816  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1817  // Always return the offset relative to the current row.
1818  return hint;
1819  }
1820 
1821  // The hint was wrong, so we must search for the given column
1822  // index in the column indices for the given row.
1823  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1824 
1825  // We require that the graph have sorted rows. However, binary
1826  // search only pays if the current row is longer than a certain
1827  // amount. We set this to 32, but you might want to tune this.
1828  const LO maxNumEntriesForLinearSearch = 32;
1829  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1830  // Use binary search. It would probably be better for us to
1831  // roll this loop by hand. If we wrote it right, a smart
1832  // compiler could perhaps use conditional loads and avoid
1833  // branches (according to Jed Brown on May 2014).
1834  const LO* beg = curInd;
1835  const LO* end = curInd + numEntriesInRow;
1836  std::pair<const LO*, const LO*> p =
1837  std::equal_range (beg, end, colIndexToFind);
1838  if (p.first != p.second) {
1839  // offset is relative to the current row
1840  relOffset = static_cast<LO> (p.first - beg);
1841  }
1842  }
1843  else { // use linear search
1844  for (LO k = 0; k < numEntriesInRow; ++k) {
1845  if (colIndexToFind == curInd[k]) {
1846  relOffset = k; // offset is relative to the current row
1847  break;
1848  }
1849  }
1850  }
1851 
1852  return relOffset;
1853  }
1854 
1855  template<class Scalar, class LO, class GO, class Node>
1856  LO
1857  BlockCrsMatrix<Scalar, LO, GO, Node>::
1858  offsetPerBlock () const
1859  {
1860  return offsetPerBlock_;
1861  }
1862 
1863  template<class Scalar, class LO, class GO, class Node>
1864  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1865  BlockCrsMatrix<Scalar, LO, GO, Node>::
1866  getConstLocalBlockFromInput (const impl_scalar_type* val,
1867  const size_t pointOffset) const
1868  {
1869  // Row major blocks
1870  const LO rowStride = blockSize_;
1871  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1872  }
1873 
1874  template<class Scalar, class LO, class GO, class Node>
1875  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1876  BlockCrsMatrix<Scalar, LO, GO, Node>::
1877  getNonConstLocalBlockFromInput (impl_scalar_type* val,
1878  const size_t pointOffset) const
1879  {
1880  // Row major blocks
1881  const LO rowStride = blockSize_;
1882  return little_block_type (val + pointOffset, blockSize_, rowStride);
1883  }
1884 
1885  template<class Scalar, class LO, class GO, class Node>
1886  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1887  BlockCrsMatrix<Scalar, LO, GO, Node>::
1888  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
1889  {
1890 #ifdef HAVE_TPETRA_DEBUG
1891  const char prefix[] =
1892  "Tpetra::BlockCrsMatrix::getConstLocalBlockFromAbsOffset: ";
1893 #endif // HAVE_TPETRA_DEBUG
1894 
1895  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
1896  // An empty block signifies an error. We don't expect to see
1897  // this error in correct code, but it's helpful for avoiding
1898  // memory corruption in case there is a bug.
1899  return const_little_block_type ();
1900  }
1901  else {
1902 #ifdef HAVE_TPETRA_DEBUG
1903  TEUCHOS_TEST_FOR_EXCEPTION
1904  (this->need_sync_host (), std::runtime_error,
1905  prefix << "The matrix's data were last modified on device, but have "
1906  "not been sync'd to host. Please sync to host (by calling "
1907  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1908  "method.");
1909 #endif // HAVE_TPETRA_DEBUG
1910  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
1911 
1912  auto vals_host = getValuesHost ();
1913  const impl_scalar_type* vals_host_raw = vals_host.data ();
1914 
1915  return getConstLocalBlockFromInput (vals_host_raw, absPointOffset);
1916  }
1917  }
1918 
1919  template<class Scalar, class LO, class GO, class Node>
1920  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1921  BlockCrsMatrix<Scalar, LO, GO, Node>::
1922  getConstLocalBlockFromRelOffset (const LO lclMeshRow,
1923  const size_t relMeshOffset) const
1924  {
1925  typedef impl_scalar_type IST;
1926 
1927  const LO* lclColInds = NULL;
1928  Scalar* lclVals = NULL;
1929  LO numEnt = 0;
1930 
1931  LO err = this->getLocalRowView (lclMeshRow, lclColInds, lclVals, numEnt);
1932  if (err != 0) {
1933  // An empty block signifies an error. We don't expect to see
1934  // this error in correct code, but it's helpful for avoiding
1935  // memory corruption in case there is a bug.
1936  return const_little_block_type ();
1937  }
1938  else {
1939  const size_t relPointOffset = relMeshOffset * this->offsetPerBlock ();
1940  IST* lclValsImpl = reinterpret_cast<IST*> (lclVals);
1941  return this->getConstLocalBlockFromInput (const_cast<const IST*> (lclValsImpl),
1942  relPointOffset);
1943  }
1944  }
1945 
1946  template<class Scalar, class LO, class GO, class Node>
1947  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1948  BlockCrsMatrix<Scalar, LO, GO, Node>::
1949  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
1950  {
1951 #ifdef HAVE_TPETRA_DEBUG
1952  const char prefix[] =
1953  "Tpetra::BlockCrsMatrix::getNonConstLocalBlockFromAbsOffset: ";
1954 #endif // HAVE_TPETRA_DEBUG
1955 
1956  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
1957  // An empty block signifies an error. We don't expect to see
1958  // this error in correct code, but it's helpful for avoiding
1959  // memory corruption in case there is a bug.
1960  return little_block_type ();
1961  }
1962  else {
1963  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
1964 #ifdef HAVE_TPETRA_DEBUG
1965  TEUCHOS_TEST_FOR_EXCEPTION
1966  (this->need_sync_host (), std::runtime_error,
1967  prefix << "The matrix's data were last modified on device, but have "
1968  "not been sync'd to host. Please sync to host (by calling "
1969  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1970  "method.");
1971 #endif // HAVE_TPETRA_DEBUG
1972  auto vals_host = getValuesHost();
1973  impl_scalar_type* vals_host_raw = vals_host.data ();
1974  return getNonConstLocalBlockFromInput (vals_host_raw, absPointOffset);
1975  }
1976  }
1977 
1978  template<class Scalar, class LO, class GO, class Node>
1979  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1980  BlockCrsMatrix<Scalar, LO, GO, Node>::
1981  getLocalBlock (const LO localRowInd, const LO localColInd) const
1982  {
1983  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1984  const LO relBlockOffset =
1985  this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1986 
1987  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1988  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1989  return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1990  }
1991  else {
1992  return little_block_type ();
1993  }
1994  }
1995 
1996  // template<class Scalar, class LO, class GO, class Node>
1997  // void
1998  // BlockCrsMatrix<Scalar, LO, GO, Node>::
1999  // clearLocalErrorStateAndStream ()
2000  // {
2001  // typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2002  // * (const_cast<this_type*> (this)->localError_) = false;
2003  // *errs_ = Teuchos::null;
2004  // }
2005 
2006  template<class Scalar, class LO, class GO, class Node>
2007  bool
2008  BlockCrsMatrix<Scalar, LO, GO, Node>::
2009  checkSizes (const ::Tpetra::SrcDistObject& source)
2010  {
2011  using std::endl;
2012  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2013  const this_type* src = dynamic_cast<const this_type* > (&source);
2014 
2015  if (src == NULL) {
2016  std::ostream& err = markLocalErrorAndGetStream ();
2017  err << "checkSizes: The source object of the Import or Export "
2018  "must be a BlockCrsMatrix with the same template parameters as the "
2019  "target object." << endl;
2020  }
2021  else {
2022  // Use a string of ifs, not if-elseifs, because we want to know
2023  // all the errors.
2024  if (src->getBlockSize () != this->getBlockSize ()) {
2025  std::ostream& err = markLocalErrorAndGetStream ();
2026  err << "checkSizes: The source and target objects of the Import or "
2027  << "Export must have the same block sizes. The source's block "
2028  << "size = " << src->getBlockSize () << " != the target's block "
2029  << "size = " << this->getBlockSize () << "." << endl;
2030  }
2031  if (! src->graph_.isFillComplete ()) {
2032  std::ostream& err = markLocalErrorAndGetStream ();
2033  err << "checkSizes: The source object of the Import or Export is "
2034  "not fill complete. Both source and target objects must be fill "
2035  "complete." << endl;
2036  }
2037  if (! this->graph_.isFillComplete ()) {
2038  std::ostream& err = markLocalErrorAndGetStream ();
2039  err << "checkSizes: The target object of the Import or Export is "
2040  "not fill complete. Both source and target objects must be fill "
2041  "complete." << endl;
2042  }
2043  if (src->graph_.getColMap ().is_null ()) {
2044  std::ostream& err = markLocalErrorAndGetStream ();
2045  err << "checkSizes: The source object of the Import or Export does "
2046  "not have a column Map. Both source and target objects must have "
2047  "column Maps." << endl;
2048  }
2049  if (this->graph_.getColMap ().is_null ()) {
2050  std::ostream& err = markLocalErrorAndGetStream ();
2051  err << "checkSizes: The target object of the Import or Export does "
2052  "not have a column Map. Both source and target objects must have "
2053  "column Maps." << endl;
2054  }
2055  }
2056 
2057  return ! (* (this->localError_));
2058  }
2059 
2060  template<class Scalar, class LO, class GO, class Node>
2061  void
2064  (const ::Tpetra::SrcDistObject& source,
2065  const size_t numSameIDs,
2066  const Kokkos::DualView<const local_ordinal_type*,
2067  buffer_device_type>& permuteToLIDs,
2068  const Kokkos::DualView<const local_ordinal_type*,
2069  buffer_device_type>& permuteFromLIDs)
2070  {
2071  using ::Tpetra::Details::Behavior;
2073  using ::Tpetra::Details::ProfilingRegion;
2074  using std::endl;
2075  using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
2076 
2077  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
2078  const bool debug = Behavior::debug();
2079  const bool verbose = Behavior::verbose();
2080 
2081  // Define this function prefix
2082  std::string prefix;
2083  {
2084  std::ostringstream os;
2085  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2086  os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
2087  prefix = os.str();
2088  }
2089 
2090  // check if this already includes a local error
2091  if (* (this->localError_)) {
2092  std::ostream& err = this->markLocalErrorAndGetStream ();
2093  err << prefix
2094  << "The target object of the Import or Export is already in an error state."
2095  << endl;
2096  return;
2097  }
2098 
2099  //
2100  // Verbose input dual view status
2101  //
2102  if (verbose) {
2103  std::ostringstream os;
2104  os << prefix << endl
2105  << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
2106  << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
2107  std::cerr << os.str ();
2108  }
2109 
2113  if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
2114  std::ostream& err = this->markLocalErrorAndGetStream ();
2115  err << prefix
2116  << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
2117  << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
2118  << "." << endl;
2119  return;
2120  }
2121  if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
2122  std::ostream& err = this->markLocalErrorAndGetStream ();
2123  err << prefix
2124  << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
2125  << endl;
2126  return;
2127  }
2128 
2129  const this_type* src = dynamic_cast<const this_type* > (&source);
2130  if (src == NULL) {
2131  std::ostream& err = this->markLocalErrorAndGetStream ();
2132  err << prefix
2133  << "The source (input) object of the Import or "
2134  "Export is either not a BlockCrsMatrix, or does not have the right "
2135  "template parameters. checkSizes() should have caught this. "
2136  "Please report this bug to the Tpetra developers." << endl;
2137  return;
2138  }
2139  else {
2140  // Kyungjoo: where is val_ modified ?
2141  // When we have dual view as a member variable,
2142  // which function should make sure the val_ is upto date ?
2143  // IMO, wherever it is used, the function should check its
2144  // availability.
2145  const_cast<this_type*>(src)->sync_host();
2146  }
2147  this->sync_host();
2148 
2149  bool lclErr = false;
2150 #ifdef HAVE_TPETRA_DEBUG
2151  std::set<LO> invalidSrcCopyRows;
2152  std::set<LO> invalidDstCopyRows;
2153  std::set<LO> invalidDstCopyCols;
2154  std::set<LO> invalidDstPermuteCols;
2155  std::set<LO> invalidPermuteFromRows;
2156 #endif // HAVE_TPETRA_DEBUG
2157 
2158  // Copy the initial sequence of rows that are the same.
2159  //
2160  // The two graphs might have different column Maps, so we need to
2161  // do this using global column indices. This is purely local, so
2162  // we only need to check for local sameness of the two column
2163  // Maps.
2164 
2165 #ifdef HAVE_TPETRA_DEBUG
2166  const map_type& srcRowMap = * (src->graph_.getRowMap ());
2167 #endif // HAVE_TPETRA_DEBUG
2168  const map_type& dstRowMap = * (this->graph_.getRowMap ());
2169  const map_type& srcColMap = * (src->graph_.getColMap ());
2170  const map_type& dstColMap = * (this->graph_.getColMap ());
2171  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
2172 
2173  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
2174  if (verbose) {
2175  std::ostringstream os;
2176  os << prefix
2177  << "canUseLocalColumnIndices: "
2178  << (canUseLocalColumnIndices ? "true" : "false")
2179  << ", numPermute: " << numPermute
2180  << endl;
2181  std::cerr << os.str ();
2182  }
2183 
2184  const auto permuteToLIDsHost = permuteToLIDs.view_host();
2185  const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
2186 
2187  if (canUseLocalColumnIndices) {
2188  // Copy local rows that are the "same" in both source and target.
2189  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2190 #ifdef HAVE_TPETRA_DEBUG
2191  if (! srcRowMap.isNodeLocalElement (localRow)) {
2192  lclErr = true;
2193  invalidSrcCopyRows.insert (localRow);
2194  continue; // skip invalid rows
2195  }
2196 #endif // HAVE_TPETRA_DEBUG
2197 
2198  const LO* lclSrcCols;
2199  Scalar* vals;
2200  LO numEntries;
2201  // If this call fails, that means the mesh row local index is
2202  // invalid. That means the Import or Export is invalid somehow.
2203  LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2204  if (err != 0) {
2205  lclErr = true;
2206 #ifdef HAVE_TPETRA_DEBUG
2207  (void) invalidSrcCopyRows.insert (localRow);
2208 #endif // HAVE_TPETRA_DEBUG
2209  }
2210  else {
2211  err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
2212  if (err != numEntries) {
2213  lclErr = true;
2214  if (! dstRowMap.isNodeLocalElement (localRow)) {
2215 #ifdef HAVE_TPETRA_DEBUG
2216  invalidDstCopyRows.insert (localRow);
2217 #endif // HAVE_TPETRA_DEBUG
2218  }
2219  else {
2220  // Once there's an error, there's no sense in saving
2221  // time, so we check whether the column indices were
2222  // invalid. However, only remember which ones were
2223  // invalid in a debug build, because that might take a
2224  // lot of space.
2225  for (LO k = 0; k < numEntries; ++k) {
2226  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
2227  lclErr = true;
2228 #ifdef HAVE_TPETRA_DEBUG
2229  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
2230 #endif // HAVE_TPETRA_DEBUG
2231  }
2232  }
2233  }
2234  }
2235  }
2236  } // for each "same" local row
2237 
2238  // Copy the "permute" local rows.
2239  for (size_t k = 0; k < numPermute; ++k) {
2240  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
2241  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
2242 
2243  const LO* lclSrcCols;
2244  Scalar* vals;
2245  LO numEntries;
2246  LO err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2247  if (err != 0) {
2248  lclErr = true;
2249 #ifdef HAVE_TPETRA_DEBUG
2250  invalidPermuteFromRows.insert (srcLclRow);
2251 #endif // HAVE_TPETRA_DEBUG
2252  }
2253  else {
2254  err = this->replaceLocalValues (dstLclRow, lclSrcCols, vals, numEntries);
2255  if (err != numEntries) {
2256  lclErr = true;
2257 #ifdef HAVE_TPETRA_DEBUG
2258  for (LO c = 0; c < numEntries; ++c) {
2259  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
2260  invalidDstPermuteCols.insert (lclSrcCols[c]);
2261  }
2262  }
2263 #endif // HAVE_TPETRA_DEBUG
2264  }
2265  }
2266  }
2267  }
2268  else { // must convert column indices to global
2269  // Reserve space to store the destination matrix's local column indices.
2270  const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
2271  Teuchos::Array<LO> lclDstCols (maxNumEnt);
2272 
2273  // Copy local rows that are the "same" in both source and target.
2274  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2275  const LO* lclSrcCols;
2276  Scalar* vals;
2277  LO numEntries;
2278  // If this call fails, that means the mesh row local index is
2279  // invalid. That means the Import or Export is invalid somehow.
2280  LO err = 0;
2281  try {
2282  err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2283  } catch (std::exception& e) {
2284  if (debug) {
2285  std::ostringstream os;
2286  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2287  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2288  << localRow << ", src->getLocalRowView() threw an exception: "
2289  << e.what ();
2290  std::cerr << os.str ();
2291  }
2292  throw e;
2293  }
2294 
2295  if (err != 0) {
2296  lclErr = true;
2297 #ifdef HAVE_TPETRA_DEBUG
2298  invalidSrcCopyRows.insert (localRow);
2299 #endif // HAVE_TPETRA_DEBUG
2300  }
2301  else {
2302  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2303  lclErr = true;
2304  if (debug) {
2305  std::ostringstream os;
2306  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2307  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2308  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
2309  << maxNumEnt << endl;
2310  std::cerr << os.str ();
2311  }
2312  }
2313  else {
2314  // Convert the source matrix's local column indices to the
2315  // destination matrix's local column indices.
2316  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2317  for (LO j = 0; j < numEntries; ++j) {
2318  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2319  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2320  lclErr = true;
2321 #ifdef HAVE_TPETRA_DEBUG
2322  invalidDstCopyCols.insert (lclDstColsView[j]);
2323 #endif // HAVE_TPETRA_DEBUG
2324  }
2325  }
2326  try {
2327  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
2328  } catch (std::exception& e) {
2329  if (debug) {
2330  std::ostringstream os;
2331  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2332  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2333  << localRow << ", this->replaceLocalValues() threw an exception: "
2334  << e.what ();
2335  std::cerr << os.str ();
2336  }
2337  throw e;
2338  }
2339  if (err != numEntries) {
2340  lclErr = true;
2341  if (debug) {
2342  std::ostringstream os;
2343  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2344  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
2345  "localRow " << localRow << ", this->replaceLocalValues "
2346  "returned " << err << " instead of numEntries = "
2347  << numEntries << endl;
2348  std::cerr << os.str ();
2349  }
2350  }
2351  }
2352  }
2353  }
2354 
2355  // Copy the "permute" local rows.
2356  for (size_t k = 0; k < numPermute; ++k) {
2357  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
2358  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
2359 
2360  const LO* lclSrcCols;
2361  Scalar* vals;
2362  LO numEntries;
2363  LO err = 0;
2364  try {
2365  err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2366  } catch (std::exception& e) {
2367  if (debug) {
2368  std::ostringstream os;
2369  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2370  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
2371  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
2372  << ", src->getLocalRowView() threw an exception: " << e.what ();
2373  std::cerr << os.str ();
2374  }
2375  throw e;
2376  }
2377 
2378  if (err != 0) {
2379  lclErr = true;
2380 #ifdef HAVE_TPETRA_DEBUG
2381  invalidPermuteFromRows.insert (srcLclRow);
2382 #endif // HAVE_TPETRA_DEBUG
2383  }
2384  else {
2385  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2386  lclErr = true;
2387  }
2388  else {
2389  // Convert the source matrix's local column indices to the
2390  // destination matrix's local column indices.
2391  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2392  for (LO j = 0; j < numEntries; ++j) {
2393  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2394  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2395  lclErr = true;
2396 #ifdef HAVE_TPETRA_DEBUG
2397  invalidDstPermuteCols.insert (lclDstColsView[j]);
2398 #endif // HAVE_TPETRA_DEBUG
2399  }
2400  }
2401  err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), vals, numEntries);
2402  if (err != numEntries) {
2403  lclErr = true;
2404  }
2405  }
2406  }
2407  }
2408  }
2409 
2410  if (lclErr) {
2411  std::ostream& err = this->markLocalErrorAndGetStream ();
2412 #ifdef HAVE_TPETRA_DEBUG
2413  err << "copyAndPermute: The graph structure of the source of the "
2414  "Import or Export must be a subset of the graph structure of the "
2415  "target. ";
2416  err << "invalidSrcCopyRows = [";
2417  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2418  it != invalidSrcCopyRows.end (); ++it) {
2419  err << *it;
2420  typename std::set<LO>::const_iterator itp1 = it;
2421  itp1++;
2422  if (itp1 != invalidSrcCopyRows.end ()) {
2423  err << ",";
2424  }
2425  }
2426  err << "], invalidDstCopyRows = [";
2427  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2428  it != invalidDstCopyRows.end (); ++it) {
2429  err << *it;
2430  typename std::set<LO>::const_iterator itp1 = it;
2431  itp1++;
2432  if (itp1 != invalidDstCopyRows.end ()) {
2433  err << ",";
2434  }
2435  }
2436  err << "], invalidDstCopyCols = [";
2437  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2438  it != invalidDstCopyCols.end (); ++it) {
2439  err << *it;
2440  typename std::set<LO>::const_iterator itp1 = it;
2441  itp1++;
2442  if (itp1 != invalidDstCopyCols.end ()) {
2443  err << ",";
2444  }
2445  }
2446  err << "], invalidDstPermuteCols = [";
2447  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2448  it != invalidDstPermuteCols.end (); ++it) {
2449  err << *it;
2450  typename std::set<LO>::const_iterator itp1 = it;
2451  itp1++;
2452  if (itp1 != invalidDstPermuteCols.end ()) {
2453  err << ",";
2454  }
2455  }
2456  err << "], invalidPermuteFromRows = [";
2457  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2458  it != invalidPermuteFromRows.end (); ++it) {
2459  err << *it;
2460  typename std::set<LO>::const_iterator itp1 = it;
2461  itp1++;
2462  if (itp1 != invalidPermuteFromRows.end ()) {
2463  err << ",";
2464  }
2465  }
2466  err << "]" << endl;
2467 #else
2468  err << "copyAndPermute: The graph structure of the source of the "
2469  "Import or Export must be a subset of the graph structure of the "
2470  "target." << endl;
2471 #endif // HAVE_TPETRA_DEBUG
2472  }
2473 
2474  if (debug) {
2475  std::ostringstream os;
2476  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2477  const bool lclSuccess = ! (* (this->localError_));
2478  os << "*** Proc " << myRank << ": copyAndPermute "
2479  << (lclSuccess ? "succeeded" : "FAILED");
2480  if (lclSuccess) {
2481  os << endl;
2482  } else {
2483  os << ": error messages: " << this->errorMessages (); // comes w/ endl
2484  }
2485  std::cerr << os.str ();
2486  }
2487  }
2488 
2489  namespace { // (anonymous)
2490 
2509  template<class LO, class GO>
2510  size_t
2511  packRowCount (const size_t numEnt,
2512  const size_t numBytesPerValue,
2513  const size_t blkSize)
2514  {
2515  using ::Tpetra::Details::PackTraits;
2516 
2517  if (numEnt == 0) {
2518  // Empty rows always take zero bytes, to ensure sparsity.
2519  return 0;
2520  }
2521  else {
2522  // We store the number of entries as a local index (LO).
2523  LO numEntLO = 0; // packValueCount wants this.
2524  GO gid {};
2525  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2526  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2527  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2528  return numEntLen + gidsLen + valsLen;
2529  }
2530  }
2531 
2542  template<class ST, class LO, class GO>
2543  size_t
2544  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
2545  const size_t offset,
2546  const size_t numBytes,
2547  const size_t /* numBytesPerValue */)
2548  {
2549  using Kokkos::subview;
2550  using ::Tpetra::Details::PackTraits;
2551 
2552  if (numBytes == 0) {
2553  // Empty rows always take zero bytes, to ensure sparsity.
2554  return static_cast<size_t> (0);
2555  }
2556  else {
2557  LO numEntLO = 0;
2558  const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
2559  TEUCHOS_TEST_FOR_EXCEPTION
2560  (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2561  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
2562  << ".");
2563  const char* const inBuf = imports.data () + offset;
2564  const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
2565  TEUCHOS_TEST_FOR_EXCEPTION
2566  (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2567  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
2568  << ".");
2569  return static_cast<size_t> (numEntLO);
2570  }
2571  }
2572 
2576  template<class ST, class LO, class GO>
2577  size_t
2578  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
2579  const size_t offset,
2580  const size_t numEnt,
2581  const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
2582  const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
2583  const size_t numBytesPerValue,
2584  const size_t blockSize)
2585  {
2586  using Kokkos::subview;
2587  using ::Tpetra::Details::PackTraits;
2588 
2589  if (numEnt == 0) {
2590  // Empty rows always take zero bytes, to ensure sparsity.
2591  return 0;
2592  }
2593  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2594 
2595  const GO gid = 0; // packValueCount wants this
2596  const LO numEntLO = static_cast<size_t> (numEnt);
2597 
2598  const size_t numEntBeg = offset;
2599  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
2600  const size_t gidsBeg = numEntBeg + numEntLen;
2601  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2602  const size_t valsBeg = gidsBeg + gidsLen;
2603  const size_t valsLen = numScalarEnt * numBytesPerValue;
2604 
2605  char* const numEntOut = exports.data () + numEntBeg;
2606  char* const gidsOut = exports.data () + gidsBeg;
2607  char* const valsOut = exports.data () + valsBeg;
2608 
2609  size_t numBytesOut = 0;
2610  int errorCode = 0;
2611  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
2612 
2613  {
2614  Kokkos::pair<int, size_t> p;
2615  p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
2616  errorCode += p.first;
2617  numBytesOut += p.second;
2618 
2619  p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
2620  errorCode += p.first;
2621  numBytesOut += p.second;
2622  }
2623 
2624  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2625  TEUCHOS_TEST_FOR_EXCEPTION
2626  (numBytesOut != expectedNumBytes, std::logic_error,
2627  "packRowForBlockCrs: numBytesOut = " << numBytesOut
2628  << " != expectedNumBytes = " << expectedNumBytes << ".");
2629 
2630  TEUCHOS_TEST_FOR_EXCEPTION
2631  (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
2632  "PackTraits::packArray returned a nonzero error code");
2633 
2634  return numBytesOut;
2635  }
2636 
2637  // Return the number of bytes actually read / used.
2638  template<class ST, class LO, class GO>
2639  size_t
2640  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
2641  const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
2642  const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
2643  const size_t offset,
2644  const size_t numBytes,
2645  const size_t numEnt,
2646  const size_t numBytesPerValue,
2647  const size_t blockSize)
2648  {
2649  using ::Tpetra::Details::PackTraits;
2650 
2651  if (numBytes == 0) {
2652  // Rows with zero bytes always have zero entries.
2653  return 0;
2654  }
2655  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2656  TEUCHOS_TEST_FOR_EXCEPTION
2657  (static_cast<size_t> (imports.extent (0)) <= offset,
2658  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2659  << imports.extent (0) << " <= offset = " << offset << ".");
2660  TEUCHOS_TEST_FOR_EXCEPTION
2661  (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
2662  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
2663  << imports.extent (0) << " < offset + numBytes = "
2664  << (offset + numBytes) << ".");
2665 
2666  const GO gid = 0; // packValueCount wants this
2667  const LO lid = 0; // packValueCount wants this
2668 
2669  const size_t numEntBeg = offset;
2670  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
2671  const size_t gidsBeg = numEntBeg + numEntLen;
2672  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
2673  const size_t valsBeg = gidsBeg + gidsLen;
2674  const size_t valsLen = numScalarEnt * numBytesPerValue;
2675 
2676  const char* const numEntIn = imports.data () + numEntBeg;
2677  const char* const gidsIn = imports.data () + gidsBeg;
2678  const char* const valsIn = imports.data () + valsBeg;
2679 
2680  size_t numBytesOut = 0;
2681  int errorCode = 0;
2682  LO numEntOut;
2683  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
2684  TEUCHOS_TEST_FOR_EXCEPTION
2685  (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
2686  "unpackRowForBlockCrs: Expected number of entries " << numEnt
2687  << " != actual number of entries " << numEntOut << ".");
2688 
2689  {
2690  Kokkos::pair<int, size_t> p;
2691  p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
2692  errorCode += p.first;
2693  numBytesOut += p.second;
2694 
2695  p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
2696  errorCode += p.first;
2697  numBytesOut += p.second;
2698  }
2699 
2700  TEUCHOS_TEST_FOR_EXCEPTION
2701  (numBytesOut != numBytes, std::logic_error,
2702  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2703  << " != numBytes = " << numBytes << ".");
2704 
2705  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2706  TEUCHOS_TEST_FOR_EXCEPTION
2707  (numBytesOut != expectedNumBytes, std::logic_error,
2708  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
2709  << " != expectedNumBytes = " << expectedNumBytes << ".");
2710 
2711  TEUCHOS_TEST_FOR_EXCEPTION
2712  (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
2713  "PackTraits::unpackArray returned a nonzero error code");
2714 
2715  return numBytesOut;
2716  }
2717  } // namespace (anonymous)
2718 
2719  template<class Scalar, class LO, class GO, class Node>
2720  void
2723  (const ::Tpetra::SrcDistObject& source,
2724  const Kokkos::DualView<const local_ordinal_type*,
2725  buffer_device_type>& exportLIDs,
2726  Kokkos::DualView<packet_type*,
2727  buffer_device_type>& exports, // output
2728  Kokkos::DualView<size_t*,
2729  buffer_device_type> numPacketsPerLID, // output
2730  size_t& constantNumPackets,
2731  Distributor& /* distor */)
2732  {
2733  using ::Tpetra::Details::Behavior;
2735  using ::Tpetra::Details::ProfilingRegion;
2736  using ::Tpetra::Details::PackTraits;
2737 
2738  typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
2739 
2740  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2741 
2742  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
2743 
2744  const bool debug = Behavior::debug();
2745  const bool verbose = Behavior::verbose();
2746 
2747  // Define this function prefix
2748  std::string prefix;
2749  {
2750  std::ostringstream os;
2751  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2752  os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
2753  prefix = os.str();
2754  }
2755 
2756  // check if this already includes a local error
2757  if (* (this->localError_)) {
2758  std::ostream& err = this->markLocalErrorAndGetStream ();
2759  err << prefix
2760  << "The target object of the Import or Export is already in an error state."
2761  << std::endl;
2762  return;
2763  }
2764 
2765  //
2766  // Verbose input dual view status
2767  //
2768  if (verbose) {
2769  std::ostringstream os;
2770  os << prefix << std::endl
2771  << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
2772  << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
2773  << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
2774  std::cerr << os.str ();
2775  }
2776 
2780  if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2781  std::ostream& err = this->markLocalErrorAndGetStream ();
2782  err << prefix
2783  << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
2784  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2785  << "." << std::endl;
2786  return;
2787  }
2788  if (exportLIDs.need_sync_host ()) {
2789  std::ostream& err = this->markLocalErrorAndGetStream ();
2790  err << prefix << "exportLIDs be sync'd to host." << std::endl;
2791  return;
2792  }
2793 
2794  const this_type* src = dynamic_cast<const this_type* > (&source);
2795  if (src == NULL) {
2796  std::ostream& err = this->markLocalErrorAndGetStream ();
2797  err << prefix
2798  << "The source (input) object of the Import or "
2799  "Export is either not a BlockCrsMatrix, or does not have the right "
2800  "template parameters. checkSizes() should have caught this. "
2801  "Please report this bug to the Tpetra developers." << std::endl;
2802  return;
2803  }
2804 
2805  // Graphs and matrices are allowed to have a variable number of
2806  // entries per row. We could test whether all rows have the same
2807  // number of entries, but DistObject can only use this
2808  // optimization if all rows on _all_ processes have the same
2809  // number of entries. Rather than do the all-reduce necessary to
2810  // test for this unlikely case, we tell DistObject (by setting
2811  // constantNumPackets to zero) to assume that different rows may
2812  // have different numbers of entries.
2813  constantNumPackets = 0;
2814 
2815  // const values
2816  const crs_graph_type& srcGraph = src->graph_;
2817  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2818  const size_t numExportLIDs = exportLIDs.extent (0);
2819  const size_t numBytesPerValue =
2820  PackTraits<impl_scalar_type>
2821  ::packValueCount(this->val_.extent(0) ? this->val_.view_host()(0) : impl_scalar_type());
2822 
2823  // Compute the number of bytes ("packets") per row to pack. While
2824  // we're at it, compute the total # of block entries to send, and
2825  // the max # of block entries in any of the rows we're sending.
2826 
2827  Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2828 
2829  // Graph information is on host; let's do this on host parallel reduce
2830  auto exportLIDsHost = exportLIDs.view_host();
2831  auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2832  numPacketsPerLID.modify_host ();
2833  {
2834  using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
2835  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs);
2836  Kokkos::parallel_reduce
2837  (policy,
2838  [=](const size_t &i, typename reducer_type::value_type &update) {
2839  const LO lclRow = exportLIDsHost(i);
2840  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2841  numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2842 
2843  const size_t numBytes =
2844  packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2845  numPacketsPerLIDHost(i) = numBytes;
2846  update += typename reducer_type::value_type(numEnt, numBytes, numEnt);
2847  }, rowReducerStruct);
2848  }
2849 
2850  // Compute the number of bytes ("packets") per row to pack. While
2851  // we're at it, compute the total # of block entries to send, and
2852  // the max # of block entries in any of the rows we're sending.
2853  const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2854  const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2855  const size_t maxRowLength = rowReducerStruct.maxRowLength;
2856 
2857  if (verbose) {
2858  std::ostringstream os;
2859  os << prefix
2860  << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2861  << std::endl;
2862  std::cerr << os.str ();
2863  }
2864 
2865  // We use a "struct of arrays" approach to packing each row's
2866  // entries. All the column indices (as global indices) go first,
2867  // then all their owning process ranks, and then the values.
2868  if(exports.extent(0) != totalNumBytes)
2869  {
2870  const std::string oldLabel = exports.d_view.label ();
2871  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2872  exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2873  }
2874  if (totalNumEntries > 0) {
2875  // Current position (in bytes) in the 'exports' output array.
2876  Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2877  {
2878  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2879  Kokkos::parallel_scan
2880  (policy,
2881  [=](const size_t &i, size_t &update, const bool &final) {
2882  if (final) offset(i) = update;
2883  update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2884  });
2885  }
2886  if (offset(numExportLIDs) != totalNumBytes) {
2887  std::ostream& err = this->markLocalErrorAndGetStream ();
2888  err << prefix
2889  << "At end of method, the final offset (in bytes) "
2890  << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2891  << totalNumBytes << ". "
2892  << "Please report this bug to the Tpetra developers." << std::endl;
2893  return;
2894  }
2895 
2896  // For each block row of the matrix owned by the calling
2897  // process, pack that block row's column indices and values into
2898  // the exports array.
2899 
2900  // Source matrix's column Map. We verified in checkSizes() that
2901  // the column Map exists (is not null).
2902  const map_type& srcColMap = * (srcGraph.getColMap ());
2903 
2904  // Pack the data for each row to send, into the 'exports' buffer.
2905  // exports will be modified on host.
2906  exports.modify_host();
2907  {
2908  typedef Kokkos::TeamPolicy<host_exec> policy_type;
2909  const auto policy =
2910  policy_type(numExportLIDs, 1, 1)
2911  .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2912  Kokkos::parallel_for
2913  (policy,
2914  [=](const typename policy_type::member_type &member) {
2915  const size_t i = member.league_rank();
2916  Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2917  gblColInds(member.team_scratch(0), maxRowLength);
2918 
2919  const LO lclRowInd = exportLIDsHost(i);
2920  const LO* lclColIndsRaw;
2921  Scalar* valsRaw;
2922  LO numEntLO;
2923  // It's OK to ignore the return value, since if the calling
2924  // process doesn't own that local row, then the number of
2925  // entries in that row on the calling process is zero.
2926  (void) src->getLocalRowView (lclRowInd, lclColIndsRaw, valsRaw, numEntLO);
2927 
2928  const size_t numEnt = static_cast<size_t> (numEntLO);
2929  Kokkos::View<const LO*,host_exec> lclColInds (lclColIndsRaw, numEnt);
2930 
2931  // Convert column indices from local to global.
2932  for (size_t j = 0; j < numEnt; ++j)
2933  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2934 
2935  // Kyungjoo: additional wrapping scratch view is necessary
2936  // the following function interface need the same execution space
2937  // host scratch space somehow is not considered same as the host_exec
2938  // Copy the row's data into the current spot in the exports array.
2939  const size_t numBytes =
2940  packRowForBlockCrs<impl_scalar_type, LO, GO>
2941  (exports.view_host(),
2942  offset(i),
2943  numEnt,
2944  Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2945  Kokkos::View<const impl_scalar_type*, host_exec>(reinterpret_cast<const impl_scalar_type*>(valsRaw), numEnt*blockSize*blockSize),
2946  numBytesPerValue,
2947  blockSize);
2948 
2949  // numBytes should be same as the difference between offsets
2950  if (debug) {
2951  const size_t offsetDiff = offset(i+1) - offset(i);
2952  if (numBytes != offsetDiff) {
2953  std::ostringstream os;
2954  os << prefix
2955  << "numBytes computed from packRowForBlockCrs is different from "
2956  << "precomputed offset values, LID = " << i << std::endl;
2957  std::cerr << os.str ();
2958  }
2959  }
2960  }); // for each LID (of a row) to send
2961  }
2962  } // if totalNumEntries > 0
2963 
2964  if (debug) {
2965  std::ostringstream os;
2966  const bool lclSuccess = ! (* (this->localError_));
2967  os << prefix
2968  << (lclSuccess ? "succeeded" : "FAILED")
2969  << std::endl;
2970  std::cerr << os.str ();
2971  }
2972  }
2973 
2974  template<class Scalar, class LO, class GO, class Node>
2975  void
2978  (const Kokkos::DualView<const local_ordinal_type*,
2979  buffer_device_type>& importLIDs,
2980  Kokkos::DualView<packet_type*,
2981  buffer_device_type> imports,
2982  Kokkos::DualView<size_t*,
2983  buffer_device_type> numPacketsPerLID,
2984  const size_t /* constantNumPackets */,
2985  Distributor& /* distor */,
2986  const CombineMode combineMode)
2987  {
2988  using ::Tpetra::Details::Behavior;
2990  using ::Tpetra::Details::ProfilingRegion;
2991  using ::Tpetra::Details::PackTraits;
2992  using std::endl;
2993  using host_exec =
2994  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2995 
2996  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2997  const bool verbose = Behavior::verbose ();
2998 
2999  // Define this function prefix
3000  std::string prefix;
3001  {
3002  std::ostringstream os;
3003  auto map = this->graph_.getRowMap ();
3004  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
3005  const int myRank = comm.is_null () ? -1 : comm->getRank ();
3006  os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
3007  prefix = os.str ();
3008  if (verbose) {
3009  os << "Start" << endl;
3010  std::cerr << os.str ();
3011  }
3012  }
3013 
3014  // check if this already includes a local error
3015  if (* (this->localError_)) {
3016  std::ostream& err = this->markLocalErrorAndGetStream ();
3017  std::ostringstream os;
3018  os << prefix << "{Im/Ex}port target is already in error." << endl;
3019  if (verbose) {
3020  std::cerr << os.str ();
3021  }
3022  err << os.str ();
3023  return;
3024  }
3025 
3029  if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
3030  std::ostream& err = this->markLocalErrorAndGetStream ();
3031  std::ostringstream os;
3032  os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
3033  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
3034  << "." << endl;
3035  if (verbose) {
3036  std::cerr << os.str ();
3037  }
3038  err << os.str ();
3039  return;
3040  }
3041 
3042  if (combineMode != ADD && combineMode != INSERT &&
3043  combineMode != REPLACE && combineMode != ABSMAX &&
3044  combineMode != ZERO) {
3045  std::ostream& err = this->markLocalErrorAndGetStream ();
3046  std::ostringstream os;
3047  os << prefix
3048  << "Invalid CombineMode value " << combineMode << ". Valid "
3049  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
3050  << std::endl;
3051  if (verbose) {
3052  std::cerr << os.str ();
3053  }
3054  err << os.str ();
3055  return;
3056  }
3057 
3058  if (this->graph_.getColMap ().is_null ()) {
3059  std::ostream& err = this->markLocalErrorAndGetStream ();
3060  std::ostringstream os;
3061  os << prefix << "Target matrix's column Map is null." << endl;
3062  if (verbose) {
3063  std::cerr << os.str ();
3064  }
3065  err << os.str ();
3066  return;
3067  }
3068 
3069  // Target matrix's column Map. Use to convert the global column
3070  // indices in the receive buffer to local indices. We verified in
3071  // checkSizes() that the column Map exists (is not null).
3072  const map_type& tgtColMap = * (this->graph_.getColMap ());
3073 
3074  // Const values
3075  const size_t blockSize = this->getBlockSize ();
3076  const size_t numImportLIDs = importLIDs.extent(0);
3077  // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
3078  // default-constructed instance could have a different size than
3079  // other instances. (We assume all nominally constructed
3080  // instances have the same size; that's not the issue here.) This
3081  // could be bad if the calling process has no entries, but other
3082  // processes have entries that they want to send to us.
3083  const size_t numBytesPerValue =
3084  PackTraits<impl_scalar_type>::packValueCount
3085  (this->val_.extent (0) ? this->val_.view_host () (0) : impl_scalar_type ());
3086  const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
3087  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
3088 
3089  if (verbose) {
3090  std::ostringstream os;
3091  os << prefix << "combineMode: "
3092  << ::Tpetra::combineModeToString (combineMode)
3093  << ", blockSize: " << blockSize
3094  << ", numImportLIDs: " << numImportLIDs
3095  << ", numBytesPerValue: " << numBytesPerValue
3096  << ", maxRowNumEnt: " << maxRowNumEnt
3097  << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
3098  std::cerr << os.str ();
3099  }
3100 
3101  if (combineMode == ZERO || numImportLIDs == 0) {
3102  if (verbose) {
3103  std::ostringstream os;
3104  os << prefix << "Nothing to unpack. Done!" << std::endl;
3105  std::cerr << os.str ();
3106  }
3107  return;
3108  }
3109 
3110  // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
3111  // we can remove this sync.
3112  if (imports.need_sync_host ()) {
3113  imports.sync_host ();
3114  }
3115 
3116  // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
3117  // sync'd numPacketsPerLID to host, since it needs to do that in
3118  // order to post MPI_Irecv messages with the correct lengths. A
3119  // hypothetical device-specific boundary exchange implementation
3120  // could possibly receive data without sync'ing lengths to host,
3121  // but we don't need to design for that nonexistent thing yet.
3122 
3123  if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
3124  importLIDs.need_sync_host ()) {
3125  std::ostream& err = this->markLocalErrorAndGetStream ();
3126  std::ostringstream os;
3127  os << prefix << "All input DualViews must be sync'd to host by now. "
3128  << "imports_nc.need_sync_host()="
3129  << (imports.need_sync_host () ? "true" : "false")
3130  << ", numPacketsPerLID.need_sync_host()="
3131  << (numPacketsPerLID.need_sync_host () ? "true" : "false")
3132  << ", importLIDs.need_sync_host()="
3133  << (importLIDs.need_sync_host () ? "true" : "false")
3134  << "." << endl;
3135  if (verbose) {
3136  std::cerr << os.str ();
3137  }
3138  err << os.str ();
3139  return;
3140  }
3141 
3142  const auto importLIDsHost = importLIDs.view_host ();
3143  const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
3144 
3145  // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
3146  // loop, by only unpacking on final==true (when we know the
3147  // current offset's value).
3148 
3149  Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
3150  {
3151  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
3152  Kokkos::parallel_scan
3153  ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
3154  [=] (const size_t &i, size_t &update, const bool &final) {
3155  if (final) offset(i) = update;
3156  update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
3157  });
3158  }
3159 
3160  // this variable does not matter with a race condition (just error flag)
3161  //
3162  // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
3163  // or atomics on bool, so we use int instead. (I know we're not
3164  // launching a CUDA loop here, but we could in the future, and I'd
3165  // like to avoid potential trouble.)
3166  Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
3167  errorDuringUnpack ("errorDuringUnpack");
3168  errorDuringUnpack () = 0;
3169  {
3170  using policy_type = Kokkos::TeamPolicy<host_exec>;
3171  const auto policy = policy_type (numImportLIDs, 1, 1)
3172  .set_scratch_size (0, Kokkos::PerTeam (sizeof (GO) * maxRowNumEnt +
3173  sizeof (LO) * maxRowNumEnt +
3174  numBytesPerValue * maxRowNumScalarEnt));
3175  using host_scratch_space = typename host_exec::scratch_memory_space;
3176  using pair_type = Kokkos::pair<size_t, size_t>;
3177  Kokkos::parallel_for
3178  ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
3179  [=] (const typename policy_type::member_type& member) {
3180  const size_t i = member.league_rank();
3181 
3182  Kokkos::View<GO*, host_scratch_space> gblColInds
3183  (member.team_scratch (0), maxRowNumEnt);
3184  Kokkos::View<LO*, host_scratch_space> lclColInds
3185  (member.team_scratch (0), maxRowNumEnt);
3186  Kokkos::View<impl_scalar_type*, host_scratch_space> vals
3187  (member.team_scratch (0), maxRowNumScalarEnt);
3188 
3189  const size_t offval = offset(i);
3190  const LO lclRow = importLIDsHost(i);
3191  const size_t numBytes = numPacketsPerLIDHost(i);
3192  const size_t numEnt =
3193  unpackRowCount<impl_scalar_type, LO, GO>
3194  (imports.view_host (), offval, numBytes, numBytesPerValue);
3195 
3196  if (numBytes > 0) {
3197  if (numEnt > maxRowNumEnt) {
3198  errorDuringUnpack() = 1;
3199  if (verbose) {
3200  std::ostringstream os;
3201  os << prefix
3202  << "At i = " << i << ", numEnt = " << numEnt
3203  << " > maxRowNumEnt = " << maxRowNumEnt
3204  << std::endl;
3205  std::cerr << os.str ();
3206  }
3207  return;
3208  }
3209  }
3210  const size_t numScalarEnt = numEnt*blockSize*blockSize;
3211  auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
3212  auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
3213  auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
3214 
3215  // Kyungjoo: additional wrapping scratch view is necessary
3216  // the following function interface need the same execution space
3217  // host scratch space somehow is not considered same as the host_exec
3218  size_t numBytesOut = 0;
3219  try {
3220  numBytesOut =
3221  unpackRowForBlockCrs<impl_scalar_type, LO, GO>
3222  (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
3223  Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
3224  imports.view_host(),
3225  offval, numBytes, numEnt,
3226  numBytesPerValue, blockSize);
3227  }
3228  catch (std::exception& e) {
3229  errorDuringUnpack() = 1;
3230  if (verbose) {
3231  std::ostringstream os;
3232  os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
3233  << e.what () << endl;
3234  std::cerr << os.str ();
3235  }
3236  return;
3237  }
3238 
3239  if (numBytes != numBytesOut) {
3240  errorDuringUnpack() = 1;
3241  if (verbose) {
3242  std::ostringstream os;
3243  os << prefix
3244  << "At i = " << i << ", numBytes = " << numBytes
3245  << " != numBytesOut = " << numBytesOut << "."
3246  << std::endl;
3247  std::cerr << os.str();
3248  }
3249  return;
3250  }
3251 
3252  // Convert incoming global indices to local indices.
3253  for (size_t k = 0; k < numEnt; ++k) {
3254  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
3255  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
3256  if (verbose) {
3257  std::ostringstream os;
3258  os << prefix
3259  << "At i = " << i << ", GID " << gidsOut(k)
3260  << " is not owned by the calling process."
3261  << std::endl;
3262  std::cerr << os.str();
3263  }
3264  return;
3265  }
3266  }
3267 
3268  // Combine the incoming data with the matrix's current data.
3269  LO numCombd = 0;
3270  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
3271  const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
3272  (const_cast<const impl_scalar_type*> (valsOut.data ()));
3273  if (combineMode == ADD) {
3274  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3275  } else if (combineMode == INSERT || combineMode == REPLACE) {
3276  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3277  } else if (combineMode == ABSMAX) {
3278  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3279  }
3280 
3281  if (static_cast<LO> (numEnt) != numCombd) {
3282  errorDuringUnpack() = 1;
3283  if (verbose) {
3284  std::ostringstream os;
3285  os << prefix << "At i = " << i << ", numEnt = " << numEnt
3286  << " != numCombd = " << numCombd << "."
3287  << endl;
3288  std::cerr << os.str ();
3289  }
3290  return;
3291  }
3292  }); // for each import LID i
3293  }
3294 
3295  if (errorDuringUnpack () != 0) {
3296  std::ostream& err = this->markLocalErrorAndGetStream ();
3297  err << prefix << "Unpacking failed.";
3298  if (! verbose) {
3299  err << " Please run again with the environment variable setting "
3300  "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
3301  }
3302  err << endl;
3303  }
3304 
3305  if (verbose) {
3306  std::ostringstream os;
3307  const bool lclSuccess = ! (* (this->localError_));
3308  os << prefix
3309  << (lclSuccess ? "succeeded" : "FAILED")
3310  << std::endl;
3311  std::cerr << os.str ();
3312  }
3313  }
3314 
3315  template<class Scalar, class LO, class GO, class Node>
3316  std::string
3318  {
3319  using Teuchos::TypeNameTraits;
3320  std::ostringstream os;
3321  os << "\"Tpetra::BlockCrsMatrix\": { "
3322  << "Template parameters: { "
3323  << "Scalar: " << TypeNameTraits<Scalar>::name ()
3324  << "LO: " << TypeNameTraits<LO>::name ()
3325  << "GO: " << TypeNameTraits<GO>::name ()
3326  << "Node: " << TypeNameTraits<Node>::name ()
3327  << " }"
3328  << ", Label: \"" << this->getObjectLabel () << "\""
3329  << ", Global dimensions: ["
3330  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3331  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
3332  << ", Block size: " << getBlockSize ()
3333  << " }";
3334  return os.str ();
3335  }
3336 
3337 
3338  template<class Scalar, class LO, class GO, class Node>
3339  void
3341  describe (Teuchos::FancyOStream& out,
3342  const Teuchos::EVerbosityLevel verbLevel) const
3343  {
3344  using Teuchos::ArrayRCP;
3345  using Teuchos::CommRequest;
3346  using Teuchos::FancyOStream;
3347  using Teuchos::getFancyOStream;
3348  using Teuchos::ireceive;
3349  using Teuchos::isend;
3350  using Teuchos::outArg;
3351  using Teuchos::TypeNameTraits;
3352  using Teuchos::VERB_DEFAULT;
3353  using Teuchos::VERB_NONE;
3354  using Teuchos::VERB_LOW;
3355  using Teuchos::VERB_MEDIUM;
3356  // using Teuchos::VERB_HIGH;
3357  using Teuchos::VERB_EXTREME;
3358  using Teuchos::RCP;
3359  using Teuchos::wait;
3360  using std::endl;
3361 #ifdef HAVE_TPETRA_DEBUG
3362  const char prefix[] = "Tpetra::BlockCrsMatrix::describe: ";
3363 #endif // HAVE_TPETRA_DEBUG
3364 
3365  // Set default verbosity if applicable.
3366  const Teuchos::EVerbosityLevel vl =
3367  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3368 
3369  if (vl == VERB_NONE) {
3370  return; // print nothing
3371  }
3372 
3373  // describe() always starts with a tab before it prints anything.
3374  Teuchos::OSTab tab0 (out);
3375 
3376  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
3377  Teuchos::OSTab tab1 (out);
3378 
3379  out << "Template parameters:" << endl;
3380  {
3381  Teuchos::OSTab tab2 (out);
3382  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
3383  << "LO: " << TypeNameTraits<LO>::name () << endl
3384  << "GO: " << TypeNameTraits<GO>::name () << endl
3385  << "Node: " << TypeNameTraits<Node>::name () << endl;
3386  }
3387  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
3388  << "Global dimensions: ["
3389  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3390  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
3391 
3392  const LO blockSize = getBlockSize ();
3393  out << "Block size: " << blockSize << endl;
3394 
3395  // constituent objects
3396  if (vl >= VERB_MEDIUM) {
3397  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3398  const int myRank = comm.getRank ();
3399  if (myRank == 0) {
3400  out << "Row Map:" << endl;
3401  }
3402  getRowMap()->describe (out, vl);
3403 
3404  if (! getColMap ().is_null ()) {
3405  if (getColMap() == getRowMap()) {
3406  if (myRank == 0) {
3407  out << "Column Map: same as row Map" << endl;
3408  }
3409  }
3410  else {
3411  if (myRank == 0) {
3412  out << "Column Map:" << endl;
3413  }
3414  getColMap ()->describe (out, vl);
3415  }
3416  }
3417  if (! getDomainMap ().is_null ()) {
3418  if (getDomainMap () == getRowMap ()) {
3419  if (myRank == 0) {
3420  out << "Domain Map: same as row Map" << endl;
3421  }
3422  }
3423  else if (getDomainMap () == getColMap ()) {
3424  if (myRank == 0) {
3425  out << "Domain Map: same as column Map" << endl;
3426  }
3427  }
3428  else {
3429  if (myRank == 0) {
3430  out << "Domain Map:" << endl;
3431  }
3432  getDomainMap ()->describe (out, vl);
3433  }
3434  }
3435  if (! getRangeMap ().is_null ()) {
3436  if (getRangeMap () == getDomainMap ()) {
3437  if (myRank == 0) {
3438  out << "Range Map: same as domain Map" << endl;
3439  }
3440  }
3441  else if (getRangeMap () == getRowMap ()) {
3442  if (myRank == 0) {
3443  out << "Range Map: same as row Map" << endl;
3444  }
3445  }
3446  else {
3447  if (myRank == 0) {
3448  out << "Range Map: " << endl;
3449  }
3450  getRangeMap ()->describe (out, vl);
3451  }
3452  }
3453  }
3454 
3455  if (vl >= VERB_EXTREME) {
3456  // FIXME (mfh 26 May 2016) It's not nice for this method to sync
3457  // to host, since it's supposed to be const. However, that's
3458  // the easiest and least memory-intensive way to implement this
3459  // method.
3460  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
3461  const_cast<this_type&> (*this).sync_host ();
3462 
3463 #ifdef HAVE_TPETRA_DEBUG
3464  TEUCHOS_TEST_FOR_EXCEPTION
3465  (this->need_sync_host (), std::logic_error,
3466  prefix << "Right after sync to host, the matrix claims that it needs "
3467  "sync to host. Please report this bug to the Tpetra developers.");
3468 #endif // HAVE_TPETRA_DEBUG
3469 
3470  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3471  const int myRank = comm.getRank ();
3472  const int numProcs = comm.getSize ();
3473 
3474  // Print the calling process' data to the given output stream.
3475  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
3476  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3477  FancyOStream& os = *osPtr;
3478  os << "Process " << myRank << ":" << endl;
3479  Teuchos::OSTab tab2 (os);
3480 
3481  const map_type& meshRowMap = * (graph_.getRowMap ());
3482  const map_type& meshColMap = * (graph_.getColMap ());
3483  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
3484  meshLclRow <= meshRowMap.getMaxLocalIndex ();
3485  ++meshLclRow) {
3486  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
3487  os << "Row " << meshGblRow << ": {";
3488 
3489  const LO* lclColInds = NULL;
3490  Scalar* vals = NULL;
3491  LO numInds = 0;
3492  this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
3493 
3494  for (LO k = 0; k < numInds; ++k) {
3495  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
3496 
3497  os << "Col " << gblCol << ": [";
3498  for (LO i = 0; i < blockSize; ++i) {
3499  for (LO j = 0; j < blockSize; ++j) {
3500  os << vals[blockSize*blockSize*k + i*blockSize + j];
3501  if (j + 1 < blockSize) {
3502  os << ", ";
3503  }
3504  }
3505  if (i + 1 < blockSize) {
3506  os << "; ";
3507  }
3508  }
3509  os << "]";
3510  if (k + 1 < numInds) {
3511  os << ", ";
3512  }
3513  }
3514  os << "}" << endl;
3515  }
3516 
3517  // Print data on Process 0. This will automatically respect the
3518  // current indentation level.
3519  if (myRank == 0) {
3520  out << lclOutStrPtr->str ();
3521  lclOutStrPtr = Teuchos::null; // clear it to save space
3522  }
3523 
3524  const int sizeTag = 1337;
3525  const int dataTag = 1338;
3526 
3527  ArrayRCP<char> recvDataBuf; // only used on Process 0
3528 
3529  // Send string sizes and data from each process in turn to
3530  // Process 0, and print on that process.
3531  for (int p = 1; p < numProcs; ++p) {
3532  if (myRank == 0) {
3533  // Receive the incoming string's length.
3534  ArrayRCP<size_t> recvSize (1);
3535  recvSize[0] = 0;
3536  RCP<CommRequest<int> > recvSizeReq =
3537  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3538  wait<int> (comm, outArg (recvSizeReq));
3539  const size_t numCharsToRecv = recvSize[0];
3540 
3541  // Allocate space for the string to receive. Reuse receive
3542  // buffer space if possible. We can do this because in the
3543  // current implementation, we only have one receive in
3544  // flight at a time. Leave space for the '\0' at the end,
3545  // in case the sender doesn't send it.
3546  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3547  recvDataBuf.resize (numCharsToRecv + 1);
3548  }
3549  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3550  // Post the receive of the actual string data.
3551  RCP<CommRequest<int> > recvDataReq =
3552  ireceive<int, char> (recvData, p, dataTag, comm);
3553  wait<int> (comm, outArg (recvDataReq));
3554 
3555  // Print the received data. This will respect the current
3556  // indentation level. Make sure that the string is
3557  // null-terminated.
3558  recvDataBuf[numCharsToRecv] = '\0';
3559  out << recvDataBuf.getRawPtr ();
3560  }
3561  else if (myRank == p) { // if I am not Process 0, and my rank is p
3562  // This deep-copies the string at most twice, depending on
3563  // whether std::string reference counts internally (it
3564  // generally does, so this won't deep-copy at all).
3565  const std::string stringToSend = lclOutStrPtr->str ();
3566  lclOutStrPtr = Teuchos::null; // clear original to save space
3567 
3568  // Send the string's length to Process 0.
3569  const size_t numCharsToSend = stringToSend.size ();
3570  ArrayRCP<size_t> sendSize (1);
3571  sendSize[0] = numCharsToSend;
3572  RCP<CommRequest<int> > sendSizeReq =
3573  isend<int, size_t> (sendSize, 0, sizeTag, comm);
3574  wait<int> (comm, outArg (sendSizeReq));
3575 
3576  // Send the actual string to Process 0. We know that the
3577  // string has length > 0, so it's save to take the address
3578  // of the first entry. Make a nonowning ArrayRCP to hold
3579  // the string. Process 0 will add a null termination
3580  // character at the end of the string, after it receives the
3581  // message.
3582  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
3583  RCP<CommRequest<int> > sendDataReq =
3584  isend<int, char> (sendData, 0, dataTag, comm);
3585  wait<int> (comm, outArg (sendDataReq));
3586  }
3587  } // for each process rank p other than 0
3588  } // extreme verbosity level (print the whole matrix)
3589  }
3590 
3591  template<class Scalar, class LO, class GO, class Node>
3592  Teuchos::RCP<const Teuchos::Comm<int> >
3594  getComm() const
3595  {
3596  return graph_.getComm();
3597  }
3598 
3599 
3600  template<class Scalar, class LO, class GO, class Node>
3604  {
3605  return graph_.getGlobalNumCols();
3606  }
3607 
3608  template<class Scalar, class LO, class GO, class Node>
3609  size_t
3612  {
3613  return graph_.getNodeNumCols();
3614  }
3615 
3616  template<class Scalar, class LO, class GO, class Node>
3617  GO
3620  {
3621  return graph_.getIndexBase();
3622  }
3623 
3624  template<class Scalar, class LO, class GO, class Node>
3628  {
3629  return graph_.getGlobalNumEntries();
3630  }
3631 
3632  template<class Scalar, class LO, class GO, class Node>
3633  size_t
3636  {
3637  return graph_.getNodeNumEntries();
3638  }
3639 
3640  template<class Scalar, class LO, class GO, class Node>
3641  size_t
3643  getNumEntriesInGlobalRow (GO globalRow) const
3644  {
3645  return graph_.getNumEntriesInGlobalRow(globalRow);
3646  }
3647 
3648 
3649  template<class Scalar, class LO, class GO, class Node>
3650  size_t
3653  {
3654  return graph_.getGlobalMaxNumRowEntries();
3655  }
3656 
3657  template<class Scalar, class LO, class GO, class Node>
3658  bool
3660  hasColMap() const
3661  {
3662  return graph_.hasColMap();
3663  }
3664 
3665 
3666  template<class Scalar, class LO, class GO, class Node>
3667  bool
3670  {
3671  return graph_.isLocallyIndexed();
3672  }
3673 
3674  template<class Scalar, class LO, class GO, class Node>
3675  bool
3678  {
3679  return graph_.isGloballyIndexed();
3680  }
3681 
3682  template<class Scalar, class LO, class GO, class Node>
3683  bool
3686  {
3687  return graph_.isFillComplete ();
3688  }
3689 
3690  template<class Scalar, class LO, class GO, class Node>
3691  bool
3694  {
3695  return false;
3696  }
3697 
3698 
3699  template<class Scalar, class LO, class GO, class Node>
3700  void
3702  getGlobalRowCopy (GO /* GlobalRow */,
3703  const Teuchos::ArrayView<GO> &/* Indices */,
3704  const Teuchos::ArrayView<Scalar> &/* Values */,
3705  size_t &/* NumEntries */) const
3706  {
3707  TEUCHOS_TEST_FOR_EXCEPTION(
3708  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
3709  "This class doesn't support global matrix indexing.");
3710 
3711  }
3712 
3713  template<class Scalar, class LO, class GO, class Node>
3714  void
3716  getGlobalRowView (GO /* GlobalRow */,
3717  Teuchos::ArrayView<const GO> &/* indices */,
3718  Teuchos::ArrayView<const Scalar> &/* values */) const
3719  {
3720  TEUCHOS_TEST_FOR_EXCEPTION(
3721  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
3722  "This class doesn't support global matrix indexing.");
3723 
3724  }
3725 
3726  template<class Scalar, class LO, class GO, class Node>
3727  void
3729  getLocalRowView (LO /* LocalRow */,
3730  Teuchos::ArrayView<const LO>& /* indices */,
3731  Teuchos::ArrayView<const Scalar>& /* values */) const
3732  {
3733  TEUCHOS_TEST_FOR_EXCEPTION(
3734  true, std::logic_error, "Tpetra::BlockCrsMatrix::getLocalRowView: "
3735  "This class doesn't support local matrix indexing.");
3736  }
3737 
3738  template<class Scalar, class LO, class GO, class Node>
3739  void
3742  {
3743 #ifdef HAVE_TPETRA_DEBUG
3744  const char prefix[] =
3745  "Tpetra::BlockCrsMatrix::getLocalDiagCopy: ";
3746 #endif // HAVE_TPETRA_DEBUG
3747 
3748  const size_t lclNumMeshRows = graph_.getNodeNumRows ();
3749 
3750  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3751  graph_.getLocalDiagOffsets (diagOffsets);
3752 
3753  // The code below works on host, so use a host View.
3754  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3755  Kokkos::deep_copy (diagOffsetsHost, diagOffsets);
3756  // We're filling diag on host for now.
3757  diag.template modify<typename decltype (diagOffsetsHost)::memory_space> ();
3758 
3759 #ifdef HAVE_TPETRA_DEBUG
3760  TEUCHOS_TEST_FOR_EXCEPTION
3761  (this->need_sync_host (), std::runtime_error,
3762  prefix << "The matrix's data were last modified on device, but have "
3763  "not been sync'd to host. Please sync to host (by calling "
3764  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
3765  "method.");
3766 #endif // HAVE_TPETRA_DEBUG
3767 
3768  auto vals_host_out = getValuesHost ();
3769  Scalar* vals_host_out_raw =
3770  reinterpret_cast<Scalar*> (vals_host_out.data ());
3771 
3772  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3773  size_t rowOffset = 0;
3774  size_t offset = 0;
3775  LO bs = getBlockSize();
3776  for(size_t r=0; r<getNodeNumRows(); r++)
3777  {
3778  // move pointer to start of diagonal block
3779  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3780  for(int b=0; b<bs; b++)
3781  {
3782  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3783  }
3784  // move pointer to start of next block row
3785  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3786  }
3787 
3788  diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3789  }
3790 
3791  template<class Scalar, class LO, class GO, class Node>
3792  void
3794  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3795  {
3796  TEUCHOS_TEST_FOR_EXCEPTION(
3797  true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3798  "not implemented.");
3799 
3800  }
3801 
3802  template<class Scalar, class LO, class GO, class Node>
3803  void
3805  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3806  {
3807  TEUCHOS_TEST_FOR_EXCEPTION(
3808  true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3809  "not implemented.");
3810 
3811  }
3812 
3813  template<class Scalar, class LO, class GO, class Node>
3814  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3816  getGraph() const
3817  {
3818  return graphRCP_;
3819  }
3820 
3821  template<class Scalar, class LO, class GO, class Node>
3822  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3825  {
3826  TEUCHOS_TEST_FOR_EXCEPTION(
3827  true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3828  "not implemented.");
3829  }
3830 
3831 } // namespace Tpetra
3832 
3833 //
3834 // Explicit instantiation macro
3835 //
3836 // Must be expanded from within the Tpetra namespace!
3837 //
3838 #define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3839  template class BlockCrsMatrix< S, LO, GO, NODE >;
3840 
3841 #endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const
The current number of entries on the calling process in the specified global row. ...
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
little_vec_type::HostMirror getLocalBlock(const LO localRowIndex, const LO colIndex) const
Get a host view of the degrees of freedom at the given mesh point.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
void gaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of gaussSeidel(), with fewer requirements on X.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type > local_graph_type
The type of the part of the sparse graph on each MPI process.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
size_t getNodeNumRows() const
get the local number of block rows
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
virtual bool isGloballyIndexed() const
Whether matrix indices are globally indexed.
size_t getNumEntriesInLocalRow(const LO localRowInd) const
Return the number of entries in the given row on the calling process.
LO local_ordinal_type
The type of local indices.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
virtual void getGlobalRowView(GO GlobalRow, Teuchos::ArrayView< const GO > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const
The Frobenius norm of the matrix.
virtual global_size_t getGlobalNumCols() const
The global number of columns of this matrix.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets, Distributor &)
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
One or more distributed dense vectors.
virtual bool isLocallyIndexed() const
Whether matrix indices are locally indexed.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Linear algebra kernels for small dense matrices and vectors.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, Distributor &, const CombineMode combineMode)
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the left with the given Vector x.
virtual bool hasColMap() const
Whether this matrix has a well-defined column Map.
Kokkos::View< impl_scalar_type *, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_vec_type
The type used to access nonconst vector blocks.
virtual size_t getNodeNumEntries() const
The local number of stored (structurally nonzero) entries.
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
virtual size_t getNodeNumCols() const
The number of columns needed to apply the forward operator on this node.
int local_ordinal_type
Default value of Scalar template parameter.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
MultiVector for multiple degrees of freedom per mesh point.
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, and the small dense vectors in BlockMultiVector and BlockVector.
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
size_t global_size_t
Global size_t object.
size_t getNodeNumEntries() const override
The local number of entries in the graph.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
virtual bool isFillComplete() const
Whether fillComplete() has been called.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Like sumIntoLocalValues, but for the ABSMAX combine mode.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
virtual size_t getGlobalMaxNumRowEntries() const
The maximum number of entries in any row over all processes in the matrix&#39;s communicator.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don&#39;t currently exist.
size_t getNodeMaxNumRowEntries() const
Maximum number of entries in any row of the matrix, on this process.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the right with the given Vector x.
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
Teuchos::RCP< const map_type > getRangeMap() const
Get the (point) range Map of this matrix.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Teuchos::RCP< const map_type > getRowMap() const
get the (mesh) map for the rows of this block matrix.
virtual global_size_t getGlobalNumEntries() const
The global number of stored (structurally nonzero) entries.
Sets up and executes a communication plan for a Tpetra DistObject.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
global_size_t getGlobalNumRows() const
get the global number of block rows
virtual void getGlobalRowCopy(GO GlobalRow, const Teuchos::ArrayView< GO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Get a copy of the given global row&#39;s entries.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
Replace old value with maximum of magnitudes of old and new values.
void sync_host()
Sync the matrix&#39;s values to host space.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs)
Replace existing values with new values.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
Replace old values with zero.
virtual bool supportsRowViews() const
Whether this object implements getLocalRowView() and getGlobalRowView().
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
LO absMaxLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValuesByOffsets, but for the ABSMAX combine mode.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which this matrix is distributed.
LO getLocalRowView(const LO localRowInd, const LO *&colInds, Scalar *&vals, LO &numInds) const
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
LO getNumVectors() const
Get the number of columns (vectors) in the BlockMultiVector.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
char packet_type
Implementation detail; tells.
void localGaussSeidel(const BlockMultiVector< Scalar, LO, GO, Node > &Residual, BlockMultiVector< Scalar, LO, GO, Node > &Solution, const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &D_inv, const Scalar &omega, const ESweepDirection direction) const
Local Gauss-Seidel solve, given a factorized diagonal.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const
Print a description of this object to the given output stream.
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
A distributed dense vector.
Teuchos::RCP< const map_type > getDomainMap() const
Get the (point) domain Map of this matrix.
void getLocalRowCopy(LO LocalRow, const Teuchos::ArrayView< LO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Not implemented.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
local_graph_type getLocalGraph() const
Get the local graph.
Teuchos::RCP< const map_type > getColMap() const
get the (mesh) map for the columns of this block matrix.
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const
Get the (mesh) graph.
void reorderedGaussSeidelCopy(::Tpetra::MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Teuchos::ArrayView< LO > &rowIndices, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of reorderedGaussSeidel(), with fewer requirements on X.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
std::string description() const
One-line description of this object.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value) const
Replace current value at the specified location with specified values.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
virtual GO getIndexBase() const
The index base for global indices in this matrix.