Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
Tpetra_Experimental_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
43 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
44 
47 
51 #include "Tpetra_Experimental_BlockMultiVector.hpp"
53 
54 #include "Teuchos_TimeMonitor.hpp"
55 #ifdef HAVE_TPETRA_DEBUG
56 # include <set>
57 #endif // HAVE_TPETRA_DEBUG
58 
59 //
60 // mfh 25 May 2016: Temporary fix for #393.
61 //
62 // Don't use lambdas in the BCRS mat-vec for GCC < 4.8, due to a GCC
63 // 4.7.2 compiler bug ("internal compiler error") when compiling them.
64 // Also, lambdas for Kokkos::parallel_* don't work with CUDA, so don't
65 // use them in that case, either.
66 //
67 // mfh 31 May 2016: GCC 4.9.[23] appears to be broken ("internal
68 // compiler error") too. Ditto for GCC 5.1. I'll just disable the
69 // thing for any GCC version.
70 //
71 #if defined(__CUDACC__)
72  // Lambdas for Kokkos::parallel_* don't work with CUDA 7.5 either.
73 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
74 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
75 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
76 
77 #elif defined(__GNUC__)
78 
79 # if defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
80 # undef TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA
81 # endif // defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
82 
83 #else // some other compiler
84 
85  // Optimistically assume that other compilers aren't broken.
86 # if ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
87 # define TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA 1
88 # endif // ! defined(TPETRA_BLOCKCRSMATRIX_APPLY_USE_LAMBDA)
89 #endif // defined(__CUDACC__), defined(__GNUC__)
90 
91 
92 namespace Tpetra {
93 namespace Experimental {
94 
95 namespace Impl {
96 
97  template<typename T>
98  struct BlockCrsRowStruct {
99  T totalNumEntries, totalNumBytes, maxRowLength;
100  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct() = default;
101  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
102  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
103  : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
104  };
105 
106  template<typename T>
107  static
108  KOKKOS_INLINE_FUNCTION
109  void operator+=(volatile BlockCrsRowStruct<T> &a,
110  volatile const BlockCrsRowStruct<T> &b) {
111  a.totalNumEntries += b.totalNumEntries;
112  a.totalNumBytes += b.totalNumBytes;
113  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
114  }
115 
116  template<typename T>
117  static
118  KOKKOS_INLINE_FUNCTION
119  void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
120  a.totalNumEntries += b.totalNumEntries;
121  a.totalNumBytes += b.totalNumBytes;
122  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
123  }
124 
125  template<typename T, typename ExecSpace>
126  struct BlockCrsReducer {
127  typedef BlockCrsReducer reducer;
128  typedef T value_type;
129  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
130  value_type *value;
131 
132  KOKKOS_INLINE_FUNCTION
133  BlockCrsReducer(value_type &val) : value(&val) {}
134 
135  KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
136  KOKKOS_INLINE_FUNCTION void join(volatile value_type &dst, const volatile value_type &src) const { dst += src; }
137  KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
138  KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
139  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
140  };
141 
142 #if 0
// Team-policy functor for one no-transpose block-CRS sparse
// matrix-vector multiply, Y = beta*Y + alpha*A*X, for a single
// input/output vector (rank-1 Views).  Each team handles rowsPerTeam_
// block rows; each thread in a team handles one block row, using
// per-thread scratch memory for the accumulator.
//
// NOTE: this entire class is inside "#if 0" -- it is disabled dead
// code, kept for reference; the active implementation is the
// range-policy functor BcrsApplyNoTrans1VecFunctor below.
template<class AlphaCoeffType,
         class GraphType,
         class MatrixValuesType,
         class InVecType,
         class BetaCoeffType,
         class OutVecType>
class BcrsApplyNoTrans1VecTeamFunctor {
private:
  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
                 "MatrixValuesType must be a Kokkos::View.");
  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
                 "OutVecType must be a Kokkos::View.");
  static_assert (Kokkos::Impl::is_view<InVecType>::value,
                 "InVecType must be a Kokkos::View.");
  static_assert (std::is_same<MatrixValuesType,
                 typename MatrixValuesType::const_type>::value,
                 "MatrixValuesType must be a const Kokkos::View.");
  static_assert (std::is_same<OutVecType,
                 typename OutVecType::non_const_type>::value,
                 "OutVecType must be a nonconst Kokkos::View.");
  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
                 "InVecType must be a const Kokkos::View.");
  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
                 "MatrixValuesType must be a rank-1 Kokkos::View.");
  static_assert (static_cast<int> (InVecType::rank) == 1,
                 "InVecType must be a rank-1 Kokkos::View.");
  static_assert (static_cast<int> (OutVecType::rank) == 1,
                 "OutVecType must be a rank-1 Kokkos::View.");
  typedef typename MatrixValuesType::non_const_value_type scalar_type;
  typedef typename GraphType::device_type device_type;
  typedef typename device_type::execution_space execution_space;
  typedef typename execution_space::scratch_memory_space shmem_space;

public:
  typedef typename std::remove_const<typename GraphType::data_type>::type
  // NOTE(review): the typedef's target name appears to have been lost
  // in extraction -- presumably "local_ordinal_type;", which the rest
  // of this class uses.  Confirm against the original header.
  typedef Kokkos::TeamPolicy<Kokkos::Schedule<Kokkos::Dynamic>,
                             execution_space,
                             local_ordinal_type> policy_type;
  // Swap in a new input-vector column (lets callers reuse one functor
  // instance for every column of a multivector).
  void setX (const InVecType& X) { X_ = X; }

  // Swap in a new output-vector column.
  void setY (const OutVecType& Y) { Y_ = Y; }

  // Bytes of team-level scratch needed: one block-sized vector of Y's
  // value type.
  KOKKOS_INLINE_FUNCTION local_ordinal_type
  getScratchSizePerTeam () const
  {
    // WARNING (mfh 01 Jun 2016) This may not work for Scalar types
    // that have run-time sizes, like some of those in Stokhos.
    typedef typename std::decay<decltype (Y_(0))>::type y_value_type;
    return blockSize_ * sizeof (y_value_type);
  }

  // Bytes of per-thread scratch needed: one block-sized vector of Y's
  // value type.
  KOKKOS_INLINE_FUNCTION local_ordinal_type
  getScratchSizePerThread () const
  {
    // WARNING (mfh 01 Jun 2016) This may not work for Scalar types
    // that have run-time sizes, like some of those in Stokhos.
    typedef typename std::decay<decltype (Y_(0))>::type y_value_type;
    return blockSize_ * sizeof (y_value_type);
  }

private:
  // Number of local (block) mesh rows; the row-offset array has one
  // more entry than the number of rows, unless it is empty.
  KOKKOS_INLINE_FUNCTION local_ordinal_type
  getNumLclMeshRows () const
  {
    return ptr_.extent (0) == 0 ?
      static_cast<local_ordinal_type> (0) :
      static_cast<local_ordinal_type> (ptr_.extent (0) - 1);
  }

  static constexpr local_ordinal_type defaultRowsPerTeam = 20;

public:
  // Number of teams to launch.  Hard-wired to 1 (the blocked
  // computation below is commented out).
  local_ordinal_type getNumTeams () const {
    return 1;
    // const local_ordinal_type numLclMeshRows = getNumLclMeshRows ();
    // return (numLclMeshRows + rowsPerTeam_ - 1) / rowsPerTeam_;
  }

  // Capture everything the kernel needs; views are shallow-copied.
  BcrsApplyNoTrans1VecTeamFunctor (const typename std::decay<AlphaCoeffType>::type& alpha,
                                   const GraphType& graph,
                                   const MatrixValuesType& val,
                                   const local_ordinal_type blockSize,
                                   const InVecType& X,
                                   const typename std::decay<BetaCoeffType>::type& beta,
                                   const OutVecType& Y,
                                   const local_ordinal_type rowsPerTeam = defaultRowsPerTeam) :
    alpha_ (alpha),
    ptr_ (graph.row_map),
    ind_ (graph.entries),
    val_ (val),
    blockSize_ (blockSize),
    X_ (X),
    beta_ (beta),
    Y_ (Y),
    rowsPerTeam_ (rowsPerTeam)
  {
    // std::ostringstream os;
    // os << "Functor ctor: numLclMeshRows = " << getNumLclMeshRows () << std::endl;
    // std::cerr << os.str ();
  }

  KOKKOS_INLINE_FUNCTION void
  operator () (const typename policy_type::member_type& member) const
  {
    using Kokkos::Details::ArithTraits;
    // I'm not writing 'using Kokkos::make_pair;' here, because that
    // may break builds for users who make the mistake of putting
    // 'using namespace std;' in the global namespace. Please don't
    // ever do that! But just in case you do, I'll take this
    // precaution.
    using Kokkos::parallel_for;
    using Kokkos::subview;
    typedef local_ordinal_type LO;
    typedef typename decltype (ptr_)::non_const_value_type offset_type;
    // Unmanaged view over this thread's slice of team scratch.
    typedef Kokkos::View<typename OutVecType::non_const_value_type*,
                         shmem_space, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
      shared_array_type;
    typedef Kokkos::View<typename OutVecType::non_const_value_type*,
                         Kokkos::LayoutRight,
                         device_type,
                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >
      out_little_vec_type;
    typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
                         Kokkos::LayoutRight,
                         device_type,
                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >
      little_block_type;

    const LO leagueRank = member.league_rank();

    // This looks wrong! All the threads are writing into the same local storage.
    // shared_array_type threadLocalMem =
    //   shared_array_type (member.thread_scratch (1), blockSize_);
    shared_array_type threadLocalMem =
      shared_array_type (member.thread_scratch (1), blockSize_ * rowsPerTeam_);

    // This looks wrong! All the threads are writing into the same local storage.
    //out_little_vec_type Y_tlm (threadLocalMem.data (), blockSize_, 1);

    // Half-open range [rowBeg, rowEnd) of block rows owned by this team.
    const LO numLclMeshRows = getNumLclMeshRows ();
    const LO rowBeg = leagueRank * rowsPerTeam_;
    const LO rowTmp = rowBeg + rowsPerTeam_;
    const LO rowEnd = rowTmp < numLclMeshRows ? rowTmp : numLclMeshRows;

    // {
    //   std::ostringstream os;
    //   os << leagueRank << "," << member.team_rank () << ": "
    //      << rowBeg << "," << rowEnd << std::endl;
    //   std::cerr << os.str ();
    // }

    // Each team takes rowsPerTeam_ (block) rows.
    // Each thread in the team takes a single (block) row.
    parallel_for (Kokkos::TeamThreadRange (member, rowBeg, rowEnd),
      [&] (const LO& lclRow) {
        // Each thread in the team gets its own temp storage.
        out_little_vec_type Y_tlm (threadLocalMem.data () + blockSize_ * member.team_rank (), blockSize_);

        const offset_type Y_ptBeg = lclRow * blockSize_;
        const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
        auto Y_cur =
          subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));
        // Apply beta to the scratch accumulator (special-casing 0 and
        // 1 avoids reading resp. scaling Y unnecessarily).
        if (beta_ == ArithTraits<BetaCoeffType>::zero ()) {
          FILL (Y_tlm, ArithTraits<BetaCoeffType>::zero ());
        }
        else if (beta_ == ArithTraits<BetaCoeffType>::one ()) {
          COPY (Y_cur, Y_tlm);
        }
        else { // beta != 0 && beta != 1
          COPY (Y_cur, Y_tlm);
          SCAL (beta_, Y_tlm);
        }

        if (alpha_ != ArithTraits<AlphaCoeffType>::zero ()) {
          const offset_type blkBeg = ptr_[lclRow];
          const offset_type blkEnd = ptr_[lclRow+1];
          // Precompute to save integer math in the inner loop.
          const offset_type bs2 = blockSize_ * blockSize_;
          for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
               ++absBlkOff) {
            little_block_type A_cur (val_.data () + absBlkOff * bs2,
                                     blockSize_, blockSize_);
            const offset_type X_blkCol = ind_[absBlkOff];
            const offset_type X_ptBeg = X_blkCol * blockSize_;
            const offset_type X_ptEnd = X_ptBeg + blockSize_;
            auto X_cur =
              subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));
            // Y_tlm += alpha*A_cur*X_cur
            GEMV (alpha_, A_cur, X_cur, Y_tlm);
          } // for each entry in current local block row of matrix
          COPY (Y_tlm, Y_cur);
        }
      });
  }

private:
  typename std::decay<AlphaCoeffType>::type alpha_;
  typename GraphType::row_map_type::const_type ptr_;  // block-row offsets
  typename GraphType::entries_type::const_type ind_;  // block-column indices
  MatrixValuesType val_;                              // flat block values
  local_ordinal_type blockSize_;
  InVecType X_;
  typename std::decay<BetaCoeffType>::type beta_;
  OutVecType Y_;
  local_ordinal_type rowsPerTeam_;
};
376 #endif // 0
377 
// Range-policy functor for one no-transpose block-CRS sparse
// matrix-vector multiply, Y = beta*Y + alpha*A*X, for a single
// input/output vector (rank-1 Views).  Each parallel iteration
// processes one local block row; no scratch memory is needed because
// each thread owns its own block of Y.  This is the active
// implementation used by bcrsLocalApplyNoTrans below.
template<class AlphaCoeffType,
         class GraphType,
         class MatrixValuesType,
         class InVecType,
         class BetaCoeffType,
         class OutVecType>
class BcrsApplyNoTrans1VecFunctor {
private:
  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
                 "MatrixValuesType must be a Kokkos::View.");
  static_assert (Kokkos::Impl::is_view<OutVecType>::value,
                 "OutVecType must be a Kokkos::View.");
  static_assert (Kokkos::Impl::is_view<InVecType>::value,
                 "InVecType must be a Kokkos::View.");
  static_assert (std::is_same<MatrixValuesType,
                 typename MatrixValuesType::const_type>::value,
                 "MatrixValuesType must be a const Kokkos::View.");
  static_assert (std::is_same<OutVecType,
                 typename OutVecType::non_const_type>::value,
                 "OutVecType must be a nonconst Kokkos::View.");
  static_assert (std::is_same<InVecType, typename InVecType::const_type>::value,
                 "InVecType must be a const Kokkos::View.");
  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
                 "MatrixValuesType must be a rank-1 Kokkos::View.");
  static_assert (static_cast<int> (InVecType::rank) == 1,
                 "InVecType must be a rank-1 Kokkos::View.");
  static_assert (static_cast<int> (OutVecType::rank) == 1,
                 "OutVecType must be a rank-1 Kokkos::View.");
  typedef typename MatrixValuesType::non_const_value_type scalar_type;

public:
  typedef typename GraphType::device_type device_type;

  typedef typename std::remove_const<typename GraphType::data_type>::type
  // NOTE(review): the typedef's target name appears to have been lost
  // in extraction -- presumably "local_ordinal_type;", which the rest
  // of this class uses.  Confirm against the original header.
  typedef Kokkos::RangePolicy<Kokkos::Schedule<Kokkos::Dynamic>,
                              typename device_type::execution_space,
                              local_ordinal_type> policy_type;
  // Swap in a new input-vector column (lets callers reuse one functor
  // instance for every column of a multivector).
  void setX (const InVecType& X) { X_ = X; }

  // Swap in a new output-vector column.
  void setY (const OutVecType& Y) { Y_ = Y; }

  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_coeff_type;
  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_coeff_type;

  // Capture everything the kernel needs; views are shallow-copied.
  BcrsApplyNoTrans1VecFunctor (const alpha_coeff_type& alpha,
                               const GraphType& graph,
                               const MatrixValuesType& val,
                               const local_ordinal_type blockSize,
                               const InVecType& X,
                               const beta_coeff_type& beta,
                               const OutVecType& Y) :
    alpha_ (alpha),
    ptr_ (graph.row_map),
    ind_ (graph.entries),
    val_ (val),
    blockSize_ (blockSize),
    X_ (X),
    beta_ (beta),
    Y_ (Y)
  {}

  // Process one local block row: scale Y's block by beta, then
  // accumulate alpha * (row block) * (X block) for each entry.
  KOKKOS_INLINE_FUNCTION void
  operator () (const local_ordinal_type& lclRow) const
  {
    using Kokkos::Details::ArithTraits;
    // I'm not writing 'using Kokkos::make_pair;' here, because that
    // may break builds for users who make the mistake of putting
    // 'using namespace std;' in the global namespace. Please don't
    // ever do that! But just in case you do, I'll take this
    // precaution.
    using Kokkos::parallel_for;
    using Kokkos::subview;
    typedef typename decltype (ptr_)::non_const_value_type offset_type;
    typedef Kokkos::View<typename MatrixValuesType::const_value_type**,
                         Kokkos::LayoutRight,
                         device_type,
                         Kokkos::MemoryTraits<Kokkos::Unmanaged> >
      little_block_type;

    const offset_type Y_ptBeg = lclRow * blockSize_;
    const offset_type Y_ptEnd = Y_ptBeg + blockSize_;
    auto Y_cur = subview (Y_, ::Kokkos::make_pair (Y_ptBeg, Y_ptEnd));

    // This version of the code does not use temporary storage.
    // Each thread writes to its own block of the target vector.
    if (beta_ == ArithTraits<beta_coeff_type>::zero ()) {
      FILL (Y_cur, ArithTraits<beta_coeff_type>::zero ());
    }
    else if (beta_ != ArithTraits<beta_coeff_type>::one ()) { // beta != 0 && beta != 1
      SCAL (beta_, Y_cur);
    }

    if (alpha_ != ArithTraits<alpha_coeff_type>::zero ()) {
      const offset_type blkBeg = ptr_[lclRow];
      const offset_type blkEnd = ptr_[lclRow+1];
      // Precompute to save integer math in the inner loop.
      const offset_type bs2 = blockSize_ * blockSize_;
      for (offset_type absBlkOff = blkBeg; absBlkOff < blkEnd;
           ++absBlkOff) {
        little_block_type A_cur (val_.data () + absBlkOff * bs2,
                                 blockSize_, blockSize_);
        const offset_type X_blkCol = ind_[absBlkOff];
        const offset_type X_ptBeg = X_blkCol * blockSize_;
        const offset_type X_ptEnd = X_ptBeg + blockSize_;
        auto X_cur = subview (X_, ::Kokkos::make_pair (X_ptBeg, X_ptEnd));

        GEMV (alpha_, A_cur, X_cur, Y_cur); // Y_cur += alpha*A_cur*X_cur
      } // for each entry in current local block row of matrix
    }
  }

private:
  alpha_coeff_type alpha_;
  typename GraphType::row_map_type::const_type ptr_;  // block-row offsets
  typename GraphType::entries_type::const_type ind_;  // block-column indices
  MatrixValuesType val_;                              // flat block values
  local_ordinal_type blockSize_;
  InVecType X_;
  beta_coeff_type beta_;
  OutVecType Y_;
};
519 
// Local (on-process) no-transpose block-CRS sparse matrix-(multi)vector
// multiply: Y := beta*Y + alpha*A*X, applied one column of X/Y at a
// time.  The "#if 0" branches select the disabled team-policy
// implementation; the active path uses the range-policy functor
// BcrsApplyNoTrans1VecFunctor above.
template<class AlphaCoeffType,
         class GraphType,
         class MatrixValuesType,
         class InMultiVecType,
         class BetaCoeffType,
         class OutMultiVecType>
void
bcrsLocalApplyNoTrans (const AlphaCoeffType& alpha,
                       const GraphType& graph,
                       const MatrixValuesType& val,
                       const typename std::remove_const<typename GraphType::data_type>::type blockSize,
                       const InMultiVecType& X,
                       const BetaCoeffType& beta,
                       const OutMultiVecType& Y
#if 0
                       , const typename std::remove_const<typename GraphType::data_type>::type rowsPerTeam = 20
#endif // 0
                       )
{
  static_assert (Kokkos::Impl::is_view<MatrixValuesType>::value,
                 "MatrixValuesType must be a Kokkos::View.");
  static_assert (Kokkos::Impl::is_view<OutMultiVecType>::value,
                 "OutMultiVecType must be a Kokkos::View.");
  static_assert (Kokkos::Impl::is_view<InMultiVecType>::value,
                 "InMultiVecType must be a Kokkos::View.");
  static_assert (static_cast<int> (MatrixValuesType::rank) == 1,
                 "MatrixValuesType must be a rank-1 Kokkos::View.");
  static_assert (static_cast<int> (OutMultiVecType::rank) == 2,
                 "OutMultiVecType must be a rank-2 Kokkos::View.");
  static_assert (static_cast<int> (InMultiVecType::rank) == 2,
                 "InMultiVecType must be a rank-2 Kokkos::View.");

  typedef typename MatrixValuesType::const_type matrix_values_type;
  typedef typename OutMultiVecType::non_const_type out_multivec_type;
  typedef typename InMultiVecType::const_type in_multivec_type;
  typedef typename Kokkos::ArithTraits<typename std::decay<AlphaCoeffType>::type>::val_type alpha_type;
  typedef typename Kokkos::ArithTraits<typename std::decay<BetaCoeffType>::type>::val_type beta_type;
  typedef typename std::remove_const<typename GraphType::data_type>::type LO;

  // Number of local (block) mesh rows: one less than the row-offset
  // array's length, unless that array is empty.
  const LO numLocalMeshRows = graph.row_map.extent (0) == 0 ?
    static_cast<LO> (0) :
    static_cast<LO> (graph.row_map.extent (0) - 1);
  const LO numVecs = Y.extent (1);
  if (numLocalMeshRows == 0 || numVecs == 0) {
    return; // code below doesn't handle numVecs==0 correctly
  }

  // These assignments avoid instantiating the functor extra times
  // unnecessarily, e.g., for X const vs. nonconst. We only need the
  // X const case, so only instantiate for that case.
  in_multivec_type X_in = X;
  out_multivec_type Y_out = Y;

  // The functor only knows how to handle one vector at a time, and it
  // expects 1-D Views. Thus, we need to know the type of each column
  // of X and Y.
  typedef decltype (Kokkos::subview (X_in, Kokkos::ALL (), 0)) in_vec_type;
  typedef decltype (Kokkos::subview (Y_out, Kokkos::ALL (), 0)) out_vec_type;
#if 0
  typedef BcrsApplyNoTrans1VecTeamFunctor<alpha_type, GraphType,
    matrix_values_type, in_vec_type, beta_type, out_vec_type> functor_type;
#else
  typedef BcrsApplyNoTrans1VecFunctor<alpha_type, GraphType,
    matrix_values_type, in_vec_type, beta_type, out_vec_type> functor_type;
#endif // 0
  typedef typename functor_type::policy_type policy_type;

  auto X_0 = Kokkos::subview (X_in, Kokkos::ALL (), 0);
  auto Y_0 = Kokkos::subview (Y_out, Kokkos::ALL (), 0);
#if 0
  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0, rowsPerTeam);
  const LO numTeams = functor.getNumTeams ();
  policy_type policy (numTeams, Kokkos::AUTO ());
  {
    // KJ : hierarchy level of memory allocated e.g., cache (1),
    // HBM (2), DDR (3), not used for now
    const LO level = 1;
    // KJ : for now provide two options for parallelizing (for vs. reduce)
    const LO scratchSizePerTeam = functor.getScratchSizePerTeam (); // used for team parallel_red
    const LO scratchSizePerThread = functor.getScratchSizePerThread (); // used for team parallel_for
    policy =
      policy.set_scratch_size (level,
                               Kokkos::PerTeam (scratchSizePerTeam),
                               Kokkos::PerThread (scratchSizePerThread));
  }
#else
  functor_type functor (alpha, graph, val, blockSize, X_0, beta, Y_0);
  policy_type policy (0, numLocalMeshRows);
#endif // 0

  // Compute the first column of Y.
  Kokkos::parallel_for (policy, functor);

  // Compute the remaining columns of Y.  Reuse the same functor by
  // swapping in each column view, which avoids re-instantiating the
  // kernel per column.
  for (LO j = 1; j < numVecs; ++j) {
    auto X_j = Kokkos::subview (X_in, Kokkos::ALL (), j);
    auto Y_j = Kokkos::subview (Y_out, Kokkos::ALL (), j);
    functor.setX (X_j);
    functor.setY (Y_j);
    Kokkos::parallel_for (policy, functor);
  }
}
622 
623 } // namespace Impl
624 
625 namespace { // (anonymous)
626 
627 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
628 // version that takes two Kokkos::View arguments).
629 template<class Scalar, class LO, class GO, class Node>
630 class GetLocalDiagCopy {
631 public:
632  typedef typename Node::device_type device_type;
633  typedef size_t diag_offset_type;
634  typedef Kokkos::View<const size_t*, device_type,
635  Kokkos::MemoryUnmanaged> diag_offsets_type;
636  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
637  typedef typename global_graph_type::local_graph_type local_graph_type;
638  typedef typename local_graph_type::row_map_type row_offsets_type;
639  typedef typename ::Tpetra::Experimental::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
640  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
641  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
642 
643  // Constructor
644  GetLocalDiagCopy (const diag_type& diag,
645  const values_type& val,
646  const diag_offsets_type& diagOffsets,
647  const row_offsets_type& ptr,
648  const LO blockSize) :
649  diag_ (diag),
650  diagOffsets_ (diagOffsets),
651  ptr_ (ptr),
652  blockSize_ (blockSize),
653  offsetPerBlock_ (blockSize_*blockSize_),
654  val_(val)
655  {}
656 
657  KOKKOS_INLINE_FUNCTION void
658  operator() (const LO& lclRowInd) const
659  {
660  using Kokkos::ALL;
661 
662  // Get row offset
663  const size_t absOffset = ptr_[lclRowInd];
664 
665  // Get offset relative to start of row
666  const size_t relOffset = diagOffsets_[lclRowInd];
667 
668  // Get the total offset
669  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
670 
671  // Get a view of the block. BCRS currently uses LayoutRight
672  // regardless of the device.
673  typedef Kokkos::View<const IST**, Kokkos::LayoutRight,
674  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
675  const_little_block_type;
676  const_little_block_type D_in (val_.data () + pointOffset,
677  blockSize_, blockSize_);
678  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
679  COPY (D_in, D_out);
680  }
681 
682  private:
683  diag_type diag_;
684  diag_offsets_type diagOffsets_;
685  row_offsets_type ptr_;
686  LO blockSize_;
687  LO offsetPerBlock_;
688  values_type val_;
689  };
690 } // namespace (anonymous)
691 
692  template<class Scalar, class LO, class GO, class Node>
693  std::ostream&
694  BlockCrsMatrix<Scalar, LO, GO, Node>::
695  markLocalErrorAndGetStream ()
696  {
697  * (this->localError_) = true;
698  if ((*errs_).is_null ()) {
699  *errs_ = Teuchos::rcp (new std::ostringstream ());
700  }
701  return **errs_;
702  }
703 
  // Default constructor: builds an empty matrix with no graph, zero
  // block size, and lazily-allocated error state.  The dummy nonnull
  // map keeps the DistObject base class from throwing.
  // NOTE(review): the constructor's declaration line (presumably
  // "BlockCrsMatrix<Scalar, LO, GO, Node>::BlockCrsMatrix () :")
  // appears to have been lost in extraction -- confirm against the
  // original header.
  template<class Scalar, class LO, class GO, class Node>
    dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
    graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
    blockSize_ (static_cast<LO> (0)),
    X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
    offsetPerBlock_ (0),
    localError_ (new bool (false)),
    errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
  {
  }
718 
  // Construct from a fill-complete mesh graph and a block size.
  // Domain and range point maps are derived from the graph's
  // domain/range maps via BMV::makePointMap.  Throws
  // std::invalid_argument if the graph's rows are not sorted or if
  // blockSize is nonpositive.
  // NOTE(review): the constructor's declaration lines (presumably
  // "BlockCrsMatrix<Scalar, LO, GO, Node>::
  //  BlockCrsMatrix (const crs_graph_type& graph,") appear to have
  // been lost in extraction -- confirm against the original header.
  template<class Scalar, class LO, class GO, class Node>
                  const LO blockSize) :
    dist_object_type (graph.getMap ()),
    graph_ (graph),
    rowMeshMap_ (* (graph.getRowMap ())),
    blockSize_ (blockSize),
    X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
    offsetPerBlock_ (blockSize * blockSize),
    localError_ (new bool (false)),
    errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
  {
    TEUCHOS_TEST_FOR_EXCEPTION(
      ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
      "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
      "rows (isSorted() is false). This class assumes sorted rows.");

    graphRCP_ = Teuchos::rcpFromRef(graph_);

    // Trick to test whether LO is nonpositive, without a compiler
    // warning in case LO is unsigned (which is generally a bad idea
    // anyway). I don't promise that the trick works, but it
    // generally does with gcc at least, in my experience.
    const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
    TEUCHOS_TEST_FOR_EXCEPTION(
      blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
      "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
      " <= 0. The block size must be positive.");

    domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
    rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);

    // Keep host mirrors of the graph's row offsets and column indices,
    // so host-side methods can traverse the graph without touching
    // device memory.
    {
      typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
      typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;

      row_map_type ptr_d = graph.getLocalGraph ().row_map;
      nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
      Kokkos::deep_copy (ptr_h_nc, ptr_d);
      ptrHost_ = ptr_h_nc;
    }
    {
      typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
      typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;

      entries_type ind_d = graph.getLocalGraph ().entries;
      nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
      Kokkos::deep_copy (ind_h_nc, ind_d);
      indHost_ = ind_h_nc;
    }

    // Allocate storage: one blockSize^2 patch of scalars per graph entry.
    const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
    val_ = decltype (val_) ("val", numValEnt);
  }
776 
  // Construct from a fill-complete mesh graph, caller-supplied domain
  // and range *point* maps, and a block size.  Unlike the two-argument
  // constructor, the point maps are taken as given rather than derived
  // from the graph.  Throws std::invalid_argument if the graph's rows
  // are not sorted or if blockSize is nonpositive.
  // NOTE(review): the constructor's declaration lines (presumably
  // "BlockCrsMatrix<Scalar, LO, GO, Node>::
  //  BlockCrsMatrix (const crs_graph_type& graph,") appear to have
  // been lost in extraction -- confirm against the original header.
  template<class Scalar, class LO, class GO, class Node>
                  const map_type& domainPointMap,
                  const map_type& rangePointMap,
                  const LO blockSize) :
    dist_object_type (graph.getMap ()),
    graph_ (graph),
    rowMeshMap_ (* (graph.getRowMap ())),
    domainPointMap_ (domainPointMap),
    rangePointMap_ (rangePointMap),
    blockSize_ (blockSize),
    X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
    pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
    offsetPerBlock_ (blockSize * blockSize),
    localError_ (new bool (false)),
    errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
  {
    TEUCHOS_TEST_FOR_EXCEPTION(
      ! graph_.isSorted (), std::invalid_argument, "Tpetra::Experimental::"
      "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
      "rows (isSorted() is false). This class assumes sorted rows.");

    graphRCP_ = Teuchos::rcpFromRef(graph_);

    // Trick to test whether LO is nonpositive, without a compiler
    // warning in case LO is unsigned (which is generally a bad idea
    // anyway). I don't promise that the trick works, but it
    // generally does with gcc at least, in my experience.
    const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
    TEUCHOS_TEST_FOR_EXCEPTION(
      blockSizeIsNonpositive, std::invalid_argument, "Tpetra::Experimental::"
      "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
      " <= 0. The block size must be positive.");

    // Keep host mirrors of the graph's row offsets and column indices,
    // so host-side methods can traverse the graph without touching
    // device memory.
    {
      typedef typename crs_graph_type::local_graph_type::row_map_type row_map_type;
      typedef typename row_map_type::HostMirror::non_const_type nc_host_row_map_type;

      row_map_type ptr_d = graph.getLocalGraph ().row_map;
      nc_host_row_map_type ptr_h_nc = Kokkos::create_mirror_view (ptr_d);
      Kokkos::deep_copy (ptr_h_nc, ptr_d);
      ptrHost_ = ptr_h_nc;
    }
    {
      typedef typename crs_graph_type::local_graph_type::entries_type entries_type;
      typedef typename entries_type::HostMirror::non_const_type nc_host_entries_type;

      entries_type ind_d = graph.getLocalGraph ().entries;
      nc_host_entries_type ind_h_nc = Kokkos::create_mirror_view (ind_d);
      Kokkos::deep_copy (ind_h_nc, ind_d);
      indHost_ = ind_h_nc;
    }

    // Allocate storage: one blockSize^2 patch of scalars per graph entry.
    const auto numValEnt = graph.getNodeNumEntries () * offsetPerBlock ();
    val_ = decltype (val_) ("val", numValEnt);
  }
835 
836  template<class Scalar, class LO, class GO, class Node>
837  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
840  { // Copy constructor of map_type does a shallow copy.
841  // We're only returning by RCP for backwards compatibility.
842  return Teuchos::rcp (new map_type (domainPointMap_));
843  }
844 
845  template<class Scalar, class LO, class GO, class Node>
846  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
848  getRangeMap () const
849  { // Copy constructor of map_type does a shallow copy.
850  // We're only returning by RCP for backwards compatibility.
851  return Teuchos::rcp (new map_type (rangePointMap_));
852  }
853 
854  template<class Scalar, class LO, class GO, class Node>
855  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
857  getRowMap () const
858  {
859  return graph_.getRowMap();
860  }
861 
862  template<class Scalar, class LO, class GO, class Node>
863  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
865  getColMap () const
866  {
867  return graph_.getColMap();
868  }
869 
870  template<class Scalar, class LO, class GO, class Node>
874  {
875  return graph_.getGlobalNumRows();
876  }
877 
878  template<class Scalar, class LO, class GO, class Node>
879  size_t
882  {
883  return graph_.getNodeNumRows();
884  }
885 
886  template<class Scalar, class LO, class GO, class Node>
887  size_t
890  {
891  return graph_.getNodeMaxNumRowEntries();
892  }
893 
// Point-wise mat-vec: Y := beta*Y + alpha*op(A)*X, where op(A) is A,
// A^T, or A^H depending on 'mode'.  X and Y are ordinary (point)
// MultiVectors; they are re-viewed here as BlockMultiVectors and the
// actual work is delegated to applyBlock().
894  template<class Scalar, class LO, class GO, class Node>
895  void
// NOTE(review): the extraction dropped the original
// "BlockCrsMatrix<Scalar, LO, GO, Node>::" qualifier line (896) here;
// restore it from upstream.
897  apply (const mv_type& X,
898  mv_type& Y,
899  Teuchos::ETransp mode,
900  Scalar alpha,
901  Scalar beta) const
902  {
// NOTE(review): original line 903 was dropped by the extraction --
// presumably a this_type typedef, since this_type is used by the
// const_cast below; confirm against upstream.
904  TEUCHOS_TEST_FOR_EXCEPTION(
905  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
906  std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::apply: "
907  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
908  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
909 
// View the point MultiVectors as block MultiVectors, using the
// graph's domain / range Maps and this matrix's block size.
910  BMV X_view;
911  BMV Y_view;
912  const LO blockSize = getBlockSize ();
913  try {
914  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
915  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
916  }
917  catch (std::invalid_argument& e) {
918  TEUCHOS_TEST_FOR_EXCEPTION(
919  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
920  "apply: Either the input MultiVector X or the output MultiVector Y "
921  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
922  "graph. BlockMultiVector's constructor threw the following exception: "
923  << e.what ());
924  }
925 
926  try {
927  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
928  // either that or mark fields of this class as 'mutable'. The
929  // problem is that applyBlock wants to do lazy initialization of
930  // temporary block multivectors.
931  const_cast<this_type*> (this)->applyBlock (X_view, Y_view, mode, alpha, beta);
// Each catch below re-throws with context identifying this method, so
// callers see where the failure originated.
932  } catch (std::invalid_argument& e) {
933  TEUCHOS_TEST_FOR_EXCEPTION(
934  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
935  "apply: The implementation method applyBlock complained about having "
936  "an invalid argument. It reported the following: " << e.what ());
937  } catch (std::logic_error& e) {
938  TEUCHOS_TEST_FOR_EXCEPTION(
939  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
940  "apply: The implementation method applyBlock complained about a "
941  "possible bug in its implementation. It reported the following: "
942  << e.what ());
943  } catch (std::exception& e) {
944  TEUCHOS_TEST_FOR_EXCEPTION(
945  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
946  "apply: The implementation method applyBlock threw an exception which "
947  "is neither std::invalid_argument nor std::logic_error, but is a "
948  "subclass of std::exception. It reported the following: "
949  << e.what ());
950  } catch (...) {
951  TEUCHOS_TEST_FOR_EXCEPTION(
952  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
953  "apply: The implementation method applyBlock threw an exception which "
954  "is not an instance of a subclass of std::exception. This probably "
955  "indicates a bug. Please report this to the Tpetra developers.");
956  }
957  }
958 
// Block mat-vec on BlockMultiVectors: validates that X and Y use the
// same block size, then dispatches on 'mode' to the no-transpose or
// (conjugate-)transpose implementation.
959  template<class Scalar, class LO, class GO, class Node>
960  void
// NOTE(review): the extraction dropped original lines 961-963 here --
// the "BlockCrsMatrix<...>::" qualifier and the first parameter lines
// (the X and Y BlockMultiVector arguments referenced in the body).
// Restore them from upstream.
964  Teuchos::ETransp mode,
965  const Scalar alpha,
966  const Scalar beta)
967  {
968  TEUCHOS_TEST_FOR_EXCEPTION(
969  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
970  "Tpetra::Experimental::BlockCrsMatrix::applyBlock: "
971  "X and Y have different block sizes. X.getBlockSize() = "
972  << X.getBlockSize () << " != Y.getBlockSize() = "
973  << Y.getBlockSize () << ".");
974 
// Dispatch: NO_TRANS and (CONJ_)TRANS have separate implementations.
975  if (mode == Teuchos::NO_TRANS) {
976  applyBlockNoTrans (X, Y, alpha, beta);
977  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
978  applyBlockTrans (X, Y, mode, alpha, beta);
979  } else {
980  TEUCHOS_TEST_FOR_EXCEPTION(
981  true, std::invalid_argument, "Tpetra::Experimental::BlockCrsMatrix::"
982  "applyBlock: Invalid 'mode' argument. Valid values are "
983  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
984  }
985  }
986 
987  template<class Scalar, class LO, class GO, class Node>
988  void
990  setAllToScalar (const Scalar& alpha)
991  {
992 #ifdef HAVE_TPETRA_DEBUG
993  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::setAllToScalar: ";
994 #endif // HAVE_TPETRA_DEBUG
995 
996  if (this->need_sync_device ()) {
997  // If we need to sync to device, then the data were last
998  // modified on host. In that case, we should again modify them
999  // on host.
1000 #ifdef HAVE_TPETRA_DEBUG
1001  TEUCHOS_TEST_FOR_EXCEPTION
1002  (this->need_sync_host (), std::runtime_error,
1003  prefix << "The matrix's values need sync on both device and host.");
1004 #endif // HAVE_TPETRA_DEBUG
1005  this->modify_host ();
1006  Kokkos::deep_copy (getValuesHost (), alpha);
1007  }
1008  else {
1009  // If we need to sync to host, then the data were last modified
1010  // on device. In that case, we should again modify them on
1011  // device. Also, prefer modifying on device if neither side is
1012  // marked as modified.
1013  this->modify_device ();
1014  Kokkos::deep_copy (this->getValuesDevice (), alpha);
1015  }
1016  }
1017 
// Replace the entries of zero or more blocks in local row localRowInd.
// colInds[k] is the local (mesh) column index of the k-th block to
// replace; vals is the concatenation of the new block values, one
// getBlockSize()*getBlockSize() chunk per column index.  Returns the
// number of column indices that were actually found (and replaced).
1018  template<class Scalar, class LO, class GO, class Node>
1019  LO
// NOTE(review): the extraction dropped the original
// "BlockCrsMatrix<...>::" qualifier line (1020) here; restore upstream.
1021  replaceLocalValues (const LO localRowInd,
1022  const LO colInds[],
1023  const Scalar vals[],
1024  const LO numColInds) const
1025  {
1026 #ifdef HAVE_TPETRA_DEBUG
1027  const char prefix[] =
1028  "Tpetra::Experimental::BlockCrsMatrix::replaceLocalValues: ";
1029 #endif // HAVE_TPETRA_DEBUG
1030 
1031  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1032  // We modified no values, because the input local row index is
1033  // invalid on the calling process. That may not be an error, if
1034  // numColInds is zero anyway; it doesn't matter. This is the
1035  // advantage of returning the number of valid indices.
1036  return static_cast<LO> (0);
1037  }
1038  const impl_scalar_type* const vIn =
1039  reinterpret_cast<const impl_scalar_type*> (vals);
1040  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1041  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1042  const LO perBlockSize = this->offsetPerBlock ();
1043  LO hint = 0; // Guess for the relative offset into the current row
1044  LO pointOffset = 0; // Current offset into input values
1045  LO validCount = 0; // number of valid column indices in colInds
1046 
1047 #ifdef HAVE_TPETRA_DEBUG
1048  TEUCHOS_TEST_FOR_EXCEPTION
1049  (this->need_sync_host (), std::runtime_error,
1050  prefix << "The matrix's data were last modified on device, but have "
1051  "not been sync'd to host. Please sync to host (by calling "
1052  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1053  "method.");
1054 #endif // HAVE_TPETRA_DEBUG
1055 
1056  auto vals_host_out = getValuesHost ();
1057  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1058 
// Walk the requested column indices; 'hint' exploits the fact that
// callers usually pass column indices in (sorted) row order, so each
// search can start where the previous one left off.
1059  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1060  const LO relBlockOffset =
1061  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1062  if (relBlockOffset != LINV) {
1063  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1064  // are stored contiguously, with no padding. "Contiguously"
1065  // means that all memory between the first and last entries
1066  // belongs to the block (no striding). "No padding" means
1067  // that getBlockSize() * getBlockSize() is exactly the number
1068  // of entries that the block uses. For another place where
1069  // this assumption is encoded, see sumIntoLocalValues.
1070 
1071  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1072  // little_block_type A_old =
1073  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1074  impl_scalar_type* const A_old =
1075  vals_host_out_raw + absBlockOffset * perBlockSize;
1076  // const_little_block_type A_new =
1077  // getConstLocalBlockFromInput (vIn, pointOffset);
1078  const impl_scalar_type* const A_new = vIn + pointOffset;
1079  // COPY (A_new, A_old);
1080  for (LO i = 0; i < perBlockSize; ++i) {
1081  A_old[i] = A_new[i];
1082  }
1083  hint = relBlockOffset + 1;
1084  ++validCount;
1085  }
1086  }
1087  return validCount;
1088  }
1089 
1090  template <class Scalar, class LO, class GO, class Node>
1091  void
1093  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
1094  Kokkos::MemoryUnmanaged>& offsets) const
1095  {
1096  graph_.getLocalDiagOffsets (offsets);
1097  }
1098 
1099 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated ArrayRCP overload of getLocalDiagOffsets.  Computes the
// offsets via the Kokkos::View overload of the graph's method, copying
// through a temporary device View when the device memory space is not
// host-accessible.
1100  template <class Scalar, class LO, class GO, class Node>
1101  void TPETRA_DEPRECATED
// NOTE(review): the extraction dropped the original
// "BlockCrsMatrix<...>::" qualifier line (1102) here; restore upstream.
1103  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
1104  {
1105  // mfh 19 Mar 2016: We plan to deprecate the ArrayRCP version of
1106  // this method in CrsGraph too, so don't call it (otherwise build
1107  // warnings will show up and annoy users). Instead, copy results
1108  // in and out, if the memory space requires it.
1109 
1110  const size_t lclNumRows = graph_.getNodeNumRows ();
// Grow the output array if the caller's allocation is too small.
1111  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
1112  offsets.resize (lclNumRows);
1113  }
1114 
1115  // The input ArrayRCP must always be a host pointer. Thus, if
1116  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
1117  // to write to that allocation directly as a Kokkos::View.
1118  typedef typename device_type::memory_space memory_space;
1119  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
1120  // It is always syntactically correct to assign a raw host
1121  // pointer to a device View, so this code will compile correctly
1122  // even if this branch never runs.
1123  typedef Kokkos::View<size_t*, device_type,
1124  Kokkos::MemoryUnmanaged> output_type;
1125  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1126  graph_.getLocalDiagOffsets (offsetsOut);
1127  }
1128  else {
// Device memory is not host memory: compute into a device temporary,
// then deep_copy the result back into the caller's host allocation.
1129  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
1130  graph_.getLocalDiagOffsets (offsetsTmp);
1131  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
1132  Kokkos::MemoryUnmanaged> output_type;
1133  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
1134  Kokkos::deep_copy (offsetsOut, offsetsTmp);
1135  }
1136  }
1137 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1138 
// Local (process-local) block Gauss-Seidel / SOR sweep:
//   X := (1-omega)*X + omega * D_inv * (B - (A - D)*X),
// applied row by row in the order given by 'direction'.  D_inv holds
// the explicit inverses of the diagonal blocks, one blockSize x
// blockSize matrix per local mesh row.  Symmetric does a Forward then
// a Backward sweep.
1139  template <class Scalar, class LO, class GO, class Node>
1140  void
// NOTE(review): the extraction dropped original lines 1141-1143 here --
// the "BlockCrsMatrix<...>::" qualifier, the method name
// (localGaussSeidel), and the leading B and X BlockMultiVector
// parameters referenced in the body.  Restore them from upstream.
1144  const Kokkos::View<impl_scalar_type***, device_type,
1145  Kokkos::MemoryUnmanaged>& D_inv,
1146  const Scalar& omega,
1147  const ESweepDirection direction) const
1148  {
1149  using Kokkos::ALL;
1150  const impl_scalar_type zero =
1151  Kokkos::Details::ArithTraits<impl_scalar_type>::zero ();
1152  const impl_scalar_type one =
1153  Kokkos::Details::ArithTraits<impl_scalar_type>::one ();
1154  const LO numLocalMeshRows =
1155  static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1156  const LO numVecs = static_cast<LO> (X.getNumVectors ());
1157 
1158  // If using (new) Kokkos, replace localMem with thread-local
1159  // memory. Note that for larger block sizes, this will affect the
1160  // two-level parallelization. Look to Stokhos for best practice
1161  // on making this fast for GPUs.
1162  const LO blockSize = getBlockSize ();
1163  Teuchos::Array<impl_scalar_type> localMem (blockSize);
1164  Teuchos::Array<impl_scalar_type> localMat (blockSize*blockSize);
1165  little_vec_type X_lcl (localMem.getRawPtr (), blockSize);
1166 
// Row iteration bounds are 1-based (actual row = lclRow - 1) so that
// a Backward sweep can terminate at rowEnd == 0 even if LO is
// unsigned... mostly.  See the FIXME below.
1167  // FIXME (mfh 12 Aug 2014) This probably won't work if LO is unsigned.
1168  LO rowBegin = 0, rowEnd = 0, rowStride = 0;
1169  if (direction == Forward) {
1170  rowBegin = 1;
1171  rowEnd = numLocalMeshRows+1;
1172  rowStride = 1;
1173  }
1174  else if (direction == Backward) {
1175  rowBegin = numLocalMeshRows;
1176  rowEnd = 0;
1177  rowStride = -1;
1178  }
1179  else if (direction == Symmetric) {
// Symmetric sweep = Forward sweep followed by Backward sweep.
1180  this->localGaussSeidel (B, X, D_inv, omega, Forward);
1181  this->localGaussSeidel (B, X, D_inv, omega, Backward);
1182  return;
1183  }
1184 
1185  const Scalar one_minus_omega = Teuchos::ScalarTraits<Scalar>::one()-omega;
1186  const Scalar minus_omega = -omega;
1187 
// Single-vector case is split out from the multi-vector case; the two
// loops below are otherwise identical in structure.
1188  if (numVecs == 1) {
1189  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1190  const LO actlRow = lclRow - 1;
1191 
// X_lcl := omega * B(actlRow)
1192  little_vec_type B_cur = B.getLocalBlock (actlRow, 0);
1193  COPY (B_cur, X_lcl);
1194  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1195 
// Accumulate X_lcl += alpha * A(actlRow, meshCol) * X(meshCol), where
// alpha is (1-omega) on the diagonal block and -omega off-diagonal.
1196  const size_t meshBeg = ptrHost_[actlRow];
1197  const size_t meshEnd = ptrHost_[actlRow+1];
1198  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1199  const LO meshCol = indHost_[absBlkOff];
1200  const_little_block_type A_cur =
1201  getConstLocalBlockFromAbsOffset (absBlkOff);
1202  little_vec_type X_cur = X.getLocalBlock (meshCol, 0);
1203 
1204  // X_lcl += alpha*A_cur*X_cur
1205  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1206  //X_lcl.matvecUpdate (alpha, A_cur, X_cur);
1207  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1208  } // for each entry in the current local row of the matrix
1209 
1210  // NOTE (mfh 20 Jan 2016) The two input Views here are
1211  // unmanaged already, so we don't have to take unmanaged
1212  // subviews first.
// X(actlRow) := D_inv(actlRow) * X_lcl  (zeroed first, then GEMV).
1213  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1214  little_vec_type X_update = X.getLocalBlock (actlRow, 0);
1215  FILL (X_update, zero);
1216  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1217  } // for each local row of the matrix
1218  }
1219  else {
// Multi-vector case: same update, repeated for each column j of X/B.
1220  for (LO lclRow = rowBegin; lclRow != rowEnd; lclRow += rowStride) {
1221  for (LO j = 0; j < numVecs; ++j) {
1222  LO actlRow = lclRow-1;
1223 
1224  little_vec_type B_cur = B.getLocalBlock (actlRow, j);
1225  COPY (B_cur, X_lcl);
1226  SCAL (static_cast<impl_scalar_type> (omega), X_lcl);
1227 
1228  const size_t meshBeg = ptrHost_[actlRow];
1229  const size_t meshEnd = ptrHost_[actlRow+1];
1230  for (size_t absBlkOff = meshBeg; absBlkOff < meshEnd; ++absBlkOff) {
1231  const LO meshCol = indHost_[absBlkOff];
1232  const_little_block_type A_cur =
1233  getConstLocalBlockFromAbsOffset (absBlkOff);
1234  little_vec_type X_cur = X.getLocalBlock (meshCol, j);
1235 
1236  // X_lcl += alpha*A_cur*X_cur
1237  const Scalar alpha = meshCol == actlRow ? one_minus_omega : minus_omega;
1238  GEMV (static_cast<impl_scalar_type> (alpha), A_cur, X_cur, X_lcl);
1239  } // for each entry in the current local row of the matrx
1240 
1241  auto D_lcl = Kokkos::subview (D_inv, actlRow, ALL (), ALL ());
1242  auto X_update = X.getLocalBlock (actlRow, j);
1243  FILL (X_update, zero);
1244  GEMV (one, D_lcl, X_lcl, X_update); // overwrite X_update
1245  } // for each entry in the current local row of the matrix
1246  } // for each local row of the matrix
1247  }
1248  }
1249 
1250  template <class Scalar, class LO, class GO, class Node>
1251  void
1254  const MultiVector<Scalar,LO,GO,Node> &/* B */,
1255  const MultiVector<Scalar,LO,GO,Node> &/* D */,
1256  const Scalar& /* dampingFactor */,
1257  const ESweepDirection /* direction */,
1258  const int /* numSweeps */,
1259  const bool /* zeroInitialGuess */) const
1260  {
1261  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1262  // interface for block Gauss-Seidel.
1263  TEUCHOS_TEST_FOR_EXCEPTION(
1264  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1265  "gaussSeidelCopy: Not implemented.");
1266  }
1267 
1268  template <class Scalar, class LO, class GO, class Node>
1269  void
1272  const MultiVector<Scalar,LO,GO,Node>& /* B */,
1273  const MultiVector<Scalar,LO,GO,Node>& /* D */,
1274  const Teuchos::ArrayView<LO>& /* rowIndices */,
1275  const Scalar& /* dampingFactor */,
1276  const ESweepDirection /* direction */,
1277  const int /* numSweeps */,
1278  const bool /* zeroInitialGuess */) const
1279  {
1280  // FIXME (mfh 12 Aug 2014) This method has entirely the wrong
1281  // interface for block Gauss-Seidel.
1282  TEUCHOS_TEST_FOR_EXCEPTION(
1283  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::"
1284  "reorderedGaussSeidelCopy: Not implemented.");
1285  }
1286 
1287 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// Deprecated overload: copy this matrix's diagonal blocks into the
// (block-diagonal) BlockCrsMatrix 'diag', using precomputed per-row
// diagonal offsets.  Rows whose offset is invalid (no diagonal entry)
// get a zero block.
1288  template <class Scalar, class LO, class GO, class Node>
1289  void TPETRA_DEPRECATED
// NOTE(review): the extraction dropped original lines 1290-1291 here --
// the "BlockCrsMatrix<...>::" qualifier, the method name
// (getLocalDiagCopy), and the 'diag' output-matrix parameter used
// below.  Restore them from upstream.
1292  const Teuchos::ArrayView<const size_t>& offsets) const
1293  {
1294  using Teuchos::ArrayRCP;
1295  using Teuchos::ArrayView;
1296  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
1297 
1298  const size_t myNumRows = rowMeshMap_.getNodeNumElements();
1299  const LO* columnIndices;
1300  Scalar* vals;
1301  LO numColumns;
1302  Teuchos::Array<LO> cols(1);
1303 
1304  // FIXME (mfh 12 Aug 2014) Should use a "little block" for this instead.
1305  Teuchos::Array<Scalar> zeroMat (blockSize_*blockSize_, ZERO);
1306  for (size_t i = 0; i < myNumRows; ++i) {
// The diagonal block of row i goes in column i of 'diag'.
1307  cols[0] = i;
1308  if (offsets[i] == Teuchos::OrdinalTraits<size_t>::invalid ()) {
// No diagonal entry in this row: store an explicit zero block.
1309  diag.replaceLocalValues (i, cols.getRawPtr (), zeroMat.getRawPtr (), 1);
1310  }
1311  else {
// offsets[i] is the block offset of the diagonal within row i's
// values; each block occupies blockSize_*blockSize_ scalars.
1312  getLocalRowView (i, columnIndices, vals, numColumns);
1313  diag.replaceLocalValues (i, cols.getRawPtr(), &vals[offsets[i]*blockSize_*blockSize_], 1);
1314  }
1315  }
1316  }
1317 #endif // TPETRA_ENABLE_DEPRECATED_CODE
1318 
// Copy this matrix's diagonal blocks into the rank-3 device View
// 'diag' (local mesh row x block row x block column), using
// precomputed per-row diagonal offsets.  Runs as a Kokkos
// parallel_for over local mesh rows on the device execution space.
1319  template <class Scalar, class LO, class GO, class Node>
1320  void
// NOTE(review): the extraction dropped the original
// "BlockCrsMatrix<...>::" qualifier line (1321) here; restore upstream.
1322  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1323  Kokkos::MemoryUnmanaged>& diag,
1324  const Kokkos::View<const size_t*, device_type,
1325  Kokkos::MemoryUnmanaged>& offsets) const
1326  {
1327  using Kokkos::parallel_for;
1328  typedef typename device_type::execution_space execution_space;
1329  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
1330 
// Validate output extents and offsets length before launching.
1331  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1332  const LO blockSize = this->getBlockSize ();
1333  TEUCHOS_TEST_FOR_EXCEPTION
1334  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1335  static_cast<LO> (diag.extent (1)) < blockSize ||
1336  static_cast<LO> (diag.extent (2)) < blockSize,
1337  std::invalid_argument, prefix <<
1338  "The input Kokkos::View is not big enough to hold all the data.");
1339  TEUCHOS_TEST_FOR_EXCEPTION
1340  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
1341  prefix << "offsets.size() = " << offsets.size () << " < local number of "
1342  "diagonal blocks " << lclNumMeshRows << ".");
1343 
1344 #ifdef HAVE_TPETRA_DEBUG
1345  TEUCHOS_TEST_FOR_EXCEPTION
1346  (this->template need_sync<device_type> (), std::runtime_error,
1347  prefix << "The matrix's data were last modified on host, but have "
1348  "not been sync'd to device. Please sync to device (by calling "
1349  "sync<device_type>() on this matrix) before calling this method.");
1350 #endif // HAVE_TPETRA_DEBUG
1351 
1352  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
1353  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
1354 
1355  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
1356  // we reserve the right to do lazy allocation of device data. (We
1357  // don't plan to do lazy allocation for host data; the host
1358  // version of the data always exists.)
// NOTE(review): original line 1359 was dropped by the extraction --
// presumably a this_type typedef, since this_type is used by the
// const_cast below; confirm against upstream.
1360  auto vals_dev =
1361  const_cast<this_type*> (this)->template getValues<device_type> ();
1362 
1363  parallel_for (policy_type (0, lclNumMeshRows),
1364  functor_type (diag, vals_dev, offsets,
1365  graph_.getLocalGraph ().row_map, blockSize_));
1366  }
1367 
1368 
// Copy this matrix's diagonal blocks into the rank-3 View 'diag',
// taking the per-row diagonal offsets as a host ArrayView.  Runs the
// copy loop on the host execution space (see the mfh note below).
1369  template <class Scalar, class LO, class GO, class Node>
1370  void
// NOTE(review): the extraction dropped the original
// "BlockCrsMatrix<...>::" qualifier line (1371) here; restore upstream.
1372  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
1373  Kokkos::MemoryUnmanaged>& diag,
1374  const Teuchos::ArrayView<const size_t>& offsets) const
1375  {
1376  using Kokkos::ALL;
1377  using Kokkos::parallel_for;
1378  typedef typename Kokkos::View<impl_scalar_type***, device_type,
1379  Kokkos::MemoryUnmanaged>::HostMirror::execution_space host_exec_space;
1380 
// Validate output extents and offsets length before the copy loop.
1381  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getNodeNumElements ());
1382  const LO blockSize = this->getBlockSize ();
1383  TEUCHOS_TEST_FOR_EXCEPTION
1384  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
1385  static_cast<LO> (diag.extent (1)) < blockSize ||
1386  static_cast<LO> (diag.extent (2)) < blockSize,
1387  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1388  "The input Kokkos::View is not big enough to hold all the data.");
1389  TEUCHOS_TEST_FOR_EXCEPTION
1390  (static_cast<LO> (offsets.size ()) < lclNumMeshRows,
1391  std::invalid_argument, "Tpetra::BlockCrsMatrix::getLocalDiagCopy: "
1392  "offsets.size() = " << offsets.size () << " < local number of diagonal "
1393  "blocks " << lclNumMeshRows << ".");
1394 
1395  // mfh 12 Dec 2015: Use the host execution space, since we haven't
1396  // quite made everything work with CUDA yet.
1397  typedef Kokkos::RangePolicy<host_exec_space, LO> policy_type;
// Per row: view the diagonal block at offsets[lclMeshRow] and copy it
// into the corresponding slice of 'diag'.
1398  parallel_for (policy_type (0, lclNumMeshRows), [=] (const LO& lclMeshRow) {
1399  auto D_in = this->getConstLocalBlockFromRelOffset (lclMeshRow, offsets[lclMeshRow]);
1400  auto D_out = Kokkos::subview (diag, lclMeshRow, ALL (), ALL ());
1401  COPY (D_in, D_out);
1402  });
1403  }
1404 
1405 
// For each found block in local row localRowInd, combine the stored
// block with the corresponding input block via the element-wise
// absolute-max operation (Impl::absMax).  Input layout matches
// replaceLocalValues: one getBlockSize()^2 chunk of vals per entry of
// colInds.  Returns the number of column indices actually found.
1406  template<class Scalar, class LO, class GO, class Node>
1407  LO
// NOTE(review): the extraction dropped the original
// "BlockCrsMatrix<...>::" qualifier line (1408) here; restore upstream.
1409  absMaxLocalValues (const LO localRowInd,
1410  const LO colInds[],
1411  const Scalar vals[],
1412  const LO numColInds) const
1413  {
1414  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1415  // We modified no values, because the input local row index is
1416  // invalid on the calling process. That may not be an error, if
1417  // numColInds is zero anyway; it doesn't matter. This is the
1418  // advantage of returning the number of valid indices.
1419  return static_cast<LO> (0);
1420  }
1421  const impl_scalar_type* const vIn =
1422  reinterpret_cast<const impl_scalar_type*> (vals);
1423  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1424  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1425  const LO perBlockSize = this->offsetPerBlock ();
1426  LO hint = 0; // Guess for the relative offset into the current row
1427  LO pointOffset = 0; // Current offset into input values
1428  LO validCount = 0; // number of valid column indices in colInds
1429 
// 'hint' lets each search start where the previous one ended, which
// is fast when colInds is sorted like the row itself.
1430  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1431  const LO relBlockOffset =
1432  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1433  if (relBlockOffset != LINV) {
1434  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1435  little_block_type A_old =
1436  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1437  const_little_block_type A_new =
1438  getConstLocalBlockFromInput (vIn, pointOffset);
1439 
// A_old := elementwise absolute max of (A_old, A_new).
1440  ::Tpetra::Experimental::Impl::absMax (A_old, A_new);
1441  hint = relBlockOffset + 1;
1442  ++validCount;
1443  }
1444  }
1445  return validCount;
1446  }
1447 
1448 
// Add ("sum into") the given block values to zero or more existing
// blocks in local row localRowInd.  Input layout matches
// replaceLocalValues: one getBlockSize()^2 chunk of vals per entry of
// colInds.  Returns the number of column indices actually found.
1449  template<class Scalar, class LO, class GO, class Node>
1450  LO
// NOTE(review): the extraction dropped the original
// "BlockCrsMatrix<...>::" qualifier line (1451) here; restore upstream.
1452  sumIntoLocalValues (const LO localRowInd,
1453  const LO colInds[],
1454  const Scalar vals[],
1455  const LO numColInds) const
1456  {
1457 #ifdef HAVE_TPETRA_DEBUG
1458  const char prefix[] =
1459  "Tpetra::Experimental::BlockCrsMatrix::sumIntoLocalValues: ";
1460 #endif // HAVE_TPETRA_DEBUG
1461 
1462  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1463  // We modified no values, because the input local row index is
1464  // invalid on the calling process. That may not be an error, if
1465  // numColInds is zero anyway; it doesn't matter. This is the
1466  // advantage of returning the number of valid indices.
1467  return static_cast<LO> (0);
1468  }
1469  //const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1470  const impl_scalar_type* const vIn =
1471  reinterpret_cast<const impl_scalar_type*> (vals);
1472  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1473  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1474  const LO perBlockSize = this->offsetPerBlock ();
1475  LO hint = 0; // Guess for the relative offset into the current row
1476  LO pointOffset = 0; // Current offset into input values
1477  LO validCount = 0; // number of valid column indices in colInds
1478 
1479 #ifdef HAVE_TPETRA_DEBUG
1480  TEUCHOS_TEST_FOR_EXCEPTION
1481  (this->need_sync_host (), std::runtime_error,
1482  prefix << "The matrix's data were last modified on device, but have not "
1483  "been sync'd to host. Please sync to host (by calling "
1484  "sync<Kokkos::HostSpace>() on this matrix) before calling this method.");
1485 #endif // HAVE_TPETRA_DEBUG
1486 
1487  auto vals_host_out =
1488  getValuesHost ();
1489  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1490 
// 'hint' lets each search start where the previous one ended, which
// is fast when colInds is sorted like the row itself.
1491  for (LO k = 0; k < numColInds; ++k, pointOffset += perBlockSize) {
1492  const LO relBlockOffset =
1493  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1494  if (relBlockOffset != LINV) {
1495  // mfh 21 Dec 2015: Here we encode the assumption that blocks
1496  // are stored contiguously, with no padding. "Contiguously"
1497  // means that all memory between the first and last entries
1498  // belongs to the block (no striding). "No padding" means
1499  // that getBlockSize() * getBlockSize() is exactly the number
1500  // of entries that the block uses. For another place where
1501  // this assumption is encoded, see replaceLocalValues.
1502 
1503  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1504  // little_block_type A_old =
1505  // getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1506  impl_scalar_type* const A_old =
1507  vals_host_out_raw + absBlockOffset * perBlockSize;
1508  // const_little_block_type A_new =
1509  // getConstLocalBlockFromInput (vIn, pointOffset);
1510  const impl_scalar_type* const A_new = vIn + pointOffset;
1511  // AXPY (ONE, A_new, A_old);
// A_old += A_new, element by element over the whole block.
1512  for (LO i = 0; i < perBlockSize; ++i) {
1513  A_old[i] += A_new[i];
1514  }
1515  hint = relBlockOffset + 1;
1516  ++validCount;
1517  }
1518  }
1519  return validCount;
1520  }
1521 
// Return (by reference parameters) a view of local row localRowInd:
// colInds points at the row's local (mesh) column indices, vals at
// the row's block values (numInds blocks of getBlockSize()^2 scalars
// each), and numInds is the number of blocks in the row.  Returns 0
// on success, or Teuchos::OrdinalTraits<LO>::invalid() if the row is
// not owned by the calling process (in which case the outputs are
// NULL / 0).
1522  template<class Scalar, class LO, class GO, class Node>
1523  LO
// NOTE(review): the extraction dropped the original
// "BlockCrsMatrix<...>::" qualifier line (1524) here; restore upstream.
1525  getLocalRowView (const LO localRowInd,
1526  const LO*& colInds,
1527  Scalar*& vals,
1528  LO& numInds) const
1529  {
1530 #ifdef HAVE_TPETRA_DEBUG
1531  const char prefix[] =
1532  "Tpetra::Experimental::BlockCrsMatrix::getLocalRowView: ";
1533 #endif // HAVE_TPETRA_DEBUG
1534 
1535  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
// Row not owned locally: null out the outputs and report "invalid".
1536  colInds = NULL;
1537  vals = NULL;
1538  numInds = 0;
1539  return Teuchos::OrdinalTraits<LO>::invalid ();
1540  }
1541  else {
1542  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
1543  colInds = indHost_.data () + absBlockOffsetStart;
1544 
1545 #ifdef HAVE_TPETRA_DEBUG
1546  TEUCHOS_TEST_FOR_EXCEPTION
1547  (this->need_sync_host (), std::runtime_error,
1548  prefix << "The matrix's data were last modified on device, but have "
1549  "not been sync'd to host. Please sync to host (by calling "
1550  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
1551  "method.");
1552 #endif // HAVE_TPETRA_DEBUG
1553 
// Each block occupies offsetPerBlock() consecutive scalars, so the
// row's values start at block offset * offsetPerBlock().
1554  auto vals_host_out = getValuesHost ();
1555  impl_scalar_type* vals_host_out_raw = vals_host_out.data ();
1556  impl_scalar_type* const vOut = vals_host_out_raw +
1557  absBlockOffsetStart * offsetPerBlock ();
1558  vals = reinterpret_cast<Scalar*> (vOut);
1559 
1560  numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
1561  return 0; // indicates no error
1562  }
1563  }
1564 
1565  template<class Scalar, class LO, class GO, class Node>
1566  void
1568  getLocalRowCopy (LO LocalRow,
1569  const Teuchos::ArrayView<LO>& Indices,
1570  const Teuchos::ArrayView<Scalar>& Values,
1571  size_t &NumEntries) const
1572  {
1573  const LO *colInds;
1574  Scalar *vals;
1575  LO numInds;
1576  getLocalRowView(LocalRow,colInds,vals,numInds);
1577  if (numInds > Indices.size() || numInds*blockSize_*blockSize_ > Values.size()) {
1578  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
1579  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
1580  << numInds << " row entries");
1581  }
1582  for (LO i=0; i<numInds; ++i) {
1583  Indices[i] = colInds[i];
1584  }
1585  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
1586  Values[i] = vals[i];
1587  }
1588  NumEntries = numInds;
1589  }
1590 
1591  template<class Scalar, class LO, class GO, class Node>
1592  LO
1594  getLocalRowOffsets (const LO localRowInd,
1595  ptrdiff_t offsets[],
1596  const LO colInds[],
1597  const LO numColInds) const
1598  {
1599  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1600  // We got no offsets, because the input local row index is
1601  // invalid on the calling process. That may not be an error, if
1602  // numColInds is zero anyway; it doesn't matter. This is the
1603  // advantage of returning the number of valid indices.
1604  return static_cast<LO> (0);
1605  }
1606 
1607  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
1608  LO hint = 0; // Guess for the relative offset into the current row
1609  LO validCount = 0; // number of valid column indices in colInds
1610 
1611  for (LO k = 0; k < numColInds; ++k) {
1612  const LO relBlockOffset =
1613  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
1614  if (relBlockOffset != LINV) {
1615  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
1616  hint = relBlockOffset + 1;
1617  ++validCount;
1618  }
1619  }
1620  return validCount;
1621  }
1622 
1623 
// Like replaceLocalValues, but the caller supplies precomputed
// relative block offsets (e.g. from getLocalRowOffsets) instead of
// column indices, so no searching is needed.  Offsets equal to
// Teuchos::OrdinalTraits<size_t>::invalid() are skipped.  Returns the
// number of valid offsets processed.
1624  template<class Scalar, class LO, class GO, class Node>
1625  LO
// NOTE(review): the extraction dropped the original
// "BlockCrsMatrix<...>::" qualifier line (1626) here; restore upstream.
1627  replaceLocalValuesByOffsets (const LO localRowInd,
1628  const ptrdiff_t offsets[],
1629  const Scalar vals[],
1630  const LO numOffsets) const
1631  {
1632  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1633  // We modified no values, because the input local row index is
1634  // invalid on the calling process. That may not be an error, if
1635  // numColInds is zero anyway; it doesn't matter. This is the
1636  // advantage of returning the number of valid indices.
1637  return static_cast<LO> (0);
1638  }
1639  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1640 
// NOTE(review): the static_cast<LO> on the next line narrows
// offsetPerBlock() to LO before widening back to size_t -- harmless
// for sane block sizes, but the cast looks unintended; confirm.
1641  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1642  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1643  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1644  size_t pointOffset = 0; // Current offset into input values
1645  LO validCount = 0; // number of valid offsets
1646 
1647  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1648  const size_t relBlockOffset = offsets[k];
1649  if (relBlockOffset != STINV) {
1650  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1651  little_block_type A_old =
1652  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1653  const_little_block_type A_new =
1654  getConstLocalBlockFromInput (vIn, pointOffset);
// Overwrite the stored block with the caller's block.
1655  COPY (A_new, A_old);
1656  ++validCount;
1657  }
1658  }
1659  return validCount;
1660  }
1661 
1662 
1663  template<class Scalar, class LO, class GO, class Node>
1664  LO
1666  absMaxLocalValuesByOffsets (const LO localRowInd,
1667  const ptrdiff_t offsets[],
1668  const Scalar vals[],
1669  const LO numOffsets) const
1670  {
1671  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1672  // We modified no values, because the input local row index is
1673  // invalid on the calling process. That may not be an error, if
1674  // numColInds is zero anyway; it doesn't matter. This is the
1675  // advantage of returning the number of valid indices.
1676  return static_cast<LO> (0);
1677  }
1678  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1679 
1680  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1681  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1682  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1683  size_t pointOffset = 0; // Current offset into input values
1684  LO validCount = 0; // number of valid offsets
1685 
1686  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1687  const size_t relBlockOffset = offsets[k];
1688  if (relBlockOffset != STINV) {
1689  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1690  little_block_type A_old =
1691  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1692  const_little_block_type A_new =
1693  getConstLocalBlockFromInput (vIn, pointOffset);
1694  ::Tpetra::Experimental::Impl::absMax (A_old, A_new);
1695  ++validCount;
1696  }
1697  }
1698  return validCount;
1699  }
1700 
1701 
1702  template<class Scalar, class LO, class GO, class Node>
1703  LO
1705  sumIntoLocalValuesByOffsets (const LO localRowInd,
1706  const ptrdiff_t offsets[],
1707  const Scalar vals[],
1708  const LO numOffsets) const
1709  {
1710  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
1711  // We modified no values, because the input local row index is
1712  // invalid on the calling process. That may not be an error, if
1713  // numColInds is zero anyway; it doesn't matter. This is the
1714  // advantage of returning the number of valid indices.
1715  return static_cast<LO> (0);
1716  }
1717  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
1718  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
1719 
1720  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1721  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
1722  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
1723  size_t pointOffset = 0; // Current offset into input values
1724  LO validCount = 0; // number of valid offsets
1725 
1726  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
1727  const size_t relBlockOffset = offsets[k];
1728  if (relBlockOffset != STINV) {
1729  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1730  little_block_type A_old =
1731  getNonConstLocalBlockFromAbsOffset (absBlockOffset);
1732  const_little_block_type A_new =
1733  getConstLocalBlockFromInput (vIn, pointOffset);
1734  //A_old.update (ONE, A_new);
1735  AXPY (ONE, A_new, A_old);
1736  ++validCount;
1737  }
1738  }
1739  return validCount;
1740  }
1741 
1742 
1743  template<class Scalar, class LO, class GO, class Node>
1744  size_t
1746  getNumEntriesInLocalRow (const LO localRowInd) const
1747  {
1748  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
1749  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
1750  return static_cast<LO> (0); // the calling process doesn't have that row
1751  } else {
1752  return static_cast<LO> (numEntInGraph);
1753  }
1754  }
1755 
1756  template<class Scalar, class LO, class GO, class Node>
1757  void
1761  const Teuchos::ETransp mode,
1762  const Scalar alpha,
1763  const Scalar beta)
1764  {
1765  (void) X;
1766  (void) Y;
1767  (void) mode;
1768  (void) alpha;
1769  (void) beta;
1770 
1771  TEUCHOS_TEST_FOR_EXCEPTION(
1772  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::apply: "
1773  "transpose and conjugate transpose modes are not yet implemented.");
1774  }
1775 
1776  template<class Scalar, class LO, class GO, class Node>
1777  void
1778  BlockCrsMatrix<Scalar, LO, GO, Node>::
1779  applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1780  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1781  const Scalar alpha,
1782  const Scalar beta)
1783  {
1784  using Teuchos::RCP;
1785  using Teuchos::rcp;
1786  typedef ::Tpetra::Import<LO, GO, Node> import_type;
1787  typedef ::Tpetra::Export<LO, GO, Node> export_type;
1788  const Scalar zero = STS::zero ();
1789  const Scalar one = STS::one ();
1790  RCP<const import_type> import = graph_.getImporter ();
1791  // "export" is a reserved C++ keyword, so we can't use it.
1792  RCP<const export_type> theExport = graph_.getExporter ();
1793  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1794 
1795  if (alpha == zero) {
1796  if (beta == zero) {
1797  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1798  }
1799  else if (beta != one) {
1800  Y.scale (beta);
1801  }
1802  }
1803  else { // alpha != 0
1804  const BMV* X_colMap = NULL;
1805  if (import.is_null ()) {
1806  try {
1807  X_colMap = &X;
1808  }
1809  catch (std::exception& e) {
1810  TEUCHOS_TEST_FOR_EXCEPTION
1811  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1812  "operator= threw an exception: " << std::endl << e.what ());
1813  }
1814  }
1815  else {
1816  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1817  // Y_rowMap_ below. This lets us do lazy initialization
1818  // correctly with view semantics of BlockCrsMatrix. All views
1819  // of this BlockCrsMatrix have the same outer pointer. That
1820  // way, we can set the inner pointer in one view, and all
1821  // other views will see it.
1822  if ((*X_colMap_).is_null () ||
1823  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1824  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1825  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1826  static_cast<LO> (X.getNumVectors ())));
1827  }
1828 #ifdef HAVE_TPETRA_BCRS_DO_POINT_IMPORT
1829  if (pointImporter_->is_null ()) {
1830  // The Import ctor needs RCPs. Map's copy ctor does a shallow copy, so
1831  // these are small operations.
1832  const auto domainPointMap = rcp (new typename BMV::map_type (domainPointMap_));
1833  const auto colPointMap = rcp (new typename BMV::map_type (
1834  BMV::makePointMap (*graph_.getColMap(),
1835  blockSize_)));
1836  *pointImporter_ = rcp (new typename crs_graph_type::import_type (
1837  domainPointMap, colPointMap));
1838  }
1839  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1840  **pointImporter_,
1842 #else
1843  (**X_colMap_).doImport (X, *import, ::Tpetra::REPLACE);
1844 #endif
1845  try {
1846  X_colMap = &(**X_colMap_);
1847  }
1848  catch (std::exception& e) {
1849  TEUCHOS_TEST_FOR_EXCEPTION
1850  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1851  "operator= threw an exception: " << std::endl << e.what ());
1852  }
1853  }
1854 
1855  BMV* Y_rowMap = NULL;
1856  if (theExport.is_null ()) {
1857  Y_rowMap = &Y;
1858  }
1859  else if ((*Y_rowMap_).is_null () ||
1860  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1861  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1862  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1863  static_cast<LO> (X.getNumVectors ())));
1864  try {
1865  Y_rowMap = &(**Y_rowMap_);
1866  }
1867  catch (std::exception& e) {
1868  TEUCHOS_TEST_FOR_EXCEPTION(
1869  true, std::logic_error, prefix << "Tpetra::MultiVector::"
1870  "operator= threw an exception: " << std::endl << e.what ());
1871  }
1872  }
1873 
1874  try {
1875  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1876  }
1877  catch (std::exception& e) {
1878  TEUCHOS_TEST_FOR_EXCEPTION
1879  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1880  "an exception: " << e.what ());
1881  }
1882  catch (...) {
1883  TEUCHOS_TEST_FOR_EXCEPTION
1884  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1885  "an exception not a subclass of std::exception.");
1886  }
1887 
1888  if (! theExport.is_null ()) {
1889  Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1890  }
1891  }
1892  }
1893 
1894  template<class Scalar, class LO, class GO, class Node>
1895  void
1896  BlockCrsMatrix<Scalar, LO, GO, Node>::
1897  localApplyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
1898  BlockMultiVector<Scalar, LO, GO, Node>& Y,
1899  const Scalar alpha,
1900  const Scalar beta)
1901  {
1902  using ::Tpetra::Experimental::Impl::bcrsLocalApplyNoTrans;
1903 
1904  const impl_scalar_type alpha_impl = alpha;
1905  const auto graph = this->graph_.getLocalGraph ();
1906  const impl_scalar_type beta_impl = beta;
1907  const LO blockSize = this->getBlockSize ();
1908 
1909  auto X_mv = X.getMultiVectorView ();
1910  auto Y_mv = Y.getMultiVectorView ();
1911  Y_mv.template modify<device_type> ();
1912 
1913  auto X_lcl = X_mv.template getLocalView<device_type> ();
1914  auto Y_lcl = Y_mv.template getLocalView<device_type> ();
1915  auto val = this->val_.template view<device_type> ();
1916 
1917  bcrsLocalApplyNoTrans (alpha_impl, graph, val, blockSize, X_lcl,
1918  beta_impl, Y_lcl);
1919  }
1920 
1921  template<class Scalar, class LO, class GO, class Node>
1922  LO
1923  BlockCrsMatrix<Scalar, LO, GO, Node>::
1924  findRelOffsetOfColumnIndex (const LO localRowIndex,
1925  const LO colIndexToFind,
1926  const LO hint) const
1927  {
1928  const size_t absStartOffset = ptrHost_[localRowIndex];
1929  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1930  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1931  // Amortize pointer arithmetic over the search loop.
1932  const LO* const curInd = indHost_.data () + absStartOffset;
1933 
1934  // If the hint was correct, then the hint is the offset to return.
1935  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1936  // Always return the offset relative to the current row.
1937  return hint;
1938  }
1939 
1940  // The hint was wrong, so we must search for the given column
1941  // index in the column indices for the given row.
1942  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1943 
1944  // We require that the graph have sorted rows. However, binary
1945  // search only pays if the current row is longer than a certain
1946  // amount. We set this to 32, but you might want to tune this.
1947  const LO maxNumEntriesForLinearSearch = 32;
1948  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1949  // Use binary search. It would probably be better for us to
1950  // roll this loop by hand. If we wrote it right, a smart
1951  // compiler could perhaps use conditional loads and avoid
1952  // branches (according to Jed Brown on May 2014).
1953  const LO* beg = curInd;
1954  const LO* end = curInd + numEntriesInRow;
1955  std::pair<const LO*, const LO*> p =
1956  std::equal_range (beg, end, colIndexToFind);
1957  if (p.first != p.second) {
1958  // offset is relative to the current row
1959  relOffset = static_cast<LO> (p.first - beg);
1960  }
1961  }
1962  else { // use linear search
1963  for (LO k = 0; k < numEntriesInRow; ++k) {
1964  if (colIndexToFind == curInd[k]) {
1965  relOffset = k; // offset is relative to the current row
1966  break;
1967  }
1968  }
1969  }
1970 
1971  return relOffset;
1972  }
1973 
1974  template<class Scalar, class LO, class GO, class Node>
1975  LO
1976  BlockCrsMatrix<Scalar, LO, GO, Node>::
1977  offsetPerBlock () const
1978  {
1979  return offsetPerBlock_;
1980  }
1981 
1982  template<class Scalar, class LO, class GO, class Node>
1983  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1984  BlockCrsMatrix<Scalar, LO, GO, Node>::
1985  getConstLocalBlockFromInput (const impl_scalar_type* val,
1986  const size_t pointOffset) const
1987  {
1988  // Row major blocks
1989  const LO rowStride = blockSize_;
1990  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1991  }
1992 
1993  template<class Scalar, class LO, class GO, class Node>
1994  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1995  BlockCrsMatrix<Scalar, LO, GO, Node>::
1996  getNonConstLocalBlockFromInput (impl_scalar_type* val,
1997  const size_t pointOffset) const
1998  {
1999  // Row major blocks
2000  const LO rowStride = blockSize_;
2001  return little_block_type (val + pointOffset, blockSize_, rowStride);
2002  }
2003 
2004  template<class Scalar, class LO, class GO, class Node>
2005  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
2006  BlockCrsMatrix<Scalar, LO, GO, Node>::
2007  getConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
2008  {
2009 #ifdef HAVE_TPETRA_DEBUG
2010  const char prefix[] =
2011  "Tpetra::Experimental::BlockCrsMatrix::getConstLocalBlockFromAbsOffset: ";
2012 #endif // HAVE_TPETRA_DEBUG
2013 
2014  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
2015  // An empty block signifies an error. We don't expect to see
2016  // this error in correct code, but it's helpful for avoiding
2017  // memory corruption in case there is a bug.
2018  return const_little_block_type ();
2019  }
2020  else {
2021 #ifdef HAVE_TPETRA_DEBUG
2022  TEUCHOS_TEST_FOR_EXCEPTION
2023  (this->need_sync_host (), std::runtime_error,
2024  prefix << "The matrix's data were last modified on device, but have "
2025  "not been sync'd to host. Please sync to host (by calling "
2026  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
2027  "method.");
2028 #endif // HAVE_TPETRA_DEBUG
2029  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
2030 
2031  auto vals_host = getValuesHost ();
2032  const impl_scalar_type* vals_host_raw = vals_host.data ();
2033 
2034  return getConstLocalBlockFromInput (vals_host_raw, absPointOffset);
2035  }
2036  }
2037 
2038  template<class Scalar, class LO, class GO, class Node>
2039  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
2040  BlockCrsMatrix<Scalar, LO, GO, Node>::
2041  getConstLocalBlockFromRelOffset (const LO lclMeshRow,
2042  const size_t relMeshOffset) const
2043  {
2044  typedef impl_scalar_type IST;
2045 
2046  const LO* lclColInds = NULL;
2047  Scalar* lclVals = NULL;
2048  LO numEnt = 0;
2049 
2050  LO err = this->getLocalRowView (lclMeshRow, lclColInds, lclVals, numEnt);
2051  if (err != 0) {
2052  // An empty block signifies an error. We don't expect to see
2053  // this error in correct code, but it's helpful for avoiding
2054  // memory corruption in case there is a bug.
2055  return const_little_block_type ();
2056  }
2057  else {
2058  const size_t relPointOffset = relMeshOffset * this->offsetPerBlock ();
2059  IST* lclValsImpl = reinterpret_cast<IST*> (lclVals);
2060  return this->getConstLocalBlockFromInput (const_cast<const IST*> (lclValsImpl),
2061  relPointOffset);
2062  }
2063  }
2064 
2065  template<class Scalar, class LO, class GO, class Node>
2066  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
2067  BlockCrsMatrix<Scalar, LO, GO, Node>::
2068  getNonConstLocalBlockFromAbsOffset (const size_t absBlockOffset) const
2069  {
2070 #ifdef HAVE_TPETRA_DEBUG
2071  const char prefix[] =
2072  "Tpetra::Experimental::BlockCrsMatrix::getNonConstLocalBlockFromAbsOffset: ";
2073 #endif // HAVE_TPETRA_DEBUG
2074 
2075  if (absBlockOffset >= ptrHost_[rowMeshMap_.getNodeNumElements ()]) {
2076  // An empty block signifies an error. We don't expect to see
2077  // this error in correct code, but it's helpful for avoiding
2078  // memory corruption in case there is a bug.
2079  return little_block_type ();
2080  }
2081  else {
2082  const size_t absPointOffset = absBlockOffset * offsetPerBlock ();
2083 #ifdef HAVE_TPETRA_DEBUG
2084  TEUCHOS_TEST_FOR_EXCEPTION
2085  (this->need_sync_host (), std::runtime_error,
2086  prefix << "The matrix's data were last modified on device, but have "
2087  "not been sync'd to host. Please sync to host (by calling "
2088  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
2089  "method.");
2090 #endif // HAVE_TPETRA_DEBUG
2091  auto vals_host = getValuesHost();
2092  impl_scalar_type* vals_host_raw = vals_host.data ();
2093  return getNonConstLocalBlockFromInput (vals_host_raw, absPointOffset);
2094  }
2095  }
2096 
2097  template<class Scalar, class LO, class GO, class Node>
2098  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
2099  BlockCrsMatrix<Scalar, LO, GO, Node>::
2100  getLocalBlock (const LO localRowInd, const LO localColInd) const
2101  {
2102  const size_t absRowBlockOffset = ptrHost_[localRowInd];
2103  const LO relBlockOffset =
2104  this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
2105 
2106  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
2107  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
2108  return getNonConstLocalBlockFromAbsOffset (absBlockOffset);
2109  }
2110  else {
2111  return little_block_type ();
2112  }
2113  }
2114 
2115  // template<class Scalar, class LO, class GO, class Node>
2116  // void
2117  // BlockCrsMatrix<Scalar, LO, GO, Node>::
2118  // clearLocalErrorStateAndStream ()
2119  // {
2120  // typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2121  // * (const_cast<this_type*> (this)->localError_) = false;
2122  // *errs_ = Teuchos::null;
2123  // }
2124 
2125  template<class Scalar, class LO, class GO, class Node>
2126  bool
2127  BlockCrsMatrix<Scalar, LO, GO, Node>::
2128  checkSizes (const ::Tpetra::SrcDistObject& source)
2129  {
2130  using std::endl;
2131  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
2132  const this_type* src = dynamic_cast<const this_type* > (&source);
2133 
2134  if (src == NULL) {
2135  std::ostream& err = markLocalErrorAndGetStream ();
2136  err << "checkSizes: The source object of the Import or Export "
2137  "must be a BlockCrsMatrix with the same template parameters as the "
2138  "target object." << endl;
2139  }
2140  else {
2141  // Use a string of ifs, not if-elseifs, because we want to know
2142  // all the errors.
2143  if (src->getBlockSize () != this->getBlockSize ()) {
2144  std::ostream& err = markLocalErrorAndGetStream ();
2145  err << "checkSizes: The source and target objects of the Import or "
2146  << "Export must have the same block sizes. The source's block "
2147  << "size = " << src->getBlockSize () << " != the target's block "
2148  << "size = " << this->getBlockSize () << "." << endl;
2149  }
2150  if (! src->graph_.isFillComplete ()) {
2151  std::ostream& err = markLocalErrorAndGetStream ();
2152  err << "checkSizes: The source object of the Import or Export is "
2153  "not fill complete. Both source and target objects must be fill "
2154  "complete." << endl;
2155  }
2156  if (! this->graph_.isFillComplete ()) {
2157  std::ostream& err = markLocalErrorAndGetStream ();
2158  err << "checkSizes: The target object of the Import or Export is "
2159  "not fill complete. Both source and target objects must be fill "
2160  "complete." << endl;
2161  }
2162  if (src->graph_.getColMap ().is_null ()) {
2163  std::ostream& err = markLocalErrorAndGetStream ();
2164  err << "checkSizes: The source object of the Import or Export does "
2165  "not have a column Map. Both source and target objects must have "
2166  "column Maps." << endl;
2167  }
2168  if (this->graph_.getColMap ().is_null ()) {
2169  std::ostream& err = markLocalErrorAndGetStream ();
2170  err << "checkSizes: The target object of the Import or Export does "
2171  "not have a column Map. Both source and target objects must have "
2172  "column Maps." << endl;
2173  }
2174  }
2175 
2176  return ! (* (this->localError_));
2177  }
2178 
2179  template<class Scalar, class LO, class GO, class Node>
2180  void
2181  BlockCrsMatrix<Scalar, LO, GO, Node>::
2182 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
2183  copyAndPermuteNew
2184 #else // TPETRA_ENABLE_DEPRECATED_CODE
2185  copyAndPermute
2186 #endif // TPETRA_ENABLE_DEPRECATED_CODE
2187  (const ::Tpetra::SrcDistObject& source,
2188  const size_t numSameIDs,
2189  const Kokkos::DualView<const local_ordinal_type*,
2190  buffer_device_type>& permuteToLIDs,
2191  const Kokkos::DualView<const local_ordinal_type*,
2192  buffer_device_type>& permuteFromLIDs)
2193  {
2194  using ::Tpetra::Details::Behavior;
2196  using ::Tpetra::Details::ProfilingRegion;
2197  using std::endl;
2198  using this_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
2199 
2200  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
2201  const bool debug = Behavior::debug();
2202  const bool verbose = Behavior::verbose();
2203 
2204  // Define this function prefix
2205  std::string prefix;
2206  {
2207  std::ostringstream os;
2208  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2209  os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
2210  prefix = os.str();
2211  }
2212 
2213  // check if this already includes a local error
2214  if (* (this->localError_)) {
2215  std::ostream& err = this->markLocalErrorAndGetStream ();
2216  err << prefix
2217  << "The target object of the Import or Export is already in an error state."
2218  << endl;
2219  return;
2220  }
2221 
2222  //
2223  // Verbose input dual view status
2224  //
2225  if (verbose) {
2226  std::ostringstream os;
2227  os << prefix << endl
2228  << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
2229  << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
2230  std::cerr << os.str ();
2231  }
2232 
2236  if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
2237  std::ostream& err = this->markLocalErrorAndGetStream ();
2238  err << prefix
2239  << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
2240  << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
2241  << "." << endl;
2242  return;
2243  }
2244  if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
2245  std::ostream& err = this->markLocalErrorAndGetStream ();
2246  err << prefix
2247  << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
2248  << endl;
2249  return;
2250  }
2251 
2252  const this_type* src = dynamic_cast<const this_type* > (&source);
2253  if (src == NULL) {
2254  std::ostream& err = this->markLocalErrorAndGetStream ();
2255  err << prefix
2256  << "The source (input) object of the Import or "
2257  "Export is either not a BlockCrsMatrix, or does not have the right "
2258  "template parameters. checkSizes() should have caught this. "
2259  "Please report this bug to the Tpetra developers." << endl;
2260  return;
2261  }
2262  else {
2263  // Kyungjoo: where is val_ modified ?
2264  // When we have dual view as a member variable,
2265  // which function should make sure the val_ is upto date ?
2266  // IMO, wherever it is used, the function should check its
2267  // availability.
2268  const_cast<this_type*>(src)->sync_host();
2269  }
2270  this->sync_host();
2271 
2272  bool lclErr = false;
2273 #ifdef HAVE_TPETRA_DEBUG
2274  std::set<LO> invalidSrcCopyRows;
2275  std::set<LO> invalidDstCopyRows;
2276  std::set<LO> invalidDstCopyCols;
2277  std::set<LO> invalidDstPermuteCols;
2278  std::set<LO> invalidPermuteFromRows;
2279 #endif // HAVE_TPETRA_DEBUG
2280 
2281  // Copy the initial sequence of rows that are the same.
2282  //
2283  // The two graphs might have different column Maps, so we need to
2284  // do this using global column indices. This is purely local, so
2285  // we only need to check for local sameness of the two column
2286  // Maps.
2287 
2288 #ifdef HAVE_TPETRA_DEBUG
2289  const map_type& srcRowMap = * (src->graph_.getRowMap ());
2290 #endif // HAVE_TPETRA_DEBUG
2291  const map_type& dstRowMap = * (this->graph_.getRowMap ());
2292  const map_type& srcColMap = * (src->graph_.getColMap ());
2293  const map_type& dstColMap = * (this->graph_.getColMap ());
2294  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
2295 
2296  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
2297  if (verbose) {
2298  std::ostringstream os;
2299  os << prefix
2300  << "canUseLocalColumnIndices: "
2301  << (canUseLocalColumnIndices ? "true" : "false")
2302  << ", numPermute: " << numPermute
2303  << endl;
2304  std::cerr << os.str ();
2305  }
2306 
2307  const auto permuteToLIDsHost = permuteToLIDs.view_host();
2308  const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
2309 
2310  if (canUseLocalColumnIndices) {
2311  // Copy local rows that are the "same" in both source and target.
2312  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2313 #ifdef HAVE_TPETRA_DEBUG
2314  if (! srcRowMap.isNodeLocalElement (localRow)) {
2315  lclErr = true;
2316  invalidSrcCopyRows.insert (localRow);
2317  continue; // skip invalid rows
2318  }
2319 #endif // HAVE_TPETRA_DEBUG
2320 
2321  const LO* lclSrcCols;
2322  Scalar* vals;
2323  LO numEntries;
2324  // If this call fails, that means the mesh row local index is
2325  // invalid. That means the Import or Export is invalid somehow.
2326  LO err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2327  if (err != 0) {
2328  lclErr = true;
2329 #ifdef HAVE_TPETRA_DEBUG
2330  (void) invalidSrcCopyRows.insert (localRow);
2331 #endif // HAVE_TPETRA_DEBUG
2332  }
2333  else {
2334  err = this->replaceLocalValues (localRow, lclSrcCols, vals, numEntries);
2335  if (err != numEntries) {
2336  lclErr = true;
2337  if (! dstRowMap.isNodeLocalElement (localRow)) {
2338 #ifdef HAVE_TPETRA_DEBUG
2339  invalidDstCopyRows.insert (localRow);
2340 #endif // HAVE_TPETRA_DEBUG
2341  }
2342  else {
2343  // Once there's an error, there's no sense in saving
2344  // time, so we check whether the column indices were
2345  // invalid. However, only remember which ones were
2346  // invalid in a debug build, because that might take a
2347  // lot of space.
2348  for (LO k = 0; k < numEntries; ++k) {
2349  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
2350  lclErr = true;
2351 #ifdef HAVE_TPETRA_DEBUG
2352  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
2353 #endif // HAVE_TPETRA_DEBUG
2354  }
2355  }
2356  }
2357  }
2358  }
2359  } // for each "same" local row
2360 
2361  // Copy the "permute" local rows.
2362  for (size_t k = 0; k < numPermute; ++k) {
2363  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
2364  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
2365 
2366  const LO* lclSrcCols;
2367  Scalar* vals;
2368  LO numEntries;
2369  LO err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2370  if (err != 0) {
2371  lclErr = true;
2372 #ifdef HAVE_TPETRA_DEBUG
2373  invalidPermuteFromRows.insert (srcLclRow);
2374 #endif // HAVE_TPETRA_DEBUG
2375  }
2376  else {
2377  err = this->replaceLocalValues (dstLclRow, lclSrcCols, vals, numEntries);
2378  if (err != numEntries) {
2379  lclErr = true;
2380 #ifdef HAVE_TPETRA_DEBUG
2381  for (LO c = 0; c < numEntries; ++c) {
2382  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
2383  invalidDstPermuteCols.insert (lclSrcCols[c]);
2384  }
2385  }
2386 #endif // HAVE_TPETRA_DEBUG
2387  }
2388  }
2389  }
2390  }
2391  else { // must convert column indices to global
2392  // Reserve space to store the destination matrix's local column indices.
2393  const size_t maxNumEnt = src->graph_.getNodeMaxNumRowEntries ();
2394  Teuchos::Array<LO> lclDstCols (maxNumEnt);
2395 
2396  // Copy local rows that are the "same" in both source and target.
2397  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
2398  const LO* lclSrcCols;
2399  Scalar* vals;
2400  LO numEntries;
2401  // If this call fails, that means the mesh row local index is
2402  // invalid. That means the Import or Export is invalid somehow.
2403  LO err = 0;
2404  try {
2405  err = src->getLocalRowView (localRow, lclSrcCols, vals, numEntries);
2406  } catch (std::exception& e) {
2407  if (debug) {
2408  std::ostringstream os;
2409  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2410  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2411  << localRow << ", src->getLocalRowView() threw an exception: "
2412  << e.what ();
2413  std::cerr << os.str ();
2414  }
2415  throw e;
2416  }
2417 
2418  if (err != 0) {
2419  lclErr = true;
2420 #ifdef HAVE_TPETRA_DEBUG
2421  invalidSrcCopyRows.insert (localRow);
2422 #endif // HAVE_TPETRA_DEBUG
2423  }
2424  else {
2425  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2426  lclErr = true;
2427  if (debug) {
2428  std::ostringstream os;
2429  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2430  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2431  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
2432  << maxNumEnt << endl;
2433  std::cerr << os.str ();
2434  }
2435  }
2436  else {
2437  // Convert the source matrix's local column indices to the
2438  // destination matrix's local column indices.
2439  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2440  for (LO j = 0; j < numEntries; ++j) {
2441  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2442  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2443  lclErr = true;
2444 #ifdef HAVE_TPETRA_DEBUG
2445  invalidDstCopyCols.insert (lclDstColsView[j]);
2446 #endif // HAVE_TPETRA_DEBUG
2447  }
2448  }
2449  try {
2450  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), vals, numEntries);
2451  } catch (std::exception& e) {
2452  if (debug) {
2453  std::ostringstream os;
2454  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2455  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
2456  << localRow << ", this->replaceLocalValues() threw an exception: "
2457  << e.what ();
2458  std::cerr << os.str ();
2459  }
2460  throw e;
2461  }
2462  if (err != numEntries) {
2463  lclErr = true;
2464  if (debug) {
2465  std::ostringstream os;
2466  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2467  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
2468  "localRow " << localRow << ", this->replaceLocalValues "
2469  "returned " << err << " instead of numEntries = "
2470  << numEntries << endl;
2471  std::cerr << os.str ();
2472  }
2473  }
2474  }
2475  }
2476  }
2477 
2478  // Copy the "permute" local rows.
2479  for (size_t k = 0; k < numPermute; ++k) {
2480  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
2481  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
2482 
2483  const LO* lclSrcCols;
2484  Scalar* vals;
2485  LO numEntries;
2486  LO err = 0;
2487  try {
2488  err = src->getLocalRowView (srcLclRow, lclSrcCols, vals, numEntries);
2489  } catch (std::exception& e) {
2490  if (debug) {
2491  std::ostringstream os;
2492  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2493  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
2494  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
2495  << ", src->getLocalRowView() threw an exception: " << e.what ();
2496  std::cerr << os.str ();
2497  }
2498  throw e;
2499  }
2500 
2501  if (err != 0) {
2502  lclErr = true;
2503 #ifdef HAVE_TPETRA_DEBUG
2504  invalidPermuteFromRows.insert (srcLclRow);
2505 #endif // HAVE_TPETRA_DEBUG
2506  }
2507  else {
2508  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
2509  lclErr = true;
2510  }
2511  else {
2512  // Convert the source matrix's local column indices to the
2513  // destination matrix's local column indices.
2514  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
2515  for (LO j = 0; j < numEntries; ++j) {
2516  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
2517  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
2518  lclErr = true;
2519 #ifdef HAVE_TPETRA_DEBUG
2520  invalidDstPermuteCols.insert (lclDstColsView[j]);
2521 #endif // HAVE_TPETRA_DEBUG
2522  }
2523  }
2524  err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), vals, numEntries);
2525  if (err != numEntries) {
2526  lclErr = true;
2527  }
2528  }
2529  }
2530  }
2531  }
2532 
2533  if (lclErr) {
2534  std::ostream& err = this->markLocalErrorAndGetStream ();
2535 #ifdef HAVE_TPETRA_DEBUG
2536  err << "copyAndPermute: The graph structure of the source of the "
2537  "Import or Export must be a subset of the graph structure of the "
2538  "target. ";
2539  err << "invalidSrcCopyRows = [";
2540  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
2541  it != invalidSrcCopyRows.end (); ++it) {
2542  err << *it;
2543  typename std::set<LO>::const_iterator itp1 = it;
2544  itp1++;
2545  if (itp1 != invalidSrcCopyRows.end ()) {
2546  err << ",";
2547  }
2548  }
2549  err << "], invalidDstCopyRows = [";
2550  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
2551  it != invalidDstCopyRows.end (); ++it) {
2552  err << *it;
2553  typename std::set<LO>::const_iterator itp1 = it;
2554  itp1++;
2555  if (itp1 != invalidDstCopyRows.end ()) {
2556  err << ",";
2557  }
2558  }
2559  err << "], invalidDstCopyCols = [";
2560  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
2561  it != invalidDstCopyCols.end (); ++it) {
2562  err << *it;
2563  typename std::set<LO>::const_iterator itp1 = it;
2564  itp1++;
2565  if (itp1 != invalidDstCopyCols.end ()) {
2566  err << ",";
2567  }
2568  }
2569  err << "], invalidDstPermuteCols = [";
2570  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
2571  it != invalidDstPermuteCols.end (); ++it) {
2572  err << *it;
2573  typename std::set<LO>::const_iterator itp1 = it;
2574  itp1++;
2575  if (itp1 != invalidDstPermuteCols.end ()) {
2576  err << ",";
2577  }
2578  }
2579  err << "], invalidPermuteFromRows = [";
2580  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
2581  it != invalidPermuteFromRows.end (); ++it) {
2582  err << *it;
2583  typename std::set<LO>::const_iterator itp1 = it;
2584  itp1++;
2585  if (itp1 != invalidPermuteFromRows.end ()) {
2586  err << ",";
2587  }
2588  }
2589  err << "]" << endl;
2590 #else
2591  err << "copyAndPermute: The graph structure of the source of the "
2592  "Import or Export must be a subset of the graph structure of the "
2593  "target." << endl;
2594 #endif // HAVE_TPETRA_DEBUG
2595  }
2596 
2597  if (debug) {
2598  std::ostringstream os;
2599  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
2600  const bool lclSuccess = ! (* (this->localError_));
2601  os << "*** Proc " << myRank << ": copyAndPermute "
2602  << (lclSuccess ? "succeeded" : "FAILED");
2603  if (lclSuccess) {
2604  os << endl;
2605  } else {
2606  os << ": error messages: " << this->errorMessages (); // comes w/ endl
2607  }
2608  std::cerr << os.str ();
2609  }
2610  }
2611 
2612  namespace { // (anonymous)
2613 
2632  template<class LO, class GO, class D>
2633  size_t
2634  packRowCount (const size_t numEnt,
2635  const size_t numBytesPerValue,
2636  const size_t blkSize)
2637  {
2638  using ::Tpetra::Details::PackTraits;
2639 
2640  if (numEnt == 0) {
2641  // Empty rows always take zero bytes, to ensure sparsity.
2642  return 0;
2643  }
2644  else {
2645  // We store the number of entries as a local index (LO).
2646  LO numEntLO = 0; // packValueCount wants this.
2647  GO gid;
2648  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2649  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2650  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
2651  return numEntLen + gidsLen + valsLen;
2652  }
2653  }
2654 
2665  template<class ST, class LO, class GO, class D>
2666  size_t
2667  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO, D>::input_buffer_type& imports,
2668  const size_t offset,
2669  const size_t numBytes,
2670  const size_t /* numBytesPerValue */)
2671  {
2672  using Kokkos::subview;
2673  using ::Tpetra::Details::PackTraits;
2674 
2675  if (numBytes == 0) {
2676  // Empty rows always take zero bytes, to ensure sparsity.
2677  return static_cast<size_t> (0);
2678  }
2679  else {
2680  LO numEntLO = 0;
2681  const size_t theNumBytes = PackTraits<LO, D>::packValueCount (numEntLO);
2682  TEUCHOS_TEST_FOR_EXCEPTION
2683  (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2684  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
2685  << ".");
2686  const char* const inBuf = imports.data () + offset;
2687  const size_t actualNumBytes = PackTraits<LO, D>::unpackValue (numEntLO, inBuf);
2688  TEUCHOS_TEST_FOR_EXCEPTION
2689  (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
2690  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
2691  << ".");
2692  return static_cast<size_t> (numEntLO);
2693  }
2694  }
2695 
2699  template<class ST, class LO, class GO, class D>
2700  size_t
2701  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO, D>::output_buffer_type exports,
2702  const size_t offset,
2703  const size_t numEnt,
2704  const typename ::Tpetra::Details::PackTraits<GO, D>::input_array_type& gidsIn,
2705  const typename ::Tpetra::Details::PackTraits<ST, D>::input_array_type& valsIn,
2706  const size_t numBytesPerValue,
2707  const size_t blockSize)
2708  {
2709  using Kokkos::subview;
2710  using ::Tpetra::Details::PackTraits;
2711 
2712  if (numEnt == 0) {
2713  // Empty rows always take zero bytes, to ensure sparsity.
2714  return 0;
2715  }
2716  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2717 
2718  const GO gid = 0; // packValueCount wants this
2719  const LO numEntLO = static_cast<size_t> (numEnt);
2720 
2721  const size_t numEntBeg = offset;
2722  const size_t numEntLen = PackTraits<LO, D>::packValueCount (numEntLO);
2723  const size_t gidsBeg = numEntBeg + numEntLen;
2724  const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
2725  const size_t valsBeg = gidsBeg + gidsLen;
2726  const size_t valsLen = numScalarEnt * numBytesPerValue;
2727 
2728  char* const numEntOut = exports.data () + numEntBeg;
2729  char* const gidsOut = exports.data () + gidsBeg;
2730  char* const valsOut = exports.data () + valsBeg;
2731 
2732  size_t numBytesOut = 0;
2733  int errorCode = 0;
2734  numBytesOut += PackTraits<LO, D>::packValue (numEntOut, numEntLO);
2735 
2736  {
2737  Kokkos::pair<int, size_t> p;
2738  p = PackTraits<GO, D>::packArray (gidsOut, gidsIn.data (), numEnt);
2739  errorCode += p.first;
2740  numBytesOut += p.second;
2741 
2742  p = PackTraits<ST, D>::packArray (valsOut, valsIn.data (), numScalarEnt);
2743  errorCode += p.first;
2744  numBytesOut += p.second;
2745  }
2746 
2747  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
2748  TEUCHOS_TEST_FOR_EXCEPTION
2749  (numBytesOut != expectedNumBytes, std::logic_error,
2750  "packRowForBlockCrs: numBytesOut = " << numBytesOut
2751  << " != expectedNumBytes = " << expectedNumBytes << ".");
2752 
2753  TEUCHOS_TEST_FOR_EXCEPTION
2754  (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
2755  "PackTraits::packArray returned a nonzero error code");
2756 
2757  return numBytesOut;
2758  }
2759 
  // Return the number of bytes actually read / used.
  //
  // Unpacks one block row from 'imports' at byte offset 'offset' into
  // the caller-provided output arrays: global column indices into
  // 'gidsOut' (numEnt of them) and scalar values into 'valsOut'
  // (numEnt * blockSize^2 of them).  The packed layout is
  // [entry count (LO) | GIDs | values], matching packRowForBlockCrs.
  // Throws std::logic_error if the buffer bounds or byte counts are
  // inconsistent, and std::runtime_error if PackTraits::unpackArray
  // reports an error.
  // NOTE(review): 'imports' is typed via PackTraits<int, D> here but
  // via PackTraits<LO, D> elsewhere in this file — presumably the same
  // char buffer type either way; confirm.
  template<class ST, class LO, class GO, class D>
  size_t
  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO, D>::output_array_type& gidsOut,
                        const typename ::Tpetra::Details::PackTraits<ST, D>::output_array_type& valsOut,
                        const typename ::Tpetra::Details::PackTraits<int, D>::input_buffer_type& imports,
                        const size_t offset,
                        const size_t numBytes,
                        const size_t numEnt,
                        const size_t numBytesPerValue,
                        const size_t blockSize)
  {
    using ::Tpetra::Details::PackTraits;

    if (numBytes == 0) {
      // Rows with zero bytes always have zero entries.
      return 0;
    }
    const size_t numScalarEnt = numEnt * blockSize * blockSize;
    // Bounds checks: the row [offset, offset+numBytes) must lie inside
    // the imports buffer.
    TEUCHOS_TEST_FOR_EXCEPTION
      (static_cast<size_t> (imports.extent (0)) <= offset,
       std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
       << imports.extent (0) << " <= offset = " << offset << ".");
    TEUCHOS_TEST_FOR_EXCEPTION
      (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
       std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
       << imports.extent (0) << " < offset + numBytes = "
       << (offset + numBytes) << ".");

    const GO gid = 0; // packValueCount wants this
    const LO lid = 0; // packValueCount wants this

    // Recompute the packed layout [entry count | GIDs | values] so we
    // know where each section starts within the row.
    const size_t numEntBeg = offset;
    const size_t numEntLen = PackTraits<LO, D>::packValueCount (lid);
    const size_t gidsBeg = numEntBeg + numEntLen;
    const size_t gidsLen = numEnt * PackTraits<GO, D>::packValueCount (gid);
    const size_t valsBeg = gidsBeg + gidsLen;
    const size_t valsLen = numScalarEnt * numBytesPerValue;

    const char* const numEntIn = imports.data () + numEntBeg;
    const char* const gidsIn = imports.data () + gidsBeg;
    const char* const valsIn = imports.data () + valsBeg;

    size_t numBytesOut = 0;
    int errorCode = 0;
    LO numEntOut;
    numBytesOut += PackTraits<LO, D>::unpackValue (numEntOut, numEntIn);
    // The entry count stored in the buffer must agree with the count
    // the caller derived (e.g. via unpackRowCount).
    TEUCHOS_TEST_FOR_EXCEPTION
      (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
       "unpackRowForBlockCrs: Expected number of entries " << numEnt
       << " != actual number of entries " << numEntOut << ".");

    {
      Kokkos::pair<int, size_t> p;
      p = PackTraits<GO, D>::unpackArray (gidsOut.data (), gidsIn, numEnt);
      errorCode += p.first;
      numBytesOut += p.second;

      p = PackTraits<ST, D>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
      errorCode += p.first;
      numBytesOut += p.second;
    }

    // Consistency checks: bytes consumed must equal both the caller's
    // numBytes and the layout computed above.
    TEUCHOS_TEST_FOR_EXCEPTION
      (numBytesOut != numBytes, std::logic_error,
       "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
       << " != numBytes = " << numBytes << ".");

    const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
    TEUCHOS_TEST_FOR_EXCEPTION
      (numBytesOut != expectedNumBytes, std::logic_error,
       "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
       << " != expectedNumBytes = " << expectedNumBytes << ".");

    TEUCHOS_TEST_FOR_EXCEPTION
      (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
       "PackTraits::unpackArray returned a nonzero error code");

    return numBytesOut;
  }
2840  } // namespace (anonymous)
2841 
  /// \brief Pack this matrix's rows (listed in exportLIDs) into the
  ///   'exports' byte buffer, for an Import or Export.
  ///
  /// Each packed row is [entry count (LO) | global column indices |
  /// block values]; empty rows pack to zero bytes.  On error this
  /// method marks the local error state and returns (it does not
  /// throw).  Sets constantNumPackets to 0, meaning rows may have
  /// different packed sizes.
  template<class Scalar, class LO, class GO, class Node>
  void
  BlockCrsMatrix<Scalar, LO, GO, Node>::
#ifdef TPETRA_ENABLE_DEPRECATED_CODE
  packAndPrepareNew
#else // TPETRA_ENABLE_DEPRECATED_CODE
  packAndPrepare
#endif // TPETRA_ENABLE_DEPRECATED_CODE
  (const ::Tpetra::SrcDistObject& source,
   const Kokkos::DualView<const local_ordinal_type*,
     buffer_device_type>& exportLIDs,
   Kokkos::DualView<packet_type*,
     buffer_device_type>& exports, // output
   Kokkos::DualView<size_t*,
     buffer_device_type> numPacketsPerLID, // output
   size_t& constantNumPackets,
   Distributor& /* distor */)
  {
    using ::Tpetra::Details::Behavior;
    using ::Tpetra::Details::ProfilingRegion;
    using ::Tpetra::Details::PackTraits;

    // Packing happens on host; this is the host mirror's execution space.
    typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;

    typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;

    ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");

    const bool debug = Behavior::debug();
    const bool verbose = Behavior::verbose();

    // Define this function prefix
    std::string prefix;
    {
      std::ostringstream os;
      const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
      os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
      prefix = os.str();
    }

    // check if this already includes a local error
    if (* (this->localError_)) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix
          << "The target object of the Import or Export is already in an error state."
          << std::endl;
      return;
    }

    //
    // Verbose input dual view status
    //
    if (verbose) {
      std::ostringstream os;
      os << prefix << std::endl
         << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
         << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
         << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
      std::cerr << os.str ();
    }

    // Sanity checks on the input DualViews: matching lengths, and
    // exportLIDs must already be sync'd to host.
    if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix
          << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
          << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
          << "." << std::endl;
      return;
    }
    if (exportLIDs.need_sync_host ()) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      // NOTE(review): message likely missing the word "must"; it is a
      // runtime string, so left unchanged here.
      err << prefix << "exportLIDs be sync'd to host." << std::endl;
      return;
    }

    const this_type* src = dynamic_cast<const this_type* > (&source);
    if (src == NULL) {
      std::ostream& err = this->markLocalErrorAndGetStream ();
      err << prefix
          << "The source (input) object of the Import or "
        "Export is either not a BlockCrsMatrix, or does not have the right "
        "template parameters. checkSizes() should have caught this. "
        "Please report this bug to the Tpetra developers." << std::endl;
      return;
    }

    // Graphs and matrices are allowed to have a variable number of
    // entries per row. We could test whether all rows have the same
    // number of entries, but DistObject can only use this
    // optimization if all rows on _all_ processes have the same
    // number of entries. Rather than do the all-reduce necessary to
    // test for this unlikely case, we tell DistObject (by setting
    // constantNumPackets to zero) to assume that different rows may
    // have different numbers of entries.
    constantNumPackets = 0;

    // const values
    const crs_graph_type& srcGraph = src->graph_;
    const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
    const size_t numExportLIDs = exportLIDs.extent (0);
    // Packed size of one scalar; uses an actual stored value if the
    // matrix has any, else a default-constructed scalar.
    const size_t numBytesPerValue =
      PackTraits<impl_scalar_type, host_exec>
      ::packValueCount(this->val_.extent(0) ? this->val_.view_host()(0) : impl_scalar_type());

    // Compute the number of bytes ("packets") per row to pack. While
    // we're at it, compute the total # of block entries to send, and
    // the max # of block entries in any of the rows we're sending.

    Impl::BlockCrsRowStruct<size_t> rowReducerStruct;

    // Graph information is on host; let's do this on host parallel reduce
    auto exportLIDsHost = exportLIDs.view_host();
    auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
    numPacketsPerLID.modify_host ();
    {
      // Per-row byte counts go into numPacketsPerLIDHost; the reducer
      // accumulates total entries, total bytes, and max row length.
      using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
      const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs);
      Kokkos::parallel_reduce
        (policy,
         [=](const size_t &i, typename reducer_type::value_type &update) {
          const LO lclRow = exportLIDsHost(i);
          size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
          // A row not owned by this process reports invalid(); treat as empty.
          numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);

          const size_t numBytes =
            packRowCount<LO, GO, host_exec> (numEnt, numBytesPerValue, blockSize);
          numPacketsPerLIDHost(i) = numBytes;
          update += typename reducer_type::value_type(numEnt, numBytes, numEnt);
        }, rowReducerStruct);
    }

    // Compute the number of bytes ("packets") per row to pack. While
    // we're at it, compute the total # of block entries to send, and
    // the max # of block entries in any of the rows we're sending.
    const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
    const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
    const size_t maxRowLength = rowReducerStruct.maxRowLength;

    if (verbose) {
      std::ostringstream os;
      os << prefix
         << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
         << std::endl;
      std::cerr << os.str ();
    }

    // We use a "struct of arrays" approach to packing each row's
    // entries. All the column indices (as global indices) go first,
    // then all their owning process ranks, and then the values.
    exports.resize (totalNumBytes);
    if (totalNumEntries > 0) {
      // Current position (in bytes) in the 'exports' output array.
      // offset(i) = byte offset of row i; offset(numExportLIDs) = total.
      Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
      {
        // Exclusive prefix sum of the per-row byte counts.
        const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
        Kokkos::parallel_scan
          (policy,
           [=](const size_t &i, size_t &update, const bool &final) {
            if (final) offset(i) = update;
            update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
          });
      }
      if (offset(numExportLIDs) != totalNumBytes) {
        std::ostream& err = this->markLocalErrorAndGetStream ();
        err << prefix
            << "At end of method, the final offset (in bytes) "
            << offset(numExportLIDs) << " does not equal the total number of bytes packed "
            << totalNumBytes << ". "
            << "Please report this bug to the Tpetra developers." << std::endl;
        return;
      }

      // For each block row of the matrix owned by the calling
      // process, pack that block row's column indices and values into
      // the exports array.

      // Source matrix's column Map. We verified in checkSizes() that
      // the column Map exists (is not null).
      const map_type& srcColMap = * (srcGraph.getColMap ());

      // Pack the data for each row to send, into the 'exports' buffer.
      // exports will be modified on host.
      exports.modify_host();
      {
        // One team per row; team scratch holds that row's global column
        // indices (up to maxRowLength of them).
        typedef Kokkos::TeamPolicy<host_exec> policy_type;
        const auto policy =
          policy_type(numExportLIDs, 1, 1)
          .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
        Kokkos::parallel_for
          (policy,
           [=](const typename policy_type::member_type &member) {
            const size_t i = member.league_rank();
            Kokkos::View<GO*, typename host_exec::scratch_memory_space>
              gblColInds(member.team_scratch(0), maxRowLength);

            const LO lclRowInd = exportLIDsHost(i);
            const LO* lclColIndsRaw;
            Scalar* valsRaw;
            LO numEntLO;
            // It's OK to ignore the return value, since if the calling
            // process doesn't own that local row, then the number of
            // entries in that row on the calling process is zero.
            (void) src->getLocalRowView (lclRowInd, lclColIndsRaw, valsRaw, numEntLO);

            const size_t numEnt = static_cast<size_t> (numEntLO);
            Kokkos::View<const LO*,host_exec> lclColInds (lclColIndsRaw, numEnt);

            // Convert column indices from local to global.
            for (size_t j = 0; j < numEnt; ++j)
              gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));

            // Kyungjoo: additional wrapping scratch view is necessary
            // the following function interface need the same execution space
            // host scratch space somehow is not considered same as the host_exec
            // Copy the row's data into the current spot in the exports array.
            const size_t numBytes = packRowForBlockCrs<impl_scalar_type,LO,GO,host_exec>
              (exports.view_host(),
               offset(i),
               numEnt,
               Kokkos::View<const GO*,host_exec>(gblColInds.data(), maxRowLength),
               Kokkos::View<const impl_scalar_type*,host_exec>(reinterpret_cast<const impl_scalar_type*>(valsRaw), numEnt*blockSize*blockSize),
               numBytesPerValue,
               blockSize);

            // numBytes should be same as the difference between offsets
            if (debug) {
              const size_t offsetDiff = offset(i+1) - offset(i);
              if (numBytes != offsetDiff) {
                std::ostringstream os;
                os << prefix
                   << "numBytes computed from packRowForBlockCrs is different from "
                   << "precomputed offset values, LID = " << i << std::endl;
                std::cerr << os.str ();
              }
            }
          }); // for each LID (of a row) to send
      }
    } // if totalNumEntries > 0

    if (debug) {
      std::ostringstream os;
      const bool lclSuccess = ! (* (this->localError_));
      os << prefix
         << (lclSuccess ? "succeeded" : "FAILED")
         << std::endl;
      std::cerr << os.str ();
    }
  }
3094 
3095  template<class Scalar, class LO, class GO, class Node>
3096  void
3097  BlockCrsMatrix<Scalar, LO, GO, Node>::
3098 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
3099  unpackAndCombineNew
3100 #else // TPETRA_ENABLE_DEPRECATED_CODE
3101  unpackAndCombine
3102 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3103  (const Kokkos::DualView<const local_ordinal_type*,
3104  buffer_device_type>& importLIDs,
3105  Kokkos::DualView<packet_type*,
3106  buffer_device_type> imports,
3107  Kokkos::DualView<size_t*,
3108  buffer_device_type> numPacketsPerLID,
3109  const size_t /* constantNumPackets */,
3110  Distributor& /* distor */,
3111  const CombineMode combineMode)
3112  {
3113  using ::Tpetra::Details::Behavior;
3115  using ::Tpetra::Details::ProfilingRegion;
3116  using ::Tpetra::Details::PackTraits;
3117  using std::endl;
3118  using host_exec =
3119  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
3120 
3121  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
3122  const bool verbose = Behavior::verbose ();
3123 
3124  // Define this function prefix
3125  std::string prefix;
3126  {
3127  std::ostringstream os;
3128  auto map = this->graph_.getRowMap ();
3129  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
3130  const int myRank = comm.is_null () ? -1 : comm->getRank ();
3131  os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
3132  prefix = os.str ();
3133  if (verbose) {
3134  os << "Start" << endl;
3135  std::cerr << os.str ();
3136  }
3137  }
3138 
3139  // check if this already includes a local error
3140  if (* (this->localError_)) {
3141  std::ostream& err = this->markLocalErrorAndGetStream ();
3142  std::ostringstream os;
3143  os << prefix << "{Im/Ex}port target is already in error." << endl;
3144  if (verbose) {
3145  std::cerr << os.str ();
3146  }
3147  err << os.str ();
3148  return;
3149  }
3150 
3154  if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
3155  std::ostream& err = this->markLocalErrorAndGetStream ();
3156  std::ostringstream os;
3157  os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
3158  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
3159  << "." << endl;
3160  if (verbose) {
3161  std::cerr << os.str ();
3162  }
3163  err << os.str ();
3164  return;
3165  }
3166 
3167  if (combineMode != ADD && combineMode != INSERT &&
3168  combineMode != REPLACE && combineMode != ABSMAX &&
3169  combineMode != ZERO) {
3170  std::ostream& err = this->markLocalErrorAndGetStream ();
3171  std::ostringstream os;
3172  os << prefix
3173  << "Invalid CombineMode value " << combineMode << ". Valid "
3174  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
3175  << std::endl;
3176  if (verbose) {
3177  std::cerr << os.str ();
3178  }
3179  err << os.str ();
3180  return;
3181  }
3182 
3183  if (this->graph_.getColMap ().is_null ()) {
3184  std::ostream& err = this->markLocalErrorAndGetStream ();
3185  std::ostringstream os;
3186  os << prefix << "Target matrix's column Map is null." << endl;
3187  if (verbose) {
3188  std::cerr << os.str ();
3189  }
3190  err << os.str ();
3191  return;
3192  }
3193 
3194  // Target matrix's column Map. Use to convert the global column
3195  // indices in the receive buffer to local indices. We verified in
3196  // checkSizes() that the column Map exists (is not null).
3197  const map_type& tgtColMap = * (this->graph_.getColMap ());
3198 
3199  // Const values
3200  const size_t blockSize = this->getBlockSize ();
3201  const size_t numImportLIDs = importLIDs.extent(0);
3202  // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
3203  // default-constructed instance could have a different size than
3204  // other instances. (We assume all nominally constructed
3205  // instances have the same size; that's not the issue here.) This
3206  // could be bad if the calling process has no entries, but other
3207  // processes have entries that they want to send to us.
3208  const size_t numBytesPerValue =
3209  PackTraits<impl_scalar_type, host_exec>::packValueCount
3210  (this->val_.extent (0) ? this->val_.view_host () (0) : impl_scalar_type ());
3211  const size_t maxRowNumEnt = graph_.getNodeMaxNumRowEntries ();
3212  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
3213 
3214  if (verbose) {
3215  std::ostringstream os;
3216  os << prefix << "combineMode: "
3217  << ::Tpetra::combineModeToString (combineMode)
3218  << ", blockSize: " << blockSize
3219  << ", numImportLIDs: " << numImportLIDs
3220  << ", numBytesPerValue: " << numBytesPerValue
3221  << ", maxRowNumEnt: " << maxRowNumEnt
3222  << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
3223  std::cerr << os.str ();
3224  }
3225 
3226  if (combineMode == ZERO || numImportLIDs == 0) {
3227  if (verbose) {
3228  std::ostringstream os;
3229  os << prefix << "Nothing to unpack. Done!" << std::endl;
3230  std::cerr << os.str ();
3231  }
3232  return;
3233  }
3234 
3235  // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
3236  // we can remove this sync.
3237  if (imports.need_sync_host ()) {
3238  imports.sync_host ();
3239  }
3240 
3241  // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
3242  // sync'd numPacketsPerLID to host, since it needs to do that in
3243  // order to post MPI_Irecv messages with the correct lengths. A
3244  // hypothetical device-specific boundary exchange implementation
3245  // could possibly receive data without sync'ing lengths to host,
3246  // but we don't need to design for that nonexistent thing yet.
3247 
3248  if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
3249  importLIDs.need_sync_host ()) {
3250  std::ostream& err = this->markLocalErrorAndGetStream ();
3251  std::ostringstream os;
3252  os << prefix << "All input DualViews must be sync'd to host by now. "
3253  << "imports_nc.need_sync_host()="
3254  << (imports.need_sync_host () ? "true" : "false")
3255  << ", numPacketsPerLID.need_sync_host()="
3256  << (numPacketsPerLID.need_sync_host () ? "true" : "false")
3257  << ", importLIDs.need_sync_host()="
3258  << (importLIDs.need_sync_host () ? "true" : "false")
3259  << "." << endl;
3260  if (verbose) {
3261  std::cerr << os.str ();
3262  }
3263  err << os.str ();
3264  return;
3265  }
3266 
3267  const auto importLIDsHost = importLIDs.view_host ();
3268  const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
3269 
3270  // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
3271  // loop, by only unpacking on final==true (when we know the
3272  // current offset's value).
3273 
3274  Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
3275  {
3276  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
3277  Kokkos::parallel_scan
3278  ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
3279  [=] (const size_t &i, size_t &update, const bool &final) {
3280  if (final) offset(i) = update;
3281  update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
3282  });
3283  }
3284 
3285  // this variable does not matter with a race condition (just error flag)
3286  //
3287  // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
3288  // or atomics on bool, so we use int instead. (I know we're not
3289  // launching a CUDA loop here, but we could in the future, and I'd
3290  // like to avoid potential trouble.)
3291  Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
3292  errorDuringUnpack ("errorDuringUnpack");
3293  errorDuringUnpack () = 0;
3294  {
3295  using policy_type = Kokkos::TeamPolicy<host_exec>;
3296  const auto policy = policy_type (numImportLIDs, 1, 1)
3297  .set_scratch_size (0, Kokkos::PerTeam (sizeof (GO) * maxRowNumEnt +
3298  sizeof (LO) * maxRowNumEnt +
3299  numBytesPerValue * maxRowNumScalarEnt));
3300  using host_scratch_space = typename host_exec::scratch_memory_space;
3301  using pair_type = Kokkos::pair<size_t, size_t>;
3302  Kokkos::parallel_for
3303  ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
3304  [=] (const typename policy_type::member_type& member) {
3305  const size_t i = member.league_rank();
3306 
3307  Kokkos::View<GO*, host_scratch_space> gblColInds
3308  (member.team_scratch (0), maxRowNumEnt);
3309  Kokkos::View<LO*, host_scratch_space> lclColInds
3310  (member.team_scratch (0), maxRowNumEnt);
3311  Kokkos::View<impl_scalar_type*, host_scratch_space> vals
3312  (member.team_scratch (0), maxRowNumScalarEnt);
3313 
3314  const size_t offval = offset(i);
3315  const LO lclRow = importLIDsHost(i);
3316  const size_t numBytes = numPacketsPerLIDHost(i);
3317  const size_t numEnt =
3318  unpackRowCount<impl_scalar_type, LO, GO, host_exec>
3319  (imports.view_host (), offval, numBytes, numBytesPerValue);
3320 
3321  if (numBytes > 0) {
3322  if (numEnt > maxRowNumEnt) {
3323  errorDuringUnpack() = 1;
3324  if (verbose) {
3325  std::ostringstream os;
3326  os << prefix
3327  << "At i = " << i << ", numEnt = " << numEnt
3328  << " > maxRowNumEnt = " << maxRowNumEnt
3329  << std::endl;
3330  std::cerr << os.str ();
3331  }
3332  return;
3333  }
3334  }
3335  const size_t numScalarEnt = numEnt*blockSize*blockSize;
3336  auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
3337  auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
3338  auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
3339 
3340  // Kyungjoo: additional wrapping scratch view is necessary
3341  // the following function interface need the same execution space
3342  // host scratch space somehow is not considered same as the host_exec
3343  size_t numBytesOut = 0;
3344  try {
3345  numBytesOut =
3346  unpackRowForBlockCrs<impl_scalar_type, LO, GO, host_exec>
3347  (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
3348  Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
3349  imports.view_host(),
3350  offval, numBytes, numEnt,
3351  numBytesPerValue, blockSize);
3352  }
3353  catch (std::exception& e) {
3354  errorDuringUnpack() = 1;
3355  if (verbose) {
3356  std::ostringstream os;
3357  os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
3358  << e.what () << endl;
3359  std::cerr << os.str ();
3360  }
3361  return;
3362  }
3363 
3364  if (numBytes != numBytesOut) {
3365  errorDuringUnpack() = 1;
3366  if (verbose) {
3367  std::ostringstream os;
3368  os << prefix
3369  << "At i = " << i << ", numBytes = " << numBytes
3370  << " != numBytesOut = " << numBytesOut << "."
3371  << std::endl;
3372  std::cerr << os.str();
3373  }
3374  return;
3375  }
3376 
3377  // Convert incoming global indices to local indices.
3378  for (size_t k = 0; k < numEnt; ++k) {
3379  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
3380  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
3381  if (verbose) {
3382  std::ostringstream os;
3383  os << prefix
3384  << "At i = " << i << ", GID " << gidsOut(k)
3385  << " is not owned by the calling process."
3386  << std::endl;
3387  std::cerr << os.str();
3388  }
3389  return;
3390  }
3391  }
3392 
3393  // Combine the incoming data with the matrix's current data.
3394  LO numCombd = 0;
3395  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
3396  const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
3397  (const_cast<const impl_scalar_type*> (valsOut.data ()));
3398  if (combineMode == ADD) {
3399  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3400  } else if (combineMode == INSERT || combineMode == REPLACE) {
3401  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3402  } else if (combineMode == ABSMAX) {
3403  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
3404  }
3405 
3406  if (static_cast<LO> (numEnt) != numCombd) {
3407  errorDuringUnpack() = 1;
3408  if (verbose) {
3409  std::ostringstream os;
3410  os << prefix << "At i = " << i << ", numEnt = " << numEnt
3411  << " != numCombd = " << numCombd << "."
3412  << endl;
3413  std::cerr << os.str ();
3414  }
3415  return;
3416  }
3417  }); // for each import LID i
3418  }
3419 
3420  if (errorDuringUnpack () != 0) {
3421  std::ostream& err = this->markLocalErrorAndGetStream ();
3422  err << prefix << "Unpacking failed.";
3423  if (! verbose) {
3424  err << " Please run again with the environment variable setting "
3425  "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
3426  }
3427  err << endl;
3428  }
3429 
3430  if (verbose) {
3431  std::ostringstream os;
3432  const bool lclSuccess = ! (* (this->localError_));
3433  os << prefix
3434  << (lclSuccess ? "succeeded" : "FAILED")
3435  << std::endl;
3436  std::cerr << os.str ();
3437  }
3438  }
3439 
3440  template<class Scalar, class LO, class GO, class Node>
3441  std::string
// One-line, YAML-like summary of this matrix: the four template
// parameters' type names, the object's label, the global dimensions
// (taken from the mesh graph's domain and range Maps), and the block
// size.  Returns the assembled string; prints nothing itself.
// NOTE(review): the Doxygen extraction dropped the qualified
// declarator line (BlockCrsMatrix<...>::description () const) that
// belongs between these lines in the original file.
3443  {
3444  using Teuchos::TypeNameTraits;
3445  std::ostringstream os;
// NOTE(review): there are no ", " separators between the Scalar, LO,
// GO, and Node entries below, so the four type names run together in
// the output -- presumably unintended; confirm before changing the
// string, since tools may parse this exact format.
3446  os << "\"Tpetra::BlockCrsMatrix\": { "
3447  << "Template parameters: { "
3448  << "Scalar: " << TypeNameTraits<Scalar>::name ()
3449  << "LO: " << TypeNameTraits<LO>::name ()
3450  << "GO: " << TypeNameTraits<GO>::name ()
3451  << "Node: " << TypeNameTraits<Node>::name ()
3452  << " }"
3453  << ", Label: \"" << this->getObjectLabel () << "\""
3454  << ", Global dimensions: ["
3455  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3456  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
3457  << ", Block size: " << getBlockSize ()
3458  << " }";
3459  return os.str ();
3460  }
3461 
3462 
// Print a description of this matrix to the given output stream, with
// detail controlled by verbLevel:
//   VERB_NONE    - print nothing;
//   VERB_LOW     - template parameters, label, dimensions, block size
//                  (VERB_DEFAULT maps to this);
//   VERB_MEDIUM  - additionally describe the row/column/domain/range
//                  Maps (collapsing duplicates);
//   VERB_EXTREME - additionally print every entry of the matrix, with
//                  all processes' data funneled to Process 0 via
//                  point-to-point messages so output is ordered by rank.
// NOTE(review): the Doxygen extraction dropped the qualified declarator
// line (BlockCrsMatrix<...>::describe) before the parameter list below.
3463  template<class Scalar, class LO, class GO, class Node>
3464  void
3466  describe (Teuchos::FancyOStream& out,
3467  const Teuchos::EVerbosityLevel verbLevel) const
3468  {
3469  using Teuchos::ArrayRCP;
3470  using Teuchos::CommRequest;
3471  using Teuchos::FancyOStream;
3472  using Teuchos::getFancyOStream;
3473  using Teuchos::ireceive;
3474  using Teuchos::isend;
3475  using Teuchos::outArg;
3476  using Teuchos::TypeNameTraits;
3477  using Teuchos::VERB_DEFAULT;
3478  using Teuchos::VERB_NONE;
3479  using Teuchos::VERB_LOW;
3480  using Teuchos::VERB_MEDIUM;
3481  // using Teuchos::VERB_HIGH;
3482  using Teuchos::VERB_EXTREME;
3483  using Teuchos::RCP;
3484  using Teuchos::wait;
3485  using std::endl;
3486 #ifdef HAVE_TPETRA_DEBUG
3487  const char prefix[] = "Tpetra::Experimental::BlockCrsMatrix::describe: ";
3488 #endif // HAVE_TPETRA_DEBUG
3489 
3490  // Set default verbosity if applicable.
3491  const Teuchos::EVerbosityLevel vl =
3492  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
3493 
3494  if (vl == VERB_NONE) {
3495  return; // print nothing
3496  }
3497 
3498  // describe() always starts with a tab before it prints anything.
3499  Teuchos::OSTab tab0 (out);
3500 
3501  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
3502  Teuchos::OSTab tab1 (out);
3503 
3504  out << "Template parameters:" << endl;
3505  {
3506  Teuchos::OSTab tab2 (out);
3507  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
3508  << "LO: " << TypeNameTraits<LO>::name () << endl
3509  << "GO: " << TypeNameTraits<GO>::name () << endl
3510  << "Node: " << TypeNameTraits<Node>::name () << endl;
3511  }
3512  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
3513  << "Global dimensions: ["
3514  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
3515  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
3516 
3517  const LO blockSize = getBlockSize ();
3518  out << "Block size: " << blockSize << endl;
3519 
3520  // constituent objects
3521  if (vl >= VERB_MEDIUM) {
3522  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3523  const int myRank = comm.getRank ();
3524  if (myRank == 0) {
3525  out << "Row Map:" << endl;
3526  }
3527  getRowMap()->describe (out, vl);
3528 
// If a Map is identical (same RCP) to one already printed, just say
// so instead of describing it twice.
3529  if (! getColMap ().is_null ()) {
3530  if (getColMap() == getRowMap()) {
3531  if (myRank == 0) {
3532  out << "Column Map: same as row Map" << endl;
3533  }
3534  }
3535  else {
3536  if (myRank == 0) {
3537  out << "Column Map:" << endl;
3538  }
3539  getColMap ()->describe (out, vl);
3540  }
3541  }
3542  if (! getDomainMap ().is_null ()) {
3543  if (getDomainMap () == getRowMap ()) {
3544  if (myRank == 0) {
3545  out << "Domain Map: same as row Map" << endl;
3546  }
3547  }
3548  else if (getDomainMap () == getColMap ()) {
3549  if (myRank == 0) {
3550  out << "Domain Map: same as column Map" << endl;
3551  }
3552  }
3553  else {
3554  if (myRank == 0) {
3555  out << "Domain Map:" << endl;
3556  }
3557  getDomainMap ()->describe (out, vl);
3558  }
3559  }
3560  if (! getRangeMap ().is_null ()) {
3561  if (getRangeMap () == getDomainMap ()) {
3562  if (myRank == 0) {
3563  out << "Range Map: same as domain Map" << endl;
3564  }
3565  }
3566  else if (getRangeMap () == getRowMap ()) {
3567  if (myRank == 0) {
3568  out << "Range Map: same as row Map" << endl;
3569  }
3570  }
3571  else {
3572  if (myRank == 0) {
3573  out << "Range Map: " << endl;
3574  }
3575  getRangeMap ()->describe (out, vl);
3576  }
3577  }
3578  }
3579 
3580  if (vl >= VERB_EXTREME) {
3581  // FIXME (mfh 26 May 2016) It's not nice for this method to sync
3582  // to host, since it's supposed to be const. However, that's
3583  // the easiest and least memory-intensive way to implement this
3584  // method.
3585  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_type;
3586  const_cast<this_type*> (this)->sync_host ();
3587 
3588 #ifdef HAVE_TPETRA_DEBUG
3589  TEUCHOS_TEST_FOR_EXCEPTION
3590  (this->need_sync_host (), std::logic_error,
3591  prefix << "Right after sync to host, the matrix claims that it needs "
3592  "sync to host. Please report this bug to the Tpetra developers.");
3593 #endif // HAVE_TPETRA_DEBUG
3594 
3595  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
3596  const int myRank = comm.getRank ();
3597  const int numProcs = comm.getSize ();
3598 
3599  // Print the calling process' data to the given output stream.
// Each process first formats its rows into a local string buffer;
// only Process 0 writes to 'out' directly.
3600  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
3601  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
3602  FancyOStream& os = *osPtr;
3603  os << "Process " << myRank << ":" << endl;
3604  Teuchos::OSTab tab2 (os);
3605 
3606  const map_type& meshRowMap = * (graph_.getRowMap ());
3607  const map_type& meshColMap = * (graph_.getColMap ());
3608  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
3609  meshLclRow <= meshRowMap.getMaxLocalIndex ();
3610  ++meshLclRow) {
3611  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
3612  os << "Row " << meshGblRow << ": {";
3613 
3614  const LO* lclColInds = NULL;
3615  Scalar* vals = NULL;
3616  LO numInds = 0;
3617  this->getLocalRowView (meshLclRow, lclColInds, vals, numInds);
3618 
// Each (row, col) entry is a blockSize x blockSize dense block,
// stored row-major within vals; print it as [r0; r1; ...] with
// entries within a block row separated by ", ".
3619  for (LO k = 0; k < numInds; ++k) {
3620  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
3621 
3622  os << "Col " << gblCol << ": [";
3623  for (LO i = 0; i < blockSize; ++i) {
3624  for (LO j = 0; j < blockSize; ++j) {
3625  os << vals[blockSize*blockSize*k + i*blockSize + j];
3626  if (j + 1 < blockSize) {
3627  os << ", ";
3628  }
3629  }
3630  if (i + 1 < blockSize) {
3631  os << "; ";
3632  }
3633  }
3634  os << "]";
3635  if (k + 1 < numInds) {
3636  os << ", ";
3637  }
3638  }
3639  os << "}" << endl;
3640  }
3641 
3642  // Print data on Process 0. This will automatically respect the
3643  // current indentation level.
3644  if (myRank == 0) {
3645  out << lclOutStrPtr->str ();
3646  lclOutStrPtr = Teuchos::null; // clear it to save space
3647  }
3648 
// Arbitrary but distinct message tags for the size and data messages.
3649  const int sizeTag = 1337;
3650  const int dataTag = 1338;
3651 
3652  ArrayRCP<char> recvDataBuf; // only used on Process 0
3653 
3654  // Send string sizes and data from each process in turn to
3655  // Process 0, and print on that process.
3656  for (int p = 1; p < numProcs; ++p) {
3657  if (myRank == 0) {
3658  // Receive the incoming string's length.
3659  ArrayRCP<size_t> recvSize (1);
3660  recvSize[0] = 0;
3661  RCP<CommRequest<int> > recvSizeReq =
3662  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
3663  wait<int> (comm, outArg (recvSizeReq));
3664  const size_t numCharsToRecv = recvSize[0];
3665 
3666  // Allocate space for the string to receive. Reuse receive
3667  // buffer space if possible. We can do this because in the
3668  // current implementation, we only have one receive in
3669  // flight at a time. Leave space for the '\0' at the end,
3670  // in case the sender doesn't send it.
3671  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
3672  recvDataBuf.resize (numCharsToRecv + 1);
3673  }
3674  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
3675  // Post the receive of the actual string data.
3676  RCP<CommRequest<int> > recvDataReq =
3677  ireceive<int, char> (recvData, p, dataTag, comm);
3678  wait<int> (comm, outArg (recvDataReq));
3679 
3680  // Print the received data. This will respect the current
3681  // indentation level. Make sure that the string is
3682  // null-terminated.
3683  recvDataBuf[numCharsToRecv] = '\0';
3684  out << recvDataBuf.getRawPtr ();
3685  }
3686  else if (myRank == p) { // if I am not Process 0, and my rank is p
3687  // This deep-copies the string at most twice, depending on
3688  // whether std::string reference counts internally (it
3689  // generally does, so this won't deep-copy at all).
3690  const std::string stringToSend = lclOutStrPtr->str ();
3691  lclOutStrPtr = Teuchos::null; // clear original to save space
3692 
3693  // Send the string's length to Process 0.
3694  const size_t numCharsToSend = stringToSend.size ();
3695  ArrayRCP<size_t> sendSize (1);
3696  sendSize[0] = numCharsToSend;
3697  RCP<CommRequest<int> > sendSizeReq =
3698  isend<int, size_t> (sendSize, 0, sizeTag, comm);
3699  wait<int> (comm, outArg (sendSizeReq));
3700 
3701  // Send the actual string to Process 0. We know that the
3702  // string has length > 0, so it's safe to take the address
3703  // of the first entry. Make a nonowning ArrayRCP to hold
3704  // the string. Process 0 will add a null termination
3705  // character at the end of the string, after it receives the
3706  // message.
3707  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
3708  RCP<CommRequest<int> > sendDataReq =
3709  isend<int, char> (sendData, 0, dataTag, comm);
3710  wait<int> (comm, outArg (sendDataReq));
3711  }
3712  } // for each process rank p other than 0
3713  } // extreme verbosity level (print the whole matrix)
3714  }
3715 
// The communicator over which this matrix is distributed.
// Simply forwards to the mesh graph's communicator.
3716  template<class Scalar, class LO, class GO, class Node>
3717  Teuchos::RCP<const Teuchos::Comm<int> >
3719  getComm() const
3720  {
3721  return graph_.getComm();
3722  }
3723 
3724 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// DEPRECATED: the Kokkos Node object.  Forwards to the mesh graph;
// only compiled when deprecated code is enabled.
3725  template<class Scalar, class LO, class GO, class Node>
3726  TPETRA_DEPRECATED
3727  Teuchos::RCP<Node>
3729  getNode() const
3730  {
3731  return graph_.getNode();
3732 
3733  }
3734 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3735 
// The global number of (mesh) columns; forwarded from the mesh graph.
3736  template<class Scalar, class LO, class GO, class Node>
3740  {
3741  return graph_.getGlobalNumCols();
3742  }
3743 
// The number of (mesh) columns on the calling process; forwarded from
// the mesh graph.
3744  template<class Scalar, class LO, class GO, class Node>
3745  size_t
3748  {
3749  return graph_.getNodeNumCols();
3750  }
3751 
// The index base for global indices; forwarded from the mesh graph.
3752  template<class Scalar, class LO, class GO, class Node>
3753  GO
3756  {
3757  return graph_.getIndexBase();
3758  }
3759 
// The global number of stored (block) entries; forwarded from the mesh
// graph, so this counts blocks, not scalar entries.
3760  template<class Scalar, class LO, class GO, class Node>
3764  {
3765  return graph_.getGlobalNumEntries();
3766  }
3767 
// The number of stored (block) entries on the calling process;
// forwarded from the mesh graph, so this counts blocks, not scalars.
3768  template<class Scalar, class LO, class GO, class Node>
3769  size_t
3772  {
3773  return graph_.getNodeNumEntries();
3774  }
3775 
// The number of (block) entries in the given global (mesh) row, on the
// calling process; forwarded from the mesh graph.
3776  template<class Scalar, class LO, class GO, class Node>
3777  size_t
3779  getNumEntriesInGlobalRow (GO globalRow) const
3780  {
3781  return graph_.getNumEntriesInGlobalRow(globalRow);
3782  }
3783 
3784 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// DEPRECATED diagonal-count queries (see Trilinos issue #2630).  Both
// dispatch through the graph's deprecated-methods interface via
// dynamic_cast; only compiled when deprecated code is enabled.
3785  template<class Scalar, class LO, class GO, class Node>
3786  global_size_t TPETRA_DEPRECATED
3788  getGlobalNumDiags () const
3789  {
3790  using HDM = Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3791  return dynamic_cast<const HDM&> (this->graph_).getGlobalNumDiagsImpl ();
3792  }
3793 
3794  template<class Scalar, class LO, class GO, class Node>
3795  size_t TPETRA_DEPRECATED
3796  BlockCrsMatrix<Scalar, LO, GO, Node>::
3797  getNodeNumDiags () const
3798  {
3799  using HDM = Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3800  return dynamic_cast<const HDM&> (this->graph_).getNodeNumDiagsImpl ();
3801  }
3802 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3803 
// The maximum number of (block) entries in any row, over all processes;
// forwarded from the mesh graph.
3804  template<class Scalar, class LO, class GO, class Node>
3805  size_t
3808  {
3809  return graph_.getGlobalMaxNumRowEntries();
3810  }
3811 
// Whether the mesh graph has a column Map.
3812  template<class Scalar, class LO, class GO, class Node>
3813  bool
3815  hasColMap() const
3816  {
3817  return graph_.hasColMap();
3818  }
3819 
3820 #ifdef TPETRA_ENABLE_DEPRECATED_CODE
// DEPRECATED triangular-structure queries (see Trilinos issue #2630).
// Both dispatch through the graph's deprecated-methods interface via
// dynamic_cast; only compiled when deprecated code is enabled.
3821  template<class Scalar, class LO, class GO, class Node>
3822  bool TPETRA_DEPRECATED
3824  isLowerTriangular () const
3825  {
3826  using HDM = ::Tpetra::Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3827  return dynamic_cast<const HDM&> (this->graph_).isLowerTriangularImpl ();
3828  }
3829 
3830  template<class Scalar, class LO, class GO, class Node>
3831  bool TPETRA_DEPRECATED
3832  BlockCrsMatrix<Scalar, LO, GO, Node>::
3833  isUpperTriangular () const
3834  {
3835  using HDM = ::Tpetra::Details::HasDeprecatedMethods2630_WarningThisClassIsNotForUsers;
3836  return dynamic_cast<const HDM&> (this->graph_).isUpperTriangularImpl ();
3837  }
3838 #endif // TPETRA_ENABLE_DEPRECATED_CODE
3839 
// Whether the mesh graph's indices are locally indexed.
3840  template<class Scalar, class LO, class GO, class Node>
3841  bool
3844  {
3845  return graph_.isLocallyIndexed();
3846  }
3847 
// Whether the mesh graph's indices are globally indexed.
3848  template<class Scalar, class LO, class GO, class Node>
3849  bool
3852  {
3853  return graph_.isGloballyIndexed();
3854  }
3855 
// Whether the mesh graph is fill complete.
3856  template<class Scalar, class LO, class GO, class Node>
3857  bool
3860  {
3861  return graph_.isFillComplete ();
3862  }
3863 
// This class does not implement the RowMatrix row-view interface
// (getLocalRowView / getGlobalRowView with ArrayView arguments), so
// always report false.
3864  template<class Scalar, class LO, class GO, class Node>
3865  bool
3868  {
3869  return false;
3870  }
3871 
3872 
// Not supported: BlockCrsMatrix does not do global matrix indexing.
// Always throws std::logic_error; parameters are intentionally unnamed.
3873  template<class Scalar, class LO, class GO, class Node>
3874  void
3876  getGlobalRowCopy (GO /* GlobalRow */,
3877  const Teuchos::ArrayView<GO> &/* Indices */,
3878  const Teuchos::ArrayView<Scalar> &/* Values */,
3879  size_t &/* NumEntries */) const
3880  {
3881  TEUCHOS_TEST_FOR_EXCEPTION(
3882  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowCopy: "
3883  "This class doesn't support global matrix indexing.");
3884 
3885  }
3886 
// Not supported: BlockCrsMatrix does not do global matrix indexing.
// Always throws std::logic_error; parameters are intentionally unnamed.
3887  template<class Scalar, class LO, class GO, class Node>
3888  void
3890  getGlobalRowView (GO /* GlobalRow */,
3891  Teuchos::ArrayView<const GO> &/* indices */,
3892  Teuchos::ArrayView<const Scalar> &/* values */) const
3893  {
3894  TEUCHOS_TEST_FOR_EXCEPTION(
3895  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getGlobalRowView: "
3896  "This class doesn't support global matrix indexing.");
3897 
3898  }
3899 
// Not supported: this ArrayView-based RowMatrix overload of
// getLocalRowView always throws std::logic_error.  (The usable
// pointer-based overload taking raw LO*/Scalar* is defined elsewhere
// in this file.)
3900  template<class Scalar, class LO, class GO, class Node>
3901  void
3903  getLocalRowView (LO /* LocalRow */,
3904  Teuchos::ArrayView<const LO>& /* indices */,
3905  Teuchos::ArrayView<const Scalar>& /* values */) const
3906  {
3907  TEUCHOS_TEST_FOR_EXCEPTION(
3908  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getLocalRowView: "
3909  "This class doesn't support local matrix indexing.");
3910  }
3911 
// Copy the scalar (point) diagonal entries of this matrix into the
// given vector: for each local mesh row r, the diagonal of its
// blockSize x blockSize diagonal block is written to point rows
// r*blockSize .. r*blockSize + blockSize - 1.  The work happens on
// host (the vector is marked modified on host, filled, then synced
// back to device).  Requires the matrix's values to be sync'd to host
// (checked in debug builds).
// NOTE(review): the Doxygen extraction dropped the qualified
// declarator line (with the vector parameter, presumably named
// 'diag') that belongs just above the opening brace -- confirm
// against the original file.
3912  template<class Scalar, class LO, class GO, class Node>
3913  void
3916  {
3917 #ifdef HAVE_TPETRA_DEBUG
3918  const char prefix[] =
3919  "Tpetra::Experimental::BlockCrsMatrix::getLocalDiagCopy: ";
3920 #endif // HAVE_TPETRA_DEBUG
3921 
3922  const size_t lclNumMeshRows = graph_.getNodeNumRows ();
3923 
// Ask the graph where each row's diagonal block lives (device View),
// then mirror the offsets to host, since the fill loop below runs on
// host.
3924  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3925  graph_.getLocalDiagOffsets (diagOffsets);
3926 
3927  // The code below works on host, so use a host View.
3928  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3929  Kokkos::deep_copy (diagOffsetsHost, diagOffsets);
3930  // We're filling diag on host for now.
3931  diag.template modify<typename decltype (diagOffsetsHost)::memory_space> ();
3932 
3933 #ifdef HAVE_TPETRA_DEBUG
3934  TEUCHOS_TEST_FOR_EXCEPTION
3935  (this->need_sync_host (), std::runtime_error,
3936  prefix << "The matrix's data were last modified on device, but have "
3937  "not been sync'd to host. Please sync to host (by calling "
3938  "sync<Kokkos::HostSpace>() on this matrix) before calling this "
3939  "method.");
3940 #endif // HAVE_TPETRA_DEBUG
3941 
3942  auto vals_host_out = getValuesHost ();
3943  Scalar* vals_host_out_raw =
3944  reinterpret_cast<Scalar*> (vals_host_out.data ());
3945 
3946  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3947  size_t rowOffset = 0;
3948  size_t offset = 0;
3949  LO bs = getBlockSize();
3950  for(size_t r=0; r<getNodeNumRows(); r++)
3951  {
3952  // move pointer to start of diagonal block
3953  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3954  for(int b=0; b<bs; b++)
3955  {
// Within a row-major bs x bs block, successive diagonal entries are
// bs+1 scalars apart.
3956  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3957  }
3958  // move pointer to start of next block row
3959  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3960  }
3961 
3962  diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3963  }
3964 
// Not implemented for BlockCrsMatrix: always throws std::logic_error.
3965  template<class Scalar, class LO, class GO, class Node>
3966  void
3968  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3969  {
3970  TEUCHOS_TEST_FOR_EXCEPTION(
3971  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::leftScale: "
3972  "not implemented.");
3973 
3974  }
3975 
// Not implemented for BlockCrsMatrix: always throws std::logic_error.
3976  template<class Scalar, class LO, class GO, class Node>
3977  void
3979  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3980  {
3981  TEUCHOS_TEST_FOR_EXCEPTION(
3982  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::rightScale: "
3983  "not implemented.");
3984 
3985  }
3986 
// The (mesh) graph as a RowGraph.  Returns the RCP member rather than
// wrapping graph_ on the fly, so the caller shares ownership.
3987  template<class Scalar, class LO, class GO, class Node>
3988  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3990  getGraph() const
3991  {
3992  return graphRCP_;
3993  }
3994 
// Not implemented for BlockCrsMatrix: always throws std::logic_error,
// so the (declared) mag_type return value is never produced.
3995  template<class Scalar, class LO, class GO, class Node>
3996  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3999  {
4000  TEUCHOS_TEST_FOR_EXCEPTION(
4001  true, std::logic_error, "Tpetra::Experimental::BlockCrsMatrix::getFrobeniusNorm: "
4002  "not implemented.");
4003  }
4004 
4005 } // namespace Experimental
4006 } // namespace Tpetra
4007 
4008 //
4009 // Explicit instantiation macro
4010 //
4011 // Must be expanded from within the Tpetra namespace!
4012 //
4013 #define TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
4014  namespace Experimental { \
4015  template class BlockCrsMatrix< S, LO, GO, NODE >; \
4016  }
4017 
4018 #endif // TPETRA_EXPERIMENTAL_BLOCKCRSMATRIX_DEF_HPP
LO absMaxLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValuesByOffsets, but for the ABSMAX combine mode.
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the left with the given Vector x.
virtual bool supportsRowViews() const
Whether this object implements getLocalRowView() and getGlobalRowView().
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
Teuchos::RCP< const map_type > getColMap() const
get the (mesh) map for the columns of this block matrix.
KOKKOS_INLINE_FUNCTION void GEMV(const CoeffType &alpha, const BlkType &A, const VecType1 &x, const VecType2 &y)
y := y + alpha * A * x (dense matrix-vector multiply)
KOKKOS_INLINE_FUNCTION void SCAL(const CoefficientType &alpha, const ViewType &x)
x := alpha*x, where x is either rank 1 (a vector) or rank 2 (a matrix).
Kokkos::View< impl_scalar_type *, Kokkos::LayoutRight, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_vec_type
The type used to access nonconst vector blocks.
Teuchos::RCP< const map_type > getRowMap() const
get the (mesh) map for the rows of this block matrix.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
size_t getNodeNumRows() const
get the local number of block rows
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
One or more distributed dense vectors.
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x)
Scale the RowMatrix on the right with the given Vector x.
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const
Print a description of this object to the given output stream.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
MultiVector for multiple degrees of freedom per mesh point.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
global_size_t getGlobalNumRows() const
get the global number of block rows
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
KOKKOS_INLINE_FUNCTION void FILL(const ViewType &x, const InputType &val)
Set every entry of x to val.
LocalOrdinal getMinLocalIndex() const
The minimum local index.
little_vec_type::HostMirror getLocalBlock(const LO localRowIndex, const LO colIndex) const
Get a host view of the degrees of freedom at the given mesh point.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Like sumIntoLocalValues, but for the ABSMAX combine mode.
Teuchos::RCP< const map_type > getRangeMap() const
Get the (point) range Map of this matrix.
int local_ordinal_type
Default value of Scalar template parameter.
virtual bool hasColMap() const
Whether this matrix has a well-defined column Map.
Teuchos::RCP< const map_type > getDomainMap() const
Get the (point) domain Map of this matrix.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const
Get the (mesh) graph.
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
virtual GO getIndexBase() const
The index base for global indices in this matrix.
virtual bool isGloballyIndexed() const
Whether matrix indices are globally indexed.
size_t global_size_t
Global size_t object.
size_t getNodeNumEntries() const override
The local number of entries in the graph.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
Insert new values that don't currently exist.
std::string description() const
One-line description of this object.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Linear algebra kernels for small dense matrices and vectors.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
virtual global_size_t getGlobalNumCols() const
The global number of columns of this matrix.
ESweepDirection
Sweep direction for Gauss-Seidel or Successive Over-Relaxation (SOR).
virtual bool isFillComplete() const
Whether fillComplete() has been called.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, execution_space > local_graph_type
The type of the part of the sparse graph on each MPI process.
LO getNumVectors() const
Get the number of columns (vectors) in the BlockMultiVector.
Sets up and executes a communication plan for a Tpetra DistObject.
GlobalOrdinal getGlobalElement(LocalOrdinal localIndex) const
The global index corresponding to the given local index.
CombineMode
Rule for combining data in an Import or Export.
Sum new values into existing values.
virtual size_t getNodeNumCols() const
The number of columns needed to apply the forward operator on this node.
virtual size_t getNodeNumEntries() const
The local number of stored (structurally nonzero) entries.
CrsGraphType::global_ordinal_type getGlobalNumDiags(const CrsGraphType &G)
Number of populated diagonal entries in the given sparse graph, over all processes in the graph's (MP...
virtual global_size_t getGlobalNumEntries() const
The global number of stored (structurally nonzero) entries.
virtual void getGlobalRowCopy(GO GlobalRow, const Teuchos::ArrayView< GO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Get a copy of the given global row&#39;s entries.
void getLocalRowCopy(LO LocalRow, const Teuchos::ArrayView< LO > &Indices, const Teuchos::ArrayView< Scalar > &Values, size_t &NumEntries) const
Not implemented.
Replace old value with maximum of magnitudes of old and new values.
LO getLocalRowView(const LO localRowInd, const LO *&colInds, Scalar *&vals, LO &numInds) const
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
Replace existing values with new values.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
Replace old values with zero.
void localGaussSeidel(const BlockMultiVector< Scalar, LO, GO, Node > &Residual, BlockMultiVector< Scalar, LO, GO, Node > &Solution, const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &D_inv, const Scalar &omega, const ESweepDirection direction) const
Local Gauss-Seidel solve, given a factorized diagonal.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual size_t getGlobalMaxNumRowEntries() const
The maximum number of entries in any row over all processes in the matrix's communicator.
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
void reorderedGaussSeidelCopy(::Tpetra::MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Teuchos::ArrayView< LO > &rowIndices, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of reorderedGaussSeidel(), with fewer requirements on X.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
A distributed dense vector.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const
The Frobenius norm of the matrix.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const
The communicator over which this matrix is distributed.
void gaussSeidelCopy(MultiVector< Scalar, LO, GO, Node > &X, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &B, const ::Tpetra::MultiVector< Scalar, LO, GO, Node > &D, const Scalar &dampingFactor, const ESweepDirection direction, const int numSweeps, const bool zeroInitialGuess) const
Version of gaussSeidel(), with fewer requirements on X.
local_graph_type getLocalGraph() const
Get the local graph.
size_t getNodeMaxNumRowEntries() const
Maximum number of entries in any row of the matrix, on this process.
LocalOrdinal getMaxLocalIndex() const
The maximum local index on the calling process.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const
The current number of entries on the calling process in the specified global row. ...
size_t getNumEntriesInLocalRow(const LO localRowInd) const
Return the number of entries in the given row on the calling process.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value) const
Replace current value at the specified location with specified values.
virtual void getGlobalRowView(GO GlobalRow, Teuchos::ArrayView< const GO > &indices, Teuchos::ArrayView< const Scalar > &values) const
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
virtual bool isLocallyIndexed() const
Whether matrix indices are locally indexed.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.