Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 // clang-format off
40 
41 #ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
42 #define TPETRA_BLOCKCRSMATRIX_DEF_HPP
43 
46 
50 #include "Tpetra_BlockMultiVector.hpp"
51 #include "Tpetra_BlockView.hpp"
52 
53 #include "Teuchos_TimeMonitor.hpp"
54 #ifdef HAVE_TPETRA_DEBUG
55 # include <set>
56 #endif // HAVE_TPETRA_DEBUG
57 
58 #include "KokkosSparse_spmv.hpp"
59 
60 namespace Tpetra {
61 
62 namespace Impl {
63 
64  template<typename T>
65  struct BlockCrsRowStruct {
66  T totalNumEntries, totalNumBytes, maxRowLength;
67  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
68  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
69  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
70  : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
71  };
72 
73  template<typename T>
74  static
75  KOKKOS_INLINE_FUNCTION
76  void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
77  a.totalNumEntries += b.totalNumEntries;
78  a.totalNumBytes += b.totalNumBytes;
79  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
80  }
81 
82  template<typename T, typename ExecSpace>
83  struct BlockCrsReducer {
84  typedef BlockCrsReducer reducer;
85  typedef T value_type;
86  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
87  value_type *value;
88 
89  KOKKOS_INLINE_FUNCTION
90  BlockCrsReducer(value_type &val) : value(&val) {}
91 
92  KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
93  KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const { dst += src; }
94  KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
95  KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
96  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
97  };
98 
99 } // namespace Impl
100 
101 namespace { // (anonymous)
102 
103 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
104 // version that takes two Kokkos::View arguments).
105 template<class Scalar, class LO, class GO, class Node>
106 class GetLocalDiagCopy {
107 public:
108  typedef typename Node::device_type device_type;
109  typedef size_t diag_offset_type;
110  typedef Kokkos::View<const size_t*, device_type,
111  Kokkos::MemoryUnmanaged> diag_offsets_type;
112  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
113  typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
114  typedef typename local_graph_device_type::row_map_type row_offsets_type;
115  typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
116  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
117  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
118 
119  // Constructor
120  GetLocalDiagCopy (const diag_type& diag,
121  const values_type& val,
122  const diag_offsets_type& diagOffsets,
123  const row_offsets_type& ptr,
124  const LO blockSize) :
125  diag_ (diag),
126  diagOffsets_ (diagOffsets),
127  ptr_ (ptr),
128  blockSize_ (blockSize),
129  offsetPerBlock_ (blockSize_*blockSize_),
130  val_(val)
131  {}
132 
133  KOKKOS_INLINE_FUNCTION void
134  operator() (const LO& lclRowInd) const
135  {
136  using Kokkos::ALL;
137 
138  // Get row offset
139  const size_t absOffset = ptr_[lclRowInd];
140 
141  // Get offset relative to start of row
142  const size_t relOffset = diagOffsets_[lclRowInd];
143 
144  // Get the total offset
145  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
146 
147  // Get a view of the block. BCRS currently uses LayoutRight
148  // regardless of the device.
149  typedef Kokkos::View<const IST**, Impl::BlockCrsMatrixLittleBlockArrayLayout,
150  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
151  const_little_block_type;
152  const_little_block_type D_in (val_.data () + pointOffset,
153  blockSize_, blockSize_);
154  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
155  COPY (D_in, D_out);
156  }
157 
158  private:
159  diag_type diag_;
160  diag_offsets_type diagOffsets_;
161  row_offsets_type ptr_;
162  LO blockSize_;
163  LO offsetPerBlock_;
164  values_type val_;
165  };
166 } // namespace (anonymous)
167 
168  template<class Scalar, class LO, class GO, class Node>
169  std::ostream&
170  BlockCrsMatrix<Scalar, LO, GO, Node>::
171  markLocalErrorAndGetStream ()
172  {
173  * (this->localError_) = true;
174  if ((*errs_).is_null ()) {
175  *errs_ = Teuchos::rcp (new std::ostringstream ());
176  }
177  return **errs_;
178  }
179 
180  template<class Scalar, class LO, class GO, class Node>
183  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
184  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
185  blockSize_ (static_cast<LO> (0)),
186  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
187  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
188  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
189  offsetPerBlock_ (0),
190  localError_ (new bool (false)),
191  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
192  {
193  }
194 
195  template<class Scalar, class LO, class GO, class Node>
198  const LO blockSize) :
199  dist_object_type (graph.getMap ()),
200  graph_ (graph),
201  rowMeshMap_ (* (graph.getRowMap ())),
202  blockSize_ (blockSize),
203  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
204  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
205  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
206  offsetPerBlock_ (blockSize * blockSize),
207  localError_ (new bool (false)),
208  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
209  {
210 
212  TEUCHOS_TEST_FOR_EXCEPTION(
213  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
214  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
215  "rows (isSorted() is false). This class assumes sorted rows.");
216 
217  graphRCP_ = Teuchos::rcpFromRef(graph_);
218 
219  // Trick to test whether LO is nonpositive, without a compiler
220  // warning in case LO is unsigned (which is generally a bad idea
221  // anyway). I don't promise that the trick works, but it
222  // generally does with gcc at least, in my experience.
223  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
224  TEUCHOS_TEST_FOR_EXCEPTION(
225  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
226  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
227  " <= 0. The block size must be positive.");
228 
229  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
230  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
231 
232  {
233  // These are rcp
234  const auto domainPointMap = getDomainMap();
235  const auto colPointMap = Teuchos::rcp
236  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
237  *pointImporter_ = Teuchos::rcp
238  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
239  }
240  {
241  auto local_graph_h = graph.getLocalGraphHost ();
242  auto ptr_h = local_graph_h.row_map;
243  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
244  Kokkos::deep_copy(ptrHost_, ptr_h);
245 
246  auto ind_h = local_graph_h.entries;
247  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
248  // DEEP_COPY REVIEW - HOST-TO-HOST
249  Kokkos::deep_copy (indHost_, ind_h);
250 
251  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
252  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
253  }
254  }
255 
256  template<class Scalar, class LO, class GO, class Node>
259  const typename local_matrix_device_type::values_type& values,
260  const LO blockSize) :
261  dist_object_type (graph.getMap ()),
262  graph_ (graph),
263  rowMeshMap_ (* (graph.getRowMap ())),
264  blockSize_ (blockSize),
265  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
266  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
267  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
268  offsetPerBlock_ (blockSize * blockSize),
269  localError_ (new bool (false)),
270  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
271  {
273  TEUCHOS_TEST_FOR_EXCEPTION(
274  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
275  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
276  "rows (isSorted() is false). This class assumes sorted rows.");
277 
278  graphRCP_ = Teuchos::rcpFromRef(graph_);
279 
280  // Trick to test whether LO is nonpositive, without a compiler
281  // warning in case LO is unsigned (which is generally a bad idea
282  // anyway). I don't promise that the trick works, but it
283  // generally does with gcc at least, in my experience.
284  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
285  TEUCHOS_TEST_FOR_EXCEPTION(
286  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
287  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
288  " <= 0. The block size must be positive.");
289 
290  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
291  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
292 
293  {
294  // These are rcp
295  const auto domainPointMap = getDomainMap();
296  const auto colPointMap = Teuchos::rcp
297  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
298  *pointImporter_ = Teuchos::rcp
299  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
300  }
301  {
302  auto local_graph_h = graph.getLocalGraphHost ();
303  auto ptr_h = local_graph_h.row_map;
304  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
305  Kokkos::deep_copy(ptrHost_, ptr_h);
306 
307  auto ind_h = local_graph_h.entries;
308  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
309  Kokkos::deep_copy (indHost_, ind_h);
310 
311  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
312  TEUCHOS_ASSERT_EQUALITY(numValEnt, values.size());
313  val_ = decltype (val_) (values);
314  }
315  }
316 
317  template<class Scalar, class LO, class GO, class Node>
320  const map_type& domainPointMap,
321  const map_type& rangePointMap,
322  const LO blockSize) :
323  dist_object_type (graph.getMap ()),
324  graph_ (graph),
325  rowMeshMap_ (* (graph.getRowMap ())),
326  domainPointMap_ (domainPointMap),
327  rangePointMap_ (rangePointMap),
328  blockSize_ (blockSize),
329  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
330  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
331  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
332  offsetPerBlock_ (blockSize * blockSize),
333  localError_ (new bool (false)),
334  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
335  {
336  TEUCHOS_TEST_FOR_EXCEPTION(
337  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
338  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
339  "rows (isSorted() is false). This class assumes sorted rows.");
340 
341  graphRCP_ = Teuchos::rcpFromRef(graph_);
342 
343  // Trick to test whether LO is nonpositive, without a compiler
344  // warning in case LO is unsigned (which is generally a bad idea
345  // anyway). I don't promise that the trick works, but it
346  // generally does with gcc at least, in my experience.
347  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
348  TEUCHOS_TEST_FOR_EXCEPTION(
349  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
350  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
351  " <= 0. The block size must be positive.");
352  {
353  // These are rcp
354  const auto rcpDomainPointMap = getDomainMap();
355  const auto colPointMap = Teuchos::rcp
356  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
357  *pointImporter_ = Teuchos::rcp
358  (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
359  }
360  {
361  auto local_graph_h = graph.getLocalGraphHost ();
362  auto ptr_h = local_graph_h.row_map;
363  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
364  // DEEP_COPY REVIEW - HOST-TO-HOST
365  Kokkos::deep_copy(ptrHost_, ptr_h);
366 
367  auto ind_h = local_graph_h.entries;
368  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
369  // DEEP_COPY REVIEW - HOST-TO-HOST
370  Kokkos::deep_copy (indHost_, ind_h);
371 
372  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
373  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
374 
375  }
376  }
377 
378  template<class Scalar, class LO, class GO, class Node>
379  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
382  { // Copy constructor of map_type does a shallow copy.
383  // We're only returning by RCP for backwards compatibility.
384  return Teuchos::rcp (new map_type (domainPointMap_));
385  }
386 
387  template<class Scalar, class LO, class GO, class Node>
388  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
390  getRangeMap () const
391  { // Copy constructor of map_type does a shallow copy.
392  // We're only returning by RCP for backwards compatibility.
393  return Teuchos::rcp (new map_type (rangePointMap_));
394  }
395 
396  template<class Scalar, class LO, class GO, class Node>
397  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
399  getRowMap () const
400  {
401  return graph_.getRowMap();
402  }
403 
404  template<class Scalar, class LO, class GO, class Node>
405  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
407  getColMap () const
408  {
409  return graph_.getColMap();
410  }
411 
412  template<class Scalar, class LO, class GO, class Node>
416  {
417  return graph_.getGlobalNumRows();
418  }
419 
420  template<class Scalar, class LO, class GO, class Node>
421  size_t
424  {
425  return graph_.getLocalNumRows();
426  }
427 
428  template<class Scalar, class LO, class GO, class Node>
429  size_t
432  {
433  return graph_.getLocalMaxNumRowEntries();
434  }
435 
436  template<class Scalar, class LO, class GO, class Node>
437  void
439  apply (const mv_type& X,
440  mv_type& Y,
441  Teuchos::ETransp mode,
442  Scalar alpha,
443  Scalar beta) const
444  {
445  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
446  TEUCHOS_TEST_FOR_EXCEPTION(
447  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
448  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
449  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
450  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
451 
452  TEUCHOS_TEST_FOR_EXCEPTION(
453  !X.isConstantStride() || !Y.isConstantStride(),
454  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
455  "X and Y must both be constant stride");
456 
457  BMV X_view;
458  BMV Y_view;
459  const LO blockSize = getBlockSize ();
460  try {
461  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
462  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
463  }
464  catch (std::invalid_argument& e) {
465  TEUCHOS_TEST_FOR_EXCEPTION(
466  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
467  "apply: Either the input MultiVector X or the output MultiVector Y "
468  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
469  "graph. BlockMultiVector's constructor threw the following exception: "
470  << e.what ());
471  }
472 
473  try {
474  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
475  // either that or mark fields of this class as 'mutable'. The
476  // problem is that applyBlock wants to do lazy initialization of
477  // temporary block multivectors.
478  const_cast<this_BCRS_type&> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
479  } catch (std::invalid_argument& e) {
480  TEUCHOS_TEST_FOR_EXCEPTION(
481  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
482  "apply: The implementation method applyBlock complained about having "
483  "an invalid argument. It reported the following: " << e.what ());
484  } catch (std::logic_error& e) {
485  TEUCHOS_TEST_FOR_EXCEPTION(
486  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
487  "apply: The implementation method applyBlock complained about a "
488  "possible bug in its implementation. It reported the following: "
489  << e.what ());
490  } catch (std::exception& e) {
491  TEUCHOS_TEST_FOR_EXCEPTION(
492  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
493  "apply: The implementation method applyBlock threw an exception which "
494  "is neither std::invalid_argument nor std::logic_error, but is a "
495  "subclass of std::exception. It reported the following: "
496  << e.what ());
497  } catch (...) {
498  TEUCHOS_TEST_FOR_EXCEPTION(
499  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
500  "apply: The implementation method applyBlock threw an exception which "
501  "is not an instance of a subclass of std::exception. This probably "
502  "indicates a bug. Please report this to the Tpetra developers.");
503  }
504  }
505 
506  template<class Scalar, class LO, class GO, class Node>
507  void
511  Teuchos::ETransp mode,
512  const Scalar alpha,
513  const Scalar beta)
514  {
515  TEUCHOS_TEST_FOR_EXCEPTION(
516  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
517  "Tpetra::BlockCrsMatrix::applyBlock: "
518  "X and Y have different block sizes. X.getBlockSize() = "
519  << X.getBlockSize () << " != Y.getBlockSize() = "
520  << Y.getBlockSize () << ".");
521 
522  if (mode == Teuchos::NO_TRANS) {
523  applyBlockNoTrans (X, Y, alpha, beta);
524  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
525  applyBlockTrans (X, Y, mode, alpha, beta);
526  } else {
527  TEUCHOS_TEST_FOR_EXCEPTION(
528  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
529  "applyBlock: Invalid 'mode' argument. Valid values are "
530  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
531  }
532  }
533 
534  template<class Scalar, class LO, class GO, class Node>
535  void
538  const Import<LO, GO, Node>& importer) const
539  {
540  using Teuchos::RCP;
541  using Teuchos::rcp;
542  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
543 
544  // Right now, we make many assumptions...
545  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
546  "destMatrix is required to be null.");
547 
548  // BlockCrsMatrix requires a complete graph at construction.
549  // So first step is to import and fill complete the destGraph.
550  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
551  RCP<crs_graph_type> destGraph = importAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, importer,
552  srcGraph->getDomainMap(),
553  srcGraph->getRangeMap());
554 
555 
556  // Final step, create and import the destMatrix.
557  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
558  destMatrix->doImport(*this, importer, Tpetra::INSERT);
559  }
560 
561 
562  template<class Scalar, class LO, class GO, class Node>
563  void
566  const Export<LO, GO, Node>& exporter) const
567  {
568  using Teuchos::RCP;
569  using Teuchos::rcp;
570  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
571 
572  // Right now, we make many assumptions...
573  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
574  "destMatrix is required to be null.");
575 
576  // BlockCrsMatrix requires a complete graph at construction.
577  // So first step is to import and fill complete the destGraph.
578  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
579  RCP<crs_graph_type> destGraph = exportAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, exporter,
580  srcGraph->getDomainMap(),
581  srcGraph->getRangeMap());
582 
583 
584  // Final step, create and import the destMatrix.
585  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
586  destMatrix->doExport(*this, exporter, Tpetra::INSERT);
587  }
588 
589 
590  template<class Scalar, class LO, class GO, class Node>
591  void
593  setAllToScalar (const Scalar& alpha)
594  {
595  auto val_d = val_.getDeviceView(Access::OverwriteAll);
596  Kokkos::deep_copy(val_d, alpha);
597  }
598 
599  template<class Scalar, class LO, class GO, class Node>
600  LO
602  replaceLocalValues (const LO localRowInd,
603  const LO colInds[],
604  const Scalar vals[],
605  const LO numColInds) const
606  {
607  std::vector<ptrdiff_t> offsets(numColInds);
608  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
609  const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
610  return validCount;
611  }
612 
613  template <class Scalar, class LO, class GO, class Node>
614  void
616  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
617  Kokkos::MemoryUnmanaged>& offsets) const
618  {
619  graph_.getLocalDiagOffsets (offsets);
620  }
621 
622  template <class Scalar, class LO, class GO, class Node>
623  void
625  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
626  Kokkos::MemoryUnmanaged>& diag,
627  const Kokkos::View<const size_t*, device_type,
628  Kokkos::MemoryUnmanaged>& offsets) const
629  {
630  using Kokkos::parallel_for;
631  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
632 
633  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getLocalNumElements ());
634  const LO blockSize = this->getBlockSize ();
635  TEUCHOS_TEST_FOR_EXCEPTION
636  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
637  static_cast<LO> (diag.extent (1)) < blockSize ||
638  static_cast<LO> (diag.extent (2)) < blockSize,
639  std::invalid_argument, prefix <<
640  "The input Kokkos::View is not big enough to hold all the data.");
641  TEUCHOS_TEST_FOR_EXCEPTION
642  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
643  prefix << "offsets.size() = " << offsets.size () << " < local number of "
644  "diagonal blocks " << lclNumMeshRows << ".");
645 
646  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
647  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
648 
649  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
650  // we reserve the right to do lazy allocation of device data. (We
651  // don't plan to do lazy allocation for host data; the host
652  // version of the data always exists.)
653  auto val_d = val_.getDeviceView(Access::ReadOnly);
654  parallel_for (policy_type (0, lclNumMeshRows),
655  functor_type (diag, val_d, offsets,
656  graph_.getLocalGraphDevice ().row_map, blockSize_));
657  }
658 
659  template<class Scalar, class LO, class GO, class Node>
660  LO
662  absMaxLocalValues (const LO localRowInd,
663  const LO colInds[],
664  const Scalar vals[],
665  const LO numColInds) const
666  {
667  std::vector<ptrdiff_t> offsets(numColInds);
668  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
669  const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
670  return validCount;
671  }
672 
673 
674  template<class Scalar, class LO, class GO, class Node>
675  LO
677  sumIntoLocalValues (const LO localRowInd,
678  const LO colInds[],
679  const Scalar vals[],
680  const LO numColInds) const
681  {
682  std::vector<ptrdiff_t> offsets(numColInds);
683  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
684  const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
685  return validCount;
686  }
687  template<class Scalar, class LO, class GO, class Node>
688  void
690  getLocalRowCopy (LO LocalRow,
691  nonconst_local_inds_host_view_type &Indices,
692  nonconst_values_host_view_type &Values,
693  size_t& NumEntries) const
694  {
695  auto vals = getValuesHost(LocalRow);
696  const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
697  if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
698  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
699  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
700  << numInds << " row entries");
701  }
702  const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
703  for (LO i=0; i<numInds; ++i) {
704  Indices[i] = colInds[i];
705  }
706  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
707  Values[i] = vals[i];
708  }
709  NumEntries = numInds;
710  }
711 
712  template<class Scalar, class LO, class GO, class Node>
713  LO
715  getLocalRowOffsets (const LO localRowInd,
716  ptrdiff_t offsets[],
717  const LO colInds[],
718  const LO numColInds) const
719  {
720  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
721  // We got no offsets, because the input local row index is
722  // invalid on the calling process. That may not be an error, if
723  // numColInds is zero anyway; it doesn't matter. This is the
724  // advantage of returning the number of valid indices.
725  return static_cast<LO> (0);
726  }
727 
728  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
729  LO hint = 0; // Guess for the relative offset into the current row
730  LO validCount = 0; // number of valid column indices in colInds
731 
732  for (LO k = 0; k < numColInds; ++k) {
733  const LO relBlockOffset =
734  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
735  if (relBlockOffset != LINV) {
736  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
737  hint = relBlockOffset + 1;
738  ++validCount;
739  }
740  }
741  return validCount;
742  }
743 
744 
745  template<class Scalar, class LO, class GO, class Node>
746  LO
748  replaceLocalValuesByOffsets (const LO localRowInd,
749  const ptrdiff_t offsets[],
750  const Scalar vals[],
751  const LO numOffsets) const
752  {
753  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
754  // We modified no values, because the input local row index is
755  // invalid on the calling process. That may not be an error, if
756  // numColInds is zero anyway; it doesn't matter. This is the
757  // advantage of returning the number of valid indices.
758  return static_cast<LO> (0);
759  }
760  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
761  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
762  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
763  impl_scalar_type* vOut = val_out.data();
764 
765  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
766  const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
767  size_t pointOffset = 0; // Current offset into input values
768  LO validCount = 0; // number of valid offsets
769 
770  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
771  const size_t blockOffset = offsets[k]*perBlockSize;
772  if (offsets[k] != STINV) {
773  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
774  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
775  COPY (A_new, A_old);
776  ++validCount;
777  }
778  }
779  return validCount;
780  }
781 
782 
783  template<class Scalar, class LO, class GO, class Node>
784  LO
786  absMaxLocalValuesByOffsets (const LO localRowInd,
787  const ptrdiff_t offsets[],
788  const Scalar vals[],
789  const LO numOffsets) const
790  {
791  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
792  // We modified no values, because the input local row index is
793  // invalid on the calling process. That may not be an error, if
794  // numColInds is zero anyway; it doesn't matter. This is the
795  // advantage of returning the number of valid indices.
796  return static_cast<LO> (0);
797  }
798  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
799  auto val_out = getValuesHost(localRowInd);
800  impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
801 
802  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
803  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
804  size_t pointOffset = 0; // Current offset into input values
805  LO validCount = 0; // number of valid offsets
806 
807  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
808  const size_t blockOffset = offsets[k]*perBlockSize;
809  if (blockOffset != STINV) {
810  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
811  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
812  ::Tpetra::Impl::absMax (A_old, A_new);
813  ++validCount;
814  }
815  }
816  return validCount;
817  }
818 
819 
820  template<class Scalar, class LO, class GO, class Node>
821  LO
823  sumIntoLocalValuesByOffsets (const LO localRowInd,
824  const ptrdiff_t offsets[],
825  const Scalar vals[],
826  const LO numOffsets) const
827  {
828  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
829  // We modified no values, because the input local row index is
830  // invalid on the calling process. That may not be an error, if
831  // numColInds is zero anyway; it doesn't matter. This is the
832  // advantage of returning the number of valid indices.
833  return static_cast<LO> (0);
834  }
835  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
836  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
837  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
838  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
839  impl_scalar_type* vOut = val_out.data();
840 
841  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
842  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
843  size_t pointOffset = 0; // Current offset into input values
844  LO validCount = 0; // number of valid offsets
845 
846  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
847  const size_t blockOffset = offsets[k]*perBlockSize;
848  if (blockOffset != STINV) {
849  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
850  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
851  AXPY (ONE, A_new, A_old);
852  ++validCount;
853  }
854  }
855  return validCount;
856  }
857 
858  template<class Scalar, class LO, class GO, class Node>
860  impl_scalar_type_dualview::t_host::const_type
862  getValuesHost () const
863  {
864  return val_.getHostView(Access::ReadOnly);
865  }
866 
867  template<class Scalar, class LO, class GO, class Node>
868  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
869  impl_scalar_type_dualview::t_dev::const_type
870  BlockCrsMatrix<Scalar, LO, GO, Node>::
871  getValuesDevice () const
872  {
873  return val_.getDeviceView(Access::ReadOnly);
874  }
875 
876  template<class Scalar, class LO, class GO, class Node>
877  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
878  impl_scalar_type_dualview::t_host
881  {
882  return val_.getHostView(Access::ReadWrite);
883  }
884 
885  template<class Scalar, class LO, class GO, class Node>
887  impl_scalar_type_dualview::t_dev
889  getValuesDeviceNonConst () const
890  {
891  return val_.getDeviceView(Access::ReadWrite);
892  }
893 
894  template<class Scalar, class LO, class GO, class Node>
895  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
896  impl_scalar_type_dualview::t_host::const_type
897  BlockCrsMatrix<Scalar, LO, GO, Node>::
898  getValuesHost (const LO& lclRow) const
899  {
900  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
901  auto val = val_.getHostView(Access::ReadOnly);
902  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
903  return r_val;
904  }
905 
906  template<class Scalar, class LO, class GO, class Node>
908  impl_scalar_type_dualview::t_dev::const_type
910  getValuesDevice (const LO& lclRow) const
911  {
912  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
913  auto val = val_.getDeviceView(Access::ReadOnly);
914  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
915  return r_val;
916  }
917 
918  template<class Scalar, class LO, class GO, class Node>
921  getValuesHostNonConst (const LO& lclRow)
922  {
923  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
924  auto val = val_.getHostView(Access::ReadWrite);
925  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
926  return r_val;
927  }
928 
929  template<class Scalar, class LO, class GO, class Node>
932  getValuesDeviceNonConst (const LO& lclRow)
933  {
934  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
935  auto val = val_.getDeviceView(Access::ReadWrite);
936  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
937  return r_val;
938  }
939 
940  template<class Scalar, class LO, class GO, class Node>
941  size_t
943  getNumEntriesInLocalRow (const LO localRowInd) const
944  {
945  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
946  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
947  return static_cast<LO> (0); // the calling process doesn't have that row
948  } else {
949  return static_cast<LO> (numEntInGraph);
950  }
951  }
952 
953  template<class Scalar, class LO, class GO, class Node>
954  typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
957  {
958  auto numCols = this->graph_.getColMap()->getLocalNumElements();
959  auto val = val_.getDeviceView(Access::ReadWrite);
960  const LO blockSize = this->getBlockSize ();
961  const auto graph = this->graph_.getLocalGraphDevice ();
962 
963  return local_matrix_device_type("Tpetra::BlockCrsMatrix::lclMatrixDevice",
964  numCols,
965  val,
966  graph,
967  blockSize);
968  }
969 
970  template<class Scalar, class LO, class GO, class Node>
971  void
975  const Teuchos::ETransp mode,
976  const Scalar alpha,
977  const Scalar beta)
978  {
979  (void) X;
980  (void) Y;
981  (void) mode;
982  (void) alpha;
983  (void) beta;
984 
985  TEUCHOS_TEST_FOR_EXCEPTION(
986  true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
987  "transpose and conjugate transpose modes are not yet implemented.");
988  }
989 
990  template<class Scalar, class LO, class GO, class Node>
991  void
992  BlockCrsMatrix<Scalar, LO, GO, Node>::
993  applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
994  BlockMultiVector<Scalar, LO, GO, Node>& Y,
995  const Scalar alpha,
996  const Scalar beta)
997  {
998  using Teuchos::RCP;
999  using Teuchos::rcp;
1000  typedef ::Tpetra::Import<LO, GO, Node> import_type;
1001  typedef ::Tpetra::Export<LO, GO, Node> export_type;
1002  const Scalar zero = STS::zero ();
1003  const Scalar one = STS::one ();
1004  RCP<const import_type> import = graph_.getImporter ();
1005  // "export" is a reserved C++ keyword, so we can't use it.
1006  RCP<const export_type> theExport = graph_.getExporter ();
1007  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1008 
1009  if (alpha == zero) {
1010  if (beta == zero) {
1011  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1012  }
1013  else if (beta != one) {
1014  Y.scale (beta);
1015  }
1016  }
1017  else { // alpha != 0
1018  const BMV* X_colMap = NULL;
1019  if (import.is_null ()) {
1020  try {
1021  X_colMap = &X;
1022  }
1023  catch (std::exception& e) {
1024  TEUCHOS_TEST_FOR_EXCEPTION
1025  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1026  "operator= threw an exception: " << std::endl << e.what ());
1027  }
1028  }
1029  else {
1030  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1031  // Y_rowMap_ below. This lets us do lazy initialization
1032  // correctly with view semantics of BlockCrsMatrix. All views
1033  // of this BlockCrsMatrix have the same outer pointer. That
1034  // way, we can set the inner pointer in one view, and all
1035  // other views will see it.
1036  if ((*X_colMap_).is_null () ||
1037  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1038  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1039  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1040  static_cast<LO> (X.getNumVectors ())));
1041  }
1042  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1043  **pointImporter_,
1045  try {
1046  X_colMap = &(**X_colMap_);
1047  }
1048  catch (std::exception& e) {
1049  TEUCHOS_TEST_FOR_EXCEPTION
1050  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1051  "operator= threw an exception: " << std::endl << e.what ());
1052  }
1053  }
1054 
1055  BMV* Y_rowMap = NULL;
1056  if (theExport.is_null ()) {
1057  Y_rowMap = &Y;
1058  }
1059  else if ((*Y_rowMap_).is_null () ||
1060  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1061  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1062  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1063  static_cast<LO> (X.getNumVectors ())));
1064  try {
1065  Y_rowMap = &(**Y_rowMap_);
1066  }
1067  catch (std::exception& e) {
1068  TEUCHOS_TEST_FOR_EXCEPTION(
1069  true, std::logic_error, prefix << "Tpetra::MultiVector::"
1070  "operator= threw an exception: " << std::endl << e.what ());
1071  }
1072  }
1073 
1074  try {
1075  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1076  }
1077  catch (std::exception& e) {
1078  TEUCHOS_TEST_FOR_EXCEPTION
1079  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1080  "an exception: " << e.what ());
1081  }
1082  catch (...) {
1083  TEUCHOS_TEST_FOR_EXCEPTION
1084  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1085  "an exception not a subclass of std::exception.");
1086  }
1087 
1088  if (! theExport.is_null ()) {
1089  Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1090  }
1091  }
1092  }
1093 
1094 // clang-format on
1095 template <class Scalar, class LO, class GO, class Node>
1096 void BlockCrsMatrix<Scalar, LO, GO, Node>::localApplyBlockNoTrans(
1097  const BlockMultiVector<Scalar, LO, GO, Node> &X,
1098  BlockMultiVector<Scalar, LO, GO, Node> &Y, const Scalar alpha,
1099  const Scalar beta) {
1100  ::Tpetra::Details::ProfilingRegion profile_region(
1101  "Tpetra::BlockCrsMatrix::localApplyBlockNoTrans");
1102  const impl_scalar_type alpha_impl = alpha;
1103  const auto graph = this->graph_.getLocalGraphDevice();
1104 
1105  mv_type X_mv = X.getMultiVectorView();
1106  mv_type Y_mv = Y.getMultiVectorView();
1107  auto X_lcl = X_mv.getLocalViewDevice(Access::ReadOnly);
1108  auto Y_lcl = Y_mv.getLocalViewDevice(Access::ReadWrite);
1109 
1110 #if KOKKOSKERNELS_VERSION >= 40299
1111  auto A_lcl = getLocalMatrixDevice();
1112  if(!applyHelper.get()) {
1113  // The apply helper does not exist, so create it
1114  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map);
1115  }
1116  if(applyHelper->shouldUseIntRowptrs())
1117  {
1118  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
1119  KokkosSparse::spmv(
1120  &applyHelper->handle_int, KokkosSparse::NoTranspose,
1121  alpha_impl, A_lcl_int_rowptrs, X_lcl, beta, Y_lcl);
1122  }
1123  else
1124  {
1125  KokkosSparse::spmv(
1126  &applyHelper->handle, KokkosSparse::NoTranspose,
1127  alpha_impl, A_lcl, X_lcl, beta, Y_lcl);
1128  }
1129 #else
1130  auto A_lcl = getLocalMatrixDevice();
1131  KokkosSparse::spmv(KokkosSparse::NoTranspose, alpha_impl, A_lcl, X_lcl, beta,
1132  Y_lcl);
1133 #endif
1134 }
1135 // clang-format off
1136 
1137  template<class Scalar, class LO, class GO, class Node>
1138  LO
1139  BlockCrsMatrix<Scalar, LO, GO, Node>::
1140  findRelOffsetOfColumnIndex (const LO localRowIndex,
1141  const LO colIndexToFind,
1142  const LO hint) const
1143  {
1144  const size_t absStartOffset = ptrHost_[localRowIndex];
1145  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1146  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1147  // Amortize pointer arithmetic over the search loop.
1148  const LO* const curInd = indHost_.data () + absStartOffset;
1149 
1150  // If the hint was correct, then the hint is the offset to return.
1151  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1152  // Always return the offset relative to the current row.
1153  return hint;
1154  }
1155 
1156  // The hint was wrong, so we must search for the given column
1157  // index in the column indices for the given row.
1158  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1159 
1160  // We require that the graph have sorted rows. However, binary
1161  // search only pays if the current row is longer than a certain
1162  // amount. We set this to 32, but you might want to tune this.
1163  const LO maxNumEntriesForLinearSearch = 32;
1164  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1165  // Use binary search. It would probably be better for us to
1166  // roll this loop by hand. If we wrote it right, a smart
1167  // compiler could perhaps use conditional loads and avoid
1168  // branches (according to Jed Brown on May 2014).
1169  const LO* beg = curInd;
1170  const LO* end = curInd + numEntriesInRow;
1171  std::pair<const LO*, const LO*> p =
1172  std::equal_range (beg, end, colIndexToFind);
1173  if (p.first != p.second) {
1174  // offset is relative to the current row
1175  relOffset = static_cast<LO> (p.first - beg);
1176  }
1177  }
1178  else { // use linear search
1179  for (LO k = 0; k < numEntriesInRow; ++k) {
1180  if (colIndexToFind == curInd[k]) {
1181  relOffset = k; // offset is relative to the current row
1182  break;
1183  }
1184  }
1185  }
1186 
1187  return relOffset;
1188  }
1189 
1190  template<class Scalar, class LO, class GO, class Node>
1191  LO
1192  BlockCrsMatrix<Scalar, LO, GO, Node>::
1193  offsetPerBlock () const
1194  {
1195  return offsetPerBlock_;
1196  }
1197 
1198  template<class Scalar, class LO, class GO, class Node>
1199  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1200  BlockCrsMatrix<Scalar, LO, GO, Node>::
1201  getConstLocalBlockFromInput (const impl_scalar_type* val,
1202  const size_t pointOffset) const
1203  {
1204  // Row major blocks
1205  const LO rowStride = blockSize_;
1206  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1207  }
1208 
1209  template<class Scalar, class LO, class GO, class Node>
1210  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1211  BlockCrsMatrix<Scalar, LO, GO, Node>::
1212  getNonConstLocalBlockFromInput (impl_scalar_type* val,
1213  const size_t pointOffset) const
1214  {
1215  // Row major blocks
1216  const LO rowStride = blockSize_;
1217  return little_block_type (val + pointOffset, blockSize_, rowStride);
1218  }
1219 
1220  template<class Scalar, class LO, class GO, class Node>
1221  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1222  BlockCrsMatrix<Scalar, LO, GO, Node>::
1223  getNonConstLocalBlockFromInputHost (impl_scalar_type* val,
1224  const size_t pointOffset) const
1225  {
1226  // Row major blocks
1227  const LO rowStride = blockSize_;
1228  const size_t bs2 = blockSize_ * blockSize_;
1229  return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
1230  }
1231 
1232  template<class Scalar, class LO, class GO, class Node>
1233  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1234  BlockCrsMatrix<Scalar, LO, GO, Node>::
1235  getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const
1236  {
1237  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1238 
1239  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1240  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1241  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1242  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1243  auto vals = const_cast<this_BCRS_type&>(*this).getValuesDeviceNonConst();
1244  auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1245  return r_val;
1246  }
1247  else {
1248  return little_block_type ();
1249  }
1250  }
1251 
1252  template<class Scalar, class LO, class GO, class Node>
1253  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1254  BlockCrsMatrix<Scalar, LO, GO, Node>::
1255  getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const
1256  {
1257  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1258 
1259  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1260  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1261  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1262  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1263  auto vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst();
1264  auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
1265  return r_val;
1266  }
1267  else {
1268  return little_block_host_type ();
1269  }
1270  }
1271 
1272 
1273  template<class Scalar, class LO, class GO, class Node>
1274  bool
1275  BlockCrsMatrix<Scalar, LO, GO, Node>::
1276  checkSizes (const ::Tpetra::SrcDistObject& source)
1277  {
1278  using std::endl;
1279  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1280  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1281 
1282  if (src == NULL) {
1283  std::ostream& err = markLocalErrorAndGetStream ();
1284  err << "checkSizes: The source object of the Import or Export "
1285  "must be a BlockCrsMatrix with the same template parameters as the "
1286  "target object." << endl;
1287  }
1288  else {
1289  // Use a string of ifs, not if-elseifs, because we want to know
1290  // all the errors.
1291  if (src->getBlockSize () != this->getBlockSize ()) {
1292  std::ostream& err = markLocalErrorAndGetStream ();
1293  err << "checkSizes: The source and target objects of the Import or "
1294  << "Export must have the same block sizes. The source's block "
1295  << "size = " << src->getBlockSize () << " != the target's block "
1296  << "size = " << this->getBlockSize () << "." << endl;
1297  }
1298  if (! src->graph_.isFillComplete ()) {
1299  std::ostream& err = markLocalErrorAndGetStream ();
1300  err << "checkSizes: The source object of the Import or Export is "
1301  "not fill complete. Both source and target objects must be fill "
1302  "complete." << endl;
1303  }
1304  if (! this->graph_.isFillComplete ()) {
1305  std::ostream& err = markLocalErrorAndGetStream ();
1306  err << "checkSizes: The target object of the Import or Export is "
1307  "not fill complete. Both source and target objects must be fill "
1308  "complete." << endl;
1309  }
1310  if (src->graph_.getColMap ().is_null ()) {
1311  std::ostream& err = markLocalErrorAndGetStream ();
1312  err << "checkSizes: The source object of the Import or Export does "
1313  "not have a column Map. Both source and target objects must have "
1314  "column Maps." << endl;
1315  }
1316  if (this->graph_.getColMap ().is_null ()) {
1317  std::ostream& err = markLocalErrorAndGetStream ();
1318  err << "checkSizes: The target object of the Import or Export does "
1319  "not have a column Map. Both source and target objects must have "
1320  "column Maps." << endl;
1321  }
1322  }
1323 
1324  return ! (* (this->localError_));
1325  }
1326 
1327  template<class Scalar, class LO, class GO, class Node>
1328  void
1331  (const ::Tpetra::SrcDistObject& source,
1332  const size_t numSameIDs,
1333  const Kokkos::DualView<const local_ordinal_type*,
1334  buffer_device_type>& permuteToLIDs,
1335  const Kokkos::DualView<const local_ordinal_type*,
1336  buffer_device_type>& permuteFromLIDs,
1337  const CombineMode /*CM*/)
1338  {
1339  using ::Tpetra::Details::Behavior;
1341  using ::Tpetra::Details::ProfilingRegion;
1342  using std::endl;
1343  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1344 
1345  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
1346  const bool debug = Behavior::debug();
1347  const bool verbose = Behavior::verbose();
1348 
1349  // Define this function prefix
1350  std::string prefix;
1351  {
1352  std::ostringstream os;
1353  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1354  os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
1355  prefix = os.str();
1356  }
1357 
1358  // check if this already includes a local error
1359  if (* (this->localError_)) {
1360  std::ostream& err = this->markLocalErrorAndGetStream ();
1361  err << prefix
1362  << "The target object of the Import or Export is already in an error state."
1363  << endl;
1364  return;
1365  }
1366 
1367  //
1368  // Verbose input dual view status
1369  //
1370  if (verbose) {
1371  std::ostringstream os;
1372  os << prefix << endl
1373  << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
1374  << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
1375  std::cerr << os.str ();
1376  }
1377 
1381  if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
1382  std::ostream& err = this->markLocalErrorAndGetStream ();
1383  err << prefix
1384  << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
1385  << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1386  << "." << endl;
1387  return;
1388  }
1389  if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
1390  std::ostream& err = this->markLocalErrorAndGetStream ();
1391  err << prefix
1392  << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1393  << endl;
1394  return;
1395  }
1396 
1397  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1398  if (src == NULL) {
1399  std::ostream& err = this->markLocalErrorAndGetStream ();
1400  err << prefix
1401  << "The source (input) object of the Import or "
1402  "Export is either not a BlockCrsMatrix, or does not have the right "
1403  "template parameters. checkSizes() should have caught this. "
1404  "Please report this bug to the Tpetra developers." << endl;
1405  return;
1406  }
1407 
1408  bool lclErr = false;
1409 #ifdef HAVE_TPETRA_DEBUG
1410  std::set<LO> invalidSrcCopyRows;
1411  std::set<LO> invalidDstCopyRows;
1412  std::set<LO> invalidDstCopyCols;
1413  std::set<LO> invalidDstPermuteCols;
1414  std::set<LO> invalidPermuteFromRows;
1415 #endif // HAVE_TPETRA_DEBUG
1416 
1417  // Copy the initial sequence of rows that are the same.
1418  //
1419  // The two graphs might have different column Maps, so we need to
1420  // do this using global column indices. This is purely local, so
1421  // we only need to check for local sameness of the two column
1422  // Maps.
1423 
1424 #ifdef HAVE_TPETRA_DEBUG
1425  const map_type& srcRowMap = * (src->graph_.getRowMap ());
1426 #endif // HAVE_TPETRA_DEBUG
1427  const map_type& dstRowMap = * (this->graph_.getRowMap ());
1428  const map_type& srcColMap = * (src->graph_.getColMap ());
1429  const map_type& dstColMap = * (this->graph_.getColMap ());
1430  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
1431 
1432  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
1433  if (verbose) {
1434  std::ostringstream os;
1435  os << prefix
1436  << "canUseLocalColumnIndices: "
1437  << (canUseLocalColumnIndices ? "true" : "false")
1438  << ", numPermute: " << numPermute
1439  << endl;
1440  std::cerr << os.str ();
1441  }
1442 
1443  const auto permuteToLIDsHost = permuteToLIDs.view_host();
1444  const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1445 
1446  if (canUseLocalColumnIndices) {
1447  // Copy local rows that are the "same" in both source and target.
1448  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1449 #ifdef HAVE_TPETRA_DEBUG
1450  if (! srcRowMap.isNodeLocalElement (localRow)) {
1451  lclErr = true;
1452  invalidSrcCopyRows.insert (localRow);
1453  continue; // skip invalid rows
1454  }
1455 #endif // HAVE_TPETRA_DEBUG
1456 
1457  local_inds_host_view_type lclSrcCols;
1458  values_host_view_type vals;
1459  LO numEntries;
1460  // If this call fails, that means the mesh row local index is
1461  // invalid. That means the Import or Export is invalid somehow.
1462  src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1463  if (numEntries > 0) {
1464  LO err = this->replaceLocalValues (localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1465  if (err != numEntries) {
1466  lclErr = true;
1467  if (! dstRowMap.isNodeLocalElement (localRow)) {
1468 #ifdef HAVE_TPETRA_DEBUG
1469  invalidDstCopyRows.insert (localRow);
1470 #endif // HAVE_TPETRA_DEBUG
1471  }
1472  else {
1473  // Once there's an error, there's no sense in saving
1474  // time, so we check whether the column indices were
1475  // invalid. However, only remember which ones were
1476  // invalid in a debug build, because that might take a
1477  // lot of space.
1478  for (LO k = 0; k < numEntries; ++k) {
1479  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
1480  lclErr = true;
1481 #ifdef HAVE_TPETRA_DEBUG
1482  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1483 #endif // HAVE_TPETRA_DEBUG
1484  }
1485  }
1486  }
1487  }
1488  }
1489  } // for each "same" local row
1490 
1491  // Copy the "permute" local rows.
1492  for (size_t k = 0; k < numPermute; ++k) {
1493  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1494  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1495 
1496  local_inds_host_view_type lclSrcCols;
1497  values_host_view_type vals;
1498  LO numEntries;
1499  src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1500  if (numEntries > 0) {
1501  LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1502  if (err != numEntries) {
1503  lclErr = true;
1504 #ifdef HAVE_TPETRA_DEBUG
1505  for (LO c = 0; c < numEntries; ++c) {
1506  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
1507  invalidDstPermuteCols.insert (lclSrcCols[c]);
1508  }
1509  }
1510 #endif // HAVE_TPETRA_DEBUG
1511  }
1512  }
1513  }
1514  }
1515  else { // must convert column indices to global
1516  // Reserve space to store the destination matrix's local column indices.
1517  const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries ();
1518  Teuchos::Array<LO> lclDstCols (maxNumEnt);
1519 
1520  // Copy local rows that are the "same" in both source and target.
1521  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1522  local_inds_host_view_type lclSrcCols;
1523  values_host_view_type vals;
1524  LO numEntries;
1525 
1526  // If this call fails, that means the mesh row local index is
1527  // invalid. That means the Import or Export is invalid somehow.
1528  try {
1529  src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1530  } catch (std::exception& e) {
1531  if (debug) {
1532  std::ostringstream os;
1533  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1534  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1535  << localRow << ", src->getLocalRowView() threw an exception: "
1536  << e.what ();
1537  std::cerr << os.str ();
1538  }
1539  throw e;
1540  }
1541 
1542  if (numEntries > 0) {
1543  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1544  lclErr = true;
1545  if (debug) {
1546  std::ostringstream os;
1547  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1548  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1549  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
1550  << maxNumEnt << endl;
1551  std::cerr << os.str ();
1552  }
1553  }
1554  else {
1555  // Convert the source matrix's local column indices to the
1556  // destination matrix's local column indices.
1557  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1558  for (LO j = 0; j < numEntries; ++j) {
1559  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1560  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1561  lclErr = true;
1562 #ifdef HAVE_TPETRA_DEBUG
1563  invalidDstCopyCols.insert (lclDstColsView[j]);
1564 #endif // HAVE_TPETRA_DEBUG
1565  }
1566  }
1567  LO err(0);
1568  try {
1569  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1570  } catch (std::exception& e) {
1571  if (debug) {
1572  std::ostringstream os;
1573  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1574  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1575  << localRow << ", this->replaceLocalValues() threw an exception: "
1576  << e.what ();
1577  std::cerr << os.str ();
1578  }
1579  throw e;
1580  }
1581  if (err != numEntries) {
1582  lclErr = true;
1583  if (debug) {
1584  std::ostringstream os;
1585  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1586  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
1587  "localRow " << localRow << ", this->replaceLocalValues "
1588  "returned " << err << " instead of numEntries = "
1589  << numEntries << endl;
1590  std::cerr << os.str ();
1591  }
1592  }
1593  }
1594  }
1595  }
1596 
1597  // Copy the "permute" local rows.
1598  for (size_t k = 0; k < numPermute; ++k) {
1599  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1600  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1601 
1602  local_inds_host_view_type lclSrcCols;
1603  values_host_view_type vals;
1604  LO numEntries;
1605 
1606  try {
1607  src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1608  } catch (std::exception& e) {
1609  if (debug) {
1610  std::ostringstream os;
1611  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1612  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
1613  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
1614  << ", src->getLocalRowView() threw an exception: " << e.what ();
1615  std::cerr << os.str ();
1616  }
1617  throw e;
1618  }
1619 
1620  if (numEntries > 0) {
1621  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1622  lclErr = true;
1623  }
1624  else {
1625  // Convert the source matrix's local column indices to the
1626  // destination matrix's local column indices.
1627  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1628  for (LO j = 0; j < numEntries; ++j) {
1629  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1630  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1631  lclErr = true;
1632 #ifdef HAVE_TPETRA_DEBUG
1633  invalidDstPermuteCols.insert (lclDstColsView[j]);
1634 #endif // HAVE_TPETRA_DEBUG
1635  }
1636  }
1637  LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1638  if (err != numEntries) {
1639  lclErr = true;
1640  }
1641  }
1642  }
1643  }
1644  }
1645 
1646  if (lclErr) {
1647  std::ostream& err = this->markLocalErrorAndGetStream ();
1648 #ifdef HAVE_TPETRA_DEBUG
1649  err << "copyAndPermute: The graph structure of the source of the "
1650  "Import or Export must be a subset of the graph structure of the "
1651  "target. ";
1652  err << "invalidSrcCopyRows = [";
1653  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
1654  it != invalidSrcCopyRows.end (); ++it) {
1655  err << *it;
1656  typename std::set<LO>::const_iterator itp1 = it;
1657  itp1++;
1658  if (itp1 != invalidSrcCopyRows.end ()) {
1659  err << ",";
1660  }
1661  }
1662  err << "], invalidDstCopyRows = [";
1663  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
1664  it != invalidDstCopyRows.end (); ++it) {
1665  err << *it;
1666  typename std::set<LO>::const_iterator itp1 = it;
1667  itp1++;
1668  if (itp1 != invalidDstCopyRows.end ()) {
1669  err << ",";
1670  }
1671  }
1672  err << "], invalidDstCopyCols = [";
1673  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
1674  it != invalidDstCopyCols.end (); ++it) {
1675  err << *it;
1676  typename std::set<LO>::const_iterator itp1 = it;
1677  itp1++;
1678  if (itp1 != invalidDstCopyCols.end ()) {
1679  err << ",";
1680  }
1681  }
1682  err << "], invalidDstPermuteCols = [";
1683  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
1684  it != invalidDstPermuteCols.end (); ++it) {
1685  err << *it;
1686  typename std::set<LO>::const_iterator itp1 = it;
1687  itp1++;
1688  if (itp1 != invalidDstPermuteCols.end ()) {
1689  err << ",";
1690  }
1691  }
1692  err << "], invalidPermuteFromRows = [";
1693  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
1694  it != invalidPermuteFromRows.end (); ++it) {
1695  err << *it;
1696  typename std::set<LO>::const_iterator itp1 = it;
1697  itp1++;
1698  if (itp1 != invalidPermuteFromRows.end ()) {
1699  err << ",";
1700  }
1701  }
1702  err << "]" << endl;
1703 #else
1704  err << "copyAndPermute: The graph structure of the source of the "
1705  "Import or Export must be a subset of the graph structure of the "
1706  "target." << endl;
1707 #endif // HAVE_TPETRA_DEBUG
1708  }
1709 
1710  if (debug) {
1711  std::ostringstream os;
1712  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1713  const bool lclSuccess = ! (* (this->localError_));
1714  os << "*** Proc " << myRank << ": copyAndPermute "
1715  << (lclSuccess ? "succeeded" : "FAILED");
1716  if (lclSuccess) {
1717  os << endl;
1718  } else {
1719  os << ": error messages: " << this->errorMessages (); // comes w/ endl
1720  }
1721  std::cerr << os.str ();
1722  }
1723  }
1724 
1725  namespace { // (anonymous)
1726 
1745  template<class LO, class GO>
1746  size_t
1747  packRowCount (const size_t numEnt,
1748  const size_t numBytesPerValue,
1749  const size_t blkSize)
1750  {
1751  using ::Tpetra::Details::PackTraits;
1752 
1753  if (numEnt == 0) {
1754  // Empty rows always take zero bytes, to ensure sparsity.
1755  return 0;
1756  }
1757  else {
1758  // We store the number of entries as a local index (LO).
1759  LO numEntLO = 0; // packValueCount wants this.
1760  GO gid {};
1761  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1762  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1763  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
1764  return numEntLen + gidsLen + valsLen;
1765  }
1766  }
1767 
1778  template<class ST, class LO, class GO>
1779  size_t
1780  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
1781  const size_t offset,
1782  const size_t numBytes,
1783  const size_t /* numBytesPerValue */)
1784  {
1785  using Kokkos::subview;
1786  using ::Tpetra::Details::PackTraits;
1787 
1788  if (numBytes == 0) {
1789  // Empty rows always take zero bytes, to ensure sparsity.
1790  return static_cast<size_t> (0);
1791  }
1792  else {
1793  LO numEntLO = 0;
1794  const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
1795  TEUCHOS_TEST_FOR_EXCEPTION
1796  (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1797  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
1798  << ".");
1799  const char* const inBuf = imports.data () + offset;
1800  const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
1801  TEUCHOS_TEST_FOR_EXCEPTION
1802  (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1803  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
1804  << ".");
1805  return static_cast<size_t> (numEntLO);
1806  }
1807  }
1808 
1812  template<class ST, class LO, class GO>
1813  size_t
1814  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
1815  const size_t offset,
1816  const size_t numEnt,
1817  const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
1818  const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
1819  const size_t numBytesPerValue,
1820  const size_t blockSize)
1821  {
1822  using Kokkos::subview;
1823  using ::Tpetra::Details::PackTraits;
1824 
1825  if (numEnt == 0) {
1826  // Empty rows always take zero bytes, to ensure sparsity.
1827  return 0;
1828  }
1829  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1830 
1831  const GO gid = 0; // packValueCount wants this
1832  const LO numEntLO = static_cast<size_t> (numEnt);
1833 
1834  const size_t numEntBeg = offset;
1835  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1836  const size_t gidsBeg = numEntBeg + numEntLen;
1837  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1838  const size_t valsBeg = gidsBeg + gidsLen;
1839  const size_t valsLen = numScalarEnt * numBytesPerValue;
1840 
1841  char* const numEntOut = exports.data () + numEntBeg;
1842  char* const gidsOut = exports.data () + gidsBeg;
1843  char* const valsOut = exports.data () + valsBeg;
1844 
1845  size_t numBytesOut = 0;
1846  int errorCode = 0;
1847  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
1848 
1849  {
1850  Kokkos::pair<int, size_t> p;
1851  p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
1852  errorCode += p.first;
1853  numBytesOut += p.second;
1854 
1855  p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
1856  errorCode += p.first;
1857  numBytesOut += p.second;
1858  }
1859 
1860  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1861  TEUCHOS_TEST_FOR_EXCEPTION
1862  (numBytesOut != expectedNumBytes, std::logic_error,
1863  "packRowForBlockCrs: numBytesOut = " << numBytesOut
1864  << " != expectedNumBytes = " << expectedNumBytes << ".");
1865 
1866  TEUCHOS_TEST_FOR_EXCEPTION
1867  (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
1868  "PackTraits::packArray returned a nonzero error code");
1869 
1870  return numBytesOut;
1871  }
1872 
1873  // Return the number of bytes actually read / used.
1874  template<class ST, class LO, class GO>
1875  size_t
1876  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
1877  const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
1878  const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
1879  const size_t offset,
1880  const size_t numBytes,
1881  const size_t numEnt,
1882  const size_t numBytesPerValue,
1883  const size_t blockSize)
1884  {
1885  using ::Tpetra::Details::PackTraits;
1886 
1887  if (numBytes == 0) {
1888  // Rows with zero bytes always have zero entries.
1889  return 0;
1890  }
1891  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1892  TEUCHOS_TEST_FOR_EXCEPTION
1893  (static_cast<size_t> (imports.extent (0)) <= offset,
1894  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1895  << imports.extent (0) << " <= offset = " << offset << ".");
1896  TEUCHOS_TEST_FOR_EXCEPTION
1897  (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
1898  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1899  << imports.extent (0) << " < offset + numBytes = "
1900  << (offset + numBytes) << ".");
1901 
1902  const GO gid = 0; // packValueCount wants this
1903  const LO lid = 0; // packValueCount wants this
1904 
1905  const size_t numEntBeg = offset;
1906  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
1907  const size_t gidsBeg = numEntBeg + numEntLen;
1908  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1909  const size_t valsBeg = gidsBeg + gidsLen;
1910  const size_t valsLen = numScalarEnt * numBytesPerValue;
1911 
1912  const char* const numEntIn = imports.data () + numEntBeg;
1913  const char* const gidsIn = imports.data () + gidsBeg;
1914  const char* const valsIn = imports.data () + valsBeg;
1915 
1916  size_t numBytesOut = 0;
1917  int errorCode = 0;
1918  LO numEntOut;
1919  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
1920  TEUCHOS_TEST_FOR_EXCEPTION
1921  (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
1922  "unpackRowForBlockCrs: Expected number of entries " << numEnt
1923  << " != actual number of entries " << numEntOut << ".");
1924 
1925  {
1926  Kokkos::pair<int, size_t> p;
1927  p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
1928  errorCode += p.first;
1929  numBytesOut += p.second;
1930 
1931  p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
1932  errorCode += p.first;
1933  numBytesOut += p.second;
1934  }
1935 
1936  TEUCHOS_TEST_FOR_EXCEPTION
1937  (numBytesOut != numBytes, std::logic_error,
1938  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1939  << " != numBytes = " << numBytes << ".");
1940 
1941  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1942  TEUCHOS_TEST_FOR_EXCEPTION
1943  (numBytesOut != expectedNumBytes, std::logic_error,
1944  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1945  << " != expectedNumBytes = " << expectedNumBytes << ".");
1946 
1947  TEUCHOS_TEST_FOR_EXCEPTION
1948  (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
1949  "PackTraits::unpackArray returned a nonzero error code");
1950 
1951  return numBytesOut;
1952  }
1953  } // namespace (anonymous)
1954 
1955  template<class Scalar, class LO, class GO, class Node>
1956  void
1959  (const ::Tpetra::SrcDistObject& source,
1960  const Kokkos::DualView<const local_ordinal_type*,
1961  buffer_device_type>& exportLIDs,
1962  Kokkos::DualView<packet_type*,
1963  buffer_device_type>& exports, // output
1964  Kokkos::DualView<size_t*,
1965  buffer_device_type> numPacketsPerLID, // output
1966  size_t& constantNumPackets)
1967  {
1968  using ::Tpetra::Details::Behavior;
1970  using ::Tpetra::Details::ProfilingRegion;
1971  using ::Tpetra::Details::PackTraits;
1972 
1973  typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
1974 
1975  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1976 
1977  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
1978 
1979  const bool debug = Behavior::debug();
1980  const bool verbose = Behavior::verbose();
1981 
1982  // Define this function prefix
1983  std::string prefix;
1984  {
1985  std::ostringstream os;
1986  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1987  os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
1988  prefix = os.str();
1989  }
1990 
1991  // check if this already includes a local error
1992  if (* (this->localError_)) {
1993  std::ostream& err = this->markLocalErrorAndGetStream ();
1994  err << prefix
1995  << "The target object of the Import or Export is already in an error state."
1996  << std::endl;
1997  return;
1998  }
1999 
2000  //
2001  // Verbose input dual view status
2002  //
2003  if (verbose) {
2004  std::ostringstream os;
2005  os << prefix << std::endl
2006  << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
2007  << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
2008  << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
2009  std::cerr << os.str ();
2010  }
2011 
2015  if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2016  std::ostream& err = this->markLocalErrorAndGetStream ();
2017  err << prefix
2018  << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
2019  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2020  << "." << std::endl;
2021  return;
2022  }
2023  if (exportLIDs.need_sync_host ()) {
2024  std::ostream& err = this->markLocalErrorAndGetStream ();
2025  err << prefix << "exportLIDs be sync'd to host." << std::endl;
2026  return;
2027  }
2028 
2029  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
2030  if (src == NULL) {
2031  std::ostream& err = this->markLocalErrorAndGetStream ();
2032  err << prefix
2033  << "The source (input) object of the Import or "
2034  "Export is either not a BlockCrsMatrix, or does not have the right "
2035  "template parameters. checkSizes() should have caught this. "
2036  "Please report this bug to the Tpetra developers." << std::endl;
2037  return;
2038  }
2039 
2040  // Graphs and matrices are allowed to have a variable number of
2041  // entries per row. We could test whether all rows have the same
2042  // number of entries, but DistObject can only use this
2043  // optimization if all rows on _all_ processes have the same
2044  // number of entries. Rather than do the all-reduce necessary to
2045  // test for this unlikely case, we tell DistObject (by setting
2046  // constantNumPackets to zero) to assume that different rows may
2047  // have different numbers of entries.
2048  constantNumPackets = 0;
2049 
2050  // const values
2051  const crs_graph_type& srcGraph = src->graph_;
2052  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2053  const size_t numExportLIDs = exportLIDs.extent (0);
2054  size_t numBytesPerValue(0);
2055  {
2056  auto val_host = val_.getHostView(Access::ReadOnly);
2057  numBytesPerValue =
2058  PackTraits<impl_scalar_type>
2059  ::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2060  }
2061 
2062  // Compute the number of bytes ("packets") per row to pack. While
2063  // we're at it, compute the total # of block entries to send, and
2064  // the max # of block entries in any of the rows we're sending.
2065 
2066  Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2067 
2068  // Graph information is on host; let's do this on host parallel reduce
2069  auto exportLIDsHost = exportLIDs.view_host();
2070  auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2071  numPacketsPerLID.modify_host ();
2072  {
2073  rowReducerStruct = Impl::BlockCrsRowStruct<size_t>();
2074  for (size_t i = 0; i < numExportLIDs; ++i) {
2075  const LO lclRow = exportLIDsHost(i);
2076  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2077  numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2078 
2079  const size_t numBytes =
2080  packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2081  numPacketsPerLIDHost(i) = numBytes;
2082  rowReducerStruct += Impl::BlockCrsRowStruct<size_t>(numEnt, numBytes, numEnt);
2083  }
2084  }
2085 
2086  // Compute the number of bytes ("packets") per row to pack. While
2087  // we're at it, compute the total # of block entries to send, and
2088  // the max # of block entries in any of the rows we're sending.
2089  const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2090  const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2091  const size_t maxRowLength = rowReducerStruct.maxRowLength;
2092 
2093  if (verbose) {
2094  std::ostringstream os;
2095  os << prefix
2096  << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2097  << std::endl;
2098  std::cerr << os.str ();
2099  }
2100 
2101  // We use a "struct of arrays" approach to packing each row's
2102  // entries. All the column indices (as global indices) go first,
2103  // then all their owning process ranks, and then the values.
2104  if(exports.extent(0) != totalNumBytes)
2105  {
2106  const std::string oldLabel = exports.d_view.label ();
2107  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2108  exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2109  }
2110  if (totalNumEntries > 0) {
2111  // Current position (in bytes) in the 'exports' output array.
2112  Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2113  {
2114  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2115  Kokkos::parallel_scan
2116  (policy,
2117  [=](const size_t &i, size_t &update, const bool &final) {
2118  if (final) offset(i) = update;
2119  update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2120  });
2121  }
2122  if (offset(numExportLIDs) != totalNumBytes) {
2123  std::ostream& err = this->markLocalErrorAndGetStream ();
2124  err << prefix
2125  << "At end of method, the final offset (in bytes) "
2126  << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2127  << totalNumBytes << ". "
2128  << "Please report this bug to the Tpetra developers." << std::endl;
2129  return;
2130  }
2131 
2132  // For each block row of the matrix owned by the calling
2133  // process, pack that block row's column indices and values into
2134  // the exports array.
2135 
2136  // Source matrix's column Map. We verified in checkSizes() that
2137  // the column Map exists (is not null).
2138  const map_type& srcColMap = * (srcGraph.getColMap ());
2139 
2140  // Pack the data for each row to send, into the 'exports' buffer.
2141  // exports will be modified on host.
2142  exports.modify_host();
2143  {
2144  typedef Kokkos::TeamPolicy<host_exec> policy_type;
2145  const auto policy =
2146  policy_type(numExportLIDs, 1, 1)
2147  .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2148  // The following parallel_for needs const access to the local values of src.
2149  // (the local graph is also accessed on host, but this does not use WDVs).
2150  getValuesHost();
2152  Kokkos::parallel_for
2153  (policy,
2154  [=](const typename policy_type::member_type &member) {
2155  const size_t i = member.league_rank();
2156  Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2157  gblColInds(member.team_scratch(0), maxRowLength);
2158 
2159  const LO lclRowInd = exportLIDsHost(i);
2160  local_inds_host_view_type lclColInds;
2161  values_host_view_type vals;
2162 
2163  // It's OK to ignore the return value, since if the calling
2164  // process doesn't own that local row, then the number of
2165  // entries in that row on the calling process is zero.
2166  src->getLocalRowView (lclRowInd, lclColInds, vals);
2167  const size_t numEnt = lclColInds.extent(0);
2168 
2169  // Convert column indices from local to global.
2170  for (size_t j = 0; j < numEnt; ++j)
2171  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2172 
2173  // Kyungjoo: additional wrapping scratch view is necessary
2174  // the following function interface need the same execution space
2175  // host scratch space somehow is not considered same as the host_exec
2176  // Copy the row's data into the current spot in the exports array.
2177  const size_t numBytes =
2178  packRowForBlockCrs<impl_scalar_type, LO, GO>
2179  (exports.view_host(),
2180  offset(i),
2181  numEnt,
2182  Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2183  vals,
2184  numBytesPerValue,
2185  blockSize);
2186 
2187  // numBytes should be same as the difference between offsets
2188  if (debug) {
2189  const size_t offsetDiff = offset(i+1) - offset(i);
2190  if (numBytes != offsetDiff) {
2191  std::ostringstream os;
2192  os << prefix
2193  << "numBytes computed from packRowForBlockCrs is different from "
2194  << "precomputed offset values, LID = " << i << std::endl;
2195  std::cerr << os.str ();
2196  }
2197  }
2198  }); // for each LID (of a row) to send
2200  }
2201  } // if totalNumEntries > 0
2202 
2203  if (debug) {
2204  std::ostringstream os;
2205  const bool lclSuccess = ! (* (this->localError_));
2206  os << prefix
2207  << (lclSuccess ? "succeeded" : "FAILED")
2208  << std::endl;
2209  std::cerr << os.str ();
2210  }
2211  }
2212 
2213  template<class Scalar, class LO, class GO, class Node>
2214  void
2217  (const Kokkos::DualView<const local_ordinal_type*,
2218  buffer_device_type>& importLIDs,
2219  Kokkos::DualView<packet_type*,
2220  buffer_device_type> imports,
2221  Kokkos::DualView<size_t*,
2222  buffer_device_type> numPacketsPerLID,
2223  const size_t /* constantNumPackets */,
2224  const CombineMode combineMode)
2225  {
2226  using ::Tpetra::Details::Behavior;
2228  using ::Tpetra::Details::ProfilingRegion;
2229  using ::Tpetra::Details::PackTraits;
2230  using std::endl;
2231  using host_exec =
2232  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2233 
2234  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2235  const bool verbose = Behavior::verbose ();
2236 
2237  // Define this function prefix
2238  std::string prefix;
2239  {
2240  std::ostringstream os;
2241  auto map = this->graph_.getRowMap ();
2242  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
2243  const int myRank = comm.is_null () ? -1 : comm->getRank ();
2244  os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2245  prefix = os.str ();
2246  if (verbose) {
2247  os << "Start" << endl;
2248  std::cerr << os.str ();
2249  }
2250  }
2251 
2252  // check if this already includes a local error
2253  if (* (this->localError_)) {
2254  std::ostream& err = this->markLocalErrorAndGetStream ();
2255  std::ostringstream os;
2256  os << prefix << "{Im/Ex}port target is already in error." << endl;
2257  if (verbose) {
2258  std::cerr << os.str ();
2259  }
2260  err << os.str ();
2261  return;
2262  }
2263 
2267  if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2268  std::ostream& err = this->markLocalErrorAndGetStream ();
2269  std::ostringstream os;
2270  os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
2271  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2272  << "." << endl;
2273  if (verbose) {
2274  std::cerr << os.str ();
2275  }
2276  err << os.str ();
2277  return;
2278  }
2279 
2280  if (combineMode != ADD && combineMode != INSERT &&
2281  combineMode != REPLACE && combineMode != ABSMAX &&
2282  combineMode != ZERO) {
2283  std::ostream& err = this->markLocalErrorAndGetStream ();
2284  std::ostringstream os;
2285  os << prefix
2286  << "Invalid CombineMode value " << combineMode << ". Valid "
2287  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2288  << std::endl;
2289  if (verbose) {
2290  std::cerr << os.str ();
2291  }
2292  err << os.str ();
2293  return;
2294  }
2295 
2296  if (this->graph_.getColMap ().is_null ()) {
2297  std::ostream& err = this->markLocalErrorAndGetStream ();
2298  std::ostringstream os;
2299  os << prefix << "Target matrix's column Map is null." << endl;
2300  if (verbose) {
2301  std::cerr << os.str ();
2302  }
2303  err << os.str ();
2304  return;
2305  }
2306 
2307  // Target matrix's column Map. Use to convert the global column
2308  // indices in the receive buffer to local indices. We verified in
2309  // checkSizes() that the column Map exists (is not null).
2310  const map_type& tgtColMap = * (this->graph_.getColMap ());
2311 
2312  // Const values
2313  const size_t blockSize = this->getBlockSize ();
2314  const size_t numImportLIDs = importLIDs.extent(0);
2315  // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
2316  // default-constructed instance could have a different size than
2317  // other instances. (We assume all nominally constructed
2318  // instances have the same size; that's not the issue here.) This
2319  // could be bad if the calling process has no entries, but other
2320  // processes have entries that they want to send to us.
2321  size_t numBytesPerValue(0);
2322  {
2323  auto val_host = val_.getHostView(Access::ReadOnly);
2324  numBytesPerValue =
2325  PackTraits<impl_scalar_type>::packValueCount
2326  (val_host.extent (0) ? val_host(0) : impl_scalar_type ());
2327  }
2328  const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries ();
2329  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2330 
2331  if (verbose) {
2332  std::ostringstream os;
2333  os << prefix << "combineMode: "
2334  << ::Tpetra::combineModeToString (combineMode)
2335  << ", blockSize: " << blockSize
2336  << ", numImportLIDs: " << numImportLIDs
2337  << ", numBytesPerValue: " << numBytesPerValue
2338  << ", maxRowNumEnt: " << maxRowNumEnt
2339  << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2340  std::cerr << os.str ();
2341  }
2342 
2343  if (combineMode == ZERO || numImportLIDs == 0) {
2344  if (verbose) {
2345  std::ostringstream os;
2346  os << prefix << "Nothing to unpack. Done!" << std::endl;
2347  std::cerr << os.str ();
2348  }
2349  return;
2350  }
2351 
2352  // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
2353  // we can remove this sync.
2354  if (imports.need_sync_host ()) {
2355  imports.sync_host ();
2356  }
2357 
2358  // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
2359  // sync'd numPacketsPerLID to host, since it needs to do that in
2360  // order to post MPI_Irecv messages with the correct lengths. A
2361  // hypothetical device-specific boundary exchange implementation
2362  // could possibly receive data without sync'ing lengths to host,
2363  // but we don't need to design for that nonexistent thing yet.
2364 
2365  if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
2366  importLIDs.need_sync_host ()) {
2367  std::ostream& err = this->markLocalErrorAndGetStream ();
2368  std::ostringstream os;
2369  os << prefix << "All input DualViews must be sync'd to host by now. "
2370  << "imports_nc.need_sync_host()="
2371  << (imports.need_sync_host () ? "true" : "false")
2372  << ", numPacketsPerLID.need_sync_host()="
2373  << (numPacketsPerLID.need_sync_host () ? "true" : "false")
2374  << ", importLIDs.need_sync_host()="
2375  << (importLIDs.need_sync_host () ? "true" : "false")
2376  << "." << endl;
2377  if (verbose) {
2378  std::cerr << os.str ();
2379  }
2380  err << os.str ();
2381  return;
2382  }
2383 
2384  const auto importLIDsHost = importLIDs.view_host ();
2385  const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
2386 
2387  // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
2388  // loop, by only unpacking on final==true (when we know the
2389  // current offset's value).
2390 
2391  Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
2392  {
2393  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
2394  Kokkos::parallel_scan
2395  ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2396  [=] (const size_t &i, size_t &update, const bool &final) {
2397  if (final) offset(i) = update;
2398  update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2399  });
2400  }
2401 
2402  // this variable does not matter with a race condition (just error flag)
2403  //
2404  // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
2405  // or atomics on bool, so we use int instead. (I know we're not
2406  // launching a CUDA loop here, but we could in the future, and I'd
2407  // like to avoid potential trouble.)
2408  Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2409  errorDuringUnpack ("errorDuringUnpack");
2410  errorDuringUnpack () = 0;
2411  {
2412  using policy_type = Kokkos::TeamPolicy<host_exec>;
2413  size_t scratch_per_row = sizeof(GO) * maxRowNumEnt + sizeof (LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt
2414  + 2 * sizeof(GO); // Yeah, this is a fudge factor
2415 
2416  const auto policy = policy_type (numImportLIDs, 1, 1)
2417  .set_scratch_size (0, Kokkos::PerTeam (scratch_per_row));
2418  using host_scratch_space = typename host_exec::scratch_memory_space;
2419 
2420  using pair_type = Kokkos::pair<size_t, size_t>;
2421 
2422  //The following parallel_for modifies values on host while unpacking.
2423  getValuesHostNonConst();
2425  Kokkos::parallel_for
2426  ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2427  [=] (const typename policy_type::member_type& member) {
2428  const size_t i = member.league_rank();
2429  Kokkos::View<GO*, host_scratch_space> gblColInds
2430  (member.team_scratch (0), maxRowNumEnt);
2431  Kokkos::View<LO*, host_scratch_space> lclColInds
2432  (member.team_scratch (0), maxRowNumEnt);
2433  Kokkos::View<impl_scalar_type*, host_scratch_space> vals
2434  (member.team_scratch (0), maxRowNumScalarEnt);
2435 
2436 
2437  const size_t offval = offset(i);
2438  const LO lclRow = importLIDsHost(i);
2439  const size_t numBytes = numPacketsPerLIDHost(i);
2440  const size_t numEnt =
2441  unpackRowCount<impl_scalar_type, LO, GO>
2442  (imports.view_host (), offval, numBytes, numBytesPerValue);
2443 
2444  if (numBytes > 0) {
2445  if (numEnt > maxRowNumEnt) {
2446  errorDuringUnpack() = 1;
2447  if (verbose) {
2448  std::ostringstream os;
2449  os << prefix
2450  << "At i = " << i << ", numEnt = " << numEnt
2451  << " > maxRowNumEnt = " << maxRowNumEnt
2452  << std::endl;
2453  std::cerr << os.str ();
2454  }
2455  return;
2456  }
2457  }
2458  const size_t numScalarEnt = numEnt*blockSize*blockSize;
2459  auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2460  auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2461  auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2462 
2463  // Kyungjoo: additional wrapping scratch view is necessary
2464  // the following function interface need the same execution space
2465  // host scratch space somehow is not considered same as the host_exec
2466  size_t numBytesOut = 0;
2467  try {
2468  numBytesOut =
2469  unpackRowForBlockCrs<impl_scalar_type, LO, GO>
2470  (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
2471  Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
2472  imports.view_host(),
2473  offval, numBytes, numEnt,
2474  numBytesPerValue, blockSize);
2475  }
2476  catch (std::exception& e) {
2477  errorDuringUnpack() = 1;
2478  if (verbose) {
2479  std::ostringstream os;
2480  os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
2481  << e.what () << endl;
2482  std::cerr << os.str ();
2483  }
2484  return;
2485  }
2486 
2487  if (numBytes != numBytesOut) {
2488  errorDuringUnpack() = 1;
2489  if (verbose) {
2490  std::ostringstream os;
2491  os << prefix
2492  << "At i = " << i << ", numBytes = " << numBytes
2493  << " != numBytesOut = " << numBytesOut << "."
2494  << std::endl;
2495  std::cerr << os.str();
2496  }
2497  return;
2498  }
2499 
2500  // Convert incoming global indices to local indices.
2501  for (size_t k = 0; k < numEnt; ++k) {
2502  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
2503  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2504  if (verbose) {
2505  std::ostringstream os;
2506  os << prefix
2507  << "At i = " << i << ", GID " << gidsOut(k)
2508  << " is not owned by the calling process."
2509  << std::endl;
2510  std::cerr << os.str();
2511  }
2512  return;
2513  }
2514  }
2515 
2516  // Combine the incoming data with the matrix's current data.
2517  LO numCombd = 0;
2518  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
2519  const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
2520  (const_cast<const impl_scalar_type*> (valsOut.data ()));
2521  if (combineMode == ADD) {
2522  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2523  } else if (combineMode == INSERT || combineMode == REPLACE) {
2524  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2525  } else if (combineMode == ABSMAX) {
2526  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2527  }
2528 
2529  if (static_cast<LO> (numEnt) != numCombd) {
2530  errorDuringUnpack() = 1;
2531  if (verbose) {
2532  std::ostringstream os;
2533  os << prefix << "At i = " << i << ", numEnt = " << numEnt
2534  << " != numCombd = " << numCombd << "."
2535  << endl;
2536  std::cerr << os.str ();
2537  }
2538  return;
2539  }
2540  }); // for each import LID i
2542  }
2543 
2544  if (errorDuringUnpack () != 0) {
2545  std::ostream& err = this->markLocalErrorAndGetStream ();
2546  err << prefix << "Unpacking failed.";
2547  if (! verbose) {
2548  err << " Please run again with the environment variable setting "
2549  "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
2550  }
2551  err << endl;
2552  }
2553 
2554  if (verbose) {
2555  std::ostringstream os;
2556  const bool lclSuccess = ! (* (this->localError_));
2557  os << prefix
2558  << (lclSuccess ? "succeeded" : "FAILED")
2559  << std::endl;
2560  std::cerr << os.str ();
2561  }
2562  }
2563 
2564  template<class Scalar, class LO, class GO, class Node>
2565  std::string
2567  {
2568  using Teuchos::TypeNameTraits;
2569  std::ostringstream os;
2570  os << "\"Tpetra::BlockCrsMatrix\": { "
2571  << "Template parameters: { "
2572  << "Scalar: " << TypeNameTraits<Scalar>::name ()
2573  << "LO: " << TypeNameTraits<LO>::name ()
2574  << "GO: " << TypeNameTraits<GO>::name ()
2575  << "Node: " << TypeNameTraits<Node>::name ()
2576  << " }"
2577  << ", Label: \"" << this->getObjectLabel () << "\""
2578  << ", Global dimensions: ["
2579  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2580  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
2581  << ", Block size: " << getBlockSize ()
2582  << " }";
2583  return os.str ();
2584  }
2585 
2586 
2587  template<class Scalar, class LO, class GO, class Node>
2588  void
2590  describe (Teuchos::FancyOStream& out,
2591  const Teuchos::EVerbosityLevel verbLevel) const
2592  {
2593  using Teuchos::ArrayRCP;
2594  using Teuchos::CommRequest;
2595  using Teuchos::FancyOStream;
2596  using Teuchos::getFancyOStream;
2597  using Teuchos::ireceive;
2598  using Teuchos::isend;
2599  using Teuchos::outArg;
2600  using Teuchos::TypeNameTraits;
2601  using Teuchos::VERB_DEFAULT;
2602  using Teuchos::VERB_NONE;
2603  using Teuchos::VERB_LOW;
2604  using Teuchos::VERB_MEDIUM;
2605  // using Teuchos::VERB_HIGH;
2606  using Teuchos::VERB_EXTREME;
2607  using Teuchos::RCP;
2608  using Teuchos::wait;
2609  using std::endl;
2610 
2611  // Set default verbosity if applicable.
2612  const Teuchos::EVerbosityLevel vl =
2613  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
2614 
2615  if (vl == VERB_NONE) {
2616  return; // print nothing
2617  }
2618 
2619  // describe() always starts with a tab before it prints anything.
2620  Teuchos::OSTab tab0 (out);
2621 
2622  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
2623  Teuchos::OSTab tab1 (out);
2624 
2625  out << "Template parameters:" << endl;
2626  {
2627  Teuchos::OSTab tab2 (out);
2628  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
2629  << "LO: " << TypeNameTraits<LO>::name () << endl
2630  << "GO: " << TypeNameTraits<GO>::name () << endl
2631  << "Node: " << TypeNameTraits<Node>::name () << endl;
2632  }
2633  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
2634  << "Global dimensions: ["
2635  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2636  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
2637 
2638  const LO blockSize = getBlockSize ();
2639  out << "Block size: " << blockSize << endl;
2640 
2641  // constituent objects
2642  if (vl >= VERB_MEDIUM) {
2643  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2644  const int myRank = comm.getRank ();
2645  if (myRank == 0) {
2646  out << "Row Map:" << endl;
2647  }
2648  getRowMap()->describe (out, vl);
2649 
2650  if (! getColMap ().is_null ()) {
2651  if (getColMap() == getRowMap()) {
2652  if (myRank == 0) {
2653  out << "Column Map: same as row Map" << endl;
2654  }
2655  }
2656  else {
2657  if (myRank == 0) {
2658  out << "Column Map:" << endl;
2659  }
2660  getColMap ()->describe (out, vl);
2661  }
2662  }
2663  if (! getDomainMap ().is_null ()) {
2664  if (getDomainMap () == getRowMap ()) {
2665  if (myRank == 0) {
2666  out << "Domain Map: same as row Map" << endl;
2667  }
2668  }
2669  else if (getDomainMap () == getColMap ()) {
2670  if (myRank == 0) {
2671  out << "Domain Map: same as column Map" << endl;
2672  }
2673  }
2674  else {
2675  if (myRank == 0) {
2676  out << "Domain Map:" << endl;
2677  }
2678  getDomainMap ()->describe (out, vl);
2679  }
2680  }
2681  if (! getRangeMap ().is_null ()) {
2682  if (getRangeMap () == getDomainMap ()) {
2683  if (myRank == 0) {
2684  out << "Range Map: same as domain Map" << endl;
2685  }
2686  }
2687  else if (getRangeMap () == getRowMap ()) {
2688  if (myRank == 0) {
2689  out << "Range Map: same as row Map" << endl;
2690  }
2691  }
2692  else {
2693  if (myRank == 0) {
2694  out << "Range Map: " << endl;
2695  }
2696  getRangeMap ()->describe (out, vl);
2697  }
2698  }
2699  }
2700 
2701  if (vl >= VERB_EXTREME) {
2702  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2703  const int myRank = comm.getRank ();
2704  const int numProcs = comm.getSize ();
2705 
2706  // Print the calling process' data to the given output stream.
2707  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
2708  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
2709  FancyOStream& os = *osPtr;
2710  os << "Process " << myRank << ":" << endl;
2711  Teuchos::OSTab tab2 (os);
2712 
2713  const map_type& meshRowMap = * (graph_.getRowMap ());
2714  const map_type& meshColMap = * (graph_.getColMap ());
2715  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
2716  meshLclRow <= meshRowMap.getMaxLocalIndex ();
2717  ++meshLclRow) {
2718  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
2719  os << "Row " << meshGblRow << ": {";
2720 
2721  local_inds_host_view_type lclColInds;
2722  values_host_view_type vals;
2723  LO numInds = 0;
2724  this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
2725 
2726  for (LO k = 0; k < numInds; ++k) {
2727  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
2728 
2729  os << "Col " << gblCol << ": [";
2730  for (LO i = 0; i < blockSize; ++i) {
2731  for (LO j = 0; j < blockSize; ++j) {
2732  os << vals[blockSize*blockSize*k + i*blockSize + j];
2733  if (j + 1 < blockSize) {
2734  os << ", ";
2735  }
2736  }
2737  if (i + 1 < blockSize) {
2738  os << "; ";
2739  }
2740  }
2741  os << "]";
2742  if (k + 1 < numInds) {
2743  os << ", ";
2744  }
2745  }
2746  os << "}" << endl;
2747  }
2748 
2749  // Print data on Process 0. This will automatically respect the
2750  // current indentation level.
2751  if (myRank == 0) {
2752  out << lclOutStrPtr->str ();
2753  lclOutStrPtr = Teuchos::null; // clear it to save space
2754  }
2755 
2756  const int sizeTag = 1337;
2757  const int dataTag = 1338;
2758 
2759  ArrayRCP<char> recvDataBuf; // only used on Process 0
2760 
2761  // Send string sizes and data from each process in turn to
2762  // Process 0, and print on that process.
2763  for (int p = 1; p < numProcs; ++p) {
2764  if (myRank == 0) {
2765  // Receive the incoming string's length.
2766  ArrayRCP<size_t> recvSize (1);
2767  recvSize[0] = 0;
2768  RCP<CommRequest<int> > recvSizeReq =
2769  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
2770  wait<int> (comm, outArg (recvSizeReq));
2771  const size_t numCharsToRecv = recvSize[0];
2772 
2773  // Allocate space for the string to receive. Reuse receive
2774  // buffer space if possible. We can do this because in the
2775  // current implementation, we only have one receive in
2776  // flight at a time. Leave space for the '\0' at the end,
2777  // in case the sender doesn't send it.
2778  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
2779  recvDataBuf.resize (numCharsToRecv + 1);
2780  }
2781  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
2782  // Post the receive of the actual string data.
2783  RCP<CommRequest<int> > recvDataReq =
2784  ireceive<int, char> (recvData, p, dataTag, comm);
2785  wait<int> (comm, outArg (recvDataReq));
2786 
2787  // Print the received data. This will respect the current
2788  // indentation level. Make sure that the string is
2789  // null-terminated.
2790  recvDataBuf[numCharsToRecv] = '\0';
2791  out << recvDataBuf.getRawPtr ();
2792  }
2793  else if (myRank == p) { // if I am not Process 0, and my rank is p
2794  // This deep-copies the string at most twice, depending on
2795  // whether std::string reference counts internally (it
2796  // generally does, so this won't deep-copy at all).
2797  const std::string stringToSend = lclOutStrPtr->str ();
2798  lclOutStrPtr = Teuchos::null; // clear original to save space
2799 
2800  // Send the string's length to Process 0.
2801  const size_t numCharsToSend = stringToSend.size ();
2802  ArrayRCP<size_t> sendSize (1);
2803  sendSize[0] = numCharsToSend;
2804  RCP<CommRequest<int> > sendSizeReq =
2805  isend<int, size_t> (sendSize, 0, sizeTag, comm);
2806  wait<int> (comm, outArg (sendSizeReq));
2807 
2808  // Send the actual string to Process 0. We know that the
2809  // string has length > 0, so it's save to take the address
2810  // of the first entry. Make a nonowning ArrayRCP to hold
2811  // the string. Process 0 will add a null termination
2812  // character at the end of the string, after it receives the
2813  // message.
2814  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
2815  RCP<CommRequest<int> > sendDataReq =
2816  isend<int, char> (sendData, 0, dataTag, comm);
2817  wait<int> (comm, outArg (sendDataReq));
2818  }
2819  } // for each process rank p other than 0
2820  } // extreme verbosity level (print the whole matrix)
2821  }
2822 
2823  template<class Scalar, class LO, class GO, class Node>
2824  Teuchos::RCP<const Teuchos::Comm<int> >
2826  getComm() const
2827  {
2828  return graph_.getComm();
2829  }
2830 
2831 
2832  template<class Scalar, class LO, class GO, class Node>
2836  {
2837  return graph_.getGlobalNumCols();
2838  }
2839 
2840 
2841  template<class Scalar, class LO, class GO, class Node>
2842  size_t
2845  {
2846  return graph_.getLocalNumCols();
2847  }
2848 
2849  template<class Scalar, class LO, class GO, class Node>
2850  GO
2853  {
2854  return graph_.getIndexBase();
2855  }
2856 
2857  template<class Scalar, class LO, class GO, class Node>
2861  {
2862  return graph_.getGlobalNumEntries();
2863  }
2864 
2865  template<class Scalar, class LO, class GO, class Node>
2866  size_t
2869  {
2870  return graph_.getLocalNumEntries();
2871  }
2872 
2873  template<class Scalar, class LO, class GO, class Node>
2874  size_t
2876  getNumEntriesInGlobalRow (GO globalRow) const
2877  {
2878  return graph_.getNumEntriesInGlobalRow(globalRow);
2879  }
2880 
2881 
2882  template<class Scalar, class LO, class GO, class Node>
2883  size_t
2886  {
2887  return graph_.getGlobalMaxNumRowEntries();
2888  }
2889 
2890  template<class Scalar, class LO, class GO, class Node>
2891  bool
2893  hasColMap() const
2894  {
2895  return graph_.hasColMap();
2896  }
2897 
2898 
2899  template<class Scalar, class LO, class GO, class Node>
2900  bool
2903  {
2904  return graph_.isLocallyIndexed();
2905  }
2906 
2907  template<class Scalar, class LO, class GO, class Node>
2908  bool
2911  {
2912  return graph_.isGloballyIndexed();
2913  }
2914 
2915  template<class Scalar, class LO, class GO, class Node>
2916  bool
2919  {
2920  return graph_.isFillComplete ();
2921  }
2922 
2923  template<class Scalar, class LO, class GO, class Node>
2924  bool
2927  {
2928  return false;
2929  }
2930 
2931  template<class Scalar, class LO, class GO, class Node>
2932  void
2934  getGlobalRowCopy (GO /*GlobalRow*/,
2935  nonconst_global_inds_host_view_type &/*Indices*/,
2936  nonconst_values_host_view_type &/*Values*/,
2937  size_t& /*NumEntries*/) const
2938  {
2939  TEUCHOS_TEST_FOR_EXCEPTION(
2940  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
2941  "This class doesn't support global matrix indexing.");
2942 
2943  }
2944 
2945  template<class Scalar, class LO, class GO, class Node>
2946  void
2948  getGlobalRowView (GO /* GlobalRow */,
2949  global_inds_host_view_type &/* indices */,
2950  values_host_view_type &/* values */) const
2951  {
2952  TEUCHOS_TEST_FOR_EXCEPTION(
2953  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
2954  "This class doesn't support global matrix indexing.");
2955 
2956  }
2957 
2958  template<class Scalar, class LO, class GO, class Node>
2959  void
2961  getLocalRowView (LO localRowInd,
2962  local_inds_host_view_type &colInds,
2963  values_host_view_type &vals) const
2964  {
2965  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2966  colInds = local_inds_host_view_type();
2967  vals = values_host_view_type();
2968  }
2969  else {
2970  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2971  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2972  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
2973 
2974  vals = getValuesHost (localRowInd);
2975  }
2976  }
2977 
2978  template<class Scalar, class LO, class GO, class Node>
2979  void
2982  local_inds_host_view_type &colInds,
2983  nonconst_values_host_view_type &vals) const
2984  {
2985  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2986  colInds = nonconst_local_inds_host_view_type();
2987  vals = nonconst_values_host_view_type();
2988  }
2989  else {
2990  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2991  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2992  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
2993 
2994  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
2995  vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
2996  }
2997  }
2998 
2999  template<class Scalar, class LO, class GO, class Node>
3000  void
3003  {
3004  const size_t lclNumMeshRows = graph_.getLocalNumRows ();
3005 
3006  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
3007  graph_.getLocalDiagOffsets (diagOffsets);
3008 
3009  // The code below works on host, so use a host View.
3010  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
3011  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
3012  Kokkos::deep_copy (execution_space(), diagOffsetsHost, diagOffsets);
3013 
3014  auto vals_host_out = val_.getHostView(Access::ReadOnly);
3015  const impl_scalar_type* vals_host_out_raw = vals_host_out.data();
3016 
3017  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3018  size_t rowOffset = 0;
3019  size_t offset = 0;
3020  LO bs = getBlockSize();
3021  for(size_t r=0; r<getLocalNumRows(); r++)
3022  {
3023  // move pointer to start of diagonal block
3024  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3025  for(int b=0; b<bs; b++)
3026  {
3027  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3028  }
3029  // move pointer to start of next block row
3030  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3031  }
3032 
3033  //diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3034  }
3035 
3036  template<class Scalar, class LO, class GO, class Node>
3037  void
3039  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3040  {
3041  TEUCHOS_TEST_FOR_EXCEPTION(
3042  true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3043  "not implemented.");
3044 
3045  }
3046 
3047  template<class Scalar, class LO, class GO, class Node>
3048  void
3050  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3051  {
3052  TEUCHOS_TEST_FOR_EXCEPTION(
3053  true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3054  "not implemented.");
3055 
3056  }
3057 
3058  template<class Scalar, class LO, class GO, class Node>
3059  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3061  getGraph() const
3062  {
3063  return graphRCP_;
3064  }
3065 
3066  template<class Scalar, class LO, class GO, class Node>
3067  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3070  {
3071  TEUCHOS_TEST_FOR_EXCEPTION(
3072  true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3073  "not implemented.");
3074  }
3075 
3076 } // namespace Tpetra
3077 
3078 //
3079 // Explicit instantiation macro
3080 //
3081 // Must be expanded from within the Tpetra namespace!
3082 //
3083 #define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3084  template class BlockCrsMatrix< S, LO, GO, NODE >;
3085 
3086 #endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
std::string description() const override
One-line description of this object.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
LO local_ordinal_type
The type of local indices.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
bool isConstantStride() const
Whether this multivector has constant stride between columns.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
One or more distributed dense vectors.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Linear algebra kernels for small dense matrices and vectors.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
size_t getLocalNumRows() const override
get the local number of block rows
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
give an option to use layoutleft
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
MultiVector for multiple degrees of freedom per mesh point.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, and the small dense vectors in BlockMultiVector and BlockVector.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix&#39;s communicator.
size_t global_size_t
Global size_t object.
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don&#39;t currently exist.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
global_size_t getGlobalNumRows() const override
get the global number of block rows
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
Replace old value with maximum of magnitudes of old and new values.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
Replace existing values with new values.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
void enableWDVTracking()
Enable WrappedDualView reference-count tracking and syncing. Call this after exiting a host-parallel ...
Replace old values with zero.
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
char packet_type
Implementation detail; tells.
local_matrix_device_type getLocalMatrixDevice() const
void disableWDVTracking()
Disable WrappedDualView reference-count tracking and syncing. Call this before entering a host-parall...
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
A distributed dense vector.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row&#39;s entries.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix&#39;s values (val_).
Base class for distributed Tpetra objects that support data redistribution.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row. ...