Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 // clang-format off
11 
12 #ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
13 #define TPETRA_BLOCKCRSMATRIX_DEF_HPP
14 
17 
21 #include "Tpetra_BlockMultiVector.hpp"
22 #include "Tpetra_BlockView.hpp"
23 
24 #include "Teuchos_TimeMonitor.hpp"
25 #ifdef HAVE_TPETRA_DEBUG
26 # include <set>
27 #endif // HAVE_TPETRA_DEBUG
28 
29 #include "KokkosSparse_spmv.hpp"
30 
31 namespace Tpetra {
32 
33 namespace Impl {
34 
35  template<typename T>
36  struct BlockCrsRowStruct {
37  T totalNumEntries, totalNumBytes, maxRowLength;
38  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
39  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
40  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
41  : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
42  };
43 
44  template<typename T>
45  static
46  KOKKOS_INLINE_FUNCTION
47  void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
48  a.totalNumEntries += b.totalNumEntries;
49  a.totalNumBytes += b.totalNumBytes;
50  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
51  }
52 
53  template<typename T, typename ExecSpace>
54  struct BlockCrsReducer {
55  typedef BlockCrsReducer reducer;
56  typedef T value_type;
57  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
58  value_type *value;
59 
60  KOKKOS_INLINE_FUNCTION
61  BlockCrsReducer(value_type &val) : value(&val) {}
62 
63  KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
64  KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const { dst += src; }
65  KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
66  KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
67  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
68  };
69 
70 } // namespace Impl
71 
72 namespace { // (anonymous)
73 
74 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
75 // version that takes two Kokkos::View arguments).
76 template<class Scalar, class LO, class GO, class Node>
77 class GetLocalDiagCopy {
78 public:
79  typedef typename Node::device_type device_type;
80  typedef size_t diag_offset_type;
81  typedef Kokkos::View<const size_t*, device_type,
82  Kokkos::MemoryUnmanaged> diag_offsets_type;
83  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
84  typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
85  typedef typename local_graph_device_type::row_map_type row_offsets_type;
86  typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
87  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
88  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
89 
90  // Constructor
91  GetLocalDiagCopy (const diag_type& diag,
92  const values_type& val,
93  const diag_offsets_type& diagOffsets,
94  const row_offsets_type& ptr,
95  const LO blockSize) :
96  diag_ (diag),
97  diagOffsets_ (diagOffsets),
98  ptr_ (ptr),
99  blockSize_ (blockSize),
100  offsetPerBlock_ (blockSize_*blockSize_),
101  val_(val)
102  {}
103 
104  KOKKOS_INLINE_FUNCTION void
105  operator() (const LO& lclRowInd) const
106  {
107  using Kokkos::ALL;
108 
109  // Get row offset
110  const size_t absOffset = ptr_[lclRowInd];
111 
112  // Get offset relative to start of row
113  const size_t relOffset = diagOffsets_[lclRowInd];
114 
115  // Get the total offset
116  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
117 
118  // Get a view of the block. BCRS currently uses LayoutRight
119  // regardless of the device.
120  typedef Kokkos::View<const IST**, Impl::BlockCrsMatrixLittleBlockArrayLayout,
121  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
122  const_little_block_type;
123  const_little_block_type D_in (val_.data () + pointOffset,
124  blockSize_, blockSize_);
125  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
126  COPY (D_in, D_out);
127  }
128 
129  private:
130  diag_type diag_;
131  diag_offsets_type diagOffsets_;
132  row_offsets_type ptr_;
133  LO blockSize_;
134  LO offsetPerBlock_;
135  values_type val_;
136  };
137 } // namespace (anonymous)
138 
139  template<class Scalar, class LO, class GO, class Node>
140  std::ostream&
141  BlockCrsMatrix<Scalar, LO, GO, Node>::
142  markLocalErrorAndGetStream ()
143  {
144  * (this->localError_) = true;
145  if ((*errs_).is_null ()) {
146  *errs_ = Teuchos::rcp (new std::ostringstream ());
147  }
148  return **errs_;
149  }
150 
151  template<class Scalar, class LO, class GO, class Node>
154  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
155  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
156  blockSize_ (static_cast<LO> (0)),
157  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
158  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
159  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
160  offsetPerBlock_ (0),
161  localError_ (new bool (false)),
162  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
163  {
164  }
165 
166  template<class Scalar, class LO, class GO, class Node>
169  const LO blockSize) :
170  dist_object_type (graph.getMap ()),
171  graph_ (graph),
172  rowMeshMap_ (* (graph.getRowMap ())),
173  blockSize_ (blockSize),
174  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
175  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
176  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
177  offsetPerBlock_ (blockSize * blockSize),
178  localError_ (new bool (false)),
179  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
180  {
181 
183  TEUCHOS_TEST_FOR_EXCEPTION(
184  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
185  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
186  "rows (isSorted() is false). This class assumes sorted rows.");
187 
188  graphRCP_ = Teuchos::rcpFromRef(graph_);
189 
190  // Trick to test whether LO is nonpositive, without a compiler
191  // warning in case LO is unsigned (which is generally a bad idea
192  // anyway). I don't promise that the trick works, but it
193  // generally does with gcc at least, in my experience.
194  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
195  TEUCHOS_TEST_FOR_EXCEPTION(
196  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
197  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
198  " <= 0. The block size must be positive.");
199 
200  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
201  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
202 
203  {
204  // These are rcp
205  const auto domainPointMap = getDomainMap();
206  const auto colPointMap = Teuchos::rcp
207  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
208  *pointImporter_ = Teuchos::rcp
209  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
210  }
211  {
212  auto local_graph_h = graph.getLocalGraphHost ();
213  auto ptr_h = local_graph_h.row_map;
214  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
215  Kokkos::deep_copy(ptrHost_, ptr_h);
216 
217  auto ind_h = local_graph_h.entries;
218  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
219  // DEEP_COPY REVIEW - HOST-TO-HOST
220  Kokkos::deep_copy (indHost_, ind_h);
221 
222  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
223  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
224  }
225  }
226 
227  template<class Scalar, class LO, class GO, class Node>
230  const typename local_matrix_device_type::values_type& values,
231  const LO blockSize) :
232  dist_object_type (graph.getMap ()),
233  graph_ (graph),
234  rowMeshMap_ (* (graph.getRowMap ())),
235  blockSize_ (blockSize),
236  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
237  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
238  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
239  offsetPerBlock_ (blockSize * blockSize),
240  localError_ (new bool (false)),
241  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
242  {
244  TEUCHOS_TEST_FOR_EXCEPTION(
245  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
246  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
247  "rows (isSorted() is false). This class assumes sorted rows.");
248 
249  graphRCP_ = Teuchos::rcpFromRef(graph_);
250 
251  // Trick to test whether LO is nonpositive, without a compiler
252  // warning in case LO is unsigned (which is generally a bad idea
253  // anyway). I don't promise that the trick works, but it
254  // generally does with gcc at least, in my experience.
255  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
256  TEUCHOS_TEST_FOR_EXCEPTION(
257  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
258  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
259  " <= 0. The block size must be positive.");
260 
261  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
262  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
263 
264  {
265  // These are rcp
266  const auto domainPointMap = getDomainMap();
267  const auto colPointMap = Teuchos::rcp
268  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
269  *pointImporter_ = Teuchos::rcp
270  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
271  }
272  {
273  auto local_graph_h = graph.getLocalGraphHost ();
274  auto ptr_h = local_graph_h.row_map;
275  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
276  Kokkos::deep_copy(ptrHost_, ptr_h);
277 
278  auto ind_h = local_graph_h.entries;
279  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
280  Kokkos::deep_copy (indHost_, ind_h);
281 
282  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
283  TEUCHOS_ASSERT_EQUALITY(numValEnt, values.size());
284  val_ = decltype (val_) (values);
285  }
286  }
287 
288  template<class Scalar, class LO, class GO, class Node>
291  const map_type& domainPointMap,
292  const map_type& rangePointMap,
293  const LO blockSize) :
294  dist_object_type (graph.getMap ()),
295  graph_ (graph),
296  rowMeshMap_ (* (graph.getRowMap ())),
297  domainPointMap_ (domainPointMap),
298  rangePointMap_ (rangePointMap),
299  blockSize_ (blockSize),
300  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
301  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
302  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
303  offsetPerBlock_ (blockSize * blockSize),
304  localError_ (new bool (false)),
305  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
306  {
307  TEUCHOS_TEST_FOR_EXCEPTION(
308  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
309  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
310  "rows (isSorted() is false). This class assumes sorted rows.");
311 
312  graphRCP_ = Teuchos::rcpFromRef(graph_);
313 
314  // Trick to test whether LO is nonpositive, without a compiler
315  // warning in case LO is unsigned (which is generally a bad idea
316  // anyway). I don't promise that the trick works, but it
317  // generally does with gcc at least, in my experience.
318  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
319  TEUCHOS_TEST_FOR_EXCEPTION(
320  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
321  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
322  " <= 0. The block size must be positive.");
323  {
324  // These are rcp
325  const auto rcpDomainPointMap = getDomainMap();
326  const auto colPointMap = Teuchos::rcp
327  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
328  *pointImporter_ = Teuchos::rcp
329  (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
330  }
331  {
332  auto local_graph_h = graph.getLocalGraphHost ();
333  auto ptr_h = local_graph_h.row_map;
334  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
335  // DEEP_COPY REVIEW - HOST-TO-HOST
336  Kokkos::deep_copy(ptrHost_, ptr_h);
337 
338  auto ind_h = local_graph_h.entries;
339  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
340  // DEEP_COPY REVIEW - HOST-TO-HOST
341  Kokkos::deep_copy (indHost_, ind_h);
342 
343  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
344  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
345 
346  }
347  }
348 
349  template<class Scalar, class LO, class GO, class Node>
350  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
353  { // Copy constructor of map_type does a shallow copy.
354  // We're only returning by RCP for backwards compatibility.
355  return Teuchos::rcp (new map_type (domainPointMap_));
356  }
357 
358  template<class Scalar, class LO, class GO, class Node>
359  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
361  getRangeMap () const
362  { // Copy constructor of map_type does a shallow copy.
363  // We're only returning by RCP for backwards compatibility.
364  return Teuchos::rcp (new map_type (rangePointMap_));
365  }
366 
367  template<class Scalar, class LO, class GO, class Node>
368  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
370  getRowMap () const
371  {
372  return graph_.getRowMap();
373  }
374 
375  template<class Scalar, class LO, class GO, class Node>
376  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
378  getColMap () const
379  {
380  return graph_.getColMap();
381  }
382 
383  template<class Scalar, class LO, class GO, class Node>
387  {
388  return graph_.getGlobalNumRows();
389  }
390 
391  template<class Scalar, class LO, class GO, class Node>
392  size_t
395  {
396  return graph_.getLocalNumRows();
397  }
398 
399  template<class Scalar, class LO, class GO, class Node>
400  size_t
403  {
404  return graph_.getLocalMaxNumRowEntries();
405  }
406 
407  template<class Scalar, class LO, class GO, class Node>
408  void
410  apply (const mv_type& X,
411  mv_type& Y,
412  Teuchos::ETransp mode,
413  Scalar alpha,
414  Scalar beta) const
415  {
416  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
417  TEUCHOS_TEST_FOR_EXCEPTION(
418  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
419  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
420  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
421  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
422 
423  TEUCHOS_TEST_FOR_EXCEPTION(
424  !X.isConstantStride() || !Y.isConstantStride(),
425  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
426  "X and Y must both be constant stride");
427 
428  BMV X_view;
429  BMV Y_view;
430  const LO blockSize = getBlockSize ();
431  try {
432  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
433  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
434  }
435  catch (std::invalid_argument& e) {
436  TEUCHOS_TEST_FOR_EXCEPTION(
437  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
438  "apply: Either the input MultiVector X or the output MultiVector Y "
439  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
440  "graph. BlockMultiVector's constructor threw the following exception: "
441  << e.what ());
442  }
443 
444  try {
445  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
446  // either that or mark fields of this class as 'mutable'. The
447  // problem is that applyBlock wants to do lazy initialization of
448  // temporary block multivectors.
449  const_cast<this_BCRS_type&> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
450  } catch (std::invalid_argument& e) {
451  TEUCHOS_TEST_FOR_EXCEPTION(
452  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
453  "apply: The implementation method applyBlock complained about having "
454  "an invalid argument. It reported the following: " << e.what ());
455  } catch (std::logic_error& e) {
456  TEUCHOS_TEST_FOR_EXCEPTION(
457  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
458  "apply: The implementation method applyBlock complained about a "
459  "possible bug in its implementation. It reported the following: "
460  << e.what ());
461  } catch (std::exception& e) {
462  TEUCHOS_TEST_FOR_EXCEPTION(
463  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
464  "apply: The implementation method applyBlock threw an exception which "
465  "is neither std::invalid_argument nor std::logic_error, but is a "
466  "subclass of std::exception. It reported the following: "
467  << e.what ());
468  } catch (...) {
469  TEUCHOS_TEST_FOR_EXCEPTION(
470  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
471  "apply: The implementation method applyBlock threw an exception which "
472  "is not an instance of a subclass of std::exception. This probably "
473  "indicates a bug. Please report this to the Tpetra developers.");
474  }
475  }
476 
477  template<class Scalar, class LO, class GO, class Node>
478  void
482  Teuchos::ETransp mode,
483  const Scalar alpha,
484  const Scalar beta)
485  {
486  TEUCHOS_TEST_FOR_EXCEPTION(
487  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
488  "Tpetra::BlockCrsMatrix::applyBlock: "
489  "X and Y have different block sizes. X.getBlockSize() = "
490  << X.getBlockSize () << " != Y.getBlockSize() = "
491  << Y.getBlockSize () << ".");
492 
493  if (mode == Teuchos::NO_TRANS) {
494  applyBlockNoTrans (X, Y, alpha, beta);
495  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
496  applyBlockTrans (X, Y, mode, alpha, beta);
497  } else {
498  TEUCHOS_TEST_FOR_EXCEPTION(
499  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
500  "applyBlock: Invalid 'mode' argument. Valid values are "
501  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
502  }
503  }
504 
505  template<class Scalar, class LO, class GO, class Node>
506  void
509  const Import<LO, GO, Node>& importer) const
510  {
511  using Teuchos::RCP;
512  using Teuchos::rcp;
513  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
514 
515  // Right now, we make many assumptions...
516  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
517  "destMatrix is required to be null.");
518 
519  // BlockCrsMatrix requires a complete graph at construction.
520  // So first step is to import and fill complete the destGraph.
521  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
522  RCP<crs_graph_type> destGraph = importAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, importer,
523  srcGraph->getDomainMap(),
524  srcGraph->getRangeMap());
525 
526 
527  // Final step, create and import the destMatrix.
528  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
529  destMatrix->doImport(*this, importer, Tpetra::INSERT);
530  }
531 
532 
533  template<class Scalar, class LO, class GO, class Node>
534  void
537  const Export<LO, GO, Node>& exporter) const
538  {
539  using Teuchos::RCP;
540  using Teuchos::rcp;
541  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
542 
543  // Right now, we make many assumptions...
544  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
545  "destMatrix is required to be null.");
546 
547  // BlockCrsMatrix requires a complete graph at construction.
548  // So first step is to import and fill complete the destGraph.
549  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
550  RCP<crs_graph_type> destGraph = exportAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, exporter,
551  srcGraph->getDomainMap(),
552  srcGraph->getRangeMap());
553 
554 
555  // Final step, create and import the destMatrix.
556  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
557  destMatrix->doExport(*this, exporter, Tpetra::INSERT);
558  }
559 
560 
561  template<class Scalar, class LO, class GO, class Node>
562  void
564  setAllToScalar (const Scalar& alpha)
565  {
566  auto val_d = val_.getDeviceView(Access::OverwriteAll);
567  Kokkos::deep_copy(val_d, alpha);
568  }
569 
570  template<class Scalar, class LO, class GO, class Node>
571  LO
573  replaceLocalValues (const LO localRowInd,
574  const LO colInds[],
575  const Scalar vals[],
576  const LO numColInds) const
577  {
578  std::vector<ptrdiff_t> offsets(numColInds);
579  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
580  const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
581  return validCount;
582  }
583 
584  template <class Scalar, class LO, class GO, class Node>
585  void
587  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
588  Kokkos::MemoryUnmanaged>& offsets) const
589  {
590  graph_.getLocalDiagOffsets (offsets);
591  }
592 
593  template <class Scalar, class LO, class GO, class Node>
594  void
596  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
597  Kokkos::MemoryUnmanaged>& diag,
598  const Kokkos::View<const size_t*, device_type,
599  Kokkos::MemoryUnmanaged>& offsets) const
600  {
601  using Kokkos::parallel_for;
602  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
603 
604  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getLocalNumElements ());
605  const LO blockSize = this->getBlockSize ();
606  TEUCHOS_TEST_FOR_EXCEPTION
607  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
608  static_cast<LO> (diag.extent (1)) < blockSize ||
609  static_cast<LO> (diag.extent (2)) < blockSize,
610  std::invalid_argument, prefix <<
611  "The input Kokkos::View is not big enough to hold all the data.");
612  TEUCHOS_TEST_FOR_EXCEPTION
613  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
614  prefix << "offsets.size() = " << offsets.size () << " < local number of "
615  "diagonal blocks " << lclNumMeshRows << ".");
616 
617  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
618  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
619 
620  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
621  // we reserve the right to do lazy allocation of device data. (We
622  // don't plan to do lazy allocation for host data; the host
623  // version of the data always exists.)
624  auto val_d = val_.getDeviceView(Access::ReadOnly);
625  parallel_for (policy_type (0, lclNumMeshRows),
626  functor_type (diag, val_d, offsets,
627  graph_.getLocalGraphDevice ().row_map, blockSize_));
628  }
629 
630  template<class Scalar, class LO, class GO, class Node>
631  LO
633  absMaxLocalValues (const LO localRowInd,
634  const LO colInds[],
635  const Scalar vals[],
636  const LO numColInds) const
637  {
638  std::vector<ptrdiff_t> offsets(numColInds);
639  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
640  const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
641  return validCount;
642  }
643 
644 
645  template<class Scalar, class LO, class GO, class Node>
646  LO
648  sumIntoLocalValues (const LO localRowInd,
649  const LO colInds[],
650  const Scalar vals[],
651  const LO numColInds) const
652  {
653  std::vector<ptrdiff_t> offsets(numColInds);
654  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
655  const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
656  return validCount;
657  }
658  template<class Scalar, class LO, class GO, class Node>
659  void
661  getLocalRowCopy (LO LocalRow,
662  nonconst_local_inds_host_view_type &Indices,
663  nonconst_values_host_view_type &Values,
664  size_t& NumEntries) const
665  {
666  auto vals = getValuesHost(LocalRow);
667  const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
668  if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
669  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
670  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
671  << numInds << " row entries");
672  }
673  const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
674  for (LO i=0; i<numInds; ++i) {
675  Indices[i] = colInds[i];
676  }
677  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
678  Values[i] = vals[i];
679  }
680  NumEntries = numInds;
681  }
682 
683  template<class Scalar, class LO, class GO, class Node>
684  LO
686  getLocalRowOffsets (const LO localRowInd,
687  ptrdiff_t offsets[],
688  const LO colInds[],
689  const LO numColInds) const
690  {
691  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
692  // We got no offsets, because the input local row index is
693  // invalid on the calling process. That may not be an error, if
694  // numColInds is zero anyway; it doesn't matter. This is the
695  // advantage of returning the number of valid indices.
696  return static_cast<LO> (0);
697  }
698 
699  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
700  LO hint = 0; // Guess for the relative offset into the current row
701  LO validCount = 0; // number of valid column indices in colInds
702 
703  for (LO k = 0; k < numColInds; ++k) {
704  const LO relBlockOffset =
705  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
706  if (relBlockOffset != LINV) {
707  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
708  hint = relBlockOffset + 1;
709  ++validCount;
710  }
711  }
712  return validCount;
713  }
714 
715 
716  template<class Scalar, class LO, class GO, class Node>
717  LO
719  replaceLocalValuesByOffsets (const LO localRowInd,
720  const ptrdiff_t offsets[],
721  const Scalar vals[],
722  const LO numOffsets) const
723  {
724  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
725  // We modified no values, because the input local row index is
726  // invalid on the calling process. That may not be an error, if
727  // numColInds is zero anyway; it doesn't matter. This is the
728  // advantage of returning the number of valid indices.
729  return static_cast<LO> (0);
730  }
731  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
732  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
733  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
734  impl_scalar_type* vOut = val_out.data();
735 
736  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
737  const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
738  size_t pointOffset = 0; // Current offset into input values
739  LO validCount = 0; // number of valid offsets
740 
741  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
742  const size_t blockOffset = offsets[k]*perBlockSize;
743  if (offsets[k] != STINV) {
744  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
745  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
746  COPY (A_new, A_old);
747  ++validCount;
748  }
749  }
750  return validCount;
751  }
752 
753 
754  template<class Scalar, class LO, class GO, class Node>
755  LO
757  absMaxLocalValuesByOffsets (const LO localRowInd,
758  const ptrdiff_t offsets[],
759  const Scalar vals[],
760  const LO numOffsets) const
761  {
762  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
763  // We modified no values, because the input local row index is
764  // invalid on the calling process. That may not be an error, if
765  // numColInds is zero anyway; it doesn't matter. This is the
766  // advantage of returning the number of valid indices.
767  return static_cast<LO> (0);
768  }
769  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
770  auto val_out = getValuesHost(localRowInd);
771  impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
772 
773  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
774  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
775  size_t pointOffset = 0; // Current offset into input values
776  LO validCount = 0; // number of valid offsets
777 
778  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
779  const size_t blockOffset = offsets[k]*perBlockSize;
780  if (blockOffset != STINV) {
781  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
782  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
783  ::Tpetra::Impl::absMax (A_old, A_new);
784  ++validCount;
785  }
786  }
787  return validCount;
788  }
789 
790 
791  template<class Scalar, class LO, class GO, class Node>
792  LO
794  sumIntoLocalValuesByOffsets (const LO localRowInd,
795  const ptrdiff_t offsets[],
796  const Scalar vals[],
797  const LO numOffsets) const
798  {
799  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
800  // We modified no values, because the input local row index is
801  // invalid on the calling process. That may not be an error, if
802  // numColInds is zero anyway; it doesn't matter. This is the
803  // advantage of returning the number of valid indices.
804  return static_cast<LO> (0);
805  }
806  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
807  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
808  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
809  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
810  impl_scalar_type* vOut = val_out.data();
811 
812  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
813  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
814  size_t pointOffset = 0; // Current offset into input values
815  LO validCount = 0; // number of valid offsets
816 
817  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
818  const size_t blockOffset = offsets[k]*perBlockSize;
819  if (blockOffset != STINV) {
820  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
821  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
822  AXPY (ONE, A_new, A_old);
823  ++validCount;
824  }
825  }
826  return validCount;
827  }
828 
829  template<class Scalar, class LO, class GO, class Node>
831  impl_scalar_type_dualview::t_host::const_type
833  getValuesHost () const
834  {
835  return val_.getHostView(Access::ReadOnly);
836  }
837 
838  template<class Scalar, class LO, class GO, class Node>
839  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
840  impl_scalar_type_dualview::t_dev::const_type
841  BlockCrsMatrix<Scalar, LO, GO, Node>::
842  getValuesDevice () const
843  {
844  return val_.getDeviceView(Access::ReadOnly);
845  }
846 
847  template<class Scalar, class LO, class GO, class Node>
848  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
849  impl_scalar_type_dualview::t_host
852  {
853  return val_.getHostView(Access::ReadWrite);
854  }
855 
856  template<class Scalar, class LO, class GO, class Node>
858  impl_scalar_type_dualview::t_dev
861  {
862  return val_.getDeviceView(Access::ReadWrite);
863  }
864 
865  template<class Scalar, class LO, class GO, class Node>
866  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
867  impl_scalar_type_dualview::t_host::const_type
868  BlockCrsMatrix<Scalar, LO, GO, Node>::
869  getValuesHost (const LO& lclRow) const
870  {
871  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
872  auto val = val_.getHostView(Access::ReadOnly);
873  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
874  return r_val;
875  }
876 
877  template<class Scalar, class LO, class GO, class Node>
879  impl_scalar_type_dualview::t_dev::const_type
881  getValuesDevice (const LO& lclRow) const
882  {
883  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
884  auto val = val_.getDeviceView(Access::ReadOnly);
885  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
886  return r_val;
887  }
888 
889  template<class Scalar, class LO, class GO, class Node>
892  getValuesHostNonConst (const LO& lclRow)
893  {
894  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
895  auto val = val_.getHostView(Access::ReadWrite);
896  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
897  return r_val;
898  }
899 
900  template<class Scalar, class LO, class GO, class Node>
903  getValuesDeviceNonConst (const LO& lclRow)
904  {
905  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
906  auto val = val_.getDeviceView(Access::ReadWrite);
907  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
908  return r_val;
909  }
910 
911  template<class Scalar, class LO, class GO, class Node>
912  size_t
914  getNumEntriesInLocalRow (const LO localRowInd) const
915  {
916  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
917  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
918  return static_cast<LO> (0); // the calling process doesn't have that row
919  } else {
920  return static_cast<LO> (numEntInGraph);
921  }
922  }
923 
924  template<class Scalar, class LO, class GO, class Node>
925  typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
928  {
929  auto numCols = this->graph_.getColMap()->getLocalNumElements();
930  auto val = val_.getDeviceView(Access::ReadWrite);
931  const LO blockSize = this->getBlockSize ();
932  const auto graph = this->graph_.getLocalGraphDevice ();
933 
934  return local_matrix_device_type("Tpetra::BlockCrsMatrix::lclMatrixDevice",
935  numCols,
936  val,
937  graph,
938  blockSize);
939  }
940 
941  template<class Scalar, class LO, class GO, class Node>
942  void
946  const Teuchos::ETransp mode,
947  const Scalar alpha,
948  const Scalar beta)
949  {
950  (void) X;
951  (void) Y;
952  (void) mode;
953  (void) alpha;
954  (void) beta;
955 
956  TEUCHOS_TEST_FOR_EXCEPTION(
957  true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
958  "transpose and conjugate transpose modes are not yet implemented.");
959  }
960 
961  template<class Scalar, class LO, class GO, class Node>
962  void
963  BlockCrsMatrix<Scalar, LO, GO, Node>::
964  applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
965  BlockMultiVector<Scalar, LO, GO, Node>& Y,
966  const Scalar alpha,
967  const Scalar beta)
968  {
969  using Teuchos::RCP;
970  using Teuchos::rcp;
971  typedef ::Tpetra::Import<LO, GO, Node> import_type;
972  typedef ::Tpetra::Export<LO, GO, Node> export_type;
973  const Scalar zero = STS::zero ();
974  const Scalar one = STS::one ();
975  RCP<const import_type> import = graph_.getImporter ();
976  // "export" is a reserved C++ keyword, so we can't use it.
977  RCP<const export_type> theExport = graph_.getExporter ();
978  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
979 
980  if (alpha == zero) {
981  if (beta == zero) {
982  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
983  }
984  else if (beta != one) {
985  Y.scale (beta);
986  }
987  }
988  else { // alpha != 0
989  const BMV* X_colMap = NULL;
990  if (import.is_null ()) {
991  try {
992  X_colMap = &X;
993  }
994  catch (std::exception& e) {
995  TEUCHOS_TEST_FOR_EXCEPTION
996  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
997  "operator= threw an exception: " << std::endl << e.what ());
998  }
999  }
1000  else {
1001  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1002  // Y_rowMap_ below. This lets us do lazy initialization
1003  // correctly with view semantics of BlockCrsMatrix. All views
1004  // of this BlockCrsMatrix have the same outer pointer. That
1005  // way, we can set the inner pointer in one view, and all
1006  // other views will see it.
1007  if ((*X_colMap_).is_null () ||
1008  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1009  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1010  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1011  static_cast<LO> (X.getNumVectors ())));
1012  }
1013  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1014  **pointImporter_,
1016  try {
1017  X_colMap = &(**X_colMap_);
1018  }
1019  catch (std::exception& e) {
1020  TEUCHOS_TEST_FOR_EXCEPTION
1021  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1022  "operator= threw an exception: " << std::endl << e.what ());
1023  }
1024  }
1025 
1026  BMV* Y_rowMap = NULL;
1027  if (theExport.is_null ()) {
1028  Y_rowMap = &Y;
1029  }
1030  else if ((*Y_rowMap_).is_null () ||
1031  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1032  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1033  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1034  static_cast<LO> (X.getNumVectors ())));
1035  try {
1036  Y_rowMap = &(**Y_rowMap_);
1037  }
1038  catch (std::exception& e) {
1039  TEUCHOS_TEST_FOR_EXCEPTION(
1040  true, std::logic_error, prefix << "Tpetra::MultiVector::"
1041  "operator= threw an exception: " << std::endl << e.what ());
1042  }
1043  }
1044 
1045  try {
1046  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1047  }
1048  catch (std::exception& e) {
1049  TEUCHOS_TEST_FOR_EXCEPTION
1050  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1051  "an exception: " << e.what ());
1052  }
1053  catch (...) {
1054  TEUCHOS_TEST_FOR_EXCEPTION
1055  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1056  "an exception not a subclass of std::exception.");
1057  }
1058 
1059  if (! theExport.is_null ()) {
1060  Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1061  }
1062  }
1063  }
1064 
1065 // clang-format on
1066 template <class Scalar, class LO, class GO, class Node>
1067 void BlockCrsMatrix<Scalar, LO, GO, Node>::localApplyBlockNoTrans(
1068  const BlockMultiVector<Scalar, LO, GO, Node> &X,
1069  BlockMultiVector<Scalar, LO, GO, Node> &Y, const Scalar alpha,
1070  const Scalar beta) {
1071  ::Tpetra::Details::ProfilingRegion profile_region(
1072  "Tpetra::BlockCrsMatrix::localApplyBlockNoTrans");
1073  const impl_scalar_type alpha_impl = alpha;
1074  const auto graph = this->graph_.getLocalGraphDevice();
1075 
1076  mv_type X_mv = X.getMultiVectorView();
1077  mv_type Y_mv = Y.getMultiVectorView();
1078  auto X_lcl = X_mv.getLocalViewDevice(Access::ReadOnly);
1079  auto Y_lcl = Y_mv.getLocalViewDevice(Access::ReadWrite);
1080 
1081  auto A_lcl = getLocalMatrixDevice();
1082  if(!applyHelper.get()) {
1083  // The apply helper does not exist, so create it
1084  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map);
1085  }
1086  if(applyHelper->shouldUseIntRowptrs())
1087  {
1088  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
1089  KokkosSparse::spmv(
1090  &applyHelper->handle_int, KokkosSparse::NoTranspose,
1091  alpha_impl, A_lcl_int_rowptrs, X_lcl, beta, Y_lcl);
1092  }
1093  else
1094  {
1095  KokkosSparse::spmv(
1096  &applyHelper->handle, KokkosSparse::NoTranspose,
1097  alpha_impl, A_lcl, X_lcl, beta, Y_lcl);
1098  }
1099 }
1100 // clang-format off
1101 
1102  template<class Scalar, class LO, class GO, class Node>
1103  LO
1104  BlockCrsMatrix<Scalar, LO, GO, Node>::
1105  findRelOffsetOfColumnIndex (const LO localRowIndex,
1106  const LO colIndexToFind,
1107  const LO hint) const
1108  {
1109  const size_t absStartOffset = ptrHost_[localRowIndex];
1110  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1111  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1112  // Amortize pointer arithmetic over the search loop.
1113  const LO* const curInd = indHost_.data () + absStartOffset;
1114 
1115  // If the hint was correct, then the hint is the offset to return.
1116  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1117  // Always return the offset relative to the current row.
1118  return hint;
1119  }
1120 
1121  // The hint was wrong, so we must search for the given column
1122  // index in the column indices for the given row.
1123  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1124 
1125  // We require that the graph have sorted rows. However, binary
1126  // search only pays if the current row is longer than a certain
1127  // amount. We set this to 32, but you might want to tune this.
1128  const LO maxNumEntriesForLinearSearch = 32;
1129  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1130  // Use binary search. It would probably be better for us to
1131  // roll this loop by hand. If we wrote it right, a smart
1132  // compiler could perhaps use conditional loads and avoid
1133  // branches (according to Jed Brown on May 2014).
1134  const LO* beg = curInd;
1135  const LO* end = curInd + numEntriesInRow;
1136  std::pair<const LO*, const LO*> p =
1137  std::equal_range (beg, end, colIndexToFind);
1138  if (p.first != p.second) {
1139  // offset is relative to the current row
1140  relOffset = static_cast<LO> (p.first - beg);
1141  }
1142  }
1143  else { // use linear search
1144  for (LO k = 0; k < numEntriesInRow; ++k) {
1145  if (colIndexToFind == curInd[k]) {
1146  relOffset = k; // offset is relative to the current row
1147  break;
1148  }
1149  }
1150  }
1151 
1152  return relOffset;
1153  }
1154 
1155  template<class Scalar, class LO, class GO, class Node>
1156  LO
1157  BlockCrsMatrix<Scalar, LO, GO, Node>::
1158  offsetPerBlock () const
1159  {
1160  return offsetPerBlock_;
1161  }
1162 
1163  template<class Scalar, class LO, class GO, class Node>
1164  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1165  BlockCrsMatrix<Scalar, LO, GO, Node>::
1166  getConstLocalBlockFromInput (const impl_scalar_type* val,
1167  const size_t pointOffset) const
1168  {
1169  // Row major blocks
1170  const LO rowStride = blockSize_;
1171  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1172  }
1173 
1174  template<class Scalar, class LO, class GO, class Node>
1175  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1176  BlockCrsMatrix<Scalar, LO, GO, Node>::
1177  getNonConstLocalBlockFromInput (impl_scalar_type* val,
1178  const size_t pointOffset) const
1179  {
1180  // Row major blocks
1181  const LO rowStride = blockSize_;
1182  return little_block_type (val + pointOffset, blockSize_, rowStride);
1183  }
1184 
1185  template<class Scalar, class LO, class GO, class Node>
1186  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1187  BlockCrsMatrix<Scalar, LO, GO, Node>::
1188  getNonConstLocalBlockFromInputHost (impl_scalar_type* val,
1189  const size_t pointOffset) const
1190  {
1191  // Row major blocks
1192  const LO rowStride = blockSize_;
1193  const size_t bs2 = blockSize_ * blockSize_;
1194  return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
1195  }
1196 
1197  template<class Scalar, class LO, class GO, class Node>
1198  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1199  BlockCrsMatrix<Scalar, LO, GO, Node>::
1200  getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const
1201  {
1202  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1203 
1204  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1205  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1206  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1207  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1208  auto vals = const_cast<this_BCRS_type&>(*this).getValuesDeviceNonConst();
1209  auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1210  return r_val;
1211  }
1212  else {
1213  return little_block_type ();
1214  }
1215  }
1216 
1217  template<class Scalar, class LO, class GO, class Node>
1218  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1219  BlockCrsMatrix<Scalar, LO, GO, Node>::
1220  getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const
1221  {
1222  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1223 
1224  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1225  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1226  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1227  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1228  auto vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst();
1229  auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
1230  return r_val;
1231  }
1232  else {
1233  return little_block_host_type ();
1234  }
1235  }
1236 
1237 
1238  template<class Scalar, class LO, class GO, class Node>
1239  bool
1240  BlockCrsMatrix<Scalar, LO, GO, Node>::
1241  checkSizes (const ::Tpetra::SrcDistObject& source)
1242  {
1243  using std::endl;
1244  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1245  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1246 
1247  if (src == NULL) {
1248  std::ostream& err = markLocalErrorAndGetStream ();
1249  err << "checkSizes: The source object of the Import or Export "
1250  "must be a BlockCrsMatrix with the same template parameters as the "
1251  "target object." << endl;
1252  }
1253  else {
1254  // Use a string of ifs, not if-elseifs, because we want to know
1255  // all the errors.
1256  if (src->getBlockSize () != this->getBlockSize ()) {
1257  std::ostream& err = markLocalErrorAndGetStream ();
1258  err << "checkSizes: The source and target objects of the Import or "
1259  << "Export must have the same block sizes. The source's block "
1260  << "size = " << src->getBlockSize () << " != the target's block "
1261  << "size = " << this->getBlockSize () << "." << endl;
1262  }
1263  if (! src->graph_.isFillComplete ()) {
1264  std::ostream& err = markLocalErrorAndGetStream ();
1265  err << "checkSizes: The source object of the Import or Export is "
1266  "not fill complete. Both source and target objects must be fill "
1267  "complete." << endl;
1268  }
1269  if (! this->graph_.isFillComplete ()) {
1270  std::ostream& err = markLocalErrorAndGetStream ();
1271  err << "checkSizes: The target object of the Import or Export is "
1272  "not fill complete. Both source and target objects must be fill "
1273  "complete." << endl;
1274  }
1275  if (src->graph_.getColMap ().is_null ()) {
1276  std::ostream& err = markLocalErrorAndGetStream ();
1277  err << "checkSizes: The source object of the Import or Export does "
1278  "not have a column Map. Both source and target objects must have "
1279  "column Maps." << endl;
1280  }
1281  if (this->graph_.getColMap ().is_null ()) {
1282  std::ostream& err = markLocalErrorAndGetStream ();
1283  err << "checkSizes: The target object of the Import or Export does "
1284  "not have a column Map. Both source and target objects must have "
1285  "column Maps." << endl;
1286  }
1287  }
1288 
1289  return ! (* (this->localError_));
1290  }
1291 
1292  template<class Scalar, class LO, class GO, class Node>
1293  void
1296  (const ::Tpetra::SrcDistObject& source,
1297  const size_t numSameIDs,
1298  const Kokkos::DualView<const local_ordinal_type*,
1299  buffer_device_type>& permuteToLIDs,
1300  const Kokkos::DualView<const local_ordinal_type*,
1301  buffer_device_type>& permuteFromLIDs,
1302  const CombineMode /*CM*/)
1303  {
1304  using ::Tpetra::Details::Behavior;
1306  using ::Tpetra::Details::ProfilingRegion;
1307  using std::endl;
1308  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1309 
1310  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
1311  const bool debug = Behavior::debug();
1312  const bool verbose = Behavior::verbose();
1313 
1314  // Define this function prefix
1315  std::string prefix;
1316  {
1317  std::ostringstream os;
1318  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1319  os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
1320  prefix = os.str();
1321  }
1322 
1323  // check if this already includes a local error
1324  if (* (this->localError_)) {
1325  std::ostream& err = this->markLocalErrorAndGetStream ();
1326  err << prefix
1327  << "The target object of the Import or Export is already in an error state."
1328  << endl;
1329  return;
1330  }
1331 
1332  //
1333  // Verbose input dual view status
1334  //
1335  if (verbose) {
1336  std::ostringstream os;
1337  os << prefix << endl
1338  << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
1339  << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
1340  std::cerr << os.str ();
1341  }
1342 
1346  if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
1347  std::ostream& err = this->markLocalErrorAndGetStream ();
1348  err << prefix
1349  << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
1350  << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1351  << "." << endl;
1352  return;
1353  }
1354  if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
1355  std::ostream& err = this->markLocalErrorAndGetStream ();
1356  err << prefix
1357  << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1358  << endl;
1359  return;
1360  }
1361 
1362  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1363  if (src == NULL) {
1364  std::ostream& err = this->markLocalErrorAndGetStream ();
1365  err << prefix
1366  << "The source (input) object of the Import or "
1367  "Export is either not a BlockCrsMatrix, or does not have the right "
1368  "template parameters. checkSizes() should have caught this. "
1369  "Please report this bug to the Tpetra developers." << endl;
1370  return;
1371  }
1372 
1373  bool lclErr = false;
1374 #ifdef HAVE_TPETRA_DEBUG
1375  std::set<LO> invalidSrcCopyRows;
1376  std::set<LO> invalidDstCopyRows;
1377  std::set<LO> invalidDstCopyCols;
1378  std::set<LO> invalidDstPermuteCols;
1379  std::set<LO> invalidPermuteFromRows;
1380 #endif // HAVE_TPETRA_DEBUG
1381 
1382  // Copy the initial sequence of rows that are the same.
1383  //
1384  // The two graphs might have different column Maps, so we need to
1385  // do this using global column indices. This is purely local, so
1386  // we only need to check for local sameness of the two column
1387  // Maps.
1388 
1389 #ifdef HAVE_TPETRA_DEBUG
1390  const map_type& srcRowMap = * (src->graph_.getRowMap ());
1391 #endif // HAVE_TPETRA_DEBUG
1392  const map_type& dstRowMap = * (this->graph_.getRowMap ());
1393  const map_type& srcColMap = * (src->graph_.getColMap ());
1394  const map_type& dstColMap = * (this->graph_.getColMap ());
1395  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
1396 
1397  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
1398  if (verbose) {
1399  std::ostringstream os;
1400  os << prefix
1401  << "canUseLocalColumnIndices: "
1402  << (canUseLocalColumnIndices ? "true" : "false")
1403  << ", numPermute: " << numPermute
1404  << endl;
1405  std::cerr << os.str ();
1406  }
1407 
1408  const auto permuteToLIDsHost = permuteToLIDs.view_host();
1409  const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1410 
1411  if (canUseLocalColumnIndices) {
1412  // Copy local rows that are the "same" in both source and target.
1413  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1414 #ifdef HAVE_TPETRA_DEBUG
1415  if (! srcRowMap.isNodeLocalElement (localRow)) {
1416  lclErr = true;
1417  invalidSrcCopyRows.insert (localRow);
1418  continue; // skip invalid rows
1419  }
1420 #endif // HAVE_TPETRA_DEBUG
1421 
1422  local_inds_host_view_type lclSrcCols;
1423  values_host_view_type vals;
1424  LO numEntries;
1425  // If this call fails, that means the mesh row local index is
1426  // invalid. That means the Import or Export is invalid somehow.
1427  src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1428  if (numEntries > 0) {
1429  LO err = this->replaceLocalValues (localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1430  if (err != numEntries) {
1431  lclErr = true;
1432  if (! dstRowMap.isNodeLocalElement (localRow)) {
1433 #ifdef HAVE_TPETRA_DEBUG
1434  invalidDstCopyRows.insert (localRow);
1435 #endif // HAVE_TPETRA_DEBUG
1436  }
1437  else {
1438  // Once there's an error, there's no sense in saving
1439  // time, so we check whether the column indices were
1440  // invalid. However, only remember which ones were
1441  // invalid in a debug build, because that might take a
1442  // lot of space.
1443  for (LO k = 0; k < numEntries; ++k) {
1444  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
1445  lclErr = true;
1446 #ifdef HAVE_TPETRA_DEBUG
1447  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1448 #endif // HAVE_TPETRA_DEBUG
1449  }
1450  }
1451  }
1452  }
1453  }
1454  } // for each "same" local row
1455 
1456  // Copy the "permute" local rows.
1457  for (size_t k = 0; k < numPermute; ++k) {
1458  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1459  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1460 
1461  local_inds_host_view_type lclSrcCols;
1462  values_host_view_type vals;
1463  LO numEntries;
1464  src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1465  if (numEntries > 0) {
1466  LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1467  if (err != numEntries) {
1468  lclErr = true;
1469 #ifdef HAVE_TPETRA_DEBUG
1470  for (LO c = 0; c < numEntries; ++c) {
1471  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
1472  invalidDstPermuteCols.insert (lclSrcCols[c]);
1473  }
1474  }
1475 #endif // HAVE_TPETRA_DEBUG
1476  }
1477  }
1478  }
1479  }
1480  else { // must convert column indices to global
1481  // Reserve space to store the destination matrix's local column indices.
1482  const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries ();
1483  Teuchos::Array<LO> lclDstCols (maxNumEnt);
1484 
1485  // Copy local rows that are the "same" in both source and target.
1486  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1487  local_inds_host_view_type lclSrcCols;
1488  values_host_view_type vals;
1489  LO numEntries;
1490 
1491  // If this call fails, that means the mesh row local index is
1492  // invalid. That means the Import or Export is invalid somehow.
1493  try {
1494  src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1495  } catch (std::exception& e) {
1496  if (debug) {
1497  std::ostringstream os;
1498  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1499  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1500  << localRow << ", src->getLocalRowView() threw an exception: "
1501  << e.what ();
1502  std::cerr << os.str ();
1503  }
1504  throw e;
1505  }
1506 
1507  if (numEntries > 0) {
1508  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1509  lclErr = true;
1510  if (debug) {
1511  std::ostringstream os;
1512  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1513  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1514  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
1515  << maxNumEnt << endl;
1516  std::cerr << os.str ();
1517  }
1518  }
1519  else {
1520  // Convert the source matrix's local column indices to the
1521  // destination matrix's local column indices.
1522  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1523  for (LO j = 0; j < numEntries; ++j) {
1524  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1525  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1526  lclErr = true;
1527 #ifdef HAVE_TPETRA_DEBUG
1528  invalidDstCopyCols.insert (lclDstColsView[j]);
1529 #endif // HAVE_TPETRA_DEBUG
1530  }
1531  }
1532  LO err(0);
1533  try {
1534  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1535  } catch (std::exception& e) {
1536  if (debug) {
1537  std::ostringstream os;
1538  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1539  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1540  << localRow << ", this->replaceLocalValues() threw an exception: "
1541  << e.what ();
1542  std::cerr << os.str ();
1543  }
1544  throw e;
1545  }
1546  if (err != numEntries) {
1547  lclErr = true;
1548  if (debug) {
1549  std::ostringstream os;
1550  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1551  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
1552  "localRow " << localRow << ", this->replaceLocalValues "
1553  "returned " << err << " instead of numEntries = "
1554  << numEntries << endl;
1555  std::cerr << os.str ();
1556  }
1557  }
1558  }
1559  }
1560  }
1561 
1562  // Copy the "permute" local rows.
1563  for (size_t k = 0; k < numPermute; ++k) {
1564  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1565  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1566 
1567  local_inds_host_view_type lclSrcCols;
1568  values_host_view_type vals;
1569  LO numEntries;
1570 
1571  try {
1572  src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1573  } catch (std::exception& e) {
1574  if (debug) {
1575  std::ostringstream os;
1576  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1577  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
1578  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
1579  << ", src->getLocalRowView() threw an exception: " << e.what ();
1580  std::cerr << os.str ();
1581  }
1582  throw e;
1583  }
1584 
1585  if (numEntries > 0) {
1586  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1587  lclErr = true;
1588  }
1589  else {
1590  // Convert the source matrix's local column indices to the
1591  // destination matrix's local column indices.
1592  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1593  for (LO j = 0; j < numEntries; ++j) {
1594  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1595  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1596  lclErr = true;
1597 #ifdef HAVE_TPETRA_DEBUG
1598  invalidDstPermuteCols.insert (lclDstColsView[j]);
1599 #endif // HAVE_TPETRA_DEBUG
1600  }
1601  }
1602  LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1603  if (err != numEntries) {
1604  lclErr = true;
1605  }
1606  }
1607  }
1608  }
1609  }
1610 
1611  if (lclErr) {
1612  std::ostream& err = this->markLocalErrorAndGetStream ();
1613 #ifdef HAVE_TPETRA_DEBUG
1614  err << "copyAndPermute: The graph structure of the source of the "
1615  "Import or Export must be a subset of the graph structure of the "
1616  "target. ";
1617  err << "invalidSrcCopyRows = [";
1618  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
1619  it != invalidSrcCopyRows.end (); ++it) {
1620  err << *it;
1621  typename std::set<LO>::const_iterator itp1 = it;
1622  itp1++;
1623  if (itp1 != invalidSrcCopyRows.end ()) {
1624  err << ",";
1625  }
1626  }
1627  err << "], invalidDstCopyRows = [";
1628  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
1629  it != invalidDstCopyRows.end (); ++it) {
1630  err << *it;
1631  typename std::set<LO>::const_iterator itp1 = it;
1632  itp1++;
1633  if (itp1 != invalidDstCopyRows.end ()) {
1634  err << ",";
1635  }
1636  }
1637  err << "], invalidDstCopyCols = [";
1638  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
1639  it != invalidDstCopyCols.end (); ++it) {
1640  err << *it;
1641  typename std::set<LO>::const_iterator itp1 = it;
1642  itp1++;
1643  if (itp1 != invalidDstCopyCols.end ()) {
1644  err << ",";
1645  }
1646  }
1647  err << "], invalidDstPermuteCols = [";
1648  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
1649  it != invalidDstPermuteCols.end (); ++it) {
1650  err << *it;
1651  typename std::set<LO>::const_iterator itp1 = it;
1652  itp1++;
1653  if (itp1 != invalidDstPermuteCols.end ()) {
1654  err << ",";
1655  }
1656  }
1657  err << "], invalidPermuteFromRows = [";
1658  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
1659  it != invalidPermuteFromRows.end (); ++it) {
1660  err << *it;
1661  typename std::set<LO>::const_iterator itp1 = it;
1662  itp1++;
1663  if (itp1 != invalidPermuteFromRows.end ()) {
1664  err << ",";
1665  }
1666  }
1667  err << "]" << endl;
1668 #else
1669  err << "copyAndPermute: The graph structure of the source of the "
1670  "Import or Export must be a subset of the graph structure of the "
1671  "target." << endl;
1672 #endif // HAVE_TPETRA_DEBUG
1673  }
1674 
1675  if (debug) {
1676  std::ostringstream os;
1677  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1678  const bool lclSuccess = ! (* (this->localError_));
1679  os << "*** Proc " << myRank << ": copyAndPermute "
1680  << (lclSuccess ? "succeeded" : "FAILED");
1681  if (lclSuccess) {
1682  os << endl;
1683  } else {
1684  os << ": error messages: " << this->errorMessages (); // comes w/ endl
1685  }
1686  std::cerr << os.str ();
1687  }
1688  }
1689 
1690  namespace { // (anonymous)
1691 
1710  template<class LO, class GO>
1711  size_t
1712  packRowCount (const size_t numEnt,
1713  const size_t numBytesPerValue,
1714  const size_t blkSize)
1715  {
1716  using ::Tpetra::Details::PackTraits;
1717 
1718  if (numEnt == 0) {
1719  // Empty rows always take zero bytes, to ensure sparsity.
1720  return 0;
1721  }
1722  else {
1723  // We store the number of entries as a local index (LO).
1724  LO numEntLO = 0; // packValueCount wants this.
1725  GO gid {};
1726  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1727  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1728  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
1729  return numEntLen + gidsLen + valsLen;
1730  }
1731  }
1732 
1743  template<class ST, class LO, class GO>
1744  size_t
1745  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
1746  const size_t offset,
1747  const size_t numBytes,
1748  const size_t /* numBytesPerValue */)
1749  {
1750  using Kokkos::subview;
1751  using ::Tpetra::Details::PackTraits;
1752 
1753  if (numBytes == 0) {
1754  // Empty rows always take zero bytes, to ensure sparsity.
1755  return static_cast<size_t> (0);
1756  }
1757  else {
1758  LO numEntLO = 0;
1759  const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
1760  TEUCHOS_TEST_FOR_EXCEPTION
1761  (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1762  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
1763  << ".");
1764  const char* const inBuf = imports.data () + offset;
1765  const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
1766  TEUCHOS_TEST_FOR_EXCEPTION
1767  (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1768  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
1769  << ".");
1770  return static_cast<size_t> (numEntLO);
1771  }
1772  }
1773 
1777  template<class ST, class LO, class GO>
1778  size_t
1779  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
1780  const size_t offset,
1781  const size_t numEnt,
1782  const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
1783  const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
1784  const size_t numBytesPerValue,
1785  const size_t blockSize)
1786  {
1787  using Kokkos::subview;
1788  using ::Tpetra::Details::PackTraits;
1789 
1790  if (numEnt == 0) {
1791  // Empty rows always take zero bytes, to ensure sparsity.
1792  return 0;
1793  }
1794  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1795 
1796  const GO gid = 0; // packValueCount wants this
1797  const LO numEntLO = static_cast<size_t> (numEnt);
1798 
1799  const size_t numEntBeg = offset;
1800  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1801  const size_t gidsBeg = numEntBeg + numEntLen;
1802  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1803  const size_t valsBeg = gidsBeg + gidsLen;
1804  const size_t valsLen = numScalarEnt * numBytesPerValue;
1805 
1806  char* const numEntOut = exports.data () + numEntBeg;
1807  char* const gidsOut = exports.data () + gidsBeg;
1808  char* const valsOut = exports.data () + valsBeg;
1809 
1810  size_t numBytesOut = 0;
1811  int errorCode = 0;
1812  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
1813 
1814  {
1815  Kokkos::pair<int, size_t> p;
1816  p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
1817  errorCode += p.first;
1818  numBytesOut += p.second;
1819 
1820  p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
1821  errorCode += p.first;
1822  numBytesOut += p.second;
1823  }
1824 
1825  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1826  TEUCHOS_TEST_FOR_EXCEPTION
1827  (numBytesOut != expectedNumBytes, std::logic_error,
1828  "packRowForBlockCrs: numBytesOut = " << numBytesOut
1829  << " != expectedNumBytes = " << expectedNumBytes << ".");
1830 
1831  TEUCHOS_TEST_FOR_EXCEPTION
1832  (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
1833  "PackTraits::packArray returned a nonzero error code");
1834 
1835  return numBytesOut;
1836  }
1837 
1838  // Return the number of bytes actually read / used.
1839  template<class ST, class LO, class GO>
1840  size_t
1841  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
1842  const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
1843  const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
1844  const size_t offset,
1845  const size_t numBytes,
1846  const size_t numEnt,
1847  const size_t numBytesPerValue,
1848  const size_t blockSize)
1849  {
1850  using ::Tpetra::Details::PackTraits;
1851 
1852  if (numBytes == 0) {
1853  // Rows with zero bytes always have zero entries.
1854  return 0;
1855  }
1856  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1857  TEUCHOS_TEST_FOR_EXCEPTION
1858  (static_cast<size_t> (imports.extent (0)) <= offset,
1859  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1860  << imports.extent (0) << " <= offset = " << offset << ".");
1861  TEUCHOS_TEST_FOR_EXCEPTION
1862  (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
1863  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1864  << imports.extent (0) << " < offset + numBytes = "
1865  << (offset + numBytes) << ".");
1866 
1867  const GO gid = 0; // packValueCount wants this
1868  const LO lid = 0; // packValueCount wants this
1869 
1870  const size_t numEntBeg = offset;
1871  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
1872  const size_t gidsBeg = numEntBeg + numEntLen;
1873  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1874  const size_t valsBeg = gidsBeg + gidsLen;
1875  const size_t valsLen = numScalarEnt * numBytesPerValue;
1876 
1877  const char* const numEntIn = imports.data () + numEntBeg;
1878  const char* const gidsIn = imports.data () + gidsBeg;
1879  const char* const valsIn = imports.data () + valsBeg;
1880 
1881  size_t numBytesOut = 0;
1882  int errorCode = 0;
1883  LO numEntOut;
1884  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
1885  TEUCHOS_TEST_FOR_EXCEPTION
1886  (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
1887  "unpackRowForBlockCrs: Expected number of entries " << numEnt
1888  << " != actual number of entries " << numEntOut << ".");
1889 
1890  {
1891  Kokkos::pair<int, size_t> p;
1892  p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
1893  errorCode += p.first;
1894  numBytesOut += p.second;
1895 
1896  p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
1897  errorCode += p.first;
1898  numBytesOut += p.second;
1899  }
1900 
1901  TEUCHOS_TEST_FOR_EXCEPTION
1902  (numBytesOut != numBytes, std::logic_error,
1903  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1904  << " != numBytes = " << numBytes << ".");
1905 
1906  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1907  TEUCHOS_TEST_FOR_EXCEPTION
1908  (numBytesOut != expectedNumBytes, std::logic_error,
1909  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1910  << " != expectedNumBytes = " << expectedNumBytes << ".");
1911 
1912  TEUCHOS_TEST_FOR_EXCEPTION
1913  (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
1914  "PackTraits::unpackArray returned a nonzero error code");
1915 
1916  return numBytesOut;
1917  }
1918  } // namespace (anonymous)
1919 
1920  template<class Scalar, class LO, class GO, class Node>
1921  void
1924  (const ::Tpetra::SrcDistObject& source,
1925  const Kokkos::DualView<const local_ordinal_type*,
1926  buffer_device_type>& exportLIDs,
1927  Kokkos::DualView<packet_type*,
1928  buffer_device_type>& exports, // output
1929  Kokkos::DualView<size_t*,
1930  buffer_device_type> numPacketsPerLID, // output
1931  size_t& constantNumPackets)
1932  {
1933  using ::Tpetra::Details::Behavior;
1935  using ::Tpetra::Details::ProfilingRegion;
1936  using ::Tpetra::Details::PackTraits;
1937 
1938  typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
1939 
1940  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1941 
1942  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
1943 
1944  const bool debug = Behavior::debug();
1945  const bool verbose = Behavior::verbose();
1946 
1947  // Define this function prefix
1948  std::string prefix;
1949  {
1950  std::ostringstream os;
1951  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1952  os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
1953  prefix = os.str();
1954  }
1955 
1956  // check if this already includes a local error
1957  if (* (this->localError_)) {
1958  std::ostream& err = this->markLocalErrorAndGetStream ();
1959  err << prefix
1960  << "The target object of the Import or Export is already in an error state."
1961  << std::endl;
1962  return;
1963  }
1964 
1965  //
1966  // Verbose input dual view status
1967  //
1968  if (verbose) {
1969  std::ostringstream os;
1970  os << prefix << std::endl
1971  << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
1972  << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
1973  << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
1974  std::cerr << os.str ();
1975  }
1976 
1980  if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
1981  std::ostream& err = this->markLocalErrorAndGetStream ();
1982  err << prefix
1983  << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
1984  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
1985  << "." << std::endl;
1986  return;
1987  }
1988  if (exportLIDs.need_sync_host ()) {
1989  std::ostream& err = this->markLocalErrorAndGetStream ();
1990  err << prefix << "exportLIDs be sync'd to host." << std::endl;
1991  return;
1992  }
1993 
1994  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1995  if (src == NULL) {
1996  std::ostream& err = this->markLocalErrorAndGetStream ();
1997  err << prefix
1998  << "The source (input) object of the Import or "
1999  "Export is either not a BlockCrsMatrix, or does not have the right "
2000  "template parameters. checkSizes() should have caught this. "
2001  "Please report this bug to the Tpetra developers." << std::endl;
2002  return;
2003  }
2004 
2005  // Graphs and matrices are allowed to have a variable number of
2006  // entries per row. We could test whether all rows have the same
2007  // number of entries, but DistObject can only use this
2008  // optimization if all rows on _all_ processes have the same
2009  // number of entries. Rather than do the all-reduce necessary to
2010  // test for this unlikely case, we tell DistObject (by setting
2011  // constantNumPackets to zero) to assume that different rows may
2012  // have different numbers of entries.
2013  constantNumPackets = 0;
2014 
2015  // const values
2016  const crs_graph_type& srcGraph = src->graph_;
2017  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2018  const size_t numExportLIDs = exportLIDs.extent (0);
2019  size_t numBytesPerValue(0);
2020  {
2021  auto val_host = val_.getHostView(Access::ReadOnly);
2022  numBytesPerValue =
2023  PackTraits<impl_scalar_type>
2024  ::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2025  }
2026 
2027  // Compute the number of bytes ("packets") per row to pack. While
2028  // we're at it, compute the total # of block entries to send, and
2029  // the max # of block entries in any of the rows we're sending.
2030 
2031  Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2032 
2033  // Graph information is on host; let's do this on host parallel reduce
2034  auto exportLIDsHost = exportLIDs.view_host();
2035  auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2036  numPacketsPerLID.modify_host ();
2037  {
2038  rowReducerStruct = Impl::BlockCrsRowStruct<size_t>();
2039  for (size_t i = 0; i < numExportLIDs; ++i) {
2040  const LO lclRow = exportLIDsHost(i);
2041  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2042  numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2043 
2044  const size_t numBytes =
2045  packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2046  numPacketsPerLIDHost(i) = numBytes;
2047  rowReducerStruct += Impl::BlockCrsRowStruct<size_t>(numEnt, numBytes, numEnt);
2048  }
2049  }
2050 
2051  // Compute the number of bytes ("packets") per row to pack. While
2052  // we're at it, compute the total # of block entries to send, and
2053  // the max # of block entries in any of the rows we're sending.
2054  const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2055  const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2056  const size_t maxRowLength = rowReducerStruct.maxRowLength;
2057 
2058  if (verbose) {
2059  std::ostringstream os;
2060  os << prefix
2061  << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2062  << std::endl;
2063  std::cerr << os.str ();
2064  }
2065 
2066  // We use a "struct of arrays" approach to packing each row's
2067  // entries. All the column indices (as global indices) go first,
2068  // then all their owning process ranks, and then the values.
2069  if(exports.extent(0) != totalNumBytes)
2070  {
2071  const std::string oldLabel = exports.view_device().label ();
2072  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2073  exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2074  }
2075  if (totalNumEntries > 0) {
2076  // Current position (in bytes) in the 'exports' output array.
2077  Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2078  {
2079  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2080  Kokkos::parallel_scan
2081  (policy,
2082  [=](const size_t &i, size_t &update, const bool &final) {
2083  if (final) offset(i) = update;
2084  update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2085  });
2086  }
2087  if (offset(numExportLIDs) != totalNumBytes) {
2088  std::ostream& err = this->markLocalErrorAndGetStream ();
2089  err << prefix
2090  << "At end of method, the final offset (in bytes) "
2091  << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2092  << totalNumBytes << ". "
2093  << "Please report this bug to the Tpetra developers." << std::endl;
2094  return;
2095  }
2096 
2097  // For each block row of the matrix owned by the calling
2098  // process, pack that block row's column indices and values into
2099  // the exports array.
2100 
2101  // Source matrix's column Map. We verified in checkSizes() that
2102  // the column Map exists (is not null).
2103  const map_type& srcColMap = * (srcGraph.getColMap ());
2104 
2105  // Pack the data for each row to send, into the 'exports' buffer.
2106  // exports will be modified on host.
2107  exports.modify_host();
2108  {
2109  typedef Kokkos::TeamPolicy<host_exec> policy_type;
2110  const auto policy =
2111  policy_type(numExportLIDs, 1, 1)
2112  .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2113  // The following parallel_for needs const access to the local values of src.
2114  // (the local graph is also accessed on host, but this does not use WDVs).
2115  getValuesHost();
2117  Kokkos::parallel_for
2118  (policy,
2119  [=](const typename policy_type::member_type &member) {
2120  const size_t i = member.league_rank();
2121  Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2122  gblColInds(member.team_scratch(0), maxRowLength);
2123 
2124  const LO lclRowInd = exportLIDsHost(i);
2125  local_inds_host_view_type lclColInds;
2126  values_host_view_type vals;
2127 
2128  // It's OK to ignore the return value, since if the calling
2129  // process doesn't own that local row, then the number of
2130  // entries in that row on the calling process is zero.
2131  src->getLocalRowView (lclRowInd, lclColInds, vals);
2132  const size_t numEnt = lclColInds.extent(0);
2133 
2134  // Convert column indices from local to global.
2135  for (size_t j = 0; j < numEnt; ++j)
2136  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2137 
2138  // Kyungjoo: additional wrapping scratch view is necessary
2139  // the following function interface need the same execution space
2140  // host scratch space somehow is not considered same as the host_exec
2141  // Copy the row's data into the current spot in the exports array.
2142  const size_t numBytes =
2143  packRowForBlockCrs<impl_scalar_type, LO, GO>
2144  (exports.view_host(),
2145  offset(i),
2146  numEnt,
2147  Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2148  vals,
2149  numBytesPerValue,
2150  blockSize);
2151 
2152  // numBytes should be same as the difference between offsets
2153  if (debug) {
2154  const size_t offsetDiff = offset(i+1) - offset(i);
2155  if (numBytes != offsetDiff) {
2156  std::ostringstream os;
2157  os << prefix
2158  << "numBytes computed from packRowForBlockCrs is different from "
2159  << "precomputed offset values, LID = " << i << std::endl;
2160  std::cerr << os.str ();
2161  }
2162  }
2163  }); // for each LID (of a row) to send
2165  }
2166  } // if totalNumEntries > 0
2167 
2168  if (debug) {
2169  std::ostringstream os;
2170  const bool lclSuccess = ! (* (this->localError_));
2171  os << prefix
2172  << (lclSuccess ? "succeeded" : "FAILED")
2173  << std::endl;
2174  std::cerr << os.str ();
2175  }
2176  }
2177 
2178  template<class Scalar, class LO, class GO, class Node>
2179  void
2182  (const Kokkos::DualView<const local_ordinal_type*,
2183  buffer_device_type>& importLIDs,
2184  Kokkos::DualView<packet_type*,
2185  buffer_device_type> imports,
2186  Kokkos::DualView<size_t*,
2187  buffer_device_type> numPacketsPerLID,
2188  const size_t /* constantNumPackets */,
2189  const CombineMode combineMode)
2190  {
2191  using ::Tpetra::Details::Behavior;
2193  using ::Tpetra::Details::ProfilingRegion;
2194  using ::Tpetra::Details::PackTraits;
2195  using std::endl;
2196  using host_exec =
2197  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2198 
2199  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2200  const bool verbose = Behavior::verbose ();
2201 
2202  // Define this function prefix
2203  std::string prefix;
2204  {
2205  std::ostringstream os;
2206  auto map = this->graph_.getRowMap ();
2207  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
2208  const int myRank = comm.is_null () ? -1 : comm->getRank ();
2209  os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2210  prefix = os.str ();
2211  if (verbose) {
2212  os << "Start" << endl;
2213  std::cerr << os.str ();
2214  }
2215  }
2216 
2217  // check if this already includes a local error
2218  if (* (this->localError_)) {
2219  std::ostream& err = this->markLocalErrorAndGetStream ();
2220  std::ostringstream os;
2221  os << prefix << "{Im/Ex}port target is already in error." << endl;
2222  if (verbose) {
2223  std::cerr << os.str ();
2224  }
2225  err << os.str ();
2226  return;
2227  }
2228 
2232  if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2233  std::ostream& err = this->markLocalErrorAndGetStream ();
2234  std::ostringstream os;
2235  os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
2236  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2237  << "." << endl;
2238  if (verbose) {
2239  std::cerr << os.str ();
2240  }
2241  err << os.str ();
2242  return;
2243  }
2244 
2245  if (combineMode != ADD && combineMode != INSERT &&
2246  combineMode != REPLACE && combineMode != ABSMAX &&
2247  combineMode != ZERO) {
2248  std::ostream& err = this->markLocalErrorAndGetStream ();
2249  std::ostringstream os;
2250  os << prefix
2251  << "Invalid CombineMode value " << combineMode << ". Valid "
2252  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2253  << std::endl;
2254  if (verbose) {
2255  std::cerr << os.str ();
2256  }
2257  err << os.str ();
2258  return;
2259  }
2260 
2261  if (this->graph_.getColMap ().is_null ()) {
2262  std::ostream& err = this->markLocalErrorAndGetStream ();
2263  std::ostringstream os;
2264  os << prefix << "Target matrix's column Map is null." << endl;
2265  if (verbose) {
2266  std::cerr << os.str ();
2267  }
2268  err << os.str ();
2269  return;
2270  }
2271 
2272  // Target matrix's column Map. Use to convert the global column
2273  // indices in the receive buffer to local indices. We verified in
2274  // checkSizes() that the column Map exists (is not null).
2275  const map_type& tgtColMap = * (this->graph_.getColMap ());
2276 
2277  // Const values
2278  const size_t blockSize = this->getBlockSize ();
2279  const size_t numImportLIDs = importLIDs.extent(0);
2280  // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
2281  // default-constructed instance could have a different size than
2282  // other instances. (We assume all nominally constructed
2283  // instances have the same size; that's not the issue here.) This
2284  // could be bad if the calling process has no entries, but other
2285  // processes have entries that they want to send to us.
2286  size_t numBytesPerValue(0);
2287  {
2288  auto val_host = val_.getHostView(Access::ReadOnly);
2289  numBytesPerValue =
2290  PackTraits<impl_scalar_type>::packValueCount
2291  (val_host.extent (0) ? val_host(0) : impl_scalar_type ());
2292  }
2293  const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries ();
2294  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2295 
2296  if (verbose) {
2297  std::ostringstream os;
2298  os << prefix << "combineMode: "
2299  << ::Tpetra::combineModeToString (combineMode)
2300  << ", blockSize: " << blockSize
2301  << ", numImportLIDs: " << numImportLIDs
2302  << ", numBytesPerValue: " << numBytesPerValue
2303  << ", maxRowNumEnt: " << maxRowNumEnt
2304  << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2305  std::cerr << os.str ();
2306  }
2307 
2308  if (combineMode == ZERO || numImportLIDs == 0) {
2309  if (verbose) {
2310  std::ostringstream os;
2311  os << prefix << "Nothing to unpack. Done!" << std::endl;
2312  std::cerr << os.str ();
2313  }
2314  return;
2315  }
2316 
2317  // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
2318  // we can remove this sync.
2319  if (imports.need_sync_host ()) {
2320  imports.sync_host ();
2321  }
2322 
2323  // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
2324  // sync'd numPacketsPerLID to host, since it needs to do that in
2325  // order to post MPI_Irecv messages with the correct lengths. A
2326  // hypothetical device-specific boundary exchange implementation
2327  // could possibly receive data without sync'ing lengths to host,
2328  // but we don't need to design for that nonexistent thing yet.
2329 
2330  if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
2331  importLIDs.need_sync_host ()) {
2332  std::ostream& err = this->markLocalErrorAndGetStream ();
2333  std::ostringstream os;
2334  os << prefix << "All input DualViews must be sync'd to host by now. "
2335  << "imports_nc.need_sync_host()="
2336  << (imports.need_sync_host () ? "true" : "false")
2337  << ", numPacketsPerLID.need_sync_host()="
2338  << (numPacketsPerLID.need_sync_host () ? "true" : "false")
2339  << ", importLIDs.need_sync_host()="
2340  << (importLIDs.need_sync_host () ? "true" : "false")
2341  << "." << endl;
2342  if (verbose) {
2343  std::cerr << os.str ();
2344  }
2345  err << os.str ();
2346  return;
2347  }
2348 
2349  const auto importLIDsHost = importLIDs.view_host ();
2350  const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
2351 
2352  // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
2353  // loop, by only unpacking on final==true (when we know the
2354  // current offset's value).
2355 
2356  Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
2357  {
2358  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
2359  Kokkos::parallel_scan
2360  ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2361  [=] (const size_t &i, size_t &update, const bool &final) {
2362  if (final) offset(i) = update;
2363  update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2364  });
2365  }
2366 
2367  // this variable does not matter with a race condition (just error flag)
2368  //
2369  // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
2370  // or atomics on bool, so we use int instead. (I know we're not
2371  // launching a CUDA loop here, but we could in the future, and I'd
2372  // like to avoid potential trouble.)
2373  Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2374  errorDuringUnpack ("errorDuringUnpack");
2375  errorDuringUnpack () = 0;
2376  {
2377  using policy_type = Kokkos::TeamPolicy<host_exec>;
2378  size_t scratch_per_row = sizeof(GO) * maxRowNumEnt + sizeof (LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt
2379  + 2 * sizeof(GO); // Yeah, this is a fudge factor
2380 
2381  const auto policy = policy_type (numImportLIDs, 1, 1)
2382  .set_scratch_size (0, Kokkos::PerTeam (scratch_per_row));
2383  using host_scratch_space = typename host_exec::scratch_memory_space;
2384 
2385  using pair_type = Kokkos::pair<size_t, size_t>;
2386 
2387  //The following parallel_for modifies values on host while unpacking.
2388  getValuesHostNonConst();
2390  Kokkos::parallel_for
2391  ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2392  [=] (const typename policy_type::member_type& member) {
2393  const size_t i = member.league_rank();
2394  Kokkos::View<GO*, host_scratch_space> gblColInds
2395  (member.team_scratch (0), maxRowNumEnt);
2396  Kokkos::View<LO*, host_scratch_space> lclColInds
2397  (member.team_scratch (0), maxRowNumEnt);
2398  Kokkos::View<impl_scalar_type*, host_scratch_space> vals
2399  (member.team_scratch (0), maxRowNumScalarEnt);
2400 
2401 
2402  const size_t offval = offset(i);
2403  const LO lclRow = importLIDsHost(i);
2404  const size_t numBytes = numPacketsPerLIDHost(i);
2405  const size_t numEnt =
2406  unpackRowCount<impl_scalar_type, LO, GO>
2407  (imports.view_host (), offval, numBytes, numBytesPerValue);
2408 
2409  if (numBytes > 0) {
2410  if (numEnt > maxRowNumEnt) {
2411  errorDuringUnpack() = 1;
2412  if (verbose) {
2413  std::ostringstream os;
2414  os << prefix
2415  << "At i = " << i << ", numEnt = " << numEnt
2416  << " > maxRowNumEnt = " << maxRowNumEnt
2417  << std::endl;
2418  std::cerr << os.str ();
2419  }
2420  return;
2421  }
2422  }
2423  const size_t numScalarEnt = numEnt*blockSize*blockSize;
2424  auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2425  auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2426  auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2427 
2428  // Kyungjoo: additional wrapping scratch view is necessary
2429  // the following function interface need the same execution space
2430  // host scratch space somehow is not considered same as the host_exec
2431  size_t numBytesOut = 0;
2432  try {
2433  numBytesOut =
2434  unpackRowForBlockCrs<impl_scalar_type, LO, GO>
2435  (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
2436  Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
2437  imports.view_host(),
2438  offval, numBytes, numEnt,
2439  numBytesPerValue, blockSize);
2440  }
2441  catch (std::exception& e) {
2442  errorDuringUnpack() = 1;
2443  if (verbose) {
2444  std::ostringstream os;
2445  os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
2446  << e.what () << endl;
2447  std::cerr << os.str ();
2448  }
2449  return;
2450  }
2451 
2452  if (numBytes != numBytesOut) {
2453  errorDuringUnpack() = 1;
2454  if (verbose) {
2455  std::ostringstream os;
2456  os << prefix
2457  << "At i = " << i << ", numBytes = " << numBytes
2458  << " != numBytesOut = " << numBytesOut << "."
2459  << std::endl;
2460  std::cerr << os.str();
2461  }
2462  return;
2463  }
2464 
2465  // Convert incoming global indices to local indices.
2466  for (size_t k = 0; k < numEnt; ++k) {
2467  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
2468  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2469  if (verbose) {
2470  std::ostringstream os;
2471  os << prefix
2472  << "At i = " << i << ", GID " << gidsOut(k)
2473  << " is not owned by the calling process."
2474  << std::endl;
2475  std::cerr << os.str();
2476  }
2477  return;
2478  }
2479  }
2480 
2481  // Combine the incoming data with the matrix's current data.
2482  LO numCombd = 0;
2483  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
2484  const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
2485  (const_cast<const impl_scalar_type*> (valsOut.data ()));
2486  if (combineMode == ADD) {
2487  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2488  } else if (combineMode == INSERT || combineMode == REPLACE) {
2489  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2490  } else if (combineMode == ABSMAX) {
2491  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2492  }
2493 
2494  if (static_cast<LO> (numEnt) != numCombd) {
2495  errorDuringUnpack() = 1;
2496  if (verbose) {
2497  std::ostringstream os;
2498  os << prefix << "At i = " << i << ", numEnt = " << numEnt
2499  << " != numCombd = " << numCombd << "."
2500  << endl;
2501  std::cerr << os.str ();
2502  }
2503  return;
2504  }
2505  }); // for each import LID i
2507  }
2508 
2509  if (errorDuringUnpack () != 0) {
2510  std::ostream& err = this->markLocalErrorAndGetStream ();
2511  err << prefix << "Unpacking failed.";
2512  if (! verbose) {
2513  err << " Please run again with the environment variable setting "
2514  "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
2515  }
2516  err << endl;
2517  }
2518 
2519  if (verbose) {
2520  std::ostringstream os;
2521  const bool lclSuccess = ! (* (this->localError_));
2522  os << prefix
2523  << (lclSuccess ? "succeeded" : "FAILED")
2524  << std::endl;
2525  std::cerr << os.str ();
2526  }
2527  }
2528 
2529  template<class Scalar, class LO, class GO, class Node>
2530  std::string
2532  {
2533  using Teuchos::TypeNameTraits;
2534  std::ostringstream os;
2535  os << "\"Tpetra::BlockCrsMatrix\": { "
2536  << "Template parameters: { "
2537  << "Scalar: " << TypeNameTraits<Scalar>::name ()
2538  << "LO: " << TypeNameTraits<LO>::name ()
2539  << "GO: " << TypeNameTraits<GO>::name ()
2540  << "Node: " << TypeNameTraits<Node>::name ()
2541  << " }"
2542  << ", Label: \"" << this->getObjectLabel () << "\""
2543  << ", Global dimensions: ["
2544  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2545  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
2546  << ", Block size: " << getBlockSize ()
2547  << " }";
2548  return os.str ();
2549  }
2550 
2551 
2552  template<class Scalar, class LO, class GO, class Node>
2553  void
2555  describe (Teuchos::FancyOStream& out,
2556  const Teuchos::EVerbosityLevel verbLevel) const
2557  {
2558  using Teuchos::ArrayRCP;
2559  using Teuchos::CommRequest;
2560  using Teuchos::FancyOStream;
2561  using Teuchos::getFancyOStream;
2562  using Teuchos::ireceive;
2563  using Teuchos::isend;
2564  using Teuchos::outArg;
2565  using Teuchos::TypeNameTraits;
2566  using Teuchos::VERB_DEFAULT;
2567  using Teuchos::VERB_NONE;
2568  using Teuchos::VERB_LOW;
2569  using Teuchos::VERB_MEDIUM;
2570  // using Teuchos::VERB_HIGH;
2571  using Teuchos::VERB_EXTREME;
2572  using Teuchos::RCP;
2573  using Teuchos::wait;
2574  using std::endl;
2575 
2576  // Set default verbosity if applicable.
2577  const Teuchos::EVerbosityLevel vl =
2578  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
2579 
2580  if (vl == VERB_NONE) {
2581  return; // print nothing
2582  }
2583 
2584  // describe() always starts with a tab before it prints anything.
2585  Teuchos::OSTab tab0 (out);
2586 
2587  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
2588  Teuchos::OSTab tab1 (out);
2589 
2590  out << "Template parameters:" << endl;
2591  {
2592  Teuchos::OSTab tab2 (out);
2593  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
2594  << "LO: " << TypeNameTraits<LO>::name () << endl
2595  << "GO: " << TypeNameTraits<GO>::name () << endl
2596  << "Node: " << TypeNameTraits<Node>::name () << endl;
2597  }
2598  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
2599  << "Global dimensions: ["
2600  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2601  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
2602 
2603  const LO blockSize = getBlockSize ();
2604  out << "Block size: " << blockSize << endl;
2605 
2606  // constituent objects
2607  if (vl >= VERB_MEDIUM) {
2608  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2609  const int myRank = comm.getRank ();
2610  if (myRank == 0) {
2611  out << "Row Map:" << endl;
2612  }
2613  getRowMap()->describe (out, vl);
2614 
2615  if (! getColMap ().is_null ()) {
2616  if (getColMap() == getRowMap()) {
2617  if (myRank == 0) {
2618  out << "Column Map: same as row Map" << endl;
2619  }
2620  }
2621  else {
2622  if (myRank == 0) {
2623  out << "Column Map:" << endl;
2624  }
2625  getColMap ()->describe (out, vl);
2626  }
2627  }
2628  if (! getDomainMap ().is_null ()) {
2629  if (getDomainMap () == getRowMap ()) {
2630  if (myRank == 0) {
2631  out << "Domain Map: same as row Map" << endl;
2632  }
2633  }
2634  else if (getDomainMap () == getColMap ()) {
2635  if (myRank == 0) {
2636  out << "Domain Map: same as column Map" << endl;
2637  }
2638  }
2639  else {
2640  if (myRank == 0) {
2641  out << "Domain Map:" << endl;
2642  }
2643  getDomainMap ()->describe (out, vl);
2644  }
2645  }
2646  if (! getRangeMap ().is_null ()) {
2647  if (getRangeMap () == getDomainMap ()) {
2648  if (myRank == 0) {
2649  out << "Range Map: same as domain Map" << endl;
2650  }
2651  }
2652  else if (getRangeMap () == getRowMap ()) {
2653  if (myRank == 0) {
2654  out << "Range Map: same as row Map" << endl;
2655  }
2656  }
2657  else {
2658  if (myRank == 0) {
2659  out << "Range Map: " << endl;
2660  }
2661  getRangeMap ()->describe (out, vl);
2662  }
2663  }
2664  }
2665 
2666  if (vl >= VERB_EXTREME) {
2667  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2668  const int myRank = comm.getRank ();
2669  const int numProcs = comm.getSize ();
2670 
2671  // Print the calling process' data to the given output stream.
2672  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
2673  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
2674  FancyOStream& os = *osPtr;
2675  os << "Process " << myRank << ":" << endl;
2676  Teuchos::OSTab tab2 (os);
2677 
2678  const map_type& meshRowMap = * (graph_.getRowMap ());
2679  const map_type& meshColMap = * (graph_.getColMap ());
2680  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
2681  meshLclRow <= meshRowMap.getMaxLocalIndex ();
2682  ++meshLclRow) {
2683  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
2684  os << "Row " << meshGblRow << ": {";
2685 
2686  local_inds_host_view_type lclColInds;
2687  values_host_view_type vals;
2688  LO numInds = 0;
2689  this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
2690 
2691  for (LO k = 0; k < numInds; ++k) {
2692  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
2693 
2694  os << "Col " << gblCol << ": [";
2695  for (LO i = 0; i < blockSize; ++i) {
2696  for (LO j = 0; j < blockSize; ++j) {
2697  os << vals[blockSize*blockSize*k + i*blockSize + j];
2698  if (j + 1 < blockSize) {
2699  os << ", ";
2700  }
2701  }
2702  if (i + 1 < blockSize) {
2703  os << "; ";
2704  }
2705  }
2706  os << "]";
2707  if (k + 1 < numInds) {
2708  os << ", ";
2709  }
2710  }
2711  os << "}" << endl;
2712  }
2713 
2714  // Print data on Process 0. This will automatically respect the
2715  // current indentation level.
2716  if (myRank == 0) {
2717  out << lclOutStrPtr->str ();
2718  lclOutStrPtr = Teuchos::null; // clear it to save space
2719  }
2720 
2721  const int sizeTag = 1337;
2722  const int dataTag = 1338;
2723 
2724  ArrayRCP<char> recvDataBuf; // only used on Process 0
2725 
2726  // Send string sizes and data from each process in turn to
2727  // Process 0, and print on that process.
2728  for (int p = 1; p < numProcs; ++p) {
2729  if (myRank == 0) {
2730  // Receive the incoming string's length.
2731  ArrayRCP<size_t> recvSize (1);
2732  recvSize[0] = 0;
2733  RCP<CommRequest<int> > recvSizeReq =
2734  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
2735  wait<int> (comm, outArg (recvSizeReq));
2736  const size_t numCharsToRecv = recvSize[0];
2737 
2738  // Allocate space for the string to receive. Reuse receive
2739  // buffer space if possible. We can do this because in the
2740  // current implementation, we only have one receive in
2741  // flight at a time. Leave space for the '\0' at the end,
2742  // in case the sender doesn't send it.
2743  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
2744  recvDataBuf.resize (numCharsToRecv + 1);
2745  }
2746  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
2747  // Post the receive of the actual string data.
2748  RCP<CommRequest<int> > recvDataReq =
2749  ireceive<int, char> (recvData, p, dataTag, comm);
2750  wait<int> (comm, outArg (recvDataReq));
2751 
2752  // Print the received data. This will respect the current
2753  // indentation level. Make sure that the string is
2754  // null-terminated.
2755  recvDataBuf[numCharsToRecv] = '\0';
2756  out << recvDataBuf.getRawPtr ();
2757  }
2758  else if (myRank == p) { // if I am not Process 0, and my rank is p
2759  // This deep-copies the string at most twice, depending on
2760  // whether std::string reference counts internally (it
2761  // generally does, so this won't deep-copy at all).
2762  const std::string stringToSend = lclOutStrPtr->str ();
2763  lclOutStrPtr = Teuchos::null; // clear original to save space
2764 
2765  // Send the string's length to Process 0.
2766  const size_t numCharsToSend = stringToSend.size ();
2767  ArrayRCP<size_t> sendSize (1);
2768  sendSize[0] = numCharsToSend;
2769  RCP<CommRequest<int> > sendSizeReq =
2770  isend<int, size_t> (sendSize, 0, sizeTag, comm);
2771  wait<int> (comm, outArg (sendSizeReq));
2772 
2773  // Send the actual string to Process 0. We know that the
2774  // string has length > 0, so it's save to take the address
2775  // of the first entry. Make a nonowning ArrayRCP to hold
2776  // the string. Process 0 will add a null termination
2777  // character at the end of the string, after it receives the
2778  // message.
2779  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
2780  RCP<CommRequest<int> > sendDataReq =
2781  isend<int, char> (sendData, 0, dataTag, comm);
2782  wait<int> (comm, outArg (sendDataReq));
2783  }
2784  } // for each process rank p other than 0
2785  } // extreme verbosity level (print the whole matrix)
2786  }
2787 
2788  template<class Scalar, class LO, class GO, class Node>
2789  Teuchos::RCP<const Teuchos::Comm<int> >
2791  getComm() const
2792  {
2793  return graph_.getComm();
2794  }
2795 
2796 
2797  template<class Scalar, class LO, class GO, class Node>
2801  {
2802  return graph_.getGlobalNumCols();
2803  }
2804 
2805 
2806  template<class Scalar, class LO, class GO, class Node>
2807  size_t
2810  {
2811  return graph_.getLocalNumCols();
2812  }
2813 
2814  template<class Scalar, class LO, class GO, class Node>
2815  GO
2818  {
2819  return graph_.getIndexBase();
2820  }
2821 
2822  template<class Scalar, class LO, class GO, class Node>
2826  {
2827  return graph_.getGlobalNumEntries();
2828  }
2829 
2830  template<class Scalar, class LO, class GO, class Node>
2831  size_t
2834  {
2835  return graph_.getLocalNumEntries();
2836  }
2837 
2838  template<class Scalar, class LO, class GO, class Node>
2839  size_t
2841  getNumEntriesInGlobalRow (GO globalRow) const
2842  {
2843  return graph_.getNumEntriesInGlobalRow(globalRow);
2844  }
2845 
2846 
2847  template<class Scalar, class LO, class GO, class Node>
2848  size_t
2851  {
2852  return graph_.getGlobalMaxNumRowEntries();
2853  }
2854 
2855  template<class Scalar, class LO, class GO, class Node>
2856  bool
2858  hasColMap() const
2859  {
2860  return graph_.hasColMap();
2861  }
2862 
2863 
2864  template<class Scalar, class LO, class GO, class Node>
2865  bool
2868  {
2869  return graph_.isLocallyIndexed();
2870  }
2871 
2872  template<class Scalar, class LO, class GO, class Node>
2873  bool
2876  {
2877  return graph_.isGloballyIndexed();
2878  }
2879 
2880  template<class Scalar, class LO, class GO, class Node>
2881  bool
2884  {
2885  return graph_.isFillComplete ();
2886  }
2887 
2888  template<class Scalar, class LO, class GO, class Node>
2889  bool
2892  {
2893  return false;
2894  }
2895 
2896  template<class Scalar, class LO, class GO, class Node>
2897  void
2899  getGlobalRowCopy (GO /*GlobalRow*/,
2900  nonconst_global_inds_host_view_type &/*Indices*/,
2901  nonconst_values_host_view_type &/*Values*/,
2902  size_t& /*NumEntries*/) const
2903  {
2904  TEUCHOS_TEST_FOR_EXCEPTION(
2905  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
2906  "This class doesn't support global matrix indexing.");
2907 
2908  }
2909 
2910  template<class Scalar, class LO, class GO, class Node>
2911  void
2913  getGlobalRowView (GO /* GlobalRow */,
2914  global_inds_host_view_type &/* indices */,
2915  values_host_view_type &/* values */) const
2916  {
2917  TEUCHOS_TEST_FOR_EXCEPTION(
2918  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
2919  "This class doesn't support global matrix indexing.");
2920 
2921  }
2922 
2923  template<class Scalar, class LO, class GO, class Node>
2924  void
2926  getLocalRowView (LO localRowInd,
2927  local_inds_host_view_type &colInds,
2928  values_host_view_type &vals) const
2929  {
2930  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2931  colInds = local_inds_host_view_type();
2932  vals = values_host_view_type();
2933  }
2934  else {
2935  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2936  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2937  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
2938 
2939  vals = getValuesHost (localRowInd);
2940  }
2941  }
2942 
2943  template<class Scalar, class LO, class GO, class Node>
2944  void
2947  local_inds_host_view_type &colInds,
2948  nonconst_values_host_view_type &vals) const
2949  {
2950  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2951  colInds = nonconst_local_inds_host_view_type();
2952  vals = nonconst_values_host_view_type();
2953  }
2954  else {
2955  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2956  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2957  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
2958 
2959  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
2960  vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
2961  }
2962  }
2963 
2964  template<class Scalar, class LO, class GO, class Node>
2965  void
2968  {
2969  const size_t lclNumMeshRows = graph_.getLocalNumRows ();
2970 
2971  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
2972  graph_.getLocalDiagOffsets (diagOffsets);
2973 
2974  // The code below works on host, so use a host View.
2975  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
2976  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
2977  Kokkos::deep_copy (execution_space(), diagOffsetsHost, diagOffsets);
2978 
2979  auto vals_host_out = val_.getHostView(Access::ReadOnly);
2980  const impl_scalar_type* vals_host_out_raw = vals_host_out.data();
2981 
2982  // TODO amk: This is a temporary measure to make the code run with Ifpack2
2983  size_t rowOffset = 0;
2984  size_t offset = 0;
2985  LO bs = getBlockSize();
2986  for(size_t r=0; r<getLocalNumRows(); r++)
2987  {
2988  // move pointer to start of diagonal block
2989  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
2990  for(int b=0; b<bs; b++)
2991  {
2992  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
2993  }
2994  // move pointer to start of next block row
2995  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
2996  }
2997 
2998  //diag.template sync<memory_space> (); // sync vec of diag entries back to dev
2999  }
3000 
3001  template<class Scalar, class LO, class GO, class Node>
3002  void
3004  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3005  {
3006  TEUCHOS_TEST_FOR_EXCEPTION(
3007  true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3008  "not implemented.");
3009 
3010  }
3011 
3012  template<class Scalar, class LO, class GO, class Node>
3013  void
3015  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3016  {
3017  TEUCHOS_TEST_FOR_EXCEPTION(
3018  true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3019  "not implemented.");
3020 
3021  }
3022 
3023  template<class Scalar, class LO, class GO, class Node>
3024  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3026  getGraph() const
3027  {
3028  return graphRCP_;
3029  }
3030 
3031  template<class Scalar, class LO, class GO, class Node>
3032  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3035  {
3036  TEUCHOS_TEST_FOR_EXCEPTION(
3037  true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3038  "not implemented.");
3039  }
3040 
3041 } // namespace Tpetra
3042 
3043 //
3044 // Explicit instantiation macro
3045 //
3046 // Must be expanded from within the Tpetra namespace!
3047 //
3048 #define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3049  template class BlockCrsMatrix< S, LO, GO, NODE >;
3050 
3051 #endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
KokkosSparse::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
std::string description() const override
One-line description of this object.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
LO local_ordinal_type
The type of local indices.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
bool isConstantStride() const
Whether this multivector has constant stride between columns.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
One or more distributed dense vectors.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Linear algebra kernels for small dense matrices and vectors.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
size_t getLocalNumRows() const override
get the local number of block rows
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
give an option to use layoutleft
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
MultiVector for multiple degrees of freedom per mesh point.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, and the small dense vectors in BlockMultiVector and BlockVector.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix&#39;s communicator.
size_t global_size_t
Global size_t object.
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don&#39;t currently exist.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
global_size_t getGlobalNumRows() const override
get the global number of block rows
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
Replace old value with maximum of magnitudes of old and new values.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
Replace existing values with new values.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
void enableWDVTracking()
Enable WrappedDualView reference-count tracking and syncing. Call this after exiting a host-parallel ...
Replace old values with zero.
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
char packet_type
Implementation detail; tells.
local_matrix_device_type getLocalMatrixDevice() const
void disableWDVTracking()
Disable WrappedDualView reference-count tracking and syncing. Call this before entering a host-parall...
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
A distributed dense vector.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row&#39;s entries.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix&#39;s values (val_).
Base class for distributed Tpetra objects that support data redistribution.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row. ...