Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 // clang-format off
11 
12 #ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
13 #define TPETRA_BLOCKCRSMATRIX_DEF_HPP
14 
17 
21 #include "Tpetra_BlockMultiVector.hpp"
22 #include "Tpetra_BlockView.hpp"
23 
24 #include "Teuchos_TimeMonitor.hpp"
25 #ifdef HAVE_TPETRA_DEBUG
26 # include <set>
27 #endif // HAVE_TPETRA_DEBUG
28 
29 #include "KokkosSparse_spmv.hpp"
30 
31 namespace Tpetra {
32 
33 namespace Impl {
34 
35  template<typename T>
36  struct BlockCrsRowStruct {
37  T totalNumEntries, totalNumBytes, maxRowLength;
38  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
39  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
40  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
41  : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
42  };
43 
44  template<typename T>
45  static
46  KOKKOS_INLINE_FUNCTION
47  void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
48  a.totalNumEntries += b.totalNumEntries;
49  a.totalNumBytes += b.totalNumBytes;
50  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
51  }
52 
53  template<typename T, typename ExecSpace>
54  struct BlockCrsReducer {
55  typedef BlockCrsReducer reducer;
56  typedef T value_type;
57  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
58  value_type *value;
59 
60  KOKKOS_INLINE_FUNCTION
61  BlockCrsReducer(value_type &val) : value(&val) {}
62 
63  KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
64  KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const { dst += src; }
65  KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
66  KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
67  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
68  };
69 
70 } // namespace Impl
71 
72 namespace { // (anonymous)
73 
74 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
75 // version that takes two Kokkos::View arguments).
76 template<class Scalar, class LO, class GO, class Node>
77 class GetLocalDiagCopy {
78 public:
79  typedef typename Node::device_type device_type;
80  typedef size_t diag_offset_type;
81  typedef Kokkos::View<const size_t*, device_type,
82  Kokkos::MemoryUnmanaged> diag_offsets_type;
83  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
84  typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
85  typedef typename local_graph_device_type::row_map_type row_offsets_type;
86  typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
87  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
88  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
89 
90  // Constructor
91  GetLocalDiagCopy (const diag_type& diag,
92  const values_type& val,
93  const diag_offsets_type& diagOffsets,
94  const row_offsets_type& ptr,
95  const LO blockSize) :
96  diag_ (diag),
97  diagOffsets_ (diagOffsets),
98  ptr_ (ptr),
99  blockSize_ (blockSize),
100  offsetPerBlock_ (blockSize_*blockSize_),
101  val_(val)
102  {}
103 
104  KOKKOS_INLINE_FUNCTION void
105  operator() (const LO& lclRowInd) const
106  {
107  using Kokkos::ALL;
108 
109  // Get row offset
110  const size_t absOffset = ptr_[lclRowInd];
111 
112  // Get offset relative to start of row
113  const size_t relOffset = diagOffsets_[lclRowInd];
114 
115  // Get the total offset
116  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
117 
118  // Get a view of the block. BCRS currently uses LayoutRight
119  // regardless of the device.
120  typedef Kokkos::View<const IST**, Impl::BlockCrsMatrixLittleBlockArrayLayout,
121  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
122  const_little_block_type;
123  const_little_block_type D_in (val_.data () + pointOffset,
124  blockSize_, blockSize_);
125  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
126  COPY (D_in, D_out);
127  }
128 
129  private:
130  diag_type diag_;
131  diag_offsets_type diagOffsets_;
132  row_offsets_type ptr_;
133  LO blockSize_;
134  LO offsetPerBlock_;
135  values_type val_;
136  };
137 } // namespace (anonymous)
138 
139  template<class Scalar, class LO, class GO, class Node>
140  std::ostream&
141  BlockCrsMatrix<Scalar, LO, GO, Node>::
142  markLocalErrorAndGetStream ()
143  {
144  * (this->localError_) = true;
145  if ((*errs_).is_null ()) {
146  *errs_ = Teuchos::rcp (new std::ostringstream ());
147  }
148  return **errs_;
149  }
150 
151  template<class Scalar, class LO, class GO, class Node>
154  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
155  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
156  blockSize_ (static_cast<LO> (0)),
157  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
158  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
159  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
160  offsetPerBlock_ (0),
161  localError_ (new bool (false)),
162  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
163  {
164  }
165 
166  template<class Scalar, class LO, class GO, class Node>
169  const LO blockSize) :
170  dist_object_type (graph.getMap ()),
171  graph_ (graph),
172  rowMeshMap_ (* (graph.getRowMap ())),
173  blockSize_ (blockSize),
174  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
175  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
176  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
177  offsetPerBlock_ (blockSize * blockSize),
178  localError_ (new bool (false)),
179  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
180  {
181 
183  TEUCHOS_TEST_FOR_EXCEPTION(
184  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
185  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
186  "rows (isSorted() is false). This class assumes sorted rows.");
187 
188  graphRCP_ = Teuchos::rcpFromRef(graph_);
189 
190  // Trick to test whether LO is nonpositive, without a compiler
191  // warning in case LO is unsigned (which is generally a bad idea
192  // anyway). I don't promise that the trick works, but it
193  // generally does with gcc at least, in my experience.
194  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
195  TEUCHOS_TEST_FOR_EXCEPTION(
196  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
197  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
198  " <= 0. The block size must be positive.");
199 
200  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
201  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
202 
203  {
204  // These are rcp
205  const auto domainPointMap = getDomainMap();
206  const auto colPointMap = Teuchos::rcp
207  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
208  *pointImporter_ = Teuchos::rcp
209  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
210  }
211  {
212  auto local_graph_h = graph.getLocalGraphHost ();
213  auto ptr_h = local_graph_h.row_map;
214  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
215  Kokkos::deep_copy(ptrHost_, ptr_h);
216 
217  auto ind_h = local_graph_h.entries;
218  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
219  // DEEP_COPY REVIEW - HOST-TO-HOST
220  Kokkos::deep_copy (indHost_, ind_h);
221 
222  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
223  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
224  }
225  }
226 
227  template<class Scalar, class LO, class GO, class Node>
230  const typename local_matrix_device_type::values_type& values,
231  const LO blockSize) :
232  dist_object_type (graph.getMap ()),
233  graph_ (graph),
234  rowMeshMap_ (* (graph.getRowMap ())),
235  blockSize_ (blockSize),
236  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
237  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
238  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
239  offsetPerBlock_ (blockSize * blockSize),
240  localError_ (new bool (false)),
241  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
242  {
244  TEUCHOS_TEST_FOR_EXCEPTION(
245  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
246  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
247  "rows (isSorted() is false). This class assumes sorted rows.");
248 
249  graphRCP_ = Teuchos::rcpFromRef(graph_);
250 
251  // Trick to test whether LO is nonpositive, without a compiler
252  // warning in case LO is unsigned (which is generally a bad idea
253  // anyway). I don't promise that the trick works, but it
254  // generally does with gcc at least, in my experience.
255  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
256  TEUCHOS_TEST_FOR_EXCEPTION(
257  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
258  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
259  " <= 0. The block size must be positive.");
260 
261  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
262  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
263 
264  {
265  // These are rcp
266  const auto domainPointMap = getDomainMap();
267  const auto colPointMap = Teuchos::rcp
268  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
269  *pointImporter_ = Teuchos::rcp
270  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
271  }
272  {
273  auto local_graph_h = graph.getLocalGraphHost ();
274  auto ptr_h = local_graph_h.row_map;
275  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
276  Kokkos::deep_copy(ptrHost_, ptr_h);
277 
278  auto ind_h = local_graph_h.entries;
279  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
280  Kokkos::deep_copy (indHost_, ind_h);
281 
282  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
283  TEUCHOS_ASSERT_EQUALITY(numValEnt, values.size());
284  val_ = decltype (val_) (values);
285  }
286  }
287 
288  template<class Scalar, class LO, class GO, class Node>
291  const map_type& domainPointMap,
292  const map_type& rangePointMap,
293  const LO blockSize) :
294  dist_object_type (graph.getMap ()),
295  graph_ (graph),
296  rowMeshMap_ (* (graph.getRowMap ())),
297  domainPointMap_ (domainPointMap),
298  rangePointMap_ (rangePointMap),
299  blockSize_ (blockSize),
300  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
301  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
302  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
303  offsetPerBlock_ (blockSize * blockSize),
304  localError_ (new bool (false)),
305  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
306  {
307  TEUCHOS_TEST_FOR_EXCEPTION(
308  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
309  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
310  "rows (isSorted() is false). This class assumes sorted rows.");
311 
312  graphRCP_ = Teuchos::rcpFromRef(graph_);
313 
314  // Trick to test whether LO is nonpositive, without a compiler
315  // warning in case LO is unsigned (which is generally a bad idea
316  // anyway). I don't promise that the trick works, but it
317  // generally does with gcc at least, in my experience.
318  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
319  TEUCHOS_TEST_FOR_EXCEPTION(
320  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
321  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
322  " <= 0. The block size must be positive.");
323  {
324  // These are rcp
325  const auto rcpDomainPointMap = getDomainMap();
326  const auto colPointMap = Teuchos::rcp
327  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
328  *pointImporter_ = Teuchos::rcp
329  (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
330  }
331  {
332  auto local_graph_h = graph.getLocalGraphHost ();
333  auto ptr_h = local_graph_h.row_map;
334  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
335  // DEEP_COPY REVIEW - HOST-TO-HOST
336  Kokkos::deep_copy(ptrHost_, ptr_h);
337 
338  auto ind_h = local_graph_h.entries;
339  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
340  // DEEP_COPY REVIEW - HOST-TO-HOST
341  Kokkos::deep_copy (indHost_, ind_h);
342 
343  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
344  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
345 
346  }
347  }
348 
349  template<class Scalar, class LO, class GO, class Node>
350  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
353  { // Copy constructor of map_type does a shallow copy.
354  // We're only returning by RCP for backwards compatibility.
355  return Teuchos::rcp (new map_type (domainPointMap_));
356  }
357 
358  template<class Scalar, class LO, class GO, class Node>
359  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
361  getRangeMap () const
362  { // Copy constructor of map_type does a shallow copy.
363  // We're only returning by RCP for backwards compatibility.
364  return Teuchos::rcp (new map_type (rangePointMap_));
365  }
366 
367  template<class Scalar, class LO, class GO, class Node>
368  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
370  getRowMap () const
371  {
372  return graph_.getRowMap();
373  }
374 
375  template<class Scalar, class LO, class GO, class Node>
376  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
378  getColMap () const
379  {
380  return graph_.getColMap();
381  }
382 
383  template<class Scalar, class LO, class GO, class Node>
387  {
388  return graph_.getGlobalNumRows();
389  }
390 
391  template<class Scalar, class LO, class GO, class Node>
392  size_t
395  {
396  return graph_.getLocalNumRows();
397  }
398 
399  template<class Scalar, class LO, class GO, class Node>
400  size_t
403  {
404  return graph_.getLocalMaxNumRowEntries();
405  }
406 
407  template<class Scalar, class LO, class GO, class Node>
408  void
410  apply (const mv_type& X,
411  mv_type& Y,
412  Teuchos::ETransp mode,
413  Scalar alpha,
414  Scalar beta) const
415  {
416  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
417  TEUCHOS_TEST_FOR_EXCEPTION(
418  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
419  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
420  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
421  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
422 
423  TEUCHOS_TEST_FOR_EXCEPTION(
424  !X.isConstantStride() || !Y.isConstantStride(),
425  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
426  "X and Y must both be constant stride");
427 
428  BMV X_view;
429  BMV Y_view;
430  const LO blockSize = getBlockSize ();
431  try {
432  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
433  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
434  }
435  catch (std::invalid_argument& e) {
436  TEUCHOS_TEST_FOR_EXCEPTION(
437  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
438  "apply: Either the input MultiVector X or the output MultiVector Y "
439  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
440  "graph. BlockMultiVector's constructor threw the following exception: "
441  << e.what ());
442  }
443 
444  try {
445  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
446  // either that or mark fields of this class as 'mutable'. The
447  // problem is that applyBlock wants to do lazy initialization of
448  // temporary block multivectors.
449  const_cast<this_BCRS_type&> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
450  } catch (std::invalid_argument& e) {
451  TEUCHOS_TEST_FOR_EXCEPTION(
452  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
453  "apply: The implementation method applyBlock complained about having "
454  "an invalid argument. It reported the following: " << e.what ());
455  } catch (std::logic_error& e) {
456  TEUCHOS_TEST_FOR_EXCEPTION(
457  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
458  "apply: The implementation method applyBlock complained about a "
459  "possible bug in its implementation. It reported the following: "
460  << e.what ());
461  } catch (std::exception& e) {
462  TEUCHOS_TEST_FOR_EXCEPTION(
463  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
464  "apply: The implementation method applyBlock threw an exception which "
465  "is neither std::invalid_argument nor std::logic_error, but is a "
466  "subclass of std::exception. It reported the following: "
467  << e.what ());
468  } catch (...) {
469  TEUCHOS_TEST_FOR_EXCEPTION(
470  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
471  "apply: The implementation method applyBlock threw an exception which "
472  "is not an instance of a subclass of std::exception. This probably "
473  "indicates a bug. Please report this to the Tpetra developers.");
474  }
475  }
476 
477  template<class Scalar, class LO, class GO, class Node>
478  void
482  Teuchos::ETransp mode,
483  const Scalar alpha,
484  const Scalar beta)
485  {
486  TEUCHOS_TEST_FOR_EXCEPTION(
487  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
488  "Tpetra::BlockCrsMatrix::applyBlock: "
489  "X and Y have different block sizes. X.getBlockSize() = "
490  << X.getBlockSize () << " != Y.getBlockSize() = "
491  << Y.getBlockSize () << ".");
492 
493  if (mode == Teuchos::NO_TRANS) {
494  applyBlockNoTrans (X, Y, alpha, beta);
495  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
496  applyBlockTrans (X, Y, mode, alpha, beta);
497  } else {
498  TEUCHOS_TEST_FOR_EXCEPTION(
499  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
500  "applyBlock: Invalid 'mode' argument. Valid values are "
501  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
502  }
503  }
504 
505  template<class Scalar, class LO, class GO, class Node>
506  void
509  const Import<LO, GO, Node>& importer) const
510  {
511  using Teuchos::RCP;
512  using Teuchos::rcp;
513  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
514 
515  // Right now, we make many assumptions...
516  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
517  "destMatrix is required to be null.");
518 
519  // BlockCrsMatrix requires a complete graph at construction.
520  // So first step is to import and fill complete the destGraph.
521  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
522  RCP<crs_graph_type> destGraph = importAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, importer,
523  srcGraph->getDomainMap(),
524  srcGraph->getRangeMap());
525 
526 
527  // Final step, create and import the destMatrix.
528  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
529  destMatrix->doImport(*this, importer, Tpetra::INSERT);
530  }
531 
532 
533  template<class Scalar, class LO, class GO, class Node>
534  void
537  const Export<LO, GO, Node>& exporter) const
538  {
539  using Teuchos::RCP;
540  using Teuchos::rcp;
541  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
542 
543  // Right now, we make many assumptions...
544  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
545  "destMatrix is required to be null.");
546 
547  // BlockCrsMatrix requires a complete graph at construction.
548  // So first step is to import and fill complete the destGraph.
549  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
550  RCP<crs_graph_type> destGraph = exportAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, exporter,
551  srcGraph->getDomainMap(),
552  srcGraph->getRangeMap());
553 
554 
555  // Final step, create and import the destMatrix.
556  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
557  destMatrix->doExport(*this, exporter, Tpetra::INSERT);
558  }
559 
560 
561  template<class Scalar, class LO, class GO, class Node>
562  void
564  setAllToScalar (const Scalar& alpha)
565  {
566  auto val_d = val_.getDeviceView(Access::OverwriteAll);
567  Kokkos::deep_copy(val_d, alpha);
568  }
569 
570  template<class Scalar, class LO, class GO, class Node>
571  LO
573  replaceLocalValues (const LO localRowInd,
574  const LO colInds[],
575  const Scalar vals[],
576  const LO numColInds) const
577  {
578  std::vector<ptrdiff_t> offsets(numColInds);
579  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
580  const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
581  return validCount;
582  }
583 
584  template <class Scalar, class LO, class GO, class Node>
585  void
587  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
588  Kokkos::MemoryUnmanaged>& offsets) const
589  {
590  graph_.getLocalDiagOffsets (offsets);
591  }
592 
593  template <class Scalar, class LO, class GO, class Node>
594  void
596  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
597  Kokkos::MemoryUnmanaged>& diag,
598  const Kokkos::View<const size_t*, device_type,
599  Kokkos::MemoryUnmanaged>& offsets) const
600  {
601  using Kokkos::parallel_for;
602  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
603 
604  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getLocalNumElements ());
605  const LO blockSize = this->getBlockSize ();
606  TEUCHOS_TEST_FOR_EXCEPTION
607  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
608  static_cast<LO> (diag.extent (1)) < blockSize ||
609  static_cast<LO> (diag.extent (2)) < blockSize,
610  std::invalid_argument, prefix <<
611  "The input Kokkos::View is not big enough to hold all the data.");
612  TEUCHOS_TEST_FOR_EXCEPTION
613  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
614  prefix << "offsets.size() = " << offsets.size () << " < local number of "
615  "diagonal blocks " << lclNumMeshRows << ".");
616 
617  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
618  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
619 
620  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
621  // we reserve the right to do lazy allocation of device data. (We
622  // don't plan to do lazy allocation for host data; the host
623  // version of the data always exists.)
624  auto val_d = val_.getDeviceView(Access::ReadOnly);
625  parallel_for (policy_type (0, lclNumMeshRows),
626  functor_type (diag, val_d, offsets,
627  graph_.getLocalGraphDevice ().row_map, blockSize_));
628  }
629 
630  template<class Scalar, class LO, class GO, class Node>
631  LO
633  absMaxLocalValues (const LO localRowInd,
634  const LO colInds[],
635  const Scalar vals[],
636  const LO numColInds) const
637  {
638  std::vector<ptrdiff_t> offsets(numColInds);
639  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
640  const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
641  return validCount;
642  }
643 
644 
645  template<class Scalar, class LO, class GO, class Node>
646  LO
648  sumIntoLocalValues (const LO localRowInd,
649  const LO colInds[],
650  const Scalar vals[],
651  const LO numColInds) const
652  {
653  std::vector<ptrdiff_t> offsets(numColInds);
654  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
655  const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
656  return validCount;
657  }
658  template<class Scalar, class LO, class GO, class Node>
659  void
661  getLocalRowCopy (LO LocalRow,
662  nonconst_local_inds_host_view_type &Indices,
663  nonconst_values_host_view_type &Values,
664  size_t& NumEntries) const
665  {
666  auto vals = getValuesHost(LocalRow);
667  const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
668  if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
669  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
670  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
671  << numInds << " row entries");
672  }
673  const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
674  for (LO i=0; i<numInds; ++i) {
675  Indices[i] = colInds[i];
676  }
677  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
678  Values[i] = vals[i];
679  }
680  NumEntries = numInds;
681  }
682 
683  template<class Scalar, class LO, class GO, class Node>
684  LO
686  getLocalRowOffsets (const LO localRowInd,
687  ptrdiff_t offsets[],
688  const LO colInds[],
689  const LO numColInds) const
690  {
691  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
692  // We got no offsets, because the input local row index is
693  // invalid on the calling process. That may not be an error, if
694  // numColInds is zero anyway; it doesn't matter. This is the
695  // advantage of returning the number of valid indices.
696  return static_cast<LO> (0);
697  }
698 
699  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
700  LO hint = 0; // Guess for the relative offset into the current row
701  LO validCount = 0; // number of valid column indices in colInds
702 
703  for (LO k = 0; k < numColInds; ++k) {
704  const LO relBlockOffset =
705  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
706  if (relBlockOffset != LINV) {
707  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
708  hint = relBlockOffset + 1;
709  ++validCount;
710  }
711  }
712  return validCount;
713  }
714 
715 
716  template<class Scalar, class LO, class GO, class Node>
717  LO
719  replaceLocalValuesByOffsets (const LO localRowInd,
720  const ptrdiff_t offsets[],
721  const Scalar vals[],
722  const LO numOffsets) const
723  {
724  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
725  // We modified no values, because the input local row index is
726  // invalid on the calling process. That may not be an error, if
727  // numColInds is zero anyway; it doesn't matter. This is the
728  // advantage of returning the number of valid indices.
729  return static_cast<LO> (0);
730  }
731  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
732  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
733  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
734  impl_scalar_type* vOut = val_out.data();
735 
736  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
737  const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
738  size_t pointOffset = 0; // Current offset into input values
739  LO validCount = 0; // number of valid offsets
740 
741  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
742  const size_t blockOffset = offsets[k]*perBlockSize;
743  if (offsets[k] != STINV) {
744  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
745  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
746  COPY (A_new, A_old);
747  ++validCount;
748  }
749  }
750  return validCount;
751  }
752 
753 
754  template<class Scalar, class LO, class GO, class Node>
755  LO
757  absMaxLocalValuesByOffsets (const LO localRowInd,
758  const ptrdiff_t offsets[],
759  const Scalar vals[],
760  const LO numOffsets) const
761  {
762  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
763  // We modified no values, because the input local row index is
764  // invalid on the calling process. That may not be an error, if
765  // numColInds is zero anyway; it doesn't matter. This is the
766  // advantage of returning the number of valid indices.
767  return static_cast<LO> (0);
768  }
769  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
770  auto val_out = getValuesHost(localRowInd);
771  impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
772 
773  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
774  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
775  size_t pointOffset = 0; // Current offset into input values
776  LO validCount = 0; // number of valid offsets
777 
778  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
779  const size_t blockOffset = offsets[k]*perBlockSize;
780  if (blockOffset != STINV) {
781  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
782  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
783  ::Tpetra::Impl::absMax (A_old, A_new);
784  ++validCount;
785  }
786  }
787  return validCount;
788  }
789 
790 
791  template<class Scalar, class LO, class GO, class Node>
792  LO
794  sumIntoLocalValuesByOffsets (const LO localRowInd,
795  const ptrdiff_t offsets[],
796  const Scalar vals[],
797  const LO numOffsets) const
798  {
799  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
800  // We modified no values, because the input local row index is
801  // invalid on the calling process. That may not be an error, if
802  // numColInds is zero anyway; it doesn't matter. This is the
803  // advantage of returning the number of valid indices.
804  return static_cast<LO> (0);
805  }
806  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
807  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
808  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
809  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
810  impl_scalar_type* vOut = val_out.data();
811 
812  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
813  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
814  size_t pointOffset = 0; // Current offset into input values
815  LO validCount = 0; // number of valid offsets
816 
817  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
818  const size_t blockOffset = offsets[k]*perBlockSize;
819  if (blockOffset != STINV) {
820  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
821  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
822  AXPY (ONE, A_new, A_old);
823  ++validCount;
824  }
825  }
826  return validCount;
827  }
828 
829  template<class Scalar, class LO, class GO, class Node>
831  impl_scalar_type_dualview::t_host::const_type
833  getValuesHost () const
834  {
835  return val_.getHostView(Access::ReadOnly);
836  }
837 
838  template<class Scalar, class LO, class GO, class Node>
839  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
840  impl_scalar_type_dualview::t_dev::const_type
841  BlockCrsMatrix<Scalar, LO, GO, Node>::
842  getValuesDevice () const
843  {
844  return val_.getDeviceView(Access::ReadOnly);
845  }
846 
847  template<class Scalar, class LO, class GO, class Node>
848  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
849  impl_scalar_type_dualview::t_host
852  {
853  return val_.getHostView(Access::ReadWrite);
854  }
855 
856  template<class Scalar, class LO, class GO, class Node>
858  impl_scalar_type_dualview::t_dev
860  getValuesDeviceNonConst () const
861  {
862  return val_.getDeviceView(Access::ReadWrite);
863  }
864 
865  template<class Scalar, class LO, class GO, class Node>
866  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
867  impl_scalar_type_dualview::t_host::const_type
868  BlockCrsMatrix<Scalar, LO, GO, Node>::
869  getValuesHost (const LO& lclRow) const
870  {
871  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
872  auto val = val_.getHostView(Access::ReadOnly);
873  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
874  return r_val;
875  }
876 
877  template<class Scalar, class LO, class GO, class Node>
879  impl_scalar_type_dualview::t_dev::const_type
881  getValuesDevice (const LO& lclRow) const
882  {
883  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
884  auto val = val_.getDeviceView(Access::ReadOnly);
885  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
886  return r_val;
887  }
888 
889  template<class Scalar, class LO, class GO, class Node>
892  getValuesHostNonConst (const LO& lclRow)
893  {
894  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
895  auto val = val_.getHostView(Access::ReadWrite);
896  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
897  return r_val;
898  }
899 
900  template<class Scalar, class LO, class GO, class Node>
903  getValuesDeviceNonConst (const LO& lclRow)
904  {
905  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
906  auto val = val_.getDeviceView(Access::ReadWrite);
907  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
908  return r_val;
909  }
910 
911  template<class Scalar, class LO, class GO, class Node>
912  size_t
914  getNumEntriesInLocalRow (const LO localRowInd) const
915  {
916  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
917  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
918  return static_cast<LO> (0); // the calling process doesn't have that row
919  } else {
920  return static_cast<LO> (numEntInGraph);
921  }
922  }
923 
924  template<class Scalar, class LO, class GO, class Node>
925  typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
928  {
929  auto numCols = this->graph_.getColMap()->getLocalNumElements();
930  auto val = val_.getDeviceView(Access::ReadWrite);
931  const LO blockSize = this->getBlockSize ();
932  const auto graph = this->graph_.getLocalGraphDevice ();
933 
934  return local_matrix_device_type("Tpetra::BlockCrsMatrix::lclMatrixDevice",
935  numCols,
936  val,
937  graph,
938  blockSize);
939  }
940 
941  template<class Scalar, class LO, class GO, class Node>
942  void
946  const Teuchos::ETransp mode,
947  const Scalar alpha,
948  const Scalar beta)
949  {
950  (void) X;
951  (void) Y;
952  (void) mode;
953  (void) alpha;
954  (void) beta;
955 
956  TEUCHOS_TEST_FOR_EXCEPTION(
957  true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
958  "transpose and conjugate transpose modes are not yet implemented.");
959  }
960 
961  template<class Scalar, class LO, class GO, class Node>
962  void
963  BlockCrsMatrix<Scalar, LO, GO, Node>::
964  applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
965  BlockMultiVector<Scalar, LO, GO, Node>& Y,
966  const Scalar alpha,
967  const Scalar beta)
968  {
969  using Teuchos::RCP;
970  using Teuchos::rcp;
971  typedef ::Tpetra::Import<LO, GO, Node> import_type;
972  typedef ::Tpetra::Export<LO, GO, Node> export_type;
973  const Scalar zero = STS::zero ();
974  const Scalar one = STS::one ();
975  RCP<const import_type> import = graph_.getImporter ();
976  // "export" is a reserved C++ keyword, so we can't use it.
977  RCP<const export_type> theExport = graph_.getExporter ();
978  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
979 
980  if (alpha == zero) {
981  if (beta == zero) {
982  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
983  }
984  else if (beta != one) {
985  Y.scale (beta);
986  }
987  }
988  else { // alpha != 0
989  const BMV* X_colMap = NULL;
990  if (import.is_null ()) {
991  try {
992  X_colMap = &X;
993  }
994  catch (std::exception& e) {
995  TEUCHOS_TEST_FOR_EXCEPTION
996  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
997  "operator= threw an exception: " << std::endl << e.what ());
998  }
999  }
1000  else {
1001  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1002  // Y_rowMap_ below. This lets us do lazy initialization
1003  // correctly with view semantics of BlockCrsMatrix. All views
1004  // of this BlockCrsMatrix have the same outer pointer. That
1005  // way, we can set the inner pointer in one view, and all
1006  // other views will see it.
1007  if ((*X_colMap_).is_null () ||
1008  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1009  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1010  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1011  static_cast<LO> (X.getNumVectors ())));
1012  }
1013  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1014  **pointImporter_,
1016  try {
1017  X_colMap = &(**X_colMap_);
1018  }
1019  catch (std::exception& e) {
1020  TEUCHOS_TEST_FOR_EXCEPTION
1021  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1022  "operator= threw an exception: " << std::endl << e.what ());
1023  }
1024  }
1025 
1026  BMV* Y_rowMap = NULL;
1027  if (theExport.is_null ()) {
1028  Y_rowMap = &Y;
1029  }
1030  else if ((*Y_rowMap_).is_null () ||
1031  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1032  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1033  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1034  static_cast<LO> (X.getNumVectors ())));
1035  try {
1036  Y_rowMap = &(**Y_rowMap_);
1037  }
1038  catch (std::exception& e) {
1039  TEUCHOS_TEST_FOR_EXCEPTION(
1040  true, std::logic_error, prefix << "Tpetra::MultiVector::"
1041  "operator= threw an exception: " << std::endl << e.what ());
1042  }
1043  }
1044 
1045  try {
1046  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1047  }
1048  catch (std::exception& e) {
1049  TEUCHOS_TEST_FOR_EXCEPTION
1050  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1051  "an exception: " << e.what ());
1052  }
1053  catch (...) {
1054  TEUCHOS_TEST_FOR_EXCEPTION
1055  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1056  "an exception not a subclass of std::exception.");
1057  }
1058 
1059  if (! theExport.is_null ()) {
1060  Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1061  }
1062  }
1063  }
1064 
1065 // clang-format on
1066 template <class Scalar, class LO, class GO, class Node>
1067 void BlockCrsMatrix<Scalar, LO, GO, Node>::localApplyBlockNoTrans(
1068  const BlockMultiVector<Scalar, LO, GO, Node> &X,
1069  BlockMultiVector<Scalar, LO, GO, Node> &Y, const Scalar alpha,
1070  const Scalar beta) {
1071  ::Tpetra::Details::ProfilingRegion profile_region(
1072  "Tpetra::BlockCrsMatrix::localApplyBlockNoTrans");
1073  const impl_scalar_type alpha_impl = alpha;
1074  const auto graph = this->graph_.getLocalGraphDevice();
1075 
1076  mv_type X_mv = X.getMultiVectorView();
1077  mv_type Y_mv = Y.getMultiVectorView();
1078  auto X_lcl = X_mv.getLocalViewDevice(Access::ReadOnly);
1079  auto Y_lcl = Y_mv.getLocalViewDevice(Access::ReadWrite);
1080 
1081 #if KOKKOSKERNELS_VERSION >= 40299
1082  auto A_lcl = getLocalMatrixDevice();
1083  if(!applyHelper.get()) {
1084  // The apply helper does not exist, so create it
1085  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map);
1086  }
1087  if(applyHelper->shouldUseIntRowptrs())
1088  {
1089  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
1090  KokkosSparse::spmv(
1091  &applyHelper->handle_int, KokkosSparse::NoTranspose,
1092  alpha_impl, A_lcl_int_rowptrs, X_lcl, beta, Y_lcl);
1093  }
1094  else
1095  {
1096  KokkosSparse::spmv(
1097  &applyHelper->handle, KokkosSparse::NoTranspose,
1098  alpha_impl, A_lcl, X_lcl, beta, Y_lcl);
1099  }
1100 #else
1101  auto A_lcl = getLocalMatrixDevice();
1102  KokkosSparse::spmv(KokkosSparse::NoTranspose, alpha_impl, A_lcl, X_lcl, beta,
1103  Y_lcl);
1104 #endif
1105 }
1106 // clang-format off
1107 
1108  template<class Scalar, class LO, class GO, class Node>
1109  LO
1110  BlockCrsMatrix<Scalar, LO, GO, Node>::
1111  findRelOffsetOfColumnIndex (const LO localRowIndex,
1112  const LO colIndexToFind,
1113  const LO hint) const
1114  {
1115  const size_t absStartOffset = ptrHost_[localRowIndex];
1116  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1117  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1118  // Amortize pointer arithmetic over the search loop.
1119  const LO* const curInd = indHost_.data () + absStartOffset;
1120 
1121  // If the hint was correct, then the hint is the offset to return.
1122  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1123  // Always return the offset relative to the current row.
1124  return hint;
1125  }
1126 
1127  // The hint was wrong, so we must search for the given column
1128  // index in the column indices for the given row.
1129  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1130 
1131  // We require that the graph have sorted rows. However, binary
1132  // search only pays if the current row is longer than a certain
1133  // amount. We set this to 32, but you might want to tune this.
1134  const LO maxNumEntriesForLinearSearch = 32;
1135  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1136  // Use binary search. It would probably be better for us to
1137  // roll this loop by hand. If we wrote it right, a smart
1138  // compiler could perhaps use conditional loads and avoid
1139  // branches (according to Jed Brown on May 2014).
1140  const LO* beg = curInd;
1141  const LO* end = curInd + numEntriesInRow;
1142  std::pair<const LO*, const LO*> p =
1143  std::equal_range (beg, end, colIndexToFind);
1144  if (p.first != p.second) {
1145  // offset is relative to the current row
1146  relOffset = static_cast<LO> (p.first - beg);
1147  }
1148  }
1149  else { // use linear search
1150  for (LO k = 0; k < numEntriesInRow; ++k) {
1151  if (colIndexToFind == curInd[k]) {
1152  relOffset = k; // offset is relative to the current row
1153  break;
1154  }
1155  }
1156  }
1157 
1158  return relOffset;
1159  }
1160 
1161  template<class Scalar, class LO, class GO, class Node>
1162  LO
1163  BlockCrsMatrix<Scalar, LO, GO, Node>::
1164  offsetPerBlock () const
1165  {
1166  return offsetPerBlock_;
1167  }
1168 
1169  template<class Scalar, class LO, class GO, class Node>
1170  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1171  BlockCrsMatrix<Scalar, LO, GO, Node>::
1172  getConstLocalBlockFromInput (const impl_scalar_type* val,
1173  const size_t pointOffset) const
1174  {
1175  // Row major blocks
1176  const LO rowStride = blockSize_;
1177  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1178  }
1179 
1180  template<class Scalar, class LO, class GO, class Node>
1181  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1182  BlockCrsMatrix<Scalar, LO, GO, Node>::
1183  getNonConstLocalBlockFromInput (impl_scalar_type* val,
1184  const size_t pointOffset) const
1185  {
1186  // Row major blocks
1187  const LO rowStride = blockSize_;
1188  return little_block_type (val + pointOffset, blockSize_, rowStride);
1189  }
1190 
1191  template<class Scalar, class LO, class GO, class Node>
1192  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1193  BlockCrsMatrix<Scalar, LO, GO, Node>::
1194  getNonConstLocalBlockFromInputHost (impl_scalar_type* val,
1195  const size_t pointOffset) const
1196  {
1197  // Row major blocks
1198  const LO rowStride = blockSize_;
1199  const size_t bs2 = blockSize_ * blockSize_;
1200  return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
1201  }
1202 
1203  template<class Scalar, class LO, class GO, class Node>
1204  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1205  BlockCrsMatrix<Scalar, LO, GO, Node>::
1206  getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const
1207  {
1208  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1209 
1210  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1211  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1212  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1213  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1214  auto vals = const_cast<this_BCRS_type&>(*this).getValuesDeviceNonConst();
1215  auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1216  return r_val;
1217  }
1218  else {
1219  return little_block_type ();
1220  }
1221  }
1222 
1223  template<class Scalar, class LO, class GO, class Node>
1224  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1225  BlockCrsMatrix<Scalar, LO, GO, Node>::
1226  getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const
1227  {
1228  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1229 
1230  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1231  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1232  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1233  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1234  auto vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst();
1235  auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
1236  return r_val;
1237  }
1238  else {
1239  return little_block_host_type ();
1240  }
1241  }
1242 
1243 
1244  template<class Scalar, class LO, class GO, class Node>
1245  bool
1246  BlockCrsMatrix<Scalar, LO, GO, Node>::
1247  checkSizes (const ::Tpetra::SrcDistObject& source)
1248  {
1249  using std::endl;
1250  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1251  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1252 
1253  if (src == NULL) {
1254  std::ostream& err = markLocalErrorAndGetStream ();
1255  err << "checkSizes: The source object of the Import or Export "
1256  "must be a BlockCrsMatrix with the same template parameters as the "
1257  "target object." << endl;
1258  }
1259  else {
1260  // Use a string of ifs, not if-elseifs, because we want to know
1261  // all the errors.
1262  if (src->getBlockSize () != this->getBlockSize ()) {
1263  std::ostream& err = markLocalErrorAndGetStream ();
1264  err << "checkSizes: The source and target objects of the Import or "
1265  << "Export must have the same block sizes. The source's block "
1266  << "size = " << src->getBlockSize () << " != the target's block "
1267  << "size = " << this->getBlockSize () << "." << endl;
1268  }
1269  if (! src->graph_.isFillComplete ()) {
1270  std::ostream& err = markLocalErrorAndGetStream ();
1271  err << "checkSizes: The source object of the Import or Export is "
1272  "not fill complete. Both source and target objects must be fill "
1273  "complete." << endl;
1274  }
1275  if (! this->graph_.isFillComplete ()) {
1276  std::ostream& err = markLocalErrorAndGetStream ();
1277  err << "checkSizes: The target object of the Import or Export is "
1278  "not fill complete. Both source and target objects must be fill "
1279  "complete." << endl;
1280  }
1281  if (src->graph_.getColMap ().is_null ()) {
1282  std::ostream& err = markLocalErrorAndGetStream ();
1283  err << "checkSizes: The source object of the Import or Export does "
1284  "not have a column Map. Both source and target objects must have "
1285  "column Maps." << endl;
1286  }
1287  if (this->graph_.getColMap ().is_null ()) {
1288  std::ostream& err = markLocalErrorAndGetStream ();
1289  err << "checkSizes: The target object of the Import or Export does "
1290  "not have a column Map. Both source and target objects must have "
1291  "column Maps." << endl;
1292  }
1293  }
1294 
1295  return ! (* (this->localError_));
1296  }
1297 
1298  template<class Scalar, class LO, class GO, class Node>
1299  void
1302  (const ::Tpetra::SrcDistObject& source,
1303  const size_t numSameIDs,
1304  const Kokkos::DualView<const local_ordinal_type*,
1305  buffer_device_type>& permuteToLIDs,
1306  const Kokkos::DualView<const local_ordinal_type*,
1307  buffer_device_type>& permuteFromLIDs,
1308  const CombineMode /*CM*/)
1309  {
1310  using ::Tpetra::Details::Behavior;
1312  using ::Tpetra::Details::ProfilingRegion;
1313  using std::endl;
1314  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1315 
1316  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
1317  const bool debug = Behavior::debug();
1318  const bool verbose = Behavior::verbose();
1319 
1320  // Define this function prefix
1321  std::string prefix;
1322  {
1323  std::ostringstream os;
1324  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1325  os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
1326  prefix = os.str();
1327  }
1328 
1329  // check if this already includes a local error
1330  if (* (this->localError_)) {
1331  std::ostream& err = this->markLocalErrorAndGetStream ();
1332  err << prefix
1333  << "The target object of the Import or Export is already in an error state."
1334  << endl;
1335  return;
1336  }
1337 
1338  //
1339  // Verbose input dual view status
1340  //
1341  if (verbose) {
1342  std::ostringstream os;
1343  os << prefix << endl
1344  << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
1345  << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
1346  std::cerr << os.str ();
1347  }
1348 
1352  if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
1353  std::ostream& err = this->markLocalErrorAndGetStream ();
1354  err << prefix
1355  << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
1356  << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1357  << "." << endl;
1358  return;
1359  }
1360  if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
1361  std::ostream& err = this->markLocalErrorAndGetStream ();
1362  err << prefix
1363  << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1364  << endl;
1365  return;
1366  }
1367 
1368  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1369  if (src == NULL) {
1370  std::ostream& err = this->markLocalErrorAndGetStream ();
1371  err << prefix
1372  << "The source (input) object of the Import or "
1373  "Export is either not a BlockCrsMatrix, or does not have the right "
1374  "template parameters. checkSizes() should have caught this. "
1375  "Please report this bug to the Tpetra developers." << endl;
1376  return;
1377  }
1378 
1379  bool lclErr = false;
1380 #ifdef HAVE_TPETRA_DEBUG
1381  std::set<LO> invalidSrcCopyRows;
1382  std::set<LO> invalidDstCopyRows;
1383  std::set<LO> invalidDstCopyCols;
1384  std::set<LO> invalidDstPermuteCols;
1385  std::set<LO> invalidPermuteFromRows;
1386 #endif // HAVE_TPETRA_DEBUG
1387 
1388  // Copy the initial sequence of rows that are the same.
1389  //
1390  // The two graphs might have different column Maps, so we need to
1391  // do this using global column indices. This is purely local, so
1392  // we only need to check for local sameness of the two column
1393  // Maps.
1394 
1395 #ifdef HAVE_TPETRA_DEBUG
1396  const map_type& srcRowMap = * (src->graph_.getRowMap ());
1397 #endif // HAVE_TPETRA_DEBUG
1398  const map_type& dstRowMap = * (this->graph_.getRowMap ());
1399  const map_type& srcColMap = * (src->graph_.getColMap ());
1400  const map_type& dstColMap = * (this->graph_.getColMap ());
1401  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
1402 
1403  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
1404  if (verbose) {
1405  std::ostringstream os;
1406  os << prefix
1407  << "canUseLocalColumnIndices: "
1408  << (canUseLocalColumnIndices ? "true" : "false")
1409  << ", numPermute: " << numPermute
1410  << endl;
1411  std::cerr << os.str ();
1412  }
1413 
1414  const auto permuteToLIDsHost = permuteToLIDs.view_host();
1415  const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1416 
1417  if (canUseLocalColumnIndices) {
1418  // Copy local rows that are the "same" in both source and target.
1419  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1420 #ifdef HAVE_TPETRA_DEBUG
1421  if (! srcRowMap.isNodeLocalElement (localRow)) {
1422  lclErr = true;
1423  invalidSrcCopyRows.insert (localRow);
1424  continue; // skip invalid rows
1425  }
1426 #endif // HAVE_TPETRA_DEBUG
1427 
1428  local_inds_host_view_type lclSrcCols;
1429  values_host_view_type vals;
1430  LO numEntries;
1431  // If this call fails, that means the mesh row local index is
1432  // invalid. That means the Import or Export is invalid somehow.
1433  src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1434  if (numEntries > 0) {
1435  LO err = this->replaceLocalValues (localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1436  if (err != numEntries) {
1437  lclErr = true;
1438  if (! dstRowMap.isNodeLocalElement (localRow)) {
1439 #ifdef HAVE_TPETRA_DEBUG
1440  invalidDstCopyRows.insert (localRow);
1441 #endif // HAVE_TPETRA_DEBUG
1442  }
1443  else {
1444  // Once there's an error, there's no sense in saving
1445  // time, so we check whether the column indices were
1446  // invalid. However, only remember which ones were
1447  // invalid in a debug build, because that might take a
1448  // lot of space.
1449  for (LO k = 0; k < numEntries; ++k) {
1450  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
1451  lclErr = true;
1452 #ifdef HAVE_TPETRA_DEBUG
1453  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1454 #endif // HAVE_TPETRA_DEBUG
1455  }
1456  }
1457  }
1458  }
1459  }
1460  } // for each "same" local row
1461 
1462  // Copy the "permute" local rows.
1463  for (size_t k = 0; k < numPermute; ++k) {
1464  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1465  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1466 
1467  local_inds_host_view_type lclSrcCols;
1468  values_host_view_type vals;
1469  LO numEntries;
1470  src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1471  if (numEntries > 0) {
1472  LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1473  if (err != numEntries) {
1474  lclErr = true;
1475 #ifdef HAVE_TPETRA_DEBUG
1476  for (LO c = 0; c < numEntries; ++c) {
1477  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
1478  invalidDstPermuteCols.insert (lclSrcCols[c]);
1479  }
1480  }
1481 #endif // HAVE_TPETRA_DEBUG
1482  }
1483  }
1484  }
1485  }
1486  else { // must convert column indices to global
1487  // Reserve space to store the destination matrix's local column indices.
1488  const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries ();
1489  Teuchos::Array<LO> lclDstCols (maxNumEnt);
1490 
1491  // Copy local rows that are the "same" in both source and target.
1492  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1493  local_inds_host_view_type lclSrcCols;
1494  values_host_view_type vals;
1495  LO numEntries;
1496 
1497  // If this call fails, that means the mesh row local index is
1498  // invalid. That means the Import or Export is invalid somehow.
1499  try {
1500  src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1501  } catch (std::exception& e) {
1502  if (debug) {
1503  std::ostringstream os;
1504  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1505  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1506  << localRow << ", src->getLocalRowView() threw an exception: "
1507  << e.what ();
1508  std::cerr << os.str ();
1509  }
1510  throw e;
1511  }
1512 
1513  if (numEntries > 0) {
1514  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1515  lclErr = true;
1516  if (debug) {
1517  std::ostringstream os;
1518  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1519  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1520  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
1521  << maxNumEnt << endl;
1522  std::cerr << os.str ();
1523  }
1524  }
1525  else {
1526  // Convert the source matrix's local column indices to the
1527  // destination matrix's local column indices.
1528  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1529  for (LO j = 0; j < numEntries; ++j) {
1530  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1531  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1532  lclErr = true;
1533 #ifdef HAVE_TPETRA_DEBUG
1534  invalidDstCopyCols.insert (lclDstColsView[j]);
1535 #endif // HAVE_TPETRA_DEBUG
1536  }
1537  }
1538  LO err(0);
1539  try {
1540  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1541  } catch (std::exception& e) {
1542  if (debug) {
1543  std::ostringstream os;
1544  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1545  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1546  << localRow << ", this->replaceLocalValues() threw an exception: "
1547  << e.what ();
1548  std::cerr << os.str ();
1549  }
1550  throw e;
1551  }
1552  if (err != numEntries) {
1553  lclErr = true;
1554  if (debug) {
1555  std::ostringstream os;
1556  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1557  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
1558  "localRow " << localRow << ", this->replaceLocalValues "
1559  "returned " << err << " instead of numEntries = "
1560  << numEntries << endl;
1561  std::cerr << os.str ();
1562  }
1563  }
1564  }
1565  }
1566  }
1567 
1568  // Copy the "permute" local rows.
1569  for (size_t k = 0; k < numPermute; ++k) {
1570  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1571  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1572 
1573  local_inds_host_view_type lclSrcCols;
1574  values_host_view_type vals;
1575  LO numEntries;
1576 
1577  try {
1578  src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1579  } catch (std::exception& e) {
1580  if (debug) {
1581  std::ostringstream os;
1582  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1583  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
1584  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
1585  << ", src->getLocalRowView() threw an exception: " << e.what ();
1586  std::cerr << os.str ();
1587  }
1588  throw e;
1589  }
1590 
1591  if (numEntries > 0) {
1592  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1593  lclErr = true;
1594  }
1595  else {
1596  // Convert the source matrix's local column indices to the
1597  // destination matrix's local column indices.
1598  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1599  for (LO j = 0; j < numEntries; ++j) {
1600  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1601  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1602  lclErr = true;
1603 #ifdef HAVE_TPETRA_DEBUG
1604  invalidDstPermuteCols.insert (lclDstColsView[j]);
1605 #endif // HAVE_TPETRA_DEBUG
1606  }
1607  }
1608  LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1609  if (err != numEntries) {
1610  lclErr = true;
1611  }
1612  }
1613  }
1614  }
1615  }
1616 
1617  if (lclErr) {
1618  std::ostream& err = this->markLocalErrorAndGetStream ();
1619 #ifdef HAVE_TPETRA_DEBUG
1620  err << "copyAndPermute: The graph structure of the source of the "
1621  "Import or Export must be a subset of the graph structure of the "
1622  "target. ";
1623  err << "invalidSrcCopyRows = [";
1624  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
1625  it != invalidSrcCopyRows.end (); ++it) {
1626  err << *it;
1627  typename std::set<LO>::const_iterator itp1 = it;
1628  itp1++;
1629  if (itp1 != invalidSrcCopyRows.end ()) {
1630  err << ",";
1631  }
1632  }
1633  err << "], invalidDstCopyRows = [";
1634  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
1635  it != invalidDstCopyRows.end (); ++it) {
1636  err << *it;
1637  typename std::set<LO>::const_iterator itp1 = it;
1638  itp1++;
1639  if (itp1 != invalidDstCopyRows.end ()) {
1640  err << ",";
1641  }
1642  }
1643  err << "], invalidDstCopyCols = [";
1644  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
1645  it != invalidDstCopyCols.end (); ++it) {
1646  err << *it;
1647  typename std::set<LO>::const_iterator itp1 = it;
1648  itp1++;
1649  if (itp1 != invalidDstCopyCols.end ()) {
1650  err << ",";
1651  }
1652  }
1653  err << "], invalidDstPermuteCols = [";
1654  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
1655  it != invalidDstPermuteCols.end (); ++it) {
1656  err << *it;
1657  typename std::set<LO>::const_iterator itp1 = it;
1658  itp1++;
1659  if (itp1 != invalidDstPermuteCols.end ()) {
1660  err << ",";
1661  }
1662  }
1663  err << "], invalidPermuteFromRows = [";
1664  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
1665  it != invalidPermuteFromRows.end (); ++it) {
1666  err << *it;
1667  typename std::set<LO>::const_iterator itp1 = it;
1668  itp1++;
1669  if (itp1 != invalidPermuteFromRows.end ()) {
1670  err << ",";
1671  }
1672  }
1673  err << "]" << endl;
1674 #else
1675  err << "copyAndPermute: The graph structure of the source of the "
1676  "Import or Export must be a subset of the graph structure of the "
1677  "target." << endl;
1678 #endif // HAVE_TPETRA_DEBUG
1679  }
1680 
1681  if (debug) {
1682  std::ostringstream os;
1683  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1684  const bool lclSuccess = ! (* (this->localError_));
1685  os << "*** Proc " << myRank << ": copyAndPermute "
1686  << (lclSuccess ? "succeeded" : "FAILED");
1687  if (lclSuccess) {
1688  os << endl;
1689  } else {
1690  os << ": error messages: " << this->errorMessages (); // comes w/ endl
1691  }
1692  std::cerr << os.str ();
1693  }
1694  }
1695 
1696  namespace { // (anonymous)
1697 
1716  template<class LO, class GO>
1717  size_t
1718  packRowCount (const size_t numEnt,
1719  const size_t numBytesPerValue,
1720  const size_t blkSize)
1721  {
1722  using ::Tpetra::Details::PackTraits;
1723 
1724  if (numEnt == 0) {
1725  // Empty rows always take zero bytes, to ensure sparsity.
1726  return 0;
1727  }
1728  else {
1729  // We store the number of entries as a local index (LO).
1730  LO numEntLO = 0; // packValueCount wants this.
1731  GO gid {};
1732  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1733  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1734  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
1735  return numEntLen + gidsLen + valsLen;
1736  }
1737  }
1738 
1749  template<class ST, class LO, class GO>
1750  size_t
1751  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
1752  const size_t offset,
1753  const size_t numBytes,
1754  const size_t /* numBytesPerValue */)
1755  {
1756  using Kokkos::subview;
1757  using ::Tpetra::Details::PackTraits;
1758 
1759  if (numBytes == 0) {
1760  // Empty rows always take zero bytes, to ensure sparsity.
1761  return static_cast<size_t> (0);
1762  }
1763  else {
1764  LO numEntLO = 0;
1765  const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
1766  TEUCHOS_TEST_FOR_EXCEPTION
1767  (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1768  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
1769  << ".");
1770  const char* const inBuf = imports.data () + offset;
1771  const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
1772  TEUCHOS_TEST_FOR_EXCEPTION
1773  (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1774  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
1775  << ".");
1776  return static_cast<size_t> (numEntLO);
1777  }
1778  }
1779 
1783  template<class ST, class LO, class GO>
1784  size_t
1785  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
1786  const size_t offset,
1787  const size_t numEnt,
1788  const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
1789  const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
1790  const size_t numBytesPerValue,
1791  const size_t blockSize)
1792  {
1793  using Kokkos::subview;
1794  using ::Tpetra::Details::PackTraits;
1795 
1796  if (numEnt == 0) {
1797  // Empty rows always take zero bytes, to ensure sparsity.
1798  return 0;
1799  }
1800  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1801 
1802  const GO gid = 0; // packValueCount wants this
1803  const LO numEntLO = static_cast<size_t> (numEnt);
1804 
1805  const size_t numEntBeg = offset;
1806  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1807  const size_t gidsBeg = numEntBeg + numEntLen;
1808  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1809  const size_t valsBeg = gidsBeg + gidsLen;
1810  const size_t valsLen = numScalarEnt * numBytesPerValue;
1811 
1812  char* const numEntOut = exports.data () + numEntBeg;
1813  char* const gidsOut = exports.data () + gidsBeg;
1814  char* const valsOut = exports.data () + valsBeg;
1815 
1816  size_t numBytesOut = 0;
1817  int errorCode = 0;
1818  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
1819 
1820  {
1821  Kokkos::pair<int, size_t> p;
1822  p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
1823  errorCode += p.first;
1824  numBytesOut += p.second;
1825 
1826  p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
1827  errorCode += p.first;
1828  numBytesOut += p.second;
1829  }
1830 
1831  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1832  TEUCHOS_TEST_FOR_EXCEPTION
1833  (numBytesOut != expectedNumBytes, std::logic_error,
1834  "packRowForBlockCrs: numBytesOut = " << numBytesOut
1835  << " != expectedNumBytes = " << expectedNumBytes << ".");
1836 
1837  TEUCHOS_TEST_FOR_EXCEPTION
1838  (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
1839  "PackTraits::packArray returned a nonzero error code");
1840 
1841  return numBytesOut;
1842  }
1843 
1844  // Return the number of bytes actually read / used.
1845  template<class ST, class LO, class GO>
1846  size_t
1847  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
1848  const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
1849  const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
1850  const size_t offset,
1851  const size_t numBytes,
1852  const size_t numEnt,
1853  const size_t numBytesPerValue,
1854  const size_t blockSize)
1855  {
1856  using ::Tpetra::Details::PackTraits;
1857 
1858  if (numBytes == 0) {
1859  // Rows with zero bytes always have zero entries.
1860  return 0;
1861  }
1862  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1863  TEUCHOS_TEST_FOR_EXCEPTION
1864  (static_cast<size_t> (imports.extent (0)) <= offset,
1865  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1866  << imports.extent (0) << " <= offset = " << offset << ".");
1867  TEUCHOS_TEST_FOR_EXCEPTION
1868  (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
1869  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1870  << imports.extent (0) << " < offset + numBytes = "
1871  << (offset + numBytes) << ".");
1872 
1873  const GO gid = 0; // packValueCount wants this
1874  const LO lid = 0; // packValueCount wants this
1875 
1876  const size_t numEntBeg = offset;
1877  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
1878  const size_t gidsBeg = numEntBeg + numEntLen;
1879  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1880  const size_t valsBeg = gidsBeg + gidsLen;
1881  const size_t valsLen = numScalarEnt * numBytesPerValue;
1882 
1883  const char* const numEntIn = imports.data () + numEntBeg;
1884  const char* const gidsIn = imports.data () + gidsBeg;
1885  const char* const valsIn = imports.data () + valsBeg;
1886 
1887  size_t numBytesOut = 0;
1888  int errorCode = 0;
1889  LO numEntOut;
1890  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
1891  TEUCHOS_TEST_FOR_EXCEPTION
1892  (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
1893  "unpackRowForBlockCrs: Expected number of entries " << numEnt
1894  << " != actual number of entries " << numEntOut << ".");
1895 
1896  {
1897  Kokkos::pair<int, size_t> p;
1898  p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
1899  errorCode += p.first;
1900  numBytesOut += p.second;
1901 
1902  p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
1903  errorCode += p.first;
1904  numBytesOut += p.second;
1905  }
1906 
1907  TEUCHOS_TEST_FOR_EXCEPTION
1908  (numBytesOut != numBytes, std::logic_error,
1909  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1910  << " != numBytes = " << numBytes << ".");
1911 
1912  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1913  TEUCHOS_TEST_FOR_EXCEPTION
1914  (numBytesOut != expectedNumBytes, std::logic_error,
1915  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1916  << " != expectedNumBytes = " << expectedNumBytes << ".");
1917 
1918  TEUCHOS_TEST_FOR_EXCEPTION
1919  (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
1920  "PackTraits::unpackArray returned a nonzero error code");
1921 
1922  return numBytesOut;
1923  }
1924  } // namespace (anonymous)
1925 
1926  template<class Scalar, class LO, class GO, class Node>
1927  void
1930  (const ::Tpetra::SrcDistObject& source,
1931  const Kokkos::DualView<const local_ordinal_type*,
1932  buffer_device_type>& exportLIDs,
1933  Kokkos::DualView<packet_type*,
1934  buffer_device_type>& exports, // output
1935  Kokkos::DualView<size_t*,
1936  buffer_device_type> numPacketsPerLID, // output
1937  size_t& constantNumPackets)
1938  {
1939  using ::Tpetra::Details::Behavior;
1941  using ::Tpetra::Details::ProfilingRegion;
1942  using ::Tpetra::Details::PackTraits;
1943 
1944  typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
1945 
1946  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1947 
1948  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
1949 
1950  const bool debug = Behavior::debug();
1951  const bool verbose = Behavior::verbose();
1952 
1953  // Define this function prefix
1954  std::string prefix;
1955  {
1956  std::ostringstream os;
1957  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1958  os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
1959  prefix = os.str();
1960  }
1961 
1962  // check if this already includes a local error
1963  if (* (this->localError_)) {
1964  std::ostream& err = this->markLocalErrorAndGetStream ();
1965  err << prefix
1966  << "The target object of the Import or Export is already in an error state."
1967  << std::endl;
1968  return;
1969  }
1970 
1971  //
1972  // Verbose input dual view status
1973  //
1974  if (verbose) {
1975  std::ostringstream os;
1976  os << prefix << std::endl
1977  << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
1978  << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
1979  << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
1980  std::cerr << os.str ();
1981  }
1982 
1986  if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
1987  std::ostream& err = this->markLocalErrorAndGetStream ();
1988  err << prefix
1989  << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
1990  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
1991  << "." << std::endl;
1992  return;
1993  }
1994  if (exportLIDs.need_sync_host ()) {
1995  std::ostream& err = this->markLocalErrorAndGetStream ();
1996  err << prefix << "exportLIDs be sync'd to host." << std::endl;
1997  return;
1998  }
1999 
2000  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
2001  if (src == NULL) {
2002  std::ostream& err = this->markLocalErrorAndGetStream ();
2003  err << prefix
2004  << "The source (input) object of the Import or "
2005  "Export is either not a BlockCrsMatrix, or does not have the right "
2006  "template parameters. checkSizes() should have caught this. "
2007  "Please report this bug to the Tpetra developers." << std::endl;
2008  return;
2009  }
2010 
2011  // Graphs and matrices are allowed to have a variable number of
2012  // entries per row. We could test whether all rows have the same
2013  // number of entries, but DistObject can only use this
2014  // optimization if all rows on _all_ processes have the same
2015  // number of entries. Rather than do the all-reduce necessary to
2016  // test for this unlikely case, we tell DistObject (by setting
2017  // constantNumPackets to zero) to assume that different rows may
2018  // have different numbers of entries.
2019  constantNumPackets = 0;
2020 
2021  // const values
2022  const crs_graph_type& srcGraph = src->graph_;
2023  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2024  const size_t numExportLIDs = exportLIDs.extent (0);
2025  size_t numBytesPerValue(0);
2026  {
2027  auto val_host = val_.getHostView(Access::ReadOnly);
2028  numBytesPerValue =
2029  PackTraits<impl_scalar_type>
2030  ::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2031  }
2032 
2033  // Compute the number of bytes ("packets") per row to pack. While
2034  // we're at it, compute the total # of block entries to send, and
2035  // the max # of block entries in any of the rows we're sending.
2036 
2037  Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2038 
2039  // Graph information is on host; let's do this on host parallel reduce
2040  auto exportLIDsHost = exportLIDs.view_host();
2041  auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2042  numPacketsPerLID.modify_host ();
2043  {
2044  rowReducerStruct = Impl::BlockCrsRowStruct<size_t>();
2045  for (size_t i = 0; i < numExportLIDs; ++i) {
2046  const LO lclRow = exportLIDsHost(i);
2047  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2048  numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2049 
2050  const size_t numBytes =
2051  packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2052  numPacketsPerLIDHost(i) = numBytes;
2053  rowReducerStruct += Impl::BlockCrsRowStruct<size_t>(numEnt, numBytes, numEnt);
2054  }
2055  }
2056 
2057  // Compute the number of bytes ("packets") per row to pack. While
2058  // we're at it, compute the total # of block entries to send, and
2059  // the max # of block entries in any of the rows we're sending.
2060  const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2061  const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2062  const size_t maxRowLength = rowReducerStruct.maxRowLength;
2063 
2064  if (verbose) {
2065  std::ostringstream os;
2066  os << prefix
2067  << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2068  << std::endl;
2069  std::cerr << os.str ();
2070  }
2071 
2072  // We use a "struct of arrays" approach to packing each row's
2073  // entries. All the column indices (as global indices) go first,
2074  // then all their owning process ranks, and then the values.
2075  if(exports.extent(0) != totalNumBytes)
2076  {
2077  const std::string oldLabel = exports.d_view.label ();
2078  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2079  exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2080  }
2081  if (totalNumEntries > 0) {
2082  // Current position (in bytes) in the 'exports' output array.
2083  Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2084  {
2085  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2086  Kokkos::parallel_scan
2087  (policy,
2088  [=](const size_t &i, size_t &update, const bool &final) {
2089  if (final) offset(i) = update;
2090  update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2091  });
2092  }
2093  if (offset(numExportLIDs) != totalNumBytes) {
2094  std::ostream& err = this->markLocalErrorAndGetStream ();
2095  err << prefix
2096  << "At end of method, the final offset (in bytes) "
2097  << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2098  << totalNumBytes << ". "
2099  << "Please report this bug to the Tpetra developers." << std::endl;
2100  return;
2101  }
2102 
2103  // For each block row of the matrix owned by the calling
2104  // process, pack that block row's column indices and values into
2105  // the exports array.
2106 
2107  // Source matrix's column Map. We verified in checkSizes() that
2108  // the column Map exists (is not null).
2109  const map_type& srcColMap = * (srcGraph.getColMap ());
2110 
2111  // Pack the data for each row to send, into the 'exports' buffer.
2112  // exports will be modified on host.
2113  exports.modify_host();
2114  {
2115  typedef Kokkos::TeamPolicy<host_exec> policy_type;
2116  const auto policy =
2117  policy_type(numExportLIDs, 1, 1)
2118  .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2119  // The following parallel_for needs const access to the local values of src.
2120  // (the local graph is also accessed on host, but this does not use WDVs).
2121  getValuesHost();
2123  Kokkos::parallel_for
2124  (policy,
2125  [=](const typename policy_type::member_type &member) {
2126  const size_t i = member.league_rank();
2127  Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2128  gblColInds(member.team_scratch(0), maxRowLength);
2129 
2130  const LO lclRowInd = exportLIDsHost(i);
2131  local_inds_host_view_type lclColInds;
2132  values_host_view_type vals;
2133 
2134  // It's OK to ignore the return value, since if the calling
2135  // process doesn't own that local row, then the number of
2136  // entries in that row on the calling process is zero.
2137  src->getLocalRowView (lclRowInd, lclColInds, vals);
2138  const size_t numEnt = lclColInds.extent(0);
2139 
2140  // Convert column indices from local to global.
2141  for (size_t j = 0; j < numEnt; ++j)
2142  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2143 
2144  // Kyungjoo: additional wrapping scratch view is necessary
2145  // the following function interface need the same execution space
2146  // host scratch space somehow is not considered same as the host_exec
2147  // Copy the row's data into the current spot in the exports array.
2148  const size_t numBytes =
2149  packRowForBlockCrs<impl_scalar_type, LO, GO>
2150  (exports.view_host(),
2151  offset(i),
2152  numEnt,
2153  Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2154  vals,
2155  numBytesPerValue,
2156  blockSize);
2157 
2158  // numBytes should be same as the difference between offsets
2159  if (debug) {
2160  const size_t offsetDiff = offset(i+1) - offset(i);
2161  if (numBytes != offsetDiff) {
2162  std::ostringstream os;
2163  os << prefix
2164  << "numBytes computed from packRowForBlockCrs is different from "
2165  << "precomputed offset values, LID = " << i << std::endl;
2166  std::cerr << os.str ();
2167  }
2168  }
2169  }); // for each LID (of a row) to send
2171  }
2172  } // if totalNumEntries > 0
2173 
2174  if (debug) {
2175  std::ostringstream os;
2176  const bool lclSuccess = ! (* (this->localError_));
2177  os << prefix
2178  << (lclSuccess ? "succeeded" : "FAILED")
2179  << std::endl;
2180  std::cerr << os.str ();
2181  }
2182  }
2183 
2184  template<class Scalar, class LO, class GO, class Node>
2185  void
2188  (const Kokkos::DualView<const local_ordinal_type*,
2189  buffer_device_type>& importLIDs,
2190  Kokkos::DualView<packet_type*,
2191  buffer_device_type> imports,
2192  Kokkos::DualView<size_t*,
2193  buffer_device_type> numPacketsPerLID,
2194  const size_t /* constantNumPackets */,
2195  const CombineMode combineMode)
2196  {
2197  using ::Tpetra::Details::Behavior;
2199  using ::Tpetra::Details::ProfilingRegion;
2200  using ::Tpetra::Details::PackTraits;
2201  using std::endl;
2202  using host_exec =
2203  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2204 
2205  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2206  const bool verbose = Behavior::verbose ();
2207 
2208  // Define this function prefix
2209  std::string prefix;
2210  {
2211  std::ostringstream os;
2212  auto map = this->graph_.getRowMap ();
2213  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
2214  const int myRank = comm.is_null () ? -1 : comm->getRank ();
2215  os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2216  prefix = os.str ();
2217  if (verbose) {
2218  os << "Start" << endl;
2219  std::cerr << os.str ();
2220  }
2221  }
2222 
2223  // check if this already includes a local error
2224  if (* (this->localError_)) {
2225  std::ostream& err = this->markLocalErrorAndGetStream ();
2226  std::ostringstream os;
2227  os << prefix << "{Im/Ex}port target is already in error." << endl;
2228  if (verbose) {
2229  std::cerr << os.str ();
2230  }
2231  err << os.str ();
2232  return;
2233  }
2234 
2238  if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2239  std::ostream& err = this->markLocalErrorAndGetStream ();
2240  std::ostringstream os;
2241  os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
2242  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2243  << "." << endl;
2244  if (verbose) {
2245  std::cerr << os.str ();
2246  }
2247  err << os.str ();
2248  return;
2249  }
2250 
2251  if (combineMode != ADD && combineMode != INSERT &&
2252  combineMode != REPLACE && combineMode != ABSMAX &&
2253  combineMode != ZERO) {
2254  std::ostream& err = this->markLocalErrorAndGetStream ();
2255  std::ostringstream os;
2256  os << prefix
2257  << "Invalid CombineMode value " << combineMode << ". Valid "
2258  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2259  << std::endl;
2260  if (verbose) {
2261  std::cerr << os.str ();
2262  }
2263  err << os.str ();
2264  return;
2265  }
2266 
2267  if (this->graph_.getColMap ().is_null ()) {
2268  std::ostream& err = this->markLocalErrorAndGetStream ();
2269  std::ostringstream os;
2270  os << prefix << "Target matrix's column Map is null." << endl;
2271  if (verbose) {
2272  std::cerr << os.str ();
2273  }
2274  err << os.str ();
2275  return;
2276  }
2277 
2278  // Target matrix's column Map. Use to convert the global column
2279  // indices in the receive buffer to local indices. We verified in
2280  // checkSizes() that the column Map exists (is not null).
2281  const map_type& tgtColMap = * (this->graph_.getColMap ());
2282 
2283  // Const values
2284  const size_t blockSize = this->getBlockSize ();
2285  const size_t numImportLIDs = importLIDs.extent(0);
2286  // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
2287  // default-constructed instance could have a different size than
2288  // other instances. (We assume all nominally constructed
2289  // instances have the same size; that's not the issue here.) This
2290  // could be bad if the calling process has no entries, but other
2291  // processes have entries that they want to send to us.
2292  size_t numBytesPerValue(0);
2293  {
2294  auto val_host = val_.getHostView(Access::ReadOnly);
2295  numBytesPerValue =
2296  PackTraits<impl_scalar_type>::packValueCount
2297  (val_host.extent (0) ? val_host(0) : impl_scalar_type ());
2298  }
2299  const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries ();
2300  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2301 
2302  if (verbose) {
2303  std::ostringstream os;
2304  os << prefix << "combineMode: "
2305  << ::Tpetra::combineModeToString (combineMode)
2306  << ", blockSize: " << blockSize
2307  << ", numImportLIDs: " << numImportLIDs
2308  << ", numBytesPerValue: " << numBytesPerValue
2309  << ", maxRowNumEnt: " << maxRowNumEnt
2310  << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2311  std::cerr << os.str ();
2312  }
2313 
2314  if (combineMode == ZERO || numImportLIDs == 0) {
2315  if (verbose) {
2316  std::ostringstream os;
2317  os << prefix << "Nothing to unpack. Done!" << std::endl;
2318  std::cerr << os.str ();
2319  }
2320  return;
2321  }
2322 
2323  // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
2324  // we can remove this sync.
2325  if (imports.need_sync_host ()) {
2326  imports.sync_host ();
2327  }
2328 
2329  // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
2330  // sync'd numPacketsPerLID to host, since it needs to do that in
2331  // order to post MPI_Irecv messages with the correct lengths. A
2332  // hypothetical device-specific boundary exchange implementation
2333  // could possibly receive data without sync'ing lengths to host,
2334  // but we don't need to design for that nonexistent thing yet.
2335 
2336  if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
2337  importLIDs.need_sync_host ()) {
2338  std::ostream& err = this->markLocalErrorAndGetStream ();
2339  std::ostringstream os;
2340  os << prefix << "All input DualViews must be sync'd to host by now. "
2341  << "imports_nc.need_sync_host()="
2342  << (imports.need_sync_host () ? "true" : "false")
2343  << ", numPacketsPerLID.need_sync_host()="
2344  << (numPacketsPerLID.need_sync_host () ? "true" : "false")
2345  << ", importLIDs.need_sync_host()="
2346  << (importLIDs.need_sync_host () ? "true" : "false")
2347  << "." << endl;
2348  if (verbose) {
2349  std::cerr << os.str ();
2350  }
2351  err << os.str ();
2352  return;
2353  }
2354 
2355  const auto importLIDsHost = importLIDs.view_host ();
2356  const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
2357 
2358  // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
2359  // loop, by only unpacking on final==true (when we know the
2360  // current offset's value).
2361 
2362  Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
2363  {
2364  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
2365  Kokkos::parallel_scan
2366  ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2367  [=] (const size_t &i, size_t &update, const bool &final) {
2368  if (final) offset(i) = update;
2369  update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2370  });
2371  }
2372 
2373  // this variable does not matter with a race condition (just error flag)
2374  //
2375  // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
2376  // or atomics on bool, so we use int instead. (I know we're not
2377  // launching a CUDA loop here, but we could in the future, and I'd
2378  // like to avoid potential trouble.)
2379  Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2380  errorDuringUnpack ("errorDuringUnpack");
2381  errorDuringUnpack () = 0;
2382  {
2383  using policy_type = Kokkos::TeamPolicy<host_exec>;
2384  size_t scratch_per_row = sizeof(GO) * maxRowNumEnt + sizeof (LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt
2385  + 2 * sizeof(GO); // Yeah, this is a fudge factor
2386 
2387  const auto policy = policy_type (numImportLIDs, 1, 1)
2388  .set_scratch_size (0, Kokkos::PerTeam (scratch_per_row));
2389  using host_scratch_space = typename host_exec::scratch_memory_space;
2390 
2391  using pair_type = Kokkos::pair<size_t, size_t>;
2392 
2393  //The following parallel_for modifies values on host while unpacking.
2394  getValuesHostNonConst();
2396  Kokkos::parallel_for
2397  ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2398  [=] (const typename policy_type::member_type& member) {
2399  const size_t i = member.league_rank();
2400  Kokkos::View<GO*, host_scratch_space> gblColInds
2401  (member.team_scratch (0), maxRowNumEnt);
2402  Kokkos::View<LO*, host_scratch_space> lclColInds
2403  (member.team_scratch (0), maxRowNumEnt);
2404  Kokkos::View<impl_scalar_type*, host_scratch_space> vals
2405  (member.team_scratch (0), maxRowNumScalarEnt);
2406 
2407 
2408  const size_t offval = offset(i);
2409  const LO lclRow = importLIDsHost(i);
2410  const size_t numBytes = numPacketsPerLIDHost(i);
2411  const size_t numEnt =
2412  unpackRowCount<impl_scalar_type, LO, GO>
2413  (imports.view_host (), offval, numBytes, numBytesPerValue);
2414 
2415  if (numBytes > 0) {
2416  if (numEnt > maxRowNumEnt) {
2417  errorDuringUnpack() = 1;
2418  if (verbose) {
2419  std::ostringstream os;
2420  os << prefix
2421  << "At i = " << i << ", numEnt = " << numEnt
2422  << " > maxRowNumEnt = " << maxRowNumEnt
2423  << std::endl;
2424  std::cerr << os.str ();
2425  }
2426  return;
2427  }
2428  }
2429  const size_t numScalarEnt = numEnt*blockSize*blockSize;
2430  auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2431  auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2432  auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2433 
2434  // Kyungjoo: additional wrapping scratch view is necessary
2435  // the following function interface need the same execution space
2436  // host scratch space somehow is not considered same as the host_exec
2437  size_t numBytesOut = 0;
2438  try {
2439  numBytesOut =
2440  unpackRowForBlockCrs<impl_scalar_type, LO, GO>
2441  (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
2442  Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
2443  imports.view_host(),
2444  offval, numBytes, numEnt,
2445  numBytesPerValue, blockSize);
2446  }
2447  catch (std::exception& e) {
2448  errorDuringUnpack() = 1;
2449  if (verbose) {
2450  std::ostringstream os;
2451  os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
2452  << e.what () << endl;
2453  std::cerr << os.str ();
2454  }
2455  return;
2456  }
2457 
2458  if (numBytes != numBytesOut) {
2459  errorDuringUnpack() = 1;
2460  if (verbose) {
2461  std::ostringstream os;
2462  os << prefix
2463  << "At i = " << i << ", numBytes = " << numBytes
2464  << " != numBytesOut = " << numBytesOut << "."
2465  << std::endl;
2466  std::cerr << os.str();
2467  }
2468  return;
2469  }
2470 
2471  // Convert incoming global indices to local indices.
2472  for (size_t k = 0; k < numEnt; ++k) {
2473  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
2474  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2475  if (verbose) {
2476  std::ostringstream os;
2477  os << prefix
2478  << "At i = " << i << ", GID " << gidsOut(k)
2479  << " is not owned by the calling process."
2480  << std::endl;
2481  std::cerr << os.str();
2482  }
2483  return;
2484  }
2485  }
2486 
2487  // Combine the incoming data with the matrix's current data.
2488  LO numCombd = 0;
2489  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
2490  const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
2491  (const_cast<const impl_scalar_type*> (valsOut.data ()));
2492  if (combineMode == ADD) {
2493  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2494  } else if (combineMode == INSERT || combineMode == REPLACE) {
2495  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2496  } else if (combineMode == ABSMAX) {
2497  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2498  }
2499 
2500  if (static_cast<LO> (numEnt) != numCombd) {
2501  errorDuringUnpack() = 1;
2502  if (verbose) {
2503  std::ostringstream os;
2504  os << prefix << "At i = " << i << ", numEnt = " << numEnt
2505  << " != numCombd = " << numCombd << "."
2506  << endl;
2507  std::cerr << os.str ();
2508  }
2509  return;
2510  }
2511  }); // for each import LID i
2513  }
2514 
2515  if (errorDuringUnpack () != 0) {
2516  std::ostream& err = this->markLocalErrorAndGetStream ();
2517  err << prefix << "Unpacking failed.";
2518  if (! verbose) {
2519  err << " Please run again with the environment variable setting "
2520  "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
2521  }
2522  err << endl;
2523  }
2524 
2525  if (verbose) {
2526  std::ostringstream os;
2527  const bool lclSuccess = ! (* (this->localError_));
2528  os << prefix
2529  << (lclSuccess ? "succeeded" : "FAILED")
2530  << std::endl;
2531  std::cerr << os.str ();
2532  }
2533  }
2534 
2535  template<class Scalar, class LO, class GO, class Node>
2536  std::string
2538  {
2539  using Teuchos::TypeNameTraits;
2540  std::ostringstream os;
2541  os << "\"Tpetra::BlockCrsMatrix\": { "
2542  << "Template parameters: { "
2543  << "Scalar: " << TypeNameTraits<Scalar>::name ()
2544  << "LO: " << TypeNameTraits<LO>::name ()
2545  << "GO: " << TypeNameTraits<GO>::name ()
2546  << "Node: " << TypeNameTraits<Node>::name ()
2547  << " }"
2548  << ", Label: \"" << this->getObjectLabel () << "\""
2549  << ", Global dimensions: ["
2550  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2551  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
2552  << ", Block size: " << getBlockSize ()
2553  << " }";
2554  return os.str ();
2555  }
2556 
2557 
2558  template<class Scalar, class LO, class GO, class Node>
2559  void
2561  describe (Teuchos::FancyOStream& out,
2562  const Teuchos::EVerbosityLevel verbLevel) const
2563  {
2564  using Teuchos::ArrayRCP;
2565  using Teuchos::CommRequest;
2566  using Teuchos::FancyOStream;
2567  using Teuchos::getFancyOStream;
2568  using Teuchos::ireceive;
2569  using Teuchos::isend;
2570  using Teuchos::outArg;
2571  using Teuchos::TypeNameTraits;
2572  using Teuchos::VERB_DEFAULT;
2573  using Teuchos::VERB_NONE;
2574  using Teuchos::VERB_LOW;
2575  using Teuchos::VERB_MEDIUM;
2576  // using Teuchos::VERB_HIGH;
2577  using Teuchos::VERB_EXTREME;
2578  using Teuchos::RCP;
2579  using Teuchos::wait;
2580  using std::endl;
2581 
2582  // Set default verbosity if applicable.
2583  const Teuchos::EVerbosityLevel vl =
2584  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
2585 
2586  if (vl == VERB_NONE) {
2587  return; // print nothing
2588  }
2589 
2590  // describe() always starts with a tab before it prints anything.
2591  Teuchos::OSTab tab0 (out);
2592 
2593  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
2594  Teuchos::OSTab tab1 (out);
2595 
2596  out << "Template parameters:" << endl;
2597  {
2598  Teuchos::OSTab tab2 (out);
2599  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
2600  << "LO: " << TypeNameTraits<LO>::name () << endl
2601  << "GO: " << TypeNameTraits<GO>::name () << endl
2602  << "Node: " << TypeNameTraits<Node>::name () << endl;
2603  }
2604  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
2605  << "Global dimensions: ["
2606  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2607  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
2608 
2609  const LO blockSize = getBlockSize ();
2610  out << "Block size: " << blockSize << endl;
2611 
2612  // constituent objects
2613  if (vl >= VERB_MEDIUM) {
2614  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2615  const int myRank = comm.getRank ();
2616  if (myRank == 0) {
2617  out << "Row Map:" << endl;
2618  }
2619  getRowMap()->describe (out, vl);
2620 
2621  if (! getColMap ().is_null ()) {
2622  if (getColMap() == getRowMap()) {
2623  if (myRank == 0) {
2624  out << "Column Map: same as row Map" << endl;
2625  }
2626  }
2627  else {
2628  if (myRank == 0) {
2629  out << "Column Map:" << endl;
2630  }
2631  getColMap ()->describe (out, vl);
2632  }
2633  }
2634  if (! getDomainMap ().is_null ()) {
2635  if (getDomainMap () == getRowMap ()) {
2636  if (myRank == 0) {
2637  out << "Domain Map: same as row Map" << endl;
2638  }
2639  }
2640  else if (getDomainMap () == getColMap ()) {
2641  if (myRank == 0) {
2642  out << "Domain Map: same as column Map" << endl;
2643  }
2644  }
2645  else {
2646  if (myRank == 0) {
2647  out << "Domain Map:" << endl;
2648  }
2649  getDomainMap ()->describe (out, vl);
2650  }
2651  }
2652  if (! getRangeMap ().is_null ()) {
2653  if (getRangeMap () == getDomainMap ()) {
2654  if (myRank == 0) {
2655  out << "Range Map: same as domain Map" << endl;
2656  }
2657  }
2658  else if (getRangeMap () == getRowMap ()) {
2659  if (myRank == 0) {
2660  out << "Range Map: same as row Map" << endl;
2661  }
2662  }
2663  else {
2664  if (myRank == 0) {
2665  out << "Range Map: " << endl;
2666  }
2667  getRangeMap ()->describe (out, vl);
2668  }
2669  }
2670  }
2671 
2672  if (vl >= VERB_EXTREME) {
2673  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2674  const int myRank = comm.getRank ();
2675  const int numProcs = comm.getSize ();
2676 
2677  // Print the calling process' data to the given output stream.
2678  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
2679  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
2680  FancyOStream& os = *osPtr;
2681  os << "Process " << myRank << ":" << endl;
2682  Teuchos::OSTab tab2 (os);
2683 
2684  const map_type& meshRowMap = * (graph_.getRowMap ());
2685  const map_type& meshColMap = * (graph_.getColMap ());
2686  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
2687  meshLclRow <= meshRowMap.getMaxLocalIndex ();
2688  ++meshLclRow) {
2689  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
2690  os << "Row " << meshGblRow << ": {";
2691 
2692  local_inds_host_view_type lclColInds;
2693  values_host_view_type vals;
2694  LO numInds = 0;
2695  this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
2696 
2697  for (LO k = 0; k < numInds; ++k) {
2698  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
2699 
2700  os << "Col " << gblCol << ": [";
2701  for (LO i = 0; i < blockSize; ++i) {
2702  for (LO j = 0; j < blockSize; ++j) {
2703  os << vals[blockSize*blockSize*k + i*blockSize + j];
2704  if (j + 1 < blockSize) {
2705  os << ", ";
2706  }
2707  }
2708  if (i + 1 < blockSize) {
2709  os << "; ";
2710  }
2711  }
2712  os << "]";
2713  if (k + 1 < numInds) {
2714  os << ", ";
2715  }
2716  }
2717  os << "}" << endl;
2718  }
2719 
2720  // Print data on Process 0. This will automatically respect the
2721  // current indentation level.
2722  if (myRank == 0) {
2723  out << lclOutStrPtr->str ();
2724  lclOutStrPtr = Teuchos::null; // clear it to save space
2725  }
2726 
2727  const int sizeTag = 1337;
2728  const int dataTag = 1338;
2729 
2730  ArrayRCP<char> recvDataBuf; // only used on Process 0
2731 
2732  // Send string sizes and data from each process in turn to
2733  // Process 0, and print on that process.
2734  for (int p = 1; p < numProcs; ++p) {
2735  if (myRank == 0) {
2736  // Receive the incoming string's length.
2737  ArrayRCP<size_t> recvSize (1);
2738  recvSize[0] = 0;
2739  RCP<CommRequest<int> > recvSizeReq =
2740  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
2741  wait<int> (comm, outArg (recvSizeReq));
2742  const size_t numCharsToRecv = recvSize[0];
2743 
2744  // Allocate space for the string to receive. Reuse receive
2745  // buffer space if possible. We can do this because in the
2746  // current implementation, we only have one receive in
2747  // flight at a time. Leave space for the '\0' at the end,
2748  // in case the sender doesn't send it.
2749  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
2750  recvDataBuf.resize (numCharsToRecv + 1);
2751  }
2752  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
2753  // Post the receive of the actual string data.
2754  RCP<CommRequest<int> > recvDataReq =
2755  ireceive<int, char> (recvData, p, dataTag, comm);
2756  wait<int> (comm, outArg (recvDataReq));
2757 
2758  // Print the received data. This will respect the current
2759  // indentation level. Make sure that the string is
2760  // null-terminated.
2761  recvDataBuf[numCharsToRecv] = '\0';
2762  out << recvDataBuf.getRawPtr ();
2763  }
2764  else if (myRank == p) { // if I am not Process 0, and my rank is p
2765  // This deep-copies the string at most twice, depending on
2766  // whether std::string reference counts internally (it
2767  // generally does, so this won't deep-copy at all).
2768  const std::string stringToSend = lclOutStrPtr->str ();
2769  lclOutStrPtr = Teuchos::null; // clear original to save space
2770 
2771  // Send the string's length to Process 0.
2772  const size_t numCharsToSend = stringToSend.size ();
2773  ArrayRCP<size_t> sendSize (1);
2774  sendSize[0] = numCharsToSend;
2775  RCP<CommRequest<int> > sendSizeReq =
2776  isend<int, size_t> (sendSize, 0, sizeTag, comm);
2777  wait<int> (comm, outArg (sendSizeReq));
2778 
2779  // Send the actual string to Process 0. We know that the
2780  // string has length > 0, so it's save to take the address
2781  // of the first entry. Make a nonowning ArrayRCP to hold
2782  // the string. Process 0 will add a null termination
2783  // character at the end of the string, after it receives the
2784  // message.
2785  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
2786  RCP<CommRequest<int> > sendDataReq =
2787  isend<int, char> (sendData, 0, dataTag, comm);
2788  wait<int> (comm, outArg (sendDataReq));
2789  }
2790  } // for each process rank p other than 0
2791  } // extreme verbosity level (print the whole matrix)
2792  }
2793 
2794  template<class Scalar, class LO, class GO, class Node>
2795  Teuchos::RCP<const Teuchos::Comm<int> >
2797  getComm() const
2798  {
2799  return graph_.getComm();
2800  }
2801 
2802 
2803  template<class Scalar, class LO, class GO, class Node>
2807  {
2808  return graph_.getGlobalNumCols();
2809  }
2810 
2811 
2812  template<class Scalar, class LO, class GO, class Node>
2813  size_t
2816  {
2817  return graph_.getLocalNumCols();
2818  }
2819 
2820  template<class Scalar, class LO, class GO, class Node>
2821  GO
2824  {
2825  return graph_.getIndexBase();
2826  }
2827 
2828  template<class Scalar, class LO, class GO, class Node>
2832  {
2833  return graph_.getGlobalNumEntries();
2834  }
2835 
2836  template<class Scalar, class LO, class GO, class Node>
2837  size_t
2840  {
2841  return graph_.getLocalNumEntries();
2842  }
2843 
2844  template<class Scalar, class LO, class GO, class Node>
2845  size_t
2847  getNumEntriesInGlobalRow (GO globalRow) const
2848  {
2849  return graph_.getNumEntriesInGlobalRow(globalRow);
2850  }
2851 
2852 
2853  template<class Scalar, class LO, class GO, class Node>
2854  size_t
2857  {
2858  return graph_.getGlobalMaxNumRowEntries();
2859  }
2860 
2861  template<class Scalar, class LO, class GO, class Node>
2862  bool
2864  hasColMap() const
2865  {
2866  return graph_.hasColMap();
2867  }
2868 
2869 
2870  template<class Scalar, class LO, class GO, class Node>
2871  bool
2874  {
2875  return graph_.isLocallyIndexed();
2876  }
2877 
2878  template<class Scalar, class LO, class GO, class Node>
2879  bool
2882  {
2883  return graph_.isGloballyIndexed();
2884  }
2885 
2886  template<class Scalar, class LO, class GO, class Node>
2887  bool
2890  {
2891  return graph_.isFillComplete ();
2892  }
2893 
2894  template<class Scalar, class LO, class GO, class Node>
2895  bool
2898  {
2899  return false;
2900  }
2901 
2902  template<class Scalar, class LO, class GO, class Node>
2903  void
2905  getGlobalRowCopy (GO /*GlobalRow*/,
2906  nonconst_global_inds_host_view_type &/*Indices*/,
2907  nonconst_values_host_view_type &/*Values*/,
2908  size_t& /*NumEntries*/) const
2909  {
2910  TEUCHOS_TEST_FOR_EXCEPTION(
2911  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
2912  "This class doesn't support global matrix indexing.");
2913 
2914  }
2915 
2916  template<class Scalar, class LO, class GO, class Node>
2917  void
2919  getGlobalRowView (GO /* GlobalRow */,
2920  global_inds_host_view_type &/* indices */,
2921  values_host_view_type &/* values */) const
2922  {
2923  TEUCHOS_TEST_FOR_EXCEPTION(
2924  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
2925  "This class doesn't support global matrix indexing.");
2926 
2927  }
2928 
2929  template<class Scalar, class LO, class GO, class Node>
2930  void
2932  getLocalRowView (LO localRowInd,
2933  local_inds_host_view_type &colInds,
2934  values_host_view_type &vals) const
2935  {
2936  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2937  colInds = local_inds_host_view_type();
2938  vals = values_host_view_type();
2939  }
2940  else {
2941  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2942  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2943  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
2944 
2945  vals = getValuesHost (localRowInd);
2946  }
2947  }
2948 
2949  template<class Scalar, class LO, class GO, class Node>
2950  void
2953  local_inds_host_view_type &colInds,
2954  nonconst_values_host_view_type &vals) const
2955  {
2956  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2957  colInds = nonconst_local_inds_host_view_type();
2958  vals = nonconst_values_host_view_type();
2959  }
2960  else {
2961  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2962  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2963  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
2964 
2965  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
2966  vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
2967  }
2968  }
2969 
2970  template<class Scalar, class LO, class GO, class Node>
2971  void
2974  {
2975  const size_t lclNumMeshRows = graph_.getLocalNumRows ();
2976 
2977  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
2978  graph_.getLocalDiagOffsets (diagOffsets);
2979 
2980  // The code below works on host, so use a host View.
2981  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
2982  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
2983  Kokkos::deep_copy (execution_space(), diagOffsetsHost, diagOffsets);
2984 
2985  auto vals_host_out = val_.getHostView(Access::ReadOnly);
2986  const impl_scalar_type* vals_host_out_raw = vals_host_out.data();
2987 
2988  // TODO amk: This is a temporary measure to make the code run with Ifpack2
2989  size_t rowOffset = 0;
2990  size_t offset = 0;
2991  LO bs = getBlockSize();
2992  for(size_t r=0; r<getLocalNumRows(); r++)
2993  {
2994  // move pointer to start of diagonal block
2995  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
2996  for(int b=0; b<bs; b++)
2997  {
2998  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
2999  }
3000  // move pointer to start of next block row
3001  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3002  }
3003 
3004  //diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3005  }
3006 
3007  template<class Scalar, class LO, class GO, class Node>
3008  void
3010  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3011  {
3012  TEUCHOS_TEST_FOR_EXCEPTION(
3013  true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3014  "not implemented.");
3015 
3016  }
3017 
3018  template<class Scalar, class LO, class GO, class Node>
3019  void
3021  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3022  {
3023  TEUCHOS_TEST_FOR_EXCEPTION(
3024  true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3025  "not implemented.");
3026 
3027  }
3028 
3029  template<class Scalar, class LO, class GO, class Node>
3030  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3032  getGraph() const
3033  {
3034  return graphRCP_;
3035  }
3036 
3037  template<class Scalar, class LO, class GO, class Node>
3038  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3041  {
3042  TEUCHOS_TEST_FOR_EXCEPTION(
3043  true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3044  "not implemented.");
3045  }
3046 
3047 } // namespace Tpetra
3048 
3049 //
3050 // Explicit instantiation macro
3051 //
3052 // Must be expanded from within the Tpetra namespace!
3053 //
3054 #define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3055  template class BlockCrsMatrix< S, LO, GO, NODE >;
3056 
3057 #endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
std::string description() const override
One-line description of this object.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
LO local_ordinal_type
The type of local indices.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
bool isConstantStride() const
Whether this multivector has constant stride between columns.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
One or more distributed dense vectors.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Linear algebra kernels for small dense matrices and vectors.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
size_t getLocalNumRows() const override
get the local number of block rows
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
give an option to use layoutleft
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
MultiVector for multiple degrees of freedom per mesh point.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, and the small dense vectors in BlockMultiVector and BlockVector.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix&#39;s communicator.
size_t global_size_t
Global size_t object.
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don&#39;t currently exist.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
global_size_t getGlobalNumRows() const override
get the global number of block rows
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
Replace old value with maximum of magnitudes of old and new values.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
Replace existing values with new values.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
void enableWDVTracking()
Enable WrappedDualView reference-count tracking and syncing. Call this after exiting a host-parallel ...
Replace old values with zero.
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
char packet_type
Implementation detail; tells.
local_matrix_device_type getLocalMatrixDevice() const
void disableWDVTracking()
Disable WrappedDualView reference-count tracking and syncing. Call this before entering a host-parall...
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
A distributed dense vector.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row&#39;s entries.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix&#39;s values (val_).
Base class for distributed Tpetra objects that support data redistribution.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row. ...