Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
11 #define TPETRA_BLOCKCRSMATRIX_DEF_HPP
12 
15 
19 #include "Tpetra_BlockMultiVector.hpp"
20 #include "Tpetra_BlockView.hpp"
21 
22 #include "Teuchos_TimeMonitor.hpp"
23 #ifdef HAVE_TPETRA_DEBUG
24 #include <set>
25 #endif // HAVE_TPETRA_DEBUG
26 
27 #include "KokkosSparse_spmv.hpp"
28 
29 namespace Tpetra {
30 
31 namespace Impl {
32 
33 template <typename T>
34 struct BlockCrsRowStruct {
35  T totalNumEntries, totalNumBytes, maxRowLength;
36  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
37  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct& b) = default;
38  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
39  : totalNumEntries(numEnt)
40  , totalNumBytes(numBytes)
41  , maxRowLength(rowLength) {}
42 };
43 
44 template <typename T>
45 static KOKKOS_INLINE_FUNCTION void operator+=(BlockCrsRowStruct<T>& a, const BlockCrsRowStruct<T>& b) {
46  a.totalNumEntries += b.totalNumEntries;
47  a.totalNumBytes += b.totalNumBytes;
48  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
49 }
50 
51 template <typename T, typename ExecSpace>
52 struct BlockCrsReducer {
53  typedef BlockCrsReducer reducer;
54  typedef T value_type;
55  typedef Kokkos::View<value_type, ExecSpace, Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
56  value_type* value;
57 
58  KOKKOS_INLINE_FUNCTION
59  BlockCrsReducer(value_type& val)
60  : value(&val) {}
61 
62  KOKKOS_INLINE_FUNCTION void join(value_type& dst, value_type& src) const { dst += src; }
63  KOKKOS_INLINE_FUNCTION void join(value_type& dst, const value_type& src) const { dst += src; }
64  KOKKOS_INLINE_FUNCTION void init(value_type& val) const { val = value_type(); }
65  KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
66  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
67 };
68 
69 } // namespace Impl
70 
71 namespace { // (anonymous)
72 
73 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
74 // version that takes two Kokkos::View arguments).
75 template <class Scalar, class LO, class GO, class Node>
76 class GetLocalDiagCopy {
77  public:
78  typedef typename Node::device_type device_type;
79  typedef size_t diag_offset_type;
80  typedef Kokkos::View<const size_t*, device_type,
81  Kokkos::MemoryUnmanaged>
82  diag_offsets_type;
83  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
84  typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
85  typedef typename local_graph_device_type::row_map_type row_offsets_type;
86  typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
87  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
88  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
89 
90  // Constructor
91  GetLocalDiagCopy(const diag_type& diag,
92  const values_type& val,
93  const diag_offsets_type& diagOffsets,
94  const row_offsets_type& ptr,
95  const LO blockSize)
96  : diag_(diag)
97  , diagOffsets_(diagOffsets)
98  , ptr_(ptr)
99  , blockSize_(blockSize)
100  , offsetPerBlock_(blockSize_ * blockSize_)
101  , val_(val) {}
102 
103  KOKKOS_INLINE_FUNCTION void
104  operator()(const LO& lclRowInd) const {
105  using Kokkos::ALL;
106 
107  // Get row offset
108  const size_t absOffset = ptr_[lclRowInd];
109 
110  // Get offset relative to start of row
111  const size_t relOffset = diagOffsets_[lclRowInd];
112 
113  // Get the total offset
114  const size_t pointOffset = (absOffset + relOffset) * offsetPerBlock_;
115 
116  // Get a view of the block. BCRS currently uses LayoutRight
117  // regardless of the device.
118  typedef Kokkos::View<const IST**, Impl::BlockCrsMatrixLittleBlockArrayLayout,
119  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
120  const_little_block_type;
121  const_little_block_type D_in(val_.data() + pointOffset,
122  blockSize_, blockSize_);
123  auto D_out = Kokkos::subview(diag_, lclRowInd, ALL(), ALL());
124  COPY(D_in, D_out);
125  }
126 
127  private:
128  diag_type diag_;
129  diag_offsets_type diagOffsets_;
130  row_offsets_type ptr_;
131  LO blockSize_;
132  LO offsetPerBlock_;
133  values_type val_;
134 };
135 } // namespace
136 
137 template <class Scalar, class LO, class GO, class Node>
138 std::ostream&
139 BlockCrsMatrix<Scalar, LO, GO, Node>::
140  markLocalErrorAndGetStream() {
141  *(this->localError_) = true;
142  if ((*errs_).is_null()) {
143  *errs_ = Teuchos::rcp(new std::ostringstream());
144  }
145  return **errs_;
146 }
147 
148 template <class Scalar, class LO, class GO, class Node>
151  : dist_object_type(Teuchos::rcp(new map_type()))
152  , // nonnull, so DistObject doesn't throw
153  graph_(Teuchos::rcp(new map_type()), 0)
154  , // FIXME (mfh 16 May 2014) no empty ctor yet
155  blockSize_(static_cast<LO>(0))
156  , X_colMap_(new Teuchos::RCP<BMV>())
157  , // ptr to a null ptr
158  Y_rowMap_(new Teuchos::RCP<BMV>())
159  , // ptr to a null ptr
160  pointImporter_(new Teuchos::RCP<typename crs_graph_type::import_type>())
161  , offsetPerBlock_(0)
162  , localError_(new bool(false))
163  , errs_(new Teuchos::RCP<std::ostringstream>()) // ptr to a null ptr
164 {
165 }
166 
167 template <class Scalar, class LO, class GO, class Node>
170  const LO blockSize)
171  : dist_object_type(graph.getMap())
172  , graph_(graph)
173  , rowMeshMap_(*(graph.getRowMap()))
174  , blockSize_(blockSize)
175  , X_colMap_(new Teuchos::RCP<BMV>())
176  , // ptr to a null ptr
177  Y_rowMap_(new Teuchos::RCP<BMV>())
178  , // ptr to a null ptr
179  pointImporter_(new Teuchos::RCP<typename crs_graph_type::import_type>())
180  , offsetPerBlock_(blockSize * blockSize)
181  , localError_(new bool(false))
182  , errs_(new Teuchos::RCP<std::ostringstream>()) // ptr to a null ptr
183 {
185  TEUCHOS_TEST_FOR_EXCEPTION(
186  !graph_.isSorted(), std::invalid_argument,
187  "Tpetra::"
188  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
189  "rows (isSorted() is false). This class assumes sorted rows.");
190 
191  graphRCP_ = Teuchos::rcpFromRef(graph_);
192 
193  // Trick to test whether LO is nonpositive, without a compiler
194  // warning in case LO is unsigned (which is generally a bad idea
195  // anyway). I don't promise that the trick works, but it
196  // generally does with gcc at least, in my experience.
197  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
198  TEUCHOS_TEST_FOR_EXCEPTION(
199  blockSizeIsNonpositive, std::invalid_argument,
200  "Tpetra::"
201  "BlockCrsMatrix constructor: The input blockSize = "
202  << blockSize << " <= 0. The block size must be positive.");
203 
204  domainPointMap_ = BMV::makePointMap(*(graph.getDomainMap()), blockSize);
205  rangePointMap_ = BMV::makePointMap(*(graph.getRangeMap()), blockSize);
206 
207  {
208  // These are rcp
209  const auto domainPointMap = getDomainMap();
210  const auto colPointMap = Teuchos::rcp(new typename BMV::map_type(BMV::makePointMap(*graph_.getColMap(), blockSize_)));
211  *pointImporter_ = Teuchos::rcp(new typename crs_graph_type::import_type(domainPointMap, colPointMap));
212  }
213  {
214  auto local_graph_h = graph.getLocalGraphHost();
215  auto ptr_h = local_graph_h.row_map;
216  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
217  Kokkos::deep_copy(ptrHost_, ptr_h);
218 
219  auto ind_h = local_graph_h.entries;
220  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
221  // DEEP_COPY REVIEW - HOST-TO-HOST
222  Kokkos::deep_copy(indHost_, ind_h);
223 
224  const auto numValEnt = ind_h.extent(0) * offsetPerBlock();
225  val_ = decltype(val_)(impl_scalar_type_dualview("val", numValEnt));
226  }
227 }
228 
229 template <class Scalar, class LO, class GO, class Node>
232  const typename local_matrix_device_type::values_type& values,
233  const LO blockSize)
234  : dist_object_type(graph.getMap())
235  , graph_(graph)
236  , rowMeshMap_(*(graph.getRowMap()))
237  , blockSize_(blockSize)
238  , X_colMap_(new Teuchos::RCP<BMV>())
239  , // ptr to a null ptr
240  Y_rowMap_(new Teuchos::RCP<BMV>())
241  , // ptr to a null ptr
242  pointImporter_(new Teuchos::RCP<typename crs_graph_type::import_type>())
243  , offsetPerBlock_(blockSize * blockSize)
244  , localError_(new bool(false))
245  , errs_(new Teuchos::RCP<std::ostringstream>()) // ptr to a null ptr
246 {
248  TEUCHOS_TEST_FOR_EXCEPTION(
249  !graph_.isSorted(), std::invalid_argument,
250  "Tpetra::"
251  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
252  "rows (isSorted() is false). This class assumes sorted rows.");
253 
254  graphRCP_ = Teuchos::rcpFromRef(graph_);
255 
256  // Trick to test whether LO is nonpositive, without a compiler
257  // warning in case LO is unsigned (which is generally a bad idea
258  // anyway). I don't promise that the trick works, but it
259  // generally does with gcc at least, in my experience.
260  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
261  TEUCHOS_TEST_FOR_EXCEPTION(
262  blockSizeIsNonpositive, std::invalid_argument,
263  "Tpetra::"
264  "BlockCrsMatrix constructor: The input blockSize = "
265  << blockSize << " <= 0. The block size must be positive.");
266 
267  domainPointMap_ = BMV::makePointMap(*(graph.getDomainMap()), blockSize);
268  rangePointMap_ = BMV::makePointMap(*(graph.getRangeMap()), blockSize);
269 
270  {
271  // These are rcp
272  const auto domainPointMap = getDomainMap();
273  const auto colPointMap = Teuchos::rcp(new typename BMV::map_type(BMV::makePointMap(*graph_.getColMap(), blockSize_)));
274  *pointImporter_ = Teuchos::rcp(new typename crs_graph_type::import_type(domainPointMap, colPointMap));
275  }
276  {
277  auto local_graph_h = graph.getLocalGraphHost();
278  auto ptr_h = local_graph_h.row_map;
279  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
280  Kokkos::deep_copy(ptrHost_, ptr_h);
281 
282  auto ind_h = local_graph_h.entries;
283  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
284  Kokkos::deep_copy(indHost_, ind_h);
285 
286  const auto numValEnt = ind_h.extent(0) * offsetPerBlock();
287  TEUCHOS_ASSERT_EQUALITY(numValEnt, values.size());
288  val_ = decltype(val_)(values);
289  }
290 }
291 
292 template <class Scalar, class LO, class GO, class Node>
295  const map_type& domainPointMap,
296  const map_type& rangePointMap,
297  const LO blockSize)
298  : dist_object_type(graph.getMap())
299  , graph_(graph)
300  , rowMeshMap_(*(graph.getRowMap()))
301  , domainPointMap_(domainPointMap)
302  , rangePointMap_(rangePointMap)
303  , blockSize_(blockSize)
304  , X_colMap_(new Teuchos::RCP<BMV>())
305  , // ptr to a null ptr
306  Y_rowMap_(new Teuchos::RCP<BMV>())
307  , // ptr to a null ptr
308  pointImporter_(new Teuchos::RCP<typename crs_graph_type::import_type>())
309  , offsetPerBlock_(blockSize * blockSize)
310  , localError_(new bool(false))
311  , errs_(new Teuchos::RCP<std::ostringstream>()) // ptr to a null ptr
312 {
313  TEUCHOS_TEST_FOR_EXCEPTION(
314  !graph_.isSorted(), std::invalid_argument,
315  "Tpetra::"
316  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
317  "rows (isSorted() is false). This class assumes sorted rows.");
318 
319  graphRCP_ = Teuchos::rcpFromRef(graph_);
320 
321  // Trick to test whether LO is nonpositive, without a compiler
322  // warning in case LO is unsigned (which is generally a bad idea
323  // anyway). I don't promise that the trick works, but it
324  // generally does with gcc at least, in my experience.
325  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
326  TEUCHOS_TEST_FOR_EXCEPTION(
327  blockSizeIsNonpositive, std::invalid_argument,
328  "Tpetra::"
329  "BlockCrsMatrix constructor: The input blockSize = "
330  << blockSize << " <= 0. The block size must be positive.");
331  {
332  // These are rcp
333  const auto rcpDomainPointMap = getDomainMap();
334  const auto colPointMap = Teuchos::rcp(new typename BMV::map_type(BMV::makePointMap(*graph_.getColMap(), blockSize_)));
335  *pointImporter_ = Teuchos::rcp(new typename crs_graph_type::import_type(rcpDomainPointMap, colPointMap));
336  }
337  {
338  auto local_graph_h = graph.getLocalGraphHost();
339  auto ptr_h = local_graph_h.row_map;
340  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
341  // DEEP_COPY REVIEW - HOST-TO-HOST
342  Kokkos::deep_copy(ptrHost_, ptr_h);
343 
344  auto ind_h = local_graph_h.entries;
345  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
346  // DEEP_COPY REVIEW - HOST-TO-HOST
347  Kokkos::deep_copy(indHost_, ind_h);
348 
349  const auto numValEnt = ind_h.extent(0) * offsetPerBlock();
350  val_ = decltype(val_)(impl_scalar_type_dualview("val", numValEnt));
351  }
352 }
353 
354 template <class Scalar, class LO, class GO, class Node>
355 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
357  getDomainMap() const { // Copy constructor of map_type does a shallow copy.
358  // We're only returning by RCP for backwards compatibility.
359  return Teuchos::rcp(new map_type(domainPointMap_));
360 }
361 
362 template <class Scalar, class LO, class GO, class Node>
363 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
365  getRangeMap() const { // Copy constructor of map_type does a shallow copy.
366  // We're only returning by RCP for backwards compatibility.
367  return Teuchos::rcp(new map_type(rangePointMap_));
368 }
369 
370 template <class Scalar, class LO, class GO, class Node>
371 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
373  getRowMap() const {
374  return graph_.getRowMap();
375 }
376 
377 template <class Scalar, class LO, class GO, class Node>
378 Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
380  getColMap() const {
381  return graph_.getColMap();
382 }
383 
384 template <class Scalar, class LO, class GO, class Node>
388  return graph_.getGlobalNumRows();
389 }
390 
391 template <class Scalar, class LO, class GO, class Node>
392 size_t
395  return graph_.getLocalNumRows();
396 }
397 
398 template <class Scalar, class LO, class GO, class Node>
399 size_t
402  return graph_.getLocalMaxNumRowEntries();
403 }
404 
405 template <class Scalar, class LO, class GO, class Node>
407  apply(const mv_type& X,
408  mv_type& Y,
409  Teuchos::ETransp mode,
410  Scalar alpha,
411  Scalar beta) const {
412  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
413  TEUCHOS_TEST_FOR_EXCEPTION(
414  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
415  std::invalid_argument,
416  "Tpetra::BlockCrsMatrix::apply: "
417  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
418  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
419 
420  TEUCHOS_TEST_FOR_EXCEPTION(
421  !X.isConstantStride() || !Y.isConstantStride(),
422  std::invalid_argument,
423  "Tpetra::BlockCrsMatrix::apply: "
424  "X and Y must both be constant stride");
425 
426  BMV X_view;
427  BMV Y_view;
428  const LO blockSize = getBlockSize();
429  try {
430  X_view = BMV(X, *(graph_.getDomainMap()), blockSize);
431  Y_view = BMV(Y, *(graph_.getRangeMap()), blockSize);
432  } catch (std::invalid_argument& e) {
433  TEUCHOS_TEST_FOR_EXCEPTION(
434  true, std::invalid_argument,
435  "Tpetra::BlockCrsMatrix::"
436  "apply: Either the input MultiVector X or the output MultiVector Y "
437  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
438  "graph. BlockMultiVector's constructor threw the following exception: "
439  << e.what());
440  }
441 
442  try {
443  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
444  // either that or mark fields of this class as 'mutable'. The
445  // problem is that applyBlock wants to do lazy initialization of
446  // temporary block multivectors.
447  const_cast<this_BCRS_type&>(*this).applyBlock(X_view, Y_view, mode, alpha, beta);
448  } catch (std::invalid_argument& e) {
449  TEUCHOS_TEST_FOR_EXCEPTION(
450  true, std::invalid_argument,
451  "Tpetra::BlockCrsMatrix::"
452  "apply: The implementation method applyBlock complained about having "
453  "an invalid argument. It reported the following: "
454  << e.what());
455  } catch (std::logic_error& e) {
456  TEUCHOS_TEST_FOR_EXCEPTION(
457  true, std::invalid_argument,
458  "Tpetra::BlockCrsMatrix::"
459  "apply: The implementation method applyBlock complained about a "
460  "possible bug in its implementation. It reported the following: "
461  << e.what());
462  } catch (std::exception& e) {
463  TEUCHOS_TEST_FOR_EXCEPTION(
464  true, std::invalid_argument,
465  "Tpetra::BlockCrsMatrix::"
466  "apply: The implementation method applyBlock threw an exception which "
467  "is neither std::invalid_argument nor std::logic_error, but is a "
468  "subclass of std::exception. It reported the following: "
469  << e.what());
470  } catch (...) {
471  TEUCHOS_TEST_FOR_EXCEPTION(
472  true, std::logic_error,
473  "Tpetra::BlockCrsMatrix::"
474  "apply: The implementation method applyBlock threw an exception which "
475  "is not an instance of a subclass of std::exception. This probably "
476  "indicates a bug. Please report this to the Tpetra developers.");
477  }
478 }
479 
480 template <class Scalar, class LO, class GO, class Node>
484  Teuchos::ETransp mode,
485  const Scalar alpha,
486  const Scalar beta) {
487  TEUCHOS_TEST_FOR_EXCEPTION(
488  X.getBlockSize() != Y.getBlockSize(), std::invalid_argument,
489  "Tpetra::BlockCrsMatrix::applyBlock: "
490  "X and Y have different block sizes. X.getBlockSize() = "
491  << X.getBlockSize() << " != Y.getBlockSize() = "
492  << Y.getBlockSize() << ".");
493 
494  if (mode == Teuchos::NO_TRANS) {
495  applyBlockNoTrans(X, Y, alpha, beta);
496  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
497  applyBlockTrans(X, Y, mode, alpha, beta);
498  } else {
499  TEUCHOS_TEST_FOR_EXCEPTION(
500  true, std::invalid_argument,
501  "Tpetra::BlockCrsMatrix::"
502  "applyBlock: Invalid 'mode' argument. Valid values are "
503  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
504  }
505 }
506 
507 template <class Scalar, class LO, class GO, class Node>
510  const Import<LO, GO, Node>& importer) const {
511  using Teuchos::RCP;
512  using Teuchos::rcp;
513  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
514 
515  // Right now, we make many assumptions...
516  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
517  "destMatrix is required to be null.");
518 
519  // BlockCrsMatrix requires a complete graph at construction.
520  // So first step is to import and fill complete the destGraph.
521  RCP<crs_graph_type> srcGraph = rcp(new crs_graph_type(this->getCrsGraph()));
522  RCP<crs_graph_type> destGraph = importAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, importer,
523  srcGraph->getDomainMap(),
524  srcGraph->getRangeMap());
525 
526  // Final step, create and import the destMatrix.
527  destMatrix = rcp(new this_BCRS_type(*destGraph, getBlockSize()));
528  destMatrix->doImport(*this, importer, Tpetra::INSERT);
529 }
530 
531 template <class Scalar, class LO, class GO, class Node>
534  const Export<LO, GO, Node>& exporter) const {
535  using Teuchos::RCP;
536  using Teuchos::rcp;
537  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
538 
539  // Right now, we make many assumptions...
540  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
541  "destMatrix is required to be null.");
542 
543  // BlockCrsMatrix requires a complete graph at construction.
544  // So first step is to import and fill complete the destGraph.
545  RCP<crs_graph_type> srcGraph = rcp(new crs_graph_type(this->getCrsGraph()));
546  RCP<crs_graph_type> destGraph = exportAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, exporter,
547  srcGraph->getDomainMap(),
548  srcGraph->getRangeMap());
549 
550  // Final step, create and import the destMatrix.
551  destMatrix = rcp(new this_BCRS_type(*destGraph, getBlockSize()));
552  destMatrix->doExport(*this, exporter, Tpetra::INSERT);
553 }
554 
555 template <class Scalar, class LO, class GO, class Node>
557  setAllToScalar(const Scalar& alpha) {
558  auto val_d = val_.getDeviceView(Access::OverwriteAll);
559  Kokkos::deep_copy(val_d, alpha);
560 }
561 
562 template <class Scalar, class LO, class GO, class Node>
564  replaceLocalValues(const LO localRowInd,
565  const LO colInds[],
566  const Scalar vals[],
567  const LO numColInds) const {
568  std::vector<ptrdiff_t> offsets(numColInds);
569  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
570  const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
571  return validCount;
572 }
573 
574 template <class Scalar, class LO, class GO, class Node>
576  getLocalDiagOffsets(const Kokkos::View<size_t*, device_type,
577  Kokkos::MemoryUnmanaged>& offsets) const {
578  graph_.getLocalDiagOffsets(offsets);
579 }
580 
581 template <class Scalar, class LO, class GO, class Node>
583  getLocalDiagCopy(const Kokkos::View<impl_scalar_type***, device_type,
584  Kokkos::MemoryUnmanaged>& diag,
585  const Kokkos::View<const size_t*, device_type,
586  Kokkos::MemoryUnmanaged>& offsets) const {
587  using Kokkos::parallel_for;
588  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
589 
590  const LO lclNumMeshRows = static_cast<LO>(rowMeshMap_.getLocalNumElements());
591  const LO blockSize = this->getBlockSize();
592  TEUCHOS_TEST_FOR_EXCEPTION(static_cast<LO>(diag.extent(0)) < lclNumMeshRows ||
593  static_cast<LO>(diag.extent(1)) < blockSize ||
594  static_cast<LO>(diag.extent(2)) < blockSize,
595  std::invalid_argument, prefix << "The input Kokkos::View is not big enough to hold all the data.");
596  TEUCHOS_TEST_FOR_EXCEPTION(static_cast<LO>(offsets.size()) < lclNumMeshRows, std::invalid_argument,
597  prefix << "offsets.size() = " << offsets.size() << " < local number of "
598  "diagonal blocks "
599  << lclNumMeshRows << ".");
600 
601  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
602  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
603 
604  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
605  // we reserve the right to do lazy allocation of device data. (We
606  // don't plan to do lazy allocation for host data; the host
607  // version of the data always exists.)
608  auto val_d = val_.getDeviceView(Access::ReadOnly);
609  parallel_for(policy_type(0, lclNumMeshRows),
610  functor_type(diag, val_d, offsets,
611  graph_.getLocalGraphDevice().row_map, blockSize_));
612 }
613 
614 template <class Scalar, class LO, class GO, class Node>
616  absMaxLocalValues(const LO localRowInd,
617  const LO colInds[],
618  const Scalar vals[],
619  const LO numColInds) const {
620  std::vector<ptrdiff_t> offsets(numColInds);
621  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
622  const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
623  return validCount;
624 }
625 
626 template <class Scalar, class LO, class GO, class Node>
628  sumIntoLocalValues(const LO localRowInd,
629  const LO colInds[],
630  const Scalar vals[],
631  const LO numColInds) const {
632  std::vector<ptrdiff_t> offsets(numColInds);
633  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets.data(), colInds, numColInds);
634  const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets.data(), vals, numOffsets);
635  return validCount;
636 }
637 template <class Scalar, class LO, class GO, class Node>
639  getLocalRowCopy(LO LocalRow,
640  nonconst_local_inds_host_view_type& Indices,
641  nonconst_values_host_view_type& Values,
642  size_t& NumEntries) const {
643  auto vals = getValuesHost(LocalRow);
644  const LO numInds = ptrHost_(LocalRow + 1) - ptrHost_(LocalRow);
645  if (numInds > (LO)Indices.extent(0) || numInds * blockSize_ * blockSize_ > (LO)Values.extent(0)) {
646  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
647  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
648  << numInds << " row entries");
649  }
650  const LO* colInds = indHost_.data() + ptrHost_(LocalRow);
651  for (LO i = 0; i < numInds; ++i) {
652  Indices[i] = colInds[i];
653  }
654  for (LO i = 0; i < numInds * blockSize_ * blockSize_; ++i) {
655  Values[i] = vals[i];
656  }
657  NumEntries = numInds;
658 }
659 
660 template <class Scalar, class LO, class GO, class Node>
662  getLocalRowOffsets(const LO localRowInd,
663  ptrdiff_t offsets[],
664  const LO colInds[],
665  const LO numColInds) const {
666  if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
667  // We got no offsets, because the input local row index is
668  // invalid on the calling process. That may not be an error, if
669  // numColInds is zero anyway; it doesn't matter. This is the
670  // advantage of returning the number of valid indices.
671  return static_cast<LO>(0);
672  }
673 
674  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid();
675  LO hint = 0; // Guess for the relative offset into the current row
676  LO validCount = 0; // number of valid column indices in colInds
677 
678  for (LO k = 0; k < numColInds; ++k) {
679  const LO relBlockOffset =
680  this->findRelOffsetOfColumnIndex(localRowInd, colInds[k], hint);
681  if (relBlockOffset != LINV) {
682  offsets[k] = static_cast<ptrdiff_t>(relBlockOffset);
683  hint = relBlockOffset + 1;
684  ++validCount;
685  }
686  }
687  return validCount;
688 }
689 
690 template <class Scalar, class LO, class GO, class Node>
692  replaceLocalValuesByOffsets(const LO localRowInd,
693  const ptrdiff_t offsets[],
694  const Scalar vals[],
695  const LO numOffsets) const {
696  if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
697  // We modified no values, because the input local row index is
698  // invalid on the calling process. That may not be an error, if
699  // numColInds is zero anyway; it doesn't matter. This is the
700  // advantage of returning the number of valid indices.
701  return static_cast<LO>(0);
702  }
703  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*>(vals);
704  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
705  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
706  impl_scalar_type* vOut = val_out.data();
707 
708  const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
709  const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid();
710  size_t pointOffset = 0; // Current offset into input values
711  LO validCount = 0; // number of valid offsets
712 
713  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
714  const size_t blockOffset = offsets[k] * perBlockSize;
715  if (offsets[k] != STINV) {
716  little_block_type A_old = getNonConstLocalBlockFromInput(vOut, blockOffset);
717  const_little_block_type A_new = getConstLocalBlockFromInput(vIn, pointOffset);
718  COPY(A_new, A_old);
719  ++validCount;
720  }
721  }
722  return validCount;
723 }
724 
725 template <class Scalar, class LO, class GO, class Node>
727  absMaxLocalValuesByOffsets(const LO localRowInd,
728  const ptrdiff_t offsets[],
729  const Scalar vals[],
730  const LO numOffsets) const {
731  if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
732  // We modified no values, because the input local row index is
733  // invalid on the calling process. That may not be an error, if
734  // numColInds is zero anyway; it doesn't matter. This is the
735  // advantage of returning the number of valid indices.
736  return static_cast<LO>(0);
737  }
738  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*>(vals);
739  auto val_out = getValuesHost(localRowInd);
740  impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
741 
742  const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
743  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid();
744  size_t pointOffset = 0; // Current offset into input values
745  LO validCount = 0; // number of valid offsets
746 
747  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
748  const size_t blockOffset = offsets[k] * perBlockSize;
749  if (blockOffset != STINV) {
750  little_block_type A_old = getNonConstLocalBlockFromInput(vOut, blockOffset);
751  const_little_block_type A_new = getConstLocalBlockFromInput(vIn, pointOffset);
752  ::Tpetra::Impl::absMax(A_old, A_new);
753  ++validCount;
754  }
755  }
756  return validCount;
757 }
758 
759 template <class Scalar, class LO, class GO, class Node>
761  sumIntoLocalValuesByOffsets(const LO localRowInd,
762  const ptrdiff_t offsets[],
763  const Scalar vals[],
764  const LO numOffsets) const {
765  if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
766  // We modified no values, because the input local row index is
767  // invalid on the calling process. That may not be an error, if
768  // numColInds is zero anyway; it doesn't matter. This is the
769  // advantage of returning the number of valid indices.
770  return static_cast<LO>(0);
771  }
772  const impl_scalar_type ONE = static_cast<impl_scalar_type>(1.0);
773  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*>(vals);
774  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
775  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
776  impl_scalar_type* vOut = val_out.data();
777 
778  const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
779  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid();
780  size_t pointOffset = 0; // Current offset into input values
781  LO validCount = 0; // number of valid offsets
782 
783  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
784  const size_t blockOffset = offsets[k] * perBlockSize;
785  if (blockOffset != STINV) {
786  little_block_type A_old = getNonConstLocalBlockFromInput(vOut, blockOffset);
787  const_little_block_type A_new = getConstLocalBlockFromInput(vIn, pointOffset);
788  AXPY(ONE, A_new, A_old);
789  ++validCount;
790  }
791  }
792  return validCount;
793 }
794 
795 template <class Scalar, class LO, class GO, class Node>
797  impl_scalar_type_dualview::t_host::const_type
799  getValuesHost() const {
800  return val_.getHostView(Access::ReadOnly);
801 }
802 
803 template <class Scalar, class LO, class GO, class Node>
804 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
805  impl_scalar_type_dualview::t_dev::const_type
806  BlockCrsMatrix<Scalar, LO, GO, Node>::
807  getValuesDevice() const {
808  return val_.getDeviceView(Access::ReadOnly);
809 }
810 
811 template <class Scalar, class LO, class GO, class Node>
812 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
813  impl_scalar_type_dualview::t_host
816  return val_.getHostView(Access::ReadWrite);
817 }
818 
819 template <class Scalar, class LO, class GO, class Node>
821  impl_scalar_type_dualview::t_dev
823  getValuesDeviceNonConst() const {
824  return val_.getDeviceView(Access::ReadWrite);
825 }
826 
827 template <class Scalar, class LO, class GO, class Node>
828 typename BlockCrsMatrix<Scalar, LO, GO, Node>::
829  impl_scalar_type_dualview::t_host::const_type
830  BlockCrsMatrix<Scalar, LO, GO, Node>::
831  getValuesHost(const LO& lclRow) const {
832  const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
833  auto val = val_.getHostView(Access::ReadOnly);
834  auto r_val = Kokkos::subview(val, Kokkos::pair<LO, LO>(ptrHost_(lclRow) * perBlockSize, ptrHost_(lclRow + 1) * perBlockSize));
835  return r_val;
836 }
837 
838 template <class Scalar, class LO, class GO, class Node>
840  impl_scalar_type_dualview::t_dev::const_type
842  getValuesDevice(const LO& lclRow) const {
843  const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
844  auto val = val_.getDeviceView(Access::ReadOnly);
845  auto r_val = Kokkos::subview(val, Kokkos::pair<LO, LO>(ptrHost_(lclRow) * perBlockSize, ptrHost_(lclRow + 1) * perBlockSize));
846  return r_val;
847 }
848 
849 template <class Scalar, class LO, class GO, class Node>
852  getValuesHostNonConst(const LO& lclRow) {
853  const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
854  auto val = val_.getHostView(Access::ReadWrite);
855  auto r_val = Kokkos::subview(val, Kokkos::pair<LO, LO>(ptrHost_(lclRow) * perBlockSize, ptrHost_(lclRow + 1) * perBlockSize));
856  return r_val;
857 }
858 
859 template <class Scalar, class LO, class GO, class Node>
862  getValuesDeviceNonConst(const LO& lclRow) {
863  const size_t perBlockSize = static_cast<LO>(offsetPerBlock());
864  auto val = val_.getDeviceView(Access::ReadWrite);
865  auto r_val = Kokkos::subview(val, Kokkos::pair<LO, LO>(ptrHost_(lclRow) * perBlockSize, ptrHost_(lclRow + 1) * perBlockSize));
866  return r_val;
867 }
868 
869 template <class Scalar, class LO, class GO, class Node>
870 size_t
872  getNumEntriesInLocalRow(const LO localRowInd) const {
873  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow(localRowInd);
874  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid()) {
875  return static_cast<LO>(0); // the calling process doesn't have that row
876  } else {
877  return static_cast<LO>(numEntInGraph);
878  }
879 }
880 
881 template <class Scalar, class LO, class GO, class Node>
882 typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
885  auto numCols = this->graph_.getColMap()->getLocalNumElements();
886  auto val = val_.getDeviceView(Access::ReadWrite);
887  const LO blockSize = this->getBlockSize();
888  const auto graph = this->graph_.getLocalGraphDevice();
889 
890  return local_matrix_device_type("Tpetra::BlockCrsMatrix::lclMatrixDevice",
891  numCols,
892  val,
893  graph,
894  blockSize);
895 }
896 
897 template <class Scalar, class LO, class GO, class Node>
901  const Teuchos::ETransp mode,
902  const Scalar alpha,
903  const Scalar beta) {
904  (void)X;
905  (void)Y;
906  (void)mode;
907  (void)alpha;
908  (void)beta;
909 
910  TEUCHOS_TEST_FOR_EXCEPTION(
911  true, std::logic_error,
912  "Tpetra::BlockCrsMatrix::apply: "
913  "transpose and conjugate transpose modes are not yet implemented.");
914 }
915 
916 template <class Scalar, class LO, class GO, class Node>
917 void BlockCrsMatrix<Scalar, LO, GO, Node>::
918  applyBlockNoTrans(const BlockMultiVector<Scalar, LO, GO, Node>& X,
919  BlockMultiVector<Scalar, LO, GO, Node>& Y,
920  const Scalar alpha,
921  const Scalar beta) {
922  using Teuchos::RCP;
923  using Teuchos::rcp;
924  typedef ::Tpetra::Import<LO, GO, Node> import_type;
925  typedef ::Tpetra::Export<LO, GO, Node> export_type;
926  const Scalar zero = STS::zero();
927  const Scalar one = STS::one();
928  RCP<const import_type> import = graph_.getImporter();
929  // "export" is a reserved C++ keyword, so we can't use it.
930  RCP<const export_type> theExport = graph_.getExporter();
931  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
932 
933  if (alpha == zero) {
934  if (beta == zero) {
935  Y.putScalar(zero); // replace Inf or NaN (BLAS rules)
936  } else if (beta != one) {
937  Y.scale(beta);
938  }
939  } else { // alpha != 0
940  const BMV* X_colMap = NULL;
941  if (import.is_null()) {
942  try {
943  X_colMap = &X;
944  } catch (std::exception& e) {
945  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, prefix << "Tpetra::MultiVector::"
946  "operator= threw an exception: "
947  << std::endl
948  << e.what());
949  }
950  } else {
951  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
952  // Y_rowMap_ below. This lets us do lazy initialization
953  // correctly with view semantics of BlockCrsMatrix. All views
954  // of this BlockCrsMatrix have the same outer pointer. That
955  // way, we can set the inner pointer in one view, and all
956  // other views will see it.
957  if ((*X_colMap_).is_null() ||
958  (**X_colMap_).getNumVectors() != X.getNumVectors() ||
959  (**X_colMap_).getBlockSize() != X.getBlockSize()) {
960  *X_colMap_ = rcp(new BMV(*(graph_.getColMap()), getBlockSize(),
961  static_cast<LO>(X.getNumVectors())));
962  }
963  (*X_colMap_)->getMultiVectorView().doImport(X.getMultiVectorView(), **pointImporter_, ::Tpetra::REPLACE);
964  try {
965  X_colMap = &(**X_colMap_);
966  } catch (std::exception& e) {
967  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, prefix << "Tpetra::MultiVector::"
968  "operator= threw an exception: "
969  << std::endl
970  << e.what());
971  }
972  }
973 
974  BMV* Y_rowMap = NULL;
975  if (theExport.is_null()) {
976  Y_rowMap = &Y;
977  } else if ((*Y_rowMap_).is_null() ||
978  (**Y_rowMap_).getNumVectors() != Y.getNumVectors() ||
979  (**Y_rowMap_).getBlockSize() != Y.getBlockSize()) {
980  *Y_rowMap_ = rcp(new BMV(*(graph_.getRowMap()), getBlockSize(),
981  static_cast<LO>(X.getNumVectors())));
982  try {
983  Y_rowMap = &(**Y_rowMap_);
984  } catch (std::exception& e) {
985  TEUCHOS_TEST_FOR_EXCEPTION(
986  true, std::logic_error, prefix << "Tpetra::MultiVector::"
987  "operator= threw an exception: "
988  << std::endl
989  << e.what());
990  }
991  }
992 
993  try {
994  localApplyBlockNoTrans(*X_colMap, *Y_rowMap, alpha, beta);
995  } catch (std::exception& e) {
996  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
997  "an exception: "
998  << e.what());
999  } catch (...) {
1000  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1001  "an exception not a subclass of std::exception.");
1002  }
1003 
1004  if (!theExport.is_null()) {
1005  Y.doExport(*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1006  }
1007  }
1008 }
1009 
1010 template <class Scalar, class LO, class GO, class Node>
1011 void BlockCrsMatrix<Scalar, LO, GO, Node>::localApplyBlockNoTrans(
1012  const BlockMultiVector<Scalar, LO, GO, Node>& X,
1013  BlockMultiVector<Scalar, LO, GO, Node>& Y, const Scalar alpha,
1014  const Scalar beta) {
1015  ::Tpetra::Details::ProfilingRegion profile_region(
1016  "Tpetra::BlockCrsMatrix::localApplyBlockNoTrans");
1017  const impl_scalar_type alpha_impl = alpha;
1018  const auto graph = this->graph_.getLocalGraphDevice();
1019 
1020  mv_type X_mv = X.getMultiVectorView();
1021  mv_type Y_mv = Y.getMultiVectorView();
1022  auto X_lcl = X_mv.getLocalViewDevice(Access::ReadOnly);
1023  auto Y_lcl = Y_mv.getLocalViewDevice(Access::ReadWrite);
1024 
1025  auto A_lcl = getLocalMatrixDevice();
1026  if (!applyHelper.get()) {
1027  // The apply helper does not exist, so create it
1028  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map);
1029  }
1030  if (applyHelper->shouldUseIntRowptrs()) {
1031  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
1032  KokkosSparse::spmv(
1033  &applyHelper->handle_int, KokkosSparse::NoTranspose,
1034  alpha_impl, A_lcl_int_rowptrs, X_lcl, beta, Y_lcl);
1035  } else {
1036  KokkosSparse::spmv(
1037  &applyHelper->handle, KokkosSparse::NoTranspose,
1038  alpha_impl, A_lcl, X_lcl, beta, Y_lcl);
1039  }
1040 }
1041 
1042 template <class Scalar, class LO, class GO, class Node>
1043 LO BlockCrsMatrix<Scalar, LO, GO, Node>::
1044  findRelOffsetOfColumnIndex(const LO localRowIndex,
1045  const LO colIndexToFind,
1046  const LO hint) const {
1047  const size_t absStartOffset = ptrHost_[localRowIndex];
1048  const size_t absEndOffset = ptrHost_[localRowIndex + 1];
1049  const LO numEntriesInRow = static_cast<LO>(absEndOffset - absStartOffset);
1050  // Amortize pointer arithmetic over the search loop.
1051  const LO* const curInd = indHost_.data() + absStartOffset;
1052 
1053  // If the hint was correct, then the hint is the offset to return.
1054  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1055  // Always return the offset relative to the current row.
1056  return hint;
1057  }
1058 
1059  // The hint was wrong, so we must search for the given column
1060  // index in the column indices for the given row.
1061  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid();
1062 
1063  // We require that the graph have sorted rows. However, binary
1064  // search only pays if the current row is longer than a certain
1065  // amount. We set this to 32, but you might want to tune this.
1066  const LO maxNumEntriesForLinearSearch = 32;
1067  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1068  // Use binary search. It would probably be better for us to
1069  // roll this loop by hand. If we wrote it right, a smart
1070  // compiler could perhaps use conditional loads and avoid
1071  // branches (according to Jed Brown on May 2014).
1072  const LO* beg = curInd;
1073  const LO* end = curInd + numEntriesInRow;
1074  std::pair<const LO*, const LO*> p =
1075  std::equal_range(beg, end, colIndexToFind);
1076  if (p.first != p.second) {
1077  // offset is relative to the current row
1078  relOffset = static_cast<LO>(p.first - beg);
1079  }
1080  } else { // use linear search
1081  for (LO k = 0; k < numEntriesInRow; ++k) {
1082  if (colIndexToFind == curInd[k]) {
1083  relOffset = k; // offset is relative to the current row
1084  break;
1085  }
1086  }
1087  }
1088 
1089  return relOffset;
1090 }
1091 
1092 template <class Scalar, class LO, class GO, class Node>
1093 LO BlockCrsMatrix<Scalar, LO, GO, Node>::
1094  offsetPerBlock() const {
1095  return offsetPerBlock_;
1096 }
1097 
1098 template <class Scalar, class LO, class GO, class Node>
1099 typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1100 BlockCrsMatrix<Scalar, LO, GO, Node>::
1101  getConstLocalBlockFromInput(const impl_scalar_type* val,
1102  const size_t pointOffset) const {
1103  // Row major blocks
1104  const LO rowStride = blockSize_;
1105  return const_little_block_type(val + pointOffset, blockSize_, rowStride);
1106 }
1107 
1108 template <class Scalar, class LO, class GO, class Node>
1109 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1110 BlockCrsMatrix<Scalar, LO, GO, Node>::
1111  getNonConstLocalBlockFromInput(impl_scalar_type* val,
1112  const size_t pointOffset) const {
1113  // Row major blocks
1114  const LO rowStride = blockSize_;
1115  return little_block_type(val + pointOffset, blockSize_, rowStride);
1116 }
1117 
1118 template <class Scalar, class LO, class GO, class Node>
1119 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1120 BlockCrsMatrix<Scalar, LO, GO, Node>::
1121  getNonConstLocalBlockFromInputHost(impl_scalar_type* val,
1122  const size_t pointOffset) const {
1123  // Row major blocks
1124  const LO rowStride = blockSize_;
1125  const size_t bs2 = blockSize_ * blockSize_;
1126  return little_block_host_type(val + bs2 * pointOffset, blockSize_, rowStride);
1127 }
1128 
1129 template <class Scalar, class LO, class GO, class Node>
1130 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1131 BlockCrsMatrix<Scalar, LO, GO, Node>::
1132  getLocalBlockDeviceNonConst(const LO localRowInd, const LO localColInd) const {
1133  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1134 
1135  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1136  const LO relBlockOffset = this->findRelOffsetOfColumnIndex(localRowInd, localColInd);
1137  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid()) {
1138  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1139  auto vals = const_cast<this_BCRS_type&>(*this).getValuesDeviceNonConst();
1140  auto r_val = getNonConstLocalBlockFromInput(vals.data(), absBlockOffset);
1141  return r_val;
1142  } else {
1143  return little_block_type();
1144  }
1145 }
1146 
1147 template <class Scalar, class LO, class GO, class Node>
1148 typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1149 BlockCrsMatrix<Scalar, LO, GO, Node>::
1150  getLocalBlockHostNonConst(const LO localRowInd, const LO localColInd) const {
1151  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1152 
1153  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1154  const LO relBlockOffset = this->findRelOffsetOfColumnIndex(localRowInd, localColInd);
1155  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid()) {
1156  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1157  auto vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst();
1158  auto r_val = getNonConstLocalBlockFromInputHost(vals.data(), absBlockOffset);
1159  return r_val;
1160  } else {
1161  return little_block_host_type();
1162  }
1163 }
1164 
1165 template <class Scalar, class LO, class GO, class Node>
1166 bool BlockCrsMatrix<Scalar, LO, GO, Node>::
1167  checkSizes(const ::Tpetra::SrcDistObject& source) {
1168  using std::endl;
1169  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1170  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type*>(&source);
1171 
1172  if (src == NULL) {
1173  std::ostream& err = markLocalErrorAndGetStream();
1174  err << "checkSizes: The source object of the Import or Export "
1175  "must be a BlockCrsMatrix with the same template parameters as the "
1176  "target object."
1177  << endl;
1178  } else {
1179  // Use a string of ifs, not if-elseifs, because we want to know
1180  // all the errors.
1181  if (src->getBlockSize() != this->getBlockSize()) {
1182  std::ostream& err = markLocalErrorAndGetStream();
1183  err << "checkSizes: The source and target objects of the Import or "
1184  << "Export must have the same block sizes. The source's block "
1185  << "size = " << src->getBlockSize() << " != the target's block "
1186  << "size = " << this->getBlockSize() << "." << endl;
1187  }
1188  if (!src->graph_.isFillComplete()) {
1189  std::ostream& err = markLocalErrorAndGetStream();
1190  err << "checkSizes: The source object of the Import or Export is "
1191  "not fill complete. Both source and target objects must be fill "
1192  "complete."
1193  << endl;
1194  }
1195  if (!this->graph_.isFillComplete()) {
1196  std::ostream& err = markLocalErrorAndGetStream();
1197  err << "checkSizes: The target object of the Import or Export is "
1198  "not fill complete. Both source and target objects must be fill "
1199  "complete."
1200  << endl;
1201  }
1202  if (src->graph_.getColMap().is_null()) {
1203  std::ostream& err = markLocalErrorAndGetStream();
1204  err << "checkSizes: The source object of the Import or Export does "
1205  "not have a column Map. Both source and target objects must have "
1206  "column Maps."
1207  << endl;
1208  }
1209  if (this->graph_.getColMap().is_null()) {
1210  std::ostream& err = markLocalErrorAndGetStream();
1211  err << "checkSizes: The target object of the Import or Export does "
1212  "not have a column Map. Both source and target objects must have "
1213  "column Maps."
1214  << endl;
1215  }
1216  }
1217 
1218  return !(*(this->localError_));
1219 }
1220 
1221 template <class Scalar, class LO, class GO, class Node>
1223  copyAndPermute(const ::Tpetra::SrcDistObject& source,
1224  const size_t numSameIDs,
1225  const Kokkos::DualView<const local_ordinal_type*,
1226  buffer_device_type>& permuteToLIDs,
1227  const Kokkos::DualView<const local_ordinal_type*,
1228  buffer_device_type>& permuteFromLIDs,
1229  const CombineMode /*CM*/) {
1230  using std::endl;
1231  using ::Tpetra::Details::Behavior;
1233  using ::Tpetra::Details::ProfilingRegion;
1234  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1235 
1236  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
1237  const bool debug = Behavior::debug();
1238  const bool verbose = Behavior::verbose();
1239 
1240  // Define this function prefix
1241  std::string prefix;
1242  {
1243  std::ostringstream os;
1244  const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1245  os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
1246  prefix = os.str();
1247  }
1248 
1249  // check if this already includes a local error
1250  if (*(this->localError_)) {
1251  std::ostream& err = this->markLocalErrorAndGetStream();
1252  err << prefix
1253  << "The target object of the Import or Export is already in an error state."
1254  << endl;
1255  return;
1256  }
1257 
1258  //
1259  // Verbose input dual view status
1260  //
1261  if (verbose) {
1262  std::ostringstream os;
1263  os << prefix << endl
1264  << prefix << " " << dualViewStatusToString(permuteToLIDs, "permuteToLIDs") << endl
1265  << prefix << " " << dualViewStatusToString(permuteFromLIDs, "permuteFromLIDs") << endl;
1266  std::cerr << os.str();
1267  }
1268 
1272  if (permuteToLIDs.extent(0) != permuteFromLIDs.extent(0)) {
1273  std::ostream& err = this->markLocalErrorAndGetStream();
1274  err << prefix
1275  << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent(0)
1276  << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1277  << "." << endl;
1278  return;
1279  }
1280  if (permuteToLIDs.need_sync_host() || permuteFromLIDs.need_sync_host()) {
1281  std::ostream& err = this->markLocalErrorAndGetStream();
1282  err << prefix
1283  << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1284  << endl;
1285  return;
1286  }
1287 
1288  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type*>(&source);
1289  if (src == NULL) {
1290  std::ostream& err = this->markLocalErrorAndGetStream();
1291  err << prefix
1292  << "The source (input) object of the Import or "
1293  "Export is either not a BlockCrsMatrix, or does not have the right "
1294  "template parameters. checkSizes() should have caught this. "
1295  "Please report this bug to the Tpetra developers."
1296  << endl;
1297  return;
1298  }
1299 
1300  bool lclErr = false;
1301 #ifdef HAVE_TPETRA_DEBUG
1302  std::set<LO> invalidSrcCopyRows;
1303  std::set<LO> invalidDstCopyRows;
1304  std::set<LO> invalidDstCopyCols;
1305  std::set<LO> invalidDstPermuteCols;
1306  std::set<LO> invalidPermuteFromRows;
1307 #endif // HAVE_TPETRA_DEBUG
1308 
1309  // Copy the initial sequence of rows that are the same.
1310  //
1311  // The two graphs might have different column Maps, so we need to
1312  // do this using global column indices. This is purely local, so
1313  // we only need to check for local sameness of the two column
1314  // Maps.
1315 
1316 #ifdef HAVE_TPETRA_DEBUG
1317  const map_type& srcRowMap = *(src->graph_.getRowMap());
1318 #endif // HAVE_TPETRA_DEBUG
1319  const map_type& dstRowMap = *(this->graph_.getRowMap());
1320  const map_type& srcColMap = *(src->graph_.getColMap());
1321  const map_type& dstColMap = *(this->graph_.getColMap());
1322  const bool canUseLocalColumnIndices = srcColMap.locallySameAs(dstColMap);
1323 
1324  const size_t numPermute = static_cast<size_t>(permuteFromLIDs.extent(0));
1325  if (verbose) {
1326  std::ostringstream os;
1327  os << prefix
1328  << "canUseLocalColumnIndices: "
1329  << (canUseLocalColumnIndices ? "true" : "false")
1330  << ", numPermute: " << numPermute
1331  << endl;
1332  std::cerr << os.str();
1333  }
1334 
1335  const auto permuteToLIDsHost = permuteToLIDs.view_host();
1336  const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1337 
1338  if (canUseLocalColumnIndices) {
1339  // Copy local rows that are the "same" in both source and target.
1340  for (LO localRow = 0; localRow < static_cast<LO>(numSameIDs); ++localRow) {
1341 #ifdef HAVE_TPETRA_DEBUG
1342  if (!srcRowMap.isNodeLocalElement(localRow)) {
1343  lclErr = true;
1344  invalidSrcCopyRows.insert(localRow);
1345  continue; // skip invalid rows
1346  }
1347 #endif // HAVE_TPETRA_DEBUG
1348 
1349  local_inds_host_view_type lclSrcCols;
1350  values_host_view_type vals;
1351  LO numEntries;
1352  // If this call fails, that means the mesh row local index is
1353  // invalid. That means the Import or Export is invalid somehow.
1354  src->getLocalRowView(localRow, lclSrcCols, vals);
1355  numEntries = lclSrcCols.extent(0);
1356  if (numEntries > 0) {
1357  LO err = this->replaceLocalValues(localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1358  if (err != numEntries) {
1359  lclErr = true;
1360  if (!dstRowMap.isNodeLocalElement(localRow)) {
1361 #ifdef HAVE_TPETRA_DEBUG
1362  invalidDstCopyRows.insert(localRow);
1363 #endif // HAVE_TPETRA_DEBUG
1364  } else {
1365  // Once there's an error, there's no sense in saving
1366  // time, so we check whether the column indices were
1367  // invalid. However, only remember which ones were
1368  // invalid in a debug build, because that might take a
1369  // lot of space.
1370  for (LO k = 0; k < numEntries; ++k) {
1371  if (!dstColMap.isNodeLocalElement(lclSrcCols[k])) {
1372  lclErr = true;
1373 #ifdef HAVE_TPETRA_DEBUG
1374  (void)invalidDstCopyCols.insert(lclSrcCols[k]);
1375 #endif // HAVE_TPETRA_DEBUG
1376  }
1377  }
1378  }
1379  }
1380  }
1381  } // for each "same" local row
1382 
1383  // Copy the "permute" local rows.
1384  for (size_t k = 0; k < numPermute; ++k) {
1385  const LO srcLclRow = static_cast<LO>(permuteFromLIDsHost(k));
1386  const LO dstLclRow = static_cast<LO>(permuteToLIDsHost(k));
1387 
1388  local_inds_host_view_type lclSrcCols;
1389  values_host_view_type vals;
1390  LO numEntries;
1391  src->getLocalRowView(srcLclRow, lclSrcCols, vals);
1392  numEntries = lclSrcCols.extent(0);
1393  if (numEntries > 0) {
1394  LO err = this->replaceLocalValues(dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1395  if (err != numEntries) {
1396  lclErr = true;
1397 #ifdef HAVE_TPETRA_DEBUG
1398  for (LO c = 0; c < numEntries; ++c) {
1399  if (!dstColMap.isNodeLocalElement(lclSrcCols[c])) {
1400  invalidDstPermuteCols.insert(lclSrcCols[c]);
1401  }
1402  }
1403 #endif // HAVE_TPETRA_DEBUG
1404  }
1405  }
1406  }
1407  } else { // must convert column indices to global
1408  // Reserve space to store the destination matrix's local column indices.
1409  const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries();
1410  Teuchos::Array<LO> lclDstCols(maxNumEnt);
1411 
1412  // Copy local rows that are the "same" in both source and target.
1413  for (LO localRow = 0; localRow < static_cast<LO>(numSameIDs); ++localRow) {
1414  local_inds_host_view_type lclSrcCols;
1415  values_host_view_type vals;
1416  LO numEntries;
1417 
1418  // If this call fails, that means the mesh row local index is
1419  // invalid. That means the Import or Export is invalid somehow.
1420  try {
1421  src->getLocalRowView(localRow, lclSrcCols, vals);
1422  numEntries = lclSrcCols.extent(0);
1423  } catch (std::exception& e) {
1424  if (debug) {
1425  std::ostringstream os;
1426  const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1427  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1428  << localRow << ", src->getLocalRowView() threw an exception: "
1429  << e.what();
1430  std::cerr << os.str();
1431  }
1432  throw e;
1433  }
1434 
1435  if (numEntries > 0) {
1436  if (static_cast<size_t>(numEntries) > static_cast<size_t>(lclDstCols.size())) {
1437  lclErr = true;
1438  if (debug) {
1439  std::ostringstream os;
1440  const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1441  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1442  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
1443  << maxNumEnt << endl;
1444  std::cerr << os.str();
1445  }
1446  } else {
1447  // Convert the source matrix's local column indices to the
1448  // destination matrix's local column indices.
1449  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view(0, numEntries);
1450  for (LO j = 0; j < numEntries; ++j) {
1451  lclDstColsView[j] = dstColMap.getLocalElement(srcColMap.getGlobalElement(lclSrcCols[j]));
1452  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid()) {
1453  lclErr = true;
1454 #ifdef HAVE_TPETRA_DEBUG
1455  invalidDstCopyCols.insert(lclDstColsView[j]);
1456 #endif // HAVE_TPETRA_DEBUG
1457  }
1458  }
1459  LO err(0);
1460  try {
1461  err = this->replaceLocalValues(localRow, lclDstColsView.getRawPtr(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1462  } catch (std::exception& e) {
1463  if (debug) {
1464  std::ostringstream os;
1465  const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1466  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1467  << localRow << ", this->replaceLocalValues() threw an exception: "
1468  << e.what();
1469  std::cerr << os.str();
1470  }
1471  throw e;
1472  }
1473  if (err != numEntries) {
1474  lclErr = true;
1475  if (debug) {
1476  std::ostringstream os;
1477  const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1478  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
1479  "localRow "
1480  << localRow << ", this->replaceLocalValues "
1481  "returned "
1482  << err << " instead of numEntries = "
1483  << numEntries << endl;
1484  std::cerr << os.str();
1485  }
1486  }
1487  }
1488  }
1489  }
1490 
1491  // Copy the "permute" local rows.
1492  for (size_t k = 0; k < numPermute; ++k) {
1493  const LO srcLclRow = static_cast<LO>(permuteFromLIDsHost(k));
1494  const LO dstLclRow = static_cast<LO>(permuteToLIDsHost(k));
1495 
1496  local_inds_host_view_type lclSrcCols;
1497  values_host_view_type vals;
1498  LO numEntries;
1499 
1500  try {
1501  src->getLocalRowView(srcLclRow, lclSrcCols, vals);
1502  numEntries = lclSrcCols.extent(0);
1503  } catch (std::exception& e) {
1504  if (debug) {
1505  std::ostringstream os;
1506  const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1507  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
1508  "srcLclRow "
1509  << srcLclRow << " and dstLclRow " << dstLclRow
1510  << ", src->getLocalRowView() threw an exception: " << e.what();
1511  std::cerr << os.str();
1512  }
1513  throw e;
1514  }
1515 
1516  if (numEntries > 0) {
1517  if (static_cast<size_t>(numEntries) > static_cast<size_t>(lclDstCols.size())) {
1518  lclErr = true;
1519  } else {
1520  // Convert the source matrix's local column indices to the
1521  // destination matrix's local column indices.
1522  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view(0, numEntries);
1523  for (LO j = 0; j < numEntries; ++j) {
1524  lclDstColsView[j] = dstColMap.getLocalElement(srcColMap.getGlobalElement(lclSrcCols[j]));
1525  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid()) {
1526  lclErr = true;
1527 #ifdef HAVE_TPETRA_DEBUG
1528  invalidDstPermuteCols.insert(lclDstColsView[j]);
1529 #endif // HAVE_TPETRA_DEBUG
1530  }
1531  }
1532  LO err = this->replaceLocalValues(dstLclRow, lclDstColsView.getRawPtr(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1533  if (err != numEntries) {
1534  lclErr = true;
1535  }
1536  }
1537  }
1538  }
1539  }
1540 
1541  if (lclErr) {
1542  std::ostream& err = this->markLocalErrorAndGetStream();
1543 #ifdef HAVE_TPETRA_DEBUG
1544  err << "copyAndPermute: The graph structure of the source of the "
1545  "Import or Export must be a subset of the graph structure of the "
1546  "target. ";
1547  err << "invalidSrcCopyRows = [";
1548  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin();
1549  it != invalidSrcCopyRows.end(); ++it) {
1550  err << *it;
1551  typename std::set<LO>::const_iterator itp1 = it;
1552  itp1++;
1553  if (itp1 != invalidSrcCopyRows.end()) {
1554  err << ",";
1555  }
1556  }
1557  err << "], invalidDstCopyRows = [";
1558  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin();
1559  it != invalidDstCopyRows.end(); ++it) {
1560  err << *it;
1561  typename std::set<LO>::const_iterator itp1 = it;
1562  itp1++;
1563  if (itp1 != invalidDstCopyRows.end()) {
1564  err << ",";
1565  }
1566  }
1567  err << "], invalidDstCopyCols = [";
1568  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin();
1569  it != invalidDstCopyCols.end(); ++it) {
1570  err << *it;
1571  typename std::set<LO>::const_iterator itp1 = it;
1572  itp1++;
1573  if (itp1 != invalidDstCopyCols.end()) {
1574  err << ",";
1575  }
1576  }
1577  err << "], invalidDstPermuteCols = [";
1578  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin();
1579  it != invalidDstPermuteCols.end(); ++it) {
1580  err << *it;
1581  typename std::set<LO>::const_iterator itp1 = it;
1582  itp1++;
1583  if (itp1 != invalidDstPermuteCols.end()) {
1584  err << ",";
1585  }
1586  }
1587  err << "], invalidPermuteFromRows = [";
1588  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin();
1589  it != invalidPermuteFromRows.end(); ++it) {
1590  err << *it;
1591  typename std::set<LO>::const_iterator itp1 = it;
1592  itp1++;
1593  if (itp1 != invalidPermuteFromRows.end()) {
1594  err << ",";
1595  }
1596  }
1597  err << "]" << endl;
1598 #else
1599  err << "copyAndPermute: The graph structure of the source of the "
1600  "Import or Export must be a subset of the graph structure of the "
1601  "target."
1602  << endl;
1603 #endif // HAVE_TPETRA_DEBUG
1604  }
1605 
1606  if (debug) {
1607  std::ostringstream os;
1608  const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1609  const bool lclSuccess = !(*(this->localError_));
1610  os << "*** Proc " << myRank << ": copyAndPermute "
1611  << (lclSuccess ? "succeeded" : "FAILED");
1612  if (lclSuccess) {
1613  os << endl;
1614  } else {
1615  os << ": error messages: " << this->errorMessages(); // comes w/ endl
1616  }
1617  std::cerr << os.str();
1618  }
1619 }
1620 
1621 namespace { // (anonymous)
1622 
1641 template <class LO, class GO>
1642 size_t
1643 packRowCount(const size_t numEnt,
1644  const size_t numBytesPerValue,
1645  const size_t blkSize) {
1646  using ::Tpetra::Details::PackTraits;
1647 
1648  if (numEnt == 0) {
1649  // Empty rows always take zero bytes, to ensure sparsity.
1650  return 0;
1651  } else {
1652  // We store the number of entries as a local index (LO).
1653  LO numEntLO = 0; // packValueCount wants this.
1654  GO gid{};
1655  const size_t numEntLen = PackTraits<LO>::packValueCount(numEntLO);
1656  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount(gid);
1657  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
1658  return numEntLen + gidsLen + valsLen;
1659  }
1660 }
1661 
1672 template <class ST, class LO, class GO>
1673 size_t
1674 unpackRowCount(const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
1675  const size_t offset,
1676  const size_t numBytes,
1677  const size_t /* numBytesPerValue */) {
1678  using Kokkos::subview;
1679  using ::Tpetra::Details::PackTraits;
1680 
1681  if (numBytes == 0) {
1682  // Empty rows always take zero bytes, to ensure sparsity.
1683  return static_cast<size_t>(0);
1684  } else {
1685  LO numEntLO = 0;
1686  const size_t theNumBytes = PackTraits<LO>::packValueCount(numEntLO);
1687  TEUCHOS_TEST_FOR_EXCEPTION(theNumBytes > numBytes, std::logic_error,
1688  "unpackRowCount: "
1689  "theNumBytes = "
1690  << theNumBytes << " < numBytes = " << numBytes
1691  << ".");
1692  const char* const inBuf = imports.data() + offset;
1693  const size_t actualNumBytes = PackTraits<LO>::unpackValue(numEntLO, inBuf);
1694  TEUCHOS_TEST_FOR_EXCEPTION(actualNumBytes > numBytes, std::logic_error,
1695  "unpackRowCount: "
1696  "actualNumBytes = "
1697  << actualNumBytes << " < numBytes = " << numBytes
1698  << ".");
1699  return static_cast<size_t>(numEntLO);
1700  }
1701 }
1702 
1706 template <class ST, class LO, class GO>
1707 size_t
1708 packRowForBlockCrs(const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
1709  const size_t offset,
1710  const size_t numEnt,
1711  const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
1712  const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
1713  const size_t numBytesPerValue,
1714  const size_t blockSize) {
1715  using Kokkos::subview;
1716  using ::Tpetra::Details::PackTraits;
1717 
1718  if (numEnt == 0) {
1719  // Empty rows always take zero bytes, to ensure sparsity.
1720  return 0;
1721  }
1722  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1723 
1724  const GO gid = 0; // packValueCount wants this
1725  const LO numEntLO = static_cast<size_t>(numEnt);
1726 
1727  const size_t numEntBeg = offset;
1728  const size_t numEntLen = PackTraits<LO>::packValueCount(numEntLO);
1729  const size_t gidsBeg = numEntBeg + numEntLen;
1730  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount(gid);
1731  const size_t valsBeg = gidsBeg + gidsLen;
1732  const size_t valsLen = numScalarEnt * numBytesPerValue;
1733 
1734  char* const numEntOut = exports.data() + numEntBeg;
1735  char* const gidsOut = exports.data() + gidsBeg;
1736  char* const valsOut = exports.data() + valsBeg;
1737 
1738  size_t numBytesOut = 0;
1739  int errorCode = 0;
1740  numBytesOut += PackTraits<LO>::packValue(numEntOut, numEntLO);
1741 
1742  {
1743  Kokkos::pair<int, size_t> p;
1744  p = PackTraits<GO>::packArray(gidsOut, gidsIn.data(), numEnt);
1745  errorCode += p.first;
1746  numBytesOut += p.second;
1747 
1748  p = PackTraits<ST>::packArray(valsOut, valsIn.data(), numScalarEnt);
1749  errorCode += p.first;
1750  numBytesOut += p.second;
1751  }
1752 
1753  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1754  TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != expectedNumBytes, std::logic_error,
1755  "packRowForBlockCrs: numBytesOut = " << numBytesOut
1756  << " != expectedNumBytes = " << expectedNumBytes << ".");
1757 
1758  TEUCHOS_TEST_FOR_EXCEPTION(errorCode != 0, std::runtime_error,
1759  "packRowForBlockCrs: "
1760  "PackTraits::packArray returned a nonzero error code");
1761 
1762  return numBytesOut;
1763 }
1764 
1765 // Return the number of bytes actually read / used.
1766 template <class ST, class LO, class GO>
1767 size_t
1768 unpackRowForBlockCrs(const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
1769  const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
1770  const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
1771  const size_t offset,
1772  const size_t numBytes,
1773  const size_t numEnt,
1774  const size_t numBytesPerValue,
1775  const size_t blockSize) {
1776  using ::Tpetra::Details::PackTraits;
1777 
1778  if (numBytes == 0) {
1779  // Rows with zero bytes always have zero entries.
1780  return 0;
1781  }
1782  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1783  TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(imports.extent(0)) <= offset,
1784  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = " << imports.extent(0) << " <= offset = " << offset << ".");
1785  TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(imports.extent(0)) < offset + numBytes,
1786  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = " << imports.extent(0) << " < offset + numBytes = " << (offset + numBytes) << ".");
1787 
1788  const GO gid = 0; // packValueCount wants this
1789  const LO lid = 0; // packValueCount wants this
1790 
1791  const size_t numEntBeg = offset;
1792  const size_t numEntLen = PackTraits<LO>::packValueCount(lid);
1793  const size_t gidsBeg = numEntBeg + numEntLen;
1794  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount(gid);
1795  const size_t valsBeg = gidsBeg + gidsLen;
1796  const size_t valsLen = numScalarEnt * numBytesPerValue;
1797 
1798  const char* const numEntIn = imports.data() + numEntBeg;
1799  const char* const gidsIn = imports.data() + gidsBeg;
1800  const char* const valsIn = imports.data() + valsBeg;
1801 
1802  size_t numBytesOut = 0;
1803  int errorCode = 0;
1804  LO numEntOut;
1805  numBytesOut += PackTraits<LO>::unpackValue(numEntOut, numEntIn);
1806  TEUCHOS_TEST_FOR_EXCEPTION(static_cast<size_t>(numEntOut) != numEnt, std::logic_error,
1807  "unpackRowForBlockCrs: Expected number of entries " << numEnt
1808  << " != actual number of entries " << numEntOut << ".");
1809 
1810  {
1811  Kokkos::pair<int, size_t> p;
1812  p = PackTraits<GO>::unpackArray(gidsOut.data(), gidsIn, numEnt);
1813  errorCode += p.first;
1814  numBytesOut += p.second;
1815 
1816  p = PackTraits<ST>::unpackArray(valsOut.data(), valsIn, numScalarEnt);
1817  errorCode += p.first;
1818  numBytesOut += p.second;
1819  }
1820 
1821  TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != numBytes, std::logic_error,
1822  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1823  << " != numBytes = " << numBytes << ".");
1824 
1825  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1826  TEUCHOS_TEST_FOR_EXCEPTION(numBytesOut != expectedNumBytes, std::logic_error,
1827  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1828  << " != expectedNumBytes = " << expectedNumBytes << ".");
1829 
1830  TEUCHOS_TEST_FOR_EXCEPTION(errorCode != 0, std::runtime_error,
1831  "unpackRowForBlockCrs: "
1832  "PackTraits::unpackArray returned a nonzero error code");
1833 
1834  return numBytesOut;
1835 }
1836 } // namespace
1837 
1838 template <class Scalar, class LO, class GO, class Node>
1840  packAndPrepare(const ::Tpetra::SrcDistObject& source,
1841  const Kokkos::DualView<const local_ordinal_type*,
1842  buffer_device_type>& exportLIDs,
1843  Kokkos::DualView<packet_type*,
1844  buffer_device_type>& exports, // output
1845  Kokkos::DualView<size_t*,
1847  numPacketsPerLID, // output
1848  size_t& constantNumPackets) {
1849  using ::Tpetra::Details::Behavior;
1851  using ::Tpetra::Details::PackTraits;
1852  using ::Tpetra::Details::ProfilingRegion;
1853 
1854  typedef typename Kokkos::View<int*, device_type>::host_mirror_type::execution_space host_exec;
1855 
1856  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1857 
1858  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
1859 
1860  const bool debug = Behavior::debug();
1861  const bool verbose = Behavior::verbose();
1862 
1863  // Define this function prefix
1864  std::string prefix;
1865  {
1866  std::ostringstream os;
1867  const int myRank = this->graph_.getRowMap()->getComm()->getRank();
1868  os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
1869  prefix = os.str();
1870  }
1871 
1872  // check if this already includes a local error
1873  if (*(this->localError_)) {
1874  std::ostream& err = this->markLocalErrorAndGetStream();
1875  err << prefix
1876  << "The target object of the Import or Export is already in an error state."
1877  << std::endl;
1878  return;
1879  }
1880 
1881  //
1882  // Verbose input dual view status
1883  //
1884  if (verbose) {
1885  std::ostringstream os;
1886  os << prefix << std::endl
1887  << prefix << " " << dualViewStatusToString(exportLIDs, "exportLIDs") << std::endl
1888  << prefix << " " << dualViewStatusToString(exports, "exports") << std::endl
1889  << prefix << " " << dualViewStatusToString(numPacketsPerLID, "numPacketsPerLID") << std::endl;
1890  std::cerr << os.str();
1891  }
1892 
1896  if (exportLIDs.extent(0) != numPacketsPerLID.extent(0)) {
1897  std::ostream& err = this->markLocalErrorAndGetStream();
1898  err << prefix
1899  << "exportLIDs.extent(0) = " << exportLIDs.extent(0)
1900  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
1901  << "." << std::endl;
1902  return;
1903  }
1904  if (exportLIDs.need_sync_host()) {
1905  std::ostream& err = this->markLocalErrorAndGetStream();
1906  err << prefix << "exportLIDs be sync'd to host." << std::endl;
1907  return;
1908  }
1909 
1910  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type*>(&source);
1911  if (src == NULL) {
1912  std::ostream& err = this->markLocalErrorAndGetStream();
1913  err << prefix
1914  << "The source (input) object of the Import or "
1915  "Export is either not a BlockCrsMatrix, or does not have the right "
1916  "template parameters. checkSizes() should have caught this. "
1917  "Please report this bug to the Tpetra developers."
1918  << std::endl;
1919  return;
1920  }
1921 
1922  // Graphs and matrices are allowed to have a variable number of
1923  // entries per row. We could test whether all rows have the same
1924  // number of entries, but DistObject can only use this
1925  // optimization if all rows on _all_ processes have the same
1926  // number of entries. Rather than do the all-reduce necessary to
1927  // test for this unlikely case, we tell DistObject (by setting
1928  // constantNumPackets to zero) to assume that different rows may
1929  // have different numbers of entries.
1930  constantNumPackets = 0;
1931 
1932  // const values
1933  const crs_graph_type& srcGraph = src->graph_;
1934  const size_t blockSize = static_cast<size_t>(src->getBlockSize());
1935  const size_t numExportLIDs = exportLIDs.extent(0);
1936  size_t numBytesPerValue(0);
1937  {
1938  auto val_host = val_.getHostView(Access::ReadOnly);
1939  numBytesPerValue =
1940  PackTraits<impl_scalar_type>::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
1941  }
1942 
1943  // Compute the number of bytes ("packets") per row to pack. While
1944  // we're at it, compute the total # of block entries to send, and
1945  // the max # of block entries in any of the rows we're sending.
1946 
1947  Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
1948 
1949  // Graph information is on host; let's do this on host parallel reduce
1950  auto exportLIDsHost = exportLIDs.view_host();
1951  auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
1952  numPacketsPerLID.modify_host();
1953  {
1954  rowReducerStruct = Impl::BlockCrsRowStruct<size_t>();
1955  for (size_t i = 0; i < numExportLIDs; ++i) {
1956  const LO lclRow = exportLIDsHost(i);
1957  size_t numEnt = srcGraph.getNumEntriesInLocalRow(lclRow);
1958  numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid() ? 0 : numEnt);
1959 
1960  const size_t numBytes =
1961  packRowCount<LO, GO>(numEnt, numBytesPerValue, blockSize);
1962  numPacketsPerLIDHost(i) = numBytes;
1963  rowReducerStruct += Impl::BlockCrsRowStruct<size_t>(numEnt, numBytes, numEnt);
1964  }
1965  }
1966 
1967  // Compute the number of bytes ("packets") per row to pack. While
1968  // we're at it, compute the total # of block entries to send, and
1969  // the max # of block entries in any of the rows we're sending.
1970  const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
1971  const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
1972  const size_t maxRowLength = rowReducerStruct.maxRowLength;
1973 
1974  if (verbose) {
1975  std::ostringstream os;
1976  os << prefix
1977  << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
1978  << std::endl;
1979  std::cerr << os.str();
1980  }
1981 
1982  // We use a "struct of arrays" approach to packing each row's
1983  // entries. All the column indices (as global indices) go first,
1984  // then all their owning process ranks, and then the values.
1985  if (exports.extent(0) != totalNumBytes) {
1986  const std::string oldLabel = exports.view_device().label();
1987  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
1988  exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
1989  }
1990  if (totalNumEntries > 0) {
1991  // Current position (in bytes) in the 'exports' output array.
1992  Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs + 1);
1993  {
1994  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs + 1);
1995  Kokkos::parallel_scan(policy,
1996  [=](const size_t& i, size_t& update, const bool& final) {
1997  if (final) offset(i) = update;
1998  update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
1999  });
2000  }
2001  if (offset(numExportLIDs) != totalNumBytes) {
2002  std::ostream& err = this->markLocalErrorAndGetStream();
2003  err << prefix
2004  << "At end of method, the final offset (in bytes) "
2005  << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2006  << totalNumBytes << ". "
2007  << "Please report this bug to the Tpetra developers." << std::endl;
2008  return;
2009  }
2010 
2011  // For each block row of the matrix owned by the calling
2012  // process, pack that block row's column indices and values into
2013  // the exports array.
2014 
2015  // Source matrix's column Map. We verified in checkSizes() that
2016  // the column Map exists (is not null).
2017  const map_type& srcColMap = *(srcGraph.getColMap());
2018 
2019  // Pack the data for each row to send, into the 'exports' buffer.
2020  // exports will be modified on host.
2021  exports.modify_host();
2022  {
2023  typedef Kokkos::TeamPolicy<host_exec> policy_type;
2024  const auto policy =
2025  policy_type(numExportLIDs, 1, 1)
2026  .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO) * maxRowLength));
2027  // The following parallel_for needs const access to the local values of src.
2028  // (the local graph is also accessed on host, but this does not use WDVs).
2029  getValuesHost();
2031  Kokkos::parallel_for(policy,
2032  [=](const typename policy_type::member_type& member) {
2033  const size_t i = member.league_rank();
2034  Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2035  gblColInds(member.team_scratch(0), maxRowLength);
2036 
2037  const LO lclRowInd = exportLIDsHost(i);
2038  local_inds_host_view_type lclColInds;
2039  values_host_view_type vals;
2040 
2041  // It's OK to ignore the return value, since if the calling
2042  // process doesn't own that local row, then the number of
2043  // entries in that row on the calling process is zero.
2044  src->getLocalRowView(lclRowInd, lclColInds, vals);
2045  const size_t numEnt = lclColInds.extent(0);
2046 
2047  // Convert column indices from local to global.
2048  for (size_t j = 0; j < numEnt; ++j)
2049  gblColInds(j) = srcColMap.getGlobalElement(lclColInds(j));
2050 
2051  // Kyungjoo: additional wrapping scratch view is necessary
2052  // the following function interface need the same execution space
2053  // host scratch space somehow is not considered same as the host_exec
2054  // Copy the row's data into the current spot in the exports array.
2055  const size_t numBytes =
2056  packRowForBlockCrs<impl_scalar_type, LO, GO>(exports.view_host(),
2057  offset(i),
2058  numEnt,
2059  Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2060  vals,
2061  numBytesPerValue,
2062  blockSize);
2063 
2064  // numBytes should be same as the difference between offsets
2065  if (debug) {
2066  const size_t offsetDiff = offset(i + 1) - offset(i);
2067  if (numBytes != offsetDiff) {
2068  std::ostringstream os;
2069  os << prefix
2070  << "numBytes computed from packRowForBlockCrs is different from "
2071  << "precomputed offset values, LID = " << i << std::endl;
2072  std::cerr << os.str();
2073  }
2074  }
2075  }); // for each LID (of a row) to send
2077  }
2078  } // if totalNumEntries > 0
2079 
2080  if (debug) {
2081  std::ostringstream os;
2082  const bool lclSuccess = !(*(this->localError_));
2083  os << prefix
2084  << (lclSuccess ? "succeeded" : "FAILED")
2085  << std::endl;
2086  std::cerr << os.str();
2087  }
2088 }
2089 
2090 template <class Scalar, class LO, class GO, class Node>
2092  unpackAndCombine(const Kokkos::DualView<const local_ordinal_type*,
2093  buffer_device_type>& importLIDs,
2094  Kokkos::DualView<packet_type*,
2096  imports,
2097  Kokkos::DualView<size_t*,
2099  numPacketsPerLID,
2100  const size_t /* constantNumPackets */,
2101  const CombineMode combineMode) {
2102  using std::endl;
2103  using ::Tpetra::Details::Behavior;
2105  using ::Tpetra::Details::PackTraits;
2106  using ::Tpetra::Details::ProfilingRegion;
2107  using host_exec =
2108  typename Kokkos::View<int*, device_type>::host_mirror_type::execution_space;
2109 
2110  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2111  const bool verbose = Behavior::verbose();
2112 
2113  // Define this function prefix
2114  std::string prefix;
2115  {
2116  std::ostringstream os;
2117  auto map = this->graph_.getRowMap();
2118  auto comm = map.is_null() ? Teuchos::null : map->getComm();
2119  const int myRank = comm.is_null() ? -1 : comm->getRank();
2120  os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2121  prefix = os.str();
2122  if (verbose) {
2123  os << "Start" << endl;
2124  std::cerr << os.str();
2125  }
2126  }
2127 
2128  // check if this already includes a local error
2129  if (*(this->localError_)) {
2130  std::ostream& err = this->markLocalErrorAndGetStream();
2131  std::ostringstream os;
2132  os << prefix << "{Im/Ex}port target is already in error." << endl;
2133  if (verbose) {
2134  std::cerr << os.str();
2135  }
2136  err << os.str();
2137  return;
2138  }
2139 
2143  if (importLIDs.extent(0) != numPacketsPerLID.extent(0)) {
2144  std::ostream& err = this->markLocalErrorAndGetStream();
2145  std::ostringstream os;
2146  os << prefix << "importLIDs.extent(0) = " << importLIDs.extent(0)
2147  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2148  << "." << endl;
2149  if (verbose) {
2150  std::cerr << os.str();
2151  }
2152  err << os.str();
2153  return;
2154  }
2155 
2156  if (combineMode != ADD && combineMode != INSERT &&
2157  combineMode != REPLACE && combineMode != ABSMAX &&
2158  combineMode != ZERO) {
2159  std::ostream& err = this->markLocalErrorAndGetStream();
2160  std::ostringstream os;
2161  os << prefix
2162  << "Invalid CombineMode value " << combineMode << ". Valid "
2163  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2164  << std::endl;
2165  if (verbose) {
2166  std::cerr << os.str();
2167  }
2168  err << os.str();
2169  return;
2170  }
2171 
2172  if (this->graph_.getColMap().is_null()) {
2173  std::ostream& err = this->markLocalErrorAndGetStream();
2174  std::ostringstream os;
2175  os << prefix << "Target matrix's column Map is null." << endl;
2176  if (verbose) {
2177  std::cerr << os.str();
2178  }
2179  err << os.str();
2180  return;
2181  }
2182 
2183  // Target matrix's column Map. Use to convert the global column
2184  // indices in the receive buffer to local indices. We verified in
2185  // checkSizes() that the column Map exists (is not null).
2186  const map_type& tgtColMap = *(this->graph_.getColMap());
2187 
2188  // Const values
2189  const size_t blockSize = this->getBlockSize();
2190  const size_t numImportLIDs = importLIDs.extent(0);
2191  // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
2192  // default-constructed instance could have a different size than
2193  // other instances. (We assume all nominally constructed
2194  // instances have the same size; that's not the issue here.) This
2195  // could be bad if the calling process has no entries, but other
2196  // processes have entries that they want to send to us.
2197  size_t numBytesPerValue(0);
2198  {
2199  auto val_host = val_.getHostView(Access::ReadOnly);
2200  numBytesPerValue =
2201  PackTraits<impl_scalar_type>::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2202  }
2203  const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries();
2204  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2205 
2206  if (verbose) {
2207  std::ostringstream os;
2208  os << prefix << "combineMode: "
2209  << ::Tpetra::combineModeToString(combineMode)
2210  << ", blockSize: " << blockSize
2211  << ", numImportLIDs: " << numImportLIDs
2212  << ", numBytesPerValue: " << numBytesPerValue
2213  << ", maxRowNumEnt: " << maxRowNumEnt
2214  << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2215  std::cerr << os.str();
2216  }
2217 
2218  if (combineMode == ZERO || numImportLIDs == 0) {
2219  if (verbose) {
2220  std::ostringstream os;
2221  os << prefix << "Nothing to unpack. Done!" << std::endl;
2222  std::cerr << os.str();
2223  }
2224  return;
2225  }
2226 
2227  // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
2228  // we can remove this sync.
2229  if (imports.need_sync_host()) {
2230  imports.sync_host();
2231  }
2232 
2233  // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
2234  // sync'd numPacketsPerLID to host, since it needs to do that in
2235  // order to post MPI_Irecv messages with the correct lengths. A
2236  // hypothetical device-specific boundary exchange implementation
2237  // could possibly receive data without sync'ing lengths to host,
2238  // but we don't need to design for that nonexistent thing yet.
2239 
2240  if (imports.need_sync_host() || numPacketsPerLID.need_sync_host() ||
2241  importLIDs.need_sync_host()) {
2242  std::ostream& err = this->markLocalErrorAndGetStream();
2243  std::ostringstream os;
2244  os << prefix << "All input DualViews must be sync'd to host by now. "
2245  << "imports_nc.need_sync_host()="
2246  << (imports.need_sync_host() ? "true" : "false")
2247  << ", numPacketsPerLID.need_sync_host()="
2248  << (numPacketsPerLID.need_sync_host() ? "true" : "false")
2249  << ", importLIDs.need_sync_host()="
2250  << (importLIDs.need_sync_host() ? "true" : "false")
2251  << "." << endl;
2252  if (verbose) {
2253  std::cerr << os.str();
2254  }
2255  err << os.str();
2256  return;
2257  }
2258 
2259  const auto importLIDsHost = importLIDs.view_host();
2260  const auto numPacketsPerLIDHost = numPacketsPerLID.view_host();
2261 
2262  // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
2263  // loop, by only unpacking on final==true (when we know the
2264  // current offset's value).
2265 
2266  Kokkos::View<size_t*, host_exec> offset("offset", numImportLIDs + 1);
2267  {
2268  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs + 1);
2269  Kokkos::parallel_scan("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2270  [=](const size_t& i, size_t& update, const bool& final) {
2271  if (final) offset(i) = update;
2272  update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2273  });
2274  }
2275 
2276  // this variable does not matter with a race condition (just error flag)
2277  //
2278  // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
2279  // or atomics on bool, so we use int instead. (I know we're not
2280  // launching a CUDA loop here, but we could in the future, and I'd
2281  // like to avoid potential trouble.)
2282  Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2283  errorDuringUnpack("errorDuringUnpack");
2284  errorDuringUnpack() = 0;
2285  {
2286  using policy_type = Kokkos::TeamPolicy<host_exec>;
2287  size_t scratch_per_row = sizeof(GO) * maxRowNumEnt + sizeof(LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt + 2 * sizeof(GO); // Yeah, this is a fudge factor
2288 
2289  const auto policy = policy_type(numImportLIDs, 1, 1)
2290  .set_scratch_size(0, Kokkos::PerTeam(scratch_per_row));
2291  using host_scratch_space = typename host_exec::scratch_memory_space;
2292 
2293  using pair_type = Kokkos::pair<size_t, size_t>;
2294 
2295  // The following parallel_for modifies values on host while unpacking.
2296  getValuesHostNonConst();
2298  Kokkos::parallel_for("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2299  [=, *this](const typename policy_type::member_type& member) {
2300  const size_t i = member.league_rank();
2301  Kokkos::View<GO*, host_scratch_space> gblColInds(member.team_scratch(0), maxRowNumEnt);
2302  Kokkos::View<LO*, host_scratch_space> lclColInds(member.team_scratch(0), maxRowNumEnt);
2303  Kokkos::View<impl_scalar_type*, host_scratch_space> vals(member.team_scratch(0), maxRowNumScalarEnt);
2304 
2305  const size_t offval = offset(i);
2306  const LO lclRow = importLIDsHost(i);
2307  const size_t numBytes = numPacketsPerLIDHost(i);
2308  const size_t numEnt =
2309  unpackRowCount<impl_scalar_type, LO, GO>(imports.view_host(), offval, numBytes, numBytesPerValue);
2310 
2311  if (numBytes > 0) {
2312  if (numEnt > maxRowNumEnt) {
2313  errorDuringUnpack() = 1;
2314  if (verbose) {
2315  std::ostringstream os;
2316  os << prefix
2317  << "At i = " << i << ", numEnt = " << numEnt
2318  << " > maxRowNumEnt = " << maxRowNumEnt
2319  << std::endl;
2320  std::cerr << os.str();
2321  }
2322  return;
2323  }
2324  }
2325  const size_t numScalarEnt = numEnt * blockSize * blockSize;
2326  auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2327  auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2328  auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2329 
2330  // Kyungjoo: additional wrapping scratch view is necessary
2331  // the following function interface need the same execution space
2332  // host scratch space somehow is not considered same as the host_exec
2333  size_t numBytesOut = 0;
2334  try {
2335  numBytesOut =
2336  unpackRowForBlockCrs<impl_scalar_type, LO, GO>(Kokkos::View<GO*, host_exec>(gidsOut.data(), numEnt),
2337  Kokkos::View<impl_scalar_type*, host_exec>(valsOut.data(), numScalarEnt),
2338  imports.view_host(),
2339  offval, numBytes, numEnt,
2340  numBytesPerValue, blockSize);
2341  } catch (std::exception& e) {
2342  errorDuringUnpack() = 1;
2343  if (verbose) {
2344  std::ostringstream os;
2345  os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
2346  << e.what() << endl;
2347  std::cerr << os.str();
2348  }
2349  return;
2350  }
2351 
2352  if (numBytes != numBytesOut) {
2353  errorDuringUnpack() = 1;
2354  if (verbose) {
2355  std::ostringstream os;
2356  os << prefix
2357  << "At i = " << i << ", numBytes = " << numBytes
2358  << " != numBytesOut = " << numBytesOut << "."
2359  << std::endl;
2360  std::cerr << os.str();
2361  }
2362  return;
2363  }
2364 
2365  // Convert incoming global indices to local indices.
2366  for (size_t k = 0; k < numEnt; ++k) {
2367  lidsOut(k) = tgtColMap.getLocalElement(gidsOut(k));
2368  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid()) {
2369  if (verbose) {
2370  std::ostringstream os;
2371  os << prefix
2372  << "At i = " << i << ", GID " << gidsOut(k)
2373  << " is not owned by the calling process."
2374  << std::endl;
2375  std::cerr << os.str();
2376  }
2377  return;
2378  }
2379  }
2380 
2381  // Combine the incoming data with the matrix's current data.
2382  LO numCombd = 0;
2383  const LO* const lidsRaw = const_cast<const LO*>(lidsOut.data());
2384  const Scalar* const valsRaw = reinterpret_cast<const Scalar*>(const_cast<const impl_scalar_type*>(valsOut.data()));
2385  if (combineMode == ADD) {
2386  numCombd = this->sumIntoLocalValues(lclRow, lidsRaw, valsRaw, numEnt);
2387  } else if (combineMode == INSERT || combineMode == REPLACE) {
2388  numCombd = this->replaceLocalValues(lclRow, lidsRaw, valsRaw, numEnt);
2389  } else if (combineMode == ABSMAX) {
2390  numCombd = this->absMaxLocalValues(lclRow, lidsRaw, valsRaw, numEnt);
2391  }
2392 
2393  if (static_cast<LO>(numEnt) != numCombd) {
2394  errorDuringUnpack() = 1;
2395  if (verbose) {
2396  std::ostringstream os;
2397  os << prefix << "At i = " << i << ", numEnt = " << numEnt
2398  << " != numCombd = " << numCombd << "."
2399  << endl;
2400  std::cerr << os.str();
2401  }
2402  return;
2403  }
2404  }); // for each import LID i
2406  }
2407 
2408  if (errorDuringUnpack() != 0) {
2409  std::ostream& err = this->markLocalErrorAndGetStream();
2410  err << prefix << "Unpacking failed.";
2411  if (!verbose) {
2412  err << " Please run again with the environment variable setting "
2413  "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
2414  }
2415  err << endl;
2416  }
2417 
2418  if (verbose) {
2419  std::ostringstream os;
2420  const bool lclSuccess = !(*(this->localError_));
2421  os << prefix
2422  << (lclSuccess ? "succeeded" : "FAILED")
2423  << std::endl;
2424  std::cerr << os.str();
2425  }
2426 }
2427 
2428 template <class Scalar, class LO, class GO, class Node>
2429 std::string
2431  using Teuchos::TypeNameTraits;
2432  std::ostringstream os;
2433  os << "\"Tpetra::BlockCrsMatrix\": { "
2434  << "Template parameters: { "
2435  << "Scalar: " << TypeNameTraits<Scalar>::name()
2436  << "LO: " << TypeNameTraits<LO>::name()
2437  << "GO: " << TypeNameTraits<GO>::name()
2438  << "Node: " << TypeNameTraits<Node>::name()
2439  << " }"
2440  << ", Label: \"" << this->getObjectLabel() << "\""
2441  << ", Global dimensions: ["
2442  << graph_.getDomainMap()->getGlobalNumElements() << ", "
2443  << graph_.getRangeMap()->getGlobalNumElements() << "]"
2444  << ", Block size: " << getBlockSize()
2445  << " }";
2446  return os.str();
2447 }
2448 
2449 template <class Scalar, class LO, class GO, class Node>
2451  describe(Teuchos::FancyOStream& out,
2452  const Teuchos::EVerbosityLevel verbLevel) const {
2453  using Teuchos::ArrayRCP;
2454  using Teuchos::CommRequest;
2455  using Teuchos::FancyOStream;
2456  using Teuchos::getFancyOStream;
2457  using Teuchos::ireceive;
2458  using Teuchos::isend;
2459  using Teuchos::outArg;
2460  using Teuchos::TypeNameTraits;
2461  using Teuchos::VERB_DEFAULT;
2462  using Teuchos::VERB_LOW;
2463  using Teuchos::VERB_MEDIUM;
2464  using Teuchos::VERB_NONE;
2465  // using Teuchos::VERB_HIGH;
2466  using std::endl;
2467  using Teuchos::RCP;
2468  using Teuchos::VERB_EXTREME;
2469  using Teuchos::wait;
2470 
2471  // Set default verbosity if applicable.
2472  const Teuchos::EVerbosityLevel vl =
2473  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
2474 
2475  if (vl == VERB_NONE) {
2476  return; // print nothing
2477  }
2478 
2479  // describe() always starts with a tab before it prints anything.
2480  Teuchos::OSTab tab0(out);
2481 
2482  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
2483  Teuchos::OSTab tab1(out);
2484 
2485  out << "Template parameters:" << endl;
2486  {
2487  Teuchos::OSTab tab2(out);
2488  out << "Scalar: " << TypeNameTraits<Scalar>::name() << endl
2489  << "LO: " << TypeNameTraits<LO>::name() << endl
2490  << "GO: " << TypeNameTraits<GO>::name() << endl
2491  << "Node: " << TypeNameTraits<Node>::name() << endl;
2492  }
2493  out << "Label: \"" << this->getObjectLabel() << "\"" << endl
2494  << "Global dimensions: ["
2495  << graph_.getDomainMap()->getGlobalNumElements() << ", "
2496  << graph_.getRangeMap()->getGlobalNumElements() << "]" << endl;
2497 
2498  const LO blockSize = getBlockSize();
2499  out << "Block size: " << blockSize << endl;
2500 
2501  // constituent objects
2502  if (vl >= VERB_MEDIUM) {
2503  const Teuchos::Comm<int>& comm = *(graph_.getMap()->getComm());
2504  const int myRank = comm.getRank();
2505  if (myRank == 0) {
2506  out << "Row Map:" << endl;
2507  }
2508  getRowMap()->describe(out, vl);
2509 
2510  if (!getColMap().is_null()) {
2511  if (getColMap() == getRowMap()) {
2512  if (myRank == 0) {
2513  out << "Column Map: same as row Map" << endl;
2514  }
2515  } else {
2516  if (myRank == 0) {
2517  out << "Column Map:" << endl;
2518  }
2519  getColMap()->describe(out, vl);
2520  }
2521  }
2522  if (!getDomainMap().is_null()) {
2523  if (getDomainMap() == getRowMap()) {
2524  if (myRank == 0) {
2525  out << "Domain Map: same as row Map" << endl;
2526  }
2527  } else if (getDomainMap() == getColMap()) {
2528  if (myRank == 0) {
2529  out << "Domain Map: same as column Map" << endl;
2530  }
2531  } else {
2532  if (myRank == 0) {
2533  out << "Domain Map:" << endl;
2534  }
2535  getDomainMap()->describe(out, vl);
2536  }
2537  }
2538  if (!getRangeMap().is_null()) {
2539  if (getRangeMap() == getDomainMap()) {
2540  if (myRank == 0) {
2541  out << "Range Map: same as domain Map" << endl;
2542  }
2543  } else if (getRangeMap() == getRowMap()) {
2544  if (myRank == 0) {
2545  out << "Range Map: same as row Map" << endl;
2546  }
2547  } else {
2548  if (myRank == 0) {
2549  out << "Range Map: " << endl;
2550  }
2551  getRangeMap()->describe(out, vl);
2552  }
2553  }
2554  }
2555 
2556  if (vl >= VERB_EXTREME) {
2557  const Teuchos::Comm<int>& comm = *(graph_.getMap()->getComm());
2558  const int myRank = comm.getRank();
2559  const int numProcs = comm.getSize();
2560 
2561  // Print the calling process' data to the given output stream.
2562  RCP<std::ostringstream> lclOutStrPtr(new std::ostringstream());
2563  RCP<FancyOStream> osPtr = getFancyOStream(lclOutStrPtr);
2564  FancyOStream& os = *osPtr;
2565  os << "Process " << myRank << ":" << endl;
2566  Teuchos::OSTab tab2(os);
2567 
2568  const map_type& meshRowMap = *(graph_.getRowMap());
2569  const map_type& meshColMap = *(graph_.getColMap());
2570  for (LO meshLclRow = meshRowMap.getMinLocalIndex();
2571  meshLclRow <= meshRowMap.getMaxLocalIndex();
2572  ++meshLclRow) {
2573  const GO meshGblRow = meshRowMap.getGlobalElement(meshLclRow);
2574  os << "Row " << meshGblRow << ": {";
2575 
2576  local_inds_host_view_type lclColInds;
2577  values_host_view_type vals;
2578  LO numInds = 0;
2579  this->getLocalRowView(meshLclRow, lclColInds, vals);
2580  numInds = lclColInds.extent(0);
2581 
2582  for (LO k = 0; k < numInds; ++k) {
2583  const GO gblCol = meshColMap.getGlobalElement(lclColInds[k]);
2584 
2585  os << "Col " << gblCol << ": [";
2586  for (LO i = 0; i < blockSize; ++i) {
2587  for (LO j = 0; j < blockSize; ++j) {
2588  os << vals[blockSize * blockSize * k + i * blockSize + j];
2589  if (j + 1 < blockSize) {
2590  os << ", ";
2591  }
2592  }
2593  if (i + 1 < blockSize) {
2594  os << "; ";
2595  }
2596  }
2597  os << "]";
2598  if (k + 1 < numInds) {
2599  os << ", ";
2600  }
2601  }
2602  os << "}" << endl;
2603  }
2604 
2605  // Print data on Process 0. This will automatically respect the
2606  // current indentation level.
2607  if (myRank == 0) {
2608  out << lclOutStrPtr->str();
2609  lclOutStrPtr = Teuchos::null; // clear it to save space
2610  }
2611 
2612  const int sizeTag = 1337;
2613  const int dataTag = 1338;
2614 
2615  ArrayRCP<char> recvDataBuf; // only used on Process 0
2616 
2617  // Send string sizes and data from each process in turn to
2618  // Process 0, and print on that process.
2619  for (int p = 1; p < numProcs; ++p) {
2620  if (myRank == 0) {
2621  // Receive the incoming string's length.
2622  ArrayRCP<size_t> recvSize(1);
2623  recvSize[0] = 0;
2624  RCP<CommRequest<int> > recvSizeReq =
2625  ireceive<int, size_t>(recvSize, p, sizeTag, comm);
2626  wait<int>(comm, outArg(recvSizeReq));
2627  const size_t numCharsToRecv = recvSize[0];
2628 
2629  // Allocate space for the string to receive. Reuse receive
2630  // buffer space if possible. We can do this because in the
2631  // current implementation, we only have one receive in
2632  // flight at a time. Leave space for the '\0' at the end,
2633  // in case the sender doesn't send it.
2634  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
2635  recvDataBuf.resize(numCharsToRecv + 1);
2636  }
2637  ArrayRCP<char> recvData = recvDataBuf.persistingView(0, numCharsToRecv);
2638  // Post the receive of the actual string data.
2639  RCP<CommRequest<int> > recvDataReq =
2640  ireceive<int, char>(recvData, p, dataTag, comm);
2641  wait<int>(comm, outArg(recvDataReq));
2642 
2643  // Print the received data. This will respect the current
2644  // indentation level. Make sure that the string is
2645  // null-terminated.
2646  recvDataBuf[numCharsToRecv] = '\0';
2647  out << recvDataBuf.getRawPtr();
2648  } else if (myRank == p) { // if I am not Process 0, and my rank is p
2649  // This deep-copies the string at most twice, depending on
2650  // whether std::string reference counts internally (it
2651  // generally does, so this won't deep-copy at all).
2652  const std::string stringToSend = lclOutStrPtr->str();
2653  lclOutStrPtr = Teuchos::null; // clear original to save space
2654 
2655  // Send the string's length to Process 0.
2656  const size_t numCharsToSend = stringToSend.size();
2657  ArrayRCP<size_t> sendSize(1);
2658  sendSize[0] = numCharsToSend;
2659  RCP<CommRequest<int> > sendSizeReq =
2660  isend<int, size_t>(sendSize, 0, sizeTag, comm);
2661  wait<int>(comm, outArg(sendSizeReq));
2662 
2663  // Send the actual string to Process 0. We know that the
2664  // string has length > 0, so it's save to take the address
2665  // of the first entry. Make a nonowning ArrayRCP to hold
2666  // the string. Process 0 will add a null termination
2667  // character at the end of the string, after it receives the
2668  // message.
2669  ArrayRCP<const char> sendData(&stringToSend[0], 0, numCharsToSend, false);
2670  RCP<CommRequest<int> > sendDataReq =
2671  isend<int, char>(sendData, 0, dataTag, comm);
2672  wait<int>(comm, outArg(sendDataReq));
2673  }
2674  } // for each process rank p other than 0
2675  } // extreme verbosity level (print the whole matrix)
2676 }
2677 
2678 template <class Scalar, class LO, class GO, class Node>
2679 Teuchos::RCP<const Teuchos::Comm<int> >
2681  getComm() const {
2682  return graph_.getComm();
2683 }
2684 
2685 template <class Scalar, class LO, class GO, class Node>
2689  return graph_.getGlobalNumCols();
2690 }
2691 
2692 template <class Scalar, class LO, class GO, class Node>
2693 size_t
2696  return graph_.getLocalNumCols();
2697 }
2698 
2699 template <class Scalar, class LO, class GO, class Node>
2701  getIndexBase() const {
2702  return graph_.getIndexBase();
2703 }
2704 
2705 template <class Scalar, class LO, class GO, class Node>
2709  return graph_.getGlobalNumEntries();
2710 }
2711 
2712 template <class Scalar, class LO, class GO, class Node>
2713 size_t
2716  return graph_.getLocalNumEntries();
2717 }
2718 
2719 template <class Scalar, class LO, class GO, class Node>
2720 size_t
2722  getNumEntriesInGlobalRow(GO globalRow) const {
2723  return graph_.getNumEntriesInGlobalRow(globalRow);
2724 }
2725 
2726 template <class Scalar, class LO, class GO, class Node>
2727 size_t
2730  return graph_.getGlobalMaxNumRowEntries();
2731 }
2732 
2733 template <class Scalar, class LO, class GO, class Node>
2735  hasColMap() const {
2736  return graph_.hasColMap();
2737 }
2738 
2739 template <class Scalar, class LO, class GO, class Node>
2742  return graph_.isLocallyIndexed();
2743 }
2744 
2745 template <class Scalar, class LO, class GO, class Node>
2748  return graph_.isGloballyIndexed();
2749 }
2750 
2751 template <class Scalar, class LO, class GO, class Node>
2754  return graph_.isFillComplete();
2755 }
2756 
2757 template <class Scalar, class LO, class GO, class Node>
2760  return false;
2761 }
2762 
2763 template <class Scalar, class LO, class GO, class Node>
2765  getGlobalRowCopy(GO /*GlobalRow*/,
2766  nonconst_global_inds_host_view_type& /*Indices*/,
2767  nonconst_values_host_view_type& /*Values*/,
2768  size_t& /*NumEntries*/) const {
2769  TEUCHOS_TEST_FOR_EXCEPTION(
2770  true, std::logic_error,
2771  "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
2772  "This class doesn't support global matrix indexing.");
2773 }
2774 
2775 template <class Scalar, class LO, class GO, class Node>
2777  getGlobalRowView(GO /* GlobalRow */,
2778  global_inds_host_view_type& /* indices */,
2779  values_host_view_type& /* values */) const {
2780  TEUCHOS_TEST_FOR_EXCEPTION(
2781  true, std::logic_error,
2782  "Tpetra::BlockCrsMatrix::getGlobalRowView: "
2783  "This class doesn't support global matrix indexing.");
2784 }
2785 
2786 template <class Scalar, class LO, class GO, class Node>
2788  getLocalRowView(LO localRowInd,
2789  local_inds_host_view_type& colInds,
2790  values_host_view_type& vals) const {
2791  if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
2792  colInds = local_inds_host_view_type();
2793  vals = values_host_view_type();
2794  } else {
2795  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2796  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2797  colInds = local_inds_host_view_type(indHost_.data() + absBlockOffsetStart, numInds);
2798 
2799  vals = getValuesHost(localRowInd);
2800  }
2801 }
2802 
2803 template <class Scalar, class LO, class GO, class Node>
2806  local_inds_host_view_type& colInds,
2807  nonconst_values_host_view_type& vals) const {
2808  if (!rowMeshMap_.isNodeLocalElement(localRowInd)) {
2809  colInds = nonconst_local_inds_host_view_type();
2810  vals = nonconst_values_host_view_type();
2811  } else {
2812  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2813  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2814  colInds = local_inds_host_view_type(indHost_.data() + absBlockOffsetStart, numInds);
2815 
2816  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
2817  vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
2818  }
2819 }
2820 
2821 template <class Scalar, class LO, class GO, class Node>
2824  const size_t lclNumMeshRows = graph_.getLocalNumRows();
2825 
2826  Kokkos::View<size_t*, device_type> diagOffsets("diagOffsets", lclNumMeshRows);
2827  graph_.getLocalDiagOffsets(diagOffsets);
2828 
2829  // The code below works on host, so use a host View.
2830  auto diagOffsetsHost = Kokkos::create_mirror_view(diagOffsets);
2831  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
2832  Kokkos::deep_copy(execution_space(), diagOffsetsHost, diagOffsets);
2833 
2834  auto vals_host_out = val_.getHostView(Access::ReadOnly);
2835  const impl_scalar_type* vals_host_out_raw = vals_host_out.data();
2836 
2837  // TODO amk: This is a temporary measure to make the code run with Ifpack2
2838  size_t rowOffset = 0;
2839  size_t offset = 0;
2840  LO bs = getBlockSize();
2841  for (size_t r = 0; r < getLocalNumRows(); r++) {
2842  // move pointer to start of diagonal block
2843  offset = rowOffset + diagOffsetsHost(r) * bs * bs;
2844  for (int b = 0; b < bs; b++) {
2845  diag.replaceLocalValue(r * bs + b, vals_host_out_raw[offset + b * (bs + 1)]);
2846  }
2847  // move pointer to start of next block row
2848  rowOffset += getNumEntriesInLocalRow(r) * bs * bs;
2849  }
2850 
2851  // diag.template sync<memory_space> (); // sync vec of diag entries back to dev
2852 }
2853 
2854 template <class Scalar, class LO, class GO, class Node>
2856  leftScale(const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */) {
2857  TEUCHOS_TEST_FOR_EXCEPTION(
2858  true, std::logic_error,
2859  "Tpetra::BlockCrsMatrix::leftScale: "
2860  "not implemented.");
2861 }
2862 
2863 template <class Scalar, class LO, class GO, class Node>
2865  rightScale(const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */) {
2866  TEUCHOS_TEST_FOR_EXCEPTION(
2867  true, std::logic_error,
2868  "Tpetra::BlockCrsMatrix::rightScale: "
2869  "not implemented.");
2870 }
2871 
2872 template <class Scalar, class LO, class GO, class Node>
2873 Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
2875  getGraph() const {
2876  return graphRCP_;
2877 }
2878 
2879 template <class Scalar, class LO, class GO, class Node>
2880 typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
2883  TEUCHOS_TEST_FOR_EXCEPTION(
2884  true, std::logic_error,
2885  "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
2886  "not implemented.");
2887 }
2888 
2889 } // namespace Tpetra
2890 
2891 //
2892 // Explicit instantiation macro
2893 //
2894 // Must be expanded from within the Tpetra namespace!
2895 //
2896 #define TPETRA_BLOCKCRSMATRIX_INSTANT(S, LO, GO, NODE) \
2897  template class BlockCrsMatrix<S, LO, GO, NODE>;
2898 
2899 #endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
KokkosSparse::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
std::string description() const override
One-line description of this object.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
LO local_ordinal_type
The type of local indices.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
bool isConstantStride() const
Whether this multivector has constant stride between columns.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
One or more distributed dense vectors.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Linear algebra kernels for small dense matrices and vectors.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
size_t getLocalNumRows() const override
get the local number of block rows
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
give an option to use layoutleft
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
MultiVector for multiple degrees of freedom per mesh point.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, and the small dense vectors in BlockMultiVector and BlockVector.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix&#39;s communicator.
size_t global_size_t
Global size_t object.
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don&#39;t currently exist.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
global_size_t getGlobalNumRows() const override
get the global number of block rows
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
Replace old value with maximum of magnitudes of old and new values.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
Replace existing values with new values.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
void enableWDVTracking()
Enable WrappedDualView reference-count tracking and syncing. Call this after exiting a host-parallel ...
Replace old values with zero.
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
char packet_type
Implementation detail; tells.
local_matrix_device_type getLocalMatrixDevice() const
void disableWDVTracking()
Disable WrappedDualView reference-count tracking and syncing. Call this before entering a host-parall...
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
A distributed dense vector.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row&#39;s entries.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix&#39;s values (val_).
Base class for distributed Tpetra objects that support data redistribution.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row. ...