Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_BlockCrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ************************************************************************
38 // @HEADER
39 // clang-format off
40 
41 #ifndef TPETRA_BLOCKCRSMATRIX_DEF_HPP
42 #define TPETRA_BLOCKCRSMATRIX_DEF_HPP
43 
46 
50 #include "Tpetra_BlockMultiVector.hpp"
51 #include "Tpetra_BlockView.hpp"
52 
53 #include "Teuchos_TimeMonitor.hpp"
54 #ifdef HAVE_TPETRA_DEBUG
55 # include <set>
56 #endif // HAVE_TPETRA_DEBUG
57 
58 #include "KokkosSparse_spmv.hpp"
59 
60 namespace Tpetra {
61 
62 namespace Impl {
63 
64  template<typename T>
65  struct BlockCrsRowStruct {
66  T totalNumEntries, totalNumBytes, maxRowLength;
67  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct() = default;
68  KOKKOS_DEFAULTED_FUNCTION BlockCrsRowStruct(const BlockCrsRowStruct &b) = default;
69  KOKKOS_INLINE_FUNCTION BlockCrsRowStruct(const T& numEnt, const T& numBytes, const T& rowLength)
70  : totalNumEntries(numEnt), totalNumBytes(numBytes), maxRowLength(rowLength) {}
71  };
72 
73  template<typename T>
74  static
75  KOKKOS_INLINE_FUNCTION
76  void operator+=(BlockCrsRowStruct<T> &a, const BlockCrsRowStruct<T> &b) {
77  a.totalNumEntries += b.totalNumEntries;
78  a.totalNumBytes += b.totalNumBytes;
79  a.maxRowLength = a.maxRowLength > b.maxRowLength ? a.maxRowLength : b.maxRowLength;
80  }
81 
82  template<typename T, typename ExecSpace>
83  struct BlockCrsReducer {
84  typedef BlockCrsReducer reducer;
85  typedef T value_type;
86  typedef Kokkos::View<value_type,ExecSpace,Kokkos::MemoryTraits<Kokkos::Unmanaged> > result_view_type;
87  value_type *value;
88 
89  KOKKOS_INLINE_FUNCTION
90  BlockCrsReducer(value_type &val) : value(&val) {}
91 
92  KOKKOS_INLINE_FUNCTION void join(value_type &dst, value_type &src) const { dst += src; }
93  KOKKOS_INLINE_FUNCTION void join(value_type &dst, const value_type &src) const { dst += src; }
94  KOKKOS_INLINE_FUNCTION void init(value_type &val) const { val = value_type(); }
95  KOKKOS_INLINE_FUNCTION value_type& reference() { return *value; }
96  KOKKOS_INLINE_FUNCTION result_view_type view() const { return result_view_type(value); }
97  };
98 
99 } // namespace Impl
100 
101 namespace { // (anonymous)
102 
103 // Implementation of BlockCrsMatrix::getLocalDiagCopy (non-deprecated
104 // version that takes two Kokkos::View arguments).
105 template<class Scalar, class LO, class GO, class Node>
106 class GetLocalDiagCopy {
107 public:
108  typedef typename Node::device_type device_type;
109  typedef size_t diag_offset_type;
110  typedef Kokkos::View<const size_t*, device_type,
111  Kokkos::MemoryUnmanaged> diag_offsets_type;
112  typedef typename ::Tpetra::CrsGraph<LO, GO, Node> global_graph_type;
113  typedef typename global_graph_type::local_graph_device_type local_graph_device_type;
114  typedef typename local_graph_device_type::row_map_type row_offsets_type;
115  typedef typename ::Tpetra::BlockMultiVector<Scalar, LO, GO, Node>::impl_scalar_type IST;
116  typedef Kokkos::View<IST***, device_type, Kokkos::MemoryUnmanaged> diag_type;
117  typedef Kokkos::View<const IST*, device_type, Kokkos::MemoryUnmanaged> values_type;
118 
119  // Constructor
120  GetLocalDiagCopy (const diag_type& diag,
121  const values_type& val,
122  const diag_offsets_type& diagOffsets,
123  const row_offsets_type& ptr,
124  const LO blockSize) :
125  diag_ (diag),
126  diagOffsets_ (diagOffsets),
127  ptr_ (ptr),
128  blockSize_ (blockSize),
129  offsetPerBlock_ (blockSize_*blockSize_),
130  val_(val)
131  {}
132 
133  KOKKOS_INLINE_FUNCTION void
134  operator() (const LO& lclRowInd) const
135  {
136  using Kokkos::ALL;
137 
138  // Get row offset
139  const size_t absOffset = ptr_[lclRowInd];
140 
141  // Get offset relative to start of row
142  const size_t relOffset = diagOffsets_[lclRowInd];
143 
144  // Get the total offset
145  const size_t pointOffset = (absOffset+relOffset)*offsetPerBlock_;
146 
147  // Get a view of the block. BCRS currently uses LayoutRight
148  // regardless of the device.
149  typedef Kokkos::View<const IST**, Impl::BlockCrsMatrixLittleBlockArrayLayout,
150  device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged> >
151  const_little_block_type;
152  const_little_block_type D_in (val_.data () + pointOffset,
153  blockSize_, blockSize_);
154  auto D_out = Kokkos::subview (diag_, lclRowInd, ALL (), ALL ());
155  COPY (D_in, D_out);
156  }
157 
158  private:
159  diag_type diag_;
160  diag_offsets_type diagOffsets_;
161  row_offsets_type ptr_;
162  LO blockSize_;
163  LO offsetPerBlock_;
164  values_type val_;
165  };
166 } // namespace (anonymous)
167 
168  template<class Scalar, class LO, class GO, class Node>
169  std::ostream&
170  BlockCrsMatrix<Scalar, LO, GO, Node>::
171  markLocalErrorAndGetStream ()
172  {
173  * (this->localError_) = true;
174  if ((*errs_).is_null ()) {
175  *errs_ = Teuchos::rcp (new std::ostringstream ());
176  }
177  return **errs_;
178  }
179 
180  template<class Scalar, class LO, class GO, class Node>
183  dist_object_type (Teuchos::rcp (new map_type ())), // nonnull, so DistObject doesn't throw
184  graph_ (Teuchos::rcp (new map_type ()), 0), // FIXME (mfh 16 May 2014) no empty ctor yet
185  blockSize_ (static_cast<LO> (0)),
186  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
187  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
188  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
189  offsetPerBlock_ (0),
190  localError_ (new bool (false)),
191  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
192  {
193  }
194 
195  template<class Scalar, class LO, class GO, class Node>
198  const LO blockSize) :
199  dist_object_type (graph.getMap ()),
200  graph_ (graph),
201  rowMeshMap_ (* (graph.getRowMap ())),
202  blockSize_ (blockSize),
203  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
204  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
205  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
206  offsetPerBlock_ (blockSize * blockSize),
207  localError_ (new bool (false)),
208  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
209  {
210 
212  TEUCHOS_TEST_FOR_EXCEPTION(
213  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
214  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
215  "rows (isSorted() is false). This class assumes sorted rows.");
216 
217  graphRCP_ = Teuchos::rcpFromRef(graph_);
218 
219  // Trick to test whether LO is nonpositive, without a compiler
220  // warning in case LO is unsigned (which is generally a bad idea
221  // anyway). I don't promise that the trick works, but it
222  // generally does with gcc at least, in my experience.
223  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
224  TEUCHOS_TEST_FOR_EXCEPTION(
225  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
226  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
227  " <= 0. The block size must be positive.");
228 
229  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
230  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
231 
232  {
233  // These are rcp
234  const auto domainPointMap = getDomainMap();
235  const auto colPointMap = Teuchos::rcp
236  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
237  *pointImporter_ = Teuchos::rcp
238  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
239  }
240  {
241  auto local_graph_h = graph.getLocalGraphHost ();
242  auto ptr_h = local_graph_h.row_map;
243  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
244  Kokkos::deep_copy(ptrHost_, ptr_h);
245 
246  auto ind_h = local_graph_h.entries;
247  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
248  // DEEP_COPY REVIEW - HOST-TO-HOST
249  Kokkos::deep_copy (indHost_, ind_h);
250 
251  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
252  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
253  }
254  }
255 
256  template<class Scalar, class LO, class GO, class Node>
259  const typename local_matrix_device_type::values_type& values,
260  const LO blockSize) :
261  dist_object_type (graph.getMap ()),
262  graph_ (graph),
263  rowMeshMap_ (* (graph.getRowMap ())),
264  blockSize_ (blockSize),
265  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
266  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
267  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
268  offsetPerBlock_ (blockSize * blockSize),
269  localError_ (new bool (false)),
270  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
271  {
273  TEUCHOS_TEST_FOR_EXCEPTION(
274  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
275  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
276  "rows (isSorted() is false). This class assumes sorted rows.");
277 
278  graphRCP_ = Teuchos::rcpFromRef(graph_);
279 
280  // Trick to test whether LO is nonpositive, without a compiler
281  // warning in case LO is unsigned (which is generally a bad idea
282  // anyway). I don't promise that the trick works, but it
283  // generally does with gcc at least, in my experience.
284  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
285  TEUCHOS_TEST_FOR_EXCEPTION(
286  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
287  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
288  " <= 0. The block size must be positive.");
289 
290  domainPointMap_ = BMV::makePointMap (* (graph.getDomainMap ()), blockSize);
291  rangePointMap_ = BMV::makePointMap (* (graph.getRangeMap ()), blockSize);
292 
293  {
294  // These are rcp
295  const auto domainPointMap = getDomainMap();
296  const auto colPointMap = Teuchos::rcp
297  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
298  *pointImporter_ = Teuchos::rcp
299  (new typename crs_graph_type::import_type (domainPointMap, colPointMap));
300  }
301  {
302  auto local_graph_h = graph.getLocalGraphHost ();
303  auto ptr_h = local_graph_h.row_map;
304  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
305  Kokkos::deep_copy(ptrHost_, ptr_h);
306 
307  auto ind_h = local_graph_h.entries;
308  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
309  Kokkos::deep_copy (indHost_, ind_h);
310 
311  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
312  TEUCHOS_ASSERT_EQUALITY(numValEnt, values.size());
313  val_ = decltype (val_) (values);
314  }
315  }
316 
317  template<class Scalar, class LO, class GO, class Node>
320  const map_type& domainPointMap,
321  const map_type& rangePointMap,
322  const LO blockSize) :
323  dist_object_type (graph.getMap ()),
324  graph_ (graph),
325  rowMeshMap_ (* (graph.getRowMap ())),
326  domainPointMap_ (domainPointMap),
327  rangePointMap_ (rangePointMap),
328  blockSize_ (blockSize),
329  X_colMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
330  Y_rowMap_ (new Teuchos::RCP<BMV> ()), // ptr to a null ptr
331  pointImporter_ (new Teuchos::RCP<typename crs_graph_type::import_type> ()),
332  offsetPerBlock_ (blockSize * blockSize),
333  localError_ (new bool (false)),
334  errs_ (new Teuchos::RCP<std::ostringstream> ()) // ptr to a null ptr
335  {
336  TEUCHOS_TEST_FOR_EXCEPTION(
337  ! graph_.isSorted (), std::invalid_argument, "Tpetra::"
338  "BlockCrsMatrix constructor: The input CrsGraph does not have sorted "
339  "rows (isSorted() is false). This class assumes sorted rows.");
340 
341  graphRCP_ = Teuchos::rcpFromRef(graph_);
342 
343  // Trick to test whether LO is nonpositive, without a compiler
344  // warning in case LO is unsigned (which is generally a bad idea
345  // anyway). I don't promise that the trick works, but it
346  // generally does with gcc at least, in my experience.
347  const bool blockSizeIsNonpositive = (blockSize + 1 <= 1);
348  TEUCHOS_TEST_FOR_EXCEPTION(
349  blockSizeIsNonpositive, std::invalid_argument, "Tpetra::"
350  "BlockCrsMatrix constructor: The input blockSize = " << blockSize <<
351  " <= 0. The block size must be positive.");
352  {
353  // These are rcp
354  const auto rcpDomainPointMap = getDomainMap();
355  const auto colPointMap = Teuchos::rcp
356  (new typename BMV::map_type (BMV::makePointMap (*graph_.getColMap(), blockSize_)));
357  *pointImporter_ = Teuchos::rcp
358  (new typename crs_graph_type::import_type (rcpDomainPointMap, colPointMap));
359  }
360  {
361  auto local_graph_h = graph.getLocalGraphHost ();
362  auto ptr_h = local_graph_h.row_map;
363  ptrHost_ = decltype(ptrHost_)(Kokkos::ViewAllocateWithoutInitializing("graph row offset"), ptr_h.extent(0));
364  // DEEP_COPY REVIEW - HOST-TO-HOST
365  Kokkos::deep_copy(ptrHost_, ptr_h);
366 
367  auto ind_h = local_graph_h.entries;
368  indHost_ = decltype(indHost_)(Kokkos::ViewAllocateWithoutInitializing("graph column indices"), ind_h.extent(0));
369  // DEEP_COPY REVIEW - HOST-TO-HOST
370  Kokkos::deep_copy (indHost_, ind_h);
371 
372  const auto numValEnt = ind_h.extent(0) * offsetPerBlock ();
373  val_ = decltype (val_) (impl_scalar_type_dualview("val", numValEnt));
374 
375  }
376  }
377 
378  template<class Scalar, class LO, class GO, class Node>
379  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
382  { // Copy constructor of map_type does a shallow copy.
383  // We're only returning by RCP for backwards compatibility.
384  return Teuchos::rcp (new map_type (domainPointMap_));
385  }
386 
387  template<class Scalar, class LO, class GO, class Node>
388  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
390  getRangeMap () const
391  { // Copy constructor of map_type does a shallow copy.
392  // We're only returning by RCP for backwards compatibility.
393  return Teuchos::rcp (new map_type (rangePointMap_));
394  }
395 
396  template<class Scalar, class LO, class GO, class Node>
397  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
399  getRowMap () const
400  {
401  return graph_.getRowMap();
402  }
403 
404  template<class Scalar, class LO, class GO, class Node>
405  Teuchos::RCP<const typename BlockCrsMatrix<Scalar, LO, GO, Node>::map_type>
407  getColMap () const
408  {
409  return graph_.getColMap();
410  }
411 
412  template<class Scalar, class LO, class GO, class Node>
416  {
417  return graph_.getGlobalNumRows();
418  }
419 
420  template<class Scalar, class LO, class GO, class Node>
421  size_t
424  {
425  return graph_.getLocalNumRows();
426  }
427 
428  template<class Scalar, class LO, class GO, class Node>
429  size_t
432  {
433  return graph_.getLocalMaxNumRowEntries();
434  }
435 
436  template<class Scalar, class LO, class GO, class Node>
437  void
439  apply (const mv_type& X,
440  mv_type& Y,
441  Teuchos::ETransp mode,
442  Scalar alpha,
443  Scalar beta) const
444  {
445  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
446  TEUCHOS_TEST_FOR_EXCEPTION(
447  mode != Teuchos::NO_TRANS && mode != Teuchos::TRANS && mode != Teuchos::CONJ_TRANS,
448  std::invalid_argument, "Tpetra::BlockCrsMatrix::apply: "
449  "Invalid 'mode' argument. Valid values are Teuchos::NO_TRANS, "
450  "Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
451 
452  BMV X_view;
453  BMV Y_view;
454  const LO blockSize = getBlockSize ();
455  try {
456  X_view = BMV (X, * (graph_.getDomainMap ()), blockSize);
457  Y_view = BMV (Y, * (graph_.getRangeMap ()), blockSize);
458  }
459  catch (std::invalid_argument& e) {
460  TEUCHOS_TEST_FOR_EXCEPTION(
461  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
462  "apply: Either the input MultiVector X or the output MultiVector Y "
463  "cannot be viewed as a BlockMultiVector, given this BlockCrsMatrix's "
464  "graph. BlockMultiVector's constructor threw the following exception: "
465  << e.what ());
466  }
467 
468  try {
469  // mfh 16 May 2014: Casting 'this' to nonconst is icky, but it's
470  // either that or mark fields of this class as 'mutable'. The
471  // problem is that applyBlock wants to do lazy initialization of
472  // temporary block multivectors.
473  const_cast<this_BCRS_type&> (*this).applyBlock (X_view, Y_view, mode, alpha, beta);
474  } catch (std::invalid_argument& e) {
475  TEUCHOS_TEST_FOR_EXCEPTION(
476  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
477  "apply: The implementation method applyBlock complained about having "
478  "an invalid argument. It reported the following: " << e.what ());
479  } catch (std::logic_error& e) {
480  TEUCHOS_TEST_FOR_EXCEPTION(
481  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
482  "apply: The implementation method applyBlock complained about a "
483  "possible bug in its implementation. It reported the following: "
484  << e.what ());
485  } catch (std::exception& e) {
486  TEUCHOS_TEST_FOR_EXCEPTION(
487  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
488  "apply: The implementation method applyBlock threw an exception which "
489  "is neither std::invalid_argument nor std::logic_error, but is a "
490  "subclass of std::exception. It reported the following: "
491  << e.what ());
492  } catch (...) {
493  TEUCHOS_TEST_FOR_EXCEPTION(
494  true, std::logic_error, "Tpetra::BlockCrsMatrix::"
495  "apply: The implementation method applyBlock threw an exception which "
496  "is not an instance of a subclass of std::exception. This probably "
497  "indicates a bug. Please report this to the Tpetra developers.");
498  }
499  }
500 
501  template<class Scalar, class LO, class GO, class Node>
502  void
506  Teuchos::ETransp mode,
507  const Scalar alpha,
508  const Scalar beta)
509  {
510  TEUCHOS_TEST_FOR_EXCEPTION(
511  X.getBlockSize () != Y.getBlockSize (), std::invalid_argument,
512  "Tpetra::BlockCrsMatrix::applyBlock: "
513  "X and Y have different block sizes. X.getBlockSize() = "
514  << X.getBlockSize () << " != Y.getBlockSize() = "
515  << Y.getBlockSize () << ".");
516 
517  if (mode == Teuchos::NO_TRANS) {
518  applyBlockNoTrans (X, Y, alpha, beta);
519  } else if (mode == Teuchos::TRANS || mode == Teuchos::CONJ_TRANS) {
520  applyBlockTrans (X, Y, mode, alpha, beta);
521  } else {
522  TEUCHOS_TEST_FOR_EXCEPTION(
523  true, std::invalid_argument, "Tpetra::BlockCrsMatrix::"
524  "applyBlock: Invalid 'mode' argument. Valid values are "
525  "Teuchos::NO_TRANS, Teuchos::TRANS, and Teuchos::CONJ_TRANS.");
526  }
527  }
528 
529  template<class Scalar, class LO, class GO, class Node>
530  void
533  const Import<LO, GO, Node>& importer) const
534  {
535  using Teuchos::RCP;
536  using Teuchos::rcp;
537  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
538 
539  // Right now, we make many assumptions...
540  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
541  "destMatrix is required to be null.");
542 
543  // BlockCrsMatrix requires a complete graph at construction.
544  // So first step is to import and fill complete the destGraph.
545  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
546  RCP<crs_graph_type> destGraph = importAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, importer,
547  srcGraph->getDomainMap(),
548  srcGraph->getRangeMap());
549 
550 
551  // Final step, create and import the destMatrix.
552  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
553  destMatrix->doImport(*this, importer, Tpetra::INSERT);
554  }
555 
556 
557  template<class Scalar, class LO, class GO, class Node>
558  void
561  const Export<LO, GO, Node>& exporter) const
562  {
563  using Teuchos::RCP;
564  using Teuchos::rcp;
565  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
566 
567  // Right now, we make many assumptions...
568  TEUCHOS_TEST_FOR_EXCEPTION(!destMatrix.is_null(), std::invalid_argument,
569  "destMatrix is required to be null.");
570 
571  // BlockCrsMatrix requires a complete graph at construction.
572  // So first step is to import and fill complete the destGraph.
573  RCP<crs_graph_type> srcGraph = rcp (new crs_graph_type(this->getCrsGraph()));
574  RCP<crs_graph_type> destGraph = exportAndFillCompleteCrsGraph<crs_graph_type>(srcGraph, exporter,
575  srcGraph->getDomainMap(),
576  srcGraph->getRangeMap());
577 
578 
579  // Final step, create and import the destMatrix.
580  destMatrix = rcp (new this_BCRS_type (*destGraph, getBlockSize()));
581  destMatrix->doExport(*this, exporter, Tpetra::INSERT);
582  }
583 
584 
585  template<class Scalar, class LO, class GO, class Node>
586  void
588  setAllToScalar (const Scalar& alpha)
589  {
590  auto val_d = val_.getDeviceView(Access::OverwriteAll);
591  Kokkos::deep_copy(val_d, alpha);
592  }
593 
594  template<class Scalar, class LO, class GO, class Node>
595  LO
597  replaceLocalValues (const LO localRowInd,
598  const LO colInds[],
599  const Scalar vals[],
600  const LO numColInds) const
601  {
602  Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
603  offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
604  ptrdiff_t * offsets = offsets_host_view.data();
605  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
606  const LO validCount = this->replaceLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
607  return validCount;
608  }
609 
610  template <class Scalar, class LO, class GO, class Node>
611  void
613  getLocalDiagOffsets (const Kokkos::View<size_t*, device_type,
614  Kokkos::MemoryUnmanaged>& offsets) const
615  {
616  graph_.getLocalDiagOffsets (offsets);
617  }
618 
619  template <class Scalar, class LO, class GO, class Node>
620  void
622  getLocalDiagCopy (const Kokkos::View<impl_scalar_type***, device_type,
623  Kokkos::MemoryUnmanaged>& diag,
624  const Kokkos::View<const size_t*, device_type,
625  Kokkos::MemoryUnmanaged>& offsets) const
626  {
627  using Kokkos::parallel_for;
628  const char prefix[] = "Tpetra::BlockCrsMatrix::getLocalDiagCopy (2-arg): ";
629 
630  const LO lclNumMeshRows = static_cast<LO> (rowMeshMap_.getLocalNumElements ());
631  const LO blockSize = this->getBlockSize ();
632  TEUCHOS_TEST_FOR_EXCEPTION
633  (static_cast<LO> (diag.extent (0)) < lclNumMeshRows ||
634  static_cast<LO> (diag.extent (1)) < blockSize ||
635  static_cast<LO> (diag.extent (2)) < blockSize,
636  std::invalid_argument, prefix <<
637  "The input Kokkos::View is not big enough to hold all the data.");
638  TEUCHOS_TEST_FOR_EXCEPTION
639  (static_cast<LO> (offsets.size ()) < lclNumMeshRows, std::invalid_argument,
640  prefix << "offsets.size() = " << offsets.size () << " < local number of "
641  "diagonal blocks " << lclNumMeshRows << ".");
642 
643  typedef Kokkos::RangePolicy<execution_space, LO> policy_type;
644  typedef GetLocalDiagCopy<Scalar, LO, GO, Node> functor_type;
645 
646  // FIXME (mfh 26 May 2016) Not really OK to const_cast here, since
647  // we reserve the right to do lazy allocation of device data. (We
648  // don't plan to do lazy allocation for host data; the host
649  // version of the data always exists.)
650  auto val_d = val_.getDeviceView(Access::ReadOnly);
651  parallel_for (policy_type (0, lclNumMeshRows),
652  functor_type (diag, val_d, offsets,
653  graph_.getLocalGraphDevice ().row_map, blockSize_));
654  }
655 
656  template<class Scalar, class LO, class GO, class Node>
657  LO
659  absMaxLocalValues (const LO localRowInd,
660  const LO colInds[],
661  const Scalar vals[],
662  const LO numColInds) const
663  {
664  Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
665  offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
666  ptrdiff_t * offsets = offsets_host_view.data();
667  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
668  const LO validCount = this->absMaxLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
669  return validCount;
670  }
671 
672 
673  template<class Scalar, class LO, class GO, class Node>
674  LO
676  sumIntoLocalValues (const LO localRowInd,
677  const LO colInds[],
678  const Scalar vals[],
679  const LO numColInds) const
680  {
681  Kokkos::View<ptrdiff_t*,Kokkos::HostSpace>
682  offsets_host_view(Kokkos::ViewAllocateWithoutInitializing("offsets"), numColInds);
683  ptrdiff_t * offsets = offsets_host_view.data();
684  const LO numOffsets = this->getLocalRowOffsets(localRowInd, offsets, colInds, numColInds);
685  const LO validCount = this->sumIntoLocalValuesByOffsets(localRowInd, offsets, vals, numOffsets);
686  return validCount;
687  }
688  template<class Scalar, class LO, class GO, class Node>
689  void
691  getLocalRowCopy (LO LocalRow,
692  nonconst_local_inds_host_view_type &Indices,
693  nonconst_values_host_view_type &Values,
694  size_t& NumEntries) const
695  {
696  auto vals = getValuesHost(LocalRow);
697  const LO numInds = ptrHost_(LocalRow+1) - ptrHost_(LocalRow);
698  if (numInds > (LO)Indices.extent(0) || numInds*blockSize_*blockSize_ > (LO)Values.extent(0)) {
699  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error,
700  "Tpetra::BlockCrsMatrix::getLocalRowCopy : Column and/or values array is not large enough to hold "
701  << numInds << " row entries");
702  }
703  const LO * colInds = indHost_.data() + ptrHost_(LocalRow);
704  for (LO i=0; i<numInds; ++i) {
705  Indices[i] = colInds[i];
706  }
707  for (LO i=0; i<numInds*blockSize_*blockSize_; ++i) {
708  Values[i] = vals[i];
709  }
710  NumEntries = numInds;
711  }
712 
713  template<class Scalar, class LO, class GO, class Node>
714  LO
716  getLocalRowOffsets (const LO localRowInd,
717  ptrdiff_t offsets[],
718  const LO colInds[],
719  const LO numColInds) const
720  {
721  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
722  // We got no offsets, because the input local row index is
723  // invalid on the calling process. That may not be an error, if
724  // numColInds is zero anyway; it doesn't matter. This is the
725  // advantage of returning the number of valid indices.
726  return static_cast<LO> (0);
727  }
728 
729  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
730  LO hint = 0; // Guess for the relative offset into the current row
731  LO validCount = 0; // number of valid column indices in colInds
732 
733  for (LO k = 0; k < numColInds; ++k) {
734  const LO relBlockOffset =
735  this->findRelOffsetOfColumnIndex (localRowInd, colInds[k], hint);
736  if (relBlockOffset != LINV) {
737  offsets[k] = static_cast<ptrdiff_t> (relBlockOffset);
738  hint = relBlockOffset + 1;
739  ++validCount;
740  }
741  }
742  return validCount;
743  }
744 
745 
746  template<class Scalar, class LO, class GO, class Node>
747  LO
749  replaceLocalValuesByOffsets (const LO localRowInd,
750  const ptrdiff_t offsets[],
751  const Scalar vals[],
752  const LO numOffsets) const
753  {
754  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
755  // We modified no values, because the input local row index is
756  // invalid on the calling process. That may not be an error, if
757  // numColInds is zero anyway; it doesn't matter. This is the
758  // advantage of returning the number of valid indices.
759  return static_cast<LO> (0);
760  }
761  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
762  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
763  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
764  impl_scalar_type* vOut = val_out.data();
765 
766  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
767  const ptrdiff_t STINV = Teuchos::OrdinalTraits<ptrdiff_t>::invalid ();
768  size_t pointOffset = 0; // Current offset into input values
769  LO validCount = 0; // number of valid offsets
770 
771  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
772  const size_t blockOffset = offsets[k]*perBlockSize;
773  if (offsets[k] != STINV) {
774  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
775  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
776  COPY (A_new, A_old);
777  ++validCount;
778  }
779  }
780  return validCount;
781  }
782 
783 
784  template<class Scalar, class LO, class GO, class Node>
785  LO
787  absMaxLocalValuesByOffsets (const LO localRowInd,
788  const ptrdiff_t offsets[],
789  const Scalar vals[],
790  const LO numOffsets) const
791  {
792  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
793  // We modified no values, because the input local row index is
794  // invalid on the calling process. That may not be an error, if
795  // numColInds is zero anyway; it doesn't matter. This is the
796  // advantage of returning the number of valid indices.
797  return static_cast<LO> (0);
798  }
799  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
800  auto val_out = getValuesHost(localRowInd);
801  impl_scalar_type* vOut = const_cast<impl_scalar_type*>(val_out.data());
802 
803  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
804  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
805  size_t pointOffset = 0; // Current offset into input values
806  LO validCount = 0; // number of valid offsets
807 
808  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
809  const size_t blockOffset = offsets[k]*perBlockSize;
810  if (blockOffset != STINV) {
811  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
812  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
813  ::Tpetra::Impl::absMax (A_old, A_new);
814  ++validCount;
815  }
816  }
817  return validCount;
818  }
819 
820 
821  template<class Scalar, class LO, class GO, class Node>
822  LO
824  sumIntoLocalValuesByOffsets (const LO localRowInd,
825  const ptrdiff_t offsets[],
826  const Scalar vals[],
827  const LO numOffsets) const
828  {
829  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
830  // We modified no values, because the input local row index is
831  // invalid on the calling process. That may not be an error, if
832  // numColInds is zero anyway; it doesn't matter. This is the
833  // advantage of returning the number of valid indices.
834  return static_cast<LO> (0);
835  }
836  const impl_scalar_type ONE = static_cast<impl_scalar_type> (1.0);
837  const impl_scalar_type* const vIn = reinterpret_cast<const impl_scalar_type*> (vals);
838  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
839  auto val_out = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
840  impl_scalar_type* vOut = val_out.data();
841 
842  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
843  const size_t STINV = Teuchos::OrdinalTraits<size_t>::invalid ();
844  size_t pointOffset = 0; // Current offset into input values
845  LO validCount = 0; // number of valid offsets
846 
847  for (LO k = 0; k < numOffsets; ++k, pointOffset += perBlockSize) {
848  const size_t blockOffset = offsets[k]*perBlockSize;
849  if (blockOffset != STINV) {
850  little_block_type A_old = getNonConstLocalBlockFromInput (vOut, blockOffset);
851  const_little_block_type A_new = getConstLocalBlockFromInput (vIn, pointOffset);
852  AXPY (ONE, A_new, A_old);
853  ++validCount;
854  }
855  }
856  return validCount;
857  }
858 
859  template<class Scalar, class LO, class GO, class Node>
861  impl_scalar_type_dualview::t_host::const_type
863  getValuesHost () const
864  {
865  return val_.getHostView(Access::ReadOnly);
866  }
867 
868  template<class Scalar, class LO, class GO, class Node>
869  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
870  impl_scalar_type_dualview::t_dev::const_type
871  BlockCrsMatrix<Scalar, LO, GO, Node>::
872  getValuesDevice () const
873  {
874  return val_.getDeviceView(Access::ReadOnly);
875  }
876 
877  template<class Scalar, class LO, class GO, class Node>
878  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
879  impl_scalar_type_dualview::t_host
882  {
883  return val_.getHostView(Access::ReadWrite);
884  }
885 
886  template<class Scalar, class LO, class GO, class Node>
888  impl_scalar_type_dualview::t_dev
890  getValuesDeviceNonConst () const
891  {
892  return val_.getDeviceView(Access::ReadWrite);
893  }
894 
895  template<class Scalar, class LO, class GO, class Node>
896  typename BlockCrsMatrix<Scalar, LO, GO, Node>::
897  impl_scalar_type_dualview::t_host::const_type
898  BlockCrsMatrix<Scalar, LO, GO, Node>::
899  getValuesHost (const LO& lclRow) const
900  {
901  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
902  auto val = val_.getHostView(Access::ReadOnly);
903  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
904  return r_val;
905  }
906 
907  template<class Scalar, class LO, class GO, class Node>
909  impl_scalar_type_dualview::t_dev::const_type
911  getValuesDevice (const LO& lclRow) const
912  {
913  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
914  auto val = val_.getDeviceView(Access::ReadOnly);
915  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
916  return r_val;
917  }
918 
919  template<class Scalar, class LO, class GO, class Node>
922  getValuesHostNonConst (const LO& lclRow)
923  {
924  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
925  auto val = val_.getHostView(Access::ReadWrite);
926  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
927  return r_val;
928  }
929 
930  template<class Scalar, class LO, class GO, class Node>
933  getValuesDeviceNonConst (const LO& lclRow)
934  {
935  const size_t perBlockSize = static_cast<LO> (offsetPerBlock ());
936  auto val = val_.getDeviceView(Access::ReadWrite);
937  auto r_val = Kokkos::subview(val, Kokkos::pair<LO,LO>(ptrHost_(lclRow)*perBlockSize, ptrHost_(lclRow+1)*perBlockSize));
938  return r_val;
939  }
940 
941  template<class Scalar, class LO, class GO, class Node>
942  size_t
944  getNumEntriesInLocalRow (const LO localRowInd) const
945  {
946  const size_t numEntInGraph = graph_.getNumEntriesInLocalRow (localRowInd);
947  if (numEntInGraph == Teuchos::OrdinalTraits<size_t>::invalid ()) {
948  return static_cast<LO> (0); // the calling process doesn't have that row
949  } else {
950  return static_cast<LO> (numEntInGraph);
951  }
952  }
953 
954  template<class Scalar, class LO, class GO, class Node>
955  typename BlockCrsMatrix<Scalar, LO, GO, Node>::local_matrix_device_type
958  {
959  auto numCols = this->graph_.getColMap()->getLocalNumElements();
960  auto val = val_.getDeviceView(Access::ReadWrite);
961  const LO blockSize = this->getBlockSize ();
962  const auto graph = this->graph_.getLocalGraphDevice ();
963 
964  return local_matrix_device_type("Tpetra::BlockCrsMatrix::lclMatrixDevice",
965  numCols,
966  val,
967  graph,
968  blockSize);
969  }
970 
971  template<class Scalar, class LO, class GO, class Node>
972  void
976  const Teuchos::ETransp mode,
977  const Scalar alpha,
978  const Scalar beta)
979  {
980  (void) X;
981  (void) Y;
982  (void) mode;
983  (void) alpha;
984  (void) beta;
985 
986  TEUCHOS_TEST_FOR_EXCEPTION(
987  true, std::logic_error, "Tpetra::BlockCrsMatrix::apply: "
988  "transpose and conjugate transpose modes are not yet implemented.");
989  }
990 
991  template<class Scalar, class LO, class GO, class Node>
992  void
993  BlockCrsMatrix<Scalar, LO, GO, Node>::
994  applyBlockNoTrans (const BlockMultiVector<Scalar, LO, GO, Node>& X,
995  BlockMultiVector<Scalar, LO, GO, Node>& Y,
996  const Scalar alpha,
997  const Scalar beta)
998  {
999  using Teuchos::RCP;
1000  using Teuchos::rcp;
1001  typedef ::Tpetra::Import<LO, GO, Node> import_type;
1002  typedef ::Tpetra::Export<LO, GO, Node> export_type;
1003  const Scalar zero = STS::zero ();
1004  const Scalar one = STS::one ();
1005  RCP<const import_type> import = graph_.getImporter ();
1006  // "export" is a reserved C++ keyword, so we can't use it.
1007  RCP<const export_type> theExport = graph_.getExporter ();
1008  const char prefix[] = "Tpetra::BlockCrsMatrix::applyBlockNoTrans: ";
1009 
1010  if (alpha == zero) {
1011  if (beta == zero) {
1012  Y.putScalar (zero); // replace Inf or NaN (BLAS rules)
1013  }
1014  else if (beta != one) {
1015  Y.scale (beta);
1016  }
1017  }
1018  else { // alpha != 0
1019  const BMV* X_colMap = NULL;
1020  if (import.is_null ()) {
1021  try {
1022  X_colMap = &X;
1023  }
1024  catch (std::exception& e) {
1025  TEUCHOS_TEST_FOR_EXCEPTION
1026  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1027  "operator= threw an exception: " << std::endl << e.what ());
1028  }
1029  }
1030  else {
1031  // X_colMap_ is a pointer to a pointer to BMV. Ditto for
1032  // Y_rowMap_ below. This lets us do lazy initialization
1033  // correctly with view semantics of BlockCrsMatrix. All views
1034  // of this BlockCrsMatrix have the same outer pointer. That
1035  // way, we can set the inner pointer in one view, and all
1036  // other views will see it.
1037  if ((*X_colMap_).is_null () ||
1038  (**X_colMap_).getNumVectors () != X.getNumVectors () ||
1039  (**X_colMap_).getBlockSize () != X.getBlockSize ()) {
1040  *X_colMap_ = rcp (new BMV (* (graph_.getColMap ()), getBlockSize (),
1041  static_cast<LO> (X.getNumVectors ())));
1042  }
1043  (*X_colMap_)->getMultiVectorView().doImport (X.getMultiVectorView (),
1044  **pointImporter_,
1046  try {
1047  X_colMap = &(**X_colMap_);
1048  }
1049  catch (std::exception& e) {
1050  TEUCHOS_TEST_FOR_EXCEPTION
1051  (true, std::logic_error, prefix << "Tpetra::MultiVector::"
1052  "operator= threw an exception: " << std::endl << e.what ());
1053  }
1054  }
1055 
1056  BMV* Y_rowMap = NULL;
1057  if (theExport.is_null ()) {
1058  Y_rowMap = &Y;
1059  }
1060  else if ((*Y_rowMap_).is_null () ||
1061  (**Y_rowMap_).getNumVectors () != Y.getNumVectors () ||
1062  (**Y_rowMap_).getBlockSize () != Y.getBlockSize ()) {
1063  *Y_rowMap_ = rcp (new BMV (* (graph_.getRowMap ()), getBlockSize (),
1064  static_cast<LO> (X.getNumVectors ())));
1065  try {
1066  Y_rowMap = &(**Y_rowMap_);
1067  }
1068  catch (std::exception& e) {
1069  TEUCHOS_TEST_FOR_EXCEPTION(
1070  true, std::logic_error, prefix << "Tpetra::MultiVector::"
1071  "operator= threw an exception: " << std::endl << e.what ());
1072  }
1073  }
1074 
1075  try {
1076  localApplyBlockNoTrans (*X_colMap, *Y_rowMap, alpha, beta);
1077  }
1078  catch (std::exception& e) {
1079  TEUCHOS_TEST_FOR_EXCEPTION
1080  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1081  "an exception: " << e.what ());
1082  }
1083  catch (...) {
1084  TEUCHOS_TEST_FOR_EXCEPTION
1085  (true, std::runtime_error, prefix << "localApplyBlockNoTrans threw "
1086  "an exception not a subclass of std::exception.");
1087  }
1088 
1089  if (! theExport.is_null ()) {
1090  Y.doExport (*Y_rowMap, *theExport, ::Tpetra::REPLACE);
1091  }
1092  }
1093  }
1094 
1095 // clang-format on
1096 template <class Scalar, class LO, class GO, class Node>
1097 void BlockCrsMatrix<Scalar, LO, GO, Node>::localApplyBlockNoTrans(
1098  const BlockMultiVector<Scalar, LO, GO, Node> &X,
1099  BlockMultiVector<Scalar, LO, GO, Node> &Y, const Scalar alpha,
1100  const Scalar beta) {
1101  ::Tpetra::Details::ProfilingRegion profile_region(
1102  "Tpetra::BlockCrsMatrix::localApplyBlockNoTrans");
1103  const impl_scalar_type alpha_impl = alpha;
1104  const auto graph = this->graph_.getLocalGraphDevice();
1105 
1106  auto X_mv = X.getMultiVectorView();
1107  auto Y_mv = Y.getMultiVectorView();
1108  auto X_lcl = X_mv.getLocalViewDevice(Access::ReadOnly);
1109  auto Y_lcl = Y_mv.getLocalViewDevice(Access::ReadWrite);
1110 
1111  auto A_lcl = getLocalMatrixDevice();
1112  KokkosSparse::spmv(KokkosSparse::NoTranspose, alpha_impl, A_lcl, X_lcl, beta,
1113  Y_lcl);
1114 }
1115 // clang-format off
1116 
1117  template<class Scalar, class LO, class GO, class Node>
1118  LO
1119  BlockCrsMatrix<Scalar, LO, GO, Node>::
1120  findRelOffsetOfColumnIndex (const LO localRowIndex,
1121  const LO colIndexToFind,
1122  const LO hint) const
1123  {
1124  const size_t absStartOffset = ptrHost_[localRowIndex];
1125  const size_t absEndOffset = ptrHost_[localRowIndex+1];
1126  const LO numEntriesInRow = static_cast<LO> (absEndOffset - absStartOffset);
1127  // Amortize pointer arithmetic over the search loop.
1128  const LO* const curInd = indHost_.data () + absStartOffset;
1129 
1130  // If the hint was correct, then the hint is the offset to return.
1131  if (hint < numEntriesInRow && curInd[hint] == colIndexToFind) {
1132  // Always return the offset relative to the current row.
1133  return hint;
1134  }
1135 
1136  // The hint was wrong, so we must search for the given column
1137  // index in the column indices for the given row.
1138  LO relOffset = Teuchos::OrdinalTraits<LO>::invalid ();
1139 
1140  // We require that the graph have sorted rows. However, binary
1141  // search only pays if the current row is longer than a certain
1142  // amount. We set this to 32, but you might want to tune this.
1143  const LO maxNumEntriesForLinearSearch = 32;
1144  if (numEntriesInRow > maxNumEntriesForLinearSearch) {
1145  // Use binary search. It would probably be better for us to
1146  // roll this loop by hand. If we wrote it right, a smart
1147  // compiler could perhaps use conditional loads and avoid
1148  // branches (according to Jed Brown on May 2014).
1149  const LO* beg = curInd;
1150  const LO* end = curInd + numEntriesInRow;
1151  std::pair<const LO*, const LO*> p =
1152  std::equal_range (beg, end, colIndexToFind);
1153  if (p.first != p.second) {
1154  // offset is relative to the current row
1155  relOffset = static_cast<LO> (p.first - beg);
1156  }
1157  }
1158  else { // use linear search
1159  for (LO k = 0; k < numEntriesInRow; ++k) {
1160  if (colIndexToFind == curInd[k]) {
1161  relOffset = k; // offset is relative to the current row
1162  break;
1163  }
1164  }
1165  }
1166 
1167  return relOffset;
1168  }
1169 
1170  template<class Scalar, class LO, class GO, class Node>
1171  LO
1172  BlockCrsMatrix<Scalar, LO, GO, Node>::
1173  offsetPerBlock () const
1174  {
1175  return offsetPerBlock_;
1176  }
1177 
1178  template<class Scalar, class LO, class GO, class Node>
1179  typename BlockCrsMatrix<Scalar, LO, GO, Node>::const_little_block_type
1180  BlockCrsMatrix<Scalar, LO, GO, Node>::
1181  getConstLocalBlockFromInput (const impl_scalar_type* val,
1182  const size_t pointOffset) const
1183  {
1184  // Row major blocks
1185  const LO rowStride = blockSize_;
1186  return const_little_block_type (val + pointOffset, blockSize_, rowStride);
1187  }
1188 
1189  template<class Scalar, class LO, class GO, class Node>
1190  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1191  BlockCrsMatrix<Scalar, LO, GO, Node>::
1192  getNonConstLocalBlockFromInput (impl_scalar_type* val,
1193  const size_t pointOffset) const
1194  {
1195  // Row major blocks
1196  const LO rowStride = blockSize_;
1197  return little_block_type (val + pointOffset, blockSize_, rowStride);
1198  }
1199 
1200  template<class Scalar, class LO, class GO, class Node>
1201  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1202  BlockCrsMatrix<Scalar, LO, GO, Node>::
1203  getNonConstLocalBlockFromInputHost (impl_scalar_type* val,
1204  const size_t pointOffset) const
1205  {
1206  // Row major blocks
1207  const LO rowStride = blockSize_;
1208  const size_t bs2 = blockSize_ * blockSize_;
1209  return little_block_host_type (val + bs2 * pointOffset, blockSize_, rowStride);
1210  }
1211 
1212  template<class Scalar, class LO, class GO, class Node>
1213  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_type
1214  BlockCrsMatrix<Scalar, LO, GO, Node>::
1215  getLocalBlockDeviceNonConst (const LO localRowInd, const LO localColInd) const
1216  {
1217  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1218 
1219  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1220  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1221  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1222  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1223  auto vals = const_cast<this_BCRS_type&>(*this).getValuesDeviceNonConst();
1224  auto r_val = getNonConstLocalBlockFromInput (vals.data(), absBlockOffset);
1225  return r_val;
1226  }
1227  else {
1228  return little_block_type ();
1229  }
1230  }
1231 
1232  template<class Scalar, class LO, class GO, class Node>
1233  typename BlockCrsMatrix<Scalar, LO, GO, Node>::little_block_host_type
1234  BlockCrsMatrix<Scalar, LO, GO, Node>::
1235  getLocalBlockHostNonConst (const LO localRowInd, const LO localColInd) const
1236  {
1237  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1238 
1239  const size_t absRowBlockOffset = ptrHost_[localRowInd];
1240  const LO relBlockOffset = this->findRelOffsetOfColumnIndex (localRowInd, localColInd);
1241  if (relBlockOffset != Teuchos::OrdinalTraits<LO>::invalid ()) {
1242  const size_t absBlockOffset = absRowBlockOffset + relBlockOffset;
1243  auto vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst();
1244  auto r_val = getNonConstLocalBlockFromInputHost (vals.data(), absBlockOffset);
1245  return r_val;
1246  }
1247  else {
1248  return little_block_host_type ();
1249  }
1250  }
1251 
1252 
1253  template<class Scalar, class LO, class GO, class Node>
1254  bool
1255  BlockCrsMatrix<Scalar, LO, GO, Node>::
1256  checkSizes (const ::Tpetra::SrcDistObject& source)
1257  {
1258  using std::endl;
1259  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1260  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1261 
1262  if (src == NULL) {
1263  std::ostream& err = markLocalErrorAndGetStream ();
1264  err << "checkSizes: The source object of the Import or Export "
1265  "must be a BlockCrsMatrix with the same template parameters as the "
1266  "target object." << endl;
1267  }
1268  else {
1269  // Use a string of ifs, not if-elseifs, because we want to know
1270  // all the errors.
1271  if (src->getBlockSize () != this->getBlockSize ()) {
1272  std::ostream& err = markLocalErrorAndGetStream ();
1273  err << "checkSizes: The source and target objects of the Import or "
1274  << "Export must have the same block sizes. The source's block "
1275  << "size = " << src->getBlockSize () << " != the target's block "
1276  << "size = " << this->getBlockSize () << "." << endl;
1277  }
1278  if (! src->graph_.isFillComplete ()) {
1279  std::ostream& err = markLocalErrorAndGetStream ();
1280  err << "checkSizes: The source object of the Import or Export is "
1281  "not fill complete. Both source and target objects must be fill "
1282  "complete." << endl;
1283  }
1284  if (! this->graph_.isFillComplete ()) {
1285  std::ostream& err = markLocalErrorAndGetStream ();
1286  err << "checkSizes: The target object of the Import or Export is "
1287  "not fill complete. Both source and target objects must be fill "
1288  "complete." << endl;
1289  }
1290  if (src->graph_.getColMap ().is_null ()) {
1291  std::ostream& err = markLocalErrorAndGetStream ();
1292  err << "checkSizes: The source object of the Import or Export does "
1293  "not have a column Map. Both source and target objects must have "
1294  "column Maps." << endl;
1295  }
1296  if (this->graph_.getColMap ().is_null ()) {
1297  std::ostream& err = markLocalErrorAndGetStream ();
1298  err << "checkSizes: The target object of the Import or Export does "
1299  "not have a column Map. Both source and target objects must have "
1300  "column Maps." << endl;
1301  }
1302  }
1303 
1304  return ! (* (this->localError_));
1305  }
1306 
1307  template<class Scalar, class LO, class GO, class Node>
1308  void
1311  (const ::Tpetra::SrcDistObject& source,
1312  const size_t numSameIDs,
1313  const Kokkos::DualView<const local_ordinal_type*,
1314  buffer_device_type>& permuteToLIDs,
1315  const Kokkos::DualView<const local_ordinal_type*,
1316  buffer_device_type>& permuteFromLIDs,
1317  const CombineMode /*CM*/)
1318  {
1319  using ::Tpetra::Details::Behavior;
1321  using ::Tpetra::Details::ProfilingRegion;
1322  using std::endl;
1323  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
1324 
1325  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::copyAndPermute");
1326  const bool debug = Behavior::debug();
1327  const bool verbose = Behavior::verbose();
1328 
1329  // Define this function prefix
1330  std::string prefix;
1331  {
1332  std::ostringstream os;
1333  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1334  os << "Proc " << myRank << ": BlockCrsMatrix::copyAndPermute : " << endl;
1335  prefix = os.str();
1336  }
1337 
1338  // check if this already includes a local error
1339  if (* (this->localError_)) {
1340  std::ostream& err = this->markLocalErrorAndGetStream ();
1341  err << prefix
1342  << "The target object of the Import or Export is already in an error state."
1343  << endl;
1344  return;
1345  }
1346 
1347  //
1348  // Verbose input dual view status
1349  //
1350  if (verbose) {
1351  std::ostringstream os;
1352  os << prefix << endl
1353  << prefix << " " << dualViewStatusToString (permuteToLIDs, "permuteToLIDs") << endl
1354  << prefix << " " << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs") << endl;
1355  std::cerr << os.str ();
1356  }
1357 
1361  if (permuteToLIDs.extent (0) != permuteFromLIDs.extent (0)) {
1362  std::ostream& err = this->markLocalErrorAndGetStream ();
1363  err << prefix
1364  << "permuteToLIDs.extent(0) = " << permuteToLIDs.extent (0)
1365  << " != permuteFromLIDs.extent(0) = " << permuteFromLIDs.extent(0)
1366  << "." << endl;
1367  return;
1368  }
1369  if (permuteToLIDs.need_sync_host () || permuteFromLIDs.need_sync_host ()) {
1370  std::ostream& err = this->markLocalErrorAndGetStream ();
1371  err << prefix
1372  << "Both permuteToLIDs and permuteFromLIDs must be sync'd to host."
1373  << endl;
1374  return;
1375  }
1376 
1377  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
1378  if (src == NULL) {
1379  std::ostream& err = this->markLocalErrorAndGetStream ();
1380  err << prefix
1381  << "The source (input) object of the Import or "
1382  "Export is either not a BlockCrsMatrix, or does not have the right "
1383  "template parameters. checkSizes() should have caught this. "
1384  "Please report this bug to the Tpetra developers." << endl;
1385  return;
1386  }
1387 
1388  bool lclErr = false;
1389 #ifdef HAVE_TPETRA_DEBUG
1390  std::set<LO> invalidSrcCopyRows;
1391  std::set<LO> invalidDstCopyRows;
1392  std::set<LO> invalidDstCopyCols;
1393  std::set<LO> invalidDstPermuteCols;
1394  std::set<LO> invalidPermuteFromRows;
1395 #endif // HAVE_TPETRA_DEBUG
1396 
1397  // Copy the initial sequence of rows that are the same.
1398  //
1399  // The two graphs might have different column Maps, so we need to
1400  // do this using global column indices. This is purely local, so
1401  // we only need to check for local sameness of the two column
1402  // Maps.
1403 
1404 #ifdef HAVE_TPETRA_DEBUG
1405  const map_type& srcRowMap = * (src->graph_.getRowMap ());
1406 #endif // HAVE_TPETRA_DEBUG
1407  const map_type& dstRowMap = * (this->graph_.getRowMap ());
1408  const map_type& srcColMap = * (src->graph_.getColMap ());
1409  const map_type& dstColMap = * (this->graph_.getColMap ());
1410  const bool canUseLocalColumnIndices = srcColMap.locallySameAs (dstColMap);
1411 
1412  const size_t numPermute = static_cast<size_t> (permuteFromLIDs.extent(0));
1413  if (verbose) {
1414  std::ostringstream os;
1415  os << prefix
1416  << "canUseLocalColumnIndices: "
1417  << (canUseLocalColumnIndices ? "true" : "false")
1418  << ", numPermute: " << numPermute
1419  << endl;
1420  std::cerr << os.str ();
1421  }
1422 
1423  const auto permuteToLIDsHost = permuteToLIDs.view_host();
1424  const auto permuteFromLIDsHost = permuteFromLIDs.view_host();
1425 
1426  if (canUseLocalColumnIndices) {
1427  // Copy local rows that are the "same" in both source and target.
1428  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1429 #ifdef HAVE_TPETRA_DEBUG
1430  if (! srcRowMap.isNodeLocalElement (localRow)) {
1431  lclErr = true;
1432  invalidSrcCopyRows.insert (localRow);
1433  continue; // skip invalid rows
1434  }
1435 #endif // HAVE_TPETRA_DEBUG
1436 
1437  local_inds_host_view_type lclSrcCols;
1438  values_host_view_type vals;
1439  LO numEntries;
1440  // If this call fails, that means the mesh row local index is
1441  // invalid. That means the Import or Export is invalid somehow.
1442  src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1443  if (numEntries > 0) {
1444  LO err = this->replaceLocalValues (localRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1445  if (err != numEntries) {
1446  lclErr = true;
1447  if (! dstRowMap.isNodeLocalElement (localRow)) {
1448 #ifdef HAVE_TPETRA_DEBUG
1449  invalidDstCopyRows.insert (localRow);
1450 #endif // HAVE_TPETRA_DEBUG
1451  }
1452  else {
1453  // Once there's an error, there's no sense in saving
1454  // time, so we check whether the column indices were
1455  // invalid. However, only remember which ones were
1456  // invalid in a debug build, because that might take a
1457  // lot of space.
1458  for (LO k = 0; k < numEntries; ++k) {
1459  if (! dstColMap.isNodeLocalElement (lclSrcCols[k])) {
1460  lclErr = true;
1461 #ifdef HAVE_TPETRA_DEBUG
1462  (void) invalidDstCopyCols.insert (lclSrcCols[k]);
1463 #endif // HAVE_TPETRA_DEBUG
1464  }
1465  }
1466  }
1467  }
1468  }
1469  } // for each "same" local row
1470 
1471  // Copy the "permute" local rows.
1472  for (size_t k = 0; k < numPermute; ++k) {
1473  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1474  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1475 
1476  local_inds_host_view_type lclSrcCols;
1477  values_host_view_type vals;
1478  LO numEntries;
1479  src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1480  if (numEntries > 0) {
1481  LO err = this->replaceLocalValues (dstLclRow, lclSrcCols.data(), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1482  if (err != numEntries) {
1483  lclErr = true;
1484 #ifdef HAVE_TPETRA_DEBUG
1485  for (LO c = 0; c < numEntries; ++c) {
1486  if (! dstColMap.isNodeLocalElement (lclSrcCols[c])) {
1487  invalidDstPermuteCols.insert (lclSrcCols[c]);
1488  }
1489  }
1490 #endif // HAVE_TPETRA_DEBUG
1491  }
1492  }
1493  }
1494  }
1495  else { // must convert column indices to global
1496  // Reserve space to store the destination matrix's local column indices.
1497  const size_t maxNumEnt = src->graph_.getLocalMaxNumRowEntries ();
1498  Teuchos::Array<LO> lclDstCols (maxNumEnt);
1499 
1500  // Copy local rows that are the "same" in both source and target.
1501  for (LO localRow = 0; localRow < static_cast<LO> (numSameIDs); ++localRow) {
1502  local_inds_host_view_type lclSrcCols;
1503  values_host_view_type vals;
1504  LO numEntries;
1505 
1506  // If this call fails, that means the mesh row local index is
1507  // invalid. That means the Import or Export is invalid somehow.
1508  try {
1509  src->getLocalRowView (localRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1510  } catch (std::exception& e) {
1511  if (debug) {
1512  std::ostringstream os;
1513  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1514  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1515  << localRow << ", src->getLocalRowView() threw an exception: "
1516  << e.what ();
1517  std::cerr << os.str ();
1518  }
1519  throw e;
1520  }
1521 
1522  if (numEntries > 0) {
1523  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1524  lclErr = true;
1525  if (debug) {
1526  std::ostringstream os;
1527  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1528  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1529  << localRow << ", numEntries = " << numEntries << " > maxNumEnt = "
1530  << maxNumEnt << endl;
1531  std::cerr << os.str ();
1532  }
1533  }
1534  else {
1535  // Convert the source matrix's local column indices to the
1536  // destination matrix's local column indices.
1537  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1538  for (LO j = 0; j < numEntries; ++j) {
1539  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1540  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1541  lclErr = true;
1542 #ifdef HAVE_TPETRA_DEBUG
1543  invalidDstCopyCols.insert (lclDstColsView[j]);
1544 #endif // HAVE_TPETRA_DEBUG
1545  }
1546  }
1547  LO err(0);
1548  try {
1549  err = this->replaceLocalValues (localRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1550  } catch (std::exception& e) {
1551  if (debug) {
1552  std::ostringstream os;
1553  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1554  os << "Proc " << myRank << ": copyAndPermute: At \"same\" localRow "
1555  << localRow << ", this->replaceLocalValues() threw an exception: "
1556  << e.what ();
1557  std::cerr << os.str ();
1558  }
1559  throw e;
1560  }
1561  if (err != numEntries) {
1562  lclErr = true;
1563  if (debug) {
1564  std::ostringstream os;
1565  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1566  os << "Proc " << myRank << ": copyAndPermute: At \"same\" "
1567  "localRow " << localRow << ", this->replaceLocalValues "
1568  "returned " << err << " instead of numEntries = "
1569  << numEntries << endl;
1570  std::cerr << os.str ();
1571  }
1572  }
1573  }
1574  }
1575  }
1576 
1577  // Copy the "permute" local rows.
1578  for (size_t k = 0; k < numPermute; ++k) {
1579  const LO srcLclRow = static_cast<LO> (permuteFromLIDsHost(k));
1580  const LO dstLclRow = static_cast<LO> (permuteToLIDsHost(k));
1581 
1582  local_inds_host_view_type lclSrcCols;
1583  values_host_view_type vals;
1584  LO numEntries;
1585 
1586  try {
1587  src->getLocalRowView (srcLclRow, lclSrcCols, vals); numEntries = lclSrcCols.extent(0);
1588  } catch (std::exception& e) {
1589  if (debug) {
1590  std::ostringstream os;
1591  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1592  os << "Proc " << myRank << ": copyAndPermute: At \"permute\" "
1593  "srcLclRow " << srcLclRow << " and dstLclRow " << dstLclRow
1594  << ", src->getLocalRowView() threw an exception: " << e.what ();
1595  std::cerr << os.str ();
1596  }
1597  throw e;
1598  }
1599 
1600  if (numEntries > 0) {
1601  if (static_cast<size_t> (numEntries) > static_cast<size_t> (lclDstCols.size ())) {
1602  lclErr = true;
1603  }
1604  else {
1605  // Convert the source matrix's local column indices to the
1606  // destination matrix's local column indices.
1607  Teuchos::ArrayView<LO> lclDstColsView = lclDstCols.view (0, numEntries);
1608  for (LO j = 0; j < numEntries; ++j) {
1609  lclDstColsView[j] = dstColMap.getLocalElement (srcColMap.getGlobalElement (lclSrcCols[j]));
1610  if (lclDstColsView[j] == Teuchos::OrdinalTraits<LO>::invalid ()) {
1611  lclErr = true;
1612 #ifdef HAVE_TPETRA_DEBUG
1613  invalidDstPermuteCols.insert (lclDstColsView[j]);
1614 #endif // HAVE_TPETRA_DEBUG
1615  }
1616  }
1617  LO err = this->replaceLocalValues (dstLclRow, lclDstColsView.getRawPtr (), reinterpret_cast<const scalar_type*>(vals.data()), numEntries);
1618  if (err != numEntries) {
1619  lclErr = true;
1620  }
1621  }
1622  }
1623  }
1624  }
1625 
1626  if (lclErr) {
1627  std::ostream& err = this->markLocalErrorAndGetStream ();
1628 #ifdef HAVE_TPETRA_DEBUG
1629  err << "copyAndPermute: The graph structure of the source of the "
1630  "Import or Export must be a subset of the graph structure of the "
1631  "target. ";
1632  err << "invalidSrcCopyRows = [";
1633  for (typename std::set<LO>::const_iterator it = invalidSrcCopyRows.begin ();
1634  it != invalidSrcCopyRows.end (); ++it) {
1635  err << *it;
1636  typename std::set<LO>::const_iterator itp1 = it;
1637  itp1++;
1638  if (itp1 != invalidSrcCopyRows.end ()) {
1639  err << ",";
1640  }
1641  }
1642  err << "], invalidDstCopyRows = [";
1643  for (typename std::set<LO>::const_iterator it = invalidDstCopyRows.begin ();
1644  it != invalidDstCopyRows.end (); ++it) {
1645  err << *it;
1646  typename std::set<LO>::const_iterator itp1 = it;
1647  itp1++;
1648  if (itp1 != invalidDstCopyRows.end ()) {
1649  err << ",";
1650  }
1651  }
1652  err << "], invalidDstCopyCols = [";
1653  for (typename std::set<LO>::const_iterator it = invalidDstCopyCols.begin ();
1654  it != invalidDstCopyCols.end (); ++it) {
1655  err << *it;
1656  typename std::set<LO>::const_iterator itp1 = it;
1657  itp1++;
1658  if (itp1 != invalidDstCopyCols.end ()) {
1659  err << ",";
1660  }
1661  }
1662  err << "], invalidDstPermuteCols = [";
1663  for (typename std::set<LO>::const_iterator it = invalidDstPermuteCols.begin ();
1664  it != invalidDstPermuteCols.end (); ++it) {
1665  err << *it;
1666  typename std::set<LO>::const_iterator itp1 = it;
1667  itp1++;
1668  if (itp1 != invalidDstPermuteCols.end ()) {
1669  err << ",";
1670  }
1671  }
1672  err << "], invalidPermuteFromRows = [";
1673  for (typename std::set<LO>::const_iterator it = invalidPermuteFromRows.begin ();
1674  it != invalidPermuteFromRows.end (); ++it) {
1675  err << *it;
1676  typename std::set<LO>::const_iterator itp1 = it;
1677  itp1++;
1678  if (itp1 != invalidPermuteFromRows.end ()) {
1679  err << ",";
1680  }
1681  }
1682  err << "]" << endl;
1683 #else
1684  err << "copyAndPermute: The graph structure of the source of the "
1685  "Import or Export must be a subset of the graph structure of the "
1686  "target." << endl;
1687 #endif // HAVE_TPETRA_DEBUG
1688  }
1689 
1690  if (debug) {
1691  std::ostringstream os;
1692  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1693  const bool lclSuccess = ! (* (this->localError_));
1694  os << "*** Proc " << myRank << ": copyAndPermute "
1695  << (lclSuccess ? "succeeded" : "FAILED");
1696  if (lclSuccess) {
1697  os << endl;
1698  } else {
1699  os << ": error messages: " << this->errorMessages (); // comes w/ endl
1700  }
1701  std::cerr << os.str ();
1702  }
1703  }
1704 
1705  namespace { // (anonymous)
1706 
1725  template<class LO, class GO>
1726  size_t
1727  packRowCount (const size_t numEnt,
1728  const size_t numBytesPerValue,
1729  const size_t blkSize)
1730  {
1731  using ::Tpetra::Details::PackTraits;
1732 
1733  if (numEnt == 0) {
1734  // Empty rows always take zero bytes, to ensure sparsity.
1735  return 0;
1736  }
1737  else {
1738  // We store the number of entries as a local index (LO).
1739  LO numEntLO = 0; // packValueCount wants this.
1740  GO gid {};
1741  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1742  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1743  const size_t valsLen = numEnt * numBytesPerValue * blkSize * blkSize;
1744  return numEntLen + gidsLen + valsLen;
1745  }
1746  }
1747 
1758  template<class ST, class LO, class GO>
1759  size_t
1760  unpackRowCount (const typename ::Tpetra::Details::PackTraits<LO>::input_buffer_type& imports,
1761  const size_t offset,
1762  const size_t numBytes,
1763  const size_t /* numBytesPerValue */)
1764  {
1765  using Kokkos::subview;
1766  using ::Tpetra::Details::PackTraits;
1767 
1768  if (numBytes == 0) {
1769  // Empty rows always take zero bytes, to ensure sparsity.
1770  return static_cast<size_t> (0);
1771  }
1772  else {
1773  LO numEntLO = 0;
1774  const size_t theNumBytes = PackTraits<LO>::packValueCount (numEntLO);
1775  TEUCHOS_TEST_FOR_EXCEPTION
1776  (theNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1777  "theNumBytes = " << theNumBytes << " < numBytes = " << numBytes
1778  << ".");
1779  const char* const inBuf = imports.data () + offset;
1780  const size_t actualNumBytes = PackTraits<LO>::unpackValue (numEntLO, inBuf);
1781  TEUCHOS_TEST_FOR_EXCEPTION
1782  (actualNumBytes > numBytes, std::logic_error, "unpackRowCount: "
1783  "actualNumBytes = " << actualNumBytes << " < numBytes = " << numBytes
1784  << ".");
1785  return static_cast<size_t> (numEntLO);
1786  }
1787  }
1788 
1792  template<class ST, class LO, class GO>
1793  size_t
1794  packRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<LO>::output_buffer_type exports,
1795  const size_t offset,
1796  const size_t numEnt,
1797  const typename ::Tpetra::Details::PackTraits<GO>::input_array_type& gidsIn,
1798  const typename ::Tpetra::Details::PackTraits<ST>::input_array_type& valsIn,
1799  const size_t numBytesPerValue,
1800  const size_t blockSize)
1801  {
1802  using Kokkos::subview;
1803  using ::Tpetra::Details::PackTraits;
1804 
1805  if (numEnt == 0) {
1806  // Empty rows always take zero bytes, to ensure sparsity.
1807  return 0;
1808  }
1809  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1810 
1811  const GO gid = 0; // packValueCount wants this
1812  const LO numEntLO = static_cast<size_t> (numEnt);
1813 
1814  const size_t numEntBeg = offset;
1815  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
1816  const size_t gidsBeg = numEntBeg + numEntLen;
1817  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1818  const size_t valsBeg = gidsBeg + gidsLen;
1819  const size_t valsLen = numScalarEnt * numBytesPerValue;
1820 
1821  char* const numEntOut = exports.data () + numEntBeg;
1822  char* const gidsOut = exports.data () + gidsBeg;
1823  char* const valsOut = exports.data () + valsBeg;
1824 
1825  size_t numBytesOut = 0;
1826  int errorCode = 0;
1827  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
1828 
1829  {
1830  Kokkos::pair<int, size_t> p;
1831  p = PackTraits<GO>::packArray (gidsOut, gidsIn.data (), numEnt);
1832  errorCode += p.first;
1833  numBytesOut += p.second;
1834 
1835  p = PackTraits<ST>::packArray (valsOut, valsIn.data (), numScalarEnt);
1836  errorCode += p.first;
1837  numBytesOut += p.second;
1838  }
1839 
1840  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1841  TEUCHOS_TEST_FOR_EXCEPTION
1842  (numBytesOut != expectedNumBytes, std::logic_error,
1843  "packRowForBlockCrs: numBytesOut = " << numBytesOut
1844  << " != expectedNumBytes = " << expectedNumBytes << ".");
1845 
1846  TEUCHOS_TEST_FOR_EXCEPTION
1847  (errorCode != 0, std::runtime_error, "packRowForBlockCrs: "
1848  "PackTraits::packArray returned a nonzero error code");
1849 
1850  return numBytesOut;
1851  }
1852 
1853  // Return the number of bytes actually read / used.
1854  template<class ST, class LO, class GO>
1855  size_t
1856  unpackRowForBlockCrs (const typename ::Tpetra::Details::PackTraits<GO>::output_array_type& gidsOut,
1857  const typename ::Tpetra::Details::PackTraits<ST>::output_array_type& valsOut,
1858  const typename ::Tpetra::Details::PackTraits<int>::input_buffer_type& imports,
1859  const size_t offset,
1860  const size_t numBytes,
1861  const size_t numEnt,
1862  const size_t numBytesPerValue,
1863  const size_t blockSize)
1864  {
1865  using ::Tpetra::Details::PackTraits;
1866 
1867  if (numBytes == 0) {
1868  // Rows with zero bytes always have zero entries.
1869  return 0;
1870  }
1871  const size_t numScalarEnt = numEnt * blockSize * blockSize;
1872  TEUCHOS_TEST_FOR_EXCEPTION
1873  (static_cast<size_t> (imports.extent (0)) <= offset,
1874  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1875  << imports.extent (0) << " <= offset = " << offset << ".");
1876  TEUCHOS_TEST_FOR_EXCEPTION
1877  (static_cast<size_t> (imports.extent (0)) < offset + numBytes,
1878  std::logic_error, "unpackRowForBlockCrs: imports.extent(0) = "
1879  << imports.extent (0) << " < offset + numBytes = "
1880  << (offset + numBytes) << ".");
1881 
1882  const GO gid = 0; // packValueCount wants this
1883  const LO lid = 0; // packValueCount wants this
1884 
1885  const size_t numEntBeg = offset;
1886  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
1887  const size_t gidsBeg = numEntBeg + numEntLen;
1888  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
1889  const size_t valsBeg = gidsBeg + gidsLen;
1890  const size_t valsLen = numScalarEnt * numBytesPerValue;
1891 
1892  const char* const numEntIn = imports.data () + numEntBeg;
1893  const char* const gidsIn = imports.data () + gidsBeg;
1894  const char* const valsIn = imports.data () + valsBeg;
1895 
1896  size_t numBytesOut = 0;
1897  int errorCode = 0;
1898  LO numEntOut;
1899  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
1900  TEUCHOS_TEST_FOR_EXCEPTION
1901  (static_cast<size_t> (numEntOut) != numEnt, std::logic_error,
1902  "unpackRowForBlockCrs: Expected number of entries " << numEnt
1903  << " != actual number of entries " << numEntOut << ".");
1904 
1905  {
1906  Kokkos::pair<int, size_t> p;
1907  p = PackTraits<GO>::unpackArray (gidsOut.data (), gidsIn, numEnt);
1908  errorCode += p.first;
1909  numBytesOut += p.second;
1910 
1911  p = PackTraits<ST>::unpackArray (valsOut.data (), valsIn, numScalarEnt);
1912  errorCode += p.first;
1913  numBytesOut += p.second;
1914  }
1915 
1916  TEUCHOS_TEST_FOR_EXCEPTION
1917  (numBytesOut != numBytes, std::logic_error,
1918  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1919  << " != numBytes = " << numBytes << ".");
1920 
1921  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
1922  TEUCHOS_TEST_FOR_EXCEPTION
1923  (numBytesOut != expectedNumBytes, std::logic_error,
1924  "unpackRowForBlockCrs: numBytesOut = " << numBytesOut
1925  << " != expectedNumBytes = " << expectedNumBytes << ".");
1926 
1927  TEUCHOS_TEST_FOR_EXCEPTION
1928  (errorCode != 0, std::runtime_error, "unpackRowForBlockCrs: "
1929  "PackTraits::unpackArray returned a nonzero error code");
1930 
1931  return numBytesOut;
1932  }
1933  } // namespace (anonymous)
1934 
1935  template<class Scalar, class LO, class GO, class Node>
1936  void
1939  (const ::Tpetra::SrcDistObject& source,
1940  const Kokkos::DualView<const local_ordinal_type*,
1941  buffer_device_type>& exportLIDs,
1942  Kokkos::DualView<packet_type*,
1943  buffer_device_type>& exports, // output
1944  Kokkos::DualView<size_t*,
1945  buffer_device_type> numPacketsPerLID, // output
1946  size_t& constantNumPackets)
1947  {
1948  using ::Tpetra::Details::Behavior;
1950  using ::Tpetra::Details::ProfilingRegion;
1951  using ::Tpetra::Details::PackTraits;
1952 
1953  typedef typename Kokkos::View<int*, device_type>::HostMirror::execution_space host_exec;
1954 
1955  typedef BlockCrsMatrix<Scalar, LO, GO, Node> this_BCRS_type;
1956 
1957  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::packAndPrepare");
1958 
1959  const bool debug = Behavior::debug();
1960  const bool verbose = Behavior::verbose();
1961 
1962  // Define this function prefix
1963  std::string prefix;
1964  {
1965  std::ostringstream os;
1966  const int myRank = this->graph_.getRowMap ()->getComm ()->getRank ();
1967  os << "Proc " << myRank << ": BlockCrsMatrix::packAndPrepare: " << std::endl;
1968  prefix = os.str();
1969  }
1970 
1971  // check if this already includes a local error
1972  if (* (this->localError_)) {
1973  std::ostream& err = this->markLocalErrorAndGetStream ();
1974  err << prefix
1975  << "The target object of the Import or Export is already in an error state."
1976  << std::endl;
1977  return;
1978  }
1979 
1980  //
1981  // Verbose input dual view status
1982  //
1983  if (verbose) {
1984  std::ostringstream os;
1985  os << prefix << std::endl
1986  << prefix << " " << dualViewStatusToString (exportLIDs, "exportLIDs") << std::endl
1987  << prefix << " " << dualViewStatusToString (exports, "exports") << std::endl
1988  << prefix << " " << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID") << std::endl;
1989  std::cerr << os.str ();
1990  }
1991 
1995  if (exportLIDs.extent (0) != numPacketsPerLID.extent (0)) {
1996  std::ostream& err = this->markLocalErrorAndGetStream ();
1997  err << prefix
1998  << "exportLIDs.extent(0) = " << exportLIDs.extent (0)
1999  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2000  << "." << std::endl;
2001  return;
2002  }
2003  if (exportLIDs.need_sync_host ()) {
2004  std::ostream& err = this->markLocalErrorAndGetStream ();
2005  err << prefix << "exportLIDs be sync'd to host." << std::endl;
2006  return;
2007  }
2008 
2009  const this_BCRS_type* src = dynamic_cast<const this_BCRS_type* > (&source);
2010  if (src == NULL) {
2011  std::ostream& err = this->markLocalErrorAndGetStream ();
2012  err << prefix
2013  << "The source (input) object of the Import or "
2014  "Export is either not a BlockCrsMatrix, or does not have the right "
2015  "template parameters. checkSizes() should have caught this. "
2016  "Please report this bug to the Tpetra developers." << std::endl;
2017  return;
2018  }
2019 
2020  // Graphs and matrices are allowed to have a variable number of
2021  // entries per row. We could test whether all rows have the same
2022  // number of entries, but DistObject can only use this
2023  // optimization if all rows on _all_ processes have the same
2024  // number of entries. Rather than do the all-reduce necessary to
2025  // test for this unlikely case, we tell DistObject (by setting
2026  // constantNumPackets to zero) to assume that different rows may
2027  // have different numbers of entries.
2028  constantNumPackets = 0;
2029 
2030  // const values
2031  const crs_graph_type& srcGraph = src->graph_;
2032  const size_t blockSize = static_cast<size_t> (src->getBlockSize ());
2033  const size_t numExportLIDs = exportLIDs.extent (0);
2034  size_t numBytesPerValue(0);
2035  {
2036  auto val_host = val_.getHostView(Access::ReadOnly);
2037  numBytesPerValue =
2038  PackTraits<impl_scalar_type>
2039  ::packValueCount(val_host.extent(0) ? val_host(0) : impl_scalar_type());
2040  }
2041 
2042  // Compute the number of bytes ("packets") per row to pack. While
2043  // we're at it, compute the total # of block entries to send, and
2044  // the max # of block entries in any of the rows we're sending.
2045 
2046  Impl::BlockCrsRowStruct<size_t> rowReducerStruct;
2047 
2048  // Graph information is on host; let's do this on host parallel reduce
2049  auto exportLIDsHost = exportLIDs.view_host();
2050  auto numPacketsPerLIDHost = numPacketsPerLID.view_host(); // we will modify this.
2051  numPacketsPerLID.modify_host ();
2052  {
2053  using reducer_type = Impl::BlockCrsReducer<Impl::BlockCrsRowStruct<size_t>,host_exec>;
2054  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs);
2055  Kokkos::parallel_reduce
2056  (policy,
2057  [=](const size_t &i, typename reducer_type::value_type &update) {
2058  const LO lclRow = exportLIDsHost(i);
2059  size_t numEnt = srcGraph.getNumEntriesInLocalRow (lclRow);
2060  numEnt = (numEnt == Teuchos::OrdinalTraits<size_t>::invalid () ? 0 : numEnt);
2061 
2062  const size_t numBytes =
2063  packRowCount<LO, GO> (numEnt, numBytesPerValue, blockSize);
2064  numPacketsPerLIDHost(i) = numBytes;
2065  update += typename reducer_type::value_type(numEnt, numBytes, numEnt);
2066  }, rowReducerStruct);
2067  }
2068 
2069  // Compute the number of bytes ("packets") per row to pack. While
2070  // we're at it, compute the total # of block entries to send, and
2071  // the max # of block entries in any of the rows we're sending.
2072  const size_t totalNumBytes = rowReducerStruct.totalNumBytes;
2073  const size_t totalNumEntries = rowReducerStruct.totalNumEntries;
2074  const size_t maxRowLength = rowReducerStruct.maxRowLength;
2075 
2076  if (verbose) {
2077  std::ostringstream os;
2078  os << prefix
2079  << "totalNumBytes = " << totalNumBytes << ", totalNumEntries = " << totalNumEntries
2080  << std::endl;
2081  std::cerr << os.str ();
2082  }
2083 
2084  // We use a "struct of arrays" approach to packing each row's
2085  // entries. All the column indices (as global indices) go first,
2086  // then all their owning process ranks, and then the values.
2087  if(exports.extent(0) != totalNumBytes)
2088  {
2089  const std::string oldLabel = exports.d_view.label ();
2090  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
2091  exports = Kokkos::DualView<packet_type*, buffer_device_type>(newLabel, totalNumBytes);
2092  }
2093  if (totalNumEntries > 0) {
2094  // Current position (in bytes) in the 'exports' output array.
2095  Kokkos::View<size_t*, host_exec> offset("offset", numExportLIDs+1);
2096  {
2097  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numExportLIDs+1);
2098  Kokkos::parallel_scan
2099  (policy,
2100  [=](const size_t &i, size_t &update, const bool &final) {
2101  if (final) offset(i) = update;
2102  update += (i == numExportLIDs ? 0 : numPacketsPerLIDHost(i));
2103  });
2104  }
2105  if (offset(numExportLIDs) != totalNumBytes) {
2106  std::ostream& err = this->markLocalErrorAndGetStream ();
2107  err << prefix
2108  << "At end of method, the final offset (in bytes) "
2109  << offset(numExportLIDs) << " does not equal the total number of bytes packed "
2110  << totalNumBytes << ". "
2111  << "Please report this bug to the Tpetra developers." << std::endl;
2112  return;
2113  }
2114 
2115  // For each block row of the matrix owned by the calling
2116  // process, pack that block row's column indices and values into
2117  // the exports array.
2118 
2119  // Source matrix's column Map. We verified in checkSizes() that
2120  // the column Map exists (is not null).
2121  const map_type& srcColMap = * (srcGraph.getColMap ());
2122 
2123  // Pack the data for each row to send, into the 'exports' buffer.
2124  // exports will be modified on host.
2125  exports.modify_host();
2126  {
2127  typedef Kokkos::TeamPolicy<host_exec> policy_type;
2128  const auto policy =
2129  policy_type(numExportLIDs, 1, 1)
2130  .set_scratch_size(0, Kokkos::PerTeam(sizeof(GO)*maxRowLength));
2131  // The following parallel_for needs const access to the local values of src.
2132  // (the local graph is also accessed on host, but this does not use WDVs).
2133  getValuesHost();
2135  Kokkos::parallel_for
2136  (policy,
2137  [=](const typename policy_type::member_type &member) {
2138  const size_t i = member.league_rank();
2139  Kokkos::View<GO*, typename host_exec::scratch_memory_space>
2140  gblColInds(member.team_scratch(0), maxRowLength);
2141 
2142  const LO lclRowInd = exportLIDsHost(i);
2143  local_inds_host_view_type lclColInds;
2144  values_host_view_type vals;
2145 
2146  // It's OK to ignore the return value, since if the calling
2147  // process doesn't own that local row, then the number of
2148  // entries in that row on the calling process is zero.
2149  src->getLocalRowView (lclRowInd, lclColInds, vals);
2150  const size_t numEnt = lclColInds.extent(0);
2151 
2152  // Convert column indices from local to global.
2153  for (size_t j = 0; j < numEnt; ++j)
2154  gblColInds(j) = srcColMap.getGlobalElement (lclColInds(j));
2155 
2156  // Kyungjoo: additional wrapping scratch view is necessary
2157  // the following function interface need the same execution space
2158  // host scratch space somehow is not considered same as the host_exec
2159  // Copy the row's data into the current spot in the exports array.
2160  const size_t numBytes =
2161  packRowForBlockCrs<impl_scalar_type, LO, GO>
2162  (exports.view_host(),
2163  offset(i),
2164  numEnt,
2165  Kokkos::View<const GO*, host_exec>(gblColInds.data(), maxRowLength),
2166  vals,
2167  numBytesPerValue,
2168  blockSize);
2169 
2170  // numBytes should be same as the difference between offsets
2171  if (debug) {
2172  const size_t offsetDiff = offset(i+1) - offset(i);
2173  if (numBytes != offsetDiff) {
2174  std::ostringstream os;
2175  os << prefix
2176  << "numBytes computed from packRowForBlockCrs is different from "
2177  << "precomputed offset values, LID = " << i << std::endl;
2178  std::cerr << os.str ();
2179  }
2180  }
2181  }); // for each LID (of a row) to send
2183  }
2184  } // if totalNumEntries > 0
2185 
2186  if (debug) {
2187  std::ostringstream os;
2188  const bool lclSuccess = ! (* (this->localError_));
2189  os << prefix
2190  << (lclSuccess ? "succeeded" : "FAILED")
2191  << std::endl;
2192  std::cerr << os.str ();
2193  }
2194  }
2195 
2196  template<class Scalar, class LO, class GO, class Node>
2197  void
2200  (const Kokkos::DualView<const local_ordinal_type*,
2201  buffer_device_type>& importLIDs,
2202  Kokkos::DualView<packet_type*,
2203  buffer_device_type> imports,
2204  Kokkos::DualView<size_t*,
2205  buffer_device_type> numPacketsPerLID,
2206  const size_t /* constantNumPackets */,
2207  const CombineMode combineMode)
2208  {
2209  using ::Tpetra::Details::Behavior;
2211  using ::Tpetra::Details::ProfilingRegion;
2212  using ::Tpetra::Details::PackTraits;
2213  using std::endl;
2214  using host_exec =
2215  typename Kokkos::View<int*, device_type>::HostMirror::execution_space;
2216 
2217  ProfilingRegion profile_region("Tpetra::BlockCrsMatrix::unpackAndCombine");
2218  const bool verbose = Behavior::verbose ();
2219 
2220  // Define this function prefix
2221  std::string prefix;
2222  {
2223  std::ostringstream os;
2224  auto map = this->graph_.getRowMap ();
2225  auto comm = map.is_null () ? Teuchos::null : map->getComm ();
2226  const int myRank = comm.is_null () ? -1 : comm->getRank ();
2227  os << "Proc " << myRank << ": Tpetra::BlockCrsMatrix::unpackAndCombine: ";
2228  prefix = os.str ();
2229  if (verbose) {
2230  os << "Start" << endl;
2231  std::cerr << os.str ();
2232  }
2233  }
2234 
2235  // check if this already includes a local error
2236  if (* (this->localError_)) {
2237  std::ostream& err = this->markLocalErrorAndGetStream ();
2238  std::ostringstream os;
2239  os << prefix << "{Im/Ex}port target is already in error." << endl;
2240  if (verbose) {
2241  std::cerr << os.str ();
2242  }
2243  err << os.str ();
2244  return;
2245  }
2246 
2250  if (importLIDs.extent (0) != numPacketsPerLID.extent (0)) {
2251  std::ostream& err = this->markLocalErrorAndGetStream ();
2252  std::ostringstream os;
2253  os << prefix << "importLIDs.extent(0) = " << importLIDs.extent (0)
2254  << " != numPacketsPerLID.extent(0) = " << numPacketsPerLID.extent(0)
2255  << "." << endl;
2256  if (verbose) {
2257  std::cerr << os.str ();
2258  }
2259  err << os.str ();
2260  return;
2261  }
2262 
2263  if (combineMode != ADD && combineMode != INSERT &&
2264  combineMode != REPLACE && combineMode != ABSMAX &&
2265  combineMode != ZERO) {
2266  std::ostream& err = this->markLocalErrorAndGetStream ();
2267  std::ostringstream os;
2268  os << prefix
2269  << "Invalid CombineMode value " << combineMode << ". Valid "
2270  << "values include ADD, INSERT, REPLACE, ABSMAX, and ZERO."
2271  << std::endl;
2272  if (verbose) {
2273  std::cerr << os.str ();
2274  }
2275  err << os.str ();
2276  return;
2277  }
2278 
2279  if (this->graph_.getColMap ().is_null ()) {
2280  std::ostream& err = this->markLocalErrorAndGetStream ();
2281  std::ostringstream os;
2282  os << prefix << "Target matrix's column Map is null." << endl;
2283  if (verbose) {
2284  std::cerr << os.str ();
2285  }
2286  err << os.str ();
2287  return;
2288  }
2289 
2290  // Target matrix's column Map. Use to convert the global column
2291  // indices in the receive buffer to local indices. We verified in
2292  // checkSizes() that the column Map exists (is not null).
2293  const map_type& tgtColMap = * (this->graph_.getColMap ());
2294 
2295  // Const values
2296  const size_t blockSize = this->getBlockSize ();
2297  const size_t numImportLIDs = importLIDs.extent(0);
2298  // FIXME (mfh 06 Feb 2019) For scalar types with run-time sizes, a
2299  // default-constructed instance could have a different size than
2300  // other instances. (We assume all nominally constructed
2301  // instances have the same size; that's not the issue here.) This
2302  // could be bad if the calling process has no entries, but other
2303  // processes have entries that they want to send to us.
2304  size_t numBytesPerValue(0);
2305  {
2306  auto val_host = val_.getHostView(Access::ReadOnly);
2307  numBytesPerValue =
2308  PackTraits<impl_scalar_type>::packValueCount
2309  (val_host.extent (0) ? val_host(0) : impl_scalar_type ());
2310  }
2311  const size_t maxRowNumEnt = graph_.getLocalMaxNumRowEntries ();
2312  const size_t maxRowNumScalarEnt = maxRowNumEnt * blockSize * blockSize;
2313 
2314  if (verbose) {
2315  std::ostringstream os;
2316  os << prefix << "combineMode: "
2317  << ::Tpetra::combineModeToString (combineMode)
2318  << ", blockSize: " << blockSize
2319  << ", numImportLIDs: " << numImportLIDs
2320  << ", numBytesPerValue: " << numBytesPerValue
2321  << ", maxRowNumEnt: " << maxRowNumEnt
2322  << ", maxRowNumScalarEnt: " << maxRowNumScalarEnt << endl;
2323  std::cerr << os.str ();
2324  }
2325 
2326  if (combineMode == ZERO || numImportLIDs == 0) {
2327  if (verbose) {
2328  std::ostringstream os;
2329  os << prefix << "Nothing to unpack. Done!" << std::endl;
2330  std::cerr << os.str ();
2331  }
2332  return;
2333  }
2334 
2335  // NOTE (mfh 07 Feb 2019) If we ever implement unpack on device,
2336  // we can remove this sync.
2337  if (imports.need_sync_host ()) {
2338  imports.sync_host ();
2339  }
2340 
2341  // NOTE (mfh 07 Feb 2019) DistObject::doTransferNew has already
2342  // sync'd numPacketsPerLID to host, since it needs to do that in
2343  // order to post MPI_Irecv messages with the correct lengths. A
2344  // hypothetical device-specific boundary exchange implementation
2345  // could possibly receive data without sync'ing lengths to host,
2346  // but we don't need to design for that nonexistent thing yet.
2347 
2348  if (imports.need_sync_host () || numPacketsPerLID.need_sync_host () ||
2349  importLIDs.need_sync_host ()) {
2350  std::ostream& err = this->markLocalErrorAndGetStream ();
2351  std::ostringstream os;
2352  os << prefix << "All input DualViews must be sync'd to host by now. "
2353  << "imports_nc.need_sync_host()="
2354  << (imports.need_sync_host () ? "true" : "false")
2355  << ", numPacketsPerLID.need_sync_host()="
2356  << (numPacketsPerLID.need_sync_host () ? "true" : "false")
2357  << ", importLIDs.need_sync_host()="
2358  << (importLIDs.need_sync_host () ? "true" : "false")
2359  << "." << endl;
2360  if (verbose) {
2361  std::cerr << os.str ();
2362  }
2363  err << os.str ();
2364  return;
2365  }
2366 
2367  const auto importLIDsHost = importLIDs.view_host ();
2368  const auto numPacketsPerLIDHost = numPacketsPerLID.view_host ();
2369 
2370  // FIXME (mfh 06 Feb 2019) We could fuse the scan with the unpack
2371  // loop, by only unpacking on final==true (when we know the
2372  // current offset's value).
2373 
2374  Kokkos::View<size_t*, host_exec> offset ("offset", numImportLIDs+1);
2375  {
2376  const auto policy = Kokkos::RangePolicy<host_exec>(size_t(0), numImportLIDs+1);
2377  Kokkos::parallel_scan
2378  ("Tpetra::BlockCrsMatrix::unpackAndCombine: offsets", policy,
2379  [=] (const size_t &i, size_t &update, const bool &final) {
2380  if (final) offset(i) = update;
2381  update += (i == numImportLIDs ? 0 : numPacketsPerLIDHost(i));
2382  });
2383  }
2384 
2385  // this variable does not matter with a race condition (just error flag)
2386  //
2387  // NOTE (mfh 06 Feb 2019) CUDA doesn't necessarily like reductions
2388  // or atomics on bool, so we use int instead. (I know we're not
2389  // launching a CUDA loop here, but we could in the future, and I'd
2390  // like to avoid potential trouble.)
2391  Kokkos::View<int, host_exec, Kokkos::MemoryTraits<Kokkos::Atomic> >
2392  errorDuringUnpack ("errorDuringUnpack");
2393  errorDuringUnpack () = 0;
2394  {
2395  using policy_type = Kokkos::TeamPolicy<host_exec>;
2396  size_t scratch_per_row = sizeof(GO) * maxRowNumEnt + sizeof (LO) * maxRowNumEnt + numBytesPerValue * maxRowNumScalarEnt
2397  + 2 * sizeof(GO); // Yeah, this is a fudge factor
2398 
2399  const auto policy = policy_type (numImportLIDs, 1, 1)
2400  .set_scratch_size (0, Kokkos::PerTeam (scratch_per_row));
2401  using host_scratch_space = typename host_exec::scratch_memory_space;
2402 
2403  using pair_type = Kokkos::pair<size_t, size_t>;
2404 
2405  //The following parallel_for modifies values on host while unpacking.
2406  getValuesHostNonConst();
2408  Kokkos::parallel_for
2409  ("Tpetra::BlockCrsMatrix::unpackAndCombine: unpack", policy,
2410  [=] (const typename policy_type::member_type& member) {
2411  const size_t i = member.league_rank();
2412  Kokkos::View<GO*, host_scratch_space> gblColInds
2413  (member.team_scratch (0), maxRowNumEnt);
2414  Kokkos::View<LO*, host_scratch_space> lclColInds
2415  (member.team_scratch (0), maxRowNumEnt);
2416  Kokkos::View<impl_scalar_type*, host_scratch_space> vals
2417  (member.team_scratch (0), maxRowNumScalarEnt);
2418 
2419 
2420  const size_t offval = offset(i);
2421  const LO lclRow = importLIDsHost(i);
2422  const size_t numBytes = numPacketsPerLIDHost(i);
2423  const size_t numEnt =
2424  unpackRowCount<impl_scalar_type, LO, GO>
2425  (imports.view_host (), offval, numBytes, numBytesPerValue);
2426 
2427  if (numBytes > 0) {
2428  if (numEnt > maxRowNumEnt) {
2429  errorDuringUnpack() = 1;
2430  if (verbose) {
2431  std::ostringstream os;
2432  os << prefix
2433  << "At i = " << i << ", numEnt = " << numEnt
2434  << " > maxRowNumEnt = " << maxRowNumEnt
2435  << std::endl;
2436  std::cerr << os.str ();
2437  }
2438  return;
2439  }
2440  }
2441  const size_t numScalarEnt = numEnt*blockSize*blockSize;
2442  auto gidsOut = Kokkos::subview(gblColInds, pair_type(0, numEnt));
2443  auto lidsOut = Kokkos::subview(lclColInds, pair_type(0, numEnt));
2444  auto valsOut = Kokkos::subview(vals, pair_type(0, numScalarEnt));
2445 
2446  // Kyungjoo: additional wrapping scratch view is necessary
2447  // the following function interface need the same execution space
2448  // host scratch space somehow is not considered same as the host_exec
2449  size_t numBytesOut = 0;
2450  try {
2451  numBytesOut =
2452  unpackRowForBlockCrs<impl_scalar_type, LO, GO>
2453  (Kokkos::View<GO*,host_exec>(gidsOut.data(), numEnt),
2454  Kokkos::View<impl_scalar_type*,host_exec>(valsOut.data(), numScalarEnt),
2455  imports.view_host(),
2456  offval, numBytes, numEnt,
2457  numBytesPerValue, blockSize);
2458  }
2459  catch (std::exception& e) {
2460  errorDuringUnpack() = 1;
2461  if (verbose) {
2462  std::ostringstream os;
2463  os << prefix << "At i = " << i << ", unpackRowForBlockCrs threw: "
2464  << e.what () << endl;
2465  std::cerr << os.str ();
2466  }
2467  return;
2468  }
2469 
2470  if (numBytes != numBytesOut) {
2471  errorDuringUnpack() = 1;
2472  if (verbose) {
2473  std::ostringstream os;
2474  os << prefix
2475  << "At i = " << i << ", numBytes = " << numBytes
2476  << " != numBytesOut = " << numBytesOut << "."
2477  << std::endl;
2478  std::cerr << os.str();
2479  }
2480  return;
2481  }
2482 
2483  // Convert incoming global indices to local indices.
2484  for (size_t k = 0; k < numEnt; ++k) {
2485  lidsOut(k) = tgtColMap.getLocalElement (gidsOut(k));
2486  if (lidsOut(k) == Teuchos::OrdinalTraits<LO>::invalid ()) {
2487  if (verbose) {
2488  std::ostringstream os;
2489  os << prefix
2490  << "At i = " << i << ", GID " << gidsOut(k)
2491  << " is not owned by the calling process."
2492  << std::endl;
2493  std::cerr << os.str();
2494  }
2495  return;
2496  }
2497  }
2498 
2499  // Combine the incoming data with the matrix's current data.
2500  LO numCombd = 0;
2501  const LO* const lidsRaw = const_cast<const LO*> (lidsOut.data ());
2502  const Scalar* const valsRaw = reinterpret_cast<const Scalar*>
2503  (const_cast<const impl_scalar_type*> (valsOut.data ()));
2504  if (combineMode == ADD) {
2505  numCombd = this->sumIntoLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2506  } else if (combineMode == INSERT || combineMode == REPLACE) {
2507  numCombd = this->replaceLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2508  } else if (combineMode == ABSMAX) {
2509  numCombd = this->absMaxLocalValues (lclRow, lidsRaw, valsRaw, numEnt);
2510  }
2511 
2512  if (static_cast<LO> (numEnt) != numCombd) {
2513  errorDuringUnpack() = 1;
2514  if (verbose) {
2515  std::ostringstream os;
2516  os << prefix << "At i = " << i << ", numEnt = " << numEnt
2517  << " != numCombd = " << numCombd << "."
2518  << endl;
2519  std::cerr << os.str ();
2520  }
2521  return;
2522  }
2523  }); // for each import LID i
2525  }
2526 
2527  if (errorDuringUnpack () != 0) {
2528  std::ostream& err = this->markLocalErrorAndGetStream ();
2529  err << prefix << "Unpacking failed.";
2530  if (! verbose) {
2531  err << " Please run again with the environment variable setting "
2532  "TPETRA_VERBOSE=1 to get more verbose diagnostic output.";
2533  }
2534  err << endl;
2535  }
2536 
2537  if (verbose) {
2538  std::ostringstream os;
2539  const bool lclSuccess = ! (* (this->localError_));
2540  os << prefix
2541  << (lclSuccess ? "succeeded" : "FAILED")
2542  << std::endl;
2543  std::cerr << os.str ();
2544  }
2545  }
2546 
2547  template<class Scalar, class LO, class GO, class Node>
2548  std::string
2550  {
2551  using Teuchos::TypeNameTraits;
2552  std::ostringstream os;
2553  os << "\"Tpetra::BlockCrsMatrix\": { "
2554  << "Template parameters: { "
2555  << "Scalar: " << TypeNameTraits<Scalar>::name ()
2556  << "LO: " << TypeNameTraits<LO>::name ()
2557  << "GO: " << TypeNameTraits<GO>::name ()
2558  << "Node: " << TypeNameTraits<Node>::name ()
2559  << " }"
2560  << ", Label: \"" << this->getObjectLabel () << "\""
2561  << ", Global dimensions: ["
2562  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2563  << graph_.getRangeMap ()->getGlobalNumElements () << "]"
2564  << ", Block size: " << getBlockSize ()
2565  << " }";
2566  return os.str ();
2567  }
2568 
2569 
2570  template<class Scalar, class LO, class GO, class Node>
2571  void
2573  describe (Teuchos::FancyOStream& out,
2574  const Teuchos::EVerbosityLevel verbLevel) const
2575  {
2576  using Teuchos::ArrayRCP;
2577  using Teuchos::CommRequest;
2578  using Teuchos::FancyOStream;
2579  using Teuchos::getFancyOStream;
2580  using Teuchos::ireceive;
2581  using Teuchos::isend;
2582  using Teuchos::outArg;
2583  using Teuchos::TypeNameTraits;
2584  using Teuchos::VERB_DEFAULT;
2585  using Teuchos::VERB_NONE;
2586  using Teuchos::VERB_LOW;
2587  using Teuchos::VERB_MEDIUM;
2588  // using Teuchos::VERB_HIGH;
2589  using Teuchos::VERB_EXTREME;
2590  using Teuchos::RCP;
2591  using Teuchos::wait;
2592  using std::endl;
2593 
2594  // Set default verbosity if applicable.
2595  const Teuchos::EVerbosityLevel vl =
2596  (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
2597 
2598  if (vl == VERB_NONE) {
2599  return; // print nothing
2600  }
2601 
2602  // describe() always starts with a tab before it prints anything.
2603  Teuchos::OSTab tab0 (out);
2604 
2605  out << "\"Tpetra::BlockCrsMatrix\":" << endl;
2606  Teuchos::OSTab tab1 (out);
2607 
2608  out << "Template parameters:" << endl;
2609  {
2610  Teuchos::OSTab tab2 (out);
2611  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
2612  << "LO: " << TypeNameTraits<LO>::name () << endl
2613  << "GO: " << TypeNameTraits<GO>::name () << endl
2614  << "Node: " << TypeNameTraits<Node>::name () << endl;
2615  }
2616  out << "Label: \"" << this->getObjectLabel () << "\"" << endl
2617  << "Global dimensions: ["
2618  << graph_.getDomainMap ()->getGlobalNumElements () << ", "
2619  << graph_.getRangeMap ()->getGlobalNumElements () << "]" << endl;
2620 
2621  const LO blockSize = getBlockSize ();
2622  out << "Block size: " << blockSize << endl;
2623 
2624  // constituent objects
2625  if (vl >= VERB_MEDIUM) {
2626  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2627  const int myRank = comm.getRank ();
2628  if (myRank == 0) {
2629  out << "Row Map:" << endl;
2630  }
2631  getRowMap()->describe (out, vl);
2632 
2633  if (! getColMap ().is_null ()) {
2634  if (getColMap() == getRowMap()) {
2635  if (myRank == 0) {
2636  out << "Column Map: same as row Map" << endl;
2637  }
2638  }
2639  else {
2640  if (myRank == 0) {
2641  out << "Column Map:" << endl;
2642  }
2643  getColMap ()->describe (out, vl);
2644  }
2645  }
2646  if (! getDomainMap ().is_null ()) {
2647  if (getDomainMap () == getRowMap ()) {
2648  if (myRank == 0) {
2649  out << "Domain Map: same as row Map" << endl;
2650  }
2651  }
2652  else if (getDomainMap () == getColMap ()) {
2653  if (myRank == 0) {
2654  out << "Domain Map: same as column Map" << endl;
2655  }
2656  }
2657  else {
2658  if (myRank == 0) {
2659  out << "Domain Map:" << endl;
2660  }
2661  getDomainMap ()->describe (out, vl);
2662  }
2663  }
2664  if (! getRangeMap ().is_null ()) {
2665  if (getRangeMap () == getDomainMap ()) {
2666  if (myRank == 0) {
2667  out << "Range Map: same as domain Map" << endl;
2668  }
2669  }
2670  else if (getRangeMap () == getRowMap ()) {
2671  if (myRank == 0) {
2672  out << "Range Map: same as row Map" << endl;
2673  }
2674  }
2675  else {
2676  if (myRank == 0) {
2677  out << "Range Map: " << endl;
2678  }
2679  getRangeMap ()->describe (out, vl);
2680  }
2681  }
2682  }
2683 
2684  if (vl >= VERB_EXTREME) {
2685  const Teuchos::Comm<int>& comm = * (graph_.getMap ()->getComm ());
2686  const int myRank = comm.getRank ();
2687  const int numProcs = comm.getSize ();
2688 
2689  // Print the calling process' data to the given output stream.
2690  RCP<std::ostringstream> lclOutStrPtr (new std::ostringstream ());
2691  RCP<FancyOStream> osPtr = getFancyOStream (lclOutStrPtr);
2692  FancyOStream& os = *osPtr;
2693  os << "Process " << myRank << ":" << endl;
2694  Teuchos::OSTab tab2 (os);
2695 
2696  const map_type& meshRowMap = * (graph_.getRowMap ());
2697  const map_type& meshColMap = * (graph_.getColMap ());
2698  for (LO meshLclRow = meshRowMap.getMinLocalIndex ();
2699  meshLclRow <= meshRowMap.getMaxLocalIndex ();
2700  ++meshLclRow) {
2701  const GO meshGblRow = meshRowMap.getGlobalElement (meshLclRow);
2702  os << "Row " << meshGblRow << ": {";
2703 
2704  local_inds_host_view_type lclColInds;
2705  values_host_view_type vals;
2706  LO numInds = 0;
2707  this->getLocalRowView (meshLclRow, lclColInds, vals); numInds = lclColInds.extent(0);
2708 
2709  for (LO k = 0; k < numInds; ++k) {
2710  const GO gblCol = meshColMap.getGlobalElement (lclColInds[k]);
2711 
2712  os << "Col " << gblCol << ": [";
2713  for (LO i = 0; i < blockSize; ++i) {
2714  for (LO j = 0; j < blockSize; ++j) {
2715  os << vals[blockSize*blockSize*k + i*blockSize + j];
2716  if (j + 1 < blockSize) {
2717  os << ", ";
2718  }
2719  }
2720  if (i + 1 < blockSize) {
2721  os << "; ";
2722  }
2723  }
2724  os << "]";
2725  if (k + 1 < numInds) {
2726  os << ", ";
2727  }
2728  }
2729  os << "}" << endl;
2730  }
2731 
2732  // Print data on Process 0. This will automatically respect the
2733  // current indentation level.
2734  if (myRank == 0) {
2735  out << lclOutStrPtr->str ();
2736  lclOutStrPtr = Teuchos::null; // clear it to save space
2737  }
2738 
2739  const int sizeTag = 1337;
2740  const int dataTag = 1338;
2741 
2742  ArrayRCP<char> recvDataBuf; // only used on Process 0
2743 
2744  // Send string sizes and data from each process in turn to
2745  // Process 0, and print on that process.
2746  for (int p = 1; p < numProcs; ++p) {
2747  if (myRank == 0) {
2748  // Receive the incoming string's length.
2749  ArrayRCP<size_t> recvSize (1);
2750  recvSize[0] = 0;
2751  RCP<CommRequest<int> > recvSizeReq =
2752  ireceive<int, size_t> (recvSize, p, sizeTag, comm);
2753  wait<int> (comm, outArg (recvSizeReq));
2754  const size_t numCharsToRecv = recvSize[0];
2755 
2756  // Allocate space for the string to receive. Reuse receive
2757  // buffer space if possible. We can do this because in the
2758  // current implementation, we only have one receive in
2759  // flight at a time. Leave space for the '\0' at the end,
2760  // in case the sender doesn't send it.
2761  if (static_cast<size_t>(recvDataBuf.size()) < numCharsToRecv + 1) {
2762  recvDataBuf.resize (numCharsToRecv + 1);
2763  }
2764  ArrayRCP<char> recvData = recvDataBuf.persistingView (0, numCharsToRecv);
2765  // Post the receive of the actual string data.
2766  RCP<CommRequest<int> > recvDataReq =
2767  ireceive<int, char> (recvData, p, dataTag, comm);
2768  wait<int> (comm, outArg (recvDataReq));
2769 
2770  // Print the received data. This will respect the current
2771  // indentation level. Make sure that the string is
2772  // null-terminated.
2773  recvDataBuf[numCharsToRecv] = '\0';
2774  out << recvDataBuf.getRawPtr ();
2775  }
2776  else if (myRank == p) { // if I am not Process 0, and my rank is p
2777  // This deep-copies the string at most twice, depending on
2778  // whether std::string reference counts internally (it
2779  // generally does, so this won't deep-copy at all).
2780  const std::string stringToSend = lclOutStrPtr->str ();
2781  lclOutStrPtr = Teuchos::null; // clear original to save space
2782 
2783  // Send the string's length to Process 0.
2784  const size_t numCharsToSend = stringToSend.size ();
2785  ArrayRCP<size_t> sendSize (1);
2786  sendSize[0] = numCharsToSend;
2787  RCP<CommRequest<int> > sendSizeReq =
2788  isend<int, size_t> (sendSize, 0, sizeTag, comm);
2789  wait<int> (comm, outArg (sendSizeReq));
2790 
2791  // Send the actual string to Process 0. We know that the
2792  // string has length > 0, so it's save to take the address
2793  // of the first entry. Make a nonowning ArrayRCP to hold
2794  // the string. Process 0 will add a null termination
2795  // character at the end of the string, after it receives the
2796  // message.
2797  ArrayRCP<const char> sendData (&stringToSend[0], 0, numCharsToSend, false);
2798  RCP<CommRequest<int> > sendDataReq =
2799  isend<int, char> (sendData, 0, dataTag, comm);
2800  wait<int> (comm, outArg (sendDataReq));
2801  }
2802  } // for each process rank p other than 0
2803  } // extreme verbosity level (print the whole matrix)
2804  }
2805 
2806  template<class Scalar, class LO, class GO, class Node>
2807  Teuchos::RCP<const Teuchos::Comm<int> >
2809  getComm() const
2810  {
2811  return graph_.getComm();
2812  }
2813 
2814 
2815  template<class Scalar, class LO, class GO, class Node>
2819  {
2820  return graph_.getGlobalNumCols();
2821  }
2822 
2823 
2824  template<class Scalar, class LO, class GO, class Node>
2825  size_t
2828  {
2829  return graph_.getLocalNumCols();
2830  }
2831 
2832  template<class Scalar, class LO, class GO, class Node>
2833  GO
2836  {
2837  return graph_.getIndexBase();
2838  }
2839 
2840  template<class Scalar, class LO, class GO, class Node>
2844  {
2845  return graph_.getGlobalNumEntries();
2846  }
2847 
2848  template<class Scalar, class LO, class GO, class Node>
2849  size_t
2852  {
2853  return graph_.getLocalNumEntries();
2854  }
2855 
2856  template<class Scalar, class LO, class GO, class Node>
2857  size_t
2859  getNumEntriesInGlobalRow (GO globalRow) const
2860  {
2861  return graph_.getNumEntriesInGlobalRow(globalRow);
2862  }
2863 
2864 
2865  template<class Scalar, class LO, class GO, class Node>
2866  size_t
2869  {
2870  return graph_.getGlobalMaxNumRowEntries();
2871  }
2872 
2873  template<class Scalar, class LO, class GO, class Node>
2874  bool
2876  hasColMap() const
2877  {
2878  return graph_.hasColMap();
2879  }
2880 
2881 
2882  template<class Scalar, class LO, class GO, class Node>
2883  bool
2886  {
2887  return graph_.isLocallyIndexed();
2888  }
2889 
2890  template<class Scalar, class LO, class GO, class Node>
2891  bool
2894  {
2895  return graph_.isGloballyIndexed();
2896  }
2897 
2898  template<class Scalar, class LO, class GO, class Node>
2899  bool
2902  {
2903  return graph_.isFillComplete ();
2904  }
2905 
2906  template<class Scalar, class LO, class GO, class Node>
2907  bool
2910  {
2911  return false;
2912  }
2913 
2914  template<class Scalar, class LO, class GO, class Node>
2915  void
2917  getGlobalRowCopy (GO /*GlobalRow*/,
2918  nonconst_global_inds_host_view_type &/*Indices*/,
2919  nonconst_values_host_view_type &/*Values*/,
2920  size_t& /*NumEntries*/) const
2921  {
2922  TEUCHOS_TEST_FOR_EXCEPTION(
2923  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowCopy: "
2924  "This class doesn't support global matrix indexing.");
2925 
2926  }
2927 
2928  template<class Scalar, class LO, class GO, class Node>
2929  void
2931  getGlobalRowView (GO /* GlobalRow */,
2932  global_inds_host_view_type &/* indices */,
2933  values_host_view_type &/* values */) const
2934  {
2935  TEUCHOS_TEST_FOR_EXCEPTION(
2936  true, std::logic_error, "Tpetra::BlockCrsMatrix::getGlobalRowView: "
2937  "This class doesn't support global matrix indexing.");
2938 
2939  }
2940 
2941  template<class Scalar, class LO, class GO, class Node>
2942  void
2944  getLocalRowView (LO localRowInd,
2945  local_inds_host_view_type &colInds,
2946  values_host_view_type &vals) const
2947  {
2948  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2949  colInds = local_inds_host_view_type();
2950  vals = values_host_view_type();
2951  }
2952  else {
2953  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2954  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2955  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
2956 
2957  vals = getValuesHost (localRowInd);
2958  }
2959  }
2960 
2961  template<class Scalar, class LO, class GO, class Node>
2962  void
2965  local_inds_host_view_type &colInds,
2966  nonconst_values_host_view_type &vals) const
2967  {
2968  if (! rowMeshMap_.isNodeLocalElement (localRowInd)) {
2969  colInds = nonconst_local_inds_host_view_type();
2970  vals = nonconst_values_host_view_type();
2971  }
2972  else {
2973  const size_t absBlockOffsetStart = ptrHost_[localRowInd];
2974  const size_t numInds = ptrHost_[localRowInd + 1] - absBlockOffsetStart;
2975  colInds = local_inds_host_view_type(indHost_.data()+absBlockOffsetStart, numInds);
2976 
2977  using this_BCRS_type = BlockCrsMatrix<Scalar, LO, GO, Node>;
2978  vals = const_cast<this_BCRS_type&>(*this).getValuesHostNonConst(localRowInd);
2979  }
2980  }
2981 
2982  template<class Scalar, class LO, class GO, class Node>
2983  void
2986  {
2987  const size_t lclNumMeshRows = graph_.getLocalNumRows ();
2988 
2989  Kokkos::View<size_t*, device_type> diagOffsets ("diagOffsets", lclNumMeshRows);
2990  graph_.getLocalDiagOffsets (diagOffsets);
2991 
2992  // The code below works on host, so use a host View.
2993  auto diagOffsetsHost = Kokkos::create_mirror_view (diagOffsets);
2994  // DEEP_COPY REVIEW - DEVICE-TO-HOSTMIRROR
2995  Kokkos::deep_copy (execution_space(), diagOffsetsHost, diagOffsets);
2996 
2997  auto vals_host_out = val_.getHostView(Access::ReadOnly);
2998  const impl_scalar_type* vals_host_out_raw = vals_host_out.data();
2999 
3000  // TODO amk: This is a temporary measure to make the code run with Ifpack2
3001  size_t rowOffset = 0;
3002  size_t offset = 0;
3003  LO bs = getBlockSize();
3004  for(size_t r=0; r<getLocalNumRows(); r++)
3005  {
3006  // move pointer to start of diagonal block
3007  offset = rowOffset + diagOffsetsHost(r)*bs*bs;
3008  for(int b=0; b<bs; b++)
3009  {
3010  diag.replaceLocalValue(r*bs+b, vals_host_out_raw[offset+b*(bs+1)]);
3011  }
3012  // move pointer to start of next block row
3013  rowOffset += getNumEntriesInLocalRow(r)*bs*bs;
3014  }
3015 
3016  //diag.template sync<memory_space> (); // sync vec of diag entries back to dev
3017  }
3018 
3019  template<class Scalar, class LO, class GO, class Node>
3020  void
3022  leftScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3023  {
3024  TEUCHOS_TEST_FOR_EXCEPTION(
3025  true, std::logic_error, "Tpetra::BlockCrsMatrix::leftScale: "
3026  "not implemented.");
3027 
3028  }
3029 
3030  template<class Scalar, class LO, class GO, class Node>
3031  void
3033  rightScale (const ::Tpetra::Vector<Scalar, LO, GO, Node>& /* x */)
3034  {
3035  TEUCHOS_TEST_FOR_EXCEPTION(
3036  true, std::logic_error, "Tpetra::BlockCrsMatrix::rightScale: "
3037  "not implemented.");
3038 
3039  }
3040 
3041  template<class Scalar, class LO, class GO, class Node>
3042  Teuchos::RCP<const ::Tpetra::RowGraph<LO, GO, Node> >
3044  getGraph() const
3045  {
3046  return graphRCP_;
3047  }
3048 
3049  template<class Scalar, class LO, class GO, class Node>
3050  typename ::Tpetra::RowMatrix<Scalar, LO, GO, Node>::mag_type
3053  {
3054  TEUCHOS_TEST_FOR_EXCEPTION(
3055  true, std::logic_error, "Tpetra::BlockCrsMatrix::getFrobeniusNorm: "
3056  "not implemented.");
3057  }
3058 
3059 } // namespace Tpetra
3060 
3061 //
3062 // Explicit instantiation macro
3063 //
3064 // Must be expanded from within the Tpetra namespace!
3065 //
3066 #define TPETRA_BLOCKCRSMATRIX_INSTANT(S,LO,GO,NODE) \
3067  template class BlockCrsMatrix< S, LO, GO, NODE >;
3068 
3069 #endif // TPETRA_BLOCKCRSMATRIX_DEF_HPP
virtual void getLocalRowCopy(LO LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Not implemented.
size_t getNumEntriesInLocalRow(const LO localRowInd) const override
Return the number of entries in the given row on the calling process.
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
virtual size_t getLocalNumEntries() const override
The local number of stored (structurally nonzero) entries.
virtual global_size_t getGlobalNumCols() const override
The global number of columns of this matrix.
typename BMV::impl_scalar_type impl_scalar_type
The implementation type of entries in the matrix.
void getLocalDiagCopy(const Kokkos::View< impl_scalar_type ***, device_type, Kokkos::MemoryUnmanaged > &diag, const Kokkos::View< const size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel) const override
Print a description of this object to the given output stream.
Kokkos::View< impl_scalar_type **, Impl::BlockCrsMatrixLittleBlockArrayLayout, device_type, Kokkos::MemoryTraits< Kokkos::Unmanaged > > little_block_type
The type used to access nonconst matrix blocks.
virtual typename::Tpetra::RowMatrix< Scalar, LO, GO, Node >::mag_type getFrobeniusNorm() const override
The Frobenius norm of the matrix.
LO replaceLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Replace values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual void leftScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the left with the given Vector x.
Teuchos::RCP< const map_type > getColMap() const override
Returns the Map that describes the column distribution in this graph.
LO replaceLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like replaceLocalValues, but avoids computing row offsets.
std::string description() const override
One-line description of this object.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
typename DistObject< Scalar, LO, GO, Node >::buffer_device_type buffer_device_type
Kokkos::Device specialization for communication buffers.
void exportAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Export< LO, GO, Node > &exporter) const
Import from this to the given destination matrix, and make the result fill complete.
virtual bool hasColMap() const override
Whether this matrix has a well-defined column Map.
virtual bool isLocallyIndexed() const override
Whether matrix indices are locally indexed.
LO local_ordinal_type
The type of local indices.
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void getLocalRowViewNonConst(LO LocalRow, local_inds_host_view_type &indices, nonconst_values_host_view_type &values) const
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
One or more distributed dense vectors.
Kokkos::StaticCrsGraph< local_ordinal_type, Kokkos::LayoutLeft, device_type, void, size_t > local_graph_device_type
The type of the part of the sparse graph on each MPI process.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
Linear algebra kernels for small dense matrices and vectors.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
Scalar scalar_type
The type of entries in the matrix (that is, of each entry in each block).
KOKKOS_INLINE_FUNCTION void AXPY(const CoefficientType &alpha, const ViewType1 &x, const ViewType2 &y)
y := y + alpha * x (dense vector or matrix update)
size_t getLocalNumRows() const override
get the local number of block rows
virtual Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which this matrix is distributed.
Kokkos::LayoutRight BlockCrsMatrixLittleBlockArrayLayout
give an option to use layoutleft
virtual void rightScale(const ::Tpetra::Vector< Scalar, LO, GO, Node > &x) override
Scale the RowMatrix on the right with the given Vector x.
void getLocalRowView(LO LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a view of the (mesh, i.e., block) row, using local (mesh, i.e., block) indices.
static map_type makePointMap(const map_type &meshMap, const LO blockSize)
Create and return the point Map corresponding to the given mesh Map and block size.
local_ordinal_type getMinLocalIndex() const
The minimum local index.
MultiVector for multiple degrees of freedom per mesh point.
virtual void copyAndPermute(const SrcDistObject &sourceObj, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
KOKKOS_INLINE_FUNCTION void absMax(const ViewType2 &Y, const ViewType1 &X)
Implementation of Tpetra&#39;s ABSMAX CombineMode for the small dense blocks in BlockCrsMatrix, and the small dense vectors in BlockMultiVector and BlockVector.
Teuchos::RCP< const map_type > getColMap() const override
get the (mesh) map for the columns of this block matrix.
virtual size_t getGlobalMaxNumRowEntries() const override
The maximum number of entries in any row over all processes in the matrix&#39;s communicator.
size_t global_size_t
Global size_t object.
virtual void getGlobalRowView(GO GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting, globally indexed view of the given row of the matrix.
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
BlockCrsMatrix()
Default constructor: Makes an empty block matrix.
LO absMaxLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Variant of getLocalDiagCopy() that uses precomputed offsets and puts diagonal blocks in a 3-D Kokkos:...
LO getBlockSize() const
Get the number of degrees of freedom per mesh point.
Teuchos::RCP< const map_type > getDomainMap() const override
Get the (point) domain Map of this matrix.
Teuchos::RCP< const map_type > getRowMap() const override
get the (mesh) map for the rows of this block matrix.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don&#39;t currently exist.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
LO sumIntoLocalValuesByOffsets(const LO localRowInd, const ptrdiff_t offsets[], const Scalar vals[], const LO numOffsets) const
Like sumIntoLocalValues, but avoids computing row offsets.
global_size_t getGlobalNumRows() const override
get the global number of block rows
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
void replaceLocalValue(const LocalOrdinal myRow, const Scalar &value)
Replace current value at the specified location with specified values.
local_ordinal_type getMaxLocalIndex() const
The maximum local index on the calling process.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
virtual Teuchos::RCP< const ::Tpetra::RowGraph< LO, GO, Node > > getGraph() const override
Get the (mesh) graph.
virtual GO getIndexBase() const override
The index base for global indices in this matrix.
void applyBlock(const BlockMultiVector< Scalar, LO, GO, Node > &X, BlockMultiVector< Scalar, LO, GO, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar beta=Teuchos::ScalarTraits< Scalar >::zero())
Version of apply() that takes BlockMultiVector input and output.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
virtual bool isGloballyIndexed() const override
Whether matrix indices are globally indexed.
LO getLocalRowOffsets(const LO localRowInd, ptrdiff_t offsets[], const LO colInds[], const LO numColInds) const
Get relative offsets corresponding to the given rows, given by local row index.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
virtual global_size_t getGlobalNumEntries() const override
The global number of stored (structurally nonzero) entries.
Replace old value with maximum of magnitudes of old and new values.
void apply(const mv_type &X, mv_type &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
For this matrix A, compute Y := beta * Y + alpha * Op(A) * X.
Replace existing values with new values.
device_type::execution_space execution_space
The Kokkos execution space that this class uses.
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
void enableWDVTracking()
Enable WrappedDualView reference-count tracking and syncing. Call this after exiting a host-parallel ...
Replace old values with zero.
Teuchos::RCP< const map_type > getRangeMap() const override
Get the (point) range Map of this matrix.
KOKKOS_INLINE_FUNCTION void COPY(const ViewType1 &x, const ViewType2 &y)
Deep copy x into y, where x and y are either rank 1 (vectors) or rank 2 (matrices) with the same dime...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
void getLocalDiagOffsets(const Kokkos::View< size_t *, device_type, Kokkos::MemoryUnmanaged > &offsets) const
Get offsets of the diagonal entries in the matrix.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
char packet_type
Implementation detail; tells.
local_matrix_device_type getLocalMatrixDevice() const
void disableWDVTracking()
Disable WrappedDualView reference-count tracking and syncing. Call this before entering a host-parall...
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
A distributed dense vector.
virtual bool supportsRowViews() const override
Whether this object implements getLocalRowView() and getGlobalRowView().
virtual void packAndPrepare(const SrcDistObject &sourceObj, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< packet_type *, buffer_device_type > &exports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, size_t &constantNumPackets) override
virtual void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< packet_type *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode) override
virtual void getGlobalRowCopy(GO GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Get a copy of the given global row&#39;s entries.
bool locallySameAs(const Map< local_ordinal_type, global_ordinal_type, node_type > &map) const
Is this Map locally the same as the input Map?
virtual bool isFillComplete() const override
Whether fillComplete() has been called.
impl_scalar_type_dualview::t_host getValuesHostNonConst() const
Get the host or device View of the matrix&#39;s values (val_).
Base class for distributed Tpetra objects that support data redistribution.
LO sumIntoLocalValues(const LO localRowInd, const LO colInds[], const Scalar vals[], const LO numColInds) const
Sum into values at the given (mesh, i.e., block) column indices, in the given (mesh, i.e., block) row.
virtual size_t getLocalNumCols() const override
The number of columns needed to apply the forward operator on this node.
void importAndFillComplete(Teuchos::RCP< BlockCrsMatrix< Scalar, LO, GO, Node > > &destMatrix, const Import< LO, GO, Node > &importer) const
Import from this to the given destination matrix, and make the result fill complete.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
virtual size_t getNumEntriesInGlobalRow(GO globalRow) const override
The current number of entries on the calling process in the specified global row. ...