Ifpack2 Templated Preconditioning Package  Version 1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Ifpack2_Experimental_RBILUK_def.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_EXPERIMENTAL_CRSRBILUK_DEF_HPP
11 #define IFPACK2_EXPERIMENTAL_CRSRBILUK_DEF_HPP
12 
13 #include "Tpetra_BlockMultiVector.hpp"
14 #include "Tpetra_BlockView.hpp"
15 #include "Tpetra_BlockCrsMatrix_Helpers_decl.hpp"
16 #include "Ifpack2_OverlappingRowMatrix.hpp"
17 #include "Ifpack2_Details_getCrsMatrix.hpp"
18 #include "Ifpack2_LocalFilter.hpp"
19 #include "Ifpack2_Utilities.hpp"
20 #include "Ifpack2_RILUK.hpp"
21 #include "KokkosSparse_sptrsv.hpp"
22 
23 //#define IFPACK2_RBILUK_INITIAL
24 //#define IFPACK2_RBILUK_INITIAL_NOKK
25 
26 #ifndef IFPACK2_RBILUK_INITIAL_NOKK
27 #include "KokkosBatched_Gemm_Decl.hpp"
28 #include "KokkosBatched_Gemm_Serial_Impl.hpp"
29 #include "KokkosBatched_Util.hpp"
30 #endif
31 
32 namespace Ifpack2 {
33 
34 namespace Experimental {
35 
36 namespace
37 {
38 template<class MatrixType>
39 struct LocalRowHandler
40 {
41  using LocalOrdinal = typename MatrixType::local_ordinal_type;
42  using row_matrix_type = Tpetra::RowMatrix<
43  typename MatrixType::scalar_type,
44  LocalOrdinal,
45  typename MatrixType::global_ordinal_type,
46  typename MatrixType::node_type>;
47  using inds_type = typename row_matrix_type::local_inds_host_view_type;
48  using vals_type = typename row_matrix_type::values_host_view_type;
49 
50  LocalRowHandler(Teuchos::RCP<const row_matrix_type> A)
51  : A_(std::move(A))
52  {
53  if (!A_->supportsRowViews())
54  {
55  const auto maxNumRowEntr = A_->getLocalMaxNumRowEntries();
56  const auto blockSize = A_->getBlockSize();
57  ind_nc_ = inds_type_nc("Ifpack2::RBILUK::LocalRowHandler::indices",maxNumRowEntr);
58  val_nc_ = vals_type_nc("Ifpack2::RBILUK::LocalRowHandler::values",maxNumRowEntr*blockSize*blockSize);
59  }
60  }
61 
62  void getLocalRow(LocalOrdinal local_row, inds_type & InI, vals_type & InV, LocalOrdinal & NumIn)
63  {
64  if (A_->supportsRowViews())
65  {
66  A_->getLocalRowView(local_row,InI,InV);
67  NumIn = (LocalOrdinal)InI.size();
68  }
69  else
70  {
71  size_t cnt;
72  A_->getLocalRowCopy(local_row,ind_nc_,val_nc_,cnt);
73  InI = ind_nc_;
74  InV = val_nc_;
75  NumIn = (LocalOrdinal)cnt;
76  }
77  }
78 
79 private:
80 
81  using inds_type_nc = typename row_matrix_type::nonconst_local_inds_host_view_type;
82  using vals_type_nc = typename row_matrix_type::nonconst_values_host_view_type;
83 
85  inds_type_nc ind_nc_;
86  vals_type_nc val_nc_;
87 };
88 
89 } // namespace
90 
91 template<class MatrixType>
93  : RILUK<row_matrix_type>(Teuchos::rcp_dynamic_cast<const row_matrix_type>(Matrix_in) )
94 {}
95 
96 template<class MatrixType>
98  : RILUK<row_matrix_type>(Teuchos::rcp_dynamic_cast<const row_matrix_type>(Matrix_in) )
99 {}
100 
101 
102 template<class MatrixType>
104 
105 
106 template<class MatrixType>
107 void
109 {
110  // FIXME (mfh 04 Nov 2015) What about A_? When does that get (re)set?
111 
112  // It's legal for A to be null; in that case, you may not call
113  // initialize() until calling setMatrix() with a nonnull input.
114  // Regardless, setting the matrix invalidates any previous
115  // factorization.
116  if (A.getRawPtr () != this->A_.getRawPtr ())
117  {
118  this->isAllocated_ = false;
119  this->isInitialized_ = false;
120  this->isComputed_ = false;
121  this->Graph_ = Teuchos::null;
122  L_block_ = Teuchos::null;
123  U_block_ = Teuchos::null;
124  D_block_ = Teuchos::null;
125  }
126 }
127 
128 
129 
130 template<class MatrixType>
131 const typename RBILUK<MatrixType>::block_crs_matrix_type&
133 {
135  L_block_.is_null (), std::runtime_error, "Ifpack2::RILUK::getL: The L factor "
136  "is null. Please call initialize() (and preferably also compute()) "
137  "before calling this method. If the input matrix has not yet been set, "
138  "you must first call setMatrix() with a nonnull input matrix before you "
139  "may call initialize() or compute().");
140  return *L_block_;
141 }
142 
143 
144 template<class MatrixType>
145 const typename RBILUK<MatrixType>::block_crs_matrix_type&
147 {
149  D_block_.is_null (), std::runtime_error, "Ifpack2::RILUK::getD: The D factor "
150  "(of diagonal entries) is null. Please call initialize() (and "
151  "preferably also compute()) before calling this method. If the input "
152  "matrix has not yet been set, you must first call setMatrix() with a "
153  "nonnull input matrix before you may call initialize() or compute().");
154  return *D_block_;
155 }
156 
157 
158 template<class MatrixType>
159 const typename RBILUK<MatrixType>::block_crs_matrix_type&
161 {
163  U_block_.is_null (), std::runtime_error, "Ifpack2::RILUK::getU: The U factor "
164  "is null. Please call initialize() (and preferably also compute()) "
165  "before calling this method. If the input matrix has not yet been set, "
166  "you must first call setMatrix() with a nonnull input matrix before you "
167  "may call initialize() or compute().");
168  return *U_block_;
169 }
170 
171 template<class MatrixType>
173 {
174  using Teuchos::null;
175  using Teuchos::rcp;
176 
177  if (! this->isAllocated_) {
178  // Deallocate any existing storage. This avoids storing 2x
179  // memory, since RCP op= does not deallocate until after the
180  // assignment.
181  this->L_ = null;
182  this->U_ = null;
183  this->D_ = null;
184  L_block_ = null;
185  U_block_ = null;
186  D_block_ = null;
187 
188  // Allocate Matrix using ILUK graphs
189  L_block_ = rcp(new block_crs_matrix_type(*this->Graph_->getL_Graph (), blockSize_) );
190  U_block_ = rcp(new block_crs_matrix_type(*this->Graph_->getU_Graph (), blockSize_) );
191  D_block_ = rcp(new block_crs_matrix_type(*(Ifpack2::Details::computeDiagonalGraph(*(this->Graph_->getOverlapGraph()))),
192  blockSize_) );
193  L_block_->setAllToScalar (STM::zero ()); // Zero out L and U matrices
194  U_block_->setAllToScalar (STM::zero ());
195  D_block_->setAllToScalar (STM::zero ());
196 
197  // Allocate temp space for apply
198  if (this->isKokkosKernelsSpiluk_) {
199  const auto numRows = L_block_->getLocalNumRows();
200  tmp_ = decltype(tmp_)("RBILUK::tmp_", numRows * blockSize_);
201  }
202  }
203  this->isAllocated_ = true;
204 }
205 
206 namespace
207 {
208 
209 template<class MatrixType>
211 getBlockCrsGraph(const Teuchos::RCP<const typename RBILUK<MatrixType>::row_matrix_type>& A)
212 {
213  using local_ordinal_type = typename MatrixType::local_ordinal_type;
214  using Teuchos::RCP;
215  using Teuchos::rcp;
216  using Teuchos::rcp_dynamic_cast;
217  using Teuchos::rcp_const_cast;
218  using Teuchos::rcpFromRef;
219  using row_matrix_type = typename RBILUK<MatrixType>::row_matrix_type;
220  using crs_graph_type = typename RBILUK<MatrixType>::crs_graph_type;
221  using block_crs_matrix_type = typename RBILUK<MatrixType>::block_crs_matrix_type;
222 
223  auto A_local = RBILUK<MatrixType>::makeLocalFilter(A);
224 
225  {
226  RCP<const LocalFilter<row_matrix_type> > filteredA =
227  rcp_dynamic_cast<const LocalFilter<row_matrix_type> >(A_local);
228  RCP<const OverlappingRowMatrix<row_matrix_type> > overlappedA = Teuchos::null;
229  RCP<const block_crs_matrix_type > A_block = Teuchos::null;
230  if (!filteredA.is_null ())
231  {
232  overlappedA = rcp_dynamic_cast<const OverlappingRowMatrix<row_matrix_type> > (filteredA->getUnderlyingMatrix ());
233  }
234 
235  if (! overlappedA.is_null ()) {
236  A_block = rcp_dynamic_cast<const block_crs_matrix_type>(overlappedA->getUnderlyingMatrix());
237  }
238  else if (!filteredA.is_null ()){
239  //If there is no overlap, filteredA could be the block CRS matrix
240  A_block = rcp_dynamic_cast<const block_crs_matrix_type>(filteredA->getUnderlyingMatrix());
241  }
242  else
243  {
244  A_block = rcp_dynamic_cast<const block_crs_matrix_type>(A_local);
245  }
246 
247  if (!A_block.is_null()){
248  return rcpFromRef(A_block->getCrsGraph());
249  }
250  }
251 
252  // Could not extract block crs, make graph manually
253  {
254  local_ordinal_type numRows = A_local->getLocalNumRows();
255  Teuchos::Array<size_t> entriesPerRow(numRows);
256  for(local_ordinal_type i = 0; i < numRows; i++) {
257  entriesPerRow[i] = A_local->getNumEntriesInLocalRow(i);
258  }
259  RCP<crs_graph_type> A_local_crs_nc =
260  rcp (new crs_graph_type (A_local->getRowMap (),
261  A_local->getColMap (),
262  entriesPerRow()));
263 
264  {
265  using LocalRowHandler_t = LocalRowHandler<MatrixType>;
266  LocalRowHandler_t localRowHandler(A_local);
267  typename LocalRowHandler_t::inds_type indices;
268  typename LocalRowHandler_t::vals_type values;
269  for(local_ordinal_type i = 0; i < numRows; i++) {
270  local_ordinal_type numEntries = 0;
271  localRowHandler.getLocalRow(i, indices, values, numEntries);
272  A_local_crs_nc->insertLocalIndices(i, numEntries,indices.data());
273  }
274  }
275 
276  A_local_crs_nc->fillComplete (A_local->getDomainMap (), A_local->getRangeMap ());
277  return rcp_const_cast<const crs_graph_type> (A_local_crs_nc);
278  }
279 
280 }
281 
282 
283 } // namespace
284 
285 template<class MatrixType>
287 {
288  using Teuchos::RCP;
289  using Teuchos::rcp;
290  using Teuchos::rcp_dynamic_cast;
291  const char prefix[] = "Ifpack2::Experimental::RBILUK::initialize: ";
292 
294  (this->A_.is_null (), std::runtime_error, prefix << "The matrix (A_, the "
295  "RowMatrix) is null. Please call setMatrix() with a nonnull input "
296  "before calling this method.");
298  (! this->A_->isFillComplete (), std::runtime_error, prefix << "The matrix "
299  "(A_, the BlockCrsMatrix) is not fill complete. You may not invoke "
300  "initialize() or compute() with this matrix until the matrix is fill "
301  "complete. Note: BlockCrsMatrix is fill complete if and only if its "
302  "underlying graph is fill complete.");
303 
304  blockSize_ = this->A_->getBlockSize();
305  this->A_local_ = this->makeLocalFilter(this->A_);
306 
307  Teuchos::Time timer ("RBILUK::initialize");
308  double startTime = timer.wallTime();
309  { // Start timing
310  Teuchos::TimeMonitor timeMon (timer);
311 
312  // Calling initialize() means that the user asserts that the graph
313  // of the sparse matrix may have changed. We must not just reuse
314  // the previous graph in that case.
315  //
316  // Regarding setting isAllocated_ to false: Eventually, we may want
317  // some kind of clever memory reuse strategy, but it's always
318  // correct just to blow everything away and start over.
319  this->isInitialized_ = false;
320  this->isAllocated_ = false;
321  this->isComputed_ = false;
322  this->Graph_ = Teuchos::null;
323 
324  RCP<const crs_graph_type> matrixCrsGraph = getBlockCrsGraph<MatrixType>(this->A_);
325  this->Graph_ = rcp (new Ifpack2::IlukGraph<crs_graph_type,kk_handle_type> (matrixCrsGraph,
326  this->LevelOfFill_, 0));
327 
328  if (this->isKokkosKernelsSpiluk_) {
329  this->KernelHandle_ = Teuchos::rcp (new kk_handle_type ());
330  const auto numRows = this->A_local_->getLocalNumRows();
331  KernelHandle_->create_spiluk_handle( KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,
332  numRows,
333  2*this->A_local_->getLocalNumEntries()*(this->LevelOfFill_+1),
334  2*this->A_local_->getLocalNumEntries()*(this->LevelOfFill_+1),
335  blockSize_);
336  this->Graph_->initialize(KernelHandle_); // this calls spiluk_symbolic
337 
338  this->L_Sptrsv_KernelHandle_ = Teuchos::rcp (new kk_handle_type ());
339  this->U_Sptrsv_KernelHandle_ = Teuchos::rcp (new kk_handle_type ());
340 
341  KokkosSparse::Experimental::SPTRSVAlgorithm alg = KokkosSparse::Experimental::SPTRSVAlgorithm::SEQLVLSCHD_TP1;
342 
343  this->L_Sptrsv_KernelHandle_->create_sptrsv_handle(alg, numRows, true /*lower*/, blockSize_);
344  this->U_Sptrsv_KernelHandle_->create_sptrsv_handle(alg, numRows, false /*upper*/, blockSize_);
345  }
346  else {
347  this->Graph_->initialize ();
348  }
349 
350  allocate_L_and_U_blocks ();
351 
352 #ifdef IFPACK2_RBILUK_INITIAL
353  initAllValues ();
354 #endif
355  } // Stop timing
356 
357  this->isInitialized_ = true;
358  this->numInitialize_ += 1;
359  this->initializeTime_ += (timer.wallTime() - startTime);
360 }
361 
362 
363 template<class MatrixType>
364 void
366 initAllValues ()
367 {
368  using Teuchos::RCP;
369  typedef Tpetra::Map<LO,GO,node_type> map_type;
370 
371  LO NumIn = 0, NumL = 0, NumU = 0;
372  bool DiagFound = false;
373  size_t NumNonzeroDiags = 0;
374  size_t MaxNumEntries = this->A_->getLocalMaxNumRowEntries();
375  LO blockMatSize = blockSize_*blockSize_;
376 
377  // First check that the local row map ordering is the same as the local portion of the column map.
378  // The extraction of the strictly lower/upper parts of A, as well as the factorization,
379  // implicitly assume that this is the case.
380  Teuchos::ArrayView<const GO> rowGIDs = this->A_->getRowMap()->getLocalElementList();
381  Teuchos::ArrayView<const GO> colGIDs = this->A_->getColMap()->getLocalElementList();
382  bool gidsAreConsistentlyOrdered=true;
383  GO indexOfInconsistentGID=0;
384  for (GO i=0; i<rowGIDs.size(); ++i) {
385  if (rowGIDs[i] != colGIDs[i]) {
386  gidsAreConsistentlyOrdered=false;
387  indexOfInconsistentGID=i;
388  break;
389  }
390  }
391  TEUCHOS_TEST_FOR_EXCEPTION(gidsAreConsistentlyOrdered==false, std::runtime_error,
392  "The ordering of the local GIDs in the row and column maps is not the same"
393  << std::endl << "at index " << indexOfInconsistentGID
394  << ". Consistency is required, as all calculations are done with"
395  << std::endl << "local indexing.");
396 
397  // Allocate temporary space for extracting the strictly
398  // lower and upper parts of the matrix A.
399  Teuchos::Array<LO> LI(MaxNumEntries);
400  Teuchos::Array<LO> UI(MaxNumEntries);
401  Teuchos::Array<scalar_type> LV(MaxNumEntries*blockMatSize);
402  Teuchos::Array<scalar_type> UV(MaxNumEntries*blockMatSize);
403 
404  Teuchos::Array<scalar_type> diagValues(blockMatSize);
405 
406  L_block_->setAllToScalar (STM::zero ()); // Zero out L and U matrices
407  U_block_->setAllToScalar (STM::zero ());
408  D_block_->setAllToScalar (STM::zero ()); // Set diagonal values to zero
409 
410  // NOTE (mfh 27 May 2016) The factorization below occurs entirely on
411  // host, so sync to host first. The const_cast is unfortunate but
412  // is our only option to make this correct.
413 
414  /*
415  const_cast<block_crs_matrix_type&> (A).sync_host ();
416  L_block_->sync_host ();
417  U_block_->sync_host ();
418  D_block_->sync_host ();
419  // NOTE (mfh 27 May 2016) We're modifying L, U, and D on host.
420  L_block_->modify_host ();
421  U_block_->modify_host ();
422  D_block_->modify_host ();
423  */
424 
425  RCP<const map_type> rowMap = L_block_->getRowMap ();
426 
427  // First we copy the user's matrix into L and U, regardless of fill level.
428  // It is important to note that L and U are populated using local indices.
429  // This means that if the row map GIDs are not monotonically increasing
430  // (i.e., permuted or gappy), then the strictly lower (upper) part of the
431  // matrix is not the one that you would get if you based L (U) on GIDs.
432  // This is ok, as the *order* of the GIDs in the rowmap is a better
433  // expression of the user's intent than the GIDs themselves.
434 
435  //TODO BMK: Revisit this fence when BlockCrsMatrix is refactored.
436  Kokkos::fence();
437  using LocalRowHandler_t = LocalRowHandler<MatrixType>;
438  LocalRowHandler_t localRowHandler(this->A_);
439  typename LocalRowHandler_t::inds_type InI;
440  typename LocalRowHandler_t::vals_type InV;
441  for (size_t myRow=0; myRow<this->A_->getLocalNumRows(); ++myRow) {
442  LO local_row = myRow;
443 
444  localRowHandler.getLocalRow(local_row, InI, InV, NumIn);
445 
446  // Split into L and U (we don't assume that indices are ordered).
447  NumL = 0;
448  NumU = 0;
449  DiagFound = false;
450 
451  for (LO j = 0; j < NumIn; ++j) {
452  const LO k = InI[j];
453  const LO blockOffset = blockMatSize*j;
454 
455  if (k == local_row) {
456  DiagFound = true;
457  // Store perturbed diagonal in Tpetra::Vector D_
458  for (LO jj = 0; jj < blockMatSize; ++jj)
459  diagValues[jj] = this->Rthresh_ * InV[blockOffset+jj] + IFPACK2_SGN(InV[blockOffset+jj]) * this->Athresh_;
460  D_block_->replaceLocalValues(local_row, &InI[j], diagValues.getRawPtr(), 1);
461  }
462  else if (k < 0) { // Out of range
464  true, std::runtime_error, "Ifpack2::RILUK::initAllValues: current "
465  "GID k = " << k << " < 0. I'm not sure why this is an error; it is "
466  "probably an artifact of the undocumented assumptions of the "
467  "original implementation (likely copied and pasted from Ifpack). "
468  "Nevertheless, the code I found here insisted on this being an error "
469  "state, so I will throw an exception here.");
470  }
471  else if (k < local_row) {
472  LI[NumL] = k;
473  const LO LBlockOffset = NumL*blockMatSize;
474  for (LO jj = 0; jj < blockMatSize; ++jj)
475  LV[LBlockOffset+jj] = InV[blockOffset+jj];
476  NumL++;
477  }
478  else if (Teuchos::as<size_t>(k) <= rowMap->getLocalNumElements()) {
479  UI[NumU] = k;
480  const LO UBlockOffset = NumU*blockMatSize;
481  for (LO jj = 0; jj < blockMatSize; ++jj)
482  UV[UBlockOffset+jj] = InV[blockOffset+jj];
483  NumU++;
484  }
485  }
486 
487  // Check in things for this row of L and U
488 
489  if (DiagFound) {
490  ++NumNonzeroDiags;
491  } else
492  {
493  for (LO jj = 0; jj < blockSize_; ++jj)
494  diagValues[jj*(blockSize_+1)] = this->Athresh_;
495  D_block_->replaceLocalValues(local_row, &local_row, diagValues.getRawPtr(), 1);
496  }
497 
498  if (NumL) {
499  L_block_->replaceLocalValues(local_row, &LI[0], &LV[0], NumL);
500  }
501 
502  if (NumU) {
503  U_block_->replaceLocalValues(local_row, &UI[0], &UV[0], NumU);
504  }
505  }
506 
507  // NOTE (mfh 27 May 2016) Sync back to device, in case compute()
508  // ever gets a device implementation.
509  /*
510  {
511  typedef typename block_crs_matrix_type::device_type device_type;
512  const_cast<block_crs_matrix_type&> (A).template sync<device_type> ();
513  L_block_->template sync<device_type> ();
514  U_block_->template sync<device_type> ();
515  D_block_->template sync<device_type> ();
516  }
517  */
518  this->isInitialized_ = true;
519 }
520 
521 namespace { // (anonymous)
522 
523 // For a given Kokkos::View type, possibly unmanaged, get the
524 // corresponding managed Kokkos::View type. This is handy for
525 // translating from little_block_type or little_host_vec_type (both
526 // possibly unmanaged) to their managed versions.
527 template<class LittleBlockType>
528 struct GetManagedView {
529  static_assert (Kokkos::is_view<LittleBlockType>::value,
530  "LittleBlockType must be a Kokkos::View.");
531  typedef Kokkos::View<typename LittleBlockType::non_const_data_type,
532  typename LittleBlockType::array_layout,
533  typename LittleBlockType::device_type> managed_non_const_type;
534  static_assert (static_cast<int> (managed_non_const_type::rank) ==
535  static_cast<int> (LittleBlockType::rank),
536  "managed_non_const_type::rank != LittleBlock::rank. "
537  "Please report this bug to the Ifpack2 developers.");
538 };
539 
540 } // namespace (anonymous)
541 
542 template<class MatrixType>
544 {
545  using Teuchos::RCP;
546  using Teuchos::rcp;
547  using Teuchos::Array;
548 
549  typedef impl_scalar_type IST;
550  const char prefix[] = "Ifpack2::Experimental::RBILUK::compute: ";
551 
552  // initialize() checks this too, but it's easier for users if the
553  // error shows them the name of the method that they actually
554  // called, rather than the name of some internally called method.
556  (this->A_.is_null (), std::runtime_error, prefix << "The matrix (A_, "
557  "the BlockCrsMatrix) is null. Please call setMatrix() with a nonnull "
558  "input before calling this method.");
560  (! this->A_->isFillComplete (), std::runtime_error, prefix << "The matrix "
561  "(A_, the BlockCrsMatrix) is not fill complete. You may not invoke "
562  "initialize() or compute() with this matrix until the matrix is fill "
563  "complete. Note: BlockCrsMatrix is fill complete if and only if its "
564  "underlying graph is fill complete.");
565 
566  if (! this->isInitialized ()) {
567  initialize (); // Don't count this in the compute() time
568  }
569 
570  // // NOTE (mfh 27 May 2016) The factorization below occurs entirely on
571  // // host, so sync to host first. The const_cast is unfortunate but
572  // // is our only option to make this correct.
573  // if (! A_block_.is_null ()) {
574  // Teuchos::RCP<block_crs_matrix_type> A_nc =
575  // Teuchos::rcp_const_cast<block_crs_matrix_type> (A_block_);
576  // // A_nc->sync_host ();
577  // }
578  /*
579  L_block_->sync_host ();
580  U_block_->sync_host ();
581  D_block_->sync_host ();
582  // NOTE (mfh 27 May 2016) We're modifying L, U, and D on host.
583  L_block_->modify_host ();
584  U_block_->modify_host ();
585  D_block_->modify_host ();
586  */
587 
588  Teuchos::Time timer ("RBILUK::compute");
589  double startTime = timer.wallTime();
590  { // Start timing
591  Teuchos::TimeMonitor timeMon (timer);
592  this->isComputed_ = false;
593 
594  // MinMachNum should be officially defined, for now pick something a little
595  // bigger than IEEE underflow value
596 
597 // const scalar_type MinDiagonalValue = STS::rmin ();
598 // const scalar_type MaxDiagonalValue = STS::one () / MinDiagonalValue;
599  if (!this->isKokkosKernelsSpiluk_) {
600  initAllValues ();
601  size_t NumIn;
602  LO NumL, NumU, NumURead;
603 
604  // Get Maximum Row length
605  const size_t MaxNumEntries =
606  L_block_->getLocalMaxNumRowEntries () + U_block_->getLocalMaxNumRowEntries () + 1;
607 
608  const LO blockMatSize = blockSize_*blockSize_;
609 
610  // FIXME (mfh 08 Nov 2015, 24 May 2016) We need to move away from
611  // expressing these strides explicitly, in order to finish #177
612  // (complete Kokkos-ization of BlockCrsMatrix) thoroughly.
613  const LO rowStride = blockSize_;
614 
615  Teuchos::Array<int> ipiv_teuchos(blockSize_);
616  Kokkos::View<int*, Kokkos::HostSpace,
617  Kokkos::MemoryUnmanaged> ipiv (ipiv_teuchos.getRawPtr (), blockSize_);
618  Teuchos::Array<IST> work_teuchos(blockSize_);
619  Kokkos::View<IST*, Kokkos::HostSpace,
620  Kokkos::MemoryUnmanaged> work (work_teuchos.getRawPtr (), blockSize_);
621 
622  size_t num_cols = U_block_->getColMap()->getLocalNumElements();
623  Teuchos::Array<int> colflag(num_cols);
624 
625  typename GetManagedView<little_block_host_type>::managed_non_const_type diagModBlock ("diagModBlock", blockSize_, blockSize_);
626  typename GetManagedView<little_block_host_type>::managed_non_const_type matTmp ("matTmp", blockSize_, blockSize_);
627  typename GetManagedView<little_block_host_type>::managed_non_const_type multiplier ("multiplier", blockSize_, blockSize_);
628 
629 // Teuchos::ArrayRCP<scalar_type> DV = D_->get1dViewNonConst(); // Get view of diagonal
630 
631  // Now start the factorization.
632 
633  // Need some integer workspace and pointers
634  LO NumUU;
635  for (size_t j = 0; j < num_cols; ++j) {
636  colflag[j] = -1;
637  }
638  Teuchos::Array<LO> InI(MaxNumEntries, 0);
639  Teuchos::Array<scalar_type> InV(MaxNumEntries*blockMatSize,STM::zero());
640 
641  const LO numLocalRows = L_block_->getLocalNumRows ();
642  for (LO local_row = 0; local_row < numLocalRows; ++local_row) {
643 
644  // Fill InV, InI with current row of L, D and U combined
645 
646  NumIn = MaxNumEntries;
647  local_inds_host_view_type colValsL;
648  values_host_view_type valsL;
649 
650  L_block_->getLocalRowView(local_row, colValsL, valsL);
651  NumL = (LO) colValsL.size();
652  for (LO j = 0; j < NumL; ++j)
653  {
654  const LO matOffset = blockMatSize*j;
655  little_block_host_type lmat((typename little_block_host_type::value_type*) &valsL[matOffset],blockSize_,rowStride);
656  little_block_host_type lmatV((typename little_block_host_type::value_type*) &InV[matOffset],blockSize_,rowStride);
657  //lmatV.assign(lmat);
658  Tpetra::COPY (lmat, lmatV);
659  InI[j] = colValsL[j];
660  }
661 
662  little_block_host_type dmat = D_block_->getLocalBlockHostNonConst(local_row, local_row);
663  little_block_host_type dmatV((typename little_block_host_type::value_type*) &InV[NumL*blockMatSize], blockSize_, rowStride);
664  //dmatV.assign(dmat);
665  Tpetra::COPY (dmat, dmatV);
666  InI[NumL] = local_row;
667 
668  local_inds_host_view_type colValsU;
669  values_host_view_type valsU;
670  U_block_->getLocalRowView(local_row, colValsU, valsU);
671  NumURead = (LO) colValsU.size();
672  NumU = 0;
673  for (LO j = 0; j < NumURead; ++j)
674  {
675  if (!(colValsU[j] < numLocalRows)) continue;
676  InI[NumL+1+j] = colValsU[j];
677  const LO matOffset = blockMatSize*(NumL+1+j);
678  little_block_host_type umat((typename little_block_host_type::value_type*) &valsU[blockMatSize*j], blockSize_, rowStride);
679  little_block_host_type umatV((typename little_block_host_type::value_type*) &InV[matOffset], blockSize_, rowStride);
680  //umatV.assign(umat);
681  Tpetra::COPY (umat, umatV);
682  NumU += 1;
683  }
684  NumIn = NumL+NumU+1;
685 
686  // Set column flags
687  for (size_t j = 0; j < NumIn; ++j) {
688  colflag[InI[j]] = j;
689  }
690 
691 #ifndef IFPACK2_RBILUK_INITIAL
692  for (LO i = 0; i < blockSize_; ++i)
693  for (LO j = 0; j < blockSize_; ++j){
694  {
695  diagModBlock(i,j) = 0;
696  }
697  }
698 #else
699  scalar_type diagmod = STM::zero (); // Off-diagonal accumulator
700  Kokkos::deep_copy (diagModBlock, diagmod);
701 #endif
702 
703  for (LO jj = 0; jj < NumL; ++jj) {
704  LO j = InI[jj];
705  little_block_host_type currentVal((typename little_block_host_type::value_type*) &InV[jj*blockMatSize], blockSize_, rowStride); // current_mults++;
706  //multiplier.assign(currentVal);
707  Tpetra::COPY (currentVal, multiplier);
708 
709  const little_block_host_type dmatInverse = D_block_->getLocalBlockHostNonConst(j,j);
710  // alpha = 1, beta = 0
711 #ifndef IFPACK2_RBILUK_INITIAL_NOKK
712  KokkosBatched::SerialGemm
713  <KokkosBatched::Trans::NoTranspose,
714  KokkosBatched::Trans::NoTranspose,
715  KokkosBatched::Algo::Gemm::Blocked>::invoke
716  (STS::one (), currentVal, dmatInverse, STS::zero (), matTmp);
717 #else
718  Tpetra::GEMM ("N", "N", STS::one (), currentVal, dmatInverse,
719  STS::zero (), matTmp);
720 #endif
721  //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast<impl_scalar_type*> (currentVal.data ()), reinterpret_cast<impl_scalar_type*> (dmatInverse.data ()), reinterpret_cast<impl_scalar_type*> (matTmp.data ()), blockSize_);
722  //currentVal.assign(matTmp);
723  Tpetra::COPY (matTmp, currentVal);
724  local_inds_host_view_type UUI;
725  values_host_view_type UUV;
726 
727  U_block_->getLocalRowView(j, UUI, UUV);
728  NumUU = (LO) UUI.size();
729 
730  if (this->RelaxValue_ == STM::zero ()) {
731  for (LO k = 0; k < NumUU; ++k) {
732  if (!(UUI[k] < numLocalRows)) continue;
733  const int kk = colflag[UUI[k]];
734  if (kk > -1) {
735  little_block_host_type kkval((typename little_block_host_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride);
736  little_block_host_type uumat((typename little_block_host_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride);
737 #ifndef IFPACK2_RBILUK_INITIAL_NOKK
738  KokkosBatched::SerialGemm
739  <KokkosBatched::Trans::NoTranspose,
740  KokkosBatched::Trans::NoTranspose,
741  KokkosBatched::Algo::Gemm::Blocked>::invoke
742  ( magnitude_type(-STM::one ()), multiplier, uumat, STM::one (), kkval);
743 #else
744  Tpetra::GEMM ("N", "N", magnitude_type(-STM::one ()), multiplier, uumat,
745  STM::one (), kkval);
746 #endif
747  //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast<impl_scalar_type*> (multiplier.data ()), reinterpret_cast<impl_scalar_type*> (uumat.data ()), reinterpret_cast<impl_scalar_type*> (kkval.data ()), blockSize_, -STM::one(), STM::one());
748  }
749  }
750  }
751  else {
752  for (LO k = 0; k < NumUU; ++k) {
753  if (!(UUI[k] < numLocalRows)) continue;
754  const int kk = colflag[UUI[k]];
755  little_block_host_type uumat((typename little_block_host_type::value_type*) &UUV[k*blockMatSize], blockSize_, rowStride);
756  if (kk > -1) {
757  little_block_host_type kkval((typename little_block_host_type::value_type*) &InV[kk*blockMatSize], blockSize_, rowStride);
758 #ifndef IFPACK2_RBILUK_INITIAL_NOKK
759  KokkosBatched::SerialGemm
760  <KokkosBatched::Trans::NoTranspose,
761  KokkosBatched::Trans::NoTranspose,
762  KokkosBatched::Algo::Gemm::Blocked>::invoke
763  (magnitude_type(-STM::one ()), multiplier, uumat, STM::one (), kkval);
764 #else
765  Tpetra::GEMM ("N", "N", magnitude_type(-STM::one ()), multiplier, uumat,
766  STM::one (), kkval);
767 #endif
768  //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast<impl_scalar_type*>(multiplier.data ()), reinterpret_cast<impl_scalar_type*>(uumat.data ()), reinterpret_cast<impl_scalar_type*>(kkval.data ()), blockSize_, -STM::one(), STM::one());
769  }
770  else {
771 #ifndef IFPACK2_RBILUK_INITIAL_NOKK
772  KokkosBatched::SerialGemm
773  <KokkosBatched::Trans::NoTranspose,
774  KokkosBatched::Trans::NoTranspose,
775  KokkosBatched::Algo::Gemm::Blocked>::invoke
776  (magnitude_type(-STM::one ()), multiplier, uumat, STM::one (), diagModBlock);
777 #else
778  Tpetra::GEMM ("N", "N", magnitude_type(-STM::one ()), multiplier, uumat,
779  STM::one (), diagModBlock);
780 #endif
781  //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast<impl_scalar_type*>(multiplier.data ()), reinterpret_cast<impl_scalar_type*>(uumat.data ()), reinterpret_cast<impl_scalar_type*>(diagModBlock.data ()), blockSize_, -STM::one(), STM::one());
782  }
783  }
784  }
785  }
786  if (NumL) {
787  // Replace current row of L
788  L_block_->replaceLocalValues (local_row, InI.getRawPtr (), InV.getRawPtr (), NumL);
789  }
790 
791  // dmat.assign(dmatV);
792  Tpetra::COPY (dmatV, dmat);
793 
794  if (this->RelaxValue_ != STM::zero ()) {
795  //dmat.update(this->RelaxValue_, diagModBlock);
796  Tpetra::AXPY (this->RelaxValue_, diagModBlock, dmat);
797  }
798 
799 // if (STS::magnitude (DV[i]) > STS::magnitude (MaxDiagonalValue)) {
800 // if (STS::real (DV[i]) < STM::zero ()) {
801 // DV[i] = -MinDiagonalValue;
802 // }
803 // else {
804 // DV[i] = MinDiagonalValue;
805 // }
806 // }
807 // else
808  {
809  int lapackInfo = 0;
810  for (int k = 0; k < blockSize_; ++k) {
811  ipiv[k] = 0;
812  }
813 
814  Tpetra::GETF2 (dmat, ipiv, lapackInfo);
815  //lapack.GETRF(blockSize_, blockSize_, d_raw, blockSize_, ipiv.getRawPtr(), &lapackInfo);
817  lapackInfo != 0, std::runtime_error, "Ifpack2::Experimental::RBILUK::compute: "
818  "lapackInfo = " << lapackInfo << " which indicates an error in the factorization GETRF.");
819 
820  Tpetra::GETRI (dmat, ipiv, work, lapackInfo);
821  //lapack.GETRI(blockSize_, d_raw, blockSize_, ipiv.getRawPtr(), work.getRawPtr(), lwork, &lapackInfo);
823  lapackInfo != 0, std::runtime_error, "Ifpack2::Experimental::RBILUK::compute: "
824  "lapackInfo = " << lapackInfo << " which indicates an error in the matrix inverse GETRI.");
825  }
826 
827  for (LO j = 0; j < NumU; ++j) {
828  little_block_host_type currentVal((typename little_block_host_type::value_type*) &InV[(NumL+1+j)*blockMatSize], blockSize_, rowStride); // current_mults++;
829  // scale U by the diagonal inverse
830 #ifndef IFPACK2_RBILUK_INITIAL_NOKK
831  KokkosBatched::SerialGemm
832  <KokkosBatched::Trans::NoTranspose,
833  KokkosBatched::Trans::NoTranspose,
834  KokkosBatched::Algo::Gemm::Blocked>::invoke
835  (STS::one (), dmat, currentVal, STS::zero (), matTmp);
836 #else
837  Tpetra::GEMM ("N", "N", STS::one (), dmat, currentVal,
838  STS::zero (), matTmp);
839 #endif
840  //blockMatOpts.square_matrix_matrix_multiply(reinterpret_cast<impl_scalar_type*>(dmat.data ()), reinterpret_cast<impl_scalar_type*>(currentVal.data ()), reinterpret_cast<impl_scalar_type*>(matTmp.data ()), blockSize_);
841  //currentVal.assign(matTmp);
842  Tpetra::COPY (matTmp, currentVal);
843  }
844 
845  if (NumU) {
846  // Replace current row of L and U
847  U_block_->replaceLocalValues (local_row, &InI[NumL+1], &InV[blockMatSize*(NumL+1)], NumU);
848  }
849 
850 #ifndef IFPACK2_RBILUK_INITIAL
851  // Reset column flags
852  for (size_t j = 0; j < NumIn; ++j) {
853  colflag[InI[j]] = -1;
854  }
855 #else
856  // Reset column flags
857  for (size_t j = 0; j < num_cols; ++j) {
858  colflag[j] = -1;
859  }
860 #endif
861  }
862  } // !this->isKokkosKernelsSpiluk_
863  else {
864  RCP<const block_crs_matrix_type> A_local_bcrs = Details::getBcrsMatrix(this->A_local_);
865  RCP<const crs_matrix_type> A_local_crs = Details::getCrsMatrix(this->A_local_);
866  if (A_local_bcrs.is_null()) {
867  if (A_local_crs.is_null()) {
868  local_ordinal_type numRows = this->A_local_->getLocalNumRows();
869  Array<size_t> entriesPerRow(numRows);
870  for(local_ordinal_type i = 0; i < numRows; i++) {
871  entriesPerRow[i] = this->A_local_->getNumEntriesInLocalRow(i);
872  }
873  RCP<crs_matrix_type> A_local_crs_nc =
874  rcp (new crs_matrix_type (this->A_local_->getRowMap (),
875  this->A_local_->getColMap (),
876  entriesPerRow()));
877  // copy entries into A_local_crs
878  nonconst_local_inds_host_view_type indices("indices",this->A_local_->getLocalMaxNumRowEntries());
879  nonconst_values_host_view_type values("values",this->A_local_->getLocalMaxNumRowEntries());
880  for(local_ordinal_type i = 0; i < numRows; i++) {
881  size_t numEntries = 0;
882  this->A_local_->getLocalRowCopy(i, indices, values, numEntries);
883  A_local_crs_nc->insertLocalValues(i, numEntries, reinterpret_cast<scalar_type*>(values.data()),indices.data());
884  }
885  A_local_crs_nc->fillComplete (this->A_local_->getDomainMap (), this->A_local_->getRangeMap ());
886  A_local_crs = Teuchos::rcp_const_cast<const crs_matrix_type> (A_local_crs_nc);
887  }
888  // Create bcrs from crs
889  // We can skip fillLogicalBlocks if we know the input is already filled
890  if (blockSize_ > 1) {
891  auto crs_matrix_block_filled = Tpetra::fillLogicalBlocks(*A_local_crs, blockSize_);
892  A_local_bcrs = Tpetra::convertToBlockCrsMatrix(*crs_matrix_block_filled, blockSize_);
893  }
894  else {
895  A_local_bcrs = Tpetra::convertToBlockCrsMatrix(*A_local_crs, blockSize_);
896  }
897  }
898 
900  this->isKokkosKernelsStream_, std::runtime_error, "Ifpack2::RBILUK::compute: "
901  "streams are not yet supported.");
902 
903  auto lclMtx = A_local_bcrs->getLocalMatrixDevice();
904  auto A_local_rowmap = lclMtx.graph.row_map;
905  auto A_local_entries = lclMtx.graph.entries;
906  auto A_local_values = lclMtx.values;
907 
908  // L_block_->resumeFill ();
909  // U_block_->resumeFill ();
910 
911  if (L_block_->isLocallyIndexed ()) {
912  L_block_->setAllToScalar (STS::zero ()); // Zero out L and U matrices
913  U_block_->setAllToScalar (STS::zero ());
914  }
915 
916  using row_map_type = typename local_matrix_device_type::row_map_type;
917 
918  auto lclL = L_block_->getLocalMatrixDevice();
919  row_map_type L_rowmap = lclL.graph.row_map;
920  auto L_entries = lclL.graph.entries;
921  auto L_values = lclL.values;
922 
923  auto lclU = U_block_->getLocalMatrixDevice();
924  row_map_type U_rowmap = lclU.graph.row_map;
925  auto U_entries = lclU.graph.entries;
926  auto U_values = lclU.values;
927 
928  KokkosSparse::Experimental::spiluk_numeric( KernelHandle_.getRawPtr(), this->LevelOfFill_,
929  A_local_rowmap, A_local_entries, A_local_values,
930  L_rowmap, L_entries, L_values, U_rowmap, U_entries, U_values );
931 
932  // Now call symbolic for sptrsvs
933  KokkosSparse::Experimental::sptrsv_symbolic(L_Sptrsv_KernelHandle_.getRawPtr(), L_rowmap, L_entries, L_values);
934  KokkosSparse::Experimental::sptrsv_symbolic(U_Sptrsv_KernelHandle_.getRawPtr(), U_rowmap, U_entries, U_values);
935  }
936  } // Stop timing
937 
938  // Sync everything back to device, for efficient solves.
939  /*
940  {
941  typedef typename block_crs_matrix_type::device_type device_type;
942  if (! A_block_.is_null ()) {
943  Teuchos::RCP<block_crs_matrix_type> A_nc =
944  Teuchos::rcp_const_cast<block_crs_matrix_type> (A_block_);
945  A_nc->template sync<device_type> ();
946  }
947  L_block_->template sync<device_type> ();
948  U_block_->template sync<device_type> ();
949  D_block_->template sync<device_type> ();
950  }
951  */
952 
953  this->isComputed_ = true;
954  this->numCompute_ += 1;
955  this->computeTime_ += (timer.wallTime() - startTime);
956 }
957 
958 
959 template<class MatrixType>
960 void
962 apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
963  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y,
964  Teuchos::ETransp mode,
965  scalar_type alpha,
966  scalar_type beta) const
967 {
968  using Teuchos::RCP;
969  typedef Tpetra::BlockMultiVector<scalar_type,
971 
973  this->A_.is_null (), std::runtime_error, "Ifpack2::Experimental::RBILUK::apply: The matrix is "
974  "null. Please call setMatrix() with a nonnull input, then initialize() "
975  "and compute(), before calling this method.");
977  ! this->isComputed (), std::runtime_error,
978  "Ifpack2::Experimental::RBILUK::apply: If you have not yet called compute(), "
979  "you must call compute() before calling this method.");
980  TEUCHOS_TEST_FOR_EXCEPTION(
981  X.getNumVectors () != Y.getNumVectors (), std::invalid_argument,
982  "Ifpack2::Experimental::RBILUK::apply: X and Y do not have the same number of columns. "
983  "X.getNumVectors() = " << X.getNumVectors ()
984  << " != Y.getNumVectors() = " << Y.getNumVectors () << ".");
985  TEUCHOS_TEST_FOR_EXCEPTION(
986  STS::isComplex && mode == Teuchos::CONJ_TRANS, std::logic_error,
987  "Ifpack2::Experimental::RBILUK::apply: mode = Teuchos::CONJ_TRANS is not implemented for "
988  "complex Scalar type. Please talk to the Ifpack2 developers to get this "
989  "fixed. There is a FIXME in this file about this very issue.");
990 
991  const LO blockMatSize = blockSize_*blockSize_;
992 
993  const LO rowStride = blockSize_;
994 
995  BMV yBlock (Y, * (this->Graph_->getA_Graph()->getDomainMap ()), blockSize_);
996  const BMV xBlock (X, * (this->A_->getColMap ()), blockSize_);
997 
998  Teuchos::Array<scalar_type> lclarray(blockSize_);
999  little_host_vec_type lclvec((typename little_host_vec_type::value_type*)&lclarray[0], blockSize_);
1000  const scalar_type one = STM::one ();
1001  const scalar_type zero = STM::zero ();
1002 
1003  Teuchos::Time timer ("RBILUK::apply");
1004  double startTime = timer.wallTime();
1005  { // Start timing
1006  Teuchos::TimeMonitor timeMon (timer);
1007  if (!this->isKokkosKernelsSpiluk_) {
1008  if (alpha == one && beta == zero) {
1009  if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y.
1010  // Start by solving L C = X for C. C must have the same Map
1011  // as D. We have to use a temp multivector, since our
1012  // implementation of triangular solves does not allow its
1013  // input and output to alias one another.
1014  //
1015  // FIXME (mfh 24 Jan 2014) Cache this temp multivector.
1016  const LO numVectors = xBlock.getNumVectors();
1017  BMV cBlock (* (this->Graph_->getA_Graph()->getDomainMap ()), blockSize_, numVectors);
1018  BMV rBlock (* (this->Graph_->getA_Graph()->getDomainMap ()), blockSize_, numVectors);
1019  for (LO imv = 0; imv < numVectors; ++imv)
1020  {
1021  for (size_t i = 0; i < D_block_->getLocalNumRows(); ++i)
1022  {
1023  LO local_row = i;
1024  const_host_little_vec_type xval =
1025  xBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::ReadOnly);
1026  little_host_vec_type cval =
1027  cBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::OverwriteAll);
1028  //cval.assign(xval);
1029  Tpetra::COPY (xval, cval);
1030 
1031  local_inds_host_view_type colValsL;
1032  values_host_view_type valsL;
1033  L_block_->getLocalRowView(local_row, colValsL, valsL);
1034  LO NumL = (LO) colValsL.size();
1035 
1036  for (LO j = 0; j < NumL; ++j)
1037  {
1038  LO col = colValsL[j];
1039  const_host_little_vec_type prevVal =
1040  cBlock.getLocalBlockHost(col, imv, Tpetra::Access::ReadOnly);
1041 
1042  const LO matOffset = blockMatSize*j;
1043  little_block_host_type lij((typename little_block_host_type::value_type*) &valsL[matOffset],blockSize_,rowStride);
1044 
1045  //cval.matvecUpdate(-one, lij, prevVal);
1046  Tpetra::GEMV (-one, lij, prevVal, cval);
1047  }
1048  }
1049  }
1050 
1051  // Solve D R = C. Note that D has been replaced by D^{-1} at this point.
1052  D_block_->applyBlock(cBlock, rBlock);
1053 
1054  // Solve U Y = R.
1055  for (LO imv = 0; imv < numVectors; ++imv)
1056  {
1057  const LO numRows = D_block_->getLocalNumRows();
1058  for (LO i = 0; i < numRows; ++i)
1059  {
1060  LO local_row = (numRows-1)-i;
1061  const_host_little_vec_type rval =
1062  rBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::ReadOnly);
1063  little_host_vec_type yval =
1064  yBlock.getLocalBlockHost(local_row, imv, Tpetra::Access::OverwriteAll);
1065  //yval.assign(rval);
1066  Tpetra::COPY (rval, yval);
1067 
1068  local_inds_host_view_type colValsU;
1069  values_host_view_type valsU;
1070  U_block_->getLocalRowView(local_row, colValsU, valsU);
1071  LO NumU = (LO) colValsU.size();
1072 
1073  for (LO j = 0; j < NumU; ++j)
1074  {
1075  LO col = colValsU[NumU-1-j];
1076  const_host_little_vec_type prevVal =
1077  yBlock.getLocalBlockHost(col, imv, Tpetra::Access::ReadOnly);
1078 
1079  const LO matOffset = blockMatSize*(NumU-1-j);
1080  little_block_host_type uij((typename little_block_host_type::value_type*) &valsU[matOffset], blockSize_, rowStride);
1081 
1082  //yval.matvecUpdate(-one, uij, prevVal);
1083  Tpetra::GEMV (-one, uij, prevVal, yval);
1084  }
1085  }
1086  }
1087  }
1088  else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T).
1089  TEUCHOS_TEST_FOR_EXCEPTION(
1090  true, std::runtime_error,
1091  "Ifpack2::Experimental::RBILUK::apply: transpose apply is not implemented for the block algorithm");
1092  }
1093  }
1094  else { // alpha != 1 or beta != 0
1095  if (alpha == zero) {
1096  if (beta == zero) {
1097  Y.putScalar (zero);
1098  } else {
1099  Y.scale (beta);
1100  }
1101  } else { // alpha != zero
1102  MV Y_tmp (Y.getMap (), Y.getNumVectors ());
1103  apply (X, Y_tmp, mode);
1104  Y.update (alpha, Y_tmp, beta);
1105  }
1106  }
1107  }
1108  else {
1109  // Kokkos kernels impl.
1110  auto X_views = X.getLocalViewDevice(Tpetra::Access::ReadOnly);
1111  auto Y_views = Y.getLocalViewDevice(Tpetra::Access::ReadWrite);
1112 
1113  auto lclL = L_block_->getLocalMatrixDevice();
1114  auto L_rowmap = lclL.graph.row_map;
1115  auto L_entries = lclL.graph.entries;
1116  auto L_values = lclL.values;
1117 
1118  auto lclU = U_block_->getLocalMatrixDevice();
1119  auto U_rowmap = lclU.graph.row_map;
1120  auto U_entries = lclU.graph.entries;
1121  auto U_values = lclU.values;
1122 
1123  if (mode == Teuchos::NO_TRANS) {
1124  {
1125  const LO numVecs = X.getNumVectors();
1126  for (LO vec = 0; vec < numVecs; ++vec) {
1127  auto X_view = Kokkos::subview(X_views, Kokkos::ALL(), vec);
1128  auto Y_view = Kokkos::subview(Y_views, Kokkos::ALL(), vec);
1129  KokkosSparse::Experimental::sptrsv_solve(L_Sptrsv_KernelHandle_.getRawPtr(), L_rowmap, L_entries, L_values, X_view, tmp_);
1130  }
1131  }
1132 
1133  {
1134  const LO numVecs = X.getNumVectors();
1135  for (LO vec = 0; vec < numVecs; ++vec) {
1136  auto Y_view = Kokkos::subview(Y_views, Kokkos::ALL(), vec);
1137  KokkosSparse::Experimental::sptrsv_solve(U_Sptrsv_KernelHandle_.getRawPtr(), U_rowmap, U_entries, U_values, tmp_, Y_view);
1138  }
1139  }
1140 
1141  KokkosBlas::axpby(alpha, Y_views, beta, Y_views);
1142  }
1143  else {
1144  TEUCHOS_TEST_FOR_EXCEPTION(
1145  true, std::runtime_error,
1146  "Ifpack2::Experimental::RBILUK::apply: transpose apply is not implemented for the block algorithm");
1147  }
1148 
1149  //Y.getWrappedDualView().sync();
1150  }
1151  } // Stop timing
1152 
1153  this->numApply_ += 1;
1154  this->applyTime_ += (timer.wallTime() - startTime);
1155 }
1156 
1157 
1158 template<class MatrixType>
1160 {
1161  std::ostringstream os;
1162 
1163  // Output is a valid YAML dictionary in flow style. If you don't
1164  // like everything on a single line, you should call describe()
1165  // instead.
1166  os << "\"Ifpack2::Experimental::RBILUK\": {";
1167  os << "Initialized: " << (this->isInitialized () ? "true" : "false") << ", "
1168  << "Computed: " << (this->isComputed () ? "true" : "false") << ", ";
1169 
1170  os << "Level-of-fill: " << this->getLevelOfFill() << ", ";
1171 
1172  if (this->A_.is_null ()) {
1173  os << "Matrix: null";
1174  }
1175  else {
1176  os << "Global matrix dimensions: ["
1177  << this->A_->getGlobalNumRows () << ", " << this->A_->getGlobalNumCols () << "]"
1178  << ", Global nnz: " << this->A_->getGlobalNumEntries();
1179  }
1180 
1181  os << "}";
1182  return os.str ();
1183 }
1184 
1185 } // namespace Experimental
1186 
1187 } // namespace Ifpack2
1188 
1189 // FIXME (mfh 26 Aug 2015) We only need to do instantiation for
1190 // MatrixType = Tpetra::RowMatrix. Conversions to BlockCrsMatrix are
1191 // handled internally via dynamic cast.
1192 
1193 #define IFPACK2_EXPERIMENTAL_RBILUK_INSTANT(S,LO,GO,N) \
1194  template class Ifpack2::Experimental::RBILUK< Tpetra::BlockCrsMatrix<S, LO, GO, N> >; \
1195  template class Ifpack2::Experimental::RBILUK< Tpetra::RowMatrix<S, LO, GO, N> >;
1196 
1197 #endif
void initialize()
Initialize by computing the symbolic incomplete factorization.
Definition: Ifpack2_Experimental_RBILUK_def.hpp:286
MatrixType::global_ordinal_type global_ordinal_type
The type of global indices in the input MatrixType.
Definition: Ifpack2_Experimental_RBILUK_decl.hpp:113
virtual ~RBILUK()
Destructor (declared virtual for memory safety).
Definition: Ifpack2_Experimental_RBILUK_def.hpp:103
static Teuchos::RCP< const row_matrix_type > makeLocalFilter(const Teuchos::RCP< const row_matrix_type > &A)
Return A, wrapped in a LocalFilter, if necessary.
const block_crs_matrix_type & getUBlock() const
Return the U factor of the ILU factorization.
Definition: Ifpack2_Experimental_RBILUK_def.hpp:160
MatrixType::scalar_type scalar_type
The type of the entries of the input MatrixType.
Definition: Ifpack2_Experimental_RBILUK_decl.hpp:103
void apply(const Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &X, Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, scalar_type alpha=Teuchos::ScalarTraits< scalar_type >::one(), scalar_type beta=Teuchos::ScalarTraits< scalar_type >::zero()) const
Apply the (inverse of the) incomplete factorization to X, resulting in Y.
Definition: Ifpack2_Experimental_RBILUK_def.hpp:962
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
size_type size() const
Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type
The type of the magnitude (absolute value) of a matrix entry.
Definition: Ifpack2_Experimental_RBILUK_decl.hpp:120
ILU(k) factorization of a given Tpetra::RowMatrix.
Definition: Ifpack2_RILUK_decl.hpp:213
MatrixType::node_type node_type
The Node type used by the input MatrixType.
Definition: Ifpack2_Experimental_RBILUK_decl.hpp:117
Tpetra::CrsMatrix< scalar_type, local_ordinal_type, global_ordinal_type, node_type > crs_matrix_type
Tpetra::CrsMatrix specialization used by this class for representing L and U.
Definition: Ifpack2_Experimental_RBILUK_decl.hpp:132
MatrixType::local_ordinal_type local_ordinal_type
The type of local indices in the input MatrixType.
Definition: Ifpack2_Experimental_RBILUK_decl.hpp:109
T * getRawPtr() const
const block_crs_matrix_type & getDBlock() const
Return the diagonal entries of the ILU factorization.
Definition: Ifpack2_Experimental_RBILUK_def.hpp:146
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
void compute()
Compute the (numeric) incomplete factorization.
Definition: Ifpack2_Experimental_RBILUK_def.hpp:543
File for utility functions.
Construct a level filled graph for use in computing an ILU(k) incomplete factorization.
Definition: Ifpack2_IlukGraph.hpp:67
void setMatrix(const Teuchos::RCP< const block_crs_matrix_type > &A)
Change the matrix to be preconditioned.
Definition: Ifpack2_Experimental_RBILUK_def.hpp:108
const block_crs_matrix_type & getLBlock() const
Return the L factor of the ILU factorization.
Definition: Ifpack2_Experimental_RBILUK_def.hpp:132
size_type size() const
static double wallTime()
Tpetra::RowMatrix< scalar_type, local_ordinal_type, global_ordinal_type, node_type > row_matrix_type
Tpetra::RowMatrix specialization used by this class.
Definition: Ifpack2_Experimental_RBILUK_decl.hpp:126
std::string description() const
A one-line description of this object.
Definition: Ifpack2_Experimental_RBILUK_def.hpp:1159
ILU(k) factorization of a given Tpetra::BlockCrsMatrix.
Definition: Ifpack2_Experimental_RBILUK_decl.hpp:95
bool is_null() const