Ifpack2 Templated Preconditioning Package  Version 1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Ifpack2_RILUK_def.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_CRSRILUK_DEF_HPP
11 #define IFPACK2_CRSRILUK_DEF_HPP
12 
13 #include "Ifpack2_RILUK_decl.hpp"
14 #include "Ifpack2_LocalFilter.hpp"
15 #include "Tpetra_CrsMatrix.hpp"
16 #include "Teuchos_StandardParameterEntryValidators.hpp"
17 #include "Ifpack2_LocalSparseTriangularSolver.hpp"
18 #include "Ifpack2_Details_getParamTryingTypes.hpp"
19 #include "Ifpack2_Details_getCrsMatrix.hpp"
20 #include "Kokkos_Sort.hpp"
21 #include "KokkosSparse_Utils.hpp"
22 #include "KokkosKernels_Sorting.hpp"
23 #include "KokkosSparse_IOUtils.hpp"
24 
25 namespace Ifpack2 {
26 
27 namespace Details {
28 struct IlukImplType {
29  enum Enum {
30  Serial,
31  KSPILUK
32  };
33 
34  static void loadPLTypeOption(Teuchos::Array<std::string>& type_strs, Teuchos::Array<Enum>& type_enums) {
35  type_strs.resize(2);
36  type_strs[0] = "Serial";
37  type_strs[1] = "KSPILUK";
38  type_enums.resize(2);
39  type_enums[0] = Serial;
40  type_enums[1] = KSPILUK;
41  }
42 };
43 } // namespace Details
44 
45 template <class MatrixType>
47  : A_(Matrix_in)
48  , LevelOfFill_(0)
49  , Overalloc_(2.)
50  , isAllocated_(false)
51  , isInitialized_(false)
52  , isComputed_(false)
53  , numInitialize_(0)
54  , numCompute_(0)
55  , numApply_(0)
56  , initializeTime_(0.0)
57  , computeTime_(0.0)
58  , applyTime_(0.0)
59  , RelaxValue_(Teuchos::ScalarTraits<magnitude_type>::zero())
60  , Athresh_(Teuchos::ScalarTraits<magnitude_type>::zero())
61  , Rthresh_(Teuchos::ScalarTraits<magnitude_type>::one())
62  , isKokkosKernelsSpiluk_(false)
63  , isKokkosKernelsStream_(false)
64  , num_streams_(0)
65  , hasStreamReordered_(false) {
66  allocateSolvers();
67 }
68 
69 template <class MatrixType>
71  : A_(Matrix_in)
72  , LevelOfFill_(0)
73  , Overalloc_(2.)
74  , isAllocated_(false)
75  , isInitialized_(false)
76  , isComputed_(false)
77  , numInitialize_(0)
78  , numCompute_(0)
79  , numApply_(0)
80  , initializeTime_(0.0)
81  , computeTime_(0.0)
82  , applyTime_(0.0)
83  , RelaxValue_(Teuchos::ScalarTraits<magnitude_type>::zero())
84  , Athresh_(Teuchos::ScalarTraits<magnitude_type>::zero())
85  , Rthresh_(Teuchos::ScalarTraits<magnitude_type>::one())
86  , isKokkosKernelsSpiluk_(false)
87  , isKokkosKernelsStream_(false)
88  , num_streams_(0)
89  , hasStreamReordered_(false) {
90  allocateSolvers();
91 }
92 
93 template <class MatrixType>
95  if (!isKokkosKernelsStream_) {
96  if (Teuchos::nonnull(KernelHandle_)) {
97  KernelHandle_->destroy_spiluk_handle();
98  }
99  } else {
100  for (size_t i = 0; i < KernelHandle_v_.size(); i++) {
101  if (Teuchos::nonnull(KernelHandle_v_[i])) {
102  KernelHandle_v_[i]->destroy_spiluk_handle();
103  }
104  }
105  }
106 }
107 
108 template <class MatrixType>
111  L_solver_->setObjectLabel("lower");
113  U_solver_->setObjectLabel("upper");
114 }
115 
116 template <class MatrixType>
118  // It's legal for A to be null; in that case, you may not call
119  // initialize() until calling setMatrix() with a nonnull input.
120  // Regardless, setting the matrix invalidates any previous
121  // factorization.
122  if (A.getRawPtr() != A_.getRawPtr()) {
123  isAllocated_ = false;
124  isInitialized_ = false;
125  isComputed_ = false;
126  A_local_ = Teuchos::null;
127  Graph_ = Teuchos::null;
128 
129  // The sparse triangular solvers get a triangular factor as their
130  // input matrix. The triangular factors L_ and U_ are getting
131  // reset, so we reset the solvers' matrices to null. Do that
132  // before setting L_ and U_ to null, so that latter step actually
133  // frees the factors.
134  if (!L_solver_.is_null()) {
135  L_solver_->setMatrix(Teuchos::null);
136  }
137  if (!U_solver_.is_null()) {
138  U_solver_->setMatrix(Teuchos::null);
139  }
140 
141  L_ = Teuchos::null;
142  U_ = Teuchos::null;
143  D_ = Teuchos::null;
144  A_ = A;
145  }
146 }
147 
148 template <class MatrixType>
152  L_.is_null(), std::runtime_error,
153  "Ifpack2::RILUK::getL: The L factor "
154  "is null. Please call initialize() (and preferably also compute()) "
155  "before calling this method. If the input matrix has not yet been set, "
156  "you must first call setMatrix() with a nonnull input matrix before you "
157  "may call initialize() or compute().");
158  return *L_;
159 }
160 
161 template <class MatrixType>
162 const Tpetra::Vector<typename RILUK<MatrixType>::scalar_type,
168  D_.is_null(), std::runtime_error,
169  "Ifpack2::RILUK::getD: The D factor "
170  "(of diagonal entries) is null. Please call initialize() (and "
171  "preferably also compute()) before calling this method. If the input "
172  "matrix has not yet been set, you must first call setMatrix() with a "
173  "nonnull input matrix before you may call initialize() or compute().");
174  return *D_;
175 }
176 
177 template <class MatrixType>
181  U_.is_null(), std::runtime_error,
182  "Ifpack2::RILUK::getU: The U factor "
183  "is null. Please call initialize() (and preferably also compute()) "
184  "before calling this method. If the input matrix has not yet been set, "
185  "you must first call setMatrix() with a nonnull input matrix before you "
186  "may call initialize() or compute().");
187  return *U_;
188 }
189 
190 template <class MatrixType>
193  A_.is_null(), std::runtime_error,
194  "Ifpack2::RILUK::getNodeSmootherComplexity: "
195  "The input matrix A is null. Please call setMatrix() with a nonnull "
196  "input matrix, then call compute(), before calling this method.");
197  // RILUK methods cost roughly one apply + the nnz in the upper+lower triangles
198  if (!L_.is_null() && !U_.is_null())
199  return A_->getLocalNumEntries() + L_->getLocalNumEntries() + U_->getLocalNumEntries();
200  else
201  return 0;
202 }
203 
204 template <class MatrixType>
207  typename RILUK<MatrixType>::node_type> >
210  A_.is_null(), std::runtime_error,
211  "Ifpack2::RILUK::getDomainMap: "
212  "The matrix is null. Please call setMatrix() with a nonnull input "
213  "before calling this method.");
214 
215  // FIXME (mfh 25 Jan 2014) Shouldn't this just come from A_?
217  Graph_.is_null(), std::runtime_error,
218  "Ifpack2::RILUK::getDomainMap: "
219  "The computed graph is null. Please call initialize() before calling "
220  "this method.");
221  return Graph_->getL_Graph()->getDomainMap();
222 }
223 
224 template <class MatrixType>
227  typename RILUK<MatrixType>::node_type> >
230  A_.is_null(), std::runtime_error,
231  "Ifpack2::RILUK::getRangeMap: "
232  "The matrix is null. Please call setMatrix() with a nonnull input "
233  "before calling this method.");
234 
235  // FIXME (mfh 25 Jan 2014) Shouldn't this just come from A_?
237  Graph_.is_null(), std::runtime_error,
238  "Ifpack2::RILUK::getRangeMap: "
239  "The computed graph is null. Please call initialize() before calling "
240  "this method.");
241  return Graph_->getL_Graph()->getRangeMap();
242 }
243 
244 template <class MatrixType>
246  using Teuchos::null;
247  using Teuchos::rcp;
248 
249  if (!isAllocated_) {
250  if (!isKokkosKernelsStream_) {
251  // Deallocate any existing storage. This avoids storing 2x
252  // memory, since RCP op= does not deallocate until after the
253  // assignment.
254  L_ = null;
255  U_ = null;
256  D_ = null;
257 
258  // Allocate Matrix using ILUK graphs
259  L_ = rcp(new crs_matrix_type(Graph_->getL_Graph()));
260  U_ = rcp(new crs_matrix_type(Graph_->getU_Graph()));
261  L_->setAllToScalar(STS::zero()); // Zero out L and U matrices
262  U_->setAllToScalar(STS::zero());
263 
264  // FIXME (mfh 24 Jan 2014) This assumes domain == range Map for L and U.
265  L_->fillComplete();
266  U_->fillComplete();
267  D_ = rcp(new vec_type(Graph_->getL_Graph()->getRowMap()));
268  } else {
269  L_v_ = std::vector<Teuchos::RCP<crs_matrix_type> >(num_streams_, null);
270  U_v_ = std::vector<Teuchos::RCP<crs_matrix_type> >(num_streams_, null);
271  for (int i = 0; i < num_streams_; i++) {
272  L_v_[i] = rcp(new crs_matrix_type(Graph_v_[i]->getL_Graph()));
273  U_v_[i] = rcp(new crs_matrix_type(Graph_v_[i]->getU_Graph()));
274  L_v_[i]->setAllToScalar(STS::zero()); // Zero out L and U matrices
275  U_v_[i]->setAllToScalar(STS::zero());
276 
277  L_v_[i]->fillComplete();
278  U_v_[i]->fillComplete();
279  }
280  }
281  }
282  isAllocated_ = true;
283 }
284 
285 template <class MatrixType>
288  using Details::getParamTryingTypes;
289  using Teuchos::Array;
291  using Teuchos::RCP;
292  const char prefix[] = "Ifpack2::RILUK: ";
293 
294  // Default values of the various parameters.
295  int fillLevel = 0;
296  magnitude_type absThresh = STM::zero();
297  magnitude_type relThresh = STM::one();
298  magnitude_type relaxValue = STM::zero();
299  double overalloc = 2.;
300  int nstreams = 0;
301 
302  // "fact: iluk level-of-fill" parsing is more complicated, because
303  // we want to allow as many types as make sense. int is the native
304  // type, but we also want to accept double (for backwards
305  // compatibilty with ILUT). You can't cast arbitrary magnitude_type
306  // (e.g., Sacado::MP::Vector) to int, so we use float instead, to
307  // get coverage of the most common magnitude_type cases. Weirdly,
308  // there's an Ifpack2 test that sets the fill level as a
309  // global_ordinal_type.
310  {
311  const std::string paramName("fact: iluk level-of-fill");
312  getParamTryingTypes<int, int, global_ordinal_type, double, float>(fillLevel, params, paramName, prefix);
313  }
314  // For the other parameters, we prefer magnitude_type, but allow
315  // double for backwards compatibility.
316  {
317  const std::string paramName("fact: absolute threshold");
318  getParamTryingTypes<magnitude_type, magnitude_type, double>(absThresh, params, paramName, prefix);
319  }
320  {
321  const std::string paramName("fact: relative threshold");
322  getParamTryingTypes<magnitude_type, magnitude_type, double>(relThresh, params, paramName, prefix);
323  }
324  {
325  const std::string paramName("fact: relax value");
326  getParamTryingTypes<magnitude_type, magnitude_type, double>(relaxValue, params, paramName, prefix);
327  }
328  {
329  const std::string paramName("fact: iluk overalloc");
330  getParamTryingTypes<double, double>(overalloc, params, paramName, prefix);
331  }
332 
333  // Parsing implementation type
334  Details::IlukImplType::Enum ilukimplType = Details::IlukImplType::Serial;
335  do {
336  static const char typeName[] = "fact: type";
337 
338  if (!params.isType<std::string>(typeName)) break;
339 
340  // Map std::string <-> IlukImplType::Enum.
341  Array<std::string> ilukimplTypeStrs;
342  Array<Details::IlukImplType::Enum> ilukimplTypeEnums;
343  Details::IlukImplType::loadPLTypeOption(ilukimplTypeStrs, ilukimplTypeEnums);
345  s2i(ilukimplTypeStrs(), ilukimplTypeEnums(), typeName, false);
346 
347  ilukimplType = s2i.getIntegralValue(params.get<std::string>(typeName));
348  } while (0);
349 
350  if (ilukimplType == Details::IlukImplType::KSPILUK) {
351  this->isKokkosKernelsSpiluk_ = true;
352  } else {
353  this->isKokkosKernelsSpiluk_ = false;
354  }
355 
356  {
357  const std::string paramName("fact: kspiluk number-of-streams");
358  getParamTryingTypes<int, int, global_ordinal_type>(nstreams, params, paramName, prefix);
359  }
360 
361  // Forward to trisolvers.
362  L_solver_->setParameters(params);
363  U_solver_->setParameters(params);
364 
365  // "Commit" the values only after validating all of them. This
366  // ensures that there are no side effects if this routine throws an
367  // exception.
368 
369  LevelOfFill_ = fillLevel;
370  Overalloc_ = overalloc;
371 #ifdef KOKKOS_ENABLE_OPENMP
372  if constexpr (std::is_same_v<execution_space, Kokkos::OpenMP>) {
373  nstreams = std::min(nstreams, execution_space{}.concurrency());
374  }
375 #endif
376  num_streams_ = nstreams;
377 
378  if (num_streams_ >= 1) {
379  this->isKokkosKernelsStream_ = true;
380  // Will we do reordering in streams?
381  if (params.isParameter("fact: kspiluk reordering in streams"))
382  hasStreamReordered_ = params.get<bool>("fact: kspiluk reordering in streams");
383  } else {
384  this->isKokkosKernelsStream_ = false;
385  }
386 
387  // mfh 28 Nov 2012: The previous code would not assign Athresh_,
388  // Rthresh_, or RelaxValue_, if the read-in value was -1. I don't
389  // know if keeping this behavior is correct, but I'll keep it just
390  // so as not to change previous behavior.
391 
392  if (absThresh != -STM::one()) {
393  Athresh_ = absThresh;
394  }
395  if (relThresh != -STM::one()) {
396  Rthresh_ = relThresh;
397  }
398  if (relaxValue != -STM::one()) {
399  RelaxValue_ = relaxValue;
400  }
401 }
402 
403 template <class MatrixType>
406  return Teuchos::rcp_implicit_cast<const row_matrix_type>(A_);
407 }
408 
409 template <class MatrixType>
412  return Teuchos::rcp_dynamic_cast<const crs_matrix_type>(A_, true);
413 }
414 
415 template <class MatrixType>
418  using Teuchos::RCP;
419  using Teuchos::rcp;
420  using Teuchos::rcp_dynamic_cast;
421  using Teuchos::rcp_implicit_cast;
422 
423  // If A_'s communicator only has one process, or if its column and
424  // row Maps are the same, then it is already local, so use it
425  // directly.
426  if (A->getRowMap()->getComm()->getSize() == 1 ||
427  A->getRowMap()->isSameAs(*(A->getColMap()))) {
428  return A;
429  }
430 
431  // If A_ is already a LocalFilter, then use it directly. This
432  // should be the case if RILUK is being used through
433  // AdditiveSchwarz, for example.
434  RCP<const LocalFilter<row_matrix_type> > A_lf_r =
435  rcp_dynamic_cast<const LocalFilter<row_matrix_type> >(A);
436  if (!A_lf_r.is_null()) {
437  return rcp_implicit_cast<const row_matrix_type>(A_lf_r);
438  } else {
439  // A_'s communicator has more than one process, its row Map and
440  // its column Map differ, and A_ is not a LocalFilter. Thus, we
441  // have to wrap it in a LocalFilter.
442  return rcp(new LocalFilter<row_matrix_type>(A));
443  }
444 }
445 
446 template <class MatrixType>
448  using Teuchos::Array;
449  using Teuchos::ArrayView;
450  using Teuchos::RCP;
451  using Teuchos::rcp;
452  using Teuchos::rcp_const_cast;
453  using Teuchos::rcp_dynamic_cast;
454  using Teuchos::rcp_implicit_cast;
455  typedef Tpetra::CrsGraph<local_ordinal_type,
457  node_type>
458  crs_graph_type;
459  typedef Tpetra::Map<local_ordinal_type,
460  global_ordinal_type,
461  node_type>
462  crs_map_type;
463  const char prefix[] = "Ifpack2::RILUK::initialize: ";
464 
465  TEUCHOS_TEST_FOR_EXCEPTION(A_.is_null(), std::runtime_error, prefix << "The matrix is null. Please "
466  "call setMatrix() with a nonnull input before calling this method.");
467  TEUCHOS_TEST_FOR_EXCEPTION(!A_->isFillComplete(), std::runtime_error, prefix << "The matrix is not "
468  "fill complete. You may not invoke initialize() or compute() with this "
469  "matrix until the matrix is fill complete. If your matrix is a "
470  "Tpetra::CrsMatrix, please call fillComplete on it (with the domain and "
471  "range Maps, if appropriate) before calling this method.");
472 
473  Teuchos::Time timer("RILUK::initialize");
474  double startTime = timer.wallTime();
475  { // Start timing
476  Teuchos::TimeMonitor timeMon(timer);
477 
478  // Calling initialize() means that the user asserts that the graph
479  // of the sparse matrix may have changed. We must not just reuse
480  // the previous graph in that case.
481  //
482  // Regarding setting isAllocated_ to false: Eventually, we may want
483  // some kind of clever memory reuse strategy, but it's always
484  // correct just to blow everything away and start over.
485  isInitialized_ = false;
486  isAllocated_ = false;
487  isComputed_ = false;
488  Graph_ = Teuchos::null;
489  Y_tmp_ = nullptr;
490  reordered_x_ = nullptr;
491  reordered_y_ = nullptr;
492 
493  if (isKokkosKernelsStream_) {
494  Graph_v_ = std::vector<Teuchos::RCP<iluk_graph_type> >(num_streams_);
495  A_local_diagblks_v_ = std::vector<local_matrix_device_type>(num_streams_);
496  for (int i = 0; i < num_streams_; i++) {
497  Graph_v_[i] = Teuchos::null;
498  }
499  }
500 
501  A_local_ = makeLocalFilter(A_);
502 
504  A_local_.is_null(), std::logic_error,
505  "Ifpack2::RILUK::initialize: "
506  "makeLocalFilter returned null; it failed to compute A_local. "
507  "Please report this bug to the Ifpack2 developers.");
508 
509  // FIXME (mfh 24 Jan 2014, 26 Mar 2014) It would be more efficient
510  // to rewrite RILUK so that it works with any RowMatrix input, not
511  // just CrsMatrix. (That would require rewriting IlukGraph to
512  // handle a Tpetra::RowGraph.) However, to make it work for now,
513  // we just copy the input matrix if it's not a CrsMatrix.
514 
515  {
516  A_local_crs_ = Details::getCrsMatrix(A_local_);
517  if (A_local_crs_.is_null()) {
518  local_ordinal_type numRows = A_local_->getLocalNumRows();
519  Array<size_t> entriesPerRow(numRows);
520  for (local_ordinal_type i = 0; i < numRows; i++) {
521  entriesPerRow[i] = A_local_->getNumEntriesInLocalRow(i);
522  }
523  A_local_crs_nc_ =
524  rcp(new crs_matrix_type(A_local_->getRowMap(),
525  A_local_->getColMap(),
526  entriesPerRow()));
527  // copy entries into A_local_crs
528  nonconst_local_inds_host_view_type indices("indices", A_local_->getLocalMaxNumRowEntries());
529  nonconst_values_host_view_type values("values", A_local_->getLocalMaxNumRowEntries());
530  for (local_ordinal_type i = 0; i < numRows; i++) {
531  size_t numEntries = 0;
532  A_local_->getLocalRowCopy(i, indices, values, numEntries);
533  A_local_crs_nc_->insertLocalValues(i, numEntries, reinterpret_cast<scalar_type*>(values.data()), indices.data());
534  }
535  A_local_crs_nc_->fillComplete(A_local_->getDomainMap(), A_local_->getRangeMap());
536  A_local_crs_ = rcp_const_cast<const crs_matrix_type>(A_local_crs_nc_);
537  }
538  if (!isKokkosKernelsStream_) {
539  Graph_ = rcp(new Ifpack2::IlukGraph<crs_graph_type, kk_handle_type>(A_local_crs_->getCrsGraph(),
540  LevelOfFill_, 0, Overalloc_));
541  } else {
542  std::vector<int> weights(num_streams_);
543  std::fill(weights.begin(), weights.end(), 1);
544  exec_space_instances_ = Kokkos::Experimental::partition_space(execution_space(), weights);
545 
546  auto lclMtx = A_local_crs_->getLocalMatrixDevice();
547  if (!hasStreamReordered_) {
548  KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks_v_);
549  } else {
550  perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks_v_, true);
551  reverse_perm_v_.resize(perm_v_.size());
552  for (size_t istream = 0; istream < perm_v_.size(); ++istream) {
553  using perm_type = typename lno_nonzero_view_t::non_const_type;
554  const auto perm = perm_v_[istream];
555  const auto perm_length = perm.extent(0);
556  perm_type reverse_perm(
557  Kokkos::view_alloc(Kokkos::WithoutInitializing, "reverse_perm"),
558  perm_length);
559  Kokkos::parallel_for(
560  Kokkos::RangePolicy<execution_space>(exec_space_instances_[istream], 0, perm_length),
561  KOKKOS_LAMBDA(const local_ordinal_type ii) {
562  reverse_perm(perm(ii)) = ii;
563  });
564  reverse_perm_v_[istream] = reverse_perm;
565  }
566  }
567 
568  A_local_diagblks_rowmap_v_ = std::vector<lno_row_view_t>(num_streams_);
569  A_local_diagblks_entries_v_ = std::vector<lno_nonzero_view_t>(num_streams_);
570  A_local_diagblks_values_v_ = std::vector<scalar_nonzero_view_t>(num_streams_);
571 
572  for (int i = 0; i < num_streams_; i++) {
573  A_local_diagblks_rowmap_v_[i] = A_local_diagblks_v_[i].graph.row_map;
574  A_local_diagblks_entries_v_[i] = A_local_diagblks_v_[i].graph.entries;
575  A_local_diagblks_values_v_[i] = A_local_diagblks_v_[i].values;
576 
577  Teuchos::RCP<const crs_map_type> A_local_diagblks_RowMap = rcp(new crs_map_type(A_local_diagblks_v_[i].numRows(),
578  A_local_diagblks_v_[i].numRows(),
579  A_local_crs_->getRowMap()->getComm()));
580  Teuchos::RCP<const crs_map_type> A_local_diagblks_ColMap = rcp(new crs_map_type(A_local_diagblks_v_[i].numCols(),
581  A_local_diagblks_v_[i].numCols(),
582  A_local_crs_->getColMap()->getComm()));
583  Teuchos::RCP<crs_matrix_type> A_local_diagblks = rcp(new crs_matrix_type(A_local_diagblks_RowMap,
584  A_local_diagblks_ColMap,
585  A_local_diagblks_v_[i]));
586  Graph_v_[i] = rcp(new Ifpack2::IlukGraph<crs_graph_type, kk_handle_type>(A_local_diagblks->getCrsGraph(),
587  LevelOfFill_, 0, Overalloc_));
588  }
589  }
590  }
591 
592  if (this->isKokkosKernelsSpiluk_) {
593  if (!isKokkosKernelsStream_) {
594  this->KernelHandle_ = Teuchos::rcp(new kk_handle_type());
595  KernelHandle_->create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,
596  A_local_->getLocalNumRows(),
597  2 * A_local_->getLocalNumEntries() * (LevelOfFill_ + 1),
598  2 * A_local_->getLocalNumEntries() * (LevelOfFill_ + 1));
599  Graph_->initialize(KernelHandle_); // this calls spiluk_symbolic
600  } else {
601  KernelHandle_v_ = std::vector<Teuchos::RCP<kk_handle_type> >(num_streams_);
602  for (int i = 0; i < num_streams_; i++) {
603  KernelHandle_v_[i] = Teuchos::rcp(new kk_handle_type());
604  KernelHandle_v_[i]->create_spiluk_handle(KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,
605  A_local_diagblks_v_[i].numRows(),
606  2 * A_local_diagblks_v_[i].nnz() * (LevelOfFill_ + 1),
607  2 * A_local_diagblks_v_[i].nnz() * (LevelOfFill_ + 1));
608  Graph_v_[i]->initialize(KernelHandle_v_[i]); // this calls spiluk_symbolic
609  }
610  }
611  } else {
612  Graph_->initialize();
613  }
614 
615  allocate_L_and_U();
616  checkOrderingConsistency(*A_local_);
617  if (!isKokkosKernelsStream_) {
618  L_solver_->setMatrix(L_);
619  } else {
620  L_solver_->setStreamInfo(isKokkosKernelsStream_, num_streams_, exec_space_instances_);
621  L_solver_->setMatrices(L_v_);
622  }
623  L_solver_->initialize();
624 
625  if (!isKokkosKernelsStream_) {
626  U_solver_->setMatrix(U_);
627  } else {
628  U_solver_->setStreamInfo(isKokkosKernelsStream_, num_streams_, exec_space_instances_);
629  U_solver_->setMatrices(U_v_);
630  }
631  U_solver_->initialize();
632 
633  // Do not call initAllValues. compute() always calls initAllValues to
634  // fill L and U with possibly new numbers. initialize() is concerned
635  // only with the nonzero pattern.
636  } // Stop timing
637 
638  isInitialized_ = true;
639  ++numInitialize_;
640  initializeTime_ += (timer.wallTime() - startTime);
641 }
642 
643 template <class MatrixType>
645  checkOrderingConsistency(const row_matrix_type& A) {
646  // First check that the local row map ordering is the same as the local portion of the column map.
647  // The extraction of the strictly lower/upper parts of A, as well as the factorization,
648  // implicitly assume that this is the case.
649  Teuchos::ArrayView<const global_ordinal_type> rowGIDs = A.getRowMap()->getLocalElementList();
650  Teuchos::ArrayView<const global_ordinal_type> colGIDs = A.getColMap()->getLocalElementList();
651  bool gidsAreConsistentlyOrdered = true;
652  global_ordinal_type indexOfInconsistentGID = 0;
653  for (global_ordinal_type i = 0; i < rowGIDs.size(); ++i) {
654  if (rowGIDs[i] != colGIDs[i]) {
655  gidsAreConsistentlyOrdered = false;
656  indexOfInconsistentGID = i;
657  break;
658  }
659  }
660  TEUCHOS_TEST_FOR_EXCEPTION(gidsAreConsistentlyOrdered == false, std::runtime_error,
661  "The ordering of the local GIDs in the row and column maps is not the same"
662  << std::endl
663  << "at index " << indexOfInconsistentGID
664  << ". Consistency is required, as all calculations are done with"
665  << std::endl
666  << "local indexing.");
667 }
668 
669 template <class MatrixType>
670 void RILUK<MatrixType>::
671  initAllValues(const row_matrix_type& A) {
672  using Teuchos::ArrayRCP;
673  using Teuchos::Comm;
674  using Teuchos::ptr;
675  using Teuchos::RCP;
676  using Teuchos::REDUCE_SUM;
677  using Teuchos::reduceAll;
678  typedef Tpetra::Map<local_ordinal_type, global_ordinal_type, node_type> map_type;
679 
680  size_t NumIn = 0, NumL = 0, NumU = 0;
681  bool DiagFound = false;
682  size_t NumNonzeroDiags = 0;
683  size_t MaxNumEntries = A.getGlobalMaxNumRowEntries();
684 
685  // Allocate temporary space for extracting the strictly
686  // lower and upper parts of the matrix A.
687  nonconst_local_inds_host_view_type InI("InI", MaxNumEntries);
688  Teuchos::Array<local_ordinal_type> LI(MaxNumEntries);
689  Teuchos::Array<local_ordinal_type> UI(MaxNumEntries);
690  nonconst_values_host_view_type InV("InV", MaxNumEntries);
691  Teuchos::Array<scalar_type> LV(MaxNumEntries);
692  Teuchos::Array<scalar_type> UV(MaxNumEntries);
693 
694  // Check if values should be inserted or replaced
695  const bool ReplaceValues = L_->isStaticGraph() || L_->isLocallyIndexed();
696 
697  L_->resumeFill();
698  U_->resumeFill();
699  if (ReplaceValues) {
700  L_->setAllToScalar(STS::zero()); // Zero out L and U matrices
701  U_->setAllToScalar(STS::zero());
702  }
703 
704  D_->putScalar(STS::zero()); // Set diagonal values to zero
705  auto DV = Kokkos::subview(D_->getLocalViewHost(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0);
706 
707  RCP<const map_type> rowMap = L_->getRowMap();
708 
709  // First we copy the user's matrix into L and U, regardless of fill level.
710  // It is important to note that L and U are populated using local indices.
711  // This means that if the row map GIDs are not monotonically increasing
712  // (i.e., permuted or gappy), then the strictly lower (upper) part of the
713  // matrix is not the one that you would get if you based L (U) on GIDs.
714  // This is ok, as the *order* of the GIDs in the rowmap is a better
715  // expression of the user's intent than the GIDs themselves.
716 
717  Teuchos::ArrayView<const global_ordinal_type> nodeGIDs = rowMap->getLocalElementList();
718  for (size_t myRow = 0; myRow < A.getLocalNumRows(); ++myRow) {
719  local_ordinal_type local_row = myRow;
720 
721  // TODO JJH 4April2014 An optimization is to use getLocalRowView. Not all matrices support this,
722  // we'd need to check via the Tpetra::RowMatrix method supportsRowViews().
723  A.getLocalRowCopy(local_row, InI, InV, NumIn); // Get Values and Indices
724 
725  // Split into L and U (we don't assume that indices are ordered).
726 
727  NumL = 0;
728  NumU = 0;
729  DiagFound = false;
730 
731  for (size_t j = 0; j < NumIn; ++j) {
732  const local_ordinal_type k = InI[j];
733 
734  if (k == local_row) {
735  DiagFound = true;
736  // Store perturbed diagonal in Tpetra::Vector D_
737  DV(local_row) += Rthresh_ * InV[j] + IFPACK2_SGN(InV[j]) * Athresh_;
738  } else if (k < 0) { // Out of range
740  true, std::runtime_error,
741  "Ifpack2::RILUK::initAllValues: current "
742  "GID k = "
743  << k << " < 0. I'm not sure why this is an error; it is "
744  "probably an artifact of the undocumented assumptions of the "
745  "original implementation (likely copied and pasted from Ifpack). "
746  "Nevertheless, the code I found here insisted on this being an error "
747  "state, so I will throw an exception here.");
748  } else if (k < local_row) {
749  LI[NumL] = k;
750  LV[NumL] = InV[j];
751  NumL++;
752  } else if (Teuchos::as<size_t>(k) <= rowMap->getLocalNumElements()) {
753  UI[NumU] = k;
754  UV[NumU] = InV[j];
755  NumU++;
756  }
757  }
758 
759  // Check in things for this row of L and U
760 
761  if (DiagFound) {
762  ++NumNonzeroDiags;
763  } else {
764  DV(local_row) = Athresh_;
765  }
766 
767  if (NumL) {
768  if (ReplaceValues) {
769  L_->replaceLocalValues(local_row, LI(0, NumL), LV(0, NumL));
770  } else {
771  // FIXME JJH 24April2014 Is this correct? I believe this case is when there aren't already values
772  // FIXME in this row in the column locations corresponding to UI.
773  L_->insertLocalValues(local_row, LI(0, NumL), LV(0, NumL));
774  }
775  }
776 
777  if (NumU) {
778  if (ReplaceValues) {
779  U_->replaceLocalValues(local_row, UI(0, NumU), UV(0, NumU));
780  } else {
781  // FIXME JJH 24April2014 Is this correct? I believe this case is when there aren't already values
782  // FIXME in this row in the column locations corresponding to UI.
783  U_->insertLocalValues(local_row, UI(0, NumU), UV(0, NumU));
784  }
785  }
786  }
787 
788  // At this point L and U have the values of A in the structure of L
789  // and U, and diagonal vector D
790 
791  isInitialized_ = true;
792 }
793 
794 template <class MatrixType>
795 void RILUK<MatrixType>::compute_serial() {
796  // Fill L and U with numbers. This supports nonzero pattern reuse by calling
797  // initialize() once and then compute() multiple times.
798  initAllValues(*A_local_);
799 
800  // MinMachNum should be officially defined, for now pick something a little
801  // bigger than IEEE underflow value
802 
803  const scalar_type MinDiagonalValue = STS::rmin();
804  const scalar_type MaxDiagonalValue = STS::one() / MinDiagonalValue;
805 
806  size_t NumIn, NumL, NumU;
807 
808  // Get Maximum Row length
809  const size_t MaxNumEntries =
810  L_->getLocalMaxNumRowEntries() + U_->getLocalMaxNumRowEntries() + 1;
811 
812  Teuchos::Array<local_ordinal_type> InI(MaxNumEntries); // Allocate temp space
813  Teuchos::Array<scalar_type> InV(MaxNumEntries);
814  size_t num_cols = U_->getColMap()->getLocalNumElements();
815  Teuchos::Array<int> colflag(num_cols, -1);
816 
817  auto DV = Kokkos::subview(D_->getLocalViewHost(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0);
818 
819  // Now start the factorization.
820 
821  using IST = typename row_matrix_type::impl_scalar_type;
822  for (size_t i = 0; i < L_->getLocalNumRows(); ++i) {
823  local_ordinal_type local_row = i;
824  // Need some integer workspace and pointers
825  size_t NumUU;
826  local_inds_host_view_type UUI;
827  values_host_view_type UUV;
828 
829  // Fill InV, InI with current row of L, D and U combined
830 
831  NumIn = MaxNumEntries;
832  nonconst_local_inds_host_view_type InI_v(InI.data(), MaxNumEntries);
833  nonconst_values_host_view_type InV_v(reinterpret_cast<IST*>(InV.data()), MaxNumEntries);
834 
835  L_->getLocalRowCopy(local_row, InI_v, InV_v, NumL);
836 
837  InV[NumL] = DV(i); // Put in diagonal
838  InI[NumL] = local_row;
839 
840  nonconst_local_inds_host_view_type InI_sub(InI.data() + NumL + 1, MaxNumEntries - NumL - 1);
841  nonconst_values_host_view_type InV_sub(reinterpret_cast<IST*>(InV.data()) + NumL + 1, MaxNumEntries - NumL - 1);
842 
843  U_->getLocalRowCopy(local_row, InI_sub, InV_sub, NumU);
844  NumIn = NumL + NumU + 1;
845 
846  // Set column flags
847  for (size_t j = 0; j < NumIn; ++j) {
848  colflag[InI[j]] = j;
849  }
850 
851  scalar_type diagmod = STS::zero(); // Off-diagonal accumulator
852 
853  for (size_t jj = 0; jj < NumL; ++jj) {
854  local_ordinal_type j = InI[jj];
855  IST multiplier = InV[jj]; // current_mults++;
856 
857  InV[jj] *= static_cast<scalar_type>(DV(j));
858 
859  U_->getLocalRowView(j, UUI, UUV); // View of row above
860  NumUU = UUI.size();
861 
862  if (RelaxValue_ == STM::zero()) {
863  for (size_t k = 0; k < NumUU; ++k) {
864  const int kk = colflag[UUI[k]];
865  // FIXME (mfh 23 Dec 2013) Wait a second, we just set
866  // colflag above using size_t (which is generally unsigned),
867  // but now we're querying it using int (which is signed).
868  if (kk > -1) {
869  InV[kk] -= static_cast<scalar_type>(multiplier * UUV[k]);
870  }
871  }
872 
873  } else {
874  for (size_t k = 0; k < NumUU; ++k) {
875  // FIXME (mfh 23 Dec 2013) Wait a second, we just set
876  // colflag above using size_t (which is generally unsigned),
877  // but now we're querying it using int (which is signed).
878  const int kk = colflag[UUI[k]];
879  if (kk > -1) {
880  InV[kk] -= static_cast<scalar_type>(multiplier * UUV[k]);
881  } else {
882  diagmod -= static_cast<scalar_type>(multiplier * UUV[k]);
883  }
884  }
885  }
886  }
887 
888  if (NumL) {
889  // Replace current row of L
890  L_->replaceLocalValues(local_row, InI(0, NumL), InV(0, NumL));
891  }
892 
893  DV(i) = InV[NumL]; // Extract Diagonal value
894 
895  if (RelaxValue_ != STM::zero()) {
896  DV(i) += RelaxValue_ * diagmod; // Add off diagonal modifications
897  }
898 
899  if (STS::magnitude(DV(i)) > STS::magnitude(MaxDiagonalValue)) {
900  if (STS::real(DV(i)) < STM::zero()) {
901  DV(i) = -MinDiagonalValue;
902  } else {
903  DV(i) = MinDiagonalValue;
904  }
905  } else {
906  DV(i) = static_cast<impl_scalar_type>(STS::one()) / DV(i); // Invert diagonal value
907  }
908 
909  for (size_t j = 0; j < NumU; ++j) {
910  InV[NumL + 1 + j] *= static_cast<scalar_type>(DV(i)); // Scale U by inverse of diagonal
911  }
912 
913  if (NumU) {
914  // Replace current row of L and U
915  U_->replaceLocalValues(local_row, InI(NumL + 1, NumU), InV(NumL + 1, NumU));
916  }
917 
918  // Reset column flags
919  for (size_t j = 0; j < NumIn; ++j) {
920  colflag[InI[j]] = -1;
921  }
922  }
923 
924  // The domain of L and the range of U are exactly their own row maps
925  // (there is no communication). The domain of U and the range of L
926  // must be the same as those of the original matrix, However if the
927  // original matrix is a VbrMatrix, these two latter maps are
928  // translation from a block map to a point map.
929  // FIXME (mfh 23 Dec 2013) Do we know that the column Map of L_ is
930  // always one-to-one?
931  L_->fillComplete(L_->getColMap(), A_local_->getRangeMap());
932  U_->fillComplete(A_local_->getDomainMap(), U_->getRowMap());
933 
934  // If L_solver_ or U_solver store modified factors internally, we need to reset those
935  L_solver_->setMatrix(L_);
936  L_solver_->compute(); // NOTE: Only do compute if the pointer changed. Otherwise, do nothing
937  U_solver_->setMatrix(U_);
938  U_solver_->compute(); // NOTE: Only do compute if the pointer changed. Otherwise, do nothing
939 }
940 
941 template <class MatrixType>
942 void RILUK<MatrixType>::compute_kkspiluk() {
943  L_->resumeFill();
944  U_->resumeFill();
945 
946  L_->setAllToScalar(STS::zero()); // Zero out L and U matrices
947  U_->setAllToScalar(STS::zero());
948 
949  using row_map_type = typename crs_matrix_type::local_matrix_device_type::row_map_type;
950  auto lclL = L_->getLocalMatrixDevice();
951  row_map_type L_rowmap = lclL.graph.row_map;
952  auto L_entries = lclL.graph.entries;
953  auto L_values = lclL.values;
954 
955  auto lclU = U_->getLocalMatrixDevice();
956  row_map_type U_rowmap = lclU.graph.row_map;
957  auto U_entries = lclU.graph.entries;
958  auto U_values = lclU.values;
959 
960  auto lclMtx = A_local_crs_->getLocalMatrixDevice();
961  KokkosSparse::spiluk_numeric(KernelHandle_.getRawPtr(), LevelOfFill_,
962  lclMtx.graph.row_map, lclMtx.graph.entries, lclMtx.values,
963  L_rowmap, L_entries, L_values, U_rowmap, U_entries, U_values);
964 
965  L_->fillComplete(L_->getColMap(), A_local_->getRangeMap());
966  U_->fillComplete(A_local_->getDomainMap(), U_->getRowMap());
967 
968  L_solver_->compute();
969  U_solver_->compute();
970 }
971 
972 template <class MatrixType>
973 void RILUK<MatrixType>::compute_kkspiluk_stream() {
974  std::vector<lno_row_view_t> L_rowmap_v(num_streams_);
975  std::vector<lno_nonzero_view_t> L_entries_v(num_streams_);
976  std::vector<scalar_nonzero_view_t> L_values_v(num_streams_);
977  std::vector<lno_row_view_t> U_rowmap_v(num_streams_);
978  std::vector<lno_nonzero_view_t> U_entries_v(num_streams_);
979  std::vector<scalar_nonzero_view_t> U_values_v(num_streams_);
980  std::vector<kk_handle_type*> KernelHandle_rawptr_v_(num_streams_);
981  for (int i = 0; i < num_streams_; i++) {
982  L_v_[i]->resumeFill();
983  U_v_[i]->resumeFill();
984 
985  L_v_[i]->setAllToScalar(STS::zero()); // Zero out L and U matrices
986  U_v_[i]->setAllToScalar(STS::zero());
987 
988  auto lclL = L_v_[i]->getLocalMatrixDevice();
989  L_rowmap_v[i] = lclL.graph.row_map;
990  L_entries_v[i] = lclL.graph.entries;
991  L_values_v[i] = lclL.values;
992 
993  auto lclU = U_v_[i]->getLocalMatrixDevice();
994  U_rowmap_v[i] = lclU.graph.row_map;
995  U_entries_v[i] = lclU.graph.entries;
996  U_values_v[i] = lclU.values;
997  KernelHandle_rawptr_v_[i] = KernelHandle_v_[i].getRawPtr();
998  }
999 
1000  {
1001  auto lclMtx = A_local_crs_->getLocalMatrixDevice();
1002  // A_local_diagblks was already setup during initialize, just copy the corresponding
1003  // values from A_local_crs_ in parallel now.
1004  using TeamPolicy = Kokkos::TeamPolicy<execution_space>;
1005  const auto A_nrows = lclMtx.numRows();
1006  auto rows_per_block = ((A_nrows % num_streams_) == 0)
1007  ? (A_nrows / num_streams_)
1008  : (A_nrows / num_streams_ + 1);
1009  for (int i = 0; i < num_streams_; i++) {
1010  const auto start_row_offset = i * rows_per_block;
1011  auto rowptrs = A_local_diagblks_rowmap_v_[i];
1012  auto colindices = A_local_diagblks_entries_v_[i];
1013  auto values = A_local_diagblks_values_v_[i];
1014  const bool reordered = hasStreamReordered_;
1015  typename lno_nonzero_view_t::non_const_type reverse_perm = hasStreamReordered_ ? reverse_perm_v_[i] : typename lno_nonzero_view_t::non_const_type{};
1016  TeamPolicy pol(exec_space_instances_[i], A_local_diagblks_rowmap_v_[i].extent(0) - 1, Kokkos::AUTO);
1017  Kokkos::parallel_for(
1018  pol, KOKKOS_LAMBDA(const typename TeamPolicy::member_type& team) {
1019  const auto irow = team.league_rank();
1020  const auto irow_A = start_row_offset + (reordered ? reverse_perm(irow) : irow);
1021  const auto A_local_crs_row = lclMtx.rowConst(irow_A);
1022  const auto begin_row = rowptrs(irow);
1023  const auto num_entries = rowptrs(irow + 1) - begin_row;
1024  Kokkos::parallel_for(Kokkos::TeamThreadRange(team, num_entries), [&](const int j) {
1025  const auto colidx = colindices(begin_row + j);
1026  const auto colidx_A = start_row_offset + (reordered ? reverse_perm(colidx) : colidx);
1027  // Find colidx in A_local_crs_row
1028  const int offset = KokkosSparse::findRelOffset(
1029  &A_local_crs_row.colidx(0), A_local_crs_row.length, colidx_A, 0, false);
1030  values(begin_row + j) = A_local_crs_row.value(offset);
1031  });
1032  });
1033  }
1034  }
1035 
1036  KokkosSparse::Experimental::spiluk_numeric_streams(exec_space_instances_, KernelHandle_rawptr_v_, LevelOfFill_,
1037  A_local_diagblks_rowmap_v_, A_local_diagblks_entries_v_, A_local_diagblks_values_v_,
1038  L_rowmap_v, L_entries_v, L_values_v,
1039  U_rowmap_v, U_entries_v, U_values_v);
1040 
1041  for (int i = 0; i < num_streams_; i++) {
1042  L_v_[i]->fillComplete();
1043  U_v_[i]->fillComplete();
1044  }
1045 
1046  L_solver_->compute();
1047  U_solver_->compute();
1048 }
1049 
1050 template <class MatrixType>
1052  using Teuchos::Array;
1053  using Teuchos::ArrayView;
1054  using Teuchos::RCP;
1055  using Teuchos::rcp;
1056  using Teuchos::rcp_const_cast;
1057  using Teuchos::rcp_dynamic_cast;
1058  const char prefix[] = "Ifpack2::RILUK::compute: ";
1059 
1060  // initialize() checks this too, but it's easier for users if the
1061  // error shows them the name of the method that they actually
1062  // called, rather than the name of some internally called method.
1063  TEUCHOS_TEST_FOR_EXCEPTION(A_.is_null(), std::runtime_error, prefix << "The matrix is null. Please "
1064  "call setMatrix() with a nonnull input before calling this method.");
1065  TEUCHOS_TEST_FOR_EXCEPTION(!A_->isFillComplete(), std::runtime_error, prefix << "The matrix is not "
1066  "fill complete. You may not invoke initialize() or compute() with this "
1067  "matrix until the matrix is fill complete. If your matrix is a "
1068  "Tpetra::CrsMatrix, please call fillComplete on it (with the domain and "
1069  "range Maps, if appropriate) before calling this method.");
1070 
1071  if (!isInitialized()) {
1072  initialize(); // Don't count this in the compute() time
1073  }
1074 
1075  Teuchos::Time timer("RILUK::compute");
1076 
1077  // Start timing
1078  Teuchos::TimeMonitor timeMon(timer);
1079  double startTime = timer.wallTime();
1080 
1081  isComputed_ = false;
1082 
1083  if (!this->isKokkosKernelsSpiluk_) {
1084  compute_serial();
1085  } else {
1086  // Make sure values in A is picked up even in case of pattern reuse
1087  if (!A_local_crs_nc_.is_null()) {
1088  // copy entries into A_local_crs
1089  A_local_crs_nc_->resumeFill();
1090  local_ordinal_type numRows = A_local_->getLocalNumRows();
1091  nonconst_local_inds_host_view_type indices("indices", A_local_->getLocalMaxNumRowEntries());
1092  nonconst_values_host_view_type values("values", A_local_->getLocalMaxNumRowEntries());
1093  for (local_ordinal_type i = 0; i < numRows; i++) {
1094  size_t numEntries = 0;
1095  A_local_->getLocalRowCopy(i, indices, values, numEntries);
1096  A_local_crs_nc_->replaceLocalValues(i, numEntries, reinterpret_cast<scalar_type*>(values.data()), indices.data());
1097  }
1098  A_local_crs_nc_->fillComplete(A_local_->getDomainMap(), A_local_->getRangeMap());
1099  }
1100 
1101  if (!isKokkosKernelsStream_) {
1102  compute_kkspiluk();
1103  } else {
1104  // If streams are on, we potentially have to refresh A_local_diagblks_values_v_
1105  auto lclMtx = A_local_crs_->getLocalMatrixDevice();
1106  KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks_v_, hasStreamReordered_);
1107  for (int i = 0; i < num_streams_; i++) {
1108  A_local_diagblks_values_v_[i] = A_local_diagblks_v_[i].values;
1109  }
1110 
1111  compute_kkspiluk_stream();
1112  }
1113  }
1114 
1115  isComputed_ = true;
1116  ++numCompute_;
1117  computeTime_ += (timer.wallTime() - startTime);
1118 }
1119 
1120 namespace Impl {
1121 template <typename MV, typename Map>
1122 void resetMultiVecIfNeeded(std::unique_ptr<MV>& mv_ptr, const Map& map, const size_t numVectors, bool initialize) {
1123  if (!mv_ptr || mv_ptr->getNumVectors() != numVectors) {
1124  mv_ptr.reset(new MV(map, numVectors, initialize));
1125  }
1126 }
1127 } // namespace Impl
1128 
1129 template <class MatrixType>
1131  apply(const Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& X,
1132  Tpetra::MultiVector<scalar_type, local_ordinal_type, global_ordinal_type, node_type>& Y,
1133  Teuchos::ETransp mode,
1134  scalar_type alpha,
1135  scalar_type beta) const {
1136  using Teuchos::RCP;
1137  using Teuchos::rcpFromRef;
1138 
1140  A_.is_null(), std::runtime_error,
1141  "Ifpack2::RILUK::apply: The matrix is "
1142  "null. Please call setMatrix() with a nonnull input, then initialize() "
1143  "and compute(), before calling this method.");
1145  !isComputed(), std::runtime_error,
1146  "Ifpack2::RILUK::apply: If you have not yet called compute(), "
1147  "you must call compute() before calling this method.");
1148  TEUCHOS_TEST_FOR_EXCEPTION(
1149  X.getNumVectors() != Y.getNumVectors(), std::invalid_argument,
1150  "Ifpack2::RILUK::apply: X and Y do not have the same number of columns. "
1151  "X.getNumVectors() = "
1152  << X.getNumVectors()
1153  << " != Y.getNumVectors() = " << Y.getNumVectors() << ".");
1154  TEUCHOS_TEST_FOR_EXCEPTION(
1155  STS::isComplex && mode == Teuchos::CONJ_TRANS, std::logic_error,
1156  "Ifpack2::RILUK::apply: mode = Teuchos::CONJ_TRANS is not implemented for "
1157  "complex Scalar type. Please talk to the Ifpack2 developers to get this "
1158  "fixed. There is a FIXME in this file about this very issue.");
1159 #ifdef HAVE_IFPACK2_DEBUG
1160  {
1161  if (!isKokkosKernelsStream_) {
1162  const magnitude_type D_nrm1 = D_->norm1();
1163  TEUCHOS_TEST_FOR_EXCEPTION(STM::isnaninf(D_nrm1), std::runtime_error, "Ifpack2::RILUK::apply: The 1-norm of the stored diagonal is NaN or Inf.");
1164  }
1165  Teuchos::Array<magnitude_type> norms(X.getNumVectors());
1166  X.norm1(norms());
1167  bool good = true;
1168  for (size_t j = 0; j < X.getNumVectors(); ++j) {
1169  if (STM::isnaninf(norms[j])) {
1170  good = false;
1171  break;
1172  }
1173  }
1174  TEUCHOS_TEST_FOR_EXCEPTION(!good, std::runtime_error, "Ifpack2::RILUK::apply: The 1-norm of the input X is NaN or Inf.");
1175  }
1176 #endif // HAVE_IFPACK2_DEBUG
1177 
1178  const scalar_type one = STS::one();
1179  const scalar_type zero = STS::zero();
1180 
1181  Teuchos::Time timer("RILUK::apply");
1182  double startTime = timer.wallTime();
1183  { // Start timing
1184  Teuchos::TimeMonitor timeMon(timer);
1185  if (alpha == one && beta == zero) {
1186  if (isKokkosKernelsSpiluk_ && isKokkosKernelsStream_ && hasStreamReordered_) {
1187  Impl::resetMultiVecIfNeeded(reordered_x_, X.getMap(), X.getNumVectors(), false);
1188  Impl::resetMultiVecIfNeeded(reordered_y_, Y.getMap(), Y.getNumVectors(), false);
1189  Kokkos::fence();
1190  for (size_t j = 0; j < X.getNumVectors(); j++) {
1191  auto X_j = X.getVector(j);
1192  auto ReorderedX_j = reordered_x_->getVectorNonConst(j);
1193  auto X_lcl = X_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
1194  auto ReorderedX_lcl = ReorderedX_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
1195  local_ordinal_type stream_begin = 0;
1196  local_ordinal_type stream_end;
1197  for (int i = 0; i < num_streams_; i++) {
1198  auto perm_i = perm_v_[i];
1199  stream_end = stream_begin + perm_i.extent(0);
1200  auto X_lcl_sub = Kokkos::subview(X_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1201  auto ReorderedX_lcl_sub = Kokkos::subview(ReorderedX_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1202  Kokkos::parallel_for(
1203  Kokkos::RangePolicy<execution_space>(exec_space_instances_[i], 0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA(const int& ii) {
1204  ReorderedX_lcl_sub(perm_i(ii)) = X_lcl_sub(ii);
1205  });
1206  stream_begin = stream_end;
1207  }
1208  }
1209  Kokkos::fence();
1210  if (mode == Teuchos::NO_TRANS) { // Solve L (U Y) = X for Y.
1211  // Solve L Y = X for Y.
1212  L_solver_->apply(*reordered_x_, Y, mode);
1213  // Solve U Y = Y for Y.
1214  U_solver_->apply(Y, *reordered_y_, mode);
1215  } else { // Solve U^P (L^P Y) = X for Y (where P is * or T).
1216  // Solve U^P Y = X for Y.
1217  U_solver_->apply(*reordered_x_, Y, mode);
1218  // Solve L^P Y = Y for Y.
1219  L_solver_->apply(Y, *reordered_y_, mode);
1220  }
1221 
1222  for (size_t j = 0; j < Y.getNumVectors(); j++) {
1223  auto Y_j = Y.getVectorNonConst(j);
1224  auto ReorderedY_j = reordered_y_->getVector(j);
1225  auto Y_lcl = Y_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
1226  auto ReorderedY_lcl = ReorderedY_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
1227  local_ordinal_type stream_begin = 0;
1228  local_ordinal_type stream_end;
1229  for (int i = 0; i < num_streams_; i++) {
1230  auto perm_i = perm_v_[i];
1231  stream_end = stream_begin + perm_i.extent(0);
1232  auto Y_lcl_sub = Kokkos::subview(Y_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1233  auto ReorderedY_lcl_sub = Kokkos::subview(ReorderedY_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1234  Kokkos::parallel_for(
1235  Kokkos::RangePolicy<execution_space>(exec_space_instances_[i], 0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA(const int& ii) {
1236  Y_lcl_sub(ii) = ReorderedY_lcl_sub(perm_i(ii));
1237  });
1238  stream_begin = stream_end;
1239  }
1240  }
1241  Kokkos::fence();
1242  } else {
1243  if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y.
1244 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
1245  // NOTE (Nov-15-2022):
1246  // This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
1247  // since cusparseSpSV_solve() does not support in-place computation
1248  Impl::resetMultiVecIfNeeded(Y_tmp_, Y.getMap(), Y.getNumVectors(), false);
1249 
1250  // Start by solving L Y_tmp = X for Y_tmp.
1251  L_solver_->apply(X, *Y_tmp_, mode);
1252 
1253  if (!this->isKokkosKernelsSpiluk_) {
1254  // Solve D Y = Y. The operation lets us do this in place in Y, so we can
1255  // write "solve D Y = Y for Y."
1256  Y_tmp_->elementWiseMultiply(one, *D_, *Y_tmp_, zero);
1257  }
1258 
1259  U_solver_->apply(*Y_tmp_, Y, mode); // Solve U Y = Y_tmp.
1260 #else
1261  // Start by solving L Y = X for Y.
1262  L_solver_->apply(X, Y, mode);
1263 
1264  if (!this->isKokkosKernelsSpiluk_) {
1265  // Solve D Y = Y. The operation lets us do this in place in Y, so we can
1266  // write "solve D Y = Y for Y."
1267  Y.elementWiseMultiply(one, *D_, Y, zero);
1268  }
1269 
1270  U_solver_->apply(Y, Y, mode); // Solve U Y = Y.
1271 #endif
1272  } else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T).
1273 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
1274  // NOTE (Nov-15-2022):
1275  // This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
1276  // since cusparseSpSV_solve() does not support in-place computation
1277  Impl::resetMultiVecIfNeeded(Y_tmp_, Y.getMap(), Y.getNumVectors(), false);
1278 
1279  // Start by solving U^P Y_tmp = X for Y_tmp.
1280  U_solver_->apply(X, *Y_tmp_, mode);
1281 
1282  if (!this->isKokkosKernelsSpiluk_) {
1283  // Solve D^P Y = Y.
1284  //
1285  // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
1286  // need to do an elementwise multiply with the conjugate of
1287  // D_, not just with D_ itself.
1288  Y_tmp_->elementWiseMultiply(one, *D_, *Y_tmp_, zero);
1289  }
1290 
1291  L_solver_->apply(*Y_tmp_, Y, mode); // Solve L^P Y = Y_tmp.
1292 #else
1293  // Start by solving U^P Y = X for Y.
1294  U_solver_->apply(X, Y, mode);
1295 
1296  if (!this->isKokkosKernelsSpiluk_) {
1297  // Solve D^P Y = Y.
1298  //
1299  // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
1300  // need to do an elementwise multiply with the conjugate of
1301  // D_, not just with D_ itself.
1302  Y.elementWiseMultiply(one, *D_, Y, zero);
1303  }
1304 
1305  L_solver_->apply(Y, Y, mode); // Solve L^P Y = Y.
1306 #endif
1307  }
1308  }
1309  } else { // alpha != 1 or beta != 0
1310  if (alpha == zero) {
1311  // The special case for beta == 0 ensures that if Y contains Inf
1312  // or NaN values, we replace them with 0 (following BLAS
1313  // convention), rather than multiplying them by 0 to get NaN.
1314  if (beta == zero) {
1315  Y.putScalar(zero);
1316  } else {
1317  Y.scale(beta);
1318  }
1319  } else { // alpha != zero
1320  Impl::resetMultiVecIfNeeded(Y_tmp_, Y.getMap(), Y.getNumVectors(), false);
1321  apply(X, *Y_tmp_, mode);
1322  Y.update(alpha, *Y_tmp_, beta);
1323  }
1324  }
1325  } // end timing
1326 
1327 #ifdef HAVE_IFPACK2_DEBUG
1328  {
1329  Teuchos::Array<magnitude_type> norms(Y.getNumVectors());
1330  Y.norm1(norms());
1331  bool good = true;
1332  for (size_t j = 0; j < Y.getNumVectors(); ++j) {
1333  if (STM::isnaninf(norms[j])) {
1334  good = false;
1335  break;
1336  }
1337  }
1338  TEUCHOS_TEST_FOR_EXCEPTION(!good, std::runtime_error, "Ifpack2::RILUK::apply: The 1-norm of the output Y is NaN or Inf.");
1339  }
1340 #endif // HAVE_IFPACK2_DEBUG
1341 
1342  ++numApply_;
1343  applyTime_ += (timer.wallTime() - startTime);
1344 }
1345 
1346 // VINH comment out since multiply() is not needed anywhere
1347 // template<class MatrixType>
1348 // void RILUK<MatrixType>::
1349 // multiply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1350 // Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y,
1351 // const Teuchos::ETransp mode) const
1352 //{
1353 // const scalar_type zero = STS::zero ();
1354 // const scalar_type one = STS::one ();
1355 //
1356 // if (mode != Teuchos::NO_TRANS) {
1357 // U_->apply (X, Y, mode); //
1358 // Y.update (one, X, one); // Y = Y + X (account for implicit unit diagonal)
1359 //
1360 // // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we need
1361 // // to do an elementwise multiply with the conjugate of D_, not
1362 // // just with D_ itself.
1363 // Y.elementWiseMultiply (one, *D_, Y, zero); // y = D*y (D_ has inverse of diagonal)
1364 //
1365 // MV Y_tmp (Y, Teuchos::Copy); // Need a temp copy of Y
1366 // L_->apply (Y_tmp, Y, mode);
1367 // Y.update (one, Y_tmp, one); // (account for implicit unit diagonal)
1368 // }
1369 // else {
1370 // L_->apply (X, Y, mode);
1371 // Y.update (one, X, one); // Y = Y + X (account for implicit unit diagonal)
1372 // Y.elementWiseMultiply (one, *D_, Y, zero); // y = D*y (D_ has inverse of diagonal)
1373 // MV Y_tmp (Y, Teuchos::Copy); // Need a temp copy of Y1
1374 // U_->apply (Y_tmp, Y, mode);
1375 // Y.update (one, Y_tmp, one); // (account for implicit unit diagonal)
1376 // }
1377 // }
1378 
1379 template <class MatrixType>
1380 std::string RILUK<MatrixType>::description() const {
1381  std::ostringstream os;
1382 
1383  // Output is a valid YAML dictionary in flow style. If you don't
1384  // like everything on a single line, you should call describe()
1385  // instead.
1386  os << "\"Ifpack2::RILUK\": {";
1387  os << "Initialized: " << (isInitialized() ? "true" : "false") << ", "
1388  << "Computed: " << (isComputed() ? "true" : "false") << ", ";
1389 
1390  os << "Level-of-fill: " << getLevelOfFill() << ", ";
1391 
1392  if (isKokkosKernelsSpiluk_) os << "KK-SPILUK, ";
1393  if (isKokkosKernelsStream_) os << "KK-Stream, ";
1394 
1395  if (A_.is_null()) {
1396  os << "Matrix: null";
1397  } else {
1398  os << "Global matrix dimensions: ["
1399  << A_->getGlobalNumRows() << ", " << A_->getGlobalNumCols() << "]"
1400  << ", Global nnz: " << A_->getGlobalNumEntries();
1401  }
1402 
1403  if (!L_solver_.is_null()) os << ", " << L_solver_->description();
1404  if (!U_solver_.is_null()) os << ", " << U_solver_->description();
1405 
1406  os << "}";
1407  return os.str();
1408 }
1409 
1410 } // namespace Ifpack2
1411 
1412 #define IFPACK2_RILUK_INSTANT(S, LO, GO, N) \
1413  template class Ifpack2::RILUK<Tpetra::RowMatrix<S, LO, GO, N> >;
1414 
1415 #endif
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getRangeMap() const
Returns the Tpetra::Map object associated with the range of this operator.
Definition: Ifpack2_RILUK_def.hpp:228
node_type::execution_space execution_space
The Kokkos execution space of the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:241
static Teuchos::RCP< const row_matrix_type > makeLocalFilter(const Teuchos::RCP< const row_matrix_type > &A)
Return A, wrapped in a LocalFilter, if necessary.
Definition: Ifpack2_RILUK_def.hpp:417
MatrixType::global_ordinal_type global_ordinal_type
The type of global indices in the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:229
T & get(const std::string &name, T def_value)
bool nonnull(const std::shared_ptr< T > &p)
const Tpetra::Vector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > & getD() const
Return the diagonal entries of the ILU factorization.
Definition: Ifpack2_RILUK_def.hpp:166
void initialize()
Initialize by computing the symbolic incomplete factorization.
Definition: Ifpack2_RILUK_def.hpp:447
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Tpetra::RowMatrix< scalar_type, local_ordinal_type, global_ordinal_type, node_type > row_matrix_type
Tpetra::RowMatrix specialization used by this class.
Definition: Ifpack2_RILUK_decl.hpp:248
size_type size() const
virtual void setMatrix(const Teuchos::RCP< const row_matrix_type > &A)
Change the matrix to be preconditioned.
Definition: Ifpack2_RILUK_def.hpp:117
std::string description() const
A one-line description of this object.
Definition: Ifpack2_RILUK_def.hpp:1380
void setParameters(const Teuchos::ParameterList &params)
Definition: Ifpack2_RILUK_def.hpp:287
ILU(k) factorization of a given Tpetra::RowMatrix.
Definition: Ifpack2_RILUK_decl.hpp:213
size_t getNodeSmootherComplexity() const
Get a rough estimate of cost per iteration.
Definition: Ifpack2_RILUK_def.hpp:191
Teuchos::RCP< const crs_matrix_type > getCrsMatrix() const
Return the input matrix A as a Tpetra::CrsMatrix, if possible; else throws.
Definition: Ifpack2_RILUK_def.hpp:411
Tpetra::CrsMatrix< scalar_type, local_ordinal_type, global_ordinal_type, node_type > crs_matrix_type
Tpetra::CrsMatrix specialization used by this class for representing L and U.
Definition: Ifpack2_RILUK_decl.hpp:250
const crs_matrix_type & getU() const
Return the U factor of the ILU factorization.
Definition: Ifpack2_RILUK_def.hpp:179
bool isParameter(const std::string &name) const
MatrixType::node_type node_type
The Node type used by the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:232
T * getRawPtr() const
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getDomainMap() const
Returns the Tpetra::Map object associated with the domain of this operator.
Definition: Ifpack2_RILUK_def.hpp:208
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
MatrixType::local_ordinal_type local_ordinal_type
The type of local indices in the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:226
virtual ~RILUK()
Destructor (declared virtual for memory safety).
Definition: Ifpack2_RILUK_def.hpp:94
&quot;Preconditioner&quot; that solves local sparse triangular systems.
Definition: Ifpack2_LocalSparseTriangularSolver_decl.hpp:46
void compute()
Compute the (numeric) incomplete factorization.
Definition: Ifpack2_RILUK_def.hpp:1051
void apply(const Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &X, Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, scalar_type alpha=Teuchos::ScalarTraits< scalar_type >::one(), scalar_type beta=Teuchos::ScalarTraits< scalar_type >::zero()) const
Apply the (inverse of the) incomplete factorization to X, resulting in Y.
Definition: Ifpack2_RILUK_def.hpp:1131
Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type
The type of the magnitude (absolute value) of a matrix entry.
Definition: Ifpack2_RILUK_decl.hpp:235
void resize(size_type new_size, const value_type &x=value_type())
Construct a level filled graph for use in computing an ILU(k) incomplete factorization.
Definition: Ifpack2_IlukGraph.hpp:67
IntegralType getIntegralValue(const std::string &str, const std::string &paramName="", const std::string &sublistName="") const
Teuchos::RCP< const row_matrix_type > getMatrix() const
Get the input matrix.
Definition: Ifpack2_RILUK_def.hpp:405
bool isType(const std::string &name) const
Declaration of MDF interface.
Access only local rows and columns of a sparse matrix.
Definition: Ifpack2_LocalFilter_decl.hpp:127
static double wallTime()
const crs_matrix_type & getL() const
Return the L factor of the ILU factorization.
Definition: Ifpack2_RILUK_def.hpp:150
std::string typeName(const T &t)
MatrixType::scalar_type scalar_type
The type of the entries of the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:223
bool is_null() const