Ifpack2 Templated Preconditioning Package  Version 1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Ifpack2_RILUK_def.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
4 //
5 // Copyright 2009 NTESS and the Ifpack2 contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef IFPACK2_CRSRILUK_DEF_HPP
11 #define IFPACK2_CRSRILUK_DEF_HPP
12 
13 #include "Ifpack2_RILUK_decl.hpp"
14 #include "Ifpack2_LocalFilter.hpp"
15 #include "Tpetra_CrsMatrix.hpp"
16 #include "Teuchos_StandardParameterEntryValidators.hpp"
17 #include "Ifpack2_LocalSparseTriangularSolver.hpp"
18 #include "Ifpack2_Details_getParamTryingTypes.hpp"
19 #include "Ifpack2_Details_getCrsMatrix.hpp"
20 #include "Kokkos_Sort.hpp"
21 #include "KokkosSparse_Utils.hpp"
22 #include "KokkosKernels_Sorting.hpp"
23 #include "KokkosSparse_IOUtils.hpp"
24 
25 namespace Ifpack2 {
26 
27 namespace Details {
28 struct IlukImplType {
29  enum Enum {
30  Serial,
31  KSPILUK
32  };
33 
34  static void loadPLTypeOption (Teuchos::Array<std::string>& type_strs, Teuchos::Array<Enum>& type_enums) {
35  type_strs.resize(2);
36  type_strs[0] = "Serial";
37  type_strs[1] = "KSPILUK";
38  type_enums.resize(2);
39  type_enums[0] = Serial;
40  type_enums[1] = KSPILUK;
41  }
42 };
43 }
44 
45 template<class MatrixType>
47  : A_ (Matrix_in),
48  LevelOfFill_ (0),
49  Overalloc_ (2.),
50  isAllocated_ (false),
51  isInitialized_ (false),
52  isComputed_ (false),
53  numInitialize_ (0),
54  numCompute_ (0),
55  numApply_ (0),
56  initializeTime_ (0.0),
57  computeTime_ (0.0),
58  applyTime_ (0.0),
59  RelaxValue_ (Teuchos::ScalarTraits<magnitude_type>::zero ()),
60  Athresh_ (Teuchos::ScalarTraits<magnitude_type>::zero ()),
61  Rthresh_ (Teuchos::ScalarTraits<magnitude_type>::one ()),
62  isKokkosKernelsSpiluk_(false),
63  isKokkosKernelsStream_(false),
64  num_streams_(0),
65  hasStreamReordered_(false)
66 {
67  allocateSolvers();
68 }
69 
70 
71 template<class MatrixType>
73  : A_ (Matrix_in),
74  LevelOfFill_ (0),
75  Overalloc_ (2.),
76  isAllocated_ (false),
77  isInitialized_ (false),
78  isComputed_ (false),
79  numInitialize_ (0),
80  numCompute_ (0),
81  numApply_ (0),
82  initializeTime_ (0.0),
83  computeTime_ (0.0),
84  applyTime_ (0.0),
85  RelaxValue_ (Teuchos::ScalarTraits<magnitude_type>::zero ()),
86  Athresh_ (Teuchos::ScalarTraits<magnitude_type>::zero ()),
87  Rthresh_ (Teuchos::ScalarTraits<magnitude_type>::one ()),
88  isKokkosKernelsSpiluk_(false),
89  isKokkosKernelsStream_(false),
90  num_streams_(0),
91  hasStreamReordered_(false)
92 {
93  allocateSolvers();
94 }
95 
96 
97 template<class MatrixType>
99 {
100  if (!isKokkosKernelsStream_) {
101  if (Teuchos::nonnull (KernelHandle_)) {
102  KernelHandle_->destroy_spiluk_handle();
103  }
104  }
105  else {
106  for (int i = 0; i < num_streams_; i++) {
107  if (Teuchos::nonnull (KernelHandle_v_[i])) {
108  KernelHandle_v_[i]->destroy_spiluk_handle();
109  }
110  }
111  }
112 }
113 
114 template<class MatrixType>
116 {
118  L_solver_->setObjectLabel("lower");
120  U_solver_->setObjectLabel("upper");
121 }
122 
123 template<class MatrixType>
124 void
126 {
127  // It's legal for A to be null; in that case, you may not call
128  // initialize() until calling setMatrix() with a nonnull input.
129  // Regardless, setting the matrix invalidates any previous
130  // factorization.
131  if (A.getRawPtr () != A_.getRawPtr ()) {
132  isAllocated_ = false;
133  isInitialized_ = false;
134  isComputed_ = false;
135  A_local_ = Teuchos::null;
136  Graph_ = Teuchos::null;
137 
138  // The sparse triangular solvers get a triangular factor as their
139  // input matrix. The triangular factors L_ and U_ are getting
140  // reset, so we reset the solvers' matrices to null. Do that
141  // before setting L_ and U_ to null, so that latter step actually
142  // frees the factors.
143  if (! L_solver_.is_null ()) {
144  L_solver_->setMatrix (Teuchos::null);
145  }
146  if (! U_solver_.is_null ()) {
147  U_solver_->setMatrix (Teuchos::null);
148  }
149 
150  L_ = Teuchos::null;
151  U_ = Teuchos::null;
152  D_ = Teuchos::null;
153  A_ = A;
154  }
155 }
156 
157 
158 
159 template<class MatrixType>
162 {
164  L_.is_null (), std::runtime_error, "Ifpack2::RILUK::getL: The L factor "
165  "is null. Please call initialize() (and preferably also compute()) "
166  "before calling this method. If the input matrix has not yet been set, "
167  "you must first call setMatrix() with a nonnull input matrix before you "
168  "may call initialize() or compute().");
169  return *L_;
170 }
171 
172 
173 template<class MatrixType>
174 const Tpetra::Vector<typename RILUK<MatrixType>::scalar_type,
179 {
181  D_.is_null (), std::runtime_error, "Ifpack2::RILUK::getD: The D factor "
182  "(of diagonal entries) is null. Please call initialize() (and "
183  "preferably also compute()) before calling this method. If the input "
184  "matrix has not yet been set, you must first call setMatrix() with a "
185  "nonnull input matrix before you may call initialize() or compute().");
186  return *D_;
187 }
188 
189 
190 template<class MatrixType>
193 {
195  U_.is_null (), std::runtime_error, "Ifpack2::RILUK::getU: The U factor "
196  "is null. Please call initialize() (and preferably also compute()) "
197  "before calling this method. If the input matrix has not yet been set, "
198  "you must first call setMatrix() with a nonnull input matrix before you "
199  "may call initialize() or compute().");
200  return *U_;
201 }
202 
203 
204 template<class MatrixType>
207  A_.is_null (), std::runtime_error, "Ifpack2::RILUK::getNodeSmootherComplexity: "
208  "The input matrix A is null. Please call setMatrix() with a nonnull "
209  "input matrix, then call compute(), before calling this method.");
210  // RILUK methods cost roughly one apply + the nnz in the upper+lower triangles
211  if(!L_.is_null() && !U_.is_null())
212  return A_->getLocalNumEntries() + L_->getLocalNumEntries() + U_->getLocalNumEntries();
213  else
214  return 0;
215 }
216 
217 
218 template<class MatrixType>
221  typename RILUK<MatrixType>::node_type> >
224  A_.is_null (), std::runtime_error, "Ifpack2::RILUK::getDomainMap: "
225  "The matrix is null. Please call setMatrix() with a nonnull input "
226  "before calling this method.");
227 
228  // FIXME (mfh 25 Jan 2014) Shouldn't this just come from A_?
230  Graph_.is_null (), std::runtime_error, "Ifpack2::RILUK::getDomainMap: "
231  "The computed graph is null. Please call initialize() before calling "
232  "this method.");
233  return Graph_->getL_Graph ()->getDomainMap ();
234 }
235 
236 
237 template<class MatrixType>
240  typename RILUK<MatrixType>::node_type> >
243  A_.is_null (), std::runtime_error, "Ifpack2::RILUK::getRangeMap: "
244  "The matrix is null. Please call setMatrix() with a nonnull input "
245  "before calling this method.");
246 
247  // FIXME (mfh 25 Jan 2014) Shouldn't this just come from A_?
249  Graph_.is_null (), std::runtime_error, "Ifpack2::RILUK::getRangeMap: "
250  "The computed graph is null. Please call initialize() before calling "
251  "this method.");
252  return Graph_->getL_Graph ()->getRangeMap ();
253 }
254 
255 
256 template<class MatrixType>
258 {
259  using Teuchos::null;
260  using Teuchos::rcp;
261 
262  if (! isAllocated_) {
263  if (!isKokkosKernelsStream_) {
264  // Deallocate any existing storage. This avoids storing 2x
265  // memory, since RCP op= does not deallocate until after the
266  // assignment.
267  L_ = null;
268  U_ = null;
269  D_ = null;
270 
271  // Allocate Matrix using ILUK graphs
272  L_ = rcp (new crs_matrix_type (Graph_->getL_Graph ()));
273  U_ = rcp (new crs_matrix_type (Graph_->getU_Graph ()));
274  L_->setAllToScalar (STS::zero ()); // Zero out L and U matrices
275  U_->setAllToScalar (STS::zero ());
276 
277  // FIXME (mfh 24 Jan 2014) This assumes domain == range Map for L and U.
278  L_->fillComplete ();
279  U_->fillComplete ();
280  D_ = rcp (new vec_type (Graph_->getL_Graph ()->getRowMap ()));
281  }
282  else {
283  L_v_ = std::vector< Teuchos::RCP<crs_matrix_type> >(num_streams_);
284  U_v_ = std::vector< Teuchos::RCP<crs_matrix_type> >(num_streams_);
285  for (int i = 0; i < num_streams_; i++) {
286  L_v_[i] = null;
287  U_v_[i] = null;
288 
289  L_v_[i] = rcp (new crs_matrix_type (Graph_v_[i]->getL_Graph ()));
290  U_v_[i] = rcp (new crs_matrix_type (Graph_v_[i]->getU_Graph ()));
291  L_v_[i]->setAllToScalar (STS::zero ()); // Zero out L and U matrices
292  U_v_[i]->setAllToScalar (STS::zero ());
293 
294  L_v_[i]->fillComplete ();
295  U_v_[i]->fillComplete ();
296  }
297  }
298  }
299  isAllocated_ = true;
300 }
301 
302 
303 template<class MatrixType>
304 void
307 {
308  using Teuchos::RCP;
310  using Teuchos::Array;
311  using Details::getParamTryingTypes;
312  const char prefix[] = "Ifpack2::RILUK: ";
313 
314  // Default values of the various parameters.
315  int fillLevel = 0;
316  magnitude_type absThresh = STM::zero ();
317  magnitude_type relThresh = STM::one ();
318  magnitude_type relaxValue = STM::zero ();
319  double overalloc = 2.;
320  int nstreams = 0;
321 
322  // "fact: iluk level-of-fill" parsing is more complicated, because
323  // we want to allow as many types as make sense. int is the native
324  // type, but we also want to accept double (for backwards
325  // compatibilty with ILUT). You can't cast arbitrary magnitude_type
326  // (e.g., Sacado::MP::Vector) to int, so we use float instead, to
327  // get coverage of the most common magnitude_type cases. Weirdly,
328  // there's an Ifpack2 test that sets the fill level as a
329  // global_ordinal_type.
330  {
331  const std::string paramName ("fact: iluk level-of-fill");
332  getParamTryingTypes<int, int, global_ordinal_type, double, float>
333  (fillLevel, params, paramName, prefix);
334  }
335  // For the other parameters, we prefer magnitude_type, but allow
336  // double for backwards compatibility.
337  {
338  const std::string paramName ("fact: absolute threshold");
339  getParamTryingTypes<magnitude_type, magnitude_type, double>
340  (absThresh, params, paramName, prefix);
341  }
342  {
343  const std::string paramName ("fact: relative threshold");
344  getParamTryingTypes<magnitude_type, magnitude_type, double>
345  (relThresh, params, paramName, prefix);
346  }
347  {
348  const std::string paramName ("fact: relax value");
349  getParamTryingTypes<magnitude_type, magnitude_type, double>
350  (relaxValue, params, paramName, prefix);
351  }
352  {
353  const std::string paramName ("fact: iluk overalloc");
354  getParamTryingTypes<double, double>
355  (overalloc, params, paramName, prefix);
356  }
357 
358  // Parsing implementation type
359  Details::IlukImplType::Enum ilukimplType = Details::IlukImplType::Serial;
360  do {
361  static const char typeName[] = "fact: type";
362 
363  if ( ! params.isType<std::string>(typeName)) break;
364 
365  // Map std::string <-> IlukImplType::Enum.
366  Array<std::string> ilukimplTypeStrs;
367  Array<Details::IlukImplType::Enum> ilukimplTypeEnums;
368  Details::IlukImplType::loadPLTypeOption (ilukimplTypeStrs, ilukimplTypeEnums);
370  s2i(ilukimplTypeStrs (), ilukimplTypeEnums (), typeName, false);
371 
372  ilukimplType = s2i.getIntegralValue(params.get<std::string>(typeName));
373  } while (0);
374 
375  if (ilukimplType == Details::IlukImplType::KSPILUK) {
376  this->isKokkosKernelsSpiluk_ = true;
377  }
378  else {
379  this->isKokkosKernelsSpiluk_ = false;
380  }
381 
382  {
383  const std::string paramName ("fact: kspiluk number-of-streams");
384  getParamTryingTypes<int, int, global_ordinal_type>
385  (nstreams, params, paramName, prefix);
386  }
387 
388  // Forward to trisolvers.
389  L_solver_->setParameters(params);
390  U_solver_->setParameters(params);
391 
392  // "Commit" the values only after validating all of them. This
393  // ensures that there are no side effects if this routine throws an
394  // exception.
395 
396  LevelOfFill_ = fillLevel;
397  Overalloc_ = overalloc;
398 #ifdef KOKKOS_ENABLE_OPENMP
399  if constexpr (std::is_same_v<execution_space, Kokkos::OpenMP>) {
400  nstreams = std::min(nstreams, execution_space{}.concurrency());
401  }
402 #endif
403  num_streams_ = nstreams;
404 
405  if (num_streams_ >= 1) {
406  this->isKokkosKernelsStream_ = true;
407  // Will we do reordering in streams?
408  if (params.isParameter("fact: kspiluk reordering in streams"))
409  hasStreamReordered_ = params.get<bool> ("fact: kspiluk reordering in streams");
410  }
411  else {
412  this->isKokkosKernelsStream_ = false;
413  }
414 
415  // mfh 28 Nov 2012: The previous code would not assign Athresh_,
416  // Rthresh_, or RelaxValue_, if the read-in value was -1. I don't
417  // know if keeping this behavior is correct, but I'll keep it just
418  // so as not to change previous behavior.
419 
420  if (absThresh != -STM::one ()) {
421  Athresh_ = absThresh;
422  }
423  if (relThresh != -STM::one ()) {
424  Rthresh_ = relThresh;
425  }
426  if (relaxValue != -STM::one ()) {
427  RelaxValue_ = relaxValue;
428  }
429 }
430 
431 
432 template<class MatrixType>
435  return Teuchos::rcp_implicit_cast<const row_matrix_type> (A_);
436 }
437 
438 
439 template<class MatrixType>
442  return Teuchos::rcp_dynamic_cast<const crs_matrix_type> (A_, true);
443 }
444 
445 
446 template<class MatrixType>
449 {
450  using Teuchos::RCP;
451  using Teuchos::rcp;
452  using Teuchos::rcp_dynamic_cast;
453  using Teuchos::rcp_implicit_cast;
454 
455  // If A_'s communicator only has one process, or if its column and
456  // row Maps are the same, then it is already local, so use it
457  // directly.
458  if (A->getRowMap ()->getComm ()->getSize () == 1 ||
459  A->getRowMap ()->isSameAs (* (A->getColMap ()))) {
460  return A;
461  }
462 
463  // If A_ is already a LocalFilter, then use it directly. This
464  // should be the case if RILUK is being used through
465  // AdditiveSchwarz, for example.
466  RCP<const LocalFilter<row_matrix_type> > A_lf_r =
467  rcp_dynamic_cast<const LocalFilter<row_matrix_type> > (A);
468  if (! A_lf_r.is_null ()) {
469  return rcp_implicit_cast<const row_matrix_type> (A_lf_r);
470  }
471  else {
472  // A_'s communicator has more than one process, its row Map and
473  // its column Map differ, and A_ is not a LocalFilter. Thus, we
474  // have to wrap it in a LocalFilter.
475  return rcp (new LocalFilter<row_matrix_type> (A));
476  }
477 }
478 
479 
480 template<class MatrixType>
482 {
483  using Teuchos::RCP;
484  using Teuchos::rcp;
485  using Teuchos::rcp_const_cast;
486  using Teuchos::rcp_dynamic_cast;
487  using Teuchos::rcp_implicit_cast;
488  using Teuchos::Array;
489  using Teuchos::ArrayView;
490  typedef Tpetra::CrsGraph<local_ordinal_type,
492  node_type> crs_graph_type;
493  typedef Tpetra::Map<local_ordinal_type,
494  global_ordinal_type,
495  node_type> crs_map_type;
496  const char prefix[] = "Ifpack2::RILUK::initialize: ";
497 
499  (A_.is_null (), std::runtime_error, prefix << "The matrix is null. Please "
500  "call setMatrix() with a nonnull input before calling this method.");
502  (! A_->isFillComplete (), std::runtime_error, prefix << "The matrix is not "
503  "fill complete. You may not invoke initialize() or compute() with this "
504  "matrix until the matrix is fill complete. If your matrix is a "
505  "Tpetra::CrsMatrix, please call fillComplete on it (with the domain and "
506  "range Maps, if appropriate) before calling this method.");
507 
508  Teuchos::Time timer ("RILUK::initialize");
509  double startTime = timer.wallTime();
510  { // Start timing
511  Teuchos::TimeMonitor timeMon (timer);
512 
513  // Calling initialize() means that the user asserts that the graph
514  // of the sparse matrix may have changed. We must not just reuse
515  // the previous graph in that case.
516  //
517  // Regarding setting isAllocated_ to false: Eventually, we may want
518  // some kind of clever memory reuse strategy, but it's always
519  // correct just to blow everything away and start over.
520  isInitialized_ = false;
521  isAllocated_ = false;
522  isComputed_ = false;
523  Graph_ = Teuchos::null;
524  Y_tmp_ = nullptr;
525  reordered_x_ = nullptr;
526  reordered_y_ = nullptr;
527 
528  if (isKokkosKernelsStream_) {
529  Graph_v_ = std::vector< Teuchos::RCP<iluk_graph_type> >(num_streams_);
530  A_local_diagblks = std::vector<local_matrix_device_type>(num_streams_);
531  for (int i = 0; i < num_streams_; i++) {
532  Graph_v_[i] = Teuchos::null;
533  }
534  }
535 
536  A_local_ = makeLocalFilter (A_);
537 
539  A_local_.is_null (), std::logic_error, "Ifpack2::RILUK::initialize: "
540  "makeLocalFilter returned null; it failed to compute A_local. "
541  "Please report this bug to the Ifpack2 developers.");
542 
543  // FIXME (mfh 24 Jan 2014, 26 Mar 2014) It would be more efficient
544  // to rewrite RILUK so that it works with any RowMatrix input, not
545  // just CrsMatrix. (That would require rewriting IlukGraph to
546  // handle a Tpetra::RowGraph.) However, to make it work for now,
547  // we just copy the input matrix if it's not a CrsMatrix.
548 
549  {
550  A_local_crs_ = Details::getCrsMatrix(A_local_);
551  if(A_local_crs_.is_null()) {
552  local_ordinal_type numRows = A_local_->getLocalNumRows();
553  Array<size_t> entriesPerRow(numRows);
554  for(local_ordinal_type i = 0; i < numRows; i++) {
555  entriesPerRow[i] = A_local_->getNumEntriesInLocalRow(i);
556  }
557  A_local_crs_nc_ =
558  rcp (new crs_matrix_type (A_local_->getRowMap (),
559  A_local_->getColMap (),
560  entriesPerRow()));
561  // copy entries into A_local_crs
562  nonconst_local_inds_host_view_type indices("indices",A_local_->getLocalMaxNumRowEntries());
563  nonconst_values_host_view_type values("values",A_local_->getLocalMaxNumRowEntries());
564  for(local_ordinal_type i = 0; i < numRows; i++) {
565  size_t numEntries = 0;
566  A_local_->getLocalRowCopy(i, indices, values, numEntries);
567  A_local_crs_nc_->insertLocalValues(i, numEntries, reinterpret_cast<scalar_type*>(values.data()), indices.data());
568  }
569  A_local_crs_nc_->fillComplete (A_local_->getDomainMap (), A_local_->getRangeMap ());
570  A_local_crs_ = rcp_const_cast<const crs_matrix_type> (A_local_crs_nc_);
571  }
572  if (!isKokkosKernelsStream_) {
573  Graph_ = rcp (new Ifpack2::IlukGraph<crs_graph_type,kk_handle_type> (A_local_crs_->getCrsGraph (),
574  LevelOfFill_, 0, Overalloc_));
575  }
576  else {
577  std::vector<int> weights(num_streams_);
578  std::fill(weights.begin(), weights.end(), 1);
579  exec_space_instances_ = Kokkos::Experimental::partition_space(execution_space(), weights);
580 
581  auto lclMtx = A_local_crs_->getLocalMatrixDevice();
582  if (!hasStreamReordered_) {
583  KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
584  } else {
585  perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true);
586  reverse_perm_v_.resize(perm_v_.size());
587  for(size_t istream=0; istream < perm_v_.size(); ++istream) {
588  using perm_type = typename lno_nonzero_view_t::non_const_type;
589  const auto perm = perm_v_[istream];
590  const auto perm_length = perm.extent(0);
591  perm_type reverse_perm(
592  Kokkos::view_alloc(Kokkos::WithoutInitializing, "reverse_perm"),
593  perm_length);
594  Kokkos::parallel_for(Kokkos::RangePolicy<execution_space>(exec_space_instances_[istream], 0, perm_length),
595  KOKKOS_LAMBDA(const local_ordinal_type ii) {
596  reverse_perm(perm(ii)) = ii;
597  });
598  reverse_perm_v_[istream] = reverse_perm;
599  }
600  }
601 
602  A_local_diagblks_rowmap_v_ = std::vector<lno_row_view_t>(num_streams_);
603  A_local_diagblks_entries_v_ = std::vector<lno_nonzero_view_t>(num_streams_);
604  A_local_diagblks_values_v_ = std::vector<scalar_nonzero_view_t>(num_streams_);
605 
606  for(int i = 0; i < num_streams_; i++) {
607  A_local_diagblks_rowmap_v_[i] = A_local_diagblks[i].graph.row_map;
608  A_local_diagblks_entries_v_[i] = A_local_diagblks[i].graph.entries;
609  A_local_diagblks_values_v_[i] = A_local_diagblks[i].values;
610 
611  Teuchos::RCP<const crs_map_type> A_local_diagblks_RowMap = rcp (new crs_map_type(A_local_diagblks[i].numRows(),
612  A_local_diagblks[i].numRows(),
613  A_local_crs_->getRowMap()->getComm()));
614  Teuchos::RCP<const crs_map_type> A_local_diagblks_ColMap = rcp (new crs_map_type(A_local_diagblks[i].numCols(),
615  A_local_diagblks[i].numCols(),
616  A_local_crs_->getColMap()->getComm()));
617  Teuchos::RCP<crs_matrix_type> A_local_diagblks_ = rcp (new crs_matrix_type(A_local_diagblks_RowMap,
618  A_local_diagblks_ColMap,
619  A_local_diagblks[i]));
620  Graph_v_[i] = rcp (new Ifpack2::IlukGraph<crs_graph_type,kk_handle_type> (A_local_diagblks_->getCrsGraph(),
621  LevelOfFill_, 0, Overalloc_));
622  }
623  }
624  }
625 
626  if (this->isKokkosKernelsSpiluk_) {
627  if (!isKokkosKernelsStream_) {
628  this->KernelHandle_ = Teuchos::rcp (new kk_handle_type ());
629  KernelHandle_->create_spiluk_handle( KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,
630  A_local_->getLocalNumRows(),
631  2*A_local_->getLocalNumEntries()*(LevelOfFill_+1),
632  2*A_local_->getLocalNumEntries()*(LevelOfFill_+1) );
633  Graph_->initialize (KernelHandle_); // this calls spiluk_symbolic
634  }
635  else {
636  KernelHandle_v_ = std::vector< Teuchos::RCP<kk_handle_type> >(num_streams_);
637  for (int i = 0; i < num_streams_; i++) {
638  KernelHandle_v_[i] = Teuchos::rcp (new kk_handle_type ());
639  KernelHandle_v_[i]->create_spiluk_handle( KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,
640  A_local_diagblks[i].numRows(),
641  2*A_local_diagblks[i].nnz()*(LevelOfFill_+1),
642  2*A_local_diagblks[i].nnz()*(LevelOfFill_+1) );
643  Graph_v_[i]->initialize (KernelHandle_v_[i]); // this calls spiluk_symbolic
644  }
645  }
646  }
647  else {
648  Graph_->initialize ();
649  }
650 
651  allocate_L_and_U ();
652  checkOrderingConsistency (*A_local_);
653  if (!isKokkosKernelsStream_) {
654  L_solver_->setMatrix (L_);
655  }
656  else {
657  L_solver_->setStreamInfo (isKokkosKernelsStream_, num_streams_, exec_space_instances_);
658  L_solver_->setMatrices (L_v_);
659  }
660  L_solver_->initialize ();
661 
662  if (!isKokkosKernelsStream_) {
663  U_solver_->setMatrix (U_);
664  }
665  else {
666  U_solver_->setStreamInfo (isKokkosKernelsStream_, num_streams_, exec_space_instances_);
667  U_solver_->setMatrices (U_v_);
668  }
669  U_solver_->initialize ();
670 
671  // Do not call initAllValues. compute() always calls initAllValues to
672  // fill L and U with possibly new numbers. initialize() is concerned
673  // only with the nonzero pattern.
674  } // Stop timing
675 
676  isInitialized_ = true;
677  ++numInitialize_;
678  initializeTime_ += (timer.wallTime() - startTime);
679 }
680 
681 template<class MatrixType>
682 void
684 checkOrderingConsistency (const row_matrix_type& A)
685 {
686  // First check that the local row map ordering is the same as the local portion of the column map.
687  // The extraction of the strictly lower/upper parts of A, as well as the factorization,
688  // implicitly assume that this is the case.
689  Teuchos::ArrayView<const global_ordinal_type> rowGIDs = A.getRowMap()->getLocalElementList();
690  Teuchos::ArrayView<const global_ordinal_type> colGIDs = A.getColMap()->getLocalElementList();
691  bool gidsAreConsistentlyOrdered=true;
692  global_ordinal_type indexOfInconsistentGID=0;
693  for (global_ordinal_type i=0; i<rowGIDs.size(); ++i) {
694  if (rowGIDs[i] != colGIDs[i]) {
695  gidsAreConsistentlyOrdered=false;
696  indexOfInconsistentGID=i;
697  break;
698  }
699  }
700  TEUCHOS_TEST_FOR_EXCEPTION(gidsAreConsistentlyOrdered==false, std::runtime_error,
701  "The ordering of the local GIDs in the row and column maps is not the same"
702  << std::endl << "at index " << indexOfInconsistentGID
703  << ". Consistency is required, as all calculations are done with"
704  << std::endl << "local indexing.");
705 }
706 
707 template<class MatrixType>
708 void
709 RILUK<MatrixType>::
710 initAllValues (const row_matrix_type& A)
711 {
712  using Teuchos::ArrayRCP;
713  using Teuchos::Comm;
714  using Teuchos::ptr;
715  using Teuchos::RCP;
716  using Teuchos::REDUCE_SUM;
717  using Teuchos::reduceAll;
718  typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> map_type;
719 
720  size_t NumIn = 0, NumL = 0, NumU = 0;
721  bool DiagFound = false;
722  size_t NumNonzeroDiags = 0;
723  size_t MaxNumEntries = A.getGlobalMaxNumRowEntries();
724 
725  // Allocate temporary space for extracting the strictly
726  // lower and upper parts of the matrix A.
727  nonconst_local_inds_host_view_type InI("InI",MaxNumEntries);
728  Teuchos::Array<local_ordinal_type> LI(MaxNumEntries);
729  Teuchos::Array<local_ordinal_type> UI(MaxNumEntries);
730  nonconst_values_host_view_type InV("InV",MaxNumEntries);
731  Teuchos::Array<scalar_type> LV(MaxNumEntries);
732  Teuchos::Array<scalar_type> UV(MaxNumEntries);
733 
734  // Check if values should be inserted or replaced
735  const bool ReplaceValues = L_->isStaticGraph () || L_->isLocallyIndexed ();
736 
737  L_->resumeFill ();
738  U_->resumeFill ();
739  if (ReplaceValues) {
740  L_->setAllToScalar (STS::zero ()); // Zero out L and U matrices
741  U_->setAllToScalar (STS::zero ());
742  }
743 
744  D_->putScalar (STS::zero ()); // Set diagonal values to zero
745  auto DV = Kokkos::subview(D_->getLocalViewHost(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0);
746 
747  RCP<const map_type> rowMap = L_->getRowMap ();
748 
749  // First we copy the user's matrix into L and U, regardless of fill level.
750  // It is important to note that L and U are populated using local indices.
751  // This means that if the row map GIDs are not monotonically increasing
752  // (i.e., permuted or gappy), then the strictly lower (upper) part of the
753  // matrix is not the one that you would get if you based L (U) on GIDs.
754  // This is ok, as the *order* of the GIDs in the rowmap is a better
755  // expression of the user's intent than the GIDs themselves.
756 
757  Teuchos::ArrayView<const global_ordinal_type> nodeGIDs = rowMap->getLocalElementList();
758  for (size_t myRow=0; myRow<A.getLocalNumRows(); ++myRow) {
759  local_ordinal_type local_row = myRow;
760 
761  //TODO JJH 4April2014 An optimization is to use getLocalRowView. Not all matrices support this,
762  // we'd need to check via the Tpetra::RowMatrix method supportsRowViews().
763  A.getLocalRowCopy (local_row, InI, InV, NumIn); // Get Values and Indices
764 
765  // Split into L and U (we don't assume that indices are ordered).
766 
767  NumL = 0;
768  NumU = 0;
769  DiagFound = false;
770 
771  for (size_t j = 0; j < NumIn; ++j) {
772  const local_ordinal_type k = InI[j];
773 
774  if (k == local_row) {
775  DiagFound = true;
776  // Store perturbed diagonal in Tpetra::Vector D_
777  DV(local_row) += Rthresh_ * InV[j] + IFPACK2_SGN(InV[j]) * Athresh_;
778  }
779  else if (k < 0) { // Out of range
781  true, std::runtime_error, "Ifpack2::RILUK::initAllValues: current "
782  "GID k = " << k << " < 0. I'm not sure why this is an error; it is "
783  "probably an artifact of the undocumented assumptions of the "
784  "original implementation (likely copied and pasted from Ifpack). "
785  "Nevertheless, the code I found here insisted on this being an error "
786  "state, so I will throw an exception here.");
787  }
788  else if (k < local_row) {
789  LI[NumL] = k;
790  LV[NumL] = InV[j];
791  NumL++;
792  }
793  else if (Teuchos::as<size_t>(k) <= rowMap->getLocalNumElements()) {
794  UI[NumU] = k;
795  UV[NumU] = InV[j];
796  NumU++;
797  }
798  }
799 
800  // Check in things for this row of L and U
801 
802  if (DiagFound) {
803  ++NumNonzeroDiags;
804  } else {
805  DV(local_row) = Athresh_;
806  }
807 
808  if (NumL) {
809  if (ReplaceValues) {
810  L_->replaceLocalValues(local_row, LI(0, NumL), LV(0,NumL));
811  } else {
812  //FIXME JJH 24April2014 Is this correct? I believe this case is when there aren't already values
813  //FIXME in this row in the column locations corresponding to UI.
814  L_->insertLocalValues(local_row, LI(0,NumL), LV(0,NumL));
815  }
816  }
817 
818  if (NumU) {
819  if (ReplaceValues) {
820  U_->replaceLocalValues(local_row, UI(0,NumU), UV(0,NumU));
821  } else {
822  //FIXME JJH 24April2014 Is this correct? I believe this case is when there aren't already values
823  //FIXME in this row in the column locations corresponding to UI.
824  U_->insertLocalValues(local_row, UI(0,NumU), UV(0,NumU));
825  }
826  }
827  }
828 
829  // At this point L and U have the values of A in the structure of L
830  // and U, and diagonal vector D
831 
832  isInitialized_ = true;
833 }
834 
835 template<class MatrixType>
836 void RILUK<MatrixType>::compute_serial ()
837 {
838  // Fill L and U with numbers. This supports nonzero pattern reuse by calling
839  // initialize() once and then compute() multiple times.
840  initAllValues (*A_local_);
841 
842  // MinMachNum should be officially defined, for now pick something a little
843  // bigger than IEEE underflow value
844 
845  const scalar_type MinDiagonalValue = STS::rmin ();
846  const scalar_type MaxDiagonalValue = STS::one () / MinDiagonalValue;
847 
848  size_t NumIn, NumL, NumU;
849 
850  // Get Maximum Row length
851  const size_t MaxNumEntries =
852  L_->getLocalMaxNumRowEntries () + U_->getLocalMaxNumRowEntries () + 1;
853 
854  Teuchos::Array<local_ordinal_type> InI(MaxNumEntries); // Allocate temp space
855  Teuchos::Array<scalar_type> InV(MaxNumEntries);
856  size_t num_cols = U_->getColMap()->getLocalNumElements();
857  Teuchos::Array<int> colflag(num_cols, -1);
858 
859  auto DV = Kokkos::subview(D_->getLocalViewHost(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0);
860 
861  // Now start the factorization.
862 
863  using IST = typename row_matrix_type::impl_scalar_type;
864  for (size_t i = 0; i < L_->getLocalNumRows (); ++i) {
865  local_ordinal_type local_row = i;
866  // Need some integer workspace and pointers
867  size_t NumUU;
868  local_inds_host_view_type UUI;
869  values_host_view_type UUV;
870 
871  // Fill InV, InI with current row of L, D and U combined
872 
873  NumIn = MaxNumEntries;
874  nonconst_local_inds_host_view_type InI_v(InI.data(),MaxNumEntries);
875  nonconst_values_host_view_type InV_v(reinterpret_cast<IST*>(InV.data()),MaxNumEntries);
876 
877  L_->getLocalRowCopy (local_row, InI_v , InV_v, NumL);
878 
879  InV[NumL] = DV(i); // Put in diagonal
880  InI[NumL] = local_row;
881 
882  nonconst_local_inds_host_view_type InI_sub(InI.data()+NumL+1,MaxNumEntries-NumL-1);
883  nonconst_values_host_view_type InV_sub(reinterpret_cast<IST*>(InV.data())+NumL+1,MaxNumEntries-NumL-1);
884 
885  U_->getLocalRowCopy (local_row, InI_sub,InV_sub, NumU);
886  NumIn = NumL+NumU+1;
887 
888  // Set column flags
889  for (size_t j = 0; j < NumIn; ++j) {
890  colflag[InI[j]] = j;
891  }
892 
893  scalar_type diagmod = STS::zero (); // Off-diagonal accumulator
894 
895  for (size_t jj = 0; jj < NumL; ++jj) {
896  local_ordinal_type j = InI[jj];
897  IST multiplier = InV[jj]; // current_mults++;
898 
899  InV[jj] *= static_cast<scalar_type>(DV(j));
900 
901  U_->getLocalRowView(j, UUI, UUV); // View of row above
902  NumUU = UUI.size();
903 
904  if (RelaxValue_ == STM::zero ()) {
905  for (size_t k = 0; k < NumUU; ++k) {
906  const int kk = colflag[UUI[k]];
907  // FIXME (mfh 23 Dec 2013) Wait a second, we just set
908  // colflag above using size_t (which is generally unsigned),
909  // but now we're querying it using int (which is signed).
910  if (kk > -1) {
911  InV[kk] -= static_cast<scalar_type>(multiplier * UUV[k]);
912  }
913  }
914 
915  }
916  else {
917  for (size_t k = 0; k < NumUU; ++k) {
918  // FIXME (mfh 23 Dec 2013) Wait a second, we just set
919  // colflag above using size_t (which is generally unsigned),
920  // but now we're querying it using int (which is signed).
921  const int kk = colflag[UUI[k]];
922  if (kk > -1) {
923  InV[kk] -= static_cast<scalar_type>(multiplier*UUV[k]);
924  }
925  else {
926  diagmod -= static_cast<scalar_type>(multiplier*UUV[k]);
927  }
928  }
929  }
930  }
931 
932  if (NumL) {
933  // Replace current row of L
934  L_->replaceLocalValues (local_row, InI (0, NumL), InV (0, NumL));
935  }
936 
937  DV(i) = InV[NumL]; // Extract Diagonal value
938 
939  if (RelaxValue_ != STM::zero ()) {
940  DV(i) += RelaxValue_*diagmod; // Add off diagonal modifications
941  }
942 
943  if (STS::magnitude (DV(i)) > STS::magnitude (MaxDiagonalValue)) {
944  if (STS::real (DV(i)) < STM::zero ()) {
945  DV(i) = -MinDiagonalValue;
946  }
947  else {
948  DV(i) = MinDiagonalValue;
949  }
950  }
951  else {
952  DV(i) = static_cast<impl_scalar_type>(STS::one ()) / DV(i); // Invert diagonal value
953  }
954 
955  for (size_t j = 0; j < NumU; ++j) {
956  InV[NumL+1+j] *= static_cast<scalar_type>(DV(i)); // Scale U by inverse of diagonal
957  }
958 
959  if (NumU) {
960  // Replace current row of L and U
961  U_->replaceLocalValues (local_row, InI (NumL+1, NumU), InV (NumL+1, NumU));
962  }
963 
964  // Reset column flags
965  for (size_t j = 0; j < NumIn; ++j) {
966  colflag[InI[j]] = -1;
967  }
968  }
969 
970  // The domain of L and the range of U are exactly their own row maps
971  // (there is no communication). The domain of U and the range of L
972  // must be the same as those of the original matrix, However if the
973  // original matrix is a VbrMatrix, these two latter maps are
974  // translation from a block map to a point map.
975  // FIXME (mfh 23 Dec 2013) Do we know that the column Map of L_ is
976  // always one-to-one?
977  L_->fillComplete (L_->getColMap (), A_local_->getRangeMap ());
978  U_->fillComplete (A_local_->getDomainMap (), U_->getRowMap ());
979 
980  // If L_solver_ or U_solver store modified factors internally, we need to reset those
981  L_solver_->setMatrix (L_);
982  L_solver_->compute ();//NOTE: Only do compute if the pointer changed. Otherwise, do nothing
983  U_solver_->setMatrix (U_);
984  U_solver_->compute ();//NOTE: Only do compute if the pointer changed. Otherwise, do nothing
985 
986 }
987 
988 template<class MatrixType>
989 void RILUK<MatrixType>::compute_kkspiluk()
990 {
991  L_->resumeFill ();
992  U_->resumeFill ();
993 
994  L_->setAllToScalar (STS::zero ()); // Zero out L and U matrices
995  U_->setAllToScalar (STS::zero ());
996 
997  using row_map_type = typename crs_matrix_type::local_matrix_device_type::row_map_type;
998  auto lclL = L_->getLocalMatrixDevice();
999  row_map_type L_rowmap = lclL.graph.row_map;
1000  auto L_entries = lclL.graph.entries;
1001  auto L_values = lclL.values;
1002 
1003  auto lclU = U_->getLocalMatrixDevice();
1004  row_map_type U_rowmap = lclU.graph.row_map;
1005  auto U_entries = lclU.graph.entries;
1006  auto U_values = lclU.values;
1007 
1008  auto lclMtx = A_local_crs_->getLocalMatrixDevice();
1009  KokkosSparse::Experimental::spiluk_numeric( KernelHandle_.getRawPtr(), LevelOfFill_,
1010  lclMtx.graph.row_map, lclMtx.graph.entries, lclMtx.values,
1011  L_rowmap, L_entries, L_values, U_rowmap, U_entries, U_values );
1012 
1013  L_->fillComplete (L_->getColMap (), A_local_->getRangeMap ());
1014  U_->fillComplete (A_local_->getDomainMap (), U_->getRowMap ());
1015 
1016  L_solver_->compute ();
1017  U_solver_->compute ();
1018 }
1019 
1020 template<class MatrixType>
1021 void RILUK<MatrixType>::compute_kkspiluk_stream()
1022 {
1023  for(int i = 0; i < num_streams_; i++) {
1024  L_v_[i]->resumeFill ();
1025  U_v_[i]->resumeFill ();
1026 
1027  L_v_[i]->setAllToScalar (STS::zero ()); // Zero out L and U matrices
1028  U_v_[i]->setAllToScalar (STS::zero ());
1029  }
1030  std::vector<lno_row_view_t> L_rowmap_v(num_streams_);
1031  std::vector<lno_nonzero_view_t> L_entries_v(num_streams_);
1032  std::vector<scalar_nonzero_view_t> L_values_v(num_streams_);
1033  std::vector<lno_row_view_t> U_rowmap_v(num_streams_);
1034  std::vector<lno_nonzero_view_t> U_entries_v(num_streams_);
1035  std::vector<scalar_nonzero_view_t> U_values_v(num_streams_);
1036  std::vector<kk_handle_type *> KernelHandle_rawptr_v_(num_streams_);
1037  for(int i = 0; i < num_streams_; i++) {
1038  auto lclL = L_v_[i]->getLocalMatrixDevice();
1039  L_rowmap_v[i] = lclL.graph.row_map;
1040  L_entries_v[i] = lclL.graph.entries;
1041  L_values_v[i] = lclL.values;
1042 
1043  auto lclU = U_v_[i]->getLocalMatrixDevice();
1044  U_rowmap_v[i] = lclU.graph.row_map;
1045  U_entries_v[i] = lclU.graph.entries;
1046  U_values_v[i] = lclU.values;
1047  KernelHandle_rawptr_v_[i] = KernelHandle_v_[i].getRawPtr();
1048  }
1049 
1050  {
1051  auto lclMtx = A_local_crs_->getLocalMatrixDevice();
1052  // A_local_diagblks was already setup during initialize, just copy the corresponding
1053  // values from A_local_crs_ in parallel now.
1054  using TeamPolicy = Kokkos::TeamPolicy<execution_space>;
1055  const auto A_nrows = lclMtx.numRows();
1056  auto rows_per_block = ((A_nrows % num_streams_) == 0)
1057  ? (A_nrows / num_streams_)
1058  : (A_nrows / num_streams_ + 1);
1059  for(int i = 0; i < num_streams_; i++) {
1060  const auto start_row_offset = i * rows_per_block;
1061  auto rowptrs = A_local_diagblks_rowmap_v_[i];
1062  auto colindices = A_local_diagblks_entries_v_[i];
1063  auto values = A_local_diagblks_values_v_[i];
1064  const bool reordered = hasStreamReordered_;
1065  typename lno_nonzero_view_t::non_const_type reverse_perm = hasStreamReordered_ ? reverse_perm_v_[i] : typename lno_nonzero_view_t::non_const_type{};
1066  TeamPolicy pol(exec_space_instances_[i], A_local_diagblks_rowmap_v_[i].extent(0) - 1, Kokkos::AUTO);
1067  Kokkos::parallel_for(pol, KOKKOS_LAMBDA (const typename TeamPolicy::member_type &team) {
1068  const auto irow = team.league_rank();
1069  const auto irow_A = start_row_offset + (reordered ? reverse_perm(irow) : irow);
1070  const auto A_local_crs_row = lclMtx.rowConst(irow_A);
1071  const auto begin_row = rowptrs(irow);
1072  const auto num_entries = rowptrs(irow + 1) - begin_row;
1073  Kokkos::parallel_for(Kokkos::TeamThreadRange(team, num_entries), [&](const int j) {
1074  const auto colidx = colindices(begin_row + j);
1075  const auto colidx_A = start_row_offset + (reordered ? reverse_perm(colidx) : colidx);
1076  // Find colidx in A_local_crs_row
1077  const int offset = KokkosSparse::findRelOffset(
1078  &A_local_crs_row.colidx(0), A_local_crs_row.length, colidx_A, 0, false);
1079  values(begin_row + j) = A_local_crs_row.value(offset);
1080  });
1081  });
1082  }
1083  }
1084 
1085  KokkosSparse::Experimental::spiluk_numeric_streams( exec_space_instances_, KernelHandle_rawptr_v_, LevelOfFill_,
1086  A_local_diagblks_rowmap_v_, A_local_diagblks_entries_v_, A_local_diagblks_values_v_,
1087  L_rowmap_v, L_entries_v, L_values_v,
1088  U_rowmap_v, U_entries_v, U_values_v );
1089  for(int i = 0; i < num_streams_; i++) {
1090  L_v_[i]->fillComplete ();
1091  U_v_[i]->fillComplete ();
1092  }
1093 
1094  L_solver_->compute ();
1095  U_solver_->compute ();
1096 }
1097 
1098 template<class MatrixType>
1100 {
1101  using Teuchos::RCP;
1102  using Teuchos::rcp;
1103  using Teuchos::rcp_const_cast;
1104  using Teuchos::rcp_dynamic_cast;
1105  using Teuchos::Array;
1106  using Teuchos::ArrayView;
1107  const char prefix[] = "Ifpack2::RILUK::compute: ";
1108 
1109  // initialize() checks this too, but it's easier for users if the
1110  // error shows them the name of the method that they actually
1111  // called, rather than the name of some internally called method.
1113  (A_.is_null (), std::runtime_error, prefix << "The matrix is null. Please "
1114  "call setMatrix() with a nonnull input before calling this method.");
1116  (! A_->isFillComplete (), std::runtime_error, prefix << "The matrix is not "
1117  "fill complete. You may not invoke initialize() or compute() with this "
1118  "matrix until the matrix is fill complete. If your matrix is a "
1119  "Tpetra::CrsMatrix, please call fillComplete on it (with the domain and "
1120  "range Maps, if appropriate) before calling this method.");
1121 
1122  if (! isInitialized ()) {
1123  initialize (); // Don't count this in the compute() time
1124  }
1125 
1126  Teuchos::Time timer ("RILUK::compute");
1127 
1128  // Start timing
1129  Teuchos::TimeMonitor timeMon (timer);
1130  double startTime = timer.wallTime();
1131 
1132  isComputed_ = false;
1133 
1134  if (!this->isKokkosKernelsSpiluk_) {
1135  compute_serial();
1136  }
1137  else {
1138  //Make sure values in A is picked up even in case of pattern reuse
1139  if(!A_local_crs_nc_.is_null()) {
1140  A_local_crs_nc_->resumeFill();
1141  local_ordinal_type numRows = A_local_->getLocalNumRows();
1142  Array<size_t> entriesPerRow(numRows);
1143  for(local_ordinal_type i = 0; i < numRows; i++) {
1144  entriesPerRow[i] = A_local_->getNumEntriesInLocalRow(i);
1145  }
1146  // copy entries into A_local_crs
1147  nonconst_local_inds_host_view_type indices("indices",A_local_->getLocalMaxNumRowEntries());
1148  nonconst_values_host_view_type values("values",A_local_->getLocalMaxNumRowEntries());
1149  for(local_ordinal_type i = 0; i < numRows; i++) {
1150  size_t numEntries = 0;
1151  A_local_->getLocalRowCopy(i, indices, values, numEntries);
1152  A_local_crs_nc_->replaceLocalValues(i, numEntries, reinterpret_cast<scalar_type*>(values.data()),indices.data());
1153  }
1154  A_local_crs_nc_->fillComplete (A_local_->getDomainMap (), A_local_->getRangeMap ());
1155  }
1156 
1157  if (!isKokkosKernelsStream_) {
1158  compute_kkspiluk();
1159  }
1160  else {
1161  compute_kkspiluk_stream();
1162  }
1163  }
1164 
1165  isComputed_ = true;
1166  ++numCompute_;
1167  computeTime_ += (timer.wallTime() - startTime);
1168 }
1169 
1170 namespace Impl {
1171 template <typename MV, typename Map>
1172 void resetMultiVecIfNeeded(std::unique_ptr<MV> &mv_ptr, const Map &map, const size_t numVectors, bool initialize)
1173 {
1174  if(!mv_ptr || mv_ptr->getNumVectors() != numVectors) {
1175  mv_ptr.reset(new MV(map, numVectors, initialize));
1176  }
1177 }
1178 }
1179 
1180 template<class MatrixType>
1181 void
1183 apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1184  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y,
1185  Teuchos::ETransp mode,
1186  scalar_type alpha,
1187  scalar_type beta) const
1188 {
1189  using Teuchos::RCP;
1190  using Teuchos::rcpFromRef;
1191 
1193  A_.is_null (), std::runtime_error, "Ifpack2::RILUK::apply: The matrix is "
1194  "null. Please call setMatrix() with a nonnull input, then initialize() "
1195  "and compute(), before calling this method.");
1197  ! isComputed (), std::runtime_error,
1198  "Ifpack2::RILUK::apply: If you have not yet called compute(), "
1199  "you must call compute() before calling this method.");
1200  TEUCHOS_TEST_FOR_EXCEPTION(
1201  X.getNumVectors () != Y.getNumVectors (), std::invalid_argument,
1202  "Ifpack2::RILUK::apply: X and Y do not have the same number of columns. "
1203  "X.getNumVectors() = " << X.getNumVectors ()
1204  << " != Y.getNumVectors() = " << Y.getNumVectors () << ".");
1205  TEUCHOS_TEST_FOR_EXCEPTION(
1206  STS::isComplex && mode == Teuchos::CONJ_TRANS, std::logic_error,
1207  "Ifpack2::RILUK::apply: mode = Teuchos::CONJ_TRANS is not implemented for "
1208  "complex Scalar type. Please talk to the Ifpack2 developers to get this "
1209  "fixed. There is a FIXME in this file about this very issue.");
1210 #ifdef HAVE_IFPACK2_DEBUG
1211  {
1212  if (!isKokkosKernelsStream_) {
1213  const magnitude_type D_nrm1 = D_->norm1 ();
1214  TEUCHOS_TEST_FOR_EXCEPTION( STM::isnaninf (D_nrm1), std::runtime_error, "Ifpack2::RILUK::apply: The 1-norm of the stored diagonal is NaN or Inf.");
1215  }
1216  Teuchos::Array<magnitude_type> norms (X.getNumVectors ());
1217  X.norm1 (norms ());
1218  bool good = true;
1219  for (size_t j = 0; j < X.getNumVectors (); ++j) {
1220  if (STM::isnaninf (norms[j])) {
1221  good = false;
1222  break;
1223  }
1224  }
1225  TEUCHOS_TEST_FOR_EXCEPTION( ! good, std::runtime_error, "Ifpack2::RILUK::apply: The 1-norm of the input X is NaN or Inf.");
1226  }
1227 #endif // HAVE_IFPACK2_DEBUG
1228 
1229  const scalar_type one = STS::one ();
1230  const scalar_type zero = STS::zero ();
1231 
1232  Teuchos::Time timer ("RILUK::apply");
1233  double startTime = timer.wallTime();
1234  { // Start timing
1235  Teuchos::TimeMonitor timeMon (timer);
1236  if (alpha == one && beta == zero) {
1237  if (isKokkosKernelsSpiluk_ && isKokkosKernelsStream_ && hasStreamReordered_) {
1238  Impl::resetMultiVecIfNeeded(reordered_x_, X.getMap(), X.getNumVectors(), false);
1239  Impl::resetMultiVecIfNeeded(reordered_y_, Y.getMap(), Y.getNumVectors(), false);
1240  Kokkos::fence();
1241  for (size_t j = 0; j < X.getNumVectors(); j++) {
1242  auto X_j = X.getVector(j);
1243  auto ReorderedX_j = reordered_x_->getVectorNonConst(j);
1244  auto X_lcl = X_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
1245  auto ReorderedX_lcl = ReorderedX_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
1246  local_ordinal_type stream_begin = 0;
1247  local_ordinal_type stream_end;
1248  for(int i = 0; i < num_streams_; i++) {
1249  auto perm_i = perm_v_[i];
1250  stream_end = stream_begin + perm_i.extent(0);
1251  auto X_lcl_sub = Kokkos::subview (X_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1252  auto ReorderedX_lcl_sub = Kokkos::subview (ReorderedX_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1253  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(exec_space_instances_[i], 0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) {
1254  ReorderedX_lcl_sub(perm_i(ii)) = X_lcl_sub(ii);
1255  });
1256  stream_begin = stream_end;
1257  }
1258  }
1259  Kokkos::fence();
1260  if (mode == Teuchos::NO_TRANS) { // Solve L (U Y) = X for Y.
1261  // Solve L Y = X for Y.
1262  L_solver_->apply (*reordered_x_, Y, mode);
1263  // Solve U Y = Y for Y.
1264  U_solver_->apply (Y, *reordered_y_, mode);
1265  }
1266  else { // Solve U^P (L^P Y) = X for Y (where P is * or T).
1267  // Solve U^P Y = X for Y.
1268  U_solver_->apply (*reordered_x_, Y, mode);
1269  // Solve L^P Y = Y for Y.
1270  L_solver_->apply (Y, *reordered_y_, mode);
1271  }
1272 
1273  for (size_t j = 0; j < Y.getNumVectors(); j++) {
1274  auto Y_j = Y.getVectorNonConst(j);
1275  auto ReorderedY_j = reordered_y_->getVector(j);
1276  auto Y_lcl = Y_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
1277  auto ReorderedY_lcl = ReorderedY_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
1278  local_ordinal_type stream_begin = 0;
1279  local_ordinal_type stream_end;
1280  for(int i = 0; i < num_streams_; i++) {
1281  auto perm_i = perm_v_[i];
1282  stream_end = stream_begin + perm_i.extent(0);
1283  auto Y_lcl_sub = Kokkos::subview (Y_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1284  auto ReorderedY_lcl_sub = Kokkos::subview (ReorderedY_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1285  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(exec_space_instances_[i], 0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) {
1286  Y_lcl_sub(ii) = ReorderedY_lcl_sub(perm_i(ii));
1287  });
1288  stream_begin = stream_end;
1289  }
1290  }
1291  Kokkos::fence();
1292  }
1293  else {
1294  if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y.
1295 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
1296  //NOTE (Nov-15-2022):
1297  //This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
1298  //since cusparseSpSV_solve() does not support in-place computation
1299  Impl::resetMultiVecIfNeeded(Y_tmp_, Y.getMap(), Y.getNumVectors(), false);
1300 
1301  // Start by solving L Y_tmp = X for Y_tmp.
1302  L_solver_->apply (X, *Y_tmp_, mode);
1303 
1304  if (!this->isKokkosKernelsSpiluk_) {
1305  // Solve D Y = Y. The operation lets us do this in place in Y, so we can
1306  // write "solve D Y = Y for Y."
1307  Y_tmp_->elementWiseMultiply (one, *D_, *Y_tmp_, zero);
1308  }
1309 
1310  U_solver_->apply (*Y_tmp_, Y, mode); // Solve U Y = Y_tmp.
1311 #else
1312  // Start by solving L Y = X for Y.
1313  L_solver_->apply (X, Y, mode);
1314 
1315  if (!this->isKokkosKernelsSpiluk_) {
1316  // Solve D Y = Y. The operation lets us do this in place in Y, so we can
1317  // write "solve D Y = Y for Y."
1318  Y.elementWiseMultiply (one, *D_, Y, zero);
1319  }
1320 
1321  U_solver_->apply (Y, Y, mode); // Solve U Y = Y.
1322 #endif
1323  }
1324  else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T).
1325 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
1326  //NOTE (Nov-15-2022):
1327  //This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
1328  //since cusparseSpSV_solve() does not support in-place computation
1329  Impl::resetMultiVecIfNeeded(Y_tmp_, Y.getMap(), Y.getNumVectors(), false);
1330 
1331  // Start by solving U^P Y_tmp = X for Y_tmp.
1332  U_solver_->apply (X, *Y_tmp_, mode);
1333 
1334  if (!this->isKokkosKernelsSpiluk_) {
1335  // Solve D^P Y = Y.
1336  //
1337  // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
1338  // need to do an elementwise multiply with the conjugate of
1339  // D_, not just with D_ itself.
1340  Y_tmp_->elementWiseMultiply (one, *D_, *Y_tmp_, zero);
1341  }
1342 
1343  L_solver_->apply (*Y_tmp_, Y, mode); // Solve L^P Y = Y_tmp.
1344 #else
1345  // Start by solving U^P Y = X for Y.
1346  U_solver_->apply (X, Y, mode);
1347 
1348  if (!this->isKokkosKernelsSpiluk_) {
1349  // Solve D^P Y = Y.
1350  //
1351  // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
1352  // need to do an elementwise multiply with the conjugate of
1353  // D_, not just with D_ itself.
1354  Y.elementWiseMultiply (one, *D_, Y, zero);
1355  }
1356 
1357  L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y.
1358 #endif
1359  }
1360  }
1361  }
1362  else { // alpha != 1 or beta != 0
1363  if (alpha == zero) {
1364  // The special case for beta == 0 ensures that if Y contains Inf
1365  // or NaN values, we replace them with 0 (following BLAS
1366  // convention), rather than multiplying them by 0 to get NaN.
1367  if (beta == zero) {
1368  Y.putScalar (zero);
1369  } else {
1370  Y.scale (beta);
1371  }
1372  } else { // alpha != zero
1373  Impl::resetMultiVecIfNeeded(Y_tmp_, Y.getMap(), Y.getNumVectors(), false);
1374  apply (X, *Y_tmp_, mode);
1375  Y.update (alpha, *Y_tmp_, beta);
1376  }
1377  }
1378  }//end timing
1379 
1380 #ifdef HAVE_IFPACK2_DEBUG
1381  {
1382  Teuchos::Array<magnitude_type> norms (Y.getNumVectors ());
1383  Y.norm1 (norms ());
1384  bool good = true;
1385  for (size_t j = 0; j < Y.getNumVectors (); ++j) {
1386  if (STM::isnaninf (norms[j])) {
1387  good = false;
1388  break;
1389  }
1390  }
1391  TEUCHOS_TEST_FOR_EXCEPTION( ! good, std::runtime_error, "Ifpack2::RILUK::apply: The 1-norm of the output Y is NaN or Inf.");
1392  }
1393 #endif // HAVE_IFPACK2_DEBUG
1394 
1395  ++numApply_;
1396  applyTime_ += (timer.wallTime() - startTime);
1397 }
1398 
1399 
1400 //VINH comment out since multiply() is not needed anywhere
1401 //template<class MatrixType>
1402 //void RILUK<MatrixType>::
1403 //multiply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1404 // Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y,
1405 // const Teuchos::ETransp mode) const
1406 //{
1407 // const scalar_type zero = STS::zero ();
1408 // const scalar_type one = STS::one ();
1409 //
1410 // if (mode != Teuchos::NO_TRANS) {
1411 // U_->apply (X, Y, mode); //
1412 // Y.update (one, X, one); // Y = Y + X (account for implicit unit diagonal)
1413 //
1414 // // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we need
1415 // // to do an elementwise multiply with the conjugate of D_, not
1416 // // just with D_ itself.
1417 // Y.elementWiseMultiply (one, *D_, Y, zero); // y = D*y (D_ has inverse of diagonal)
1418 //
1419 // MV Y_tmp (Y, Teuchos::Copy); // Need a temp copy of Y
1420 // L_->apply (Y_tmp, Y, mode);
1421 // Y.update (one, Y_tmp, one); // (account for implicit unit diagonal)
1422 // }
1423 // else {
1424 // L_->apply (X, Y, mode);
1425 // Y.update (one, X, one); // Y = Y + X (account for implicit unit diagonal)
1426 // Y.elementWiseMultiply (one, *D_, Y, zero); // y = D*y (D_ has inverse of diagonal)
1427 // MV Y_tmp (Y, Teuchos::Copy); // Need a temp copy of Y1
1428 // U_->apply (Y_tmp, Y, mode);
1429 // Y.update (one, Y_tmp, one); // (account for implicit unit diagonal)
1430 // }
1431 //}
1432 
1433 template<class MatrixType>
1435 {
1436  std::ostringstream os;
1437 
1438  // Output is a valid YAML dictionary in flow style. If you don't
1439  // like everything on a single line, you should call describe()
1440  // instead.
1441  os << "\"Ifpack2::RILUK\": {";
1442  os << "Initialized: " << (isInitialized () ? "true" : "false") << ", "
1443  << "Computed: " << (isComputed () ? "true" : "false") << ", ";
1444 
1445  os << "Level-of-fill: " << getLevelOfFill() << ", ";
1446 
1447  if(isKokkosKernelsSpiluk_) os<<"KK-SPILUK, ";
1448  if(isKokkosKernelsStream_) os<<"KK-Stream, ";
1449 
1450  if (A_.is_null ()) {
1451  os << "Matrix: null";
1452  }
1453  else {
1454  os << "Global matrix dimensions: ["
1455  << A_->getGlobalNumRows () << ", " << A_->getGlobalNumCols () << "]"
1456  << ", Global nnz: " << A_->getGlobalNumEntries();
1457  }
1458 
1459  if (! L_solver_.is_null ()) os << ", " << L_solver_->description ();
1460  if (! U_solver_.is_null ()) os << ", " << U_solver_->description ();
1461 
1462  os << "}";
1463  return os.str ();
1464 }
1465 
1466 } // namespace Ifpack2
1467 
1468 #define IFPACK2_RILUK_INSTANT(S,LO,GO,N) \
1469  template class Ifpack2::RILUK< Tpetra::RowMatrix<S, LO, GO, N> >;
1470 
1471 #endif
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getRangeMap() const
Returns the Tpetra::Map object associated with the range of this operator.
Definition: Ifpack2_RILUK_def.hpp:241
node_type::execution_space execution_space
The Kokkos execution space of the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:243
static Teuchos::RCP< const row_matrix_type > makeLocalFilter(const Teuchos::RCP< const row_matrix_type > &A)
Return A, wrapped in a LocalFilter, if necessary.
Definition: Ifpack2_RILUK_def.hpp:448
MatrixType::global_ordinal_type global_ordinal_type
The type of global indices in the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:231
T & get(const std::string &name, T def_value)
bool nonnull(const std::shared_ptr< T > &p)
const Tpetra::Vector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > & getD() const
Return the diagonal entries of the ILU factorization.
Definition: Ifpack2_RILUK_def.hpp:178
void initialize()
Initialize by computing the symbolic incomplete factorization.
Definition: Ifpack2_RILUK_def.hpp:481
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Tpetra::RowMatrix< scalar_type, local_ordinal_type, global_ordinal_type, node_type > row_matrix_type
Tpetra::RowMatrix specialization used by this class.
Definition: Ifpack2_RILUK_decl.hpp:249
size_type size() const
virtual void setMatrix(const Teuchos::RCP< const row_matrix_type > &A)
Change the matrix to be preconditioned.
Definition: Ifpack2_RILUK_def.hpp:125
std::string description() const
A one-line description of this object.
Definition: Ifpack2_RILUK_def.hpp:1434
void setParameters(const Teuchos::ParameterList &params)
Definition: Ifpack2_RILUK_def.hpp:306
ILU(k) factorization of a given Tpetra::RowMatrix.
Definition: Ifpack2_RILUK_decl.hpp:213
size_t getNodeSmootherComplexity() const
Get a rough estimate of cost per iteration.
Definition: Ifpack2_RILUK_def.hpp:205
Teuchos::RCP< const crs_matrix_type > getCrsMatrix() const
Return the input matrix A as a Tpetra::CrsMatrix, if possible; else throws.
Definition: Ifpack2_RILUK_def.hpp:441
Tpetra::CrsMatrix< scalar_type, local_ordinal_type, global_ordinal_type, node_type > crs_matrix_type
Tpetra::CrsMatrix specialization used by this class for representing L and U.
Definition: Ifpack2_RILUK_decl.hpp:252
const crs_matrix_type & getU() const
Return the U factor of the ILU factorization.
Definition: Ifpack2_RILUK_def.hpp:192
bool isParameter(const std::string &name) const
MatrixType::node_type node_type
The Node type used by the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:234
T * getRawPtr() const
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getDomainMap() const
Returns the Tpetra::Map object associated with the domain of this operator.
Definition: Ifpack2_RILUK_def.hpp:222
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
MatrixType::local_ordinal_type local_ordinal_type
The type of local indices in the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:228
virtual ~RILUK()
Destructor (declared virtual for memory safety).
Definition: Ifpack2_RILUK_def.hpp:98
&quot;Preconditioner&quot; that solves local sparse triangular systems.
Definition: Ifpack2_LocalSparseTriangularSolver_decl.hpp:46
void compute()
Compute the (numeric) incomplete factorization.
Definition: Ifpack2_RILUK_def.hpp:1099
void apply(const Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &X, Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, scalar_type alpha=Teuchos::ScalarTraits< scalar_type >::one(), scalar_type beta=Teuchos::ScalarTraits< scalar_type >::zero()) const
Apply the (inverse of the) incomplete factorization to X, resulting in Y.
Definition: Ifpack2_RILUK_def.hpp:1183
Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type
The type of the magnitude (absolute value) of a matrix entry.
Definition: Ifpack2_RILUK_decl.hpp:237
void resize(size_type new_size, const value_type &x=value_type())
Construct a level filled graph for use in computing an ILU(k) incomplete factorization.
Definition: Ifpack2_IlukGraph.hpp:67
IntegralType getIntegralValue(const std::string &str, const std::string &paramName="", const std::string &sublistName="") const
Teuchos::RCP< const row_matrix_type > getMatrix() const
Get the input matrix.
Definition: Ifpack2_RILUK_def.hpp:434
bool isType(const std::string &name) const
Declaration of MDF interface.
Access only local rows and columns of a sparse matrix.
Definition: Ifpack2_LocalFilter_decl.hpp:128
static double wallTime()
const crs_matrix_type & getL() const
Return the L factor of the ILU factorization.
Definition: Ifpack2_RILUK_def.hpp:161
std::string typeName(const T &t)
MatrixType::scalar_type scalar_type
The type of the entries of the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:225
bool is_null() const