Ifpack2 Templated Preconditioning Package  Version 1.0
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Groups Pages
Ifpack2_RILUK_def.hpp
1 /*@HEADER
2 // ***********************************************************************
3 //
4 // Ifpack2: Templated Object-Oriented Algebraic Preconditioner Package
5 // Copyright (2009) Sandia Corporation
6 //
7 // Under terms of Contract DE-AC04-94AL85000, there is a non-exclusive
8 // license for use of this work by or on behalf of the U.S. Government.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // ***********************************************************************
38 //@HEADER
39 */
40 
41 #ifndef IFPACK2_CRSRILUK_DEF_HPP
42 #define IFPACK2_CRSRILUK_DEF_HPP
43 
44 #include "Ifpack2_LocalFilter.hpp"
45 #include "Tpetra_CrsMatrix.hpp"
46 #include "Teuchos_StandardParameterEntryValidators.hpp"
47 #include "Ifpack2_LocalSparseTriangularSolver.hpp"
48 #include "Ifpack2_Details_getParamTryingTypes.hpp"
49 #include "Ifpack2_Details_getCrsMatrix.hpp"
50 #include "Kokkos_Sort.hpp"
51 #include "KokkosSparse_Utils.hpp"
52 #include "KokkosKernels_Sorting.hpp"
53 #include "KokkosSparse_IOUtils.hpp"
54 
55 namespace Ifpack2 {
56 
57 namespace Details {
58 struct IlukImplType {
59  enum Enum {
60  Serial,
61  KSPILUK
62  };
63 
64  static void loadPLTypeOption (Teuchos::Array<std::string>& type_strs, Teuchos::Array<Enum>& type_enums) {
65  type_strs.resize(2);
66  type_strs[0] = "Serial";
67  type_strs[1] = "KSPILUK";
68  type_enums.resize(2);
69  type_enums[0] = Serial;
70  type_enums[1] = KSPILUK;
71  }
72 };
73 }
74 
75 template<class MatrixType>
77  : A_ (Matrix_in),
78  LevelOfFill_ (0),
79  Overalloc_ (2.),
80  isAllocated_ (false),
81  isInitialized_ (false),
82  isComputed_ (false),
83  numInitialize_ (0),
84  numCompute_ (0),
85  numApply_ (0),
86  initializeTime_ (0.0),
87  computeTime_ (0.0),
88  applyTime_ (0.0),
89  RelaxValue_ (Teuchos::ScalarTraits<magnitude_type>::zero ()),
90  Athresh_ (Teuchos::ScalarTraits<magnitude_type>::zero ()),
91  Rthresh_ (Teuchos::ScalarTraits<magnitude_type>::one ()),
92  isKokkosKernelsSpiluk_(false),
93  isKokkosKernelsStream_(false),
94  num_streams_(0),
95  hasStreamReordered_(false)
96 {
97  allocateSolvers();
98 }
99 
100 
101 template<class MatrixType>
103  : A_ (Matrix_in),
104  LevelOfFill_ (0),
105  Overalloc_ (2.),
106  isAllocated_ (false),
107  isInitialized_ (false),
108  isComputed_ (false),
109  numInitialize_ (0),
110  numCompute_ (0),
111  numApply_ (0),
112  initializeTime_ (0.0),
113  computeTime_ (0.0),
114  applyTime_ (0.0),
115  RelaxValue_ (Teuchos::ScalarTraits<magnitude_type>::zero ()),
116  Athresh_ (Teuchos::ScalarTraits<magnitude_type>::zero ()),
117  Rthresh_ (Teuchos::ScalarTraits<magnitude_type>::one ()),
118  isKokkosKernelsSpiluk_(false),
119  isKokkosKernelsStream_(false),
120  num_streams_(0),
121  hasStreamReordered_(false)
122 {
123  allocateSolvers();
124 }
125 
126 
127 template<class MatrixType>
129 {
130  if (!isKokkosKernelsStream_) {
131  if (Teuchos::nonnull (KernelHandle_)) {
132  KernelHandle_->destroy_spiluk_handle();
133  }
134  }
135  else {
136  for (int i = 0; i < num_streams_; i++) {
137  if (Teuchos::nonnull (KernelHandle_v_[i])) {
138  KernelHandle_v_[i]->destroy_spiluk_handle();
139  }
140  }
141  }
142 }
143 
144 template<class MatrixType>
146 {
148  L_solver_->setObjectLabel("lower");
150  U_solver_->setObjectLabel("upper");
151 }
152 
153 template<class MatrixType>
154 void
156 {
157  // It's legal for A to be null; in that case, you may not call
158  // initialize() until calling setMatrix() with a nonnull input.
159  // Regardless, setting the matrix invalidates any previous
160  // factorization.
161  if (A.getRawPtr () != A_.getRawPtr ()) {
162  isAllocated_ = false;
163  isInitialized_ = false;
164  isComputed_ = false;
165  A_local_ = Teuchos::null;
166  Graph_ = Teuchos::null;
167 
168  // The sparse triangular solvers get a triangular factor as their
169  // input matrix. The triangular factors L_ and U_ are getting
170  // reset, so we reset the solvers' matrices to null. Do that
171  // before setting L_ and U_ to null, so that latter step actually
172  // frees the factors.
173  if (! L_solver_.is_null ()) {
174  L_solver_->setMatrix (Teuchos::null);
175  }
176  if (! U_solver_.is_null ()) {
177  U_solver_->setMatrix (Teuchos::null);
178  }
179 
180  L_ = Teuchos::null;
181  U_ = Teuchos::null;
182  D_ = Teuchos::null;
183  A_ = A;
184  }
185 }
186 
187 
188 
189 template<class MatrixType>
192 {
194  L_.is_null (), std::runtime_error, "Ifpack2::RILUK::getL: The L factor "
195  "is null. Please call initialize() (and preferably also compute()) "
196  "before calling this method. If the input matrix has not yet been set, "
197  "you must first call setMatrix() with a nonnull input matrix before you "
198  "may call initialize() or compute().");
199  return *L_;
200 }
201 
202 
203 template<class MatrixType>
204 const Tpetra::Vector<typename RILUK<MatrixType>::scalar_type,
209 {
211  D_.is_null (), std::runtime_error, "Ifpack2::RILUK::getD: The D factor "
212  "(of diagonal entries) is null. Please call initialize() (and "
213  "preferably also compute()) before calling this method. If the input "
214  "matrix has not yet been set, you must first call setMatrix() with a "
215  "nonnull input matrix before you may call initialize() or compute().");
216  return *D_;
217 }
218 
219 
220 template<class MatrixType>
223 {
225  U_.is_null (), std::runtime_error, "Ifpack2::RILUK::getU: The U factor "
226  "is null. Please call initialize() (and preferably also compute()) "
227  "before calling this method. If the input matrix has not yet been set, "
228  "you must first call setMatrix() with a nonnull input matrix before you "
229  "may call initialize() or compute().");
230  return *U_;
231 }
232 
233 
234 template<class MatrixType>
237  A_.is_null (), std::runtime_error, "Ifpack2::RILUK::getNodeSmootherComplexity: "
238  "The input matrix A is null. Please call setMatrix() with a nonnull "
239  "input matrix, then call compute(), before calling this method.");
240  // RILUK methods cost roughly one apply + the nnz in the upper+lower triangles
241  if(!L_.is_null() && !U_.is_null())
242  return A_->getLocalNumEntries() + L_->getLocalNumEntries() + U_->getLocalNumEntries();
243  else
244  return 0;
245 }
246 
247 
248 template<class MatrixType>
251  typename RILUK<MatrixType>::node_type> >
254  A_.is_null (), std::runtime_error, "Ifpack2::RILUK::getDomainMap: "
255  "The matrix is null. Please call setMatrix() with a nonnull input "
256  "before calling this method.");
257 
258  // FIXME (mfh 25 Jan 2014) Shouldn't this just come from A_?
260  Graph_.is_null (), std::runtime_error, "Ifpack2::RILUK::getDomainMap: "
261  "The computed graph is null. Please call initialize() before calling "
262  "this method.");
263  return Graph_->getL_Graph ()->getDomainMap ();
264 }
265 
266 
267 template<class MatrixType>
270  typename RILUK<MatrixType>::node_type> >
273  A_.is_null (), std::runtime_error, "Ifpack2::RILUK::getRangeMap: "
274  "The matrix is null. Please call setMatrix() with a nonnull input "
275  "before calling this method.");
276 
277  // FIXME (mfh 25 Jan 2014) Shouldn't this just come from A_?
279  Graph_.is_null (), std::runtime_error, "Ifpack2::RILUK::getRangeMap: "
280  "The computed graph is null. Please call initialize() before calling "
281  "this method.");
282  return Graph_->getL_Graph ()->getRangeMap ();
283 }
284 
285 
286 template<class MatrixType>
288 {
289  using Teuchos::null;
290  using Teuchos::rcp;
291 
292  if (! isAllocated_) {
293  if (!isKokkosKernelsStream_) {
294  // Deallocate any existing storage. This avoids storing 2x
295  // memory, since RCP op= does not deallocate until after the
296  // assignment.
297  L_ = null;
298  U_ = null;
299  D_ = null;
300 
301  // Allocate Matrix using ILUK graphs
302  L_ = rcp (new crs_matrix_type (Graph_->getL_Graph ()));
303  U_ = rcp (new crs_matrix_type (Graph_->getU_Graph ()));
304  L_->setAllToScalar (STS::zero ()); // Zero out L and U matrices
305  U_->setAllToScalar (STS::zero ());
306 
307  // FIXME (mfh 24 Jan 2014) This assumes domain == range Map for L and U.
308  L_->fillComplete ();
309  U_->fillComplete ();
310  D_ = rcp (new vec_type (Graph_->getL_Graph ()->getRowMap ()));
311  }
312  else {
313  L_v_ = std::vector< Teuchos::RCP<crs_matrix_type> >(num_streams_);
314  U_v_ = std::vector< Teuchos::RCP<crs_matrix_type> >(num_streams_);
315  for (int i = 0; i < num_streams_; i++) {
316  L_v_[i] = null;
317  U_v_[i] = null;
318 
319  L_v_[i] = rcp (new crs_matrix_type (Graph_v_[i]->getL_Graph ()));
320  U_v_[i] = rcp (new crs_matrix_type (Graph_v_[i]->getU_Graph ()));
321  L_v_[i]->setAllToScalar (STS::zero ()); // Zero out L and U matrices
322  U_v_[i]->setAllToScalar (STS::zero ());
323 
324  L_v_[i]->fillComplete ();
325  U_v_[i]->fillComplete ();
326  }
327  }
328  }
329  isAllocated_ = true;
330 }
331 
332 
333 template<class MatrixType>
334 void
337 {
338  using Teuchos::RCP;
340  using Teuchos::Array;
341  using Details::getParamTryingTypes;
342  const char prefix[] = "Ifpack2::RILUK: ";
343 
344  // Default values of the various parameters.
345  int fillLevel = 0;
346  magnitude_type absThresh = STM::zero ();
347  magnitude_type relThresh = STM::one ();
348  magnitude_type relaxValue = STM::zero ();
349  double overalloc = 2.;
350  int nstreams = 0;
351 
352  // "fact: iluk level-of-fill" parsing is more complicated, because
353  // we want to allow as many types as make sense. int is the native
354  // type, but we also want to accept double (for backwards
355  // compatibilty with ILUT). You can't cast arbitrary magnitude_type
356  // (e.g., Sacado::MP::Vector) to int, so we use float instead, to
357  // get coverage of the most common magnitude_type cases. Weirdly,
358  // there's an Ifpack2 test that sets the fill level as a
359  // global_ordinal_type.
360  {
361  const std::string paramName ("fact: iluk level-of-fill");
362  getParamTryingTypes<int, int, global_ordinal_type, double, float>
363  (fillLevel, params, paramName, prefix);
364  }
365  // For the other parameters, we prefer magnitude_type, but allow
366  // double for backwards compatibility.
367  {
368  const std::string paramName ("fact: absolute threshold");
369  getParamTryingTypes<magnitude_type, magnitude_type, double>
370  (absThresh, params, paramName, prefix);
371  }
372  {
373  const std::string paramName ("fact: relative threshold");
374  getParamTryingTypes<magnitude_type, magnitude_type, double>
375  (relThresh, params, paramName, prefix);
376  }
377  {
378  const std::string paramName ("fact: relax value");
379  getParamTryingTypes<magnitude_type, magnitude_type, double>
380  (relaxValue, params, paramName, prefix);
381  }
382  {
383  const std::string paramName ("fact: iluk overalloc");
384  getParamTryingTypes<double, double>
385  (overalloc, params, paramName, prefix);
386  }
387 
388  // Parsing implementation type
389  Details::IlukImplType::Enum ilukimplType = Details::IlukImplType::Serial;
390  do {
391  static const char typeName[] = "fact: type";
392 
393  if ( ! params.isType<std::string>(typeName)) break;
394 
395  // Map std::string <-> IlukImplType::Enum.
396  Array<std::string> ilukimplTypeStrs;
397  Array<Details::IlukImplType::Enum> ilukimplTypeEnums;
398  Details::IlukImplType::loadPLTypeOption (ilukimplTypeStrs, ilukimplTypeEnums);
400  s2i(ilukimplTypeStrs (), ilukimplTypeEnums (), typeName, false);
401 
402  ilukimplType = s2i.getIntegralValue(params.get<std::string>(typeName));
403  } while (0);
404 
405  if (ilukimplType == Details::IlukImplType::KSPILUK) {
406  this->isKokkosKernelsSpiluk_ = true;
407  }
408  else {
409  this->isKokkosKernelsSpiluk_ = false;
410  }
411 
412  {
413  const std::string paramName ("fact: kspiluk number-of-streams");
414  getParamTryingTypes<int, int, global_ordinal_type>
415  (nstreams, params, paramName, prefix);
416  }
417 
418  // Forward to trisolvers.
419  L_solver_->setParameters(params);
420  U_solver_->setParameters(params);
421 
422  // "Commit" the values only after validating all of them. This
423  // ensures that there are no side effects if this routine throws an
424  // exception.
425 
426  LevelOfFill_ = fillLevel;
427  Overalloc_ = overalloc;
428  num_streams_ = nstreams;
429 
430  if (num_streams_ >= 1) {
431  this->isKokkosKernelsStream_ = true;
432  // Will we do reordering in streams?
433  if (params.isParameter("fact: kspiluk reordering in streams"))
434  hasStreamReordered_ = params.get<bool> ("fact: kspiluk reordering in streams");
435  }
436  else {
437  this->isKokkosKernelsStream_ = false;
438  }
439 
440  // mfh 28 Nov 2012: The previous code would not assign Athresh_,
441  // Rthresh_, or RelaxValue_, if the read-in value was -1. I don't
442  // know if keeping this behavior is correct, but I'll keep it just
443  // so as not to change previous behavior.
444 
445  if (absThresh != -STM::one ()) {
446  Athresh_ = absThresh;
447  }
448  if (relThresh != -STM::one ()) {
449  Rthresh_ = relThresh;
450  }
451  if (relaxValue != -STM::one ()) {
452  RelaxValue_ = relaxValue;
453  }
454 }
455 
456 
457 template<class MatrixType>
460  return Teuchos::rcp_implicit_cast<const row_matrix_type> (A_);
461 }
462 
463 
464 template<class MatrixType>
467  return Teuchos::rcp_dynamic_cast<const crs_matrix_type> (A_, true);
468 }
469 
470 
471 template<class MatrixType>
474 {
475  using Teuchos::RCP;
476  using Teuchos::rcp;
477  using Teuchos::rcp_dynamic_cast;
478  using Teuchos::rcp_implicit_cast;
479 
480  // If A_'s communicator only has one process, or if its column and
481  // row Maps are the same, then it is already local, so use it
482  // directly.
483  if (A->getRowMap ()->getComm ()->getSize () == 1 ||
484  A->getRowMap ()->isSameAs (* (A->getColMap ()))) {
485  return A;
486  }
487 
488  // If A_ is already a LocalFilter, then use it directly. This
489  // should be the case if RILUK is being used through
490  // AdditiveSchwarz, for example.
491  RCP<const LocalFilter<row_matrix_type> > A_lf_r =
492  rcp_dynamic_cast<const LocalFilter<row_matrix_type> > (A);
493  if (! A_lf_r.is_null ()) {
494  return rcp_implicit_cast<const row_matrix_type> (A_lf_r);
495  }
496  else {
497  // A_'s communicator has more than one process, its row Map and
498  // its column Map differ, and A_ is not a LocalFilter. Thus, we
499  // have to wrap it in a LocalFilter.
500  return rcp (new LocalFilter<row_matrix_type> (A));
501  }
502 }
503 
504 
505 template<class MatrixType>
507 {
508  using Teuchos::RCP;
509  using Teuchos::rcp;
510  using Teuchos::rcp_const_cast;
511  using Teuchos::rcp_dynamic_cast;
512  using Teuchos::rcp_implicit_cast;
513  using Teuchos::Array;
514  using Teuchos::ArrayView;
515  typedef Tpetra::CrsGraph<local_ordinal_type,
517  node_type> crs_graph_type;
518  typedef Tpetra::Map<local_ordinal_type,
519  global_ordinal_type,
520  node_type> crs_map_type;
521  const char prefix[] = "Ifpack2::RILUK::initialize: ";
522 
524  (A_.is_null (), std::runtime_error, prefix << "The matrix is null. Please "
525  "call setMatrix() with a nonnull input before calling this method.");
527  (! A_->isFillComplete (), std::runtime_error, prefix << "The matrix is not "
528  "fill complete. You may not invoke initialize() or compute() with this "
529  "matrix until the matrix is fill complete. If your matrix is a "
530  "Tpetra::CrsMatrix, please call fillComplete on it (with the domain and "
531  "range Maps, if appropriate) before calling this method.");
532 
533  Teuchos::Time timer ("RILUK::initialize");
534  double startTime = timer.wallTime();
535  { // Start timing
536  Teuchos::TimeMonitor timeMon (timer);
537 
538  // Calling initialize() means that the user asserts that the graph
539  // of the sparse matrix may have changed. We must not just reuse
540  // the previous graph in that case.
541  //
542  // Regarding setting isAllocated_ to false: Eventually, we may want
543  // some kind of clever memory reuse strategy, but it's always
544  // correct just to blow everything away and start over.
545  isInitialized_ = false;
546  isAllocated_ = false;
547  isComputed_ = false;
548  Graph_ = Teuchos::null;
549 
550  if (isKokkosKernelsStream_) {
551  Graph_v_ = std::vector< Teuchos::RCP<iluk_graph_type> >(num_streams_);
552  A_local_diagblks = std::vector<local_matrix_device_type>(num_streams_);
553  for (int i = 0; i < num_streams_; i++) {
554  Graph_v_[i] = Teuchos::null;
555  }
556  }
557 
558  A_local_ = makeLocalFilter (A_);
559 
561  A_local_.is_null (), std::logic_error, "Ifpack2::RILUK::initialize: "
562  "makeLocalFilter returned null; it failed to compute A_local. "
563  "Please report this bug to the Ifpack2 developers.");
564 
565  // FIXME (mfh 24 Jan 2014, 26 Mar 2014) It would be more efficient
566  // to rewrite RILUK so that it works with any RowMatrix input, not
567  // just CrsMatrix. (That would require rewriting IlukGraph to
568  // handle a Tpetra::RowGraph.) However, to make it work for now,
569  // we just copy the input matrix if it's not a CrsMatrix.
570 
571  {
572  RCP<const crs_matrix_type> A_local_crs = Details::getCrsMatrix(A_local_);
573  if(A_local_crs.is_null()) {
574  local_ordinal_type numRows = A_local_->getLocalNumRows();
575  Array<size_t> entriesPerRow(numRows);
576  for(local_ordinal_type i = 0; i < numRows; i++) {
577  entriesPerRow[i] = A_local_->getNumEntriesInLocalRow(i);
578  }
579  RCP<crs_matrix_type> A_local_crs_nc =
580  rcp (new crs_matrix_type (A_local_->getRowMap (),
581  A_local_->getColMap (),
582  entriesPerRow()));
583  // copy entries into A_local_crs
584  nonconst_local_inds_host_view_type indices("indices",A_local_->getLocalMaxNumRowEntries());
585  nonconst_values_host_view_type values("values",A_local_->getLocalMaxNumRowEntries());
586  for(local_ordinal_type i = 0; i < numRows; i++) {
587  size_t numEntries = 0;
588  A_local_->getLocalRowCopy(i, indices, values, numEntries);
589  A_local_crs_nc->insertLocalValues(i, numEntries, reinterpret_cast<scalar_type*>(values.data()), indices.data());
590  }
591  A_local_crs_nc->fillComplete (A_local_->getDomainMap (), A_local_->getRangeMap ());
592  A_local_crs = rcp_const_cast<const crs_matrix_type> (A_local_crs_nc);
593  }
594  if (!isKokkosKernelsStream_) {
595  Graph_ = rcp (new Ifpack2::IlukGraph<crs_graph_type,kk_handle_type> (A_local_crs->getCrsGraph (),
596  LevelOfFill_, 0, Overalloc_));
597  }
598  else {
599  auto lclMtx = A_local_crs->getLocalMatrixDevice();
600  if (!hasStreamReordered_)
601  KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
602  else
603  perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true);
604  for(int i = 0; i < num_streams_; i++) {
605  Teuchos::RCP<const crs_map_type> A_local_diagblks_RowMap = rcp (new crs_map_type(A_local_diagblks[i].numRows(),
606  A_local_diagblks[i].numRows(),
607  A_local_crs->getRowMap()->getComm()));
608  Teuchos::RCP<const crs_map_type> A_local_diagblks_ColMap = rcp (new crs_map_type(A_local_diagblks[i].numCols(),
609  A_local_diagblks[i].numCols(),
610  A_local_crs->getColMap()->getComm()));
611  Teuchos::RCP<crs_matrix_type> A_local_diagblks_ = rcp (new crs_matrix_type(A_local_diagblks_RowMap,
612  A_local_diagblks_ColMap,
613  A_local_diagblks[i]));
614  Graph_v_[i] = rcp (new Ifpack2::IlukGraph<crs_graph_type,kk_handle_type> (A_local_diagblks_->getCrsGraph(),
615  LevelOfFill_, 0, Overalloc_));
616  }
617  }
618  }
619 
620  if (this->isKokkosKernelsSpiluk_) {
621  if (!isKokkosKernelsStream_) {
622  this->KernelHandle_ = Teuchos::rcp (new kk_handle_type ());
623  KernelHandle_->create_spiluk_handle( KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,
624  A_local_->getLocalNumRows(),
625  2*A_local_->getLocalNumEntries()*(LevelOfFill_+1),
626  2*A_local_->getLocalNumEntries()*(LevelOfFill_+1) );
627  Graph_->initialize (KernelHandle_); // this calls spiluk_symbolic
628  }
629  else {
630  KernelHandle_v_ = std::vector< Teuchos::RCP<kk_handle_type> >(num_streams_);
631  for (int i = 0; i < num_streams_; i++) {
632  KernelHandle_v_[i] = Teuchos::rcp (new kk_handle_type ());
633  KernelHandle_v_[i]->create_spiluk_handle( KokkosSparse::Experimental::SPILUKAlgorithm::SEQLVLSCHD_TP1,
634  A_local_diagblks[i].numRows(),
635  2*A_local_diagblks[i].nnz()*(LevelOfFill_+1),
636  2*A_local_diagblks[i].nnz()*(LevelOfFill_+1) );
637  Graph_v_[i]->initialize (KernelHandle_v_[i]); // this calls spiluk_symbolic
638  }
639  std::vector<int> weights(num_streams_);
640  std::fill(weights.begin(), weights.end(), 1);
641  exec_space_instances_ = Kokkos::Experimental::partition_space(execution_space(), weights);
642  }
643  }
644  else {
645  Graph_->initialize ();
646  }
647 
648  allocate_L_and_U ();
649  checkOrderingConsistency (*A_local_);
650  if (!isKokkosKernelsStream_) {
651  L_solver_->setMatrix (L_);
652  }
653  else {
654  L_solver_->setStreamInfo (isKokkosKernelsStream_, num_streams_, exec_space_instances_);
655  L_solver_->setMatrices (L_v_);
656  }
657  L_solver_->initialize ();
658  //NOTE (Nov-09-2022):
659  //For Cuda >= 11.3 (using cusparseSpSV), skip trisolve computes here.
660  //Instead, call trisolve computes within RILUK compute
661 #if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || !defined(KOKKOS_ENABLE_CUDA) || (CUDA_VERSION < 11030)
662  L_solver_->compute ();//NOTE: It makes sense to do compute here because only the nonzero pattern is involved in trisolve compute
663 #endif
664 
665  if (!isKokkosKernelsStream_) {
666  U_solver_->setMatrix (U_);
667  }
668  else {
669  U_solver_->setStreamInfo (isKokkosKernelsStream_, num_streams_, exec_space_instances_);
670  U_solver_->setMatrices (U_v_);
671  }
672  U_solver_->initialize ();
673 #if !defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || !defined(KOKKOS_ENABLE_CUDA) || (CUDA_VERSION < 11030)
674  U_solver_->compute ();//NOTE: It makes sense to do compute here because only the nonzero pattern is involved in trisolve compute
675 #endif
676 
677  // Do not call initAllValues. compute() always calls initAllValues to
678  // fill L and U with possibly new numbers. initialize() is concerned
679  // only with the nonzero pattern.
680  } // Stop timing
681 
682  isInitialized_ = true;
683  ++numInitialize_;
684  initializeTime_ += (timer.wallTime() - startTime);
685 }
686 
687 template<class MatrixType>
688 void
690 checkOrderingConsistency (const row_matrix_type& A)
691 {
692  // First check that the local row map ordering is the same as the local portion of the column map.
693  // The extraction of the strictly lower/upper parts of A, as well as the factorization,
694  // implicitly assume that this is the case.
695  Teuchos::ArrayView<const global_ordinal_type> rowGIDs = A.getRowMap()->getLocalElementList();
696  Teuchos::ArrayView<const global_ordinal_type> colGIDs = A.getColMap()->getLocalElementList();
697  bool gidsAreConsistentlyOrdered=true;
698  global_ordinal_type indexOfInconsistentGID=0;
699  for (global_ordinal_type i=0; i<rowGIDs.size(); ++i) {
700  if (rowGIDs[i] != colGIDs[i]) {
701  gidsAreConsistentlyOrdered=false;
702  indexOfInconsistentGID=i;
703  break;
704  }
705  }
706  TEUCHOS_TEST_FOR_EXCEPTION(gidsAreConsistentlyOrdered==false, std::runtime_error,
707  "The ordering of the local GIDs in the row and column maps is not the same"
708  << std::endl << "at index " << indexOfInconsistentGID
709  << ". Consistency is required, as all calculations are done with"
710  << std::endl << "local indexing.");
711 }
712 
713 template<class MatrixType>
714 void
715 RILUK<MatrixType>::
716 initAllValues (const row_matrix_type& A)
717 {
718  using Teuchos::ArrayRCP;
719  using Teuchos::Comm;
720  using Teuchos::ptr;
721  using Teuchos::RCP;
722  using Teuchos::REDUCE_SUM;
723  using Teuchos::reduceAll;
724  typedef Tpetra::Map<local_ordinal_type,global_ordinal_type,node_type> map_type;
725 
726  size_t NumIn = 0, NumL = 0, NumU = 0;
727  bool DiagFound = false;
728  size_t NumNonzeroDiags = 0;
729  size_t MaxNumEntries = A.getGlobalMaxNumRowEntries();
730 
731  // Allocate temporary space for extracting the strictly
732  // lower and upper parts of the matrix A.
733  nonconst_local_inds_host_view_type InI("InI",MaxNumEntries);
734  Teuchos::Array<local_ordinal_type> LI(MaxNumEntries);
735  Teuchos::Array<local_ordinal_type> UI(MaxNumEntries);
736  nonconst_values_host_view_type InV("InV",MaxNumEntries);
737  Teuchos::Array<scalar_type> LV(MaxNumEntries);
738  Teuchos::Array<scalar_type> UV(MaxNumEntries);
739 
740  // Check if values should be inserted or replaced
741  const bool ReplaceValues = L_->isStaticGraph () || L_->isLocallyIndexed ();
742 
743  L_->resumeFill ();
744  U_->resumeFill ();
745  if (ReplaceValues) {
746  L_->setAllToScalar (STS::zero ()); // Zero out L and U matrices
747  U_->setAllToScalar (STS::zero ());
748  }
749 
750  D_->putScalar (STS::zero ()); // Set diagonal values to zero
751  auto DV = Kokkos::subview(D_->getLocalViewHost(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0);
752 
753  RCP<const map_type> rowMap = L_->getRowMap ();
754 
755  // First we copy the user's matrix into L and U, regardless of fill level.
756  // It is important to note that L and U are populated using local indices.
757  // This means that if the row map GIDs are not monotonically increasing
758  // (i.e., permuted or gappy), then the strictly lower (upper) part of the
759  // matrix is not the one that you would get if you based L (U) on GIDs.
760  // This is ok, as the *order* of the GIDs in the rowmap is a better
761  // expression of the user's intent than the GIDs themselves.
762 
763  Teuchos::ArrayView<const global_ordinal_type> nodeGIDs = rowMap->getLocalElementList();
764  for (size_t myRow=0; myRow<A.getLocalNumRows(); ++myRow) {
765  local_ordinal_type local_row = myRow;
766 
767  //TODO JJH 4April2014 An optimization is to use getLocalRowView. Not all matrices support this,
768  // we'd need to check via the Tpetra::RowMatrix method supportsRowViews().
769  A.getLocalRowCopy (local_row, InI, InV, NumIn); // Get Values and Indices
770 
771  // Split into L and U (we don't assume that indices are ordered).
772 
773  NumL = 0;
774  NumU = 0;
775  DiagFound = false;
776 
777  for (size_t j = 0; j < NumIn; ++j) {
778  const local_ordinal_type k = InI[j];
779 
780  if (k == local_row) {
781  DiagFound = true;
782  // Store perturbed diagonal in Tpetra::Vector D_
783  DV(local_row) += Rthresh_ * InV[j] + IFPACK2_SGN(InV[j]) * Athresh_;
784  }
785  else if (k < 0) { // Out of range
787  true, std::runtime_error, "Ifpack2::RILUK::initAllValues: current "
788  "GID k = " << k << " < 0. I'm not sure why this is an error; it is "
789  "probably an artifact of the undocumented assumptions of the "
790  "original implementation (likely copied and pasted from Ifpack). "
791  "Nevertheless, the code I found here insisted on this being an error "
792  "state, so I will throw an exception here.");
793  }
794  else if (k < local_row) {
795  LI[NumL] = k;
796  LV[NumL] = InV[j];
797  NumL++;
798  }
799  else if (Teuchos::as<size_t>(k) <= rowMap->getLocalNumElements()) {
800  UI[NumU] = k;
801  UV[NumU] = InV[j];
802  NumU++;
803  }
804  }
805 
806  // Check in things for this row of L and U
807 
808  if (DiagFound) {
809  ++NumNonzeroDiags;
810  } else {
811  DV(local_row) = Athresh_;
812  }
813 
814  if (NumL) {
815  if (ReplaceValues) {
816  L_->replaceLocalValues(local_row, LI(0, NumL), LV(0,NumL));
817  } else {
818  //FIXME JJH 24April2014 Is this correct? I believe this case is when there aren't already values
819  //FIXME in this row in the column locations corresponding to UI.
820  L_->insertLocalValues(local_row, LI(0,NumL), LV(0,NumL));
821  }
822  }
823 
824  if (NumU) {
825  if (ReplaceValues) {
826  U_->replaceLocalValues(local_row, UI(0,NumU), UV(0,NumU));
827  } else {
828  //FIXME JJH 24April2014 Is this correct? I believe this case is when there aren't already values
829  //FIXME in this row in the column locations corresponding to UI.
830  U_->insertLocalValues(local_row, UI(0,NumU), UV(0,NumU));
831  }
832  }
833  }
834 
835  // At this point L and U have the values of A in the structure of L
836  // and U, and diagonal vector D
837 
838  isInitialized_ = true;
839 }
840 
841 
842 template<class MatrixType>
844 {
845  using Teuchos::RCP;
846  using Teuchos::rcp;
847  using Teuchos::rcp_const_cast;
848  using Teuchos::rcp_dynamic_cast;
849  using Teuchos::Array;
850  using Teuchos::ArrayView;
851  const char prefix[] = "Ifpack2::RILUK::compute: ";
852 
853  // initialize() checks this too, but it's easier for users if the
854  // error shows them the name of the method that they actually
855  // called, rather than the name of some internally called method.
857  (A_.is_null (), std::runtime_error, prefix << "The matrix is null. Please "
858  "call setMatrix() with a nonnull input before calling this method.");
860  (! A_->isFillComplete (), std::runtime_error, prefix << "The matrix is not "
861  "fill complete. You may not invoke initialize() or compute() with this "
862  "matrix until the matrix is fill complete. If your matrix is a "
863  "Tpetra::CrsMatrix, please call fillComplete on it (with the domain and "
864  "range Maps, if appropriate) before calling this method.");
865 
866  if (! isInitialized ()) {
867  initialize (); // Don't count this in the compute() time
868  }
869 
870  Teuchos::Time timer ("RILUK::compute");
871 
872  // Start timing
873  Teuchos::TimeMonitor timeMon (timer);
874  double startTime = timer.wallTime();
875 
876  isComputed_ = false;
877 
878  if (!this->isKokkosKernelsSpiluk_) {
879  // Fill L and U with numbers. This supports nonzero pattern reuse by calling
880  // initialize() once and then compute() multiple times.
881  initAllValues (*A_local_);
882 
883  // MinMachNum should be officially defined, for now pick something a little
884  // bigger than IEEE underflow value
885 
886  const scalar_type MinDiagonalValue = STS::rmin ();
887  const scalar_type MaxDiagonalValue = STS::one () / MinDiagonalValue;
888 
889  size_t NumIn, NumL, NumU;
890 
891  // Get Maximum Row length
892  const size_t MaxNumEntries =
893  L_->getLocalMaxNumRowEntries () + U_->getLocalMaxNumRowEntries () + 1;
894 
895  Teuchos::Array<local_ordinal_type> InI(MaxNumEntries); // Allocate temp space
896  Teuchos::Array<scalar_type> InV(MaxNumEntries);
897  size_t num_cols = U_->getColMap()->getLocalNumElements();
898  Teuchos::Array<int> colflag(num_cols);
899 
900  auto DV = Kokkos::subview(D_->getLocalViewHost(Tpetra::Access::ReadWrite), Kokkos::ALL(), 0);
901 
902  // Now start the factorization.
903 
904  for (size_t j = 0; j < num_cols; ++j) {
905  colflag[j] = -1;
906  }
907  using IST = typename row_matrix_type::impl_scalar_type;
908  for (size_t i = 0; i < L_->getLocalNumRows (); ++i) {
909  local_ordinal_type local_row = i;
910  // Need some integer workspace and pointers
911  size_t NumUU;
912  local_inds_host_view_type UUI;
913  values_host_view_type UUV;
914 
915  // Fill InV, InI with current row of L, D and U combined
916 
917  NumIn = MaxNumEntries;
918  nonconst_local_inds_host_view_type InI_v(InI.data(),MaxNumEntries);
919  nonconst_values_host_view_type InV_v(reinterpret_cast<IST*>(InV.data()),MaxNumEntries);
920 
921  L_->getLocalRowCopy (local_row, InI_v , InV_v, NumL);
922 
923  InV[NumL] = DV(i); // Put in diagonal
924  InI[NumL] = local_row;
925 
926  nonconst_local_inds_host_view_type InI_sub(InI.data()+NumL+1,MaxNumEntries-NumL-1);
927  nonconst_values_host_view_type InV_sub(reinterpret_cast<IST*>(InV.data())+NumL+1,MaxNumEntries-NumL-1);
928 
929  U_->getLocalRowCopy (local_row, InI_sub,InV_sub, NumU);
930  NumIn = NumL+NumU+1;
931 
932  // Set column flags
933  for (size_t j = 0; j < NumIn; ++j) {
934  colflag[InI[j]] = j;
935  }
936 
937  scalar_type diagmod = STS::zero (); // Off-diagonal accumulator
938 
939  for (size_t jj = 0; jj < NumL; ++jj) {
940  local_ordinal_type j = InI[jj];
941  IST multiplier = InV[jj]; // current_mults++;
942 
943  InV[jj] *= static_cast<scalar_type>(DV(j));
944 
945  U_->getLocalRowView(j, UUI, UUV); // View of row above
946  NumUU = UUI.size();
947 
948  if (RelaxValue_ == STM::zero ()) {
949  for (size_t k = 0; k < NumUU; ++k) {
950  const int kk = colflag[UUI[k]];
951  // FIXME (mfh 23 Dec 2013) Wait a second, we just set
952  // colflag above using size_t (which is generally unsigned),
953  // but now we're querying it using int (which is signed).
954  if (kk > -1) {
955  InV[kk] -= static_cast<scalar_type>(multiplier * UUV[k]);
956  }
957  }
958 
959  }
960  else {
961  for (size_t k = 0; k < NumUU; ++k) {
962  // FIXME (mfh 23 Dec 2013) Wait a second, we just set
963  // colflag above using size_t (which is generally unsigned),
964  // but now we're querying it using int (which is signed).
965  const int kk = colflag[UUI[k]];
966  if (kk > -1) {
967  InV[kk] -= static_cast<scalar_type>(multiplier*UUV[k]);
968  }
969  else {
970  diagmod -= static_cast<scalar_type>(multiplier*UUV[k]);
971  }
972  }
973  }
974  }
975 
976  if (NumL) {
977  // Replace current row of L
978  L_->replaceLocalValues (local_row, InI (0, NumL), InV (0, NumL));
979  }
980 
981  DV(i) = InV[NumL]; // Extract Diagonal value
982 
983  if (RelaxValue_ != STM::zero ()) {
984  DV(i) += RelaxValue_*diagmod; // Add off diagonal modifications
985  }
986 
987  if (STS::magnitude (DV(i)) > STS::magnitude (MaxDiagonalValue)) {
988  if (STS::real (DV(i)) < STM::zero ()) {
989  DV(i) = -MinDiagonalValue;
990  }
991  else {
992  DV(i) = MinDiagonalValue;
993  }
994  }
995  else {
996  DV(i) = static_cast<impl_scalar_type>(STS::one ()) / DV(i); // Invert diagonal value
997  }
998 
999  for (size_t j = 0; j < NumU; ++j) {
1000  InV[NumL+1+j] *= static_cast<scalar_type>(DV(i)); // Scale U by inverse of diagonal
1001  }
1002 
1003  if (NumU) {
1004  // Replace current row of L and U
1005  U_->replaceLocalValues (local_row, InI (NumL+1, NumU), InV (NumL+1, NumU));
1006  }
1007 
1008  // Reset column flags
1009  for (size_t j = 0; j < NumIn; ++j) {
1010  colflag[InI[j]] = -1;
1011  }
1012  }
1013 
1014  // The domain of L and the range of U are exactly their own row maps
1015  // (there is no communication). The domain of U and the range of L
1016  // must be the same as those of the original matrix, However if the
1017  // original matrix is a VbrMatrix, these two latter maps are
1018  // translation from a block map to a point map.
1019  // FIXME (mfh 23 Dec 2013) Do we know that the column Map of L_ is
1020  // always one-to-one?
1021  L_->fillComplete (L_->getColMap (), A_local_->getRangeMap ());
1022  U_->fillComplete (A_local_->getDomainMap (), U_->getRowMap ());
1023 
1024  // If L_solver_ or U_solver store modified factors internally, we need to reset those
1025  L_solver_->setMatrix (L_);
1026  L_solver_->compute ();//NOTE: Only do compute if the pointer changed. Otherwise, do nothing
1027  U_solver_->setMatrix (U_);
1028  U_solver_->compute ();//NOTE: Only do compute if the pointer changed. Otherwise, do nothing
1029  }
1030  else {
1031  {//Make sure values in A is picked up even in case of pattern reuse
1032  RCP<const crs_matrix_type> A_local_crs = Details::getCrsMatrix(A_local_);
1033  if(A_local_crs.is_null()) {
1034  local_ordinal_type numRows = A_local_->getLocalNumRows();
1035  Array<size_t> entriesPerRow(numRows);
1036  for(local_ordinal_type i = 0; i < numRows; i++) {
1037  entriesPerRow[i] = A_local_->getNumEntriesInLocalRow(i);
1038  }
1039  RCP<crs_matrix_type> A_local_crs_nc =
1040  rcp (new crs_matrix_type (A_local_->getRowMap (),
1041  A_local_->getColMap (),
1042  entriesPerRow()));
1043  // copy entries into A_local_crs
1044  nonconst_local_inds_host_view_type indices("indices",A_local_->getLocalMaxNumRowEntries());
1045  nonconst_values_host_view_type values("values",A_local_->getLocalMaxNumRowEntries());
1046  for(local_ordinal_type i = 0; i < numRows; i++) {
1047  size_t numEntries = 0;
1048  A_local_->getLocalRowCopy(i, indices, values, numEntries);
1049  A_local_crs_nc->insertLocalValues(i, numEntries, reinterpret_cast<scalar_type*>(values.data()),indices.data());
1050  }
1051  A_local_crs_nc->fillComplete (A_local_->getDomainMap (), A_local_->getRangeMap ());
1052  A_local_crs = rcp_const_cast<const crs_matrix_type> (A_local_crs_nc);
1053  }
1054  auto lclMtx = A_local_crs->getLocalMatrixDevice();
1055  if (!isKokkosKernelsStream_) {
1056  A_local_rowmap_ = lclMtx.graph.row_map;
1057  A_local_entries_ = lclMtx.graph.entries;
1058  A_local_values_ = lclMtx.values;
1059  }
1060  else {
1061  if (!hasStreamReordered_)
1062  KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks);
1063  else
1064  perm_v_ = KokkosSparse::Impl::kk_extract_diagonal_blocks_crsmatrix_sequential(lclMtx, A_local_diagblks, true);
1065 
1066  A_local_diagblks_rowmap_v_ = std::vector<lno_row_view_t>(num_streams_);
1067  A_local_diagblks_entries_v_ = std::vector<lno_nonzero_view_t>(num_streams_);
1068  A_local_diagblks_values_v_ = std::vector<scalar_nonzero_view_t>(num_streams_);
1069  for(int i = 0; i < num_streams_; i++) {
1070  A_local_diagblks_rowmap_v_[i] = A_local_diagblks[i].graph.row_map;
1071  A_local_diagblks_entries_v_[i] = A_local_diagblks[i].graph.entries;
1072  A_local_diagblks_values_v_[i] = A_local_diagblks[i].values;
1073  }
1074  }
1075  }
1076 
1077  if (!isKokkosKernelsStream_) {
1078  L_->resumeFill ();
1079  U_->resumeFill ();
1080 
1081  if (L_->isStaticGraph () || L_->isLocallyIndexed ()) {
1082  L_->setAllToScalar (STS::zero ()); // Zero out L and U matrices
1083  U_->setAllToScalar (STS::zero ());
1084  }
1085  }
1086  else {
1087  for(int i = 0; i < num_streams_; i++) {
1088  L_v_[i]->resumeFill ();
1089  U_v_[i]->resumeFill ();
1090 
1091  if (L_v_[i]->isStaticGraph () || L_v_[i]->isLocallyIndexed ()) {
1092  L_v_[i]->setAllToScalar (STS::zero ()); // Zero out L and U matrices
1093  U_v_[i]->setAllToScalar (STS::zero ());
1094  }
1095  }
1096  }
1097 
1098  using row_map_type = typename crs_matrix_type::local_matrix_device_type::row_map_type;
1099 
1100  if (!isKokkosKernelsStream_) {
1101  auto lclL = L_->getLocalMatrixDevice();
1102  row_map_type L_rowmap = lclL.graph.row_map;
1103  auto L_entries = lclL.graph.entries;
1104  auto L_values = lclL.values;
1105 
1106  auto lclU = U_->getLocalMatrixDevice();
1107  row_map_type U_rowmap = lclU.graph.row_map;
1108  auto U_entries = lclU.graph.entries;
1109  auto U_values = lclU.values;
1110 
1111  KokkosSparse::Experimental::spiluk_numeric( KernelHandle_.getRawPtr(), LevelOfFill_,
1112  A_local_rowmap_, A_local_entries_, A_local_values_,
1113  L_rowmap, L_entries, L_values, U_rowmap, U_entries, U_values );
1114 
1115  L_->fillComplete (L_->getColMap (), A_local_->getRangeMap ());
1116  U_->fillComplete (A_local_->getDomainMap (), U_->getRowMap ());
1117 
1118  L_solver_->setMatrix (L_);
1119  U_solver_->setMatrix (U_);
1120  }
1121  else {
1122  std::vector<lno_row_view_t> L_rowmap_v(num_streams_);
1123  std::vector<lno_nonzero_view_t> L_entries_v(num_streams_);
1124  std::vector<scalar_nonzero_view_t> L_values_v(num_streams_);
1125  std::vector<lno_row_view_t> U_rowmap_v(num_streams_);
1126  std::vector<lno_nonzero_view_t> U_entries_v(num_streams_);
1127  std::vector<scalar_nonzero_view_t> U_values_v(num_streams_);
1128  std::vector<kk_handle_type *> KernelHandle_rawptr_v_(num_streams_);
1129  for(int i = 0; i < num_streams_; i++) {
1130  auto lclL = L_v_[i]->getLocalMatrixDevice();
1131  L_rowmap_v[i] = lclL.graph.row_map;
1132  L_entries_v[i] = lclL.graph.entries;
1133  L_values_v[i] = lclL.values;
1134 
1135  auto lclU = U_v_[i]->getLocalMatrixDevice();
1136  U_rowmap_v[i] = lclU.graph.row_map;
1137  U_entries_v[i] = lclU.graph.entries;
1138  U_values_v[i] = lclU.values;
1139  KernelHandle_rawptr_v_[i] = KernelHandle_v_[i].getRawPtr();
1140  }
1141  KokkosSparse::Experimental::spiluk_numeric_streams( exec_space_instances_, KernelHandle_rawptr_v_, LevelOfFill_,
1142  A_local_diagblks_rowmap_v_, A_local_diagblks_entries_v_, A_local_diagblks_values_v_,
1143  L_rowmap_v, L_entries_v, L_values_v,
1144  U_rowmap_v, U_entries_v, U_values_v );
1145  for(int i = 0; i < num_streams_; i++) {
1146  L_v_[i]->fillComplete ();
1147  U_v_[i]->fillComplete ();
1148  }
1149 
1150  L_solver_->setMatrices (L_v_);
1151  U_solver_->setMatrices (U_v_);
1152  }
1153 
1154  L_solver_->compute ();//NOTE: Only do compute if the pointer changed. Otherwise, do nothing
1155  U_solver_->compute ();//NOTE: Only do compute if the pointer changed. Otherwise, do nothing
1156  }
1157 
1158  isComputed_ = true;
1159  ++numCompute_;
1160  computeTime_ += (timer.wallTime() - startTime);
1161 }
1162 
1163 
1164 template<class MatrixType>
1165 void
1167 apply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1168  Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y,
1169  Teuchos::ETransp mode,
1170  scalar_type alpha,
1171  scalar_type beta) const
1172 {
1173  using Teuchos::RCP;
1174  using Teuchos::rcpFromRef;
1175 
1177  A_.is_null (), std::runtime_error, "Ifpack2::RILUK::apply: The matrix is "
1178  "null. Please call setMatrix() with a nonnull input, then initialize() "
1179  "and compute(), before calling this method.");
1181  ! isComputed (), std::runtime_error,
1182  "Ifpack2::RILUK::apply: If you have not yet called compute(), "
1183  "you must call compute() before calling this method.");
1184  TEUCHOS_TEST_FOR_EXCEPTION(
1185  X.getNumVectors () != Y.getNumVectors (), std::invalid_argument,
1186  "Ifpack2::RILUK::apply: X and Y do not have the same number of columns. "
1187  "X.getNumVectors() = " << X.getNumVectors ()
1188  << " != Y.getNumVectors() = " << Y.getNumVectors () << ".");
1189  TEUCHOS_TEST_FOR_EXCEPTION(
1190  STS::isComplex && mode == Teuchos::CONJ_TRANS, std::logic_error,
1191  "Ifpack2::RILUK::apply: mode = Teuchos::CONJ_TRANS is not implemented for "
1192  "complex Scalar type. Please talk to the Ifpack2 developers to get this "
1193  "fixed. There is a FIXME in this file about this very issue.");
1194 #ifdef HAVE_IFPACK2_DEBUG
1195  {
1196  const magnitude_type D_nrm1 = D_->norm1 ();
1197  TEUCHOS_TEST_FOR_EXCEPTION( STM::isnaninf (D_nrm1), std::runtime_error, "Ifpack2::RILUK::apply: The 1-norm of the stored diagonal is NaN or Inf.");
1198  Teuchos::Array<magnitude_type> norms (X.getNumVectors ());
1199  X.norm1 (norms ());
1200  bool good = true;
1201  for (size_t j = 0; j < X.getNumVectors (); ++j) {
1202  if (STM::isnaninf (norms[j])) {
1203  good = false;
1204  break;
1205  }
1206  }
1207  TEUCHOS_TEST_FOR_EXCEPTION( ! good, std::runtime_error, "Ifpack2::RILUK::apply: The 1-norm of the input X is NaN or Inf.");
1208  }
1209 #endif // HAVE_IFPACK2_DEBUG
1210 
1211  const scalar_type one = STS::one ();
1212  const scalar_type zero = STS::zero ();
1213 
1214  Teuchos::Time timer ("RILUK::apply");
1215  double startTime = timer.wallTime();
1216  { // Start timing
1217  Teuchos::TimeMonitor timeMon (timer);
1218  if (alpha == one && beta == zero) {
1219  if (isKokkosKernelsSpiluk_ && isKokkosKernelsStream_ && hasStreamReordered_) {
1220  MV ReorderedX (X.getMap(), X.getNumVectors());
1221  MV ReorderedY (Y.getMap(), Y.getNumVectors());
1222  for (size_t j = 0; j < X.getNumVectors(); j++) {
1223  auto X_j = X.getVector(j);
1224  auto ReorderedX_j = ReorderedX.getVectorNonConst(j);
1225  auto X_lcl = X_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
1226  auto ReorderedX_lcl = ReorderedX_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
1227  local_ordinal_type stream_begin = 0;
1228  local_ordinal_type stream_end;
1229  for(int i = 0; i < num_streams_; i++) {
1230  auto perm_i = perm_v_[i];
1231  stream_end = stream_begin + perm_i.extent(0);
1232  auto X_lcl_sub = Kokkos::subview (X_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1233  auto ReorderedX_lcl_sub = Kokkos::subview (ReorderedX_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1234  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) {
1235  ReorderedX_lcl_sub(perm_i(ii)) = X_lcl_sub(ii);
1236  });
1237  stream_begin = stream_end;
1238  }
1239  }
1240  Kokkos::fence(); // Make sure X is completely reordered
1241  if (mode == Teuchos::NO_TRANS) { // Solve L (U Y) = X for Y.
1242  // Solve L Y = X for Y.
1243  L_solver_->apply (ReorderedX, Y, mode);
1244  // Solve U Y = Y for Y.
1245  U_solver_->apply (Y, ReorderedY, mode);
1246  }
1247  else { // Solve U^P (L^P Y) = X for Y (where P is * or T).
1248  // Solve U^P Y = X for Y.
1249  U_solver_->apply (ReorderedX, Y, mode);
1250  // Solve L^P Y = Y for Y.
1251  L_solver_->apply (Y, ReorderedY, mode);
1252  }
1253 
1254  for (size_t j = 0; j < Y.getNumVectors(); j++) {
1255  auto Y_j = Y.getVectorNonConst(j);
1256  auto ReorderedY_j = ReorderedY.getVector(j);
1257  auto Y_lcl = Y_j->getLocalViewDevice(Tpetra::Access::ReadWrite);
1258  auto ReorderedY_lcl = ReorderedY_j->getLocalViewDevice(Tpetra::Access::ReadOnly);
1259  local_ordinal_type stream_begin = 0;
1260  local_ordinal_type stream_end;
1261  for(int i = 0; i < num_streams_; i++) {
1262  auto perm_i = perm_v_[i];
1263  stream_end = stream_begin + perm_i.extent(0);
1264  auto Y_lcl_sub = Kokkos::subview (Y_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1265  auto ReorderedY_lcl_sub = Kokkos::subview (ReorderedY_lcl, Kokkos::make_pair(stream_begin, stream_end), 0);
1266  Kokkos::parallel_for( Kokkos::RangePolicy<execution_space>(0, static_cast<int>(perm_i.extent(0))), KOKKOS_LAMBDA ( const int& ii ) {
1267  Y_lcl_sub(ii) = ReorderedY_lcl_sub(perm_i(ii));
1268  });
1269  stream_begin = stream_end;
1270  }
1271  }
1272  }
1273  else {
1274  if (mode == Teuchos::NO_TRANS) { // Solve L (D (U Y)) = X for Y.
1275 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
1276  //NOTE (Nov-15-2022):
1277  //This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
1278  //since cusparseSpSV_solve() does not support in-place computation
1279  MV Y_tmp (Y.getMap (), Y.getNumVectors ());
1280 
1281  // Start by solving L Y_tmp = X for Y_tmp.
1282  L_solver_->apply (X, Y_tmp, mode);
1283 
1284  if (!this->isKokkosKernelsSpiluk_) {
1285  // Solve D Y = Y. The operation lets us do this in place in Y, so we can
1286  // write "solve D Y = Y for Y."
1287  Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
1288  }
1289 
1290  U_solver_->apply (Y_tmp, Y, mode); // Solve U Y = Y_tmp.
1291 #else
1292  // Start by solving L Y = X for Y.
1293  L_solver_->apply (X, Y, mode);
1294 
1295  if (!this->isKokkosKernelsSpiluk_) {
1296  // Solve D Y = Y. The operation lets us do this in place in Y, so we can
1297  // write "solve D Y = Y for Y."
1298  Y.elementWiseMultiply (one, *D_, Y, zero);
1299  }
1300 
1301  U_solver_->apply (Y, Y, mode); // Solve U Y = Y.
1302 #endif
1303  }
1304  else { // Solve U^P (D^P (L^P Y)) = X for Y (where P is * or T).
1305 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && defined(KOKKOS_ENABLE_CUDA) && (CUDA_VERSION >= 11030)
1306  //NOTE (Nov-15-2022):
1307  //This is a workaround for Cuda >= 11.3 (using cusparseSpSV)
1308  //since cusparseSpSV_solve() does not support in-place computation
1309  MV Y_tmp (Y.getMap (), Y.getNumVectors ());
1310 
1311  // Start by solving U^P Y_tmp = X for Y_tmp.
1312  U_solver_->apply (X, Y_tmp, mode);
1313 
1314  if (!this->isKokkosKernelsSpiluk_) {
1315  // Solve D^P Y = Y.
1316  //
1317  // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
1318  // need to do an elementwise multiply with the conjugate of
1319  // D_, not just with D_ itself.
1320  Y_tmp.elementWiseMultiply (one, *D_, Y_tmp, zero);
1321  }
1322 
1323  L_solver_->apply (Y_tmp, Y, mode); // Solve L^P Y = Y_tmp.
1324 #else
1325  // Start by solving U^P Y = X for Y.
1326  U_solver_->apply (X, Y, mode);
1327 
1328  if (!this->isKokkosKernelsSpiluk_) {
1329  // Solve D^P Y = Y.
1330  //
1331  // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we
1332  // need to do an elementwise multiply with the conjugate of
1333  // D_, not just with D_ itself.
1334  Y.elementWiseMultiply (one, *D_, Y, zero);
1335  }
1336 
1337  L_solver_->apply (Y, Y, mode); // Solve L^P Y = Y.
1338 #endif
1339  }
1340  }
1341  }
1342  else { // alpha != 1 or beta != 0
1343  if (alpha == zero) {
1344  // The special case for beta == 0 ensures that if Y contains Inf
1345  // or NaN values, we replace them with 0 (following BLAS
1346  // convention), rather than multiplying them by 0 to get NaN.
1347  if (beta == zero) {
1348  Y.putScalar (zero);
1349  } else {
1350  Y.scale (beta);
1351  }
1352  } else { // alpha != zero
1353  MV Y_tmp (Y.getMap (), Y.getNumVectors ());
1354  apply (X, Y_tmp, mode);
1355  Y.update (alpha, Y_tmp, beta);
1356  }
1357  }
1358  }//end timing
1359 
1360 #ifdef HAVE_IFPACK2_DEBUG
1361  {
1362  Teuchos::Array<magnitude_type> norms (Y.getNumVectors ());
1363  Y.norm1 (norms ());
1364  bool good = true;
1365  for (size_t j = 0; j < Y.getNumVectors (); ++j) {
1366  if (STM::isnaninf (norms[j])) {
1367  good = false;
1368  break;
1369  }
1370  }
1371  TEUCHOS_TEST_FOR_EXCEPTION( ! good, std::runtime_error, "Ifpack2::RILUK::apply: The 1-norm of the output Y is NaN or Inf.");
1372  }
1373 #endif // HAVE_IFPACK2_DEBUG
1374 
1375  ++numApply_;
1376  applyTime_ += (timer.wallTime() - startTime);
1377 }
1378 
1379 
1380 //VINH comment out since multiply() is not needed anywhere
1381 //template<class MatrixType>
1382 //void RILUK<MatrixType>::
1383 //multiply (const Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& X,
1384 // Tpetra::MultiVector<scalar_type,local_ordinal_type,global_ordinal_type,node_type>& Y,
1385 // const Teuchos::ETransp mode) const
1386 //{
1387 // const scalar_type zero = STS::zero ();
1388 // const scalar_type one = STS::one ();
1389 //
1390 // if (mode != Teuchos::NO_TRANS) {
1391 // U_->apply (X, Y, mode); //
1392 // Y.update (one, X, one); // Y = Y + X (account for implicit unit diagonal)
1393 //
1394 // // FIXME (mfh 24 Jan 2014) If mode = Teuchos::CONJ_TRANS, we need
1395 // // to do an elementwise multiply with the conjugate of D_, not
1396 // // just with D_ itself.
1397 // Y.elementWiseMultiply (one, *D_, Y, zero); // y = D*y (D_ has inverse of diagonal)
1398 //
1399 // MV Y_tmp (Y, Teuchos::Copy); // Need a temp copy of Y
1400 // L_->apply (Y_tmp, Y, mode);
1401 // Y.update (one, Y_tmp, one); // (account for implicit unit diagonal)
1402 // }
1403 // else {
1404 // L_->apply (X, Y, mode);
1405 // Y.update (one, X, one); // Y = Y + X (account for implicit unit diagonal)
1406 // Y.elementWiseMultiply (one, *D_, Y, zero); // y = D*y (D_ has inverse of diagonal)
1407 // MV Y_tmp (Y, Teuchos::Copy); // Need a temp copy of Y1
1408 // U_->apply (Y_tmp, Y, mode);
1409 // Y.update (one, Y_tmp, one); // (account for implicit unit diagonal)
1410 // }
1411 //}
1412 
1413 template<class MatrixType>
1415 {
1416  std::ostringstream os;
1417 
1418  // Output is a valid YAML dictionary in flow style. If you don't
1419  // like everything on a single line, you should call describe()
1420  // instead.
1421  os << "\"Ifpack2::RILUK\": {";
1422  os << "Initialized: " << (isInitialized () ? "true" : "false") << ", "
1423  << "Computed: " << (isComputed () ? "true" : "false") << ", ";
1424 
1425  os << "Level-of-fill: " << getLevelOfFill() << ", ";
1426 
1427  if(isKokkosKernelsSpiluk_) os<<"KK-SPILUK, ";
1428  if(isKokkosKernelsStream_) os<<"KK-Stream, ";
1429 
1430  if (A_.is_null ()) {
1431  os << "Matrix: null";
1432  }
1433  else {
1434  os << "Global matrix dimensions: ["
1435  << A_->getGlobalNumRows () << ", " << A_->getGlobalNumCols () << "]"
1436  << ", Global nnz: " << A_->getGlobalNumEntries();
1437  }
1438 
1439  if (! L_solver_.is_null ()) os << ", " << L_solver_->description ();
1440  if (! U_solver_.is_null ()) os << ", " << U_solver_->description ();
1441 
1442  os << "}";
1443  return os.str ();
1444 }
1445 
1446 } // namespace Ifpack2
1447 
1448 #define IFPACK2_RILUK_INSTANT(S,LO,GO,N) \
1449  template class Ifpack2::RILUK< Tpetra::RowMatrix<S, LO, GO, N> >;
1450 
1451 #endif
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getRangeMap() const
Returns the Tpetra::Map object associated with the range of this operator.
Definition: Ifpack2_RILUK_def.hpp:271
node_type::execution_space execution_space
The Kokkos execution space of the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:275
MatrixType::global_ordinal_type global_ordinal_type
The type of global indices in the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:263
T & get(const std::string &name, T def_value)
bool nonnull(const std::shared_ptr< T > &p)
const Tpetra::Vector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > & getD() const
Return the diagonal entries of the ILU factorization.
Definition: Ifpack2_RILUK_def.hpp:208
void initialize()
Initialize by computing the symbolic incomplete factorization.
Definition: Ifpack2_RILUK_def.hpp:506
#define TEUCHOS_TEST_FOR_EXCEPTION(throw_exception_test, Exception, msg)
Tpetra::RowMatrix< scalar_type, local_ordinal_type, global_ordinal_type, node_type > row_matrix_type
Tpetra::RowMatrix specialization used by this class.
Definition: Ifpack2_RILUK_decl.hpp:281
size_type size() const
virtual void setMatrix(const Teuchos::RCP< const row_matrix_type > &A)
Change the matrix to be preconditioned.
Definition: Ifpack2_RILUK_def.hpp:155
std::string description() const
A one-line description of this object.
Definition: Ifpack2_RILUK_def.hpp:1414
void setParameters(const Teuchos::ParameterList &params)
Definition: Ifpack2_RILUK_def.hpp:336
ILU(k) factorization of a given Tpetra::RowMatrix.
Definition: Ifpack2_RILUK_decl.hpp:245
size_t getNodeSmootherComplexity() const
Get a rough estimate of cost per iteration.
Definition: Ifpack2_RILUK_def.hpp:235
Teuchos::RCP< const crs_matrix_type > getCrsMatrix() const
Return the input matrix A as a Tpetra::CrsMatrix, if possible; else throws.
Definition: Ifpack2_RILUK_def.hpp:466
Tpetra::CrsMatrix< scalar_type, local_ordinal_type, global_ordinal_type, node_type > crs_matrix_type
Tpetra::CrsMatrix specialization used by this class for representing L and U.
Definition: Ifpack2_RILUK_decl.hpp:284
const crs_matrix_type & getU() const
Return the U factor of the ILU factorization.
Definition: Ifpack2_RILUK_def.hpp:222
bool isParameter(const std::string &name) const
MatrixType::node_type node_type
The Node type used by the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:266
T * getRawPtr() const
Teuchos::RCP< const Tpetra::Map< local_ordinal_type, global_ordinal_type, node_type > > getDomainMap() const
Returns the Tpetra::Map object associated with the domain of this operator.
Definition: Ifpack2_RILUK_def.hpp:252
TEUCHOS_DEPRECATED RCP< T > rcp(T *p, Dealloc_T dealloc, bool owns_mem)
MatrixType::local_ordinal_type local_ordinal_type
The type of local indices in the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:260
virtual ~RILUK()
Destructor (declared virtual for memory safety).
Definition: Ifpack2_RILUK_def.hpp:128
&quot;Preconditioner&quot; that solves local sparse triangular systems.
Definition: Ifpack2_LocalSparseTriangularSolver_decl.hpp:79
void compute()
Compute the (numeric) incomplete factorization.
Definition: Ifpack2_RILUK_def.hpp:843
void apply(const Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &X, Tpetra::MultiVector< scalar_type, local_ordinal_type, global_ordinal_type, node_type > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, scalar_type alpha=Teuchos::ScalarTraits< scalar_type >::one(), scalar_type beta=Teuchos::ScalarTraits< scalar_type >::zero()) const
Apply the (inverse of the) incomplete factorization to X, resulting in Y.
Definition: Ifpack2_RILUK_def.hpp:1167
Teuchos::ScalarTraits< scalar_type >::magnitudeType magnitude_type
The type of the magnitude (absolute value) of a matrix entry.
Definition: Ifpack2_RILUK_decl.hpp:269
crs_matrix_type::impl_scalar_type impl_scalar_type
Scalar type stored in Kokkos::Views (CrsMatrix and MultiVector)
Definition: Ifpack2_RILUK_decl.hpp:293
void resize(size_type new_size, const value_type &x=value_type())
Construct a level filled graph for use in computing an ILU(k) incomplete factorization.
Definition: Ifpack2_IlukGraph.hpp:100
IntegralType getIntegralValue(const std::string &str, const std::string &paramName="", const std::string &sublistName="") const
Teuchos::RCP< const row_matrix_type > getMatrix() const
Get the input matrix.
Definition: Ifpack2_RILUK_def.hpp:459
size_type size() const
bool isType(const std::string &name) const
static double wallTime()
const crs_matrix_type & getL() const
Return the L factor of the ILU factorization.
Definition: Ifpack2_RILUK_def.hpp:191
std::string typeName(const T &t)
MatrixType::scalar_type scalar_type
The type of the entries of the input MatrixType.
Definition: Ifpack2_RILUK_decl.hpp:257
bool is_null() const