Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_MatrixMatrix_Cuda.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
12 
13 #include "Tpetra_Details_IntRowPtrHelper.hpp"
14 
15 #ifdef HAVE_TPETRA_INST_CUDA
16 namespace Tpetra {
17 namespace MMdetails {
18 
19 /*********************************************************************************************************/
20 // MMM KernelWrappers for Partial Specialization to CUDA
21 template <class Scalar,
22  class LocalOrdinal,
23  class GlobalOrdinal,
24  class LocalOrdinalViewType>
25 struct KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType> {
26  static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
27  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
28  const LocalOrdinalViewType& Acol2Brow,
29  const LocalOrdinalViewType& Acol2Irow,
30  const LocalOrdinalViewType& Bcol2Ccol,
31  const LocalOrdinalViewType& Icol2Ccol,
32  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
33  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
34  const std::string& label = std::string(),
35  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
36 
37  static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
38  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
39  const LocalOrdinalViewType& Acol2Brow,
40  const LocalOrdinalViewType& Acol2Irow,
41  const LocalOrdinalViewType& Bcol2Ccol,
42  const LocalOrdinalViewType& Icol2Ccol,
43  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
44  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
45  const std::string& label = std::string(),
46  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
47 };
48 
49 // Jacobi KernelWrappers for Partial Specialization to Cuda
50 template <class Scalar,
51  class LocalOrdinal,
52  class GlobalOrdinal, class LocalOrdinalViewType>
53 struct KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType> {
54  static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
55  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
56  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
57  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
58  const LocalOrdinalViewType& Acol2Brow,
59  const LocalOrdinalViewType& Acol2Irow,
60  const LocalOrdinalViewType& Bcol2Ccol,
61  const LocalOrdinalViewType& Icol2Ccol,
62  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
63  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
64  const std::string& label = std::string(),
65  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
66 
67  static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
68  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
69  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
70  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
71  const LocalOrdinalViewType& Acol2Brow,
72  const LocalOrdinalViewType& Acol2Irow,
73  const LocalOrdinalViewType& Bcol2Ccol,
74  const LocalOrdinalViewType& Icol2Ccol,
75  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
76  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
77  const std::string& label = std::string(),
78  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
79 
80  static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
81  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
82  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
83  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
84  const LocalOrdinalViewType& Acol2Brow,
85  const LocalOrdinalViewType& Acol2Irow,
86  const LocalOrdinalViewType& Bcol2Ccol,
87  const LocalOrdinalViewType& Icol2Ccol,
88  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
89  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
90  const std::string& label = std::string(),
91  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
92 };
93 
94 /*********************************************************************************************************/
95 // AB NewMatrix Kernel wrappers (KokkosKernels/CUDA Version)
96 template <class Scalar,
97  class LocalOrdinal,
98  class GlobalOrdinal,
99  class LocalOrdinalViewType>
100 void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
101  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
102  const LocalOrdinalViewType& Acol2Brow,
103  const LocalOrdinalViewType& Acol2Irow,
104  const LocalOrdinalViewType& Bcol2Ccol,
105  const LocalOrdinalViewType& Icol2Ccol,
106  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
107  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
108  const std::string& label,
109  const Teuchos::RCP<Teuchos::ParameterList>& params) {
110  using Teuchos::RCP;
111  using Teuchos::rcp;
112  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaWrapper"));
113 
114  // Node-specific code
115  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
116  std::string nodename("Cuda");
117 
118  // Lots and lots of typedefs
120  typedef typename KCRS::device_type device_t;
121  typedef typename KCRS::StaticCrsGraphType graph_t;
122  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
123  using int_view_t = Kokkos::View<int*, typename lno_view_t::array_layout, typename lno_view_t::memory_space, typename lno_view_t::memory_traits>;
124  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
125  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
126  typedef typename KCRS::values_type::non_const_type scalar_view_t;
127  // typedef typename graph_t::row_map_type::const_type lno_view_t_const;
128 
129  // Options
130  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
131  std::string myalg("SPGEMM_KK_MEMORY");
132  if (!params.is_null()) {
133  if (params->isParameter("cuda: algorithm"))
134  myalg = params->get("cuda: algorithm", myalg);
135  if (params->isParameter("cuda: team work size"))
136  team_work_size = params->get("cuda: team work size", team_work_size);
137  }
138 
139  // KokkosKernelsHandle
140  typedef KokkosKernels::Experimental::KokkosKernelsHandle<
141  typename lno_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
142  typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>
143  KernelHandle;
144  using IntKernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
145  typename int_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
146  typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
147 
148  // Grab the Kokkos::SparseCrsMatrices
149  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
150  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
151 
152  c_lno_view_t Arowptr = Amat.graph.row_map,
153  Browptr = Bmat.graph.row_map;
154  const lno_nnz_view_t Acolind = Amat.graph.entries,
155  Bcolind = Bmat.graph.entries;
156  const scalar_view_t Avals = Amat.values,
157  Bvals = Bmat.values;
158 
159  // Get the algorithm mode
160  std::string alg = nodename + std::string(" algorithm");
161  // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
162  if (!params.is_null() && params->isParameter(alg)) myalg = params->get(alg, myalg);
163  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
164 
165  // Merge the B and Bimport matrices
166  KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C.getColMap()->getLocalNumElements());
167 
168  // if Kokkos Kernels is going to use the cuSparse TPL for SpGEMM, this matrix
169  // needs to be sorted
170  // Try to mirror the Kokkos Kernels internal SpGEMM TPL use logic
171  // inspired by https://github.com/trilinos/Trilinos/pull/11709
172  // and https://github.com/kokkos/kokkos-kernels/pull/2008
173 #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) && ((CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040))
174  if constexpr (std::is_same_v<typename device_t::execution_space, Kokkos::Cuda>) {
175  if (!KokkosSparse::isCrsGraphSorted(Bmerged.graph.row_map, Bmerged.graph.entries)) {
176  KokkosSparse::sort_crs_matrix(Bmerged);
177  }
178  }
179 #endif
180 
181  MM = Teuchos::null;
182  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaCore"));
183 
184  // Do the multiply on whatever we've got
185  typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
186  typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
187  typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
188 
189  // regardless of whether integer row ptrs are used, need to ultimately produce the row pointer, entries, and values of the expected types
190  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lno_row"), AnumRows + 1);
191  lno_nnz_view_t entriesC;
192  scalar_view_t valuesC;
193 
194  Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
195  const bool useIntRowptrs =
196  irph.shouldUseIntRowptrs() &&
197  Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
198 
199  if (useIntRowptrs) {
200  IntKernelHandle kh;
201  kh.create_spgemm_handle(alg_enum);
202  kh.set_team_work_size(team_work_size);
203 
204  int_view_t int_row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_int_row"), AnumRows + 1);
205 
206  auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
207  auto Bint = irph.getIntRowptrMatrix(Bmerged);
208 
209  {
210  Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: Newmatrix KokkosKernels symbolic int");
211  KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols, Aint.graph.row_map, Aint.graph.entries, false, Bint.graph.row_map, Bint.graph.entries, false, int_row_mapC);
212  }
213 
214  Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: Newmatrix KokkosKernels numeric int");
215 
216  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
217  if (c_nnz_size) {
218  entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
219  valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
220  }
221  KokkosSparse::spgemm_numeric(&kh, AnumRows, BnumRows, BnumCols, Aint.graph.row_map, Aint.graph.entries, Aint.values, false, Bint.graph.row_map, Bint.graph.entries, Bint.values, false, int_row_mapC, entriesC, valuesC);
222  // transfer the integer rowptrs back to the correct rowptr type
223  Kokkos::parallel_for(
224  int_row_mapC.size(), KOKKOS_LAMBDA(int i) { row_mapC(i) = int_row_mapC(i); });
225  kh.destroy_spgemm_handle();
226 
227  } else {
228  KernelHandle kh;
229  kh.create_spgemm_handle(alg_enum);
230  kh.set_team_work_size(team_work_size);
231 
232  {
233  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Newmatrix KokkosKernels symbolic non-int");
234  KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols, Amat.graph.row_map, Amat.graph.entries, false, Bmerged.graph.row_map, Bmerged.graph.entries, false, row_mapC);
235  }
236 
237  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Newmatrix KokkosKernels numeric non-int");
238  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
239  if (c_nnz_size) {
240  entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
241  valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
242  }
243 
244  KokkosSparse::spgemm_numeric(&kh, AnumRows, BnumRows, BnumCols, Amat.graph.row_map, Amat.graph.entries, Amat.values, false, Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values, false, row_mapC, entriesC, valuesC);
245 
246  kh.destroy_spgemm_handle();
247  }
248 
249  MM = Teuchos::null;
250  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaSort"));
251 
252  // Sort & set values
253  if (params.is_null() || params->get("sort entries", true))
254  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
255  C.setAllValues(row_mapC, entriesC, valuesC);
256 
257  MM = Teuchos::null;
258  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaESFC"));
259 
260  // Final Fillcomplete
261  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
262  labelList->set("Timer Label", label);
263  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
264  RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
265  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
266 }
267 
268 /*********************************************************************************************************/
269 template <class Scalar,
270  class LocalOrdinal,
271  class GlobalOrdinal,
272  class LocalOrdinalViewType>
273 void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
274  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
275  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
276  const LocalOrdinalViewType& targetMapToOrigRow_dev,
277  const LocalOrdinalViewType& targetMapToImportRow_dev,
278  const LocalOrdinalViewType& Bcol2Ccol_dev,
279  const LocalOrdinalViewType& Icol2Ccol_dev,
280  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
281  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
282  const std::string& label,
283  const Teuchos::RCP<Teuchos::ParameterList>& params) {
284  // FIXME: Right now, this is a cut-and-paste of the serial kernel
285  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
286 
287  using Teuchos::RCP;
288  using Teuchos::rcp;
289  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Reuse SerialCore"));
290 
291  // Lots and lots of typedefs
292  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
293  typedef typename KCRS::StaticCrsGraphType graph_t;
294  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
295  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
296  typedef typename KCRS::values_type::non_const_type scalar_view_t;
297 
298  typedef Scalar SC;
299  typedef LocalOrdinal LO;
300  typedef GlobalOrdinal GO;
301  typedef Node NO;
302  typedef Map<LO, GO, NO> map_type;
303  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
304  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
305  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
306 
307  // Since this is being run on Cuda, we need to fence because the below code will use UVM
308  // typename graph_t::execution_space().fence();
309 
310  // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
311  // KDDKDD UVM Ideally, this function would run on device and use
312  // KDDKDD UVM KokkosKernels instead of this host implementation.
313  auto targetMapToOrigRow =
314  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
315  targetMapToOrigRow_dev);
316  auto targetMapToImportRow =
317  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
318  targetMapToImportRow_dev);
319  auto Bcol2Ccol =
320  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
321  Bcol2Ccol_dev);
322  auto Icol2Ccol =
323  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
324  Icol2Ccol_dev);
325 
326  // Sizes
327  RCP<const map_type> Ccolmap = C.getColMap();
328  size_t m = Aview.origMatrix->getLocalNumRows();
329  size_t n = Ccolmap->getLocalNumElements();
330 
331  // Grab the Kokkos::SparseCrsMatrices & inner stuff
332  const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
333  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixHost();
334  const KCRS& Cmat = C.getLocalMatrixHost();
335 
336  c_lno_view_t Arowptr = Amat.graph.row_map,
337  Browptr = Bmat.graph.row_map,
338  Crowptr = Cmat.graph.row_map;
339  const lno_nnz_view_t Acolind = Amat.graph.entries,
340  Bcolind = Bmat.graph.entries,
341  Ccolind = Cmat.graph.entries;
342  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
343  scalar_view_t Cvals = Cmat.values;
344 
345  c_lno_view_t Irowptr;
346  lno_nnz_view_t Icolind;
347  scalar_view_t Ivals;
348  if (!Bview.importMatrix.is_null()) {
349  auto lclB = Bview.importMatrix->getLocalMatrixHost();
350  Irowptr = lclB.graph.row_map;
351  Icolind = lclB.graph.entries;
352  Ivals = lclB.values;
353  }
354 
355  // Classic csr assembly (low memory edition)
356  // mfh 27 Sep 2016: The c_status array is an implementation detail
357  // of the local sparse matrix-matrix multiply routine.
358 
359  // The status array will contain the index into colind where this entry was last deposited.
360  // c_status[i] < CSR_ip - not in the row yet
361  // c_status[i] >= CSR_ip - this is the entry where you can find the data
362  // We start with this filled with INVALID's indicating that there are no entries yet.
363  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
364  std::vector<size_t> c_status(n, ST_INVALID);
365 
366  // For each row of A/C
367  size_t CSR_ip = 0, OLD_ip = 0;
368  for (size_t i = 0; i < m; i++) {
369  // First fill the c_status array w/ locations where we're allowed to
370  // generate nonzeros for this row
371  OLD_ip = Crowptr[i];
372  CSR_ip = Crowptr[i + 1];
373  for (size_t k = OLD_ip; k < CSR_ip; k++) {
374  c_status[Ccolind[k]] = k;
375 
376  // Reset values in the row of C
377  Cvals[k] = SC_ZERO;
378  }
379 
380  for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
381  LO Aik = Acolind[k];
382  const SC Aval = Avals[k];
383  if (Aval == SC_ZERO)
384  continue;
385 
386  if (targetMapToOrigRow[Aik] != LO_INVALID) {
387  // Local matrix
388  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
389 
390  for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
391  LO Bkj = Bcolind[j];
392  LO Cij = Bcol2Ccol[Bkj];
393 
394  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
395  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
396  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
397 
398  Cvals[c_status[Cij]] += Aval * Bvals[j];
399  }
400 
401  } else {
402  // Remote matrix
403  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
404  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
405  LO Ikj = Icolind[j];
406  LO Cij = Icol2Ccol[Ikj];
407 
408  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
409  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
410  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
411 
412  Cvals[c_status[Cij]] += Aval * Ivals[j];
413  }
414  }
415  }
416  }
417 
418  C.fillComplete(C.getDomainMap(), C.getRangeMap());
419 }
420 
421 /*********************************************************************************************************/
422 template <class Scalar,
423  class LocalOrdinal,
424  class GlobalOrdinal,
425  class LocalOrdinalViewType>
426 void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
427  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
428  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
429  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
430  const LocalOrdinalViewType& Acol2Brow,
431  const LocalOrdinalViewType& Acol2Irow,
432  const LocalOrdinalViewType& Bcol2Ccol,
433  const LocalOrdinalViewType& Icol2Ccol,
434  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
435  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
436  const std::string& label,
437  const Teuchos::RCP<Teuchos::ParameterList>& params) {
438  // Node-specific code
439  using Teuchos::RCP;
440  using Teuchos::rcp;
441  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Jacobi CudaWrapper"));
442 
443  // Options
444  // int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
445  std::string myalg("KK");
446  if (!params.is_null()) {
447  if (params->isParameter("cuda: jacobi algorithm"))
448  myalg = params->get("cuda: jacobi algorithm", myalg);
449  }
450 
451  if (myalg == "MSAK") {
452  ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega, Dinv, Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
453  } else if (myalg == "KK") {
454  jacobi_A_B_newmatrix_KokkosKernels(omega, Dinv, Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
455  } else {
456  throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
457  }
458 
459  MM = Teuchos::null;
460  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaESFC"));
461 
462  // Final Fillcomplete
463  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
464  labelList->set("Timer Label", label);
465  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
466 
467  // NOTE: MSAK already fillCompletes, so we have to check here
468  if (!C.isFillComplete()) {
469  RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
470  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
471  }
472 }
473 
474 /*********************************************************************************************************/
475 template <class Scalar,
476  class LocalOrdinal,
477  class GlobalOrdinal,
478  class LocalOrdinalViewType>
479 void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
480  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
481  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
482  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
483  const LocalOrdinalViewType& targetMapToOrigRow_dev,
484  const LocalOrdinalViewType& targetMapToImportRow_dev,
485  const LocalOrdinalViewType& Bcol2Ccol_dev,
486  const LocalOrdinalViewType& Icol2Ccol_dev,
487  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
488  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
489  const std::string& label,
490  const Teuchos::RCP<Teuchos::ParameterList>& params) {
491  // FIXME: Right now, this is a cut-and-paste of the serial kernel
492  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
493 
494  using Teuchos::RCP;
495  using Teuchos::rcp;
496  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Reuse CudaCore"));
497 
498  // Lots and lots of typedefs
499  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
500  typedef typename KCRS::StaticCrsGraphType graph_t;
501  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
502  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
503  typedef typename KCRS::values_type::non_const_type scalar_view_t;
504  typedef typename scalar_view_t::memory_space scalar_memory_space;
505 
506  typedef Scalar SC;
507  typedef LocalOrdinal LO;
508  typedef GlobalOrdinal GO;
509  typedef Node NO;
510  typedef Map<LO, GO, NO> map_type;
511  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
512  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
513  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
514 
515  // Since this is being run on Cuda, we need to fence because the below host code will use UVM
516  // KDDKDD typename graph_t::execution_space().fence();
517 
518  // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
519  // KDDKDD UVM Ideally, this function would run on device and use
520  // KDDKDD UVM KokkosKernels instead of this host implementation.
521  auto targetMapToOrigRow =
522  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
523  targetMapToOrigRow_dev);
524  auto targetMapToImportRow =
525  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
526  targetMapToImportRow_dev);
527  auto Bcol2Ccol =
528  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
529  Bcol2Ccol_dev);
530  auto Icol2Ccol =
531  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
532  Icol2Ccol_dev);
533 
534  // Sizes
535  RCP<const map_type> Ccolmap = C.getColMap();
536  size_t m = Aview.origMatrix->getLocalNumRows();
537  size_t n = Ccolmap->getLocalNumElements();
538 
539  // Grab the Kokkos::SparseCrsMatrices & inner stuff
540  const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
541  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixHost();
542  const KCRS& Cmat = C.getLocalMatrixHost();
543 
544  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
545  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
546  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
547  scalar_view_t Cvals = Cmat.values;
548 
549  c_lno_view_t Irowptr;
550  lno_nnz_view_t Icolind;
551  scalar_view_t Ivals;
552  if (!Bview.importMatrix.is_null()) {
553  auto lclB = Bview.importMatrix->getLocalMatrixHost();
554  Irowptr = lclB.graph.row_map;
555  Icolind = lclB.graph.entries;
556  Ivals = lclB.values;
557  }
558 
559  // Jacobi-specific inner stuff
560  auto Dvals =
561  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
562 
563  // The status array will contain the index into colind where this entry was last deposited.
564  // c_status[i] < CSR_ip - not in the row yet
565  // c_status[i] >= CSR_ip - this is the entry where you can find the data
566  // We start with this filled with INVALID's indicating that there are no entries yet.
567  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
568  std::vector<size_t> c_status(n, ST_INVALID);
569 
570  // For each row of A/C
571  size_t CSR_ip = 0, OLD_ip = 0;
572  for (size_t i = 0; i < m; i++) {
573  // First fill the c_status array w/ locations where we're allowed to
574  // generate nonzeros for this row
575  OLD_ip = Crowptr[i];
576  CSR_ip = Crowptr[i + 1];
577  for (size_t k = OLD_ip; k < CSR_ip; k++) {
578  c_status[Ccolind[k]] = k;
579 
580  // Reset values in the row of C
581  Cvals[k] = SC_ZERO;
582  }
583 
584  SC minusOmegaDval = -omega * Dvals(i, 0);
585 
586  // Entries of B
587  for (size_t j = Browptr[i]; j < Browptr[i + 1]; j++) {
588  Scalar Bval = Bvals[j];
589  if (Bval == SC_ZERO)
590  continue;
591  LO Bij = Bcolind[j];
592  LO Cij = Bcol2Ccol[Bij];
593 
594  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
595  std::runtime_error, "Trying to insert a new entry into a static graph");
596 
597  Cvals[c_status[Cij]] = Bvals[j];
598  }
599 
600  // Entries of -omega * Dinv * A * B
601  for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
602  LO Aik = Acolind[k];
603  const SC Aval = Avals[k];
604  if (Aval == SC_ZERO)
605  continue;
606 
607  if (targetMapToOrigRow[Aik] != LO_INVALID) {
608  // Local matrix
609  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
610 
611  for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
612  LO Bkj = Bcolind[j];
613  LO Cij = Bcol2Ccol[Bkj];
614 
615  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
616  std::runtime_error, "Trying to insert a new entry into a static graph");
617 
618  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
619  }
620 
621  } else {
622  // Remote matrix
623  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
624  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
625  LO Ikj = Icolind[j];
626  LO Cij = Icol2Ccol[Ikj];
627 
628  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
629  std::runtime_error, "Trying to insert a new entry into a static graph");
630 
631  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
632  }
633  }
634  }
635  }
636 
637  MM = Teuchos::null;
638  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Reuse ESFC"));
639 
640  C.fillComplete(C.getDomainMap(), C.getRangeMap());
641 }
642 
643 /*********************************************************************************************************/
644 template <class Scalar,
645  class LocalOrdinal,
646  class GlobalOrdinal,
647  class LocalOrdinalViewType>
648 void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode, LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
649  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Dinv,
650  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
651  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
652  const LocalOrdinalViewType& Acol2Brow,
653  const LocalOrdinalViewType& Acol2Irow,
654  const LocalOrdinalViewType& Bcol2Ccol,
655  const LocalOrdinalViewType& Icol2Ccol,
656  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
657  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
658  const std::string& label,
659  const Teuchos::RCP<Teuchos::ParameterList>& params) {
660  // Check if the diagonal entries exist in debug mode
661  const bool debug = Tpetra::Details::Behavior::debug();
662  if (debug) {
663  auto rowMap = Aview.origMatrix->getRowMap();
665  Aview.origMatrix->getLocalDiagCopy(diags);
666  size_t diagLength = rowMap->getLocalNumElements();
667  Teuchos::Array<Scalar> diagonal(diagLength);
668  diags.get1dCopy(diagonal());
669 
670  for (size_t i = 0; i < diagLength; ++i) {
671  TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
672  std::runtime_error,
673  "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl
674  << "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
675  }
676  }
677 
678  using Teuchos::RCP;
679  using Teuchos::rcp;
680  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix KokkosKernels"));
681 
682  // Usings
683  using device_t = typename Tpetra::KokkosCompat::KokkosCudaWrapperNode::device_type;
685  using graph_t = typename matrix_t::StaticCrsGraphType;
686  using lno_view_t = typename graph_t::row_map_type::non_const_type;
687  using int_view_t = Kokkos::View<int*,
688  typename lno_view_t::array_layout,
689  typename lno_view_t::memory_space,
690  typename lno_view_t::memory_traits>;
691  using c_lno_view_t = typename graph_t::row_map_type::const_type;
692  using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
693  using scalar_view_t = typename matrix_t::values_type::non_const_type;
694 
695  // KokkosKernels handle
696  using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
697  typename lno_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
698  typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
699 
700  using int_handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
701  typename int_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
702  typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
703 
704  // Merge the B and Bimport matrices
705  const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C.getColMap()->getLocalNumElements());
706 
707  // Get the properties and arrays of input matrices
708  const matrix_t& Amat = Aview.origMatrix->getLocalMatrixDevice();
709  const matrix_t& Bmat = Bview.origMatrix->getLocalMatrixDevice();
710 
711  typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
712  typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
713  typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
714 
715  // Arrays of the output matrix
716  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("row_mapC"), AnumRows + 1);
717  lno_nnz_view_t entriesC;
718  scalar_view_t valuesC;
719 
720  // Options
721  int team_work_size = 16;
722  std::string myalg("SPGEMM_KK_MEMORY");
723  if (!params.is_null()) {
724  if (params->isParameter("cuda: algorithm"))
725  myalg = params->get("cuda: algorithm", myalg);
726  if (params->isParameter("cuda: team work size"))
727  team_work_size = params->get("cuda: team work size", team_work_size);
728  }
729 
730  // Get the algorithm mode
731  std::string alg("Cuda algorithm");
732  if (!params.is_null() && params->isParameter(alg)) myalg = params->get(alg, myalg);
733  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
734 
735  // decide whether to use integer-typed row pointers for this spgemm
736  Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
737  const bool useIntRowptrs =
738  irph.shouldUseIntRowptrs() &&
739  Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
740 
741  if (useIntRowptrs) {
742  int_handle_t kh;
743  kh.create_spgemm_handle(alg_enum);
744  kh.set_team_work_size(team_work_size);
745 
746  int_view_t int_row_mapC(Kokkos::ViewAllocateWithoutInitializing("int_row_mapC"), AnumRows + 1);
747 
748  auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
749  auto Bint = irph.getIntRowptrMatrix(Bmerged);
750 
751  {
752  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels symbolic int");
753  KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
754  Aint.graph.row_map, Aint.graph.entries, false,
755  Bint.graph.row_map, Bint.graph.entries, false,
756  int_row_mapC);
757  }
758 
759  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
760  if (c_nnz_size) {
761  entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
762  valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
763  }
764  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels numeric int");
765 
766  // even though there is no TPL for this, we have to use the same handle that was used in the symbolic phase,
767  // so need to have a special int-typed call for this as well.
768  KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
769  Aint.graph.row_map, Aint.graph.entries, Amat.values, false,
770  Bint.graph.row_map, Bint.graph.entries, Bint.values, false,
771  int_row_mapC, entriesC, valuesC,
772  omega, Dinv.getLocalViewDevice(Access::ReadOnly));
773  // transfer the integer rowptrs back to the correct rowptr type
774  Kokkos::parallel_for(
775  int_row_mapC.size(), KOKKOS_LAMBDA(int i) { row_mapC(i) = int_row_mapC(i); });
776  kh.destroy_spgemm_handle();
777  } else {
778  handle_t kh;
779  kh.create_spgemm_handle(alg_enum);
780  kh.set_team_work_size(team_work_size);
781 
782  {
783  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels symbolic non-int");
784  KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
785  Amat.graph.row_map, Amat.graph.entries, false,
786  Bmerged.graph.row_map, Bmerged.graph.entries, false,
787  row_mapC);
788  }
789 
790  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
791  if (c_nnz_size) {
792  entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
793  valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
794  }
795 
796  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels numeric non-int");
797  KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
798  Amat.graph.row_map, Amat.graph.entries, Amat.values, false,
799  Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values, false,
800  row_mapC, entriesC, valuesC,
801  omega, Dinv.getLocalViewDevice(Access::ReadOnly));
802  kh.destroy_spgemm_handle();
803  }
804 
805  MM = Teuchos::null;
806  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaSort"));
807 
808  // Sort & set values
809  if (params.is_null() || params->get("sort entries", true))
810  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
811  C.setAllValues(row_mapC, entriesC, valuesC);
812 
813  MM = Teuchos::null;
814  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaESFC"));
815 
816  // Final Fillcomplete
817  Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
818  labelList->set("Timer Label", label);
819  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
820  Teuchos::RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
821  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
822 }
823 
824 } // namespace MMdetails
825 } // namespace Tpetra
826 
827 #endif // CUDA
828 
829 #endif
static bool debug()
Whether Tpetra is in debug mode.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
A distributed dense vector.