TpetraExt_MatrixMatrix_Cuda.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
12 
13 #include "Tpetra_Details_IntRowPtrHelper.hpp"
14 
15 #ifdef HAVE_TPETRA_INST_CUDA
16 namespace Tpetra {
17 namespace MMdetails {
18 
19 /*********************************************************************************************************/
20 // MMM KernelWrappers for Partial Specialization to CUDA
21 template<class Scalar,
22  class LocalOrdinal,
23  class GlobalOrdinal,
24  class LocalOrdinalViewType>
25 struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
26  static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
27  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
28  const LocalOrdinalViewType & Acol2Brow,
29  const LocalOrdinalViewType & Acol2Irow,
30  const LocalOrdinalViewType & Bcol2Ccol,
31  const LocalOrdinalViewType & Icol2Ccol,
32  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
33  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
34  const std::string& label = std::string(),
35  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
36 
37 
38 
39  static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
40  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
41  const LocalOrdinalViewType & Acol2Brow,
42  const LocalOrdinalViewType & Acol2Irow,
43  const LocalOrdinalViewType & Bcol2Ccol,
44  const LocalOrdinalViewType & Icol2Ccol,
45  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
46  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
47  const std::string& label = std::string(),
48  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
49 
50 };
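// Hedged usage sketch: these CUDA kernel wrappers are normally reached through the
// TpetraExt MatrixMatrix interface rather than called directly. Assuming A and B are
// fill-complete CrsMatrix objects on a CUDA node, a call such as
//
//   using crs_t = Tpetra::CrsMatrix<Scalar, LO, GO, Tpetra::KokkosCompat::KokkosCudaWrapperNode>;
//   crs_t C(A.getRowMap(), 0);
//   Tpetra::MatrixMatrix::Multiply(A, false, B, false, C);   // C = A*B
//
// ends up dispatching to mult_A_B_newmatrix_kernel_wrapper below for the "newmatrix" case.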
51 
52 // Jacobi KernelWrappers for Partial Specialization to CUDA
53 template<class Scalar,
54  class LocalOrdinal,
55  class GlobalOrdinal, class LocalOrdinalViewType>
56 struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
57  static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
58  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
59  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
60  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
61  const LocalOrdinalViewType & Acol2Brow,
62  const LocalOrdinalViewType & Acol2Irow,
63  const LocalOrdinalViewType & Bcol2Ccol,
64  const LocalOrdinalViewType & Icol2Ccol,
65  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
66  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
67  const std::string& label = std::string(),
68  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
69 
70  static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
71  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
72  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
73  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
74  const LocalOrdinalViewType & Acol2Brow,
75  const LocalOrdinalViewType & Acol2Irow,
76  const LocalOrdinalViewType & Bcol2Ccol,
77  const LocalOrdinalViewType & Icol2Ccol,
78  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
79  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
80  const std::string& label = std::string(),
81  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
82 
83  static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
84  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
85  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
86  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
87  const LocalOrdinalViewType & Acol2Brow,
88  const LocalOrdinalViewType & Acol2Irow,
89  const LocalOrdinalViewType & Bcol2Ccol,
90  const LocalOrdinalViewType & Icol2Ccol,
91  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
92  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
93  const std::string& label = std::string(),
94  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
95 };
96 
97 
98 /*********************************************************************************************************/
99 // AB NewMatrix Kernel wrappers (KokkosKernels/CUDA Version)
100 template<class Scalar,
101  class LocalOrdinal,
102  class GlobalOrdinal,
103  class LocalOrdinalViewType>
104 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
105  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
106  const LocalOrdinalViewType & Acol2Brow,
107  const LocalOrdinalViewType & Acol2Irow,
108  const LocalOrdinalViewType & Bcol2Ccol,
109  const LocalOrdinalViewType & Icol2Ccol,
110  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
111  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
112  const std::string& label,
113  const Teuchos::RCP<Teuchos::ParameterList>& params) {
114 
115 
116 #ifdef HAVE_TPETRA_MMM_TIMINGS
117  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
118  using Teuchos::TimeMonitor;
119  Teuchos::RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaWrapper")))));
120 #endif
121  // Node-specific code
122  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
123  std::string nodename("Cuda");
124 
125  // Lots and lots of typedefs
126  using Teuchos::RCP;
127  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
128  typedef typename KCRS::device_type device_t;
129  typedef typename KCRS::StaticCrsGraphType graph_t;
130  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
131  using int_view_t = Kokkos::View<int *, typename lno_view_t::array_layout, typename lno_view_t::memory_space, typename lno_view_t::memory_traits>;
132  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
133  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
134  typedef typename KCRS::values_type::non_const_type scalar_view_t;
135  //typedef typename graph_t::row_map_type::const_type lno_view_t_const;
136 
137  // Options
138  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
139  std::string myalg("SPGEMM_KK_MEMORY");
140  if(!params.is_null()) {
141  if(params->isParameter("cuda: algorithm"))
142  myalg = params->get("cuda: algorithm",myalg);
143  if(params->isParameter("cuda: team work size"))
144  team_work_size = params->get("cuda: team work size",team_work_size);
145  }
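 // A hedged example of how a caller might select these options through the forwarded
 // ParameterList ("cuda: algorithm" and "cuda: team work size" are the keys read above):
 //
 //   Teuchos::RCP<Teuchos::ParameterList> p = Teuchos::parameterList();
 //   p->set("cuda: algorithm", "SPGEMM_KK_MEMORY");
 //   p->set("cuda: team work size", 16);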
146 
147  // KokkosKernelsHandle
148  typedef KokkosKernels::Experimental::KokkosKernelsHandle<
149  typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
150  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
151  using IntKernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
152  typename int_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
153  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
154 
155  // Grab the Kokkos::SparseCrsMatrices
156  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
157  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
158 
159  c_lno_view_t Arowptr = Amat.graph.row_map,
160  Browptr = Bmat.graph.row_map;
161  const lno_nnz_view_t Acolind = Amat.graph.entries,
162  Bcolind = Bmat.graph.entries;
163  const scalar_view_t Avals = Amat.values,
164  Bvals = Bmat.values;
165 
166  // Get the algorithm mode
167  std::string alg = nodename+std::string(" algorithm");
168  // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
169  if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
170  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
171 
172  // Merge the B and Bimport matrices
173  KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
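 // Roughly speaking, Bmerged holds B's locally owned rows together with the imported rows of
 // Bview.importMatrix, indexed so that row k corresponds to column k of A (via Acol2Brow/Acol2Irow),
 // with column indices already remapped into C's column map (via Bcol2Ccol/Icol2Ccol); the SpGEMM
 // below then computes Amat * Bmerged entirely on device.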
174 
175 
176  // if Kokkos Kernels is going to use the cuSparse TPL for SpGEMM, this matrix
177  // needs to be sorted
178  // Try to mirror the Kokkos Kernels internal SpGEMM TPL use logic
179  // inspired by https://github.com/trilinos/Trilinos/pull/11709
180  // and https://github.com/kokkos/kokkos-kernels/pull/2008
181 #if defined(KOKKOS_ENABLE_CUDA) \
182  && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) \
183  && ((CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040))
184  if constexpr (std::is_same_v<typename device_t::execution_space, Kokkos::Cuda>) {
185  if (!KokkosSparse::isCrsGraphSorted(Bmerged.graph.row_map, Bmerged.graph.entries)) {
186  KokkosSparse::sort_crs_matrix(Bmerged);
187  }
188  }
189 #endif
190 
191 #ifdef HAVE_TPETRA_MMM_TIMINGS
192  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaCore"))));
193 #endif
194 
195  // Do the multiply on whatever we've got
196  typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
197  typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
198  typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
199 
200  // Regardless of whether integer row pointers are used, we ultimately need to produce the row pointers, entries, and values of the expected types.
201  lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lno_row"), AnumRows + 1);
202  lno_nnz_view_t entriesC;
203  scalar_view_t valuesC;
204 
205  Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
206  const bool useIntRowptrs =
207  irph.shouldUseIntRowptrs() &&
208  Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
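 // Int-typed row pointers are viable only when the entry counts of both A and Bmerged fit in
 // int; that path lets the SpGEMM run with 32-bit offsets, which is what TPL backends such as
 // cuSPARSE expect, while the default path below keeps the usual size_t-based rowptrs.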
209 
210  if (useIntRowptrs) {
211  IntKernelHandle kh;
212  kh.create_spgemm_handle(alg_enum);
213  kh.set_team_work_size(team_work_size);
214 
215  int_view_t int_row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_int_row"), AnumRows + 1);
216 
217  auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
218  auto Bint = irph.getIntRowptrMatrix(Bmerged);
219  KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Aint.graph.row_map,Aint.graph.entries,false,Bint.graph.row_map,Bint.graph.entries,false, int_row_mapC);
220 
221  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
222  if (c_nnz_size){
223  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
224  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
225  }
226  KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Aint.graph.row_map,Aint.graph.entries,Aint.values,false,Bint.graph.row_map,Bint.graph.entries,Bint.values,false,int_row_mapC,entriesC,valuesC);
227  // transfer the integer rowptrs back to the correct rowptr type
228  Kokkos::parallel_for(int_row_mapC.size(), KOKKOS_LAMBDA(int i){ row_mapC(i) = int_row_mapC(i);});
229  kh.destroy_spgemm_handle();
230 
231  } else {
232  KernelHandle kh;
233  kh.create_spgemm_handle(alg_enum);
234  kh.set_team_work_size(team_work_size);
235 
236  KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,false,Bmerged.graph.row_map,Bmerged.graph.entries,false,row_mapC);
237 
238  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
239  if (c_nnz_size){
240  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
241  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
242  }
243 
244  KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,false,row_mapC,entriesC,valuesC);
245 
246  kh.destroy_spgemm_handle();
247  }
248 
249 
250 #ifdef HAVE_TPETRA_MMM_TIMINGS
251  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaSort"))));
252 #endif
253 
254  // Sort & set values
255  if (params.is_null() || params->get("sort entries",true))
256  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
257  C.setAllValues(row_mapC,entriesC,valuesC);
258 
259 #ifdef HAVE_TPETRA_MMM_TIMINGS
260  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaESFC"))));
261 #endif
262 
263  // Final Fillcomplete
264  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
265  labelList->set("Timer Label",label);
266  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
267  RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
268  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
269 }
270 
271 
272 /*********************************************************************************************************/
273 template<class Scalar,
274  class LocalOrdinal,
275  class GlobalOrdinal,
276  class LocalOrdinalViewType>
277 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
278  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
279  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
280  const LocalOrdinalViewType & targetMapToOrigRow_dev,
281  const LocalOrdinalViewType & targetMapToImportRow_dev,
282  const LocalOrdinalViewType & Bcol2Ccol_dev,
283  const LocalOrdinalViewType & Icol2Ccol_dev,
284  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
285  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
286  const std::string& label,
287  const Teuchos::RCP<Teuchos::ParameterList>& params) {
288 
289  // FIXME: Right now, this is a cut-and-paste of the serial kernel
290  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
291 
292 #ifdef HAVE_TPETRA_MMM_TIMINGS
293  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
294  using Teuchos::TimeMonitor;
295  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
296  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
297 #endif
298  using Teuchos::RCP;
299  using Teuchos::rcp;
300 
301 
302  // Lots and lots of typedefs
303  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
304  typedef typename KCRS::StaticCrsGraphType graph_t;
305  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
306  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
307  typedef typename KCRS::values_type::non_const_type scalar_view_t;
308 
309  typedef Scalar SC;
310  typedef LocalOrdinal LO;
311  typedef GlobalOrdinal GO;
312  typedef Node NO;
313  typedef Map<LO,GO,NO> map_type;
314  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
315  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
316  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
317 
318  // Since this is being run on Cuda, we need to fence because the code below will use UVM
319  // typename graph_t::execution_space().fence();
320 
321  // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
322  // KDDKDD UVM Ideally, this function would run on device and use
323  // KDDKDD UVM KokkosKernels instead of this host implementation.
324  auto targetMapToOrigRow =
325  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
326  targetMapToOrigRow_dev);
327  auto targetMapToImportRow =
328  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
329  targetMapToImportRow_dev);
330  auto Bcol2Ccol =
331  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
332  Bcol2Ccol_dev);
333  auto Icol2Ccol =
334  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
335  Icol2Ccol_dev);
336 
337  // Sizes
338  RCP<const map_type> Ccolmap = C.getColMap();
339  size_t m = Aview.origMatrix->getLocalNumRows();
340  size_t n = Ccolmap->getLocalNumElements();
341 
342  // Grab the Kokkos::SparseCrsMatrices & inner stuff
343  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
344  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
345  const KCRS & Cmat = C.getLocalMatrixHost();
346 
347  c_lno_view_t Arowptr = Amat.graph.row_map,
348  Browptr = Bmat.graph.row_map,
349  Crowptr = Cmat.graph.row_map;
350  const lno_nnz_view_t Acolind = Amat.graph.entries,
351  Bcolind = Bmat.graph.entries,
352  Ccolind = Cmat.graph.entries;
353  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
354  scalar_view_t Cvals = Cmat.values;
355 
356  c_lno_view_t Irowptr;
357  lno_nnz_view_t Icolind;
358  scalar_view_t Ivals;
359  if(!Bview.importMatrix.is_null()) {
360  auto lclB = Bview.importMatrix->getLocalMatrixHost();
361  Irowptr = lclB.graph.row_map;
362  Icolind = lclB.graph.entries;
363  Ivals = lclB.values;
364  }
365 
366 #ifdef HAVE_TPETRA_MMM_TIMINGS
367  MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
368 #endif
369 
370  // Classic csr assembly (low memory edition)
371  // mfh 27 Sep 2016: The c_status array is an implementation detail
372  // of the local sparse matrix-matrix multiply routine.
373 
374  // The status array will contain the index into colind where this entry was last deposited.
375  // c_status[i] < CSR_ip - not in the row yet
376  // c_status[i] >= CSR_ip - this is the entry where you can find the data
377  // We start with this filled with INVALID's indicating that there are no entries yet.
378  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
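 // Small worked example: if row i of C owns entries Ccolind = {2, 5, 7} stored at positions
 // 3..5 (OLD_ip = 3, CSR_ip = 6), the loop below sets c_status[2] = 3, c_status[5] = 4,
 // c_status[7] = 5, so a contribution to column 5 of this row accumulates into Cvals[4];
 // any column not in the row still holds ST_INVALID and trips the static-graph exception.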
379  std::vector<size_t> c_status(n, ST_INVALID);
380 
381  // For each row of A/C
382  size_t CSR_ip = 0, OLD_ip = 0;
383  for (size_t i = 0; i < m; i++) {
384  // First fill the c_status array w/ locations where we're allowed to
385  // generate nonzeros for this row
386  OLD_ip = Crowptr[i];
387  CSR_ip = Crowptr[i+1];
388  for (size_t k = OLD_ip; k < CSR_ip; k++) {
389  c_status[Ccolind[k]] = k;
390 
391  // Reset values in the row of C
392  Cvals[k] = SC_ZERO;
393  }
394 
395  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
396  LO Aik = Acolind[k];
397  const SC Aval = Avals[k];
398  if (Aval == SC_ZERO)
399  continue;
400 
401  if (targetMapToOrigRow[Aik] != LO_INVALID) {
402  // Local matrix
403  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
404 
405  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
406  LO Bkj = Bcolind[j];
407  LO Cij = Bcol2Ccol[Bkj];
408 
409  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
410  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
411  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
412 
413  Cvals[c_status[Cij]] += Aval * Bvals[j];
414  }
415 
416  } else {
417  // Remote matrix
418  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
419  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
420  LO Ikj = Icolind[j];
421  LO Cij = Icol2Ccol[Ikj];
422 
423  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
424  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
425  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
426 
427  Cvals[c_status[Cij]] += Aval * Ivals[j];
428  }
429  }
430  }
431  }
432 
433  C.fillComplete(C.getDomainMap(), C.getRangeMap());
434 }
435 
436 /*********************************************************************************************************/
437 template<class Scalar,
438  class LocalOrdinal,
439  class GlobalOrdinal,
440  class LocalOrdinalViewType>
441 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
442  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
443  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
444  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
445  const LocalOrdinalViewType & Acol2Brow,
446  const LocalOrdinalViewType & Acol2Irow,
447  const LocalOrdinalViewType & Bcol2Ccol,
448  const LocalOrdinalViewType & Icol2Ccol,
449  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
450  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
451  const std::string& label,
452  const Teuchos::RCP<Teuchos::ParameterList>& params) {
453 
454 #ifdef HAVE_TPETRA_MMM_TIMINGS
455  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
456  using Teuchos::TimeMonitor;
457  Teuchos::RCP<TimeMonitor> MM;
458 #endif
459 
460  // Node-specific code
461  using Teuchos::RCP;
462 
463  // Options
464  //int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
465  std::string myalg("KK");
466  if(!params.is_null()) {
467  if(params->isParameter("cuda: jacobi algorithm"))
468  myalg = params->get("cuda: jacobi algorithm",myalg);
469  }
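 // Hedged example: to force the multiply-scale-add path instead of the fused KokkosKernels
 // Jacobi SpGEMM, a caller could pass
 //
 //   params->set("cuda: jacobi algorithm", "MSAK");
 //
 // Any value other than "MSAK" or "KK" makes the dispatch below throw.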
470 
471  if(myalg == "MSAK") {
472  ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
473  }
474  else if(myalg == "KK") {
475  jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
476  }
477  else {
478  throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
479  }
480 
481 #ifdef HAVE_TPETRA_MMM_TIMINGS
482  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
483 #endif
484 
485  // Final Fillcomplete
486  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
487  labelList->set("Timer Label",label);
488  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
489 
490  // NOTE: MSAK already fillCompletes, so we have to check here
491  if(!C.isFillComplete()) {
492  RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
493  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
494  }
495 
496 }
497 
498 
499 
500 /*********************************************************************************************************/
501 template<class Scalar,
502  class LocalOrdinal,
503  class GlobalOrdinal,
504  class LocalOrdinalViewType>
505 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
506  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
507  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
508  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
509  const LocalOrdinalViewType & targetMapToOrigRow_dev,
510  const LocalOrdinalViewType & targetMapToImportRow_dev,
511  const LocalOrdinalViewType & Bcol2Ccol_dev,
512  const LocalOrdinalViewType & Icol2Ccol_dev,
513  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
514  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
515  const std::string& label,
516  const Teuchos::RCP<Teuchos::ParameterList>& params) {
517 
518  // FIXME: Right now, this is a cut-and-paste of the serial kernel
519  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
520 
521 #ifdef HAVE_TPETRA_MMM_TIMINGS
522  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
523  using Teuchos::TimeMonitor;
524  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore"))));
525  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
526 #endif
527  using Teuchos::RCP;
528  using Teuchos::rcp;
529 
530  // Lots and lots of typedefs
531  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
532  typedef typename KCRS::StaticCrsGraphType graph_t;
533  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
534  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
535  typedef typename KCRS::values_type::non_const_type scalar_view_t;
536  typedef typename scalar_view_t::memory_space scalar_memory_space;
537 
538  typedef Scalar SC;
539  typedef LocalOrdinal LO;
540  typedef GlobalOrdinal GO;
541  typedef Node NO;
542  typedef Map<LO,GO,NO> map_type;
543  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
544  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
545  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
546 
547  // Since this is being run on Cuda, we need to fence because the host code below will use UVM
548  // KDDKDD typename graph_t::execution_space().fence();
549 
550  // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
551  // KDDKDD UVM Ideally, this function would run on device and use
552  // KDDKDD UVM KokkosKernels instead of this host implementation.
553  auto targetMapToOrigRow =
554  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
555  targetMapToOrigRow_dev);
556  auto targetMapToImportRow =
557  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
558  targetMapToImportRow_dev);
559  auto Bcol2Ccol =
560  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
561  Bcol2Ccol_dev);
562  auto Icol2Ccol =
563  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
564  Icol2Ccol_dev);
565 
566 
567  // Sizes
568  RCP<const map_type> Ccolmap = C.getColMap();
569  size_t m = Aview.origMatrix->getLocalNumRows();
570  size_t n = Ccolmap->getLocalNumElements();
571 
572  // Grab the Kokkos::SparseCrsMatrices & inner stuff
573  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
574  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
575  const KCRS & Cmat = C.getLocalMatrixHost();
576 
577  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
578  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
579  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
580  scalar_view_t Cvals = Cmat.values;
581 
582  c_lno_view_t Irowptr;
583  lno_nnz_view_t Icolind;
584  scalar_view_t Ivals;
585  if(!Bview.importMatrix.is_null()) {
586  auto lclB = Bview.importMatrix->getLocalMatrixHost();
587  Irowptr = lclB.graph.row_map;
588  Icolind = lclB.graph.entries;
589  Ivals = lclB.values;
590  }
591 
592  // Jacobi-specific inner stuff
593  auto Dvals =
594  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
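 // The loops below recompute C = (I - omega * Dinv * A) * B row by row into C's existing
 // (static) sparsity pattern: the entries of B are copied in first, then
 // -omega * Dinv(i,i) * A(i,:) * B is accumulated on top.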
595 
596 #ifdef HAVE_TPETRA_MMM_TIMINGS
597  MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore - Compare"))));
598 #endif
599 
600  // The status array will contain the index into colind where this entry was last deposited.
601  // c_status[i] < CSR_ip - not in the row yet
602  // c_status[i] >= CSR_ip - this is the entry where you can find the data
603  // We start with this filled with INVALID's indicating that there are no entries yet.
604  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
605  std::vector<size_t> c_status(n, ST_INVALID);
606 
607  // For each row of A/C
608  size_t CSR_ip = 0, OLD_ip = 0;
609  for (size_t i = 0; i < m; i++) {
610 
611  // First fill the c_status array w/ locations where we're allowed to
612  // generate nonzeros for this row
613  OLD_ip = Crowptr[i];
614  CSR_ip = Crowptr[i+1];
615  for (size_t k = OLD_ip; k < CSR_ip; k++) {
616  c_status[Ccolind[k]] = k;
617 
618  // Reset values in the row of C
619  Cvals[k] = SC_ZERO;
620  }
621 
622  SC minusOmegaDval = -omega*Dvals(i,0);
623 
624  // Entries of B
625  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
626  Scalar Bval = Bvals[j];
627  if (Bval == SC_ZERO)
628  continue;
629  LO Bij = Bcolind[j];
630  LO Cij = Bcol2Ccol[Bij];
631 
632  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
633  std::runtime_error, "Trying to insert a new entry into a static graph");
634 
635  Cvals[c_status[Cij]] = Bvals[j];
636  }
637 
638  // Entries of -omega * Dinv * A * B
639  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
640  LO Aik = Acolind[k];
641  const SC Aval = Avals[k];
642  if (Aval == SC_ZERO)
643  continue;
644 
645  if (targetMapToOrigRow[Aik] != LO_INVALID) {
646  // Local matrix
647  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
648 
649  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
650  LO Bkj = Bcolind[j];
651  LO Cij = Bcol2Ccol[Bkj];
652 
653  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
654  std::runtime_error, "Trying to insert a new entry into a static graph");
655 
656  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
657  }
658 
659  } else {
660  // Remote matrix
661  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
662  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
663  LO Ikj = Icolind[j];
664  LO Cij = Icol2Ccol[Ikj];
665 
666  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
667  std::runtime_error, "Trying to insert a new entry into a static graph");
668 
669  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
670  }
671  }
672  }
673  }
674 
675 #ifdef HAVE_TPETRA_MMM_TIMINGS
676  MM2= Teuchos::null;
677  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
678 #endif
679 
680  C.fillComplete(C.getDomainMap(), C.getRangeMap());
681 
682 }
683 
684 /*********************************************************************************************************/
685 template<class Scalar,
686  class LocalOrdinal,
687  class GlobalOrdinal,
688  class LocalOrdinalViewType>
689 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
690  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
691  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
692  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
693  const LocalOrdinalViewType & Acol2Brow,
694  const LocalOrdinalViewType & Acol2Irow,
695  const LocalOrdinalViewType & Bcol2Ccol,
696  const LocalOrdinalViewType & Icol2Ccol,
697  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
698  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
699  const std::string& label,
700  const Teuchos::RCP<Teuchos::ParameterList>& params) {
701 
702 #ifdef HAVE_TPETRA_MMM_TIMINGS
703  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
704  using Teuchos::TimeMonitor;
705  Teuchos::RCP<TimeMonitor> MM;
706 #endif
707 
708  // Check if the diagonal entries exist in debug mode
709  const bool debug = Tpetra::Details::Behavior::debug();
710  if(debug) {
711 
712  auto rowMap = Aview.origMatrix->getRowMap();
713  Tpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> diags(rowMap);
714  Aview.origMatrix->getLocalDiagCopy(diags);
715  size_t diagLength = rowMap->getLocalNumElements();
716  Teuchos::Array<Scalar> diagonal(diagLength);
717  diags.get1dCopy(diagonal());
718 
719  for(size_t i = 0; i < diagLength; ++i) {
720  TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
721  std::runtime_error,
722  "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
723  "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
724  }
725  }
726 
727  // Usings
728  using device_t = typename Tpetra::KokkosCompat::KokkosCudaWrapperNode::device_type;
729  using matrix_t = typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode>::local_matrix_device_type;
730  using graph_t = typename matrix_t::StaticCrsGraphType;
731  using lno_view_t = typename graph_t::row_map_type::non_const_type;
732  using int_view_t = Kokkos::View<int *,
733  typename lno_view_t::array_layout,
734  typename lno_view_t::memory_space,
735  typename lno_view_t::memory_traits>;
736  using c_lno_view_t = typename graph_t::row_map_type::const_type;
737  using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
738  using scalar_view_t = typename matrix_t::values_type::non_const_type;
739 
740  // KokkosKernels handle
741  using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
742  typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
743  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
744 
745  using int_handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
746  typename int_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
747  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
748 
749  // Merge the B and Bimport matrices
750  const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
751 
752  // Get the properties and arrays of input matrices
753  const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
754  const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
755 
756  typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
757  typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
758  typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
759 
760  // Arrays of the output matrix
761  lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("row_mapC"), AnumRows + 1);
762  lno_nnz_view_t entriesC;
763  scalar_view_t valuesC;
764 
765  // Options
766  int team_work_size = 16;
767  std::string myalg("SPGEMM_KK_MEMORY");
768  if(!params.is_null()) {
769  if(params->isParameter("cuda: algorithm"))
770  myalg = params->get("cuda: algorithm",myalg);
771  if(params->isParameter("cuda: team work size"))
772  team_work_size = params->get("cuda: team work size",team_work_size);
773  }
774 
775  // Get the algorithm mode
776  std::string alg("Cuda algorithm");
777  if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
778  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
779 
780  // decide whether to use integer-typed row pointers for this spgemm
781  Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
782  const bool useIntRowptrs =
783  irph.shouldUseIntRowptrs() &&
784  Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
785 
786  if (useIntRowptrs) {
787  int_handle_t kh;
788  kh.create_spgemm_handle(alg_enum);
789  kh.set_team_work_size(team_work_size);
790 
791  int_view_t int_row_mapC (Kokkos::ViewAllocateWithoutInitializing("int_row_mapC"), AnumRows + 1);
792 
793  auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
794  auto Bint = irph.getIntRowptrMatrix(Bmerged);
795 
796  KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
797  Aint.graph.row_map, Aint.graph.entries, false,
798  Bint.graph.row_map, Bint.graph.entries, false,
799  int_row_mapC);
800 
801  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
802  if (c_nnz_size){
803  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
804  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
805  }
806 
807  // even though there is no TPL for this, we have to use the same handle that was used in the symbolic phase,
808  // so need to have a special int-typed call for this as well.
809  KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
810  Aint.graph.row_map, Aint.graph.entries, Amat.values, false,
811  Bint.graph.row_map, Bint.graph.entries, Bint.values, false,
812  int_row_mapC, entriesC, valuesC,
813  omega, Dinv.getLocalViewDevice(Access::ReadOnly));
814  // transfer the integer rowptrs back to the correct rowptr type
815  Kokkos::parallel_for(int_row_mapC.size(), KOKKOS_LAMBDA(int i){ row_mapC(i) = int_row_mapC(i);});
816  kh.destroy_spgemm_handle();
817  } else {
818  handle_t kh;
819  kh.create_spgemm_handle(alg_enum);
820  kh.set_team_work_size(team_work_size);
821 
822  KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
823  Amat.graph.row_map, Amat.graph.entries, false,
824  Bmerged.graph.row_map, Bmerged.graph.entries, false,
825  row_mapC);
826 
827  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
828  if (c_nnz_size){
829  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
830  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
831  }
832 
833  KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
834  Amat.graph.row_map, Amat.graph.entries, Amat.values, false,
835  Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values, false,
836  row_mapC, entriesC, valuesC,
837  omega, Dinv.getLocalViewDevice(Access::ReadOnly));
838  kh.destroy_spgemm_handle();
839  }
840 
841 
842 
843 #ifdef HAVE_TPETRA_MMM_TIMINGS
844  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaSort"))));
845 #endif
846 
847  // Sort & set values
848  if (params.is_null() || params->get("sort entries",true))
849  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
850  C.setAllValues(row_mapC,entriesC,valuesC);
851 
852 #ifdef HAVE_TPETRA_MMM_TIMINGS
853  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
854 #endif
855 
856  // Final Fillcomplete
857  Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
858  labelList->set("Timer Label",label);
859  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
860  Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
861  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
862 }
863 
864  }//MMdetails
865 }//Tpetra
866 
867 #endif//CUDA
868 
869 #endif