TpetraExt_MatrixMatrix_Cuda.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
12 
13 #include "Tpetra_Details_IntRowPtrHelper.hpp"
14 
15 #ifdef HAVE_TPETRA_INST_CUDA
16 namespace Tpetra {
17 namespace MMdetails {
18 
19 /*********************************************************************************************************/
20 // MMM KernelWrappers for Partial Specialization to CUDA
21 template<class Scalar,
22  class LocalOrdinal,
23  class GlobalOrdinal,
24  class LocalOrdinalViewType>
25 struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
26  static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
27  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
28  const LocalOrdinalViewType & Acol2Brow,
29  const LocalOrdinalViewType & Acol2Irow,
30  const LocalOrdinalViewType & Bcol2Ccol,
31  const LocalOrdinalViewType & Icol2Ccol,
32  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
33  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
34  const std::string& label = std::string(),
35  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
36 
37 
38 
39  static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
40  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
41  const LocalOrdinalViewType & Acol2Brow,
42  const LocalOrdinalViewType & Acol2Irow,
43  const LocalOrdinalViewType & Bcol2Ccol,
44  const LocalOrdinalViewType & Icol2Ccol,
45  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
46  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
47  const std::string& label = std::string(),
48  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
49 
50 };
51 
52 // Jacobi KernelWrappers for Partial Specialization to Cuda
53 template<class Scalar,
54  class LocalOrdinal,
55  class GlobalOrdinal, class LocalOrdinalViewType>
56 struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
57  static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
58  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
59  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
60  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
61  const LocalOrdinalViewType & Acol2Brow,
62  const LocalOrdinalViewType & Acol2Irow,
63  const LocalOrdinalViewType & Bcol2Ccol,
64  const LocalOrdinalViewType & Icol2Ccol,
65  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
66  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
67  const std::string& label = std::string(),
68  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
69 
70  static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
71  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
72  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
73  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
74  const LocalOrdinalViewType & Acol2Brow,
75  const LocalOrdinalViewType & Acol2Irow,
76  const LocalOrdinalViewType & Bcol2Ccol,
77  const LocalOrdinalViewType & Icol2Ccol,
78  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
79  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
80  const std::string& label = std::string(),
81  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
82 
83  static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
84  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
85  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
86  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
87  const LocalOrdinalViewType & Acol2Brow,
88  const LocalOrdinalViewType & Acol2Irow,
89  const LocalOrdinalViewType & Bcol2Ccol,
90  const LocalOrdinalViewType & Icol2Ccol,
91  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
92  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
93  const std::string& label = std::string(),
94  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
95 };
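The two structs above are internal dispatch points: the generic sparse matrix-matrix multiply and Jacobi code paths call these static members when the Node type is the CUDA wrapper node. A minimal usage sketch (an editorial illustration, not part of this header) of how application code reaches them through the public Tpetra::MatrixMatrix interface follows; the problem size, matrix values, and variable names are placeholders.

#include <Tpetra_Core.hpp>
#include <Tpetra_Map.hpp>
#include <Tpetra_CrsMatrix.hpp>
#include <Tpetra_Vector.hpp>
#include <TpetraExt_MatrixMatrix.hpp>

int main(int argc, char* argv[]) {
  Tpetra::ScopeGuard tpetraScope(&argc, &argv);
  {
    using map_type        = Tpetra::Map<>;
    using crs_matrix_type = Tpetra::CrsMatrix<>;   // default Scalar/LO/GO/Node
    using vector_type     = Tpetra::Vector<>;
    using LO = map_type::local_ordinal_type;
    using GO = map_type::global_ordinal_type;

    auto comm = Tpetra::getDefaultComm();
    const Tpetra::global_size_t numGlobal = 50;    // placeholder problem size
    auto map = Teuchos::rcp(new map_type(numGlobal, 0, comm));

    // Build two simple diagonal matrices: A = 2*I and B = I.
    crs_matrix_type A(map, 1);
    crs_matrix_type B(map, 1);
    for (size_t i = 0; i < map->getLocalNumElements(); ++i) {
      const GO g = map->getGlobalElement(static_cast<LO>(i));
      A.insertGlobalValues(g, Teuchos::tuple<GO>(g), Teuchos::tuple<double>(2.0));
      B.insertGlobalValues(g, Teuchos::tuple<GO>(g), Teuchos::tuple<double>(1.0));
    }
    A.fillComplete();
    B.fillComplete();

    // C = A*B.  On a CUDA-enabled build this dispatches to
    // KernelWrappers<...>::mult_A_B_newmatrix_kernel_wrapper.
    crs_matrix_type C(map, 1);
    Tpetra::MatrixMatrix::Multiply(A, false, B, false, C);

    // Cj = (I - omega * Dinv * A) * B.  This dispatches to
    // KernelWrappers2<...>::jacobi_A_B_newmatrix_kernel_wrapper.
    vector_type diag(map), Dinv(map);
    A.getLocalDiagCopy(diag);   // diag = diagonal of A
    Dinv.reciprocal(diag);      // Dinv = 1 ./ diag
    crs_matrix_type Cj(map, 2);
    Tpetra::MatrixMatrix::Jacobi(0.5, Dinv, A, B, Cj);
  }
  return 0;
}

Both Multiply and Jacobi also accept an optional timer label and Teuchos::ParameterList as trailing arguments; that list is the params object the kernel wrappers below consult for the "cuda: ..." options.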
96 
97 
98 /*********************************************************************************************************/
99 // AB NewMatrix Kernel wrappers (KokkosKernels/CUDA Version)
100 template<class Scalar,
101  class LocalOrdinal,
102  class GlobalOrdinal,
103  class LocalOrdinalViewType>
104 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
105  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
106  const LocalOrdinalViewType & Acol2Brow,
107  const LocalOrdinalViewType & Acol2Irow,
108  const LocalOrdinalViewType & Bcol2Ccol,
109  const LocalOrdinalViewType & Icol2Ccol,
110  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
111  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
112  const std::string& label,
113  const Teuchos::RCP<Teuchos::ParameterList>& params) {
114 
115  using Teuchos::RCP;
116  using Teuchos::rcp;
117  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaWrapper"));
118 
119  // Node-specific code
120  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
121  std::string nodename("Cuda");
122 
123  // Lots and lots of typedefs
124  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
125  typedef typename KCRS::device_type device_t;
126  typedef typename KCRS::StaticCrsGraphType graph_t;
127  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
128  using int_view_t = Kokkos::View<int *, typename lno_view_t::array_layout, typename lno_view_t::memory_space, typename lno_view_t::memory_traits>;
129  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
130  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
131  typedef typename KCRS::values_type::non_const_type scalar_view_t;
132  //typedef typename graph_t::row_map_type::const_type lno_view_t_const;
133 
134  // Options
135  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
136  std::string myalg("SPGEMM_KK_MEMORY");
137  if(!params.is_null()) {
138  if(params->isParameter("cuda: algorithm"))
139  myalg = params->get("cuda: algorithm",myalg);
140  if(params->isParameter("cuda: team work size"))
141  team_work_size = params->get("cuda: team work size",team_work_size);
142  }
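  // Editorial sketch (not part of the original source): these options reach this
  // kernel through the ParameterList passed to Tpetra::MatrixMatrix::Multiply,
  // for example:
  //
  //   Teuchos::RCP<Teuchos::ParameterList> p = Teuchos::parameterList();
  //   p->set("cuda: algorithm", "SPGEMM_KK_MEMORY"); // any KokkosSparse SpGEMM algorithm name
  //   p->set("cuda: team work size", 16);
  //   p->set("sort entries", true);                  // consulted in the sort step below
  //   Tpetra::MatrixMatrix::Multiply(A, false, B, false, C, true, "myMMM", p);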
143 
144  // KokkosKernelsHandle
145  typedef KokkosKernels::Experimental::KokkosKernelsHandle<
146  typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
147  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
148  using IntKernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
149  typename int_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
150  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
151 
152  // Grab the Kokkos::SparseCrsMatrices
153  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
154  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
155 
156  c_lno_view_t Arowptr = Amat.graph.row_map,
157  Browptr = Bmat.graph.row_map;
158  const lno_nnz_view_t Acolind = Amat.graph.entries,
159  Bcolind = Bmat.graph.entries;
160  const scalar_view_t Avals = Amat.values,
161  Bvals = Bmat.values;
162 
163  // Get the algorithm mode
164  std::string alg = nodename+std::string(" algorithm");
165  // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
166  if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
167  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
168 
169  // Merge the B and Bimport matrices
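  // (Bmerged combines the locally owned rows of B with the imported ghost rows of
  //  B, with the columns renumbered into C's column map, so a single local SpGEMM
  //  against A can be performed below.)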
170  KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
171 
172 
173  // if Kokkos Kernels is going to use the cuSparse TPL for SpGEMM, this matrix
174  // needs to be sorted
175  // Try to mirror the Kokkos Kernels internal SpGEMM TPL use logic
176  // inspired by https://github.com/trilinos/Trilinos/pull/11709
177  // and https://github.com/kokkos/kokkos-kernels/pull/2008
178 #if defined(KOKKOS_ENABLE_CUDA) \
179  && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) \
180  && ((CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040))
181  if constexpr (std::is_same_v<typename device_t::execution_space, Kokkos::Cuda>) {
182  if (!KokkosSparse::isCrsGraphSorted(Bmerged.graph.row_map, Bmerged.graph.entries)) {
183  KokkosSparse::sort_crs_matrix(Bmerged);
184  }
185  }
186 #endif
187 
188  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaCore"));
189 
190  // Do the multiply on whatever we've got
191  typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
192  typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
193  typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
194 
195  // regardless of whether integer row ptrs are used, need to ultimately produce the row pointer, entries, and values of the expected types
196  lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lno_row"), AnumRows + 1);
197  lno_nnz_view_t entriesC;
198  scalar_view_t valuesC;
199 
200  Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
201  const bool useIntRowptrs =
202  irph.shouldUseIntRowptrs() &&
203  Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
204 
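  // Editorial note: when the nonzero counts fit in "int", the branch below builds
  // views of A and Bmerged whose row offsets are 32-bit ints; that layout lets
  // KokkosKernels take its TPL code paths (e.g. cuSPARSE) for the SpGEMM.  The
  // resulting int row pointer is widened back into row_mapC afterwards.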
205  if (useIntRowptrs) {
206  IntKernelHandle kh;
207  kh.create_spgemm_handle(alg_enum);
208  kh.set_team_work_size(team_work_size);
209 
210  int_view_t int_row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_int_row"), AnumRows + 1);
211 
212  auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
213  auto Bint = irph.getIntRowptrMatrix(Bmerged);
214 
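  // The SpGEMM runs in two phases: spgemm_symbolic computes C's row pointer
  // (and hence its nonzero count), then spgemm_numeric fills C's column
  // indices and values using the same kernel handle.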
215  {
216  Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: Newmatrix KokkosKernels symbolic int");
217  KokkosSparse::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Aint.graph.row_map,Aint.graph.entries,false,Bint.graph.row_map,Bint.graph.entries,false, int_row_mapC);
218  }
219 
220  Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: Newmatrix KokkosKernels numeric int");
221 
222  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
223  if (c_nnz_size){
224  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
225  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
226  }
227  KokkosSparse::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Aint.graph.row_map,Aint.graph.entries,Aint.values,false,Bint.graph.row_map,Bint.graph.entries,Bint.values,false,int_row_mapC,entriesC,valuesC);
228  // transfer the integer rowptrs back to the correct rowptr type
229  Kokkos::parallel_for(int_row_mapC.size(), KOKKOS_LAMBDA(int i){ row_mapC(i) = int_row_mapC(i);});
230  kh.destroy_spgemm_handle();
231 
232  } else {
233  KernelHandle kh;
234  kh.create_spgemm_handle(alg_enum);
235  kh.set_team_work_size(team_work_size);
236 
237  {
238  Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: Newmatrix KokkosKernels symbolic non-int");
239  KokkosSparse::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,false,Bmerged.graph.row_map,Bmerged.graph.entries,false,row_mapC);
240  }
241 
242  Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: Newmatrix KokkosKernels numeric non-int");
243  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
244  if (c_nnz_size){
245  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
246  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
247  }
248 
249  KokkosSparse::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,false,row_mapC,entriesC,valuesC);
250 
251  kh.destroy_spgemm_handle();
252  }
253 
254 
255  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaSort"));
256 
257  // Sort & set values
258  if (params.is_null() || params->get("sort entries",true))
259  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
260  C.setAllValues(row_mapC,entriesC,valuesC);
261 
262  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Newmatrix CudaESFC"));
263 
264  // Final Fillcomplete
265  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
266  labelList->set("Timer Label",label);
267  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
268  RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
269  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
270 }
271 
272 
273 /*********************************************************************************************************/
274 template<class Scalar,
275  class LocalOrdinal,
276  class GlobalOrdinal,
277  class LocalOrdinalViewType>
278 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
279  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
280  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
281  const LocalOrdinalViewType & targetMapToOrigRow_dev,
282  const LocalOrdinalViewType & targetMapToImportRow_dev,
283  const LocalOrdinalViewType & Bcol2Ccol_dev,
284  const LocalOrdinalViewType & Icol2Ccol_dev,
285  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
286  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
287  const std::string& label,
288  const Teuchos::RCP<Teuchos::ParameterList>& params) {
289 
290  // FIXME: Right now, this is a cut-and-paste of the serial kernel
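  // "Reuse" means C already has its graph from an earlier newmatrix call with the
  // same sparsity pattern; this kernel only recomputes C's values in place, working
  // on host over the existing CRS arrays.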
291  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
292 
293  using Teuchos::RCP;
294  using Teuchos::rcp;
295  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Reuse SerialCore"));
296 
297  // Lots and lots of typedefs
298  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
299  typedef typename KCRS::StaticCrsGraphType graph_t;
300  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
301  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
302  typedef typename KCRS::values_type::non_const_type scalar_view_t;
303 
304  typedef Scalar SC;
305  typedef LocalOrdinal LO;
306  typedef GlobalOrdinal GO;
307  typedef Node NO;
308  typedef Map<LO,GO,NO> map_type;
309  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
310  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
311  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
312 
313  // Since this is being run on Cuda, we need to fence because the below code will use UVM
314  // typename graph_t::execution_space().fence();
315 
316  // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
317  // KDDKDD UVM Ideally, this function would run on device and use
318  // KDDKDD UVM KokkosKernels instead of this host implementation.
319  auto targetMapToOrigRow =
320  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
321  targetMapToOrigRow_dev);
322  auto targetMapToImportRow =
323  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
324  targetMapToImportRow_dev);
325  auto Bcol2Ccol =
326  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
327  Bcol2Ccol_dev);
328  auto Icol2Ccol =
329  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
330  Icol2Ccol_dev);
331 
332  // Sizes
333  RCP<const map_type> Ccolmap = C.getColMap();
334  size_t m = Aview.origMatrix->getLocalNumRows();
335  size_t n = Ccolmap->getLocalNumElements();
336 
337  // Grab the Kokkos::SparseCrsMatrices & inner stuff
338  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
339  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
340  const KCRS & Cmat = C.getLocalMatrixHost();
341 
342  c_lno_view_t Arowptr = Amat.graph.row_map,
343  Browptr = Bmat.graph.row_map,
344  Crowptr = Cmat.graph.row_map;
345  const lno_nnz_view_t Acolind = Amat.graph.entries,
346  Bcolind = Bmat.graph.entries,
347  Ccolind = Cmat.graph.entries;
348  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
349  scalar_view_t Cvals = Cmat.values;
350 
351  c_lno_view_t Irowptr;
352  lno_nnz_view_t Icolind;
353  scalar_view_t Ivals;
354  if(!Bview.importMatrix.is_null()) {
355  auto lclB = Bview.importMatrix->getLocalMatrixHost();
356  Irowptr = lclB.graph.row_map;
357  Icolind = lclB.graph.entries;
358  Ivals = lclB.values;
359  }
360 
361  // Classic csr assembly (low memory edition)
362  // mfh 27 Sep 2016: The c_status array is an implementation detail
363  // of the local sparse matrix-matrix multiply routine.
364 
365  // The status array will contain the index into colind where this entry was last deposited.
366  // c_status[i] < CSR_ip - not in the row yet
367  // c_status[i] >= CSR_ip - this is the entry where you can find the data
368  // We start with this filled with INVALID's indicating that there are no entries yet.
369  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
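  // For example, if row i of C occupies positions 10..12 of Ccolind with columns
  // {2, 5, 7}, then after the prefill below c_status[2] = 10, c_status[5] = 11,
  // and c_status[7] = 12; every other c_status entry lies outside [OLD_ip, CSR_ip),
  // so an attempted insertion into any other column is caught as an error.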
370  std::vector<size_t> c_status(n, ST_INVALID);
371 
372  // For each row of A/C
373  size_t CSR_ip = 0, OLD_ip = 0;
374  for (size_t i = 0; i < m; i++) {
375  // First fill the c_status array w/ locations where we're allowed to
376  // generate nonzeros for this row
377  OLD_ip = Crowptr[i];
378  CSR_ip = Crowptr[i+1];
379  for (size_t k = OLD_ip; k < CSR_ip; k++) {
380  c_status[Ccolind[k]] = k;
381 
382  // Reset values in the row of C
383  Cvals[k] = SC_ZERO;
384  }
385 
386  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
387  LO Aik = Acolind[k];
388  const SC Aval = Avals[k];
389  if (Aval == SC_ZERO)
390  continue;
391 
392  if (targetMapToOrigRow[Aik] != LO_INVALID) {
393  // Local matrix
394  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
395 
396  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
397  LO Bkj = Bcolind[j];
398  LO Cij = Bcol2Ccol[Bkj];
399 
400  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
401  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
402  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
403 
404  Cvals[c_status[Cij]] += Aval * Bvals[j];
405  }
406 
407  } else {
408  // Remote matrix
409  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
410  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
411  LO Ikj = Icolind[j];
412  LO Cij = Icol2Ccol[Ikj];
413 
414  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
415  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
416  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
417 
418  Cvals[c_status[Cij]] += Aval * Ivals[j];
419  }
420  }
421  }
422  }
423 
424  C.fillComplete(C.getDomainMap(), C.getRangeMap());
425 }
426 
427 /*********************************************************************************************************/
428 template<class Scalar,
429  class LocalOrdinal,
430  class GlobalOrdinal,
431  class LocalOrdinalViewType>
432 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
433  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
434  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
435  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
436  const LocalOrdinalViewType & Acol2Brow,
437  const LocalOrdinalViewType & Acol2Irow,
438  const LocalOrdinalViewType & Bcol2Ccol,
439  const LocalOrdinalViewType & Icol2Ccol,
440  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
441  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
442  const std::string& label,
443  const Teuchos::RCP<Teuchos::ParameterList>& params) {
444 
445 
446 
447  // Node-specific code
448  using Teuchos::RCP;
449  using Teuchos::rcp;
450  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: Jacobi CudaWrapper"));
451 
452  // Options
453  //int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
454  std::string myalg("KK");
455  if(!params.is_null()) {
456  if(params->isParameter("cuda: jacobi algorithm"))
457  myalg = params->get("cuda: jacobi algorithm",myalg);
458  }
459 
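  // Editorial note: "KK" uses the Jacobi-fused SpGEMM in KokkosKernels
  // (jacobi_A_B_newmatrix_KokkosKernels below); "MSAK" first forms A*B and then
  // applies a multiply-scale-add kernel to obtain C = (I - omega*Dinv*A)*B.
  // A caller could select the kernel through the ParameterList, e.g. (sketch):
  //
  //   Teuchos::RCP<Teuchos::ParameterList> p = Teuchos::parameterList();
  //   p->set("cuda: jacobi algorithm", "MSAK");
  //   Tpetra::MatrixMatrix::Jacobi(omega, Dinv, A, B, C, true, "myJacobi", p);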
460  if(myalg == "MSAK") {
461  ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
462  }
463  else if(myalg == "KK") {
464  jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
465  }
466  else {
467  throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
468  }
469 
470  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaESFC"));
471 
472  // Final Fillcomplete
473  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
474  labelList->set("Timer Label",label);
475  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
476 
477  // NOTE: MSAK already fillCompletes, so we have to check here
478  if(!C.isFillComplete()) {
479  RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
480  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
481  }
482 
483 }
484 
485 
486 
487 /*********************************************************************************************************/
488 template<class Scalar,
489  class LocalOrdinal,
490  class GlobalOrdinal,
491  class LocalOrdinalViewType>
492 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
493  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
494  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
495  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
496  const LocalOrdinalViewType & targetMapToOrigRow_dev,
497  const LocalOrdinalViewType & targetMapToImportRow_dev,
498  const LocalOrdinalViewType & Bcol2Ccol_dev,
499  const LocalOrdinalViewType & Icol2Ccol_dev,
500  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
501  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
502  const std::string& label,
503  const Teuchos::RCP<Teuchos::ParameterList>& params) {
504 
505  // FIXME: Right now, this is a cut-and-paste of the serial kernel
506  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
507 
508  using Teuchos::RCP;
509  using Teuchos::rcp;
510  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Reuse CudaCore"));
511 
512  // Lots and lots of typedefs
513  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
514  typedef typename KCRS::StaticCrsGraphType graph_t;
515  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
516  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
517  typedef typename KCRS::values_type::non_const_type scalar_view_t;
518  typedef typename scalar_view_t::memory_space scalar_memory_space;
519 
520  typedef Scalar SC;
521  typedef LocalOrdinal LO;
522  typedef GlobalOrdinal GO;
523  typedef Node NO;
524  typedef Map<LO,GO,NO> map_type;
525  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
526  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
527  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
528 
529  // Since this is being run on Cuda, we need to fence because the below host code will use UVM
530  // KDDKDD typename graph_t::execution_space().fence();
531 
532  // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
533  // KDDKDD UVM Ideally, this function would run on device and use
534  // KDDKDD UVM KokkosKernels instead of this host implementation.
535  auto targetMapToOrigRow =
536  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
537  targetMapToOrigRow_dev);
538  auto targetMapToImportRow =
539  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
540  targetMapToImportRow_dev);
541  auto Bcol2Ccol =
542  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
543  Bcol2Ccol_dev);
544  auto Icol2Ccol =
545  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
546  Icol2Ccol_dev);
547 
548 
549  // Sizes
550  RCP<const map_type> Ccolmap = C.getColMap();
551  size_t m = Aview.origMatrix->getLocalNumRows();
552  size_t n = Ccolmap->getLocalNumElements();
553 
554  // Grab the Kokkos::SparseCrsMatrices & inner stuff
555  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
556  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
557  const KCRS & Cmat = C.getLocalMatrixHost();
558 
559  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
560  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
561  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
562  scalar_view_t Cvals = Cmat.values;
563 
564  c_lno_view_t Irowptr;
565  lno_nnz_view_t Icolind;
566  scalar_view_t Ivals;
567  if(!Bview.importMatrix.is_null()) {
568  auto lclB = Bview.importMatrix->getLocalMatrixHost();
569  Irowptr = lclB.graph.row_map;
570  Icolind = lclB.graph.entries;
571  Ivals = lclB.values;
572  }
573 
574  // Jacobi-specific inner stuff
575  auto Dvals =
576  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
577 
578  // The status array will contain the index into colind where this entry was last deposited.
579  // c_status[i] < CSR_ip - not in the row yet
580  // c_status[i] >= CSR_ip - this is the entry where you can find the data
581  // We start with this filled with INVALID's indicating that there are no entries yet.
582  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
583  std::vector<size_t> c_status(n, ST_INVALID);
584 
585  // For each row of A/C
586  size_t CSR_ip = 0, OLD_ip = 0;
587  for (size_t i = 0; i < m; i++) {
588 
589  // First fill the c_status array w/ locations where we're allowed to
590  // generate nonzeros for this row
591  OLD_ip = Crowptr[i];
592  CSR_ip = Crowptr[i+1];
593  for (size_t k = OLD_ip; k < CSR_ip; k++) {
594  c_status[Ccolind[k]] = k;
595 
596  // Reset values in the row of C
597  Cvals[k] = SC_ZERO;
598  }
599 
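  // Row i of the result is C(i,:) = B(i,:) - omega * Dinv(i) * (A*B)(i,:):
  // the first loop below copies B's row, the second accumulates the scaled product.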
600  SC minusOmegaDval = -omega*Dvals(i,0);
601 
602  // Entries of B
603  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
604  Scalar Bval = Bvals[j];
605  if (Bval == SC_ZERO)
606  continue;
607  LO Bij = Bcolind[j];
608  LO Cij = Bcol2Ccol[Bij];
609 
610  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
611  std::runtime_error, "Trying to insert a new entry into a static graph");
612 
613  Cvals[c_status[Cij]] = Bvals[j];
614  }
615 
616  // Entries of -omega * Dinv * A * B
617  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
618  LO Aik = Acolind[k];
619  const SC Aval = Avals[k];
620  if (Aval == SC_ZERO)
621  continue;
622 
623  if (targetMapToOrigRow[Aik] != LO_INVALID) {
624  // Local matrix
625  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
626 
627  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
628  LO Bkj = Bcolind[j];
629  LO Cij = Bcol2Ccol[Bkj];
630 
631  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
632  std::runtime_error, "Trying to insert a new entry into a static graph");
633 
634  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
635  }
636 
637  } else {
638  // Remote matrix
639  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
640  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
641  LO Ikj = Icolind[j];
642  LO Cij = Icol2Ccol[Ikj];
643 
644  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
645  std::runtime_error, "Trying to insert a new entry into a static graph");
646 
647  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
648  }
649  }
650  }
651  }
652 
653  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Reuse ESFC"));
654 
655  C.fillComplete(C.getDomainMap(), C.getRangeMap());
656 
657 }
658 
659 /*********************************************************************************************************/
660 template<class Scalar,
661  class LocalOrdinal,
662  class GlobalOrdinal,
663  class LocalOrdinalViewType>
664 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
665  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
666  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
667  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
668  const LocalOrdinalViewType & Acol2Brow,
669  const LocalOrdinalViewType & Acol2Irow,
670  const LocalOrdinalViewType & Bcol2Ccol,
671  const LocalOrdinalViewType & Icol2Ccol,
672  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
673  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
674  const std::string& label,
675  const Teuchos::RCP<Teuchos::ParameterList>& params) {
676 
677 
678 
679  // Check if the diagonal entries exist in debug mode
680  const bool debug = Tpetra::Details::Behavior::debug();
681  if(debug) {
682 
683  auto rowMap = Aview.origMatrix->getRowMap();
684  Tpetra::Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> diags(rowMap);
685  Aview.origMatrix->getLocalDiagCopy(diags);
686  size_t diagLength = rowMap->getLocalNumElements();
687  Teuchos::Array<Scalar> diagonal(diagLength);
688  diags.get1dCopy(diagonal());
689 
690  for(size_t i = 0; i < diagLength; ++i) {
691  TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
692  std::runtime_error,
693  "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
694  "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
695  }
696  }
697 
698  using Teuchos::RCP;
699  using Teuchos::rcp;
700  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix KokkosKernels"));
701 
702 
703  // Usings
704  using device_t = typename Tpetra::KokkosCompat::KokkosCudaWrapperNode::device_type;
705  using matrix_t = typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode>::local_matrix_device_type;
706  using graph_t = typename matrix_t::StaticCrsGraphType;
707  using lno_view_t = typename graph_t::row_map_type::non_const_type;
708  using int_view_t = Kokkos::View<int *,
709  typename lno_view_t::array_layout,
710  typename lno_view_t::memory_space,
711  typename lno_view_t::memory_traits>;
712  using c_lno_view_t = typename graph_t::row_map_type::const_type;
713  using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
714  using scalar_view_t = typename matrix_t::values_type::non_const_type;
715 
716  // KokkosKernels handle
717  using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
718  typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
719  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
720 
721  using int_handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
722  typename int_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
723  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
724 
725  // Merge the B and Bimport matrices
726  const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
727 
728  // Get the properties and arrays of input matrices
729  const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
730  const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
731 
732  typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
733  typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
734  typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
735 
736  // Arrays of the output matrix
737  lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("row_mapC"), AnumRows + 1);
738  lno_nnz_view_t entriesC;
739  scalar_view_t valuesC;
740 
741  // Options
742  int team_work_size = 16;
743  std::string myalg("SPGEMM_KK_MEMORY");
744  if(!params.is_null()) {
745  if(params->isParameter("cuda: algorithm"))
746  myalg = params->get("cuda: algorithm",myalg);
747  if(params->isParameter("cuda: team work size"))
748  team_work_size = params->get("cuda: team work size",team_work_size);
749  }
750 
751  // Get the algorithm mode
752  std::string alg("Cuda algorithm");
753  if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
754  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
755 
756  // decide whether to use integer-typed row pointers for this spgemm
757  Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
758  const bool useIntRowptrs =
759  irph.shouldUseIntRowptrs() &&
760  Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
761 
762  if (useIntRowptrs) {
763  int_handle_t kh;
764  kh.create_spgemm_handle(alg_enum);
765  kh.set_team_work_size(team_work_size);
766 
767  int_view_t int_row_mapC (Kokkos::ViewAllocateWithoutInitializing("int_row_mapC"), AnumRows + 1);
768 
769  auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
770  auto Bint = irph.getIntRowptrMatrix(Bmerged);
771 
772  {
773  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels symbolic int");
774  KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
775  Aint.graph.row_map, Aint.graph.entries, false,
776  Bint.graph.row_map, Bint.graph.entries, false,
777  int_row_mapC);
778  }
779 
780  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
781  if (c_nnz_size){
782  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
783  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
784  }
785  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels numeric int");
786 
787  // even though there is no TPL for this, we have to use the same handle that was used in the symbolic phase,
788  // so need to have a special int-typed call for this as well.
789  KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
790  Aint.graph.row_map, Aint.graph.entries, Amat.values, false,
791  Bint.graph.row_map, Bint.graph.entries, Bint.values, false,
792  int_row_mapC, entriesC, valuesC,
793  omega, Dinv.getLocalViewDevice(Access::ReadOnly));
794  // transfer the integer rowptrs back to the correct rowptr type
795  Kokkos::parallel_for(int_row_mapC.size(), KOKKOS_LAMBDA(int i){ row_mapC(i) = int_row_mapC(i);});
796  kh.destroy_spgemm_handle();
797  } else {
798  handle_t kh;
799  kh.create_spgemm_handle(alg_enum);
800  kh.set_team_work_size(team_work_size);
801 
802  {
803  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels symbolic non-int");
804  KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
805  Amat.graph.row_map, Amat.graph.entries, false,
806  Bmerged.graph.row_map, Bmerged.graph.entries, false,
807  row_mapC);
808  }
809 
810  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
811  if (c_nnz_size){
812  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
813  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
814  }
815 
816  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: KokkosKernels numeric non-int");
817  KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
818  Amat.graph.row_map, Amat.graph.entries, Amat.values, false,
819  Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values, false,
820  row_mapC, entriesC, valuesC,
821  omega, Dinv.getLocalViewDevice(Access::ReadOnly));
822  kh.destroy_spgemm_handle();
823  }
824 
825 
826 
827  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaSort"));
828 
829  // Sort & set values
830  if (params.is_null() || params->get("sort entries",true))
831  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
832  C.setAllValues(row_mapC,entriesC,valuesC);
833 
834  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: Newmatrix CudaESFC"));
835 
836  // Final Fillcomplete
837  Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
838  labelList->set("Timer Label",label);
839  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
840  Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
841  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
842 }
843 
844  }//MMdetails
845 }//Tpetra
846 
847 #endif//CUDA
848 
849 #endif