Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_MatrixMatrix_SYCL.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_MATRIXMATRIX_SYCL_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_SYCL_DEF_HPP
12 
13 #include "Tpetra_Details_IntRowPtrHelper.hpp"
14 
15 #ifdef HAVE_TPETRA_INST_SYCL
16 namespace Tpetra {
17 namespace MMdetails {
18 
19 /*********************************************************************************************************/
20 // MMM KernelWrappers for Partial Specialization to SYCL
21 template <class Scalar,
22  class LocalOrdinal,
23  class GlobalOrdinal,
24  class LocalOrdinalViewType>
25 struct KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode, LocalOrdinalViewType> {
26  static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Aview,
27  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Bview,
28  const LocalOrdinalViewType& Acol2Brow,
29  const LocalOrdinalViewType& Acol2Irow,
30  const LocalOrdinalViewType& Bcol2Ccol,
31  const LocalOrdinalViewType& Icol2Ccol,
32  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& C,
33  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > Cimport,
34  const std::string& label = std::string(),
35  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
36 
37  static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Aview,
38  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Bview,
39  const LocalOrdinalViewType& Acol2Brow,
40  const LocalOrdinalViewType& Acol2Irow,
41  const LocalOrdinalViewType& Bcol2Ccol,
42  const LocalOrdinalViewType& Icol2Ccol,
43  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& C,
44  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > Cimport,
45  const std::string& label = std::string(),
46  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
47 };
48 
49 // Jacobi KernelWrappers for Partial Specialization to SYCL
50 template <class Scalar,
51  class LocalOrdinal,
52  class GlobalOrdinal, class LocalOrdinalViewType>
53 struct KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode, LocalOrdinalViewType> {
54  static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
55  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Dinv,
56  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Aview,
57  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Bview,
58  const LocalOrdinalViewType& Acol2Brow,
59  const LocalOrdinalViewType& Acol2Irow,
60  const LocalOrdinalViewType& Bcol2Ccol,
61  const LocalOrdinalViewType& Icol2Ccol,
62  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& C,
63  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > Cimport,
64  const std::string& label = std::string(),
65  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
66 
67  static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
68  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Dinv,
69  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Aview,
70  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Bview,
71  const LocalOrdinalViewType& Acol2Brow,
72  const LocalOrdinalViewType& Acol2Irow,
73  const LocalOrdinalViewType& Bcol2Ccol,
74  const LocalOrdinalViewType& Icol2Ccol,
75  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& C,
76  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > Cimport,
77  const std::string& label = std::string(),
78  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
79 
80  static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
81  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Dinv,
82  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Aview,
83  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Bview,
84  const LocalOrdinalViewType& Acol2Brow,
85  const LocalOrdinalViewType& Acol2Irow,
86  const LocalOrdinalViewType& Bcol2Ccol,
87  const LocalOrdinalViewType& Icol2Ccol,
88  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& C,
89  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > Cimport,
90  const std::string& label = std::string(),
91  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
92 };
93 
94 /*********************************************************************************************************/
95 // AB NewMatrix Kernel wrappers (KokkosKernels/SYCL Version)
96 template <class Scalar,
97  class LocalOrdinal,
98  class GlobalOrdinal,
99  class LocalOrdinalViewType>
100 void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode, LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Aview,
101  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Bview,
102  const LocalOrdinalViewType& Acol2Brow,
103  const LocalOrdinalViewType& Acol2Irow,
104  const LocalOrdinalViewType& Bcol2Ccol,
105  const LocalOrdinalViewType& Icol2Ccol,
106  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& C,
107  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > Cimport,
108  const std::string& label,
109  const Teuchos::RCP<Teuchos::ParameterList>& params) {
110 #ifdef HAVE_TPETRA_MMM_TIMINGS
111  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
112  using Teuchos::TimeMonitor;
113  Teuchos::RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SYCLWrapper")))));
114 #endif
115  // Node-specific code
116  typedef Tpetra::KokkosCompat::KokkosSYCLWrapperNode Node;
117  std::string nodename("SYCL");
118 
119  // Lots and lots of typedefs
120  using Teuchos::RCP;
122  typedef typename KCRS::device_type device_t;
123  typedef typename KCRS::StaticCrsGraphType graph_t;
124  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
125  using int_view_t = Kokkos::View<int*, typename lno_view_t::array_layout, typename lno_view_t::memory_space, typename lno_view_t::memory_traits>;
126  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
127  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
128  typedef typename KCRS::values_type::non_const_type scalar_view_t;
129  // typedef typename graph_t::row_map_type::const_type lno_view_t_const;
130 
131  // Options
132  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
133  std::string myalg("SPGEMM_KK_MEMORY");
134  if (!params.is_null()) {
135  if (params->isParameter("sycl: algorithm"))
136  myalg = params->get("sycl: algorithm", myalg);
137  if (params->isParameter("sycl: team work size"))
138  team_work_size = params->get("sycl: team work size", team_work_size);
139  }
140 
141  // KokkosKernelsHandle
142  typedef KokkosKernels::Experimental::KokkosKernelsHandle<
143  typename lno_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
144  typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>
145  KernelHandle;
146  using IntKernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
147  typename int_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
148  typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
149 
150  // Grab the Kokkos::SparseCrsMatrices
151  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
152  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
153 
154  c_lno_view_t Arowptr = Amat.graph.row_map,
155  Browptr = Bmat.graph.row_map;
156  const lno_nnz_view_t Acolind = Amat.graph.entries,
157  Bcolind = Bmat.graph.entries;
158  const scalar_view_t Avals = Amat.values,
159  Bvals = Bmat.values;
160 
161  // Get the algorithm mode
162  std::string alg = nodename + std::string(" algorithm");
163  // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
164  if (!params.is_null() && params->isParameter(alg)) myalg = params->get(alg, myalg);
165  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
166 
167  // Merge the B and Bimport matrices
168  const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C.getColMap()->getLocalNumElements());
169 
170 #ifdef HAVE_TPETRA_MMM_TIMINGS
171  MM = Teuchos::null;
172  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SYCLCore"))));
173 #endif
174 
175  // Do the multiply on whatever we've got
176  typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
177  typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
178  typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
179 
180  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lno_row"), AnumRows + 1);
181  lno_nnz_view_t entriesC;
182  scalar_view_t valuesC;
183 
184  // static_assert(std::is_void_v<typename KCRS::row_map_type::non_const_value_type>, "");
185  // static_assert(std::is_void_v<typename decltype(Bmerged)::row_map_type::non_const_value_type>, "");
186  Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
187  const bool useIntRowptrs =
188  irph.shouldUseIntRowptrs() &&
189  Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
190 
191  if (useIntRowptrs) {
192  IntKernelHandle kh;
193  kh.create_spgemm_handle(alg_enum);
194  kh.set_team_work_size(team_work_size);
195 
196  int_view_t int_row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_int_row"), AnumRows + 1);
197 
198  auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
199  auto Bint = irph.getIntRowptrMatrix(Bmerged);
200  KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols, Aint.graph.row_map, Aint.graph.entries, false, Bint.graph.row_map, Bint.graph.entries, false, int_row_mapC);
201 
202  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
203  if (c_nnz_size) {
204  entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
205  valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
206  }
207  KokkosSparse::spgemm_numeric(&kh, AnumRows, BnumRows, BnumCols, Aint.graph.row_map, Aint.graph.entries, Aint.values, false, Bint.graph.row_map, Bint.graph.entries, Bint.values, false, int_row_mapC, entriesC, valuesC);
208  // transfer the integer rowptrs back to the correct rowptr type
209  Kokkos::parallel_for(
210  int_row_mapC.size(), KOKKOS_LAMBDA(int i) { row_mapC(i) = int_row_mapC(i); });
211  kh.destroy_spgemm_handle();
212 
213  } else {
214  KernelHandle kh;
215  kh.create_spgemm_handle(alg_enum);
216  kh.set_team_work_size(team_work_size);
217 
218  KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols, Amat.graph.row_map, Amat.graph.entries, false, Bmerged.graph.row_map, Bmerged.graph.entries, false, row_mapC);
219 
220  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
221  if (c_nnz_size) {
222  entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
223  valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
224  }
225 
226  KokkosSparse::spgemm_numeric(&kh, AnumRows, BnumRows, BnumCols, Amat.graph.row_map, Amat.graph.entries, Amat.values, false, Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values, false, row_mapC, entriesC, valuesC);
227  kh.destroy_spgemm_handle();
228  }
229 
230 #ifdef HAVE_TPETRA_MMM_TIMINGS
231  MM = Teuchos::null;
232  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SYCLSort"))));
233 #endif
234 
235  // Sort & set values
236  if (params.is_null() || params->get("sort entries", true))
237  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
238  C.setAllValues(row_mapC, entriesC, valuesC);
239 
240 #ifdef HAVE_TPETRA_MMM_TIMINGS
241  MM = Teuchos::null;
242  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SYCLESFC"))));
243 #endif
244 
245  // Final Fillcomplete
246  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
247  labelList->set("Timer Label", label);
248  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
249  RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > dummyExport;
250  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
251 }
252 
253 /*********************************************************************************************************/
254 template <class Scalar,
255  class LocalOrdinal,
256  class GlobalOrdinal,
257  class LocalOrdinalViewType>
258 void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode, LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
259  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Aview,
260  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Bview,
261  const LocalOrdinalViewType& targetMapToOrigRow_dev,
262  const LocalOrdinalViewType& targetMapToImportRow_dev,
263  const LocalOrdinalViewType& Bcol2Ccol_dev,
264  const LocalOrdinalViewType& Icol2Ccol_dev,
265  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& C,
266  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > Cimport,
267  const std::string& label,
268  const Teuchos::RCP<Teuchos::ParameterList>& params) {
269  // FIXME: Right now, this is a cut-and-paste of the serial kernel
270  typedef Tpetra::KokkosCompat::KokkosSYCLWrapperNode Node;
271 
272 #ifdef HAVE_TPETRA_MMM_TIMINGS
273  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
274  using Teuchos::TimeMonitor;
275  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
276  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
277 #endif
278  using Teuchos::RCP;
279  using Teuchos::rcp;
280 
281  // Lots and lots of typedefs
282  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
283  typedef typename KCRS::StaticCrsGraphType graph_t;
284  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
285  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
286  typedef typename KCRS::values_type::non_const_type scalar_view_t;
287 
288  typedef Scalar SC;
289  typedef LocalOrdinal LO;
290  typedef GlobalOrdinal GO;
291  typedef Node NO;
292  typedef Map<LO, GO, NO> map_type;
293  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
294  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
295  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
296 
297  // Since this is being run on SYCL, we need to fence because the below code will use UVM
298  // typename graph_t::execution_space().fence();
299 
300  // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
301  // KDDKDD UVM Ideally, this function would run on device and use
302  // KDDKDD UVM KokkosKernels instead of this host implementation.
303  auto targetMapToOrigRow =
304  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
305  targetMapToOrigRow_dev);
306  auto targetMapToImportRow =
307  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
308  targetMapToImportRow_dev);
309  auto Bcol2Ccol =
310  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
311  Bcol2Ccol_dev);
312  auto Icol2Ccol =
313  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
314  Icol2Ccol_dev);
315 
316  // Sizes
317  RCP<const map_type> Ccolmap = C.getColMap();
318  size_t m = Aview.origMatrix->getLocalNumRows();
319  size_t n = Ccolmap->getLocalNumElements();
320 
321  // Grab the Kokkos::SparseCrsMatrices & inner stuff
322  const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
323  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixHost();
324  const KCRS& Cmat = C.getLocalMatrixHost();
325 
326  c_lno_view_t Arowptr = Amat.graph.row_map,
327  Browptr = Bmat.graph.row_map,
328  Crowptr = Cmat.graph.row_map;
329  const lno_nnz_view_t Acolind = Amat.graph.entries,
330  Bcolind = Bmat.graph.entries,
331  Ccolind = Cmat.graph.entries;
332  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
333  scalar_view_t Cvals = Cmat.values;
334 
335  c_lno_view_t Irowptr;
336  lno_nnz_view_t Icolind;
337  scalar_view_t Ivals;
338  if (!Bview.importMatrix.is_null()) {
339  auto lclB = Bview.importMatrix->getLocalMatrixHost();
340  Irowptr = lclB.graph.row_map;
341  Icolind = lclB.graph.entries;
342  Ivals = lclB.values;
343  }
344 
345 #ifdef HAVE_TPETRA_MMM_TIMINGS
346  MM2 = Teuchos::null;
347  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
348 #endif
349 
350  // Classic csr assembly (low memory edition)
351  // mfh 27 Sep 2016: The c_status array is an implementation detail
352  // of the local sparse matrix-matrix multiply routine.
353 
354  // The status array will contain the index into colind where this entry was last deposited.
355  // c_status[i] < CSR_ip - not in the row yet
356  // c_status[i] >= CSR_ip - this is the entry where you can find the data
357  // We start with this filled with INVALID's indicating that there are no entries yet.
358  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
359  std::vector<size_t> c_status(n, ST_INVALID);
360 
361  // For each row of A/C
362  size_t CSR_ip = 0, OLD_ip = 0;
363  for (size_t i = 0; i < m; i++) {
364  // First fill the c_status array w/ locations where we're allowed to
365  // generate nonzeros for this row
366  OLD_ip = Crowptr[i];
367  CSR_ip = Crowptr[i + 1];
368  for (size_t k = OLD_ip; k < CSR_ip; k++) {
369  c_status[Ccolind[k]] = k;
370 
371  // Reset values in the row of C
372  Cvals[k] = SC_ZERO;
373  }
374 
375  for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
376  LO Aik = Acolind[k];
377  const SC Aval = Avals[k];
378  if (Aval == SC_ZERO)
379  continue;
380 
381  if (targetMapToOrigRow[Aik] != LO_INVALID) {
382  // Local matrix
383  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
384 
385  for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
386  LO Bkj = Bcolind[j];
387  LO Cij = Bcol2Ccol[Bkj];
388 
389  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
390  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
391  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
392 
393  Cvals[c_status[Cij]] += Aval * Bvals[j];
394  }
395 
396  } else {
397  // Remote matrix
398  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
399  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
400  LO Ikj = Icolind[j];
401  LO Cij = Icol2Ccol[Ikj];
402 
403  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
404  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
405  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
406 
407  Cvals[c_status[Cij]] += Aval * Ivals[j];
408  }
409  }
410  }
411  }
412 
413  C.fillComplete(C.getDomainMap(), C.getRangeMap());
414 }
415 
416 /*********************************************************************************************************/
417 template <class Scalar,
418  class LocalOrdinal,
419  class GlobalOrdinal,
420  class LocalOrdinalViewType>
421 void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode, LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
422  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Dinv,
423  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Aview,
424  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Bview,
425  const LocalOrdinalViewType& Acol2Brow,
426  const LocalOrdinalViewType& Acol2Irow,
427  const LocalOrdinalViewType& Bcol2Ccol,
428  const LocalOrdinalViewType& Icol2Ccol,
429  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& C,
430  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > Cimport,
431  const std::string& label,
432  const Teuchos::RCP<Teuchos::ParameterList>& params) {
433 #ifdef HAVE_TPETRA_MMM_TIMINGS
434  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
435  using Teuchos::TimeMonitor;
436  Teuchos::RCP<TimeMonitor> MM;
437 #endif
438 
439  // Node-specific code
440  using Teuchos::RCP;
441 
442  // Options
443  // int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
444  std::string myalg("KK");
445  if (!params.is_null()) {
446  if (params->isParameter("sycl: jacobi algorithm"))
447  myalg = params->get("sycl: jacobi algorithm", myalg);
448  }
449 
450  if (myalg == "MSAK") {
451  ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega, Dinv, Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
452  } else if (myalg == "KK") {
453  jacobi_A_B_newmatrix_KokkosKernels(omega, Dinv, Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
454  } else {
455  throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
456  }
457 
458 #ifdef HAVE_TPETRA_MMM_TIMINGS
459  MM = Teuchos::null;
460  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix SYCLESFC"))));
461 #endif
462 
463  // Final Fillcomplete
464  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
465  labelList->set("Timer Label", label);
466  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
467 
468  // NOTE: MSAK already fillCompletes, so we have to check here
469  if (!C.isFillComplete()) {
470  RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > dummyExport;
471  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
472  }
473 }
474 
475 /*********************************************************************************************************/
476 template <class Scalar,
477  class LocalOrdinal,
478  class GlobalOrdinal,
479  class LocalOrdinalViewType>
480 void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode, LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
481  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Dinv,
482  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Aview,
483  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Bview,
484  const LocalOrdinalViewType& targetMapToOrigRow_dev,
485  const LocalOrdinalViewType& targetMapToImportRow_dev,
486  const LocalOrdinalViewType& Bcol2Ccol_dev,
487  const LocalOrdinalViewType& Icol2Ccol_dev,
488  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& C,
489  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > Cimport,
490  const std::string& label,
491  const Teuchos::RCP<Teuchos::ParameterList>& params) {
492  // FIXME: Right now, this is a cut-and-paste of the serial kernel
493  typedef Tpetra::KokkosCompat::KokkosSYCLWrapperNode Node;
494 
495 #ifdef HAVE_TPETRA_MMM_TIMINGS
496  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
497  using Teuchos::TimeMonitor;
498  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SYCLCore"))));
499  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
500 #endif
501  using Teuchos::RCP;
502  using Teuchos::rcp;
503 
504  // Lots and lots of typedefs
505  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
506  typedef typename KCRS::StaticCrsGraphType graph_t;
507  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
508  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
509  typedef typename KCRS::values_type::non_const_type scalar_view_t;
510  typedef typename scalar_view_t::memory_space scalar_memory_space;
511 
512  typedef Scalar SC;
513  typedef LocalOrdinal LO;
514  typedef GlobalOrdinal GO;
515  typedef Node NO;
516  typedef Map<LO, GO, NO> map_type;
517  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
518  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
519  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
520 
521  // Since this is being run on SYCL, we need to fence because the below host code will use UVM
522  // KDDKDD typename graph_t::execution_space().fence();
523 
524  // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
525  // KDDKDD UVM Ideally, this function would run on device and use
526  // KDDKDD UVM KokkosKernels instead of this host implementation.
527  auto targetMapToOrigRow =
528  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
529  targetMapToOrigRow_dev);
530  auto targetMapToImportRow =
531  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
532  targetMapToImportRow_dev);
533  auto Bcol2Ccol =
534  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
535  Bcol2Ccol_dev);
536  auto Icol2Ccol =
537  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
538  Icol2Ccol_dev);
539 
540  // Sizes
541  RCP<const map_type> Ccolmap = C.getColMap();
542  size_t m = Aview.origMatrix->getLocalNumRows();
543  size_t n = Ccolmap->getLocalNumElements();
544 
545  // Grab the Kokkos::SparseCrsMatrices & inner stuff
546  const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
547  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixHost();
548  const KCRS& Cmat = C.getLocalMatrixHost();
549 
550  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
551  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
552  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
553  scalar_view_t Cvals = Cmat.values;
554 
555  c_lno_view_t Irowptr;
556  lno_nnz_view_t Icolind;
557  scalar_view_t Ivals;
558  if (!Bview.importMatrix.is_null()) {
559  auto lclB = Bview.importMatrix->getLocalMatrixHost();
560  Irowptr = lclB.graph.row_map;
561  Icolind = lclB.graph.entries;
562  Ivals = lclB.values;
563  }
564 
565  // Jacobi-specific inner stuff
566  auto Dvals =
567  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
568 
569 #ifdef HAVE_TPETRA_MMM_TIMINGS
570  MM2 = Teuchos::null;
571  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SYCLCore - Compare"))));
572 #endif
573 
574  // The status array will contain the index into colind where this entry was last deposited.
575  // c_status[i] < CSR_ip - not in the row yet
576  // c_status[i] >= CSR_ip - this is the entry where you can find the data
577  // We start with this filled with INVALID's indicating that there are no entries yet.
578  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
579  std::vector<size_t> c_status(n, ST_INVALID);
580 
581  // For each row of A/C
582  size_t CSR_ip = 0, OLD_ip = 0;
583  for (size_t i = 0; i < m; i++) {
584  // First fill the c_status array w/ locations where we're allowed to
585  // generate nonzeros for this row
586  OLD_ip = Crowptr[i];
587  CSR_ip = Crowptr[i + 1];
588  for (size_t k = OLD_ip; k < CSR_ip; k++) {
589  c_status[Ccolind[k]] = k;
590 
591  // Reset values in the row of C
592  Cvals[k] = SC_ZERO;
593  }
594 
595  SC minusOmegaDval = -omega * Dvals(i, 0);
596 
597  // Entries of B
598  for (size_t j = Browptr[i]; j < Browptr[i + 1]; j++) {
599  Scalar Bval = Bvals[j];
600  if (Bval == SC_ZERO)
601  continue;
602  LO Bij = Bcolind[j];
603  LO Cij = Bcol2Ccol[Bij];
604 
605  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
606  std::runtime_error, "Trying to insert a new entry into a static graph");
607 
608  Cvals[c_status[Cij]] = Bvals[j];
609  }
610 
611  // Entries of -omega * Dinv * A * B
612  for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
613  LO Aik = Acolind[k];
614  const SC Aval = Avals[k];
615  if (Aval == SC_ZERO)
616  continue;
617 
618  if (targetMapToOrigRow[Aik] != LO_INVALID) {
619  // Local matrix
620  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
621 
622  for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
623  LO Bkj = Bcolind[j];
624  LO Cij = Bcol2Ccol[Bkj];
625 
626  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
627  std::runtime_error, "Trying to insert a new entry into a static graph");
628 
629  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
630  }
631 
632  } else {
633  // Remote matrix
634  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
635  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
636  LO Ikj = Icolind[j];
637  LO Cij = Icol2Ccol[Ikj];
638 
639  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
640  std::runtime_error, "Trying to insert a new entry into a static graph");
641 
642  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
643  }
644  }
645  }
646  }
647 
648 #ifdef HAVE_TPETRA_MMM_TIMINGS
649  MM2 = Teuchos::null;
650  MM = Teuchos::null;
651  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
652 #endif
653 
654  C.fillComplete(C.getDomainMap(), C.getRangeMap());
655 }
656 
657 /*********************************************************************************************************/
658 template <class Scalar,
659  class LocalOrdinal,
660  class GlobalOrdinal,
661  class LocalOrdinalViewType>
662 void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode, LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
663  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Dinv,
664  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Aview,
665  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& Bview,
666  const LocalOrdinalViewType& Acol2Brow,
667  const LocalOrdinalViewType& Acol2Irow,
668  const LocalOrdinalViewType& Bcol2Ccol,
669  const LocalOrdinalViewType& Icol2Ccol,
670  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode>& C,
671  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > Cimport,
672  const std::string& label,
673  const Teuchos::RCP<Teuchos::ParameterList>& params) {
674 #ifdef HAVE_TPETRA_MMM_TIMINGS
675  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
676  using Teuchos::TimeMonitor;
677  Teuchos::RCP<TimeMonitor> MM;
678 #endif
679 
680  // Check if the diagonal entries exist in debug mode
681  const bool debug = Tpetra::Details::Behavior::debug();
682  if (debug) {
683  auto rowMap = Aview.origMatrix->getRowMap();
684  Tpetra::Vector<Scalar> diags(rowMap);
685  Aview.origMatrix->getLocalDiagCopy(diags);
686  size_t diagLength = rowMap->getLocalNumElements();
687  Teuchos::Array<Scalar> diagonal(diagLength);
688  diags.get1dCopy(diagonal());
689 
690  for (size_t i = 0; i < diagLength; ++i) {
691  TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
692  std::runtime_error,
693  "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl
694  << "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
695  }
696  }
697 
698  // Usings
699  using device_t = typename Tpetra::KokkosCompat::KokkosSYCLWrapperNode::device_type;
701  using graph_t = typename matrix_t::StaticCrsGraphType;
702  using lno_view_t = typename graph_t::row_map_type::non_const_type;
703  using int_view_t = Kokkos::View<int*,
704  typename lno_view_t::array_layout,
705  typename lno_view_t::memory_space,
706  typename lno_view_t::memory_traits>;
707  using c_lno_view_t = typename graph_t::row_map_type::const_type;
708  using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
709  using scalar_view_t = typename matrix_t::values_type::non_const_type;
710 
711  // KokkosKernels handle
712  using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
713  typename lno_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
714  typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
715  using int_handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
716  typename int_view_t::const_value_type, typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
717  typename device_t::execution_space, typename device_t::memory_space, typename device_t::memory_space>;
718 
719  // Merge the B and Bimport matrices
720  const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview, Bview, Acol2Brow, Acol2Irow, Bcol2Ccol, Icol2Ccol, C.getColMap()->getLocalNumElements());
721 
722  // Get the properties and arrays of input matrices
723  const matrix_t& Amat = Aview.origMatrix->getLocalMatrixDevice();
724  const matrix_t& Bmat = Bview.origMatrix->getLocalMatrixDevice();
725 
726  typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
727  typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
728  typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
729 
730  // Arrays of the output matrix
731  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("row_mapC"), AnumRows + 1);
732  lno_nnz_view_t entriesC;
733  scalar_view_t valuesC;
734 
735  // Options
736  int team_work_size = 16;
737  std::string myalg("SPGEMM_KK_MEMORY");
738  if (!params.is_null()) {
739  if (params->isParameter("sycl: algorithm"))
740  myalg = params->get("sycl: algorithm", myalg);
741  if (params->isParameter("sycl: team work size"))
742  team_work_size = params->get("sycl: team work size", team_work_size);
743  }
744 
745  // Get the algorithm mode
746  std::string nodename("SYCL");
747  std::string alg = nodename + std::string(" algorithm");
748  if (!params.is_null() && params->isParameter(alg)) myalg = params->get(alg, myalg);
749  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
750 
751  // decide whether to use integer-typed row pointers for this spgemm
752  Tpetra::Details::IntRowPtrHelper<decltype(Bmerged)> irph(Bmerged.nnz(), Bmerged.graph.row_map);
753  const bool useIntRowptrs =
754  irph.shouldUseIntRowptrs() &&
755  Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
756 
757  if (useIntRowptrs) {
758  int_handle_t kh;
759  kh.create_spgemm_handle(alg_enum);
760  kh.set_team_work_size(team_work_size);
761 
762  int_view_t int_row_mapC(Kokkos::ViewAllocateWithoutInitializing("int_row_mapC"), AnumRows + 1);
763 
764  auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
765  auto Bint = irph.getIntRowptrMatrix(Bmerged);
766 
767  KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
768  Aint.graph.row_map, Aint.graph.entries, false,
769  Bint.graph.row_map, Bint.graph.entries, false,
770  int_row_mapC);
771 
772  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
773  if (c_nnz_size) {
774  entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
775  valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
776  }
777 
778  // even though there is no TPL for this, we have to use the same handle that was used in the symbolic phase,
779  // so need to have a special int-typed call for this as well.
780  KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
781  Aint.graph.row_map, Aint.graph.entries, Amat.values, false,
782  Bint.graph.row_map, Bint.graph.entries, Bint.values, false,
783  int_row_mapC, entriesC, valuesC,
784  omega, Dinv.getLocalViewDevice(Access::ReadOnly));
785  // transfer the integer rowptrs back to the correct rowptr type
786  Kokkos::parallel_for(
787  int_row_mapC.size(), KOKKOS_LAMBDA(int i) { row_mapC(i) = int_row_mapC(i); });
788  kh.destroy_spgemm_handle();
789  } else {
790  handle_t kh;
791  kh.create_spgemm_handle(alg_enum);
792  kh.set_team_work_size(team_work_size);
793 
794  KokkosSparse::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
795  Amat.graph.row_map, Amat.graph.entries, false,
796  Bmerged.graph.row_map, Bmerged.graph.entries, false,
797  row_mapC);
798 
799  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
800  if (c_nnz_size) {
801  entriesC = lno_nnz_view_t(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
802  valuesC = scalar_view_t(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
803  }
804  KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
805  Amat.graph.row_map, Amat.graph.entries, Amat.values, false,
806  Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values, false,
807  row_mapC, entriesC, valuesC,
808  omega, Dinv.getLocalViewDevice(Access::ReadOnly));
809  kh.destroy_spgemm_handle();
810  }
811 
812 #ifdef HAVE_TPETRA_MMM_TIMINGS
813  MM = Teuchos::null;
814  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix SYCLSort"))));
815 #endif
816 
817  // Sort & set values
818  if (params.is_null() || params->get("sort entries", true))
819  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
820  C.setAllValues(row_mapC, entriesC, valuesC);
821 
822 #ifdef HAVE_TPETRA_MMM_TIMINGS
823  MM = Teuchos::null;
824  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix SYCLESFC"))));
825 #endif
826 
827  // Final Fillcomplete
828  Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
829  labelList->set("Timer Label", label);
830  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
831  Teuchos::RCP<const Export<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosSYCLWrapperNode> > dummyExport;
832  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
833 }
834 
835 } // namespace MMdetails
836 } // namespace Tpetra
837 
838 #endif // SYCL
839 
840 #endif
static bool debug()
Whether Tpetra is in debug mode.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
A distributed dense vector.