42 #ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
45 #ifdef HAVE_TPETRA_INST_CUDA
51 template<
class Scalar,
54 class LocalOrdinalViewType>
55 struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
56 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
57 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
58 const LocalOrdinalViewType & Acol2Brow,
59 const LocalOrdinalViewType & Acol2Irow,
60 const LocalOrdinalViewType & Bcol2Ccol,
61 const LocalOrdinalViewType & Icol2Ccol,
62 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
63 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
64 const std::string& label = std::string(),
65 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
69 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
70 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
71 const LocalOrdinalViewType & Acol2Brow,
72 const LocalOrdinalViewType & Acol2Irow,
73 const LocalOrdinalViewType & Bcol2Ccol,
74 const LocalOrdinalViewType & Icol2Ccol,
75 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
76 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
77 const std::string& label = std::string(),
78 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
83 template<
class Scalar,
85 class GlobalOrdinal,
class LocalOrdinalViewType>
86 struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
87 static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
88 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
89 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
90 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
91 const LocalOrdinalViewType & Acol2Brow,
92 const LocalOrdinalViewType & Acol2Irow,
93 const LocalOrdinalViewType & Bcol2Ccol,
94 const LocalOrdinalViewType & Icol2Ccol,
95 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
96 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
97 const std::string& label = std::string(),
98 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
100 static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
101 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
102 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
103 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
104 const LocalOrdinalViewType & Acol2Brow,
105 const LocalOrdinalViewType & Acol2Irow,
106 const LocalOrdinalViewType & Bcol2Ccol,
107 const LocalOrdinalViewType & Icol2Ccol,
108 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
109 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
110 const std::string& label = std::string(),
111 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
113 static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
114 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
115 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
116 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
117 const LocalOrdinalViewType & Acol2Brow,
118 const LocalOrdinalViewType & Acol2Irow,
119 const LocalOrdinalViewType & Bcol2Ccol,
120 const LocalOrdinalViewType & Icol2Ccol,
121 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
122 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
123 const std::string& label = std::string(),
124 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
130 template<
class Scalar,
133 class LocalOrdinalViewType>
134 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
135 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
136 const LocalOrdinalViewType & Acol2Brow,
137 const LocalOrdinalViewType & Acol2Irow,
138 const LocalOrdinalViewType & Bcol2Ccol,
139 const LocalOrdinalViewType & Icol2Ccol,
140 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
141 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
142 const std::string& label,
143 const Teuchos::RCP<Teuchos::ParameterList>& params) {
146 #ifdef HAVE_TPETRA_MMM_TIMINGS
147 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
148 using Teuchos::TimeMonitor;
149 Teuchos::RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaWrapper")))));
152 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
153 std::string nodename(
"Cuda");
158 typedef typename KCRS::device_type device_t;
159 typedef typename KCRS::StaticCrsGraphType graph_t;
160 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
161 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
162 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
163 typedef typename KCRS::values_type::non_const_type scalar_view_t;
167 int team_work_size = 16;
168 std::string myalg(
"SPGEMM_KK_MEMORY");
169 if(!params.is_null()) {
170 if(params->isParameter(
"cuda: algorithm"))
171 myalg = params->get(
"cuda: algorithm",myalg);
172 if(params->isParameter(
"cuda: team work size"))
173 team_work_size = params->get(
"cuda: team work size",team_work_size);
177 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
178 typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
179 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space > KernelHandle;
182 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
183 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
185 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
186 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
187 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
189 c_lno_view_t Irowptr;
190 lno_nnz_view_t Icolind;
192 if(!Bview.importMatrix.is_null()) {
193 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
194 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
195 Ivals = Bview.importMatrix->getLocalMatrix().values;
200 std::string alg = nodename+std::string(
" algorithm");
202 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
203 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
206 const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getNodeNumElements());
208 #ifdef HAVE_TPETRA_MMM_TIMINGS
209 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaCore"))));
213 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
214 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
215 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
217 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), AnumRows + 1);
218 lno_nnz_view_t entriesC;
219 scalar_view_t valuesC;
221 kh.create_spgemm_handle(alg_enum);
222 kh.set_team_work_size(team_work_size);
224 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,
false,Bmerged.graph.row_map,Bmerged.graph.entries,
false,row_mapC);
226 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
228 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
229 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
231 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,
false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,
false,row_mapC,entriesC,valuesC);
232 kh.destroy_spgemm_handle();
234 #ifdef HAVE_TPETRA_MMM_TIMINGS
235 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaSort"))));
239 if (params.is_null() || params->get(
"sort entries",
true))
240 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
241 C.setAllValues(row_mapC,entriesC,valuesC);
243 #ifdef HAVE_TPETRA_MMM_TIMINGS
244 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaESFC"))));
248 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
249 labelList->set(
"Timer Label",label);
250 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
251 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
252 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
257 template<
class Scalar,
260 class LocalOrdinalViewType>
261 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
262 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
263 const LocalOrdinalViewType & targetMapToOrigRow,
264 const LocalOrdinalViewType & targetMapToImportRow,
265 const LocalOrdinalViewType & Bcol2Ccol,
266 const LocalOrdinalViewType & Icol2Ccol,
267 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
268 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
269 const std::string& label,
270 const Teuchos::RCP<Teuchos::ParameterList>& params) {
273 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
275 #ifdef HAVE_TPETRA_MMM_TIMINGS
276 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
277 using Teuchos::TimeMonitor;
278 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
279 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
287 typedef typename KCRS::StaticCrsGraphType graph_t;
288 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
289 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
290 typedef typename KCRS::values_type::non_const_type scalar_view_t;
293 typedef LocalOrdinal LO;
294 typedef GlobalOrdinal GO;
296 typedef Map<LO,GO,NO> map_type;
297 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
298 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
299 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
302 typename graph_t::execution_space().fence();
305 RCP<const map_type> Ccolmap = C.getColMap();
306 size_t m = Aview.origMatrix->getNodeNumRows();
307 size_t n = Ccolmap->getNodeNumElements();
310 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
311 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
312 const KCRS & Cmat = C.getLocalMatrix();
314 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
315 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
316 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
317 scalar_view_t Cvals = Cmat.values;
319 c_lno_view_t Irowptr;
320 lno_nnz_view_t Icolind;
322 if(!Bview.importMatrix.is_null()) {
323 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
324 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
325 Ivals = Bview.importMatrix->getLocalMatrix().values;
328 #ifdef HAVE_TPETRA_MMM_TIMINGS
329 MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
341 std::vector<size_t> c_status(n, ST_INVALID);
344 size_t CSR_ip = 0, OLD_ip = 0;
345 for (
size_t i = 0; i < m; i++) {
349 CSR_ip = Crowptr[i+1];
350 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
351 c_status[Ccolind[k]] = k;
357 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
359 const SC Aval = Avals[k];
363 if (targetMapToOrigRow[Aik] != LO_INVALID) {
365 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
367 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
369 LO Cij = Bcol2Ccol[Bkj];
371 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
372 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
373 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
375 Cvals[c_status[Cij]] += Aval * Bvals[j];
380 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
381 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
383 LO Cij = Icol2Ccol[Ikj];
385 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
386 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
387 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
389 Cvals[c_status[Cij]] += Aval * Ivals[j];
395 C.fillComplete(C.getDomainMap(), C.getRangeMap());
399 template<
class Scalar,
402 class LocalOrdinalViewType>
403 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
404 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
405 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
406 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
407 const LocalOrdinalViewType & Acol2Brow,
408 const LocalOrdinalViewType & Acol2Irow,
409 const LocalOrdinalViewType & Bcol2Ccol,
410 const LocalOrdinalViewType & Icol2Ccol,
411 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
412 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
413 const std::string& label,
414 const Teuchos::RCP<Teuchos::ParameterList>& params) {
416 #ifdef HAVE_TPETRA_MMM_TIMINGS
417 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
418 using Teuchos::TimeMonitor;
419 Teuchos::RCP<TimeMonitor> MM;
427 std::string myalg(
"MSAK");
428 if(!params.is_null()) {
429 if(params->isParameter(
"cuda: jacobi algorithm"))
430 myalg = params->get(
"cuda: jacobi algorithm",myalg);
433 if(myalg ==
"MSAK") {
434 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
436 else if(myalg ==
"KK") {
437 jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
440 throw std::runtime_error(
"Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
443 #ifdef HAVE_TPETRA_MMM_TIMINGS
444 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
448 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
449 labelList->set(
"Timer Label",label);
450 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
453 if(!C.isFillComplete()) {
454 RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
455 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
463 template<
class Scalar,
466 class LocalOrdinalViewType>
467 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
468 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
469 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
470 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
471 const LocalOrdinalViewType & targetMapToOrigRow,
472 const LocalOrdinalViewType & targetMapToImportRow,
473 const LocalOrdinalViewType & Bcol2Ccol,
474 const LocalOrdinalViewType & Icol2Ccol,
475 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
476 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
477 const std::string& label,
478 const Teuchos::RCP<Teuchos::ParameterList>& params) {
481 typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
483 #ifdef HAVE_TPETRA_MMM_TIMINGS
484 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
485 using Teuchos::TimeMonitor;
486 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore"))));
487 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
494 typedef typename KCRS::StaticCrsGraphType graph_t;
495 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
496 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
497 typedef typename KCRS::values_type::non_const_type scalar_view_t;
498 typedef typename scalar_view_t::memory_space scalar_memory_space;
501 typedef LocalOrdinal LO;
502 typedef GlobalOrdinal GO;
504 typedef Map<LO,GO,NO> map_type;
505 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
506 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
507 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
510 typename graph_t::execution_space().fence();
513 RCP<const map_type> Ccolmap = C.getColMap();
514 size_t m = Aview.origMatrix->getNodeNumRows();
515 size_t n = Ccolmap->getNodeNumElements();
518 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
519 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
520 const KCRS & Cmat = C.getLocalMatrix();
522 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
523 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
524 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
525 scalar_view_t Cvals = Cmat.values;
527 c_lno_view_t Irowptr;
528 lno_nnz_view_t Icolind;
530 if(!Bview.importMatrix.is_null()) {
531 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
532 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
533 Ivals = Bview.importMatrix->getLocalMatrix().values;
537 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
539 #ifdef HAVE_TPETRA_MMM_TIMINGS
540 MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore - Compare"))));
548 std::vector<size_t> c_status(n, ST_INVALID);
551 size_t CSR_ip = 0, OLD_ip = 0;
552 for (
size_t i = 0; i < m; i++) {
557 CSR_ip = Crowptr[i+1];
558 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
559 c_status[Ccolind[k]] = k;
565 SC minusOmegaDval = -omega*Dvals(i,0);
568 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
569 Scalar Bval = Bvals[j];
573 LO Cij = Bcol2Ccol[Bij];
575 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
576 std::runtime_error,
"Trying to insert a new entry into a static graph");
578 Cvals[c_status[Cij]] = Bvals[j];
582 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
584 const SC Aval = Avals[k];
588 if (targetMapToOrigRow[Aik] != LO_INVALID) {
590 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
592 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
594 LO Cij = Bcol2Ccol[Bkj];
596 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
597 std::runtime_error,
"Trying to insert a new entry into a static graph");
599 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
604 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
605 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
607 LO Cij = Icol2Ccol[Ikj];
609 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
610 std::runtime_error,
"Trying to insert a new entry into a static graph");
612 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
618 #ifdef HAVE_TPETRA_MMM_TIMINGS
620 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
623 C.fillComplete(C.getDomainMap(), C.getRangeMap());
628 template<
class Scalar,
631 class LocalOrdinalViewType>
632 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
633 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
634 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
635 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
636 const LocalOrdinalViewType & Acol2Brow,
637 const LocalOrdinalViewType & Acol2Irow,
638 const LocalOrdinalViewType & Bcol2Ccol,
639 const LocalOrdinalViewType & Icol2Ccol,
640 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
641 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
642 const std::string& label,
643 const Teuchos::RCP<Teuchos::ParameterList>& params) {
645 #ifdef HAVE_TPETRA_MMM_TIMINGS
646 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
647 using Teuchos::TimeMonitor;
648 Teuchos::RCP<TimeMonitor> MM;
655 auto rowMap = Aview.origMatrix->getRowMap();
657 Aview.origMatrix->getLocalDiagCopy(diags);
658 size_t diagLength = rowMap->getNodeNumElements();
659 Teuchos::Array<Scalar> diagonal(diagLength);
660 diags.get1dCopy(diagonal());
662 for(
size_t i = 0; i < diagLength; ++i) {
663 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
665 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
666 "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
671 using device_t =
typename Kokkos::Compat::KokkosCudaWrapperNode::device_type;
673 using graph_t =
typename matrix_t::StaticCrsGraphType;
674 using lno_view_t =
typename graph_t::row_map_type::non_const_type;
675 using c_lno_view_t =
typename graph_t::row_map_type::const_type;
676 using lno_nnz_view_t =
typename graph_t::entries_type::non_const_type;
677 using scalar_view_t =
typename matrix_t::values_type::non_const_type;
680 using handle_t =
typename KokkosKernels::Experimental::KokkosKernelsHandle<
681 typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
682 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space >;
685 c_lno_view_t Irowptr;
686 lno_nnz_view_t Icolind;
688 if(!Bview.importMatrix.is_null()) {
689 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
690 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
691 Ivals = Bview.importMatrix->getLocalMatrix().values;
695 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getNodeNumElements());
698 const matrix_t & Amat = Aview.origMatrix->getLocalMatrix();
699 const matrix_t & Bmat = Bview.origMatrix->getLocalMatrix();
701 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
702 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
703 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
705 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmerged.graph.row_map;
706 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmerged.graph.entries;
707 const scalar_view_t Avals = Amat.values, Bvals = Bmerged.values;
710 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), AnumRows + 1);
711 lno_nnz_view_t entriesC;
712 scalar_view_t valuesC;
715 int team_work_size = 16;
716 std::string myalg(
"SPGEMM_KK_MEMORY");
717 if(!params.is_null()) {
718 if(params->isParameter(
"cuda: algorithm"))
719 myalg = params->get(
"cuda: algorithm",myalg);
720 if(params->isParameter(
"cuda: team work size"))
721 team_work_size = params->get(
"cuda: team work size",team_work_size);
725 std::string nodename(
"Cuda");
726 std::string alg = nodename + std::string(
" algorithm");
727 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
728 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
733 kh.create_spgemm_handle(alg_enum);
734 kh.set_team_work_size(team_work_size);
736 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
737 Arowptr, Acolind,
false,
738 Browptr, Bcolind,
false,
741 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
743 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
744 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
747 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
748 Arowptr, Acolind, Avals,
false,
749 Browptr, Bcolind, Bvals,
false,
750 row_mapC, entriesC, valuesC,
751 omega, Dinv.getLocalViewDevice());
752 kh.destroy_spgemm_handle();
754 #ifdef HAVE_TPETRA_MMM_TIMINGS
755 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaSort"))));
759 if (params.is_null() || params->get(
"sort entries",
true))
760 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
761 C.setAllValues(row_mapC,entriesC,valuesC);
763 #ifdef HAVE_TPETRA_MMM_TIMINGS
764 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
768 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
769 labelList->set(
"Timer Label",label);
770 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
771 Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
772 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool debug()
Whether Tpetra is in debug mode.
A distributed dense vector.