10 #ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
13 #include "Tpetra_Details_IntRowPtrHelper.hpp"
15 #ifdef HAVE_TPETRA_INST_CUDA
21 template<
class Scalar,
24 class LocalOrdinalViewType>
25 struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
26 static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
27 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
28 const LocalOrdinalViewType & Acol2Brow,
29 const LocalOrdinalViewType & Acol2Irow,
30 const LocalOrdinalViewType & Bcol2Ccol,
31 const LocalOrdinalViewType & Icol2Ccol,
32 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
33 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
34 const std::string& label = std::string(),
35 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
39 static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
40 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
41 const LocalOrdinalViewType & Acol2Brow,
42 const LocalOrdinalViewType & Acol2Irow,
43 const LocalOrdinalViewType & Bcol2Ccol,
44 const LocalOrdinalViewType & Icol2Ccol,
45 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
46 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
47 const std::string& label = std::string(),
48 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
53 template<
class Scalar,
55 class GlobalOrdinal,
class LocalOrdinalViewType>
56 struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
57 static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
58 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
59 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
60 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
61 const LocalOrdinalViewType & Acol2Brow,
62 const LocalOrdinalViewType & Acol2Irow,
63 const LocalOrdinalViewType & Bcol2Ccol,
64 const LocalOrdinalViewType & Icol2Ccol,
65 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
66 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
67 const std::string& label = std::string(),
68 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
70 static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
71 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
72 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
73 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
74 const LocalOrdinalViewType & Acol2Brow,
75 const LocalOrdinalViewType & Acol2Irow,
76 const LocalOrdinalViewType & Bcol2Ccol,
77 const LocalOrdinalViewType & Icol2Ccol,
78 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
79 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
80 const std::string& label = std::string(),
81 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
83 static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
84 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
85 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
86 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
87 const LocalOrdinalViewType & Acol2Brow,
88 const LocalOrdinalViewType & Acol2Irow,
89 const LocalOrdinalViewType & Bcol2Ccol,
90 const LocalOrdinalViewType & Icol2Ccol,
91 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
92 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
93 const std::string& label = std::string(),
94 const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
100 template<
class Scalar,
103 class LocalOrdinalViewType>
104 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
105 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
106 const LocalOrdinalViewType & Acol2Brow,
107 const LocalOrdinalViewType & Acol2Irow,
108 const LocalOrdinalViewType & Bcol2Ccol,
109 const LocalOrdinalViewType & Icol2Ccol,
110 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
111 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
112 const std::string& label,
113 const Teuchos::RCP<Teuchos::ParameterList>& params) {
116 #ifdef HAVE_TPETRA_MMM_TIMINGS
117 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
118 using Teuchos::TimeMonitor;
119 Teuchos::RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaWrapper")))));
122 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
123 std::string nodename(
"Cuda");
128 typedef typename KCRS::device_type device_t;
129 typedef typename KCRS::StaticCrsGraphType graph_t;
130 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
131 using int_view_t = Kokkos::View<int *, typename lno_view_t::array_layout, typename lno_view_t::memory_space, typename lno_view_t::memory_traits>;
132 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
133 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
134 typedef typename KCRS::values_type::non_const_type scalar_view_t;
138 int team_work_size = 16;
139 std::string myalg(
"SPGEMM_KK_MEMORY");
140 if(!params.is_null()) {
141 if(params->isParameter(
"cuda: algorithm"))
142 myalg = params->get(
"cuda: algorithm",myalg);
143 if(params->isParameter(
"cuda: team work size"))
144 team_work_size = params->get(
"cuda: team work size",team_work_size);
148 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
149 typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
150 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space > KernelHandle;
151 using IntKernelHandle = KokkosKernels::Experimental::KokkosKernelsHandle<
152 typename int_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
153 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space >;
156 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
157 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
159 c_lno_view_t Arowptr = Amat.graph.row_map,
160 Browptr = Bmat.graph.row_map;
161 const lno_nnz_view_t Acolind = Amat.graph.entries,
162 Bcolind = Bmat.graph.entries;
163 const scalar_view_t Avals = Amat.values,
167 std::string alg = nodename+std::string(
" algorithm");
169 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
170 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
173 KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
181 #if defined(KOKKOS_ENABLE_CUDA) \
182 && defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) \
183 && ((CUDA_VERSION < 11000) || (CUDA_VERSION >= 11040))
184 if constexpr (std::is_same_v<typename device_t::execution_space, Kokkos::Cuda>) {
185 if (!KokkosSparse::isCrsGraphSorted(Bmerged.graph.row_map, Bmerged.graph.entries)) {
186 KokkosSparse::sort_crs_matrix(Bmerged);
191 #ifdef HAVE_TPETRA_MMM_TIMINGS
192 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaCore"))));
196 typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
197 typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
198 typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
201 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_lno_row"), AnumRows + 1);
202 lno_nnz_view_t entriesC;
203 scalar_view_t valuesC;
206 const bool useIntRowptrs =
207 irph.shouldUseIntRowptrs() &&
208 Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
212 kh.create_spgemm_handle(alg_enum);
213 kh.set_team_work_size(team_work_size);
215 int_view_t int_row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_int_row"), AnumRows + 1);
217 auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
218 auto Bint = irph.getIntRowptrMatrix(Bmerged);
219 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Aint.graph.row_map,Aint.graph.entries,
false,Bint.graph.row_map,Bint.graph.entries,
false, int_row_mapC);
221 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
223 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
224 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
226 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Aint.graph.row_map,Aint.graph.entries,Aint.values,
false,Bint.graph.row_map,Bint.graph.entries,Bint.values,
false,int_row_mapC,entriesC,valuesC);
228 Kokkos::parallel_for(int_row_mapC.size(), KOKKOS_LAMBDA(
int i){ row_mapC(i) = int_row_mapC(i);});
229 kh.destroy_spgemm_handle();
233 kh.create_spgemm_handle(alg_enum);
234 kh.set_team_work_size(team_work_size);
236 KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,
false,Bmerged.graph.row_map,Bmerged.graph.entries,
false,row_mapC);
238 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
240 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
241 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
244 KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,
false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,
false,row_mapC,entriesC,valuesC);
246 kh.destroy_spgemm_handle();
250 #ifdef HAVE_TPETRA_MMM_TIMINGS
251 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaSort"))));
255 if (params.is_null() || params->get(
"sort entries",
true))
256 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
257 C.setAllValues(row_mapC,entriesC,valuesC);
259 #ifdef HAVE_TPETRA_MMM_TIMINGS
260 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaESFC"))));
264 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
265 labelList->set(
"Timer Label",label);
266 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
267 RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
268 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
273 template<
class Scalar,
276 class LocalOrdinalViewType>
277 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
278 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
279 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
280 const LocalOrdinalViewType & targetMapToOrigRow_dev,
281 const LocalOrdinalViewType & targetMapToImportRow_dev,
282 const LocalOrdinalViewType & Bcol2Ccol_dev,
283 const LocalOrdinalViewType & Icol2Ccol_dev,
284 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
285 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
286 const std::string& label,
287 const Teuchos::RCP<Teuchos::ParameterList>& params) {
290 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
292 #ifdef HAVE_TPETRA_MMM_TIMINGS
293 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
294 using Teuchos::TimeMonitor;
295 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
296 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
303 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
304 typedef typename KCRS::StaticCrsGraphType graph_t;
305 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
306 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
307 typedef typename KCRS::values_type::non_const_type scalar_view_t;
310 typedef LocalOrdinal LO;
311 typedef GlobalOrdinal GO;
313 typedef Map<LO,GO,NO> map_type;
314 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
315 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
316 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
324 auto targetMapToOrigRow =
325 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
326 targetMapToOrigRow_dev);
327 auto targetMapToImportRow =
328 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
329 targetMapToImportRow_dev);
331 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
334 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
338 RCP<const map_type> Ccolmap = C.getColMap();
339 size_t m = Aview.origMatrix->getLocalNumRows();
340 size_t n = Ccolmap->getLocalNumElements();
343 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
344 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
345 const KCRS & Cmat = C.getLocalMatrixHost();
347 c_lno_view_t Arowptr = Amat.graph.row_map,
348 Browptr = Bmat.graph.row_map,
349 Crowptr = Cmat.graph.row_map;
350 const lno_nnz_view_t Acolind = Amat.graph.entries,
351 Bcolind = Bmat.graph.entries,
352 Ccolind = Cmat.graph.entries;
353 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
354 scalar_view_t Cvals = Cmat.values;
356 c_lno_view_t Irowptr;
357 lno_nnz_view_t Icolind;
359 if(!Bview.importMatrix.is_null()) {
360 auto lclB = Bview.importMatrix->getLocalMatrixHost();
361 Irowptr = lclB.graph.row_map;
362 Icolind = lclB.graph.entries;
366 #ifdef HAVE_TPETRA_MMM_TIMINGS
367 MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
379 std::vector<size_t> c_status(n, ST_INVALID);
382 size_t CSR_ip = 0, OLD_ip = 0;
383 for (
size_t i = 0; i < m; i++) {
387 CSR_ip = Crowptr[i+1];
388 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
389 c_status[Ccolind[k]] = k;
395 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
397 const SC Aval = Avals[k];
401 if (targetMapToOrigRow[Aik] != LO_INVALID) {
403 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
405 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
407 LO Cij = Bcol2Ccol[Bkj];
409 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
410 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
411 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
413 Cvals[c_status[Cij]] += Aval * Bvals[j];
418 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
419 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
421 LO Cij = Icol2Ccol[Ikj];
423 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
424 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
425 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
427 Cvals[c_status[Cij]] += Aval * Ivals[j];
433 C.fillComplete(C.getDomainMap(), C.getRangeMap());
437 template<
class Scalar,
440 class LocalOrdinalViewType>
441 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
442 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
443 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
444 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
445 const LocalOrdinalViewType & Acol2Brow,
446 const LocalOrdinalViewType & Acol2Irow,
447 const LocalOrdinalViewType & Bcol2Ccol,
448 const LocalOrdinalViewType & Icol2Ccol,
449 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
450 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
451 const std::string& label,
452 const Teuchos::RCP<Teuchos::ParameterList>& params) {
454 #ifdef HAVE_TPETRA_MMM_TIMINGS
455 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
456 using Teuchos::TimeMonitor;
457 Teuchos::RCP<TimeMonitor> MM;
465 std::string myalg(
"KK");
466 if(!params.is_null()) {
467 if(params->isParameter(
"cuda: jacobi algorithm"))
468 myalg = params->get(
"cuda: jacobi algorithm",myalg);
471 if(myalg ==
"MSAK") {
472 ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
474 else if(myalg ==
"KK") {
475 jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
478 throw std::runtime_error(
"Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
481 #ifdef HAVE_TPETRA_MMM_TIMINGS
482 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
486 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
487 labelList->set(
"Timer Label",label);
488 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
491 if(!C.isFillComplete()) {
492 RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
493 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
501 template<
class Scalar,
504 class LocalOrdinalViewType>
505 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
506 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
507 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
508 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
509 const LocalOrdinalViewType & targetMapToOrigRow_dev,
510 const LocalOrdinalViewType & targetMapToImportRow_dev,
511 const LocalOrdinalViewType & Bcol2Ccol_dev,
512 const LocalOrdinalViewType & Icol2Ccol_dev,
513 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
514 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
515 const std::string& label,
516 const Teuchos::RCP<Teuchos::ParameterList>& params) {
519 typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
521 #ifdef HAVE_TPETRA_MMM_TIMINGS
522 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
523 using Teuchos::TimeMonitor;
524 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore"))));
525 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
531 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
532 typedef typename KCRS::StaticCrsGraphType graph_t;
533 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
534 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
535 typedef typename KCRS::values_type::non_const_type scalar_view_t;
536 typedef typename scalar_view_t::memory_space scalar_memory_space;
539 typedef LocalOrdinal LO;
540 typedef GlobalOrdinal GO;
542 typedef Map<LO,GO,NO> map_type;
543 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
544 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
545 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
553 auto targetMapToOrigRow =
554 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
555 targetMapToOrigRow_dev);
556 auto targetMapToImportRow =
557 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
558 targetMapToImportRow_dev);
560 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
563 Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
568 RCP<const map_type> Ccolmap = C.getColMap();
569 size_t m = Aview.origMatrix->getLocalNumRows();
570 size_t n = Ccolmap->getLocalNumElements();
573 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
574 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
575 const KCRS & Cmat = C.getLocalMatrixHost();
577 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
578 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
579 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
580 scalar_view_t Cvals = Cmat.values;
582 c_lno_view_t Irowptr;
583 lno_nnz_view_t Icolind;
585 if(!Bview.importMatrix.is_null()) {
586 auto lclB = Bview.importMatrix->getLocalMatrixHost();
587 Irowptr = lclB.graph.row_map;
588 Icolind = lclB.graph.entries;
594 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
596 #ifdef HAVE_TPETRA_MMM_TIMINGS
597 MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore - Compare"))));
605 std::vector<size_t> c_status(n, ST_INVALID);
608 size_t CSR_ip = 0, OLD_ip = 0;
609 for (
size_t i = 0; i < m; i++) {
614 CSR_ip = Crowptr[i+1];
615 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
616 c_status[Ccolind[k]] = k;
622 SC minusOmegaDval = -omega*Dvals(i,0);
625 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
626 Scalar Bval = Bvals[j];
630 LO Cij = Bcol2Ccol[Bij];
632 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
633 std::runtime_error,
"Trying to insert a new entry into a static graph");
635 Cvals[c_status[Cij]] = Bvals[j];
639 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
641 const SC Aval = Avals[k];
645 if (targetMapToOrigRow[Aik] != LO_INVALID) {
647 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
649 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
651 LO Cij = Bcol2Ccol[Bkj];
653 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
654 std::runtime_error,
"Trying to insert a new entry into a static graph");
656 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
661 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
662 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
664 LO Cij = Icol2Ccol[Ikj];
666 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
667 std::runtime_error,
"Trying to insert a new entry into a static graph");
669 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
675 #ifdef HAVE_TPETRA_MMM_TIMINGS
677 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
680 C.fillComplete(C.getDomainMap(), C.getRangeMap());
685 template<
class Scalar,
688 class LocalOrdinalViewType>
689 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
690 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
691 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
692 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
693 const LocalOrdinalViewType & Acol2Brow,
694 const LocalOrdinalViewType & Acol2Irow,
695 const LocalOrdinalViewType & Bcol2Ccol,
696 const LocalOrdinalViewType & Icol2Ccol,
697 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
698 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
699 const std::string& label,
700 const Teuchos::RCP<Teuchos::ParameterList>& params) {
702 #ifdef HAVE_TPETRA_MMM_TIMINGS
703 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
704 using Teuchos::TimeMonitor;
705 Teuchos::RCP<TimeMonitor> MM;
712 auto rowMap = Aview.origMatrix->getRowMap();
714 Aview.origMatrix->getLocalDiagCopy(diags);
715 size_t diagLength = rowMap->getLocalNumElements();
716 Teuchos::Array<Scalar> diagonal(diagLength);
717 diags.get1dCopy(diagonal());
719 for(
size_t i = 0; i < diagLength; ++i) {
720 TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
722 "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
723 "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
728 using device_t =
typename Tpetra::KokkosCompat::KokkosCudaWrapperNode::device_type;
730 using graph_t =
typename matrix_t::StaticCrsGraphType;
731 using lno_view_t =
typename graph_t::row_map_type::non_const_type;
732 using int_view_t = Kokkos::View<
int *,
733 typename lno_view_t::array_layout,
734 typename lno_view_t::memory_space,
735 typename lno_view_t::memory_traits>;
736 using c_lno_view_t =
typename graph_t::row_map_type::const_type;
737 using lno_nnz_view_t =
typename graph_t::entries_type::non_const_type;
738 using scalar_view_t =
typename matrix_t::values_type::non_const_type;
741 using handle_t =
typename KokkosKernels::Experimental::KokkosKernelsHandle<
742 typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
743 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space >;
745 using int_handle_t =
typename KokkosKernels::Experimental::KokkosKernelsHandle<
746 typename int_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type,
typename scalar_view_t::const_value_type,
747 typename device_t::execution_space,
typename device_t::memory_space,
typename device_t::memory_space >;
750 const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
753 const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
754 const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
756 typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
757 typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
758 typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
761 lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"row_mapC"), AnumRows + 1);
762 lno_nnz_view_t entriesC;
763 scalar_view_t valuesC;
766 int team_work_size = 16;
767 std::string myalg(
"SPGEMM_KK_MEMORY");
768 if(!params.is_null()) {
769 if(params->isParameter(
"cuda: algorithm"))
770 myalg = params->get(
"cuda: algorithm",myalg);
771 if(params->isParameter(
"cuda: team work size"))
772 team_work_size = params->get(
"cuda: team work size",team_work_size);
776 std::string alg(
"Cuda algorithm");
777 if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
778 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
782 const bool useIntRowptrs =
783 irph.shouldUseIntRowptrs() &&
784 Aview.origMatrix->getApplyHelper()->shouldUseIntRowptrs();
788 kh.create_spgemm_handle(alg_enum);
789 kh.set_team_work_size(team_work_size);
791 int_view_t int_row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"int_row_mapC"), AnumRows + 1);
793 auto Aint = Aview.origMatrix->getApplyHelper()->getIntRowptrMatrix(Amat);
794 auto Bint = irph.getIntRowptrMatrix(Bmerged);
796 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
797 Aint.graph.row_map, Aint.graph.entries,
false,
798 Bint.graph.row_map, Bint.graph.entries,
false,
801 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
803 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
804 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
809 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
810 Aint.graph.row_map, Aint.graph.entries, Amat.values,
false,
811 Bint.graph.row_map, Bint.graph.entries, Bint.values,
false,
812 int_row_mapC, entriesC, valuesC,
813 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
815 Kokkos::parallel_for(int_row_mapC.size(), KOKKOS_LAMBDA(
int i){ row_mapC(i) = int_row_mapC(i);});
816 kh.destroy_spgemm_handle();
819 kh.create_spgemm_handle(alg_enum);
820 kh.set_team_work_size(team_work_size);
822 KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
823 Amat.graph.row_map, Amat.graph.entries,
false,
824 Bmerged.graph.row_map, Bmerged.graph.entries,
false,
827 size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
829 entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
830 valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
833 KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
834 Amat.graph.row_map, Amat.graph.entries, Amat.values,
false,
835 Bmerged.graph.row_map, Bmerged.graph.entries, Bmerged.values,
false,
836 row_mapC, entriesC, valuesC,
837 omega, Dinv.getLocalViewDevice(Access::ReadOnly));
838 kh.destroy_spgemm_handle();
843 #ifdef HAVE_TPETRA_MMM_TIMINGS
844 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaSort"))));
848 if (params.is_null() || params->get(
"sort entries",
true))
849 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
850 C.setAllValues(row_mapC,entriesC,valuesC);
852 #ifdef HAVE_TPETRA_MMM_TIMINGS
853 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
857 Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
858 labelList->set(
"Timer Label",label);
859 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
860 Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
861 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
static bool debug()
Whether Tpetra is in debug mode.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
A distributed dense vector.