42 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
44 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
45 #include "Tpetra_RowMatrixTransposer.hpp"
49 namespace MatrixMatrix{
51 namespace ExtraKernels{
55 template<
class CrsMatrixType>
56 size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
58 size_t Aest = 100, Best=100;
59 if (A.getNodeNumEntries() > 0)
60 Aest = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
61 if (B.getNodeNumEntries() > 0)
62 Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
64 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
65 nnzperrow *= nnzperrow;
72 template<
class CrsMatrixType>
73 size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
74 size_t nnzPerRowA = 100, Pcols = 100;
75 if (A.getNodeNumEntries() > 0)
76 nnzPerRowA = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 9;
77 if (P.getNodeNumEntries() > 0)
78 Pcols = (P.getNodeNumCols() > 0) ? P.getNodeNumCols() : 100;
79 return (
size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
82 #if defined (HAVE_TPETRA_INST_OPENMP)
84 template<
class Scalar,
87 class LocalOrdinalViewType>
88 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
89 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
90 const LocalOrdinalViewType & targetMapToOrigRow,
91 const LocalOrdinalViewType & targetMapToImportRow,
92 const LocalOrdinalViewType & Bcol2Ccol,
93 const LocalOrdinalViewType & Icol2Ccol,
94 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
95 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
96 const std::string& label,
97 const Teuchos::RCP<Teuchos::ParameterList>& params) {
98 #ifdef HAVE_TPETRA_MMM_TIMINGS
99 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
100 using Teuchos::TimeMonitor;
101 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix LTGCore"))));
103 using Teuchos::Array;
104 using Teuchos::ArrayRCP;
105 using Teuchos::ArrayView;
113 typedef typename KCRS::StaticCrsGraphType graph_t;
114 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
115 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
116 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
117 typedef typename KCRS::values_type::non_const_type scalar_view_t;
121 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
122 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
125 typedef LocalOrdinal LO;
126 typedef GlobalOrdinal GO;
127 typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
128 typedef Map<LO,GO,NO> map_type;
133 typedef NO::execution_space execution_space;
134 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
137 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
138 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
139 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
142 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
143 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
145 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
146 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
147 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
148 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
150 c_lno_view_t Irowptr;
151 lno_nnz_view_t Icolind;
153 if(!Bview.importMatrix.is_null()) {
154 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
155 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
156 Ivals = Bview.importMatrix->getLocalMatrix().values;
157 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
161 RCP<const map_type> Ccolmap = C.getColMap();
162 size_t m = Aview.origMatrix->getNodeNumRows();
163 size_t n = Ccolmap->getNodeNumElements();
164 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
167 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
168 if(!params.is_null()) {
169 if(params->isParameter(
"openmp: ltg thread max"))
170 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
174 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
176 lno_nnz_view_t entriesC;
177 scalar_view_t valuesC;
181 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
184 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind",thread_max);
185 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values",thread_max);
187 double thread_chunk = (double)(m) / thread_max;
190 Kokkos::parallel_for(
"MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
193 size_t my_thread_start = tid * thread_chunk;
194 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
195 size_t my_thread_m = my_thread_stop - my_thread_start;
198 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
201 std::vector<size_t> c_status(n,INVALID);
203 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
204 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
207 size_t CSR_ip = 0, OLD_ip = 0;
208 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
212 row_mapC(i) = CSR_ip;
215 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
217 const SC Aval = Avals(k);
221 if (targetMapToOrigRow(Aik) != LO_INVALID) {
228 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
231 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
233 LO Cij = Bcol2Ccol(Bkj);
235 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
237 c_status[Cij] = CSR_ip;
238 Ccolind(CSR_ip) = Cij;
239 Cvals(CSR_ip) = Aval*Bvals(j);
243 Cvals(c_status[Cij]) += Aval*Bvals(j);
254 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
255 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
257 LO Cij = Icol2Ccol(Ikj);
259 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
261 c_status[Cij] = CSR_ip;
262 Ccolind(CSR_ip) = Cij;
263 Cvals(CSR_ip) = Aval*Ivals(j);
267 Cvals(c_status[Cij]) += Aval*Ivals(j);
274 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
276 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
277 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
281 thread_total_nnz(tid) = CSR_ip;
282 tl_colind(tid) = Ccolind;
283 tl_values(tid) = Cvals;
287 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
289 #ifdef HAVE_TPETRA_MMM_TIMINGS
290 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix OpenMPSort"))));
293 if (params.is_null() || params->get(
"sort entries",
true))
294 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
295 C.setAllValues(row_mapC,entriesC,valuesC);
300 template<
class Scalar,
303 class LocalOrdinalViewType>
304 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
305 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
306 const LocalOrdinalViewType & targetMapToOrigRow,
307 const LocalOrdinalViewType & targetMapToImportRow,
308 const LocalOrdinalViewType & Bcol2Ccol,
309 const LocalOrdinalViewType & Icol2Ccol,
310 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
311 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
312 const std::string& label,
313 const Teuchos::RCP<Teuchos::ParameterList>& params) {
314 #ifdef HAVE_TPETRA_MMM_TIMINGS
315 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
316 using Teuchos::TimeMonitor;
317 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse LTGCore"))));
320 using Teuchos::Array;
321 using Teuchos::ArrayRCP;
322 using Teuchos::ArrayView;
329 typedef typename KCRS::StaticCrsGraphType graph_t;
330 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
331 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
332 typedef typename KCRS::values_type::non_const_type scalar_view_t;
335 typedef LocalOrdinal LO;
336 typedef GlobalOrdinal GO;
337 typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
338 typedef Map<LO,GO,NO> map_type;
343 typedef NO::execution_space execution_space;
344 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
347 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
348 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
349 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
352 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
353 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
354 const KCRS & Cmat = C.getLocalMatrix();
356 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
357 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
358 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
359 scalar_view_t Cvals = Cmat.values;
361 c_lno_view_t Irowptr;
362 c_lno_nnz_view_t Icolind;
364 if(!Bview.importMatrix.is_null()) {
365 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
366 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
367 Ivals = Bview.importMatrix->getLocalMatrix().values;
371 RCP<const map_type> Ccolmap = C.getColMap();
372 size_t m = Aview.origMatrix->getNodeNumRows();
373 size_t n = Ccolmap->getNodeNumElements();
376 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
377 if(!params.is_null()) {
378 if(params->isParameter(
"openmp: ltg thread max"))
379 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
382 double thread_chunk = (double)(m) / thread_max;
385 Kokkos::parallel_for(
"MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
388 size_t my_thread_start = tid * thread_chunk;
389 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
392 std::vector<size_t> c_status(n,INVALID);
395 size_t CSR_ip = 0, OLD_ip = 0;
396 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
400 CSR_ip = Crowptr(i+1);
401 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
402 c_status[Ccolind(k)] = k;
407 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
409 const SC Aval = Avals(k);
413 if (targetMapToOrigRow(Aik) != LO_INVALID) {
415 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
417 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
419 LO Cij = Bcol2Ccol(Bkj);
421 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
422 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
423 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
425 Cvals(c_status[Cij]) += Aval * Bvals(j);
429 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
430 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
432 LO Cij = Icol2Ccol(Ikj);
434 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
435 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
436 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
438 Cvals(c_status[Cij]) += Aval * Ivals(j);
449 template<
class Scalar,
452 class LocalOrdinalViewType>
453 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
454 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
455 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
456 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
457 const LocalOrdinalViewType & targetMapToOrigRow,
458 const LocalOrdinalViewType & targetMapToImportRow,
459 const LocalOrdinalViewType & Bcol2Ccol,
460 const LocalOrdinalViewType & Icol2Ccol,
461 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
462 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
463 const std::string& label,
464 const Teuchos::RCP<Teuchos::ParameterList>& params) {
465 #ifdef HAVE_TPETRA_MMM_TIMINGS
466 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
467 using Teuchos::TimeMonitor;
468 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix LTGCore"))));
471 using Teuchos::Array;
472 using Teuchos::ArrayRCP;
473 using Teuchos::ArrayView;
478 typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
481 typedef typename KCRS::StaticCrsGraphType graph_t;
482 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
483 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
484 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
485 typedef typename KCRS::values_type::non_const_type scalar_view_t;
489 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
490 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
493 typedef typename scalar_view_t::memory_space scalar_memory_space;
496 typedef LocalOrdinal LO;
497 typedef GlobalOrdinal GO;
499 typedef Map<LO,GO,NO> map_type;
504 typedef NO::execution_space execution_space;
505 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
508 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
509 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
510 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
513 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
514 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
516 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
517 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
518 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
519 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
521 c_lno_view_t Irowptr;
522 lno_nnz_view_t Icolind;
524 if(!Bview.importMatrix.is_null()) {
525 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
526 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
527 Ivals = Bview.importMatrix->getLocalMatrix().values;
528 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
532 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
535 RCP<const map_type> Ccolmap = C.getColMap();
536 size_t m = Aview.origMatrix->getNodeNumRows();
537 size_t n = Ccolmap->getNodeNumElements();
538 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
541 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
542 if(!params.is_null()) {
543 if(params->isParameter(
"openmp: ltg thread max"))
544 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
548 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
550 lno_nnz_view_t entriesC;
551 scalar_view_t valuesC;
555 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
558 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind",thread_max);
559 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values",thread_max);
561 double thread_chunk = (double)(m) / thread_max;
564 Kokkos::parallel_for(
"Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
567 size_t my_thread_start = tid * thread_chunk;
568 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
569 size_t my_thread_m = my_thread_stop - my_thread_start;
572 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
575 std::vector<size_t> c_status(n,INVALID);
577 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
578 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
581 size_t CSR_ip = 0, OLD_ip = 0;
582 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
587 row_mapC(i) = CSR_ip;
589 SC minusOmegaDval = -omega*Dvals(i,0);
592 for (
size_t j = Browptr(i); j < Browptr(i+1); j++) {
593 const SC Bval = Bvals(j);
597 LO Cij = Bcol2Ccol(Bij);
600 c_status[Cij] = CSR_ip;
601 Ccolind(CSR_ip) = Cij;
602 Cvals(CSR_ip) = Bvals[j];
608 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
610 const SC Aval = Avals(k);
614 if (targetMapToOrigRow(Aik) != LO_INVALID) {
621 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
624 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
626 LO Cij = Bcol2Ccol(Bkj);
628 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
630 c_status[Cij] = CSR_ip;
631 Ccolind(CSR_ip) = Cij;
632 Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
636 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
647 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
648 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
650 LO Cij = Icol2Ccol(Ikj);
652 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
654 c_status[Cij] = CSR_ip;
655 Ccolind(CSR_ip) = Cij;
656 Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
660 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
667 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
669 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
670 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
674 thread_total_nnz(tid) = CSR_ip;
675 tl_colind(tid) = Ccolind;
676 tl_values(tid) = Cvals;
681 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
684 #ifdef HAVE_TPETRA_MMM_TIMINGS
685 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix OpenMPSort"))));
688 if (params.is_null() || params->get(
"sort entries",
true))
689 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
690 C.setAllValues(row_mapC,entriesC,valuesC);
697 template<
class Scalar,
700 class LocalOrdinalViewType>
701 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
702 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
703 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
704 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
705 const LocalOrdinalViewType & targetMapToOrigRow,
706 const LocalOrdinalViewType & targetMapToImportRow,
707 const LocalOrdinalViewType & Bcol2Ccol,
708 const LocalOrdinalViewType & Icol2Ccol,
709 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
710 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
711 const std::string& label,
712 const Teuchos::RCP<Teuchos::ParameterList>& params) {
713 #ifdef HAVE_TPETRA_MMM_TIMINGS
714 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
715 using Teuchos::TimeMonitor;
716 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse LTGCore"))));
718 using Teuchos::Array;
719 using Teuchos::ArrayRCP;
720 using Teuchos::ArrayView;
725 typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
728 typedef typename KCRS::StaticCrsGraphType graph_t;
729 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
730 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
731 typedef typename KCRS::values_type::non_const_type scalar_view_t;
734 typedef typename scalar_view_t::memory_space scalar_memory_space;
737 typedef LocalOrdinal LO;
738 typedef GlobalOrdinal GO;
740 typedef Map<LO,GO,NO> map_type;
745 typedef NO::execution_space execution_space;
746 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
749 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
750 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
751 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
754 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
755 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
756 const KCRS & Cmat = C.getLocalMatrix();
758 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
759 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
760 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
761 scalar_view_t Cvals = Cmat.values;
763 c_lno_view_t Irowptr;
764 c_lno_nnz_view_t Icolind;
766 if(!Bview.importMatrix.is_null()) {
767 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
768 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
769 Ivals = Bview.importMatrix->getLocalMatrix().values;
773 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
776 RCP<const map_type> Ccolmap = C.getColMap();
777 size_t m = Aview.origMatrix->getNodeNumRows();
778 size_t n = Ccolmap->getNodeNumElements();
781 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
782 if(!params.is_null()) {
783 if(params->isParameter(
"openmp: ltg thread max"))
784 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
787 double thread_chunk = (double)(m) / thread_max;
790 Kokkos::parallel_for(
"Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
793 size_t my_thread_start = tid * thread_chunk;
794 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
797 std::vector<size_t> c_status(n,INVALID);
800 size_t CSR_ip = 0, OLD_ip = 0;
801 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
805 CSR_ip = Crowptr(i+1);
807 SC minusOmegaDval = -omega*Dvals(i,0);
809 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
810 c_status[Ccolind(k)] = k;
816 for (
size_t j = Browptr(i); j < Browptr(i+1); j++) {
817 const SC Bval = Bvals(j);
821 LO Cij = Bcol2Ccol(Bij);
824 Cvals(c_status[Cij]) += Bvals(j);
829 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
831 const SC Aval = Avals(k);
835 if (targetMapToOrigRow(Aik) != LO_INVALID) {
837 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
839 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
841 LO Cij = Bcol2Ccol(Bkj);
843 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
844 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
845 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
847 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
851 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
852 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
854 LO Cij = Icol2Ccol(Ikj);
856 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
857 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
858 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
860 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
872 template<
class InColindArrayType,
class InValsArrayType,
873 class OutRowptrType,
class OutColindType,
class OutValsType>
874 void copy_out_from_thread_memory(
const OutColindType& thread_total_nnz,
875 const InColindArrayType& Incolind,
876 const InValsArrayType& Invalues,
878 const double thread_chunk,
879 OutRowptrType& row_mapC,
880 OutColindType& entriesC,
881 OutValsType& valuesC ) {
882 typedef OutRowptrType lno_view_t;
883 typedef OutColindType lno_nnz_view_t;
884 typedef OutValsType scalar_view_t;
885 typedef typename lno_view_t::execution_space execution_space;
886 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
889 size_t thread_max = Incolind.size();
895 lno_view_t thread_start_nnz(
"thread_nnz",thread_max+1);
896 Kokkos::parallel_scan(
"LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (
const size_t i,
size_t& update,
const bool final) {
897 size_t mynnz = thread_total_nnz(i);
898 if(
final) thread_start_nnz(i) = update;
900 if(
final && i+1==thread_max) thread_start_nnz(i+1)=update;
902 c_nnz_size = thread_start_nnz(thread_max);
905 row_mapC(m) = thread_start_nnz(thread_max);
908 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size); entriesC = entriesC_;
909 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size); valuesC = valuesC_;
912 Kokkos::parallel_for(
"LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid) {
913 const size_t my_thread_start = tid * thread_chunk;
914 const size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
915 const size_t nnz_thread_start = thread_start_nnz(tid);
917 const size_t nnz_thread_stop = thread_start_nnz(tid+1);
928 if (my_thread_start != 0 ) {
929 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
930 row_mapC(i) += nnz_thread_start;
947 const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
948 for (
size_t i = 0; i < my_num_nnz; ++i) {
949 entriesC(nnz_thread_start + i) = Incolind(tid)(i);
950 valuesC(nnz_thread_start + i) = Invalues(tid)(i);
955 if(Incolind(tid).data()) free(Incolind(tid).data());
956 if(Invalues(tid).data()) free(Invalues(tid).data());
964 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
965 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
966 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
967 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
968 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
969 const LocalOrdinalViewType & Acol2Brow,
970 const LocalOrdinalViewType & Acol2Irow,
971 const LocalOrdinalViewType & Bcol2Ccol,
972 const LocalOrdinalViewType & Icol2Ccol,
973 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
974 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
975 const std::string& label,
976 const Teuchos::RCP<Teuchos::ParameterList>& params) {
977 #ifdef HAVE_TPETRA_MMM_TIMINGS
978 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
979 using Teuchos::TimeMonitor;
980 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK"))));
981 Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Multiply"))));
984 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
989 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(
new Matrix_t(C.getRowMap(),0));
990 Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(
" MSAK"),params);
992 #ifdef HAVE_TPETRA_MMM_TIMINGS
993 MM2=Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Scale"))));
999 #ifdef HAVE_TPETRA_MMM_TIMINGS
1000 MM2=Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Add"))));
1004 Teuchos::ParameterList jparams;
1005 if(!params.is_null()) {
1007 jparams.set(
"label",label+std::string(
" MSAK Add"));
1009 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
1010 Tpetra::MatrixMatrix::add(one,
false,*Bview.origMatrix,Scalar(-omega),
false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,
false));
1011 #ifdef HAVE_TPETRA_MMM_TIMINGS
1018 #if defined (HAVE_TPETRA_INST_OPENMP)
1020 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
1021 static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
1022 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1023 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
1024 const LocalOrdinalViewType & Acol2PRow,
1025 const LocalOrdinalViewType & Acol2PRowImport,
1026 const LocalOrdinalViewType & Pcol2Accol,
1027 const LocalOrdinalViewType & PIcol2Accol,
1028 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
1029 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
1030 const std::string& label,
1031 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1034 using Tpetra::MatrixMatrix::UnmanagedView;
1035 #ifdef HAVE_TPETRA_MMM_TIMINGS
1036 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1038 using Teuchos::TimeMonitor;
1039 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix LTGCore"))));
1042 typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1044 typedef LocalOrdinal LO;
1045 typedef GlobalOrdinal GO;
1047 typedef Map<LO,GO,NO> map_type;
1049 typedef typename KCRS::StaticCrsGraphType graph_t;
1050 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1051 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1052 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1053 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1054 typedef typename KCRS::device_type device_t;
1055 typedef typename device_t::execution_space execution_space;
1056 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1059 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1060 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1061 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1063 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1064 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1067 RCP<const map_type> Accolmap = Ac.getColMap();
1068 size_t m = Rview.origMatrix->getNodeNumRows();
1069 size_t n = Accolmap->getNodeNumElements();
1072 const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
1073 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1074 const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
1076 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Irowptr;
1077 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries;
1078 lno_nnz_view_t Icolind;
1079 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1080 scalar_view_t Ivals;
1082 if (!Pview.importMatrix.is_null())
1084 const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
1085 Irowptr = Imat.graph.row_map;
1086 Icolind = Imat.graph.entries;
1087 Ivals = Imat.values;
1098 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1099 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1102 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
1103 if(!params.is_null()) {
1104 if(params->isParameter(
"openmp: ltg thread max"))
1105 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
1108 double thread_chunk = (double)(m) / thread_max;
1111 lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
1113 lno_nnz_view_t entriesAc;
1114 scalar_view_t valuesAc;
1118 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
1128 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind", thread_max);
1129 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values", thread_max);
1132 Kokkos::parallel_for(
"MMM::RAP::NewMatrix::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
1135 size_t my_thread_start = tid * thread_chunk;
1136 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1137 size_t my_thread_m = my_thread_stop - my_thread_start;
1139 size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
1141 std::vector<size_t> ac_status(n, INVALID);
1144 u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
1145 u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1146 u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1149 size_t nnz = 0, nnz_old = 0;
1151 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1155 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1157 const SC Rik = Rvals(kk);
1161 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1163 const SC Akl = Avals(ll);
1167 if (Acol2PRow[l] != LO_INVALID) {
1174 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1177 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1179 LO Acj = Pcol2Accol(j);
1182 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1183 #ifdef HAVE_TPETRA_DEBUG
1185 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1187 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1190 ac_status[Acj] = nnz;
1191 Accolind(nnz) = Acj;
1192 Acvals(nnz) = Rik*Akl*Plj;
1195 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1205 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1206 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1208 LO Acj = PIcol2Accol(j);
1211 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1212 #ifdef HAVE_TPETRA_DEBUG
1214 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1216 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1219 ac_status[Acj] = nnz;
1220 Accolind(nnz) = Acj;
1221 Acvals(nnz) = Rik*Akl*Plj;
1224 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1231 if (nnz + n > nnzAllocated) {
1233 Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1234 Acvals = u_scalar_view_t((
typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1238 thread_total_nnz(tid) = nnz;
1239 tl_colind(tid) = Accolind;
1240 tl_values(tid) = Acvals;
1242 #ifdef HAVE_TPETRA_MMM_TIMINGS
1243 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix copy from thread local"))));
1246 copy_out_from_thread_memory(thread_total_nnz,tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1248 #ifdef HAVE_TPETRA_MMM_TIMINGS
1249 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
1253 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1255 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1257 #ifdef HAVE_TPETRA_MMM_TIMINGS
1258 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
1269 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1270 labelList->set(
"Timer Label",label);
1271 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1272 RCP<const Export<LO,GO,NO> > dummyExport;
1273 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1274 Rview.origMatrix->getRangeMap(),
1283 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
1284 static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
1285 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1286 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
1287 const LocalOrdinalViewType & Acol2PRow,
1288 const LocalOrdinalViewType & Acol2PRowImport,
1289 const LocalOrdinalViewType & Pcol2Accol,
1290 const LocalOrdinalViewType & PIcol2Accol,
1291 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
1292 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
1293 const std::string& label,
1294 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1297 using Tpetra::MatrixMatrix::UnmanagedView;
1298 #ifdef HAVE_TPETRA_MMM_TIMINGS
1299 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1300 using Teuchos::TimeMonitor;
1302 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse LTGCore"))));
1305 typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1307 typedef LocalOrdinal LO;
1308 typedef GlobalOrdinal GO;
1310 typedef Map<LO,GO,NO> map_type;
1312 typedef typename KCRS::StaticCrsGraphType graph_t;
1313 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1314 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1315 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1316 typedef typename KCRS::device_type device_t;
1317 typedef typename device_t::execution_space execution_space;
1318 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1320 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1321 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1322 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1325 RCP<const map_type> Accolmap = Ac.getColMap();
1326 size_t m = Rview.origMatrix->getNodeNumRows();
1327 size_t n = Accolmap->getNodeNumElements();
1330 const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
1331 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1332 const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
1333 const KCRS & Cmat = Ac.getLocalMatrix();
1335 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1336 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1337 lno_nnz_view_t Icolind;
1338 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1339 scalar_view_t Cvals = Cmat.values;
1340 scalar_view_t Ivals;
1342 if (!Pview.importMatrix.is_null())
1344 const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
1345 Irowptr = Imat.graph.row_map;
1346 Icolind = Imat.graph.entries;
1347 Ivals = Imat.values;
1351 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
1352 if(!params.is_null()) {
1353 if(params->isParameter(
"openmp: ltg thread max"))
1354 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
1357 double thread_chunk = (double)(m) / thread_max;
1360 Kokkos::parallel_for(
"MMM::RAP::Reuse::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
1363 size_t my_thread_start = tid * thread_chunk;
1364 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1366 std::vector<size_t> c_status(n, INVALID);
1369 size_t OLD_ip = 0, CSR_ip = 0;
1371 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1374 OLD_ip = Crowptr(i);
1375 CSR_ip = Crowptr(i+1);
1376 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
1377 c_status[Ccolind(k)] = k;
1383 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1385 const SC Rik = Rvals(kk);
1389 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1391 const SC Akl = Avals(ll);
1395 if (Acol2PRow[l] != LO_INVALID) {
1402 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1405 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1407 LO Cij = Pcol2Accol(j);
1409 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1410 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1411 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1414 Cvals(c_status[Cij]) += Rik*Akl*Plj;
1423 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1424 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1426 LO Cij = PIcol2Accol(j);
1428 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1429 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1430 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1432 Cvals(c_status[Cij]) += Rik*Akl*Plj;
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...