42 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
44 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
45 #include "Tpetra_RowMatrixTransposer.hpp"
49 namespace MatrixMatrix{
51 namespace ExtraKernels{
55 template<
class CrsMatrixType>
56 size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
58 size_t Aest = 100, Best=100;
59 if (A.getLocalNumEntries() > 0)
60 Aest = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
61 if (B.getLocalNumEntries() > 0)
62 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
64 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
65 nnzperrow *= nnzperrow;
72 template<
class CrsMatrixType>
73 size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
74 size_t nnzPerRowA = 100, Pcols = 100;
75 if (A.getLocalNumEntries() > 0)
76 nnzPerRowA = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 9;
77 if (P.getLocalNumEntries() > 0)
78 Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
79 return (
size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
82 #if defined (HAVE_TPETRA_INST_OPENMP)
84 template<
class Scalar,
87 class LocalOrdinalViewType>
88 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
89 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
90 const LocalOrdinalViewType & targetMapToOrigRow,
91 const LocalOrdinalViewType & targetMapToImportRow,
92 const LocalOrdinalViewType & Bcol2Ccol,
93 const LocalOrdinalViewType & Icol2Ccol,
94 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
95 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
96 const std::string& label,
97 const Teuchos::RCP<Teuchos::ParameterList>& params) {
98 #ifdef HAVE_TPETRA_MMM_TIMINGS
99 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
100 using Teuchos::TimeMonitor;
101 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix LTGCore"))));
103 using Teuchos::Array;
104 using Teuchos::ArrayRCP;
105 using Teuchos::ArrayView;
112 typedef typename KCRS::device_type device_t;
113 typedef typename KCRS::StaticCrsGraphType graph_t;
114 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
115 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
116 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
117 typedef typename KCRS::values_type::non_const_type scalar_view_t;
121 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
122 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
125 typedef LocalOrdinal LO;
126 typedef GlobalOrdinal GO;
127 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
128 typedef Map<LO,GO,NO> map_type;
133 typedef NO::execution_space execution_space;
134 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
137 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
138 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
139 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
142 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
143 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
145 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
146 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
147 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
148 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
150 c_lno_view_t Irowptr;
151 lno_nnz_view_t Icolind;
153 if(!Bview.importMatrix.is_null()) {
154 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
155 Irowptr = lclB.graph.row_map;
156 Icolind = lclB.graph.entries;
158 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
162 RCP<const map_type> Ccolmap = C.getColMap();
163 size_t m = Aview.origMatrix->getLocalNumRows();
164 size_t n = Ccolmap->getLocalNumElements();
165 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
168 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
169 if(!params.is_null()) {
170 if(params->isParameter(
"openmp: ltg thread max"))
171 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
175 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
177 lno_nnz_view_t entriesC;
178 scalar_view_t valuesC;
182 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
185 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind",thread_max);
186 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values",thread_max);
188 double thread_chunk = (double)(m) / thread_max;
191 Kokkos::parallel_for(
"MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
194 size_t my_thread_start = tid * thread_chunk;
195 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
196 size_t my_thread_m = my_thread_stop - my_thread_start;
199 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
202 std::vector<size_t> c_status(n,INVALID);
204 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
205 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
208 size_t CSR_ip = 0, OLD_ip = 0;
209 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
213 row_mapC(i) = CSR_ip;
216 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
218 const SC Aval = Avals(k);
222 if (targetMapToOrigRow(Aik) != LO_INVALID) {
229 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
232 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
234 LO Cij = Bcol2Ccol(Bkj);
236 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
238 c_status[Cij] = CSR_ip;
239 Ccolind(CSR_ip) = Cij;
240 Cvals(CSR_ip) = Aval*Bvals(j);
244 Cvals(c_status[Cij]) += Aval*Bvals(j);
255 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
256 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
258 LO Cij = Icol2Ccol(Ikj);
260 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
262 c_status[Cij] = CSR_ip;
263 Ccolind(CSR_ip) = Cij;
264 Cvals(CSR_ip) = Aval*Ivals(j);
268 Cvals(c_status[Cij]) += Aval*Ivals(j);
275 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
277 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
278 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
282 thread_total_nnz(tid) = CSR_ip;
283 tl_colind(tid) = Ccolind;
284 tl_values(tid) = Cvals;
288 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
290 #ifdef HAVE_TPETRA_MMM_TIMINGS
291 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix OpenMPSort"))));
294 if (params.is_null() || params->get(
"sort entries",
true))
295 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
296 C.setAllValues(row_mapC,entriesC,valuesC);
301 template<
class Scalar,
304 class LocalOrdinalViewType>
305 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
306 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
307 const LocalOrdinalViewType & targetMapToOrigRow,
308 const LocalOrdinalViewType & targetMapToImportRow,
309 const LocalOrdinalViewType & Bcol2Ccol,
310 const LocalOrdinalViewType & Icol2Ccol,
311 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
312 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
313 const std::string& label,
314 const Teuchos::RCP<Teuchos::ParameterList>& params) {
315 #ifdef HAVE_TPETRA_MMM_TIMINGS
316 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
317 using Teuchos::TimeMonitor;
318 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse LTGCore"))));
321 using Teuchos::Array;
322 using Teuchos::ArrayRCP;
323 using Teuchos::ArrayView;
330 typedef typename KCRS::StaticCrsGraphType graph_t;
331 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
332 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
333 typedef typename KCRS::values_type::non_const_type scalar_view_t;
336 typedef LocalOrdinal LO;
337 typedef GlobalOrdinal GO;
338 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
339 typedef Map<LO,GO,NO> map_type;
344 typedef NO::execution_space execution_space;
345 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
348 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
349 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
350 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
353 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
354 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
355 const KCRS & Cmat = C.getLocalMatrixDevice();
357 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
358 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
359 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
360 scalar_view_t Cvals = Cmat.values;
362 c_lno_view_t Irowptr;
363 c_lno_nnz_view_t Icolind;
365 if(!Bview.importMatrix.is_null()) {
366 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
367 Irowptr = lclB.graph.row_map;
368 Icolind = lclB.graph.entries;
373 RCP<const map_type> Ccolmap = C.getColMap();
374 size_t m = Aview.origMatrix->getLocalNumRows();
375 size_t n = Ccolmap->getLocalNumElements();
378 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
379 if(!params.is_null()) {
380 if(params->isParameter(
"openmp: ltg thread max"))
381 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
384 double thread_chunk = (double)(m) / thread_max;
387 Kokkos::parallel_for(
"MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
390 size_t my_thread_start = tid * thread_chunk;
391 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
394 std::vector<size_t> c_status(n,INVALID);
397 size_t CSR_ip = 0, OLD_ip = 0;
398 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
402 CSR_ip = Crowptr(i+1);
403 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
404 c_status[Ccolind(k)] = k;
409 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
411 const SC Aval = Avals(k);
415 if (targetMapToOrigRow(Aik) != LO_INVALID) {
417 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
419 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
421 LO Cij = Bcol2Ccol(Bkj);
423 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
424 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
425 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
427 Cvals(c_status[Cij]) += Aval * Bvals(j);
431 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
432 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
434 LO Cij = Icol2Ccol(Ikj);
436 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
437 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
438 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
440 Cvals(c_status[Cij]) += Aval * Ivals(j);
451 template<
class Scalar,
454 class LocalOrdinalViewType>
455 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
456 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> & Dinv,
457 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
458 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
459 const LocalOrdinalViewType & targetMapToOrigRow,
460 const LocalOrdinalViewType & targetMapToImportRow,
461 const LocalOrdinalViewType & Bcol2Ccol,
462 const LocalOrdinalViewType & Icol2Ccol,
463 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
464 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
465 const std::string& label,
466 const Teuchos::RCP<Teuchos::ParameterList>& params) {
467 #ifdef HAVE_TPETRA_MMM_TIMINGS
468 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
469 using Teuchos::TimeMonitor;
470 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix LTGCore"))));
473 using Teuchos::Array;
474 using Teuchos::ArrayRCP;
475 using Teuchos::ArrayView;
480 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
482 typedef typename KCRS::device_type device_t;
483 typedef typename KCRS::StaticCrsGraphType graph_t;
484 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
485 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
486 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
487 typedef typename KCRS::values_type::non_const_type scalar_view_t;
491 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
492 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
495 typedef typename scalar_view_t::memory_space scalar_memory_space;
498 typedef LocalOrdinal LO;
499 typedef GlobalOrdinal GO;
501 typedef Map<LO,GO,NO> map_type;
506 typedef NO::execution_space execution_space;
507 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
510 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
511 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
512 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
515 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
516 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
518 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
519 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
520 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
521 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
523 c_lno_view_t Irowptr;
524 lno_nnz_view_t Icolind;
526 if(!Bview.importMatrix.is_null()) {
527 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
528 Irowptr = lclB.graph.row_map;
529 Icolind = lclB.graph.entries;
531 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
536 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
539 RCP<const map_type> Ccolmap = C.getColMap();
540 size_t m = Aview.origMatrix->getLocalNumRows();
541 size_t n = Ccolmap->getLocalNumElements();
542 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
545 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
546 if(!params.is_null()) {
547 if(params->isParameter(
"openmp: ltg thread max"))
548 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
552 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
554 lno_nnz_view_t entriesC;
555 scalar_view_t valuesC;
559 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
562 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind",thread_max);
563 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values",thread_max);
565 double thread_chunk = (double)(m) / thread_max;
568 Kokkos::parallel_for(
"Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
571 size_t my_thread_start = tid * thread_chunk;
572 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
573 size_t my_thread_m = my_thread_stop - my_thread_start;
576 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
579 std::vector<size_t> c_status(n,INVALID);
581 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
582 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
585 size_t CSR_ip = 0, OLD_ip = 0;
586 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
591 row_mapC(i) = CSR_ip;
593 SC minusOmegaDval = -omega*Dvals(i,0);
596 for (
size_t j = Browptr(i); j < Browptr(i+1); j++) {
597 const SC Bval = Bvals(j);
601 LO Cij = Bcol2Ccol(Bij);
604 c_status[Cij] = CSR_ip;
605 Ccolind(CSR_ip) = Cij;
606 Cvals(CSR_ip) = Bvals[j];
612 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
614 const SC Aval = Avals(k);
618 if (targetMapToOrigRow(Aik) != LO_INVALID) {
625 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
628 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
630 LO Cij = Bcol2Ccol(Bkj);
632 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
634 c_status[Cij] = CSR_ip;
635 Ccolind(CSR_ip) = Cij;
636 Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
640 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
651 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
652 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
654 LO Cij = Icol2Ccol(Ikj);
656 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
658 c_status[Cij] = CSR_ip;
659 Ccolind(CSR_ip) = Cij;
660 Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
664 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
671 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
673 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
674 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
678 thread_total_nnz(tid) = CSR_ip;
679 tl_colind(tid) = Ccolind;
680 tl_values(tid) = Cvals;
685 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
688 #ifdef HAVE_TPETRA_MMM_TIMINGS
689 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix OpenMPSort"))));
692 if (params.is_null() || params->get(
"sort entries",
true))
693 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
694 C.setAllValues(row_mapC,entriesC,valuesC);
701 template<
class Scalar,
704 class LocalOrdinalViewType>
705 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
706 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> & Dinv,
707 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
708 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
709 const LocalOrdinalViewType & targetMapToOrigRow,
710 const LocalOrdinalViewType & targetMapToImportRow,
711 const LocalOrdinalViewType & Bcol2Ccol,
712 const LocalOrdinalViewType & Icol2Ccol,
713 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
714 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
715 const std::string& label,
716 const Teuchos::RCP<Teuchos::ParameterList>& params) {
717 #ifdef HAVE_TPETRA_MMM_TIMINGS
718 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
719 using Teuchos::TimeMonitor;
720 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse LTGCore"))));
722 using Teuchos::Array;
723 using Teuchos::ArrayRCP;
724 using Teuchos::ArrayView;
729 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
732 typedef typename KCRS::StaticCrsGraphType graph_t;
733 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
734 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
735 typedef typename KCRS::values_type::non_const_type scalar_view_t;
738 typedef typename scalar_view_t::memory_space scalar_memory_space;
741 typedef LocalOrdinal LO;
742 typedef GlobalOrdinal GO;
744 typedef Map<LO,GO,NO> map_type;
749 typedef NO::execution_space execution_space;
750 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
753 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
754 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
755 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
758 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
759 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
760 const KCRS & Cmat = C.getLocalMatrixDevice();
762 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
763 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
764 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
765 scalar_view_t Cvals = Cmat.values;
767 c_lno_view_t Irowptr;
768 c_lno_nnz_view_t Icolind;
770 if(!Bview.importMatrix.is_null()) {
771 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
772 Irowptr = lclB.graph.row_map;
773 Icolind = lclB.graph.entries;
779 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
782 RCP<const map_type> Ccolmap = C.getColMap();
783 size_t m = Aview.origMatrix->getLocalNumRows();
784 size_t n = Ccolmap->getLocalNumElements();
787 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
788 if(!params.is_null()) {
789 if(params->isParameter(
"openmp: ltg thread max"))
790 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
793 double thread_chunk = (double)(m) / thread_max;
796 Kokkos::parallel_for(
"Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
799 size_t my_thread_start = tid * thread_chunk;
800 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
803 std::vector<size_t> c_status(n,INVALID);
806 size_t CSR_ip = 0, OLD_ip = 0;
807 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
811 CSR_ip = Crowptr(i+1);
813 SC minusOmegaDval = -omega*Dvals(i,0);
815 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
816 c_status[Ccolind(k)] = k;
822 for (
size_t j = Browptr(i); j < Browptr(i+1); j++) {
823 const SC Bval = Bvals(j);
827 LO Cij = Bcol2Ccol(Bij);
830 Cvals(c_status[Cij]) += Bvals(j);
835 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
837 const SC Aval = Avals(k);
841 if (targetMapToOrigRow(Aik) != LO_INVALID) {
843 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
845 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
847 LO Cij = Bcol2Ccol(Bkj);
849 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
850 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
851 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
853 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
857 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
858 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
860 LO Cij = Icol2Ccol(Ikj);
862 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
863 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
864 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
866 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
878 template<
class InColindArrayType,
class InValsArrayType,
879 class OutRowptrType,
class OutColindType,
class OutValsType>
880 void copy_out_from_thread_memory(
const OutColindType& thread_total_nnz,
881 const InColindArrayType& Incolind,
882 const InValsArrayType& Invalues,
884 const double thread_chunk,
885 OutRowptrType& row_mapC,
886 OutColindType& entriesC,
887 OutValsType& valuesC ) {
888 typedef OutRowptrType lno_view_t;
889 typedef OutColindType lno_nnz_view_t;
890 typedef OutValsType scalar_view_t;
891 typedef typename lno_view_t::execution_space execution_space;
892 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
895 size_t thread_max = Incolind.size();
901 lno_view_t thread_start_nnz(
"thread_nnz",thread_max+1);
902 Kokkos::parallel_scan(
"LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (
const size_t i,
size_t& update,
const bool final) {
903 size_t mynnz = thread_total_nnz(i);
904 if(
final) thread_start_nnz(i) = update;
906 if(
final && i+1==thread_max) thread_start_nnz(i+1)=update;
908 c_nnz_size = thread_start_nnz(thread_max);
911 row_mapC(m) = thread_start_nnz(thread_max);
914 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size); entriesC = entriesC_;
915 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size); valuesC = valuesC_;
918 Kokkos::parallel_for(
"LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid) {
919 const size_t my_thread_start = tid * thread_chunk;
920 const size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
921 const size_t nnz_thread_start = thread_start_nnz(tid);
923 const size_t nnz_thread_stop = thread_start_nnz(tid+1);
934 if (my_thread_start != 0 ) {
935 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
936 row_mapC(i) += nnz_thread_start;
953 const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
954 for (
size_t i = 0; i < my_num_nnz; ++i) {
955 entriesC(nnz_thread_start + i) = Incolind(tid)(i);
956 valuesC(nnz_thread_start + i) = Invalues(tid)(i);
961 if(Incolind(tid).data()) free(Incolind(tid).data());
962 if(Invalues(tid).data()) free(Invalues(tid).data());
970 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
971 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
972 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
973 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
974 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
975 const LocalOrdinalViewType & Acol2Brow,
976 const LocalOrdinalViewType & Acol2Irow,
977 const LocalOrdinalViewType & Bcol2Ccol,
978 const LocalOrdinalViewType & Icol2Ccol,
979 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
980 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
981 const std::string& label,
982 const Teuchos::RCP<Teuchos::ParameterList>& params) {
983 #ifdef HAVE_TPETRA_MMM_TIMINGS
984 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
985 using Teuchos::TimeMonitor;
986 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK"))));
987 Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Multiply"))));
990 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
995 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(
new Matrix_t(C.getRowMap(),0));
996 Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(
" MSAK"),params);
998 #ifdef HAVE_TPETRA_MMM_TIMINGS
999 MM2=Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Scale"))));
1003 AB->leftScale(Dinv);
1005 #ifdef HAVE_TPETRA_MMM_TIMINGS
1006 MM2=Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Add"))));
1010 Teuchos::ParameterList jparams;
1011 if(!params.is_null()) {
1013 jparams.set(
"label",label+std::string(
" MSAK Add"));
1015 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
1016 Tpetra::MatrixMatrix::add(one,
false,*Bview.origMatrix,Scalar(-omega),
false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,
false));
1017 #ifdef HAVE_TPETRA_MMM_TIMINGS
1024 #if defined (HAVE_TPETRA_INST_OPENMP)
1026 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
1027 static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1028 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1029 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1030 const LocalOrdinalViewType & Acol2PRow,
1031 const LocalOrdinalViewType & Acol2PRowImport,
1032 const LocalOrdinalViewType & Pcol2Accol,
1033 const LocalOrdinalViewType & PIcol2Accol,
1034 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1035 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1036 const std::string& label,
1037 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1040 using Tpetra::MatrixMatrix::UnmanagedView;
1041 #ifdef HAVE_TPETRA_MMM_TIMINGS
1042 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1044 using Teuchos::TimeMonitor;
1045 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix LTGCore"))));
1048 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1050 typedef LocalOrdinal LO;
1051 typedef GlobalOrdinal GO;
1053 typedef Map<LO,GO,NO> map_type;
1055 typedef typename KCRS::StaticCrsGraphType graph_t;
1056 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1057 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1058 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1059 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1060 typedef typename KCRS::device_type device_t;
1061 typedef typename device_t::execution_space execution_space;
1062 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1065 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1066 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1067 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1069 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1070 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1073 RCP<const map_type> Accolmap = Ac.getColMap();
1074 size_t m = Rview.origMatrix->getLocalNumRows();
1075 size_t n = Accolmap->getLocalNumElements();
1078 const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1079 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1080 const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1082 c_lno_view_t Rrowptr = Rmat.graph.row_map,
1083 Arowptr = Amat.graph.row_map,
1084 Prowptr = Pmat.graph.row_map, Irowptr;
1085 const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1086 Acolind = Amat.graph.entries,
1087 Pcolind = Pmat.graph.entries;
1088 lno_nnz_view_t Icolind;
1089 const scalar_view_t Rvals = Rmat.values,
1090 Avals = Amat.values,
1091 Pvals = Pmat.values;
1092 scalar_view_t Ivals;
1094 if (!Pview.importMatrix.is_null())
1096 const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1097 Irowptr = Imat.graph.row_map;
1098 Icolind = Imat.graph.entries;
1099 Ivals = Imat.values;
1110 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1111 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1114 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1115 if(!params.is_null()) {
1116 if(params->isParameter(
"openmp: ltg thread max"))
1117 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
1120 double thread_chunk = (double)(m) / thread_max;
1123 lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
1125 lno_nnz_view_t entriesAc;
1126 scalar_view_t valuesAc;
1130 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
1140 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
1141 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
1144 Kokkos::parallel_for(
"MMM::RAP::NewMatrix::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
1147 size_t my_thread_start = tid * thread_chunk;
1148 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1149 size_t my_thread_m = my_thread_stop - my_thread_start;
1151 size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
1153 std::vector<size_t> ac_status(n, INVALID);
1156 u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
1157 u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1158 u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1161 size_t nnz = 0, nnz_old = 0;
1163 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1167 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1169 const SC Rik = Rvals(kk);
1173 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1175 const SC Akl = Avals(ll);
1179 if (Acol2PRow[l] != LO_INVALID) {
1186 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1189 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1191 LO Acj = Pcol2Accol(j);
1194 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1195 #ifdef HAVE_TPETRA_DEBUG
1197 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1199 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1202 ac_status[Acj] = nnz;
1203 Accolind(nnz) = Acj;
1204 Acvals(nnz) = Rik*Akl*Plj;
1207 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1217 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1218 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1220 LO Acj = PIcol2Accol(j);
1223 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1224 #ifdef HAVE_TPETRA_DEBUG
1226 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1228 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1231 ac_status[Acj] = nnz;
1232 Accolind(nnz) = Acj;
1233 Acvals(nnz) = Rik*Akl*Plj;
1236 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1243 if (nnz + n > nnzAllocated) {
1245 Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1246 Acvals = u_scalar_view_t((
typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1250 thread_total_nnz(tid) = nnz;
1251 tl_colind(tid) = Accolind;
1252 tl_values(tid) = Acvals;
1254 #ifdef HAVE_TPETRA_MMM_TIMINGS
1255 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix copy from thread local"))));
1258 copy_out_from_thread_memory(thread_total_nnz,tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1260 #ifdef HAVE_TPETRA_MMM_TIMINGS
1261 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
1265 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1267 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1269 #ifdef HAVE_TPETRA_MMM_TIMINGS
1270 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
1281 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1282 labelList->set(
"Timer Label",label);
1283 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1284 RCP<const Export<LO,GO,NO> > dummyExport;
1285 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1286 Rview.origMatrix->getRangeMap(),
1295 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
1296 static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1297 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1298 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1299 const LocalOrdinalViewType & Acol2PRow,
1300 const LocalOrdinalViewType & Acol2PRowImport,
1301 const LocalOrdinalViewType & Pcol2Accol,
1302 const LocalOrdinalViewType & PIcol2Accol,
1303 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1304 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1305 const std::string& label,
1306 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1309 using Tpetra::MatrixMatrix::UnmanagedView;
1310 #ifdef HAVE_TPETRA_MMM_TIMINGS
1311 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1312 using Teuchos::TimeMonitor;
1314 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse LTGCore"))));
1317 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1319 typedef LocalOrdinal LO;
1320 typedef GlobalOrdinal GO;
1322 typedef Map<LO,GO,NO> map_type;
1324 typedef typename KCRS::StaticCrsGraphType graph_t;
1325 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1326 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1327 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1328 typedef typename KCRS::device_type device_t;
1329 typedef typename device_t::execution_space execution_space;
1330 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1332 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1333 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1334 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1337 RCP<const map_type> Accolmap = Ac.getColMap();
1338 size_t m = Rview.origMatrix->getLocalNumRows();
1339 size_t n = Accolmap->getLocalNumElements();
1342 const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1343 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1344 const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1345 const KCRS & Cmat = Ac.getLocalMatrixDevice();
1347 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1348 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1349 lno_nnz_view_t Icolind;
1350 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1351 scalar_view_t Cvals = Cmat.values;
1352 scalar_view_t Ivals;
1354 if (!Pview.importMatrix.is_null())
1356 const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1357 Irowptr = Imat.graph.row_map;
1358 Icolind = Imat.graph.entries;
1359 Ivals = Imat.values;
1363 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1364 if(!params.is_null()) {
1365 if(params->isParameter(
"openmp: ltg thread max"))
1366 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
1369 double thread_chunk = (double)(m) / thread_max;
1372 Kokkos::parallel_for(
"MMM::RAP::Reuse::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
1375 size_t my_thread_start = tid * thread_chunk;
1376 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1378 std::vector<size_t> c_status(n, INVALID);
1381 size_t OLD_ip = 0, CSR_ip = 0;
1383 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1386 OLD_ip = Crowptr(i);
1387 CSR_ip = Crowptr(i+1);
1388 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
1389 c_status[Ccolind(k)] = k;
1395 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1397 const SC Rik = Rvals(kk);
1401 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1403 const SC Akl = Avals(ll);
1407 if (Acol2PRow[l] != LO_INVALID) {
1414 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1417 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1419 LO Cij = Pcol2Accol(j);
1421 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1422 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1423 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1426 Cvals(c_status[Cij]) += Rik*Akl*Plj;
1435 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1436 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1438 LO Cij = PIcol2Accol(j);
1440 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1441 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1442 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1444 Cvals(c_status[Cij]) += Rik*Akl*Plj;
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...