42 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
44 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
45 #include "Tpetra_RowMatrixTransposer.hpp"
49 namespace MatrixMatrix{
51 namespace ExtraKernels{
55 template<
class CrsMatrixType>
56 size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
58 size_t Aest = 100, Best=100;
59 if (A.getNodeNumEntries() > 0)
60 Aest = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
61 if (B.getNodeNumEntries() > 0)
62 Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
64 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
65 nnzperrow *= nnzperrow;
72 template<
class CrsMatrixType>
73 size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
74 size_t nnzPerRowA = 100, Pcols = 100;
75 if (A.getNodeNumEntries() > 0)
76 nnzPerRowA = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 9;
77 if (P.getNodeNumEntries() > 0)
78 Pcols = (P.getNodeNumCols() > 0) ? P.getNodeNumCols() : 100;
79 return (
size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
82 #if defined (HAVE_TPETRA_INST_OPENMP)
84 template<
class Scalar,
87 class LocalOrdinalViewType>
88 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
89 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
90 const LocalOrdinalViewType & targetMapToOrigRow,
91 const LocalOrdinalViewType & targetMapToImportRow,
92 const LocalOrdinalViewType & Bcol2Ccol,
93 const LocalOrdinalViewType & Icol2Ccol,
94 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
95 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
96 const std::string& label,
97 const Teuchos::RCP<Teuchos::ParameterList>& params) {
98 #ifdef HAVE_TPETRA_MMM_TIMINGS
99 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
100 using Teuchos::TimeMonitor;
102 Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix LTGCore")));
105 using Teuchos::Array;
106 using Teuchos::ArrayRCP;
107 using Teuchos::ArrayView;
115 typedef typename KCRS::StaticCrsGraphType graph_t;
116 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
117 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
118 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
119 typedef typename KCRS::values_type::non_const_type scalar_view_t;
123 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
124 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
127 typedef LocalOrdinal LO;
128 typedef GlobalOrdinal GO;
129 typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
130 typedef Map<LO,GO,NO> map_type;
135 typedef NO::execution_space execution_space;
136 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
139 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
140 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
141 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
144 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
145 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
147 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
148 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
149 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
150 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
152 c_lno_view_t Irowptr;
153 lno_nnz_view_t Icolind;
155 if(!Bview.importMatrix.is_null()) {
156 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
157 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
158 Ivals = Bview.importMatrix->getLocalMatrix().values;
159 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
163 RCP<const map_type> Ccolmap = C.getColMap();
164 size_t m = Aview.origMatrix->getNodeNumRows();
165 size_t n = Ccolmap->getNodeNumElements();
166 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
169 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
170 if(!params.is_null()) {
171 if(params->isParameter(
"openmp: ltg thread max"))
172 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
176 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
178 lno_nnz_view_t entriesC;
179 scalar_view_t valuesC;
183 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
186 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind",thread_max);
187 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values",thread_max);
189 double thread_chunk = (double)(m) / thread_max;
192 Kokkos::parallel_for(
"MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
195 size_t my_thread_start = tid * thread_chunk;
196 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
197 size_t my_thread_m = my_thread_stop - my_thread_start;
200 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
203 std::vector<size_t> c_status(n,INVALID);
205 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
206 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
209 size_t CSR_ip = 0, OLD_ip = 0;
210 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
214 row_mapC(i) = CSR_ip;
217 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
219 const SC Aval = Avals(k);
223 if (targetMapToOrigRow(Aik) != LO_INVALID) {
230 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
233 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
235 LO Cij = Bcol2Ccol(Bkj);
237 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
239 c_status[Cij] = CSR_ip;
240 Ccolind(CSR_ip) = Cij;
241 Cvals(CSR_ip) = Aval*Bvals(j);
245 Cvals(c_status[Cij]) += Aval*Bvals(j);
256 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
257 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
259 LO Cij = Icol2Ccol(Ikj);
261 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
263 c_status[Cij] = CSR_ip;
264 Ccolind(CSR_ip) = Cij;
265 Cvals(CSR_ip) = Aval*Ivals(j);
269 Cvals(c_status[Cij]) += Aval*Ivals(j);
276 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
278 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
279 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
283 thread_total_nnz(tid) = CSR_ip;
284 tl_colind(tid) = Ccolind;
285 tl_values(tid) = Cvals;
289 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
291 #ifdef HAVE_TPETRA_MMM_TIMINGS
293 Teuchos::TimeMonitor MMsort (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix OpenMPSort")));
296 if (params.is_null() || params->get(
"sort entries",
true))
297 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
298 C.setAllValues(row_mapC,entriesC,valuesC);
303 template<
class Scalar,
306 class LocalOrdinalViewType>
307 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
308 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
309 const LocalOrdinalViewType & targetMapToOrigRow,
310 const LocalOrdinalViewType & targetMapToImportRow,
311 const LocalOrdinalViewType & Bcol2Ccol,
312 const LocalOrdinalViewType & Icol2Ccol,
313 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
314 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
315 const std::string& label,
316 const Teuchos::RCP<Teuchos::ParameterList>& params) {
317 #ifdef HAVE_TPETRA_MMM_TIMINGS
318 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
319 Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse LTGCore")));
322 using Teuchos::Array;
323 using Teuchos::ArrayRCP;
324 using Teuchos::ArrayView;
331 typedef typename KCRS::StaticCrsGraphType graph_t;
332 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
333 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
334 typedef typename KCRS::values_type::non_const_type scalar_view_t;
337 typedef LocalOrdinal LO;
338 typedef GlobalOrdinal GO;
339 typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
340 typedef Map<LO,GO,NO> map_type;
345 typedef NO::execution_space execution_space;
346 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
349 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
350 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
351 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
354 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
355 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
356 const KCRS & Cmat = C.getLocalMatrix();
358 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
359 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
360 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
361 scalar_view_t Cvals = Cmat.values;
363 c_lno_view_t Irowptr;
364 c_lno_nnz_view_t Icolind;
366 if(!Bview.importMatrix.is_null()) {
367 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
368 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
369 Ivals = Bview.importMatrix->getLocalMatrix().values;
373 RCP<const map_type> Ccolmap = C.getColMap();
374 size_t m = Aview.origMatrix->getNodeNumRows();
375 size_t n = Ccolmap->getNodeNumElements();
378 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
379 if(!params.is_null()) {
380 if(params->isParameter(
"openmp: ltg thread max"))
381 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
384 double thread_chunk = (double)(m) / thread_max;
387 Kokkos::parallel_for(
"MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
390 size_t my_thread_start = tid * thread_chunk;
391 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
394 std::vector<size_t> c_status(n,INVALID);
397 size_t CSR_ip = 0, OLD_ip = 0;
398 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
402 CSR_ip = Crowptr(i+1);
403 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
404 c_status[Ccolind(k)] = k;
409 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
411 const SC Aval = Avals(k);
415 if (targetMapToOrigRow(Aik) != LO_INVALID) {
417 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
419 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
421 LO Cij = Bcol2Ccol(Bkj);
423 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
424 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
425 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
427 Cvals(c_status[Cij]) += Aval * Bvals(j);
431 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
432 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
434 LO Cij = Icol2Ccol(Ikj);
436 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
437 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
438 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
440 Cvals(c_status[Cij]) += Aval * Ivals(j);
451 template<
class Scalar,
454 class LocalOrdinalViewType>
455 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
456 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
457 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
458 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
459 const LocalOrdinalViewType & targetMapToOrigRow,
460 const LocalOrdinalViewType & targetMapToImportRow,
461 const LocalOrdinalViewType & Bcol2Ccol,
462 const LocalOrdinalViewType & Icol2Ccol,
463 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
464 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
465 const std::string& label,
466 const Teuchos::RCP<Teuchos::ParameterList>& params) {
467 #ifdef HAVE_TPETRA_MMM_TIMINGS
468 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
469 Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix LTGCore")));
472 using Teuchos::Array;
473 using Teuchos::ArrayRCP;
474 using Teuchos::ArrayView;
479 typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
482 typedef typename KCRS::StaticCrsGraphType graph_t;
483 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
484 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
485 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
486 typedef typename KCRS::values_type::non_const_type scalar_view_t;
490 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
491 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
494 typedef typename scalar_view_t::memory_space scalar_memory_space;
497 typedef LocalOrdinal LO;
498 typedef GlobalOrdinal GO;
500 typedef Map<LO,GO,NO> map_type;
505 typedef NO::execution_space execution_space;
506 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
509 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
510 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
511 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
514 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
515 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
517 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
518 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
519 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
520 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
522 c_lno_view_t Irowptr;
523 lno_nnz_view_t Icolind;
525 if(!Bview.importMatrix.is_null()) {
526 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
527 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
528 Ivals = Bview.importMatrix->getLocalMatrix().values;
529 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
533 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
536 RCP<const map_type> Ccolmap = C.getColMap();
537 size_t m = Aview.origMatrix->getNodeNumRows();
538 size_t n = Ccolmap->getNodeNumElements();
539 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
542 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
543 if(!params.is_null()) {
544 if(params->isParameter(
"openmp: ltg thread max"))
545 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
549 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
551 lno_nnz_view_t entriesC;
552 scalar_view_t valuesC;
556 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
559 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind",thread_max);
560 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values",thread_max);
562 double thread_chunk = (double)(m) / thread_max;
565 Kokkos::parallel_for(
"Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
568 size_t my_thread_start = tid * thread_chunk;
569 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
570 size_t my_thread_m = my_thread_stop - my_thread_start;
573 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
576 std::vector<size_t> c_status(n,INVALID);
578 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
579 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
582 size_t CSR_ip = 0, OLD_ip = 0;
583 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
588 row_mapC(i) = CSR_ip;
590 SC minusOmegaDval = -omega*Dvals(i,0);
593 for (
size_t j = Browptr(i); j < Browptr(i+1); j++) {
594 const SC Bval = Bvals(j);
598 LO Cij = Bcol2Ccol(Bij);
601 c_status[Cij] = CSR_ip;
602 Ccolind(CSR_ip) = Cij;
603 Cvals(CSR_ip) = Bvals[j];
609 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
611 const SC Aval = Avals(k);
615 if (targetMapToOrigRow(Aik) != LO_INVALID) {
622 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
625 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
627 LO Cij = Bcol2Ccol(Bkj);
629 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
631 c_status[Cij] = CSR_ip;
632 Ccolind(CSR_ip) = Cij;
633 Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
637 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
648 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
649 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
651 LO Cij = Icol2Ccol(Ikj);
653 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
655 c_status[Cij] = CSR_ip;
656 Ccolind(CSR_ip) = Cij;
657 Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
661 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
668 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
670 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
671 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
675 thread_total_nnz(tid) = CSR_ip;
676 tl_colind(tid) = Ccolind;
677 tl_values(tid) = Cvals;
682 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
685 #ifdef HAVE_TPETRA_MMM_TIMINGS
687 Teuchos::TimeMonitor MMsort (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix OpenMPSort")));
690 if (params.is_null() || params->get(
"sort entries",
true))
691 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
692 C.setAllValues(row_mapC,entriesC,valuesC);
699 template<
class Scalar,
702 class LocalOrdinalViewType>
703 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
704 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
705 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
706 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
707 const LocalOrdinalViewType & targetMapToOrigRow,
708 const LocalOrdinalViewType & targetMapToImportRow,
709 const LocalOrdinalViewType & Bcol2Ccol,
710 const LocalOrdinalViewType & Icol2Ccol,
711 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
712 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
713 const std::string& label,
714 const Teuchos::RCP<Teuchos::ParameterList>& params) {
715 #ifdef HAVE_TPETRA_MMM_TIMINGS
716 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
717 Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse LTGCore")));
719 using Teuchos::Array;
720 using Teuchos::ArrayRCP;
721 using Teuchos::ArrayView;
726 typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
729 typedef typename KCRS::StaticCrsGraphType graph_t;
730 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
731 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
732 typedef typename KCRS::values_type::non_const_type scalar_view_t;
735 typedef typename scalar_view_t::memory_space scalar_memory_space;
738 typedef LocalOrdinal LO;
739 typedef GlobalOrdinal GO;
741 typedef Map<LO,GO,NO> map_type;
746 typedef NO::execution_space execution_space;
747 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
750 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
751 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
752 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
755 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
756 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
757 const KCRS & Cmat = C.getLocalMatrix();
759 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
760 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
761 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
762 scalar_view_t Cvals = Cmat.values;
764 c_lno_view_t Irowptr;
765 c_lno_nnz_view_t Icolind;
767 if(!Bview.importMatrix.is_null()) {
768 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
769 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
770 Ivals = Bview.importMatrix->getLocalMatrix().values;
774 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
777 RCP<const map_type> Ccolmap = C.getColMap();
778 size_t m = Aview.origMatrix->getNodeNumRows();
779 size_t n = Ccolmap->getNodeNumElements();
782 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
783 if(!params.is_null()) {
784 if(params->isParameter(
"openmp: ltg thread max"))
785 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
788 double thread_chunk = (double)(m) / thread_max;
791 Kokkos::parallel_for(
"Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
794 size_t my_thread_start = tid * thread_chunk;
795 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
798 std::vector<size_t> c_status(n,INVALID);
801 size_t CSR_ip = 0, OLD_ip = 0;
802 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
806 CSR_ip = Crowptr(i+1);
808 SC minusOmegaDval = -omega*Dvals(i,0);
810 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
811 c_status[Ccolind(k)] = k;
817 for (
size_t j = Browptr(i); j < Browptr(i+1); j++) {
818 const SC Bval = Bvals(j);
822 LO Cij = Bcol2Ccol(Bij);
825 Cvals(c_status[Cij]) += Bvals(j);
830 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
832 const SC Aval = Avals(k);
836 if (targetMapToOrigRow(Aik) != LO_INVALID) {
838 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
840 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
842 LO Cij = Bcol2Ccol(Bkj);
844 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
845 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
846 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
848 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
852 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
853 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
855 LO Cij = Icol2Ccol(Ikj);
857 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
858 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
859 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
861 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
873 template<
class InColindArrayType,
class InValsArrayType,
874 class OutRowptrType,
class OutColindType,
class OutValsType>
875 void copy_out_from_thread_memory(
const OutColindType& thread_total_nnz,
876 const InColindArrayType& Incolind,
877 const InValsArrayType& Invalues,
879 const double thread_chunk,
880 OutRowptrType& row_mapC,
881 OutColindType& entriesC,
882 OutValsType& valuesC ) {
883 typedef OutRowptrType lno_view_t;
884 typedef OutColindType lno_nnz_view_t;
885 typedef OutValsType scalar_view_t;
886 typedef typename lno_view_t::execution_space execution_space;
887 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
890 size_t thread_max = Incolind.size();
896 lno_view_t thread_start_nnz(
"thread_nnz",thread_max+1);
897 Kokkos::parallel_scan(
"LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (
const size_t i,
size_t& update,
const bool final) {
898 size_t mynnz = thread_total_nnz(i);
899 if(
final) thread_start_nnz(i) = update;
901 if(
final && i+1==thread_max) thread_start_nnz(i+1)=update;
903 c_nnz_size = thread_start_nnz(thread_max);
906 row_mapC(m) = thread_start_nnz(thread_max);
909 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size); entriesC = entriesC_;
910 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size); valuesC = valuesC_;
913 Kokkos::parallel_for(
"LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid) {
914 const size_t my_thread_start = tid * thread_chunk;
915 const size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
916 const size_t nnz_thread_start = thread_start_nnz(tid);
918 const size_t nnz_thread_stop = thread_start_nnz(tid+1);
929 if (my_thread_start != 0 ) {
930 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
931 row_mapC(i) += nnz_thread_start;
948 const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
949 for (
size_t i = 0; i < my_num_nnz; ++i) {
950 entriesC(nnz_thread_start + i) = Incolind(tid)(i);
951 valuesC(nnz_thread_start + i) = Invalues(tid)(i);
956 if(Incolind(tid).data()) free(Incolind(tid).data());
957 if(Invalues(tid).data()) free(Invalues(tid).data());
965 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
966 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
967 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
968 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
969 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
970 const LocalOrdinalViewType & Acol2Brow,
971 const LocalOrdinalViewType & Acol2Irow,
972 const LocalOrdinalViewType & Bcol2Ccol,
973 const LocalOrdinalViewType & Icol2Ccol,
974 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
975 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
976 const std::string& label,
977 const Teuchos::RCP<Teuchos::ParameterList>& params) {
978 #ifdef HAVE_TPETRA_MMM_TIMINGS
979 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
980 using Teuchos::TimeMonitor;
981 Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK")));
982 Teuchos::TimeMonitor MMmult (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Multiply")));
984 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
989 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(
new Matrix_t(C.getRowMap(),0));
990 Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(
" MSAK"),params);
992 #ifdef HAVE_TPETRA_MMM_TIMINGS
994 Teuchos::TimeMonitor MMscale (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Scale")));
1000 #ifdef HAVE_TPETRA_MMM_TIMINGS
1002 Teuchos::TimeMonitor MMadd (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Add")));
1006 Teuchos::ParameterList jparams;
1007 if(!params.is_null()) {
1009 jparams.set(
"label",label+std::string(
" MSAK Add"));
1011 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
1012 Tpetra::MatrixMatrix::add(one,
false,*Bview.origMatrix,Scalar(-omega),
false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,
false));
1018 #if defined (HAVE_TPETRA_INST_OPENMP)
1020 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
1021 static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
1022 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1023 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
1024 const LocalOrdinalViewType & Acol2PRow,
1025 const LocalOrdinalViewType & Acol2PRowImport,
1026 const LocalOrdinalViewType & Pcol2Accol,
1027 const LocalOrdinalViewType & PIcol2Accol,
1028 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
1029 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
1030 const std::string& label,
1031 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1034 using Tpetra::MatrixMatrix::UnmanagedView;
1035 #ifdef HAVE_TPETRA_MMM_TIMINGS
1036 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1037 Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix LTGCore")));
1040 typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1042 typedef LocalOrdinal LO;
1043 typedef GlobalOrdinal GO;
1045 typedef Map<LO,GO,NO> map_type;
1047 typedef typename KCRS::StaticCrsGraphType graph_t;
1048 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1049 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1050 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1051 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1052 typedef typename KCRS::device_type device_t;
1053 typedef typename device_t::execution_space execution_space;
1054 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1057 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1058 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1059 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1061 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1062 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1065 RCP<const map_type> Accolmap = Ac.getColMap();
1066 size_t m = Rview.origMatrix->getNodeNumRows();
1067 size_t n = Accolmap->getNodeNumElements();
1070 const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
1071 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1072 const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
1074 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Irowptr;
1075 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries;
1076 lno_nnz_view_t Icolind;
1077 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1078 scalar_view_t Ivals;
1080 if (!Pview.importMatrix.is_null())
1082 const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
1083 Irowptr = Imat.graph.row_map;
1084 Icolind = Imat.graph.entries;
1085 Ivals = Imat.values;
1096 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1097 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1100 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
1101 if(!params.is_null()) {
1102 if(params->isParameter(
"openmp: ltg thread max"))
1103 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
1106 double thread_chunk = (double)(m) / thread_max;
1109 lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
1111 lno_nnz_view_t entriesAc;
1112 scalar_view_t valuesAc;
1116 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
1126 Kokkos::View<u_lno_nnz_view_t*> tl_colind(
"top_colind", thread_max);
1127 Kokkos::View<u_scalar_view_t*> tl_values(
"top_values", thread_max);
1130 Kokkos::parallel_for(
"MMM::RAP::NewMatrix::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
1133 size_t my_thread_start = tid * thread_chunk;
1134 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1135 size_t my_thread_m = my_thread_stop - my_thread_start;
1137 size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
1139 std::vector<size_t> ac_status(n, INVALID);
1142 u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
1143 u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1144 u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1147 size_t nnz = 0, nnz_old = 0;
1149 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1153 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1155 const SC Rik = Rvals(kk);
1159 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1161 const SC Akl = Avals(ll);
1165 if (Acol2PRow[l] != LO_INVALID) {
1172 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1175 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1177 LO Acj = Pcol2Accol(j);
1180 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1181 #ifdef HAVE_TPETRA_DEBUG
1183 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1185 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1188 ac_status[Acj] = nnz;
1189 Accolind(nnz) = Acj;
1190 Acvals(nnz) = Rik*Akl*Plj;
1193 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1203 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1204 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1206 LO Acj = PIcol2Accol(j);
1209 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1210 #ifdef HAVE_TPETRA_DEBUG
1212 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1214 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1217 ac_status[Acj] = nnz;
1218 Accolind(nnz) = Acj;
1219 Acvals(nnz) = Rik*Akl*Plj;
1222 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1229 if (nnz + n > nnzAllocated) {
1231 Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1232 Acvals = u_scalar_view_t((
typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1236 thread_total_nnz(tid) = nnz;
1237 tl_colind(tid) = Accolind;
1238 tl_values(tid) = Acvals;
1240 #ifdef HAVE_TPETRA_MMM_TIMINGS
1242 Teuchos::TimeMonitor MMcopy (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix copy from thread local")));
1245 copy_out_from_thread_memory(thread_total_nnz,tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1247 #ifdef HAVE_TPETRA_MMM_TIMINGS
1249 Teuchos::TimeMonitor MMsort (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort")));
1253 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1255 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1257 #ifdef HAVE_TPETRA_MMM_TIMINGS
1259 Teuchos::TimeMonitor MMfill (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC")));
1270 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1271 labelList->set(
"Timer Label",label);
1272 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1273 RCP<const Export<LO,GO,NO> > dummyExport;
1274 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1275 Rview.origMatrix->getRangeMap(),
1284 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
1285 static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
1286 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1287 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
1288 const LocalOrdinalViewType & Acol2PRow,
1289 const LocalOrdinalViewType & Acol2PRowImport,
1290 const LocalOrdinalViewType & Pcol2Accol,
1291 const LocalOrdinalViewType & PIcol2Accol,
1292 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
1293 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
1294 const std::string& label,
1295 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1298 using Tpetra::MatrixMatrix::UnmanagedView;
1299 #ifdef HAVE_TPETRA_MMM_TIMINGS
1300 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1301 Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse LTGCore")));
1304 typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1306 typedef LocalOrdinal LO;
1307 typedef GlobalOrdinal GO;
1309 typedef Map<LO,GO,NO> map_type;
1311 typedef typename KCRS::StaticCrsGraphType graph_t;
1312 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1313 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1314 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1315 typedef typename KCRS::device_type device_t;
1316 typedef typename device_t::execution_space execution_space;
1317 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1319 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1320 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1321 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1324 RCP<const map_type> Accolmap = Ac.getColMap();
1325 size_t m = Rview.origMatrix->getNodeNumRows();
1326 size_t n = Accolmap->getNodeNumElements();
1329 const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
1330 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1331 const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
1332 const KCRS & Cmat = Ac.getLocalMatrix();
1334 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1335 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1336 lno_nnz_view_t Icolind;
1337 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1338 scalar_view_t Cvals = Cmat.values;
1339 scalar_view_t Ivals;
1341 if (!Pview.importMatrix.is_null())
1343 const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
1344 Irowptr = Imat.graph.row_map;
1345 Icolind = Imat.graph.entries;
1346 Ivals = Imat.values;
1350 size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
1351 if(!params.is_null()) {
1352 if(params->isParameter(
"openmp: ltg thread max"))
1353 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
1356 double thread_chunk = (double)(m) / thread_max;
1359 Kokkos::parallel_for(
"MMM::RAP::Reuse::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
1362 size_t my_thread_start = tid * thread_chunk;
1363 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1365 std::vector<size_t> c_status(n, INVALID);
1368 size_t OLD_ip = 0, CSR_ip = 0;
1370 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1373 OLD_ip = Crowptr(i);
1374 CSR_ip = Crowptr(i+1);
1375 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
1376 c_status[Ccolind(k)] = k;
1382 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1384 const SC Rik = Rvals(kk);
1388 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1390 const SC Akl = Avals(ll);
1394 if (Acol2PRow[l] != LO_INVALID) {
1401 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1404 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1406 LO Cij = Pcol2Accol(j);
1408 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1409 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1410 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1413 Cvals(c_status[Cij]) += Rik*Akl*Plj;
1422 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1423 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1425 LO Cij = PIcol2Accol(j);
1427 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1428 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1429 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1431 Cvals(c_status[Cij]) += Rik*Akl*Plj;
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...