10 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
12 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
13 #include "Tpetra_RowMatrixTransposer.hpp"
17 namespace MatrixMatrix{
19 namespace ExtraKernels{
23 template<
class CrsMatrixType>
24 size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
26 size_t Aest = 100, Best=100;
27 if (A.getLocalNumEntries() > 0)
28 Aest = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
29 if (B.getLocalNumEntries() > 0)
30 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
32 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
33 nnzperrow *= nnzperrow;
40 template<
class CrsMatrixType>
41 size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
42 size_t nnzPerRowA = 100, Pcols = 100;
43 if (A.getLocalNumEntries() > 0)
44 nnzPerRowA = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 9;
45 if (P.getLocalNumEntries() > 0)
46 Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
47 return (
size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
50 #if defined (HAVE_TPETRA_INST_OPENMP)
52 template<
class Scalar,
55 class LocalOrdinalViewType>
56 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
57 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
58 const LocalOrdinalViewType & targetMapToOrigRow,
59 const LocalOrdinalViewType & targetMapToImportRow,
60 const LocalOrdinalViewType & Bcol2Ccol,
61 const LocalOrdinalViewType & Icol2Ccol,
62 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
63 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
64 const std::string& label,
65 const Teuchos::RCP<Teuchos::ParameterList>& params) {
66 #ifdef HAVE_TPETRA_MMM_TIMINGS
67 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
68 using Teuchos::TimeMonitor;
69 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix LTGCore"))));
72 using Teuchos::ArrayRCP;
73 using Teuchos::ArrayView;
80 typedef typename KCRS::device_type device_t;
81 typedef typename KCRS::StaticCrsGraphType graph_t;
82 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
83 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
84 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
85 typedef typename KCRS::values_type::non_const_type scalar_view_t;
89 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
90 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
93 typedef LocalOrdinal LO;
94 typedef GlobalOrdinal GO;
95 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
96 typedef Map<LO,GO,NO> map_type;
101 typedef NO::execution_space execution_space;
102 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
105 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
106 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
107 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
110 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
111 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
113 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
114 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
115 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
116 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
118 c_lno_view_t Irowptr;
119 lno_nnz_view_t Icolind;
121 if(!Bview.importMatrix.is_null()) {
122 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
123 Irowptr = lclB.graph.row_map;
124 Icolind = lclB.graph.entries;
126 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
130 RCP<const map_type> Ccolmap = C.getColMap();
131 size_t m = Aview.origMatrix->getLocalNumRows();
132 size_t n = Ccolmap->getLocalNumElements();
133 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
136 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
137 if(!params.is_null()) {
138 if(params->isParameter(
"openmp: ltg thread max"))
139 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
143 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
145 lno_nnz_view_t entriesC;
146 scalar_view_t valuesC;
150 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
153 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind",thread_max);
154 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values",thread_max);
156 double thread_chunk = (double)(m) / thread_max;
159 Kokkos::parallel_for(
"MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
162 size_t my_thread_start = tid * thread_chunk;
163 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
164 size_t my_thread_m = my_thread_stop - my_thread_start;
167 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
170 std::vector<size_t> c_status(n,INVALID);
172 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
173 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
176 size_t CSR_ip = 0, OLD_ip = 0;
177 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
181 row_mapC(i) = CSR_ip;
184 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
186 const SC Aval = Avals(k);
190 if (targetMapToOrigRow(Aik) != LO_INVALID) {
197 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
200 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
202 LO Cij = Bcol2Ccol(Bkj);
204 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
206 c_status[Cij] = CSR_ip;
207 Ccolind(CSR_ip) = Cij;
208 Cvals(CSR_ip) = Aval*Bvals(j);
212 Cvals(c_status[Cij]) += Aval*Bvals(j);
223 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
224 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
226 LO Cij = Icol2Ccol(Ikj);
228 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
230 c_status[Cij] = CSR_ip;
231 Ccolind(CSR_ip) = Cij;
232 Cvals(CSR_ip) = Aval*Ivals(j);
236 Cvals(c_status[Cij]) += Aval*Ivals(j);
243 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
245 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
246 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*) Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
250 thread_total_nnz(tid) = CSR_ip;
251 tl_colind(tid) = Ccolind;
252 tl_values(tid) = Cvals;
256 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
258 #ifdef HAVE_TPETRA_MMM_TIMINGS
259 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix OpenMPSort"))));
262 if (params.is_null() || params->get(
"sort entries",
true))
263 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
264 C.setAllValues(row_mapC,entriesC,valuesC);
269 template<
class Scalar,
272 class LocalOrdinalViewType>
273 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
274 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
275 const LocalOrdinalViewType & targetMapToOrigRow,
276 const LocalOrdinalViewType & targetMapToImportRow,
277 const LocalOrdinalViewType & Bcol2Ccol,
278 const LocalOrdinalViewType & Icol2Ccol,
279 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
280 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
281 const std::string& label,
282 const Teuchos::RCP<Teuchos::ParameterList>& params) {
283 #ifdef HAVE_TPETRA_MMM_TIMINGS
284 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
285 using Teuchos::TimeMonitor;
286 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse LTGCore"))));
289 using Teuchos::Array;
290 using Teuchos::ArrayRCP;
291 using Teuchos::ArrayView;
298 typedef typename KCRS::StaticCrsGraphType graph_t;
299 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
300 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
301 typedef typename KCRS::values_type::non_const_type scalar_view_t;
304 typedef LocalOrdinal LO;
305 typedef GlobalOrdinal GO;
306 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
307 typedef Map<LO,GO,NO> map_type;
312 typedef NO::execution_space execution_space;
313 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
316 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
317 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
318 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
321 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
322 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
323 const KCRS & Cmat = C.getLocalMatrixDevice();
325 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
326 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
327 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
328 scalar_view_t Cvals = Cmat.values;
330 c_lno_view_t Irowptr;
331 c_lno_nnz_view_t Icolind;
333 if(!Bview.importMatrix.is_null()) {
334 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
335 Irowptr = lclB.graph.row_map;
336 Icolind = lclB.graph.entries;
341 RCP<const map_type> Ccolmap = C.getColMap();
342 size_t m = Aview.origMatrix->getLocalNumRows();
343 size_t n = Ccolmap->getLocalNumElements();
346 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
347 if(!params.is_null()) {
348 if(params->isParameter(
"openmp: ltg thread max"))
349 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
352 double thread_chunk = (double)(m) / thread_max;
355 Kokkos::parallel_for(
"MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
358 size_t my_thread_start = tid * thread_chunk;
359 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
362 std::vector<size_t> c_status(n,INVALID);
365 size_t CSR_ip = 0, OLD_ip = 0;
366 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
370 CSR_ip = Crowptr(i+1);
371 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
372 c_status[Ccolind(k)] = k;
377 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
379 const SC Aval = Avals(k);
383 if (targetMapToOrigRow(Aik) != LO_INVALID) {
385 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
387 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
389 LO Cij = Bcol2Ccol(Bkj);
391 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
392 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
393 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
395 Cvals(c_status[Cij]) += Aval * Bvals(j);
399 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
400 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
402 LO Cij = Icol2Ccol(Ikj);
404 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
405 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
406 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
408 Cvals(c_status[Cij]) += Aval * Ivals(j);
419 template<
class Scalar,
422 class LocalOrdinalViewType>
423 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
424 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> & Dinv,
425 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
426 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
427 const LocalOrdinalViewType & targetMapToOrigRow,
428 const LocalOrdinalViewType & targetMapToImportRow,
429 const LocalOrdinalViewType & Bcol2Ccol,
430 const LocalOrdinalViewType & Icol2Ccol,
431 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
432 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
433 const std::string& label,
434 const Teuchos::RCP<Teuchos::ParameterList>& params) {
435 #ifdef HAVE_TPETRA_MMM_TIMINGS
436 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
437 using Teuchos::TimeMonitor;
438 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix LTGCore"))));
441 using Teuchos::Array;
442 using Teuchos::ArrayRCP;
443 using Teuchos::ArrayView;
448 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
450 typedef typename KCRS::device_type device_t;
451 typedef typename KCRS::StaticCrsGraphType graph_t;
452 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
453 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
454 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
455 typedef typename KCRS::values_type::non_const_type scalar_view_t;
459 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
460 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
463 typedef typename scalar_view_t::memory_space scalar_memory_space;
466 typedef LocalOrdinal LO;
467 typedef GlobalOrdinal GO;
469 typedef Map<LO,GO,NO> map_type;
474 typedef NO::execution_space execution_space;
475 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
478 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
479 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
480 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
483 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
484 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
486 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
487 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
488 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
489 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
491 c_lno_view_t Irowptr;
492 lno_nnz_view_t Icolind;
494 if(!Bview.importMatrix.is_null()) {
495 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
496 Irowptr = lclB.graph.row_map;
497 Icolind = lclB.graph.entries;
499 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
504 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
507 RCP<const map_type> Ccolmap = C.getColMap();
508 size_t m = Aview.origMatrix->getLocalNumRows();
509 size_t n = Ccolmap->getLocalNumElements();
510 size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
513 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
514 if(!params.is_null()) {
515 if(params->isParameter(
"openmp: ltg thread max"))
516 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
520 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
522 lno_nnz_view_t entriesC;
523 scalar_view_t valuesC;
527 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
530 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind",thread_max);
531 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values",thread_max);
533 double thread_chunk = (double)(m) / thread_max;
536 Kokkos::parallel_for(
"Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
539 size_t my_thread_start = tid * thread_chunk;
540 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
541 size_t my_thread_m = my_thread_stop - my_thread_start;
544 size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
547 std::vector<size_t> c_status(n,INVALID);
549 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
550 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
553 size_t CSR_ip = 0, OLD_ip = 0;
554 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
559 row_mapC(i) = CSR_ip;
561 SC minusOmegaDval = -omega*Dvals(i,0);
564 for (
size_t j = Browptr(i); j < Browptr(i+1); j++) {
565 const SC Bval = Bvals(j);
569 LO Cij = Bcol2Ccol(Bij);
572 c_status[Cij] = CSR_ip;
573 Ccolind(CSR_ip) = Cij;
574 Cvals(CSR_ip) = Bvals[j];
580 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
582 const SC Aval = Avals(k);
586 if (targetMapToOrigRow(Aik) != LO_INVALID) {
593 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
596 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
598 LO Cij = Bcol2Ccol(Bkj);
600 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
602 c_status[Cij] = CSR_ip;
603 Ccolind(CSR_ip) = Cij;
604 Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
608 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
619 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
620 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
622 LO Cij = Icol2Ccol(Ikj);
624 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
626 c_status[Cij] = CSR_ip;
627 Ccolind(CSR_ip) = Cij;
628 Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
632 Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
639 if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
641 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
642 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*) Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
646 thread_total_nnz(tid) = CSR_ip;
647 tl_colind(tid) = Ccolind;
648 tl_values(tid) = Cvals;
653 copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
656 #ifdef HAVE_TPETRA_MMM_TIMINGS
657 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix OpenMPSort"))));
660 if (params.is_null() || params->get(
"sort entries",
true))
661 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
662 C.setAllValues(row_mapC,entriesC,valuesC);
669 template<
class Scalar,
672 class LocalOrdinalViewType>
673 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
674 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> & Dinv,
675 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
676 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
677 const LocalOrdinalViewType & targetMapToOrigRow,
678 const LocalOrdinalViewType & targetMapToImportRow,
679 const LocalOrdinalViewType & Bcol2Ccol,
680 const LocalOrdinalViewType & Icol2Ccol,
681 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
682 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
683 const std::string& label,
684 const Teuchos::RCP<Teuchos::ParameterList>& params) {
685 #ifdef HAVE_TPETRA_MMM_TIMINGS
686 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
687 using Teuchos::TimeMonitor;
688 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse LTGCore"))));
690 using Teuchos::Array;
691 using Teuchos::ArrayRCP;
692 using Teuchos::ArrayView;
697 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
700 typedef typename KCRS::StaticCrsGraphType graph_t;
701 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
702 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
703 typedef typename KCRS::values_type::non_const_type scalar_view_t;
706 typedef typename scalar_view_t::memory_space scalar_memory_space;
709 typedef LocalOrdinal LO;
710 typedef GlobalOrdinal GO;
712 typedef Map<LO,GO,NO> map_type;
717 typedef NO::execution_space execution_space;
718 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
721 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
722 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
723 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
726 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
727 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
728 const KCRS & Cmat = C.getLocalMatrixDevice();
730 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
731 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
732 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
733 scalar_view_t Cvals = Cmat.values;
735 c_lno_view_t Irowptr;
736 c_lno_nnz_view_t Icolind;
738 if(!Bview.importMatrix.is_null()) {
739 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
740 Irowptr = lclB.graph.row_map;
741 Icolind = lclB.graph.entries;
747 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
750 RCP<const map_type> Ccolmap = C.getColMap();
751 size_t m = Aview.origMatrix->getLocalNumRows();
752 size_t n = Ccolmap->getLocalNumElements();
755 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
756 if(!params.is_null()) {
757 if(params->isParameter(
"openmp: ltg thread max"))
758 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
761 double thread_chunk = (double)(m) / thread_max;
764 Kokkos::parallel_for(
"Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
767 size_t my_thread_start = tid * thread_chunk;
768 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
771 std::vector<size_t> c_status(n,INVALID);
774 size_t CSR_ip = 0, OLD_ip = 0;
775 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
779 CSR_ip = Crowptr(i+1);
781 SC minusOmegaDval = -omega*Dvals(i,0);
783 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
784 c_status[Ccolind(k)] = k;
790 for (
size_t j = Browptr(i); j < Browptr(i+1); j++) {
791 const SC Bval = Bvals(j);
795 LO Cij = Bcol2Ccol(Bij);
798 Cvals(c_status[Cij]) += Bvals(j);
803 for (
size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
805 const SC Aval = Avals(k);
809 if (targetMapToOrigRow(Aik) != LO_INVALID) {
811 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
813 for (
size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
815 LO Cij = Bcol2Ccol(Bkj);
817 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
818 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
819 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
821 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
825 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
826 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
828 LO Cij = Icol2Ccol(Ikj);
830 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
831 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
832 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
834 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
846 template<
class InColindArrayType,
class InValsArrayType,
847 class OutRowptrType,
class OutColindType,
class OutValsType>
848 void copy_out_from_thread_memory(
const OutColindType& thread_total_nnz,
849 const InColindArrayType& Incolind,
850 const InValsArrayType& Invalues,
852 const double thread_chunk,
853 OutRowptrType& row_mapC,
854 OutColindType& entriesC,
855 OutValsType& valuesC ) {
856 typedef OutRowptrType lno_view_t;
857 typedef OutColindType lno_nnz_view_t;
858 typedef OutValsType scalar_view_t;
859 typedef typename lno_view_t::execution_space execution_space;
860 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
863 size_t thread_max = Incolind.size();
869 lno_view_t thread_start_nnz(
"thread_nnz",thread_max+1);
870 Kokkos::parallel_scan(
"LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (
const size_t i,
size_t& update,
const bool final) {
871 size_t mynnz = thread_total_nnz(i);
872 if(
final) thread_start_nnz(i) = update;
874 if(
final && i+1==thread_max) thread_start_nnz(i+1)=update;
876 c_nnz_size = thread_start_nnz(thread_max);
879 row_mapC(m) = thread_start_nnz(thread_max);
882 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size); entriesC = entriesC_;
883 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size); valuesC = valuesC_;
886 Kokkos::parallel_for(
"LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid) {
887 const size_t my_thread_start = tid * thread_chunk;
888 const size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
889 const size_t nnz_thread_start = thread_start_nnz(tid);
891 const size_t nnz_thread_stop = thread_start_nnz(tid+1);
902 if (my_thread_start != 0 ) {
903 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
904 row_mapC(i) += nnz_thread_start;
921 const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
922 for (
size_t i = 0; i < my_num_nnz; ++i) {
923 entriesC(nnz_thread_start + i) = Incolind(tid)(i);
924 valuesC(nnz_thread_start + i) = Invalues(tid)(i);
929 if(Incolind(tid).data()) free(Incolind(tid).data());
930 if(Invalues(tid).data()) free(Invalues(tid).data());
938 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
939 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
940 const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
941 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
942 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
943 const LocalOrdinalViewType & Acol2Brow,
944 const LocalOrdinalViewType & Acol2Irow,
945 const LocalOrdinalViewType & Bcol2Ccol,
946 const LocalOrdinalViewType & Icol2Ccol,
947 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
948 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
949 const std::string& label,
950 const Teuchos::RCP<Teuchos::ParameterList>& params) {
951 #ifdef HAVE_TPETRA_MMM_TIMINGS
952 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
953 using Teuchos::TimeMonitor;
954 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK"))));
955 Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Multiply"))));
958 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
963 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(
new Matrix_t(C.getRowMap(),0));
964 Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(
" MSAK"),params);
966 #ifdef HAVE_TPETRA_MMM_TIMINGS
967 MM2=Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Scale"))));
973 #ifdef HAVE_TPETRA_MMM_TIMINGS
974 MM2=Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Add"))));
978 Teuchos::ParameterList jparams;
979 if(!params.is_null()) {
981 jparams.set(
"label",label+std::string(
" MSAK Add"));
983 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
984 Tpetra::MatrixMatrix::add(one,
false,*Bview.origMatrix,Scalar(-omega),
false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,
false));
985 #ifdef HAVE_TPETRA_MMM_TIMINGS
992 #if defined (HAVE_TPETRA_INST_OPENMP)
994 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
995 static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
996 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
997 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
998 const LocalOrdinalViewType & Acol2PRow,
999 const LocalOrdinalViewType & Acol2PRowImport,
1000 const LocalOrdinalViewType & Pcol2Accol,
1001 const LocalOrdinalViewType & PIcol2Accol,
1002 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1003 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1004 const std::string& label,
1005 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1008 using Tpetra::MatrixMatrix::UnmanagedView;
1009 #ifdef HAVE_TPETRA_MMM_TIMINGS
1010 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1012 using Teuchos::TimeMonitor;
1013 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix LTGCore"))));
1016 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1018 typedef LocalOrdinal LO;
1019 typedef GlobalOrdinal GO;
1021 typedef Map<LO,GO,NO> map_type;
1023 typedef typename KCRS::StaticCrsGraphType graph_t;
1024 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1025 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1026 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1027 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1028 typedef typename KCRS::device_type device_t;
1029 typedef typename device_t::execution_space execution_space;
1030 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1033 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1034 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1035 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1037 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1038 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1041 RCP<const map_type> Accolmap = Ac.getColMap();
1042 size_t m = Rview.origMatrix->getLocalNumRows();
1043 size_t n = Accolmap->getLocalNumElements();
1046 const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1047 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1048 const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1050 c_lno_view_t Rrowptr = Rmat.graph.row_map,
1051 Arowptr = Amat.graph.row_map,
1052 Prowptr = Pmat.graph.row_map, Irowptr;
1053 const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1054 Acolind = Amat.graph.entries,
1055 Pcolind = Pmat.graph.entries;
1056 lno_nnz_view_t Icolind;
1057 const scalar_view_t Rvals = Rmat.values,
1058 Avals = Amat.values,
1059 Pvals = Pmat.values;
1060 scalar_view_t Ivals;
1062 if (!Pview.importMatrix.is_null())
1064 const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1065 Irowptr = Imat.graph.row_map;
1066 Icolind = Imat.graph.entries;
1067 Ivals = Imat.values;
1078 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1079 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1082 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1083 if(!params.is_null()) {
1084 if(params->isParameter(
"openmp: ltg thread max"))
1085 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
1088 double thread_chunk = (double)(m) / thread_max;
1091 lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
1093 lno_nnz_view_t entriesAc;
1094 scalar_view_t valuesAc;
1098 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz",thread_max+1);
1108 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
1109 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
1112 Kokkos::parallel_for(
"MMM::RAP::NewMatrix::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
1115 size_t my_thread_start = tid * thread_chunk;
1116 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1117 size_t my_thread_m = my_thread_stop - my_thread_start;
1119 size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
1121 std::vector<size_t> ac_status(n, INVALID);
1124 u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
1125 u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1126 u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1129 size_t nnz = 0, nnz_old = 0;
1131 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1135 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1137 const SC Rik = Rvals(kk);
1141 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1143 const SC Akl = Avals(ll);
1147 if (Acol2PRow[l] != LO_INVALID) {
1154 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1157 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1159 LO Acj = Pcol2Accol(j);
1162 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1163 #ifdef HAVE_TPETRA_DEBUG
1165 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1167 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1170 ac_status[Acj] = nnz;
1171 Accolind(nnz) = Acj;
1172 Acvals(nnz) = Rik*Akl*Plj;
1175 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1185 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1186 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1188 LO Acj = PIcol2Accol(j);
1191 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1192 #ifdef HAVE_TPETRA_DEBUG
1194 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1196 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1199 ac_status[Acj] = nnz;
1200 Accolind(nnz) = Acj;
1201 Acvals(nnz) = Rik*Akl*Plj;
1204 Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1211 if (nnz + n > nnzAllocated) {
1213 Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1214 Acvals = u_scalar_view_t((
typename u_scalar_view_t::data_type) realloc((
void*) Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1218 thread_total_nnz(tid) = nnz;
1219 tl_colind(tid) = Accolind;
1220 tl_values(tid) = Acvals;
1222 #ifdef HAVE_TPETRA_MMM_TIMINGS
1223 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix copy from thread local"))));
1226 copy_out_from_thread_memory(thread_total_nnz,tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1228 #ifdef HAVE_TPETRA_MMM_TIMINGS
1229 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
1233 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1235 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1237 #ifdef HAVE_TPETRA_MMM_TIMINGS
1238 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
1249 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1250 labelList->set(
"Timer Label",label);
1251 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1252 RCP<const Export<LO,GO,NO> > dummyExport;
1253 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1254 Rview.origMatrix->getRangeMap(),
1263 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
1264 static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1265 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1266 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1267 const LocalOrdinalViewType & Acol2PRow,
1268 const LocalOrdinalViewType & Acol2PRowImport,
1269 const LocalOrdinalViewType & Pcol2Accol,
1270 const LocalOrdinalViewType & PIcol2Accol,
1271 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1272 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1273 const std::string& label,
1274 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1277 using Tpetra::MatrixMatrix::UnmanagedView;
1278 #ifdef HAVE_TPETRA_MMM_TIMINGS
1279 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1280 using Teuchos::TimeMonitor;
1282 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse LTGCore"))));
1285 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1287 typedef LocalOrdinal LO;
1288 typedef GlobalOrdinal GO;
1290 typedef Map<LO,GO,NO> map_type;
1292 typedef typename KCRS::StaticCrsGraphType graph_t;
1293 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1294 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1295 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1296 typedef typename KCRS::device_type device_t;
1297 typedef typename device_t::execution_space execution_space;
1298 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1300 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1301 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1302 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1305 RCP<const map_type> Accolmap = Ac.getColMap();
1306 size_t m = Rview.origMatrix->getLocalNumRows();
1307 size_t n = Accolmap->getLocalNumElements();
1310 const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1311 const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1312 const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1313 const KCRS & Cmat = Ac.getLocalMatrixDevice();
1315 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1316 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1317 lno_nnz_view_t Icolind;
1318 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1319 scalar_view_t Cvals = Cmat.values;
1320 scalar_view_t Ivals;
1322 if (!Pview.importMatrix.is_null())
1324 const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1325 Irowptr = Imat.graph.row_map;
1326 Icolind = Imat.graph.entries;
1327 Ivals = Imat.values;
1331 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1332 if(!params.is_null()) {
1333 if(params->isParameter(
"openmp: ltg thread max"))
1334 thread_max = std::max((
size_t)1,std::min(thread_max,params->get(
"openmp: ltg thread max",thread_max)));
1337 double thread_chunk = (double)(m) / thread_max;
1340 Kokkos::parallel_for(
"MMM::RAP::Reuse::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](
const size_t tid)
1343 size_t my_thread_start = tid * thread_chunk;
1344 size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1346 std::vector<size_t> c_status(n, INVALID);
1349 size_t OLD_ip = 0, CSR_ip = 0;
1351 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1354 OLD_ip = Crowptr(i);
1355 CSR_ip = Crowptr(i+1);
1356 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
1357 c_status[Ccolind(k)] = k;
1363 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1365 const SC Rik = Rvals(kk);
1369 for (
size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1371 const SC Akl = Avals(ll);
1375 if (Acol2PRow[l] != LO_INVALID) {
1382 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1385 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1387 LO Cij = Pcol2Accol(j);
1389 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1390 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1391 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1394 Cvals(c_status[Cij]) += Rik*Akl*Plj;
1403 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1404 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1406 LO Cij = PIcol2Accol(j);
1408 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1409 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1410 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1412 Cvals(c_status[Cij]) += Rik*Akl*Plj;
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...