10 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
12 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
13 #include "Tpetra_RowMatrixTransposer.hpp"
17 namespace MatrixMatrix {
19 namespace ExtraKernels {
22 template <
class CrsMatrixType>
23 size_t C_estimate_nnz_per_row(CrsMatrixType& A, CrsMatrixType& B) {
25 size_t Aest = 100, Best = 100;
26 if (A.getLocalNumEntries() > 0)
27 Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 100;
28 if (B.getLocalNumEntries() > 0)
29 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries() / B.getLocalNumRows() : 100;
31 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
32 nnzperrow *= nnzperrow;
38 template <
class CrsMatrixType>
39 size_t Ac_estimate_nnz(CrsMatrixType& A, CrsMatrixType& P) {
40 size_t nnzPerRowA = 100, Pcols = 100;
41 if (A.getLocalNumEntries() > 0)
42 nnzPerRowA = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 9;
43 if (P.getLocalNumEntries() > 0)
44 Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
45 return (
size_t)(Pcols * nnzPerRowA + 5 * nnzPerRowA + 300);
48 #if defined(HAVE_TPETRA_INST_OPENMP)
50 template <
class Scalar,
53 class LocalOrdinalViewType>
54 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
55 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
56 const LocalOrdinalViewType& targetMapToOrigRow,
57 const LocalOrdinalViewType& targetMapToImportRow,
58 const LocalOrdinalViewType& Bcol2Ccol,
59 const LocalOrdinalViewType& Icol2Ccol,
60 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
61 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
62 const std::string& label,
63 const Teuchos::RCP<Teuchos::ParameterList>& params) {
64 #ifdef HAVE_TPETRA_MMM_TIMINGS
65 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
66 using Teuchos::TimeMonitor;
67 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix LTGCore"))));
70 using Teuchos::ArrayRCP;
71 using Teuchos::ArrayView;
77 typedef typename KCRS::device_type device_t;
78 typedef typename KCRS::StaticCrsGraphType graph_t;
79 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
80 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
81 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
82 typedef typename KCRS::values_type::non_const_type scalar_view_t;
86 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
87 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
90 typedef LocalOrdinal LO;
91 typedef GlobalOrdinal GO;
92 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
93 typedef Map<LO, GO, NO> map_type;
98 typedef NO::execution_space execution_space;
99 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
102 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
103 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
104 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
107 const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
108 const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
110 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
111 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
112 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
113 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
115 c_lno_view_t Irowptr;
116 lno_nnz_view_t Icolind;
118 if (!Bview.importMatrix.is_null()) {
119 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
120 Irowptr = lclB.graph.row_map;
121 Icolind = lclB.graph.entries;
123 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
127 RCP<const map_type> Ccolmap = C.getColMap();
128 size_t m = Aview.origMatrix->getLocalNumRows();
129 size_t n = Ccolmap->getLocalNumElements();
130 size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
133 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
134 if (!params.is_null()) {
135 if (params->isParameter(
"openmp: ltg thread max"))
136 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
140 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
142 lno_nnz_view_t entriesC;
143 scalar_view_t valuesC;
147 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
150 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
151 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
153 double thread_chunk = (double)(m) / thread_max;
156 Kokkos::parallel_for(
"MMM::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
158 size_t my_thread_start = tid * thread_chunk;
159 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
160 size_t my_thread_m = my_thread_stop - my_thread_start;
163 size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
166 std::vector<size_t> c_status(n, INVALID);
168 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
169 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
172 size_t CSR_ip = 0, OLD_ip = 0;
173 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
177 row_mapC(i) = CSR_ip;
180 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
182 const SC Aval = Avals(k);
186 if (targetMapToOrigRow(Aik) != LO_INVALID) {
193 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
196 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
198 LO Cij = Bcol2Ccol(Bkj);
200 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
202 c_status[Cij] = CSR_ip;
203 Ccolind(CSR_ip) = Cij;
204 Cvals(CSR_ip) = Aval * Bvals(j);
208 Cvals(c_status[Cij]) += Aval * Bvals(j);
219 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
220 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
222 LO Cij = Icol2Ccol(Ikj);
224 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
226 c_status[Cij] = CSR_ip;
227 Ccolind(CSR_ip) = Cij;
228 Cvals(CSR_ip) = Aval * Ivals(j);
232 Cvals(c_status[Cij]) += Aval * Ivals(j);
239 if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1)) * b_max_nnz_per_row) > CSR_alloc) {
241 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
242 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
246 thread_total_nnz(tid) = CSR_ip;
247 tl_colind(tid) = Ccolind;
248 tl_values(tid) = Cvals;
252 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
254 #ifdef HAVE_TPETRA_MMM_TIMINGS
256 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix OpenMPSort"))));
259 if (params.is_null() || params->get(
"sort entries",
true))
260 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
261 C.setAllValues(row_mapC, entriesC, valuesC);
265 template <
class Scalar,
268 class LocalOrdinalViewType>
269 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
270 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
271 const LocalOrdinalViewType& targetMapToOrigRow,
272 const LocalOrdinalViewType& targetMapToImportRow,
273 const LocalOrdinalViewType& Bcol2Ccol,
274 const LocalOrdinalViewType& Icol2Ccol,
275 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
276 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
277 const std::string& label,
278 const Teuchos::RCP<Teuchos::ParameterList>& params) {
279 #ifdef HAVE_TPETRA_MMM_TIMINGS
280 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
281 using Teuchos::TimeMonitor;
282 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse LTGCore"))));
285 using Teuchos::Array;
286 using Teuchos::ArrayRCP;
287 using Teuchos::ArrayView;
294 typedef typename KCRS::StaticCrsGraphType graph_t;
295 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
296 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
297 typedef typename KCRS::values_type::non_const_type scalar_view_t;
300 typedef LocalOrdinal LO;
301 typedef GlobalOrdinal GO;
302 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
303 typedef Map<LO, GO, NO> map_type;
308 typedef NO::execution_space execution_space;
309 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
312 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
313 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
314 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
317 const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
318 const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
319 const KCRS& Cmat = C.getLocalMatrixDevice();
321 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
322 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
323 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
324 scalar_view_t Cvals = Cmat.values;
326 c_lno_view_t Irowptr;
327 c_lno_nnz_view_t Icolind;
329 if (!Bview.importMatrix.is_null()) {
330 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
331 Irowptr = lclB.graph.row_map;
332 Icolind = lclB.graph.entries;
337 RCP<const map_type> Ccolmap = C.getColMap();
338 size_t m = Aview.origMatrix->getLocalNumRows();
339 size_t n = Ccolmap->getLocalNumElements();
342 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
343 if (!params.is_null()) {
344 if (params->isParameter(
"openmp: ltg thread max"))
345 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
348 double thread_chunk = (double)(m) / thread_max;
351 Kokkos::parallel_for(
"MMM::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
353 size_t my_thread_start = tid * thread_chunk;
354 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
357 std::vector<size_t> c_status(n, INVALID);
360 size_t CSR_ip = 0, OLD_ip = 0;
361 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
365 CSR_ip = Crowptr(i + 1);
366 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
367 c_status[Ccolind(k)] = k;
372 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
374 const SC Aval = Avals(k);
378 if (targetMapToOrigRow(Aik) != LO_INVALID) {
380 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
382 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
384 LO Cij = Bcol2Ccol(Bkj);
386 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
387 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
388 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
390 Cvals(c_status[Cij]) += Aval * Bvals(j);
394 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
395 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
397 LO Cij = Icol2Ccol(Ikj);
399 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
400 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
401 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
403 Cvals(c_status[Cij]) += Aval * Ivals(j);
414 template <
class Scalar,
417 class LocalOrdinalViewType>
418 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
419 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
420 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
421 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
422 const LocalOrdinalViewType& targetMapToOrigRow,
423 const LocalOrdinalViewType& targetMapToImportRow,
424 const LocalOrdinalViewType& Bcol2Ccol,
425 const LocalOrdinalViewType& Icol2Ccol,
426 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
427 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
428 const std::string& label,
429 const Teuchos::RCP<Teuchos::ParameterList>& params) {
430 #ifdef HAVE_TPETRA_MMM_TIMINGS
431 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
432 using Teuchos::TimeMonitor;
433 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix LTGCore"))));
436 using Teuchos::Array;
437 using Teuchos::ArrayRCP;
438 using Teuchos::ArrayView;
443 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
445 typedef typename KCRS::device_type device_t;
446 typedef typename KCRS::StaticCrsGraphType graph_t;
447 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
448 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
449 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
450 typedef typename KCRS::values_type::non_const_type scalar_view_t;
454 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
455 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
458 typedef typename scalar_view_t::memory_space scalar_memory_space;
461 typedef LocalOrdinal LO;
462 typedef GlobalOrdinal GO;
464 typedef Map<LO, GO, NO> map_type;
469 typedef NO::execution_space execution_space;
470 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
473 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
474 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
475 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
478 const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
479 const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
481 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
482 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
483 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
484 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
486 c_lno_view_t Irowptr;
487 lno_nnz_view_t Icolind;
489 if (!Bview.importMatrix.is_null()) {
490 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
491 Irowptr = lclB.graph.row_map;
492 Icolind = lclB.graph.entries;
494 b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
499 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
502 RCP<const map_type> Ccolmap = C.getColMap();
503 size_t m = Aview.origMatrix->getLocalNumRows();
504 size_t n = Ccolmap->getLocalNumElements();
505 size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
508 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
509 if (!params.is_null()) {
510 if (params->isParameter(
"openmp: ltg thread max"))
511 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
515 lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
517 lno_nnz_view_t entriesC;
518 scalar_view_t valuesC;
522 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
525 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
526 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
528 double thread_chunk = (double)(m) / thread_max;
531 Kokkos::parallel_for(
"Jacobi::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
533 size_t my_thread_start = tid * thread_chunk;
534 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
535 size_t my_thread_m = my_thread_stop - my_thread_start;
538 size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
541 std::vector<size_t> c_status(n, INVALID);
543 u_lno_nnz_view_t Ccolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
544 u_scalar_view_t Cvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
547 size_t CSR_ip = 0, OLD_ip = 0;
548 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
553 row_mapC(i) = CSR_ip;
555 SC minusOmegaDval = -omega * Dvals(i, 0);
558 for (
size_t j = Browptr(i); j < Browptr(i + 1); j++) {
559 const SC Bval = Bvals(j);
563 LO Cij = Bcol2Ccol(Bij);
566 c_status[Cij] = CSR_ip;
567 Ccolind(CSR_ip) = Cij;
568 Cvals(CSR_ip) = Bvals[j];
574 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
576 const SC Aval = Avals(k);
580 if (targetMapToOrigRow(Aik) != LO_INVALID) {
587 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
590 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
592 LO Cij = Bcol2Ccol(Bkj);
594 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
596 c_status[Cij] = CSR_ip;
597 Ccolind(CSR_ip) = Cij;
598 Cvals(CSR_ip) = minusOmegaDval * Aval * Bvals(j);
602 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
613 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
614 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
616 LO Cij = Icol2Ccol(Ikj);
618 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
620 c_status[Cij] = CSR_ip;
621 Ccolind(CSR_ip) = Cij;
622 Cvals(CSR_ip) = minusOmegaDval * Aval * Ivals(j);
626 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
633 if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1) + 1) * b_max_nnz_per_row) > CSR_alloc) {
635 Ccolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
636 Cvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
640 thread_total_nnz(tid) = CSR_ip;
641 tl_colind(tid) = Ccolind;
642 tl_values(tid) = Cvals;
646 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
648 #ifdef HAVE_TPETRA_MMM_TIMINGS
650 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix OpenMPSort"))));
653 if (params.is_null() || params->get(
"sort entries",
true))
654 Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
655 C.setAllValues(row_mapC, entriesC, valuesC);
659 template <
class Scalar,
662 class LocalOrdinalViewType>
663 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
664 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
665 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
666 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
667 const LocalOrdinalViewType& targetMapToOrigRow,
668 const LocalOrdinalViewType& targetMapToImportRow,
669 const LocalOrdinalViewType& Bcol2Ccol,
670 const LocalOrdinalViewType& Icol2Ccol,
671 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
672 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
673 const std::string& label,
674 const Teuchos::RCP<Teuchos::ParameterList>& params) {
675 #ifdef HAVE_TPETRA_MMM_TIMINGS
676 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
677 using Teuchos::TimeMonitor;
678 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse LTGCore"))));
680 using Teuchos::Array;
681 using Teuchos::ArrayRCP;
682 using Teuchos::ArrayView;
687 typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
690 typedef typename KCRS::StaticCrsGraphType graph_t;
691 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
692 typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
693 typedef typename KCRS::values_type::non_const_type scalar_view_t;
696 typedef typename scalar_view_t::memory_space scalar_memory_space;
699 typedef LocalOrdinal LO;
700 typedef GlobalOrdinal GO;
702 typedef Map<LO, GO, NO> map_type;
707 typedef NO::execution_space execution_space;
708 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
711 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
712 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
713 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
716 const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
717 const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
718 const KCRS& Cmat = C.getLocalMatrixDevice();
720 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
721 const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
722 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
723 scalar_view_t Cvals = Cmat.values;
725 c_lno_view_t Irowptr;
726 c_lno_nnz_view_t Icolind;
728 if (!Bview.importMatrix.is_null()) {
729 auto lclB = Bview.importMatrix->getLocalMatrixDevice();
730 Irowptr = lclB.graph.row_map;
731 Icolind = lclB.graph.entries;
737 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
740 RCP<const map_type> Ccolmap = C.getColMap();
741 size_t m = Aview.origMatrix->getLocalNumRows();
742 size_t n = Ccolmap->getLocalNumElements();
745 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
746 if (!params.is_null()) {
747 if (params->isParameter(
"openmp: ltg thread max"))
748 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
751 double thread_chunk = (double)(m) / thread_max;
754 Kokkos::parallel_for(
"Jacobi::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
756 size_t my_thread_start = tid * thread_chunk;
757 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
760 std::vector<size_t> c_status(n, INVALID);
763 size_t CSR_ip = 0, OLD_ip = 0;
764 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
768 CSR_ip = Crowptr(i + 1);
770 SC minusOmegaDval = -omega * Dvals(i, 0);
772 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
773 c_status[Ccolind(k)] = k;
779 for (
size_t j = Browptr(i); j < Browptr(i + 1); j++) {
780 const SC Bval = Bvals(j);
784 LO Cij = Bcol2Ccol(Bij);
787 Cvals(c_status[Cij]) += Bvals(j);
791 for (
size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
793 const SC Aval = Avals(k);
797 if (targetMapToOrigRow(Aik) != LO_INVALID) {
799 size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
801 for (
size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
803 LO Cij = Bcol2Ccol(Bkj);
805 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
806 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
807 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
809 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
813 size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
814 for (
size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
816 LO Cij = Icol2Ccol(Ikj);
818 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
819 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
820 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
822 Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
833 template <
class InColindArrayType,
class InValsArrayType,
834 class OutRowptrType,
class OutColindType,
class OutValsType>
835 void copy_out_from_thread_memory(
const OutColindType& thread_total_nnz,
836 const InColindArrayType& Incolind,
837 const InValsArrayType& Invalues,
839 const double thread_chunk,
840 OutRowptrType& row_mapC,
841 OutColindType& entriesC,
842 OutValsType& valuesC) {
843 typedef OutRowptrType lno_view_t;
844 typedef OutColindType lno_nnz_view_t;
845 typedef OutValsType scalar_view_t;
846 typedef typename lno_view_t::execution_space execution_space;
847 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
850 size_t thread_max = Incolind.size();
851 size_t c_nnz_size = 0;
856 lno_view_t thread_start_nnz(
"thread_nnz", thread_max + 1);
857 Kokkos::parallel_scan(
"LTG::Scan", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t i,
size_t& update,
const bool final) {
858 size_t mynnz = thread_total_nnz(i);
859 if (
final) thread_start_nnz(i) = update;
861 if (
final && i + 1 == thread_max) thread_start_nnz(i + 1) = update;
863 c_nnz_size = thread_start_nnz(thread_max);
866 row_mapC(m) = thread_start_nnz(thread_max);
869 lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
870 entriesC = entriesC_;
871 scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
875 Kokkos::parallel_for(
"LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
876 const size_t my_thread_start = tid * thread_chunk;
877 const size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
878 const size_t nnz_thread_start = thread_start_nnz(tid);
880 const size_t nnz_thread_stop = thread_start_nnz(tid + 1);
891 if (my_thread_start != 0) {
892 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
893 row_mapC(i) += nnz_thread_start;
909 const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
910 for (
size_t i = 0; i < my_num_nnz; ++i) {
911 entriesC(nnz_thread_start + i) = Incolind(tid)(i);
912 valuesC(nnz_thread_start + i) = Invalues(tid)(i);
917 if (Incolind(tid).data()) free(Incolind(tid).data());
918 if (Invalues(tid).data()) free(Invalues(tid).data());
925 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
926 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
927 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
928 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
929 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
930 const LocalOrdinalViewType& Acol2Brow,
931 const LocalOrdinalViewType& Acol2Irow,
932 const LocalOrdinalViewType& Bcol2Ccol,
933 const LocalOrdinalViewType& Icol2Ccol,
934 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
935 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Cimport,
936 const std::string& label,
937 const Teuchos::RCP<Teuchos::ParameterList>& params) {
938 #ifdef HAVE_TPETRA_MMM_TIMINGS
939 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
940 using Teuchos::TimeMonitor;
941 Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK"))));
942 Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Multiply"))));
945 typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
950 Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(
new Matrix_t(C.getRowMap(), 0));
951 Tpetra::MMdetails::mult_A_B_newmatrix(Aview, Bview, *AB, label + std::string(
" MSAK"), params);
953 #ifdef HAVE_TPETRA_MMM_TIMINGS
955 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Scale"))));
961 #ifdef HAVE_TPETRA_MMM_TIMINGS
963 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix MSAK Add"))));
967 Teuchos::ParameterList jparams;
968 if (!params.is_null()) {
970 jparams.set(
"label", label + std::string(
" MSAK Add"));
972 Scalar one = Teuchos::ScalarTraits<Scalar>::one();
973 Tpetra::MatrixMatrix::add(one,
false, *Bview.origMatrix, Scalar(-omega),
false, *AB, C, AB->getDomainMap(), AB->getRangeMap(), Teuchos::rcp(&jparams,
false));
974 #ifdef HAVE_TPETRA_MMM_TIMINGS
979 #if defined(HAVE_TPETRA_INST_OPENMP)
981 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
982 static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
983 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
984 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
985 const LocalOrdinalViewType& Acol2PRow,
986 const LocalOrdinalViewType& Acol2PRowImport,
987 const LocalOrdinalViewType& Pcol2Accol,
988 const LocalOrdinalViewType& PIcol2Accol,
989 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
990 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
991 const std::string& label,
992 const Teuchos::RCP<Teuchos::ParameterList>& params) {
994 using Tpetra::MatrixMatrix::UnmanagedView;
995 #ifdef HAVE_TPETRA_MMM_TIMINGS
996 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
998 using Teuchos::TimeMonitor;
999 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix LTGCore"))));
1002 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1004 typedef LocalOrdinal LO;
1005 typedef GlobalOrdinal GO;
1007 typedef Map<LO, GO, NO> map_type;
1009 typedef typename KCRS::StaticCrsGraphType graph_t;
1010 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1011 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1012 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1013 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1014 typedef typename KCRS::device_type device_t;
1015 typedef typename device_t::execution_space execution_space;
1016 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1019 typedef UnmanagedView<lno_view_t> u_lno_view_t;
1020 typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1021 typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1023 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1024 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1027 RCP<const map_type> Accolmap = Ac.getColMap();
1028 size_t m = Rview.origMatrix->getLocalNumRows();
1029 size_t n = Accolmap->getLocalNumElements();
1032 const KCRS& Rmat = Rview.origMatrix->getLocalMatrixDevice();
1033 const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
1034 const KCRS& Pmat = Pview.origMatrix->getLocalMatrixDevice();
1036 c_lno_view_t Rrowptr = Rmat.graph.row_map,
1037 Arowptr = Amat.graph.row_map,
1038 Prowptr = Pmat.graph.row_map, Irowptr;
1039 const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1040 Acolind = Amat.graph.entries,
1041 Pcolind = Pmat.graph.entries;
1042 lno_nnz_view_t Icolind;
1043 const scalar_view_t Rvals = Rmat.values,
1044 Avals = Amat.values,
1045 Pvals = Pmat.values;
1046 scalar_view_t Ivals;
1048 if (!Pview.importMatrix.is_null()) {
1049 const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1050 Irowptr = Imat.graph.row_map;
1051 Icolind = Imat.graph.entries;
1052 Ivals = Imat.values;
1063 size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1064 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1067 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1068 if (!params.is_null()) {
1069 if (params->isParameter(
"openmp: ltg thread max"))
1070 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
1073 double thread_chunk = (double)(m) / thread_max;
1076 lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), m + 1);
1078 lno_nnz_view_t entriesAc;
1079 scalar_view_t valuesAc;
1083 lno_nnz_view_t thread_total_nnz(
"thread_total_nnz", thread_max + 1);
1093 Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind(
"top_colind", thread_max);
1094 Kokkos::View<u_scalar_view_t*, device_t> tl_values(
"top_values", thread_max);
1097 Kokkos::parallel_for(
"MMM::RAP::NewMatrix::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
1099 size_t my_thread_start = tid * thread_chunk;
1100 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
1101 size_t my_thread_m = my_thread_stop - my_thread_start;
1103 size_t nnzAllocated = (size_t)(my_thread_m * Acest_nnz_per_row + 100);
1105 std::vector<size_t> ac_status(n, INVALID);
1108 u_lno_view_t Acrowptr((
typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m + 1)), my_thread_m + 1);
1109 u_lno_nnz_view_t Accolind((
typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1110 u_scalar_view_t Acvals((
typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1113 size_t nnz = 0, nnz_old = 0;
1115 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1119 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1121 const SC Rik = Rvals(kk);
1125 for (
size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1127 const SC Akl = Avals(ll);
1131 if (Acol2PRow[l] != LO_INVALID) {
1138 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1141 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1143 LO Acj = Pcol2Accol(j);
1146 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1147 #ifdef HAVE_TPETRA_DEBUG
1149 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1151 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1154 ac_status[Acj] = nnz;
1155 Accolind(nnz) = Acj;
1156 Acvals(nnz) = Rik * Akl * Plj;
1159 Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1169 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1170 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1172 LO Acj = PIcol2Accol(j);
1175 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1176 #ifdef HAVE_TPETRA_DEBUG
1178 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1180 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1183 ac_status[Acj] = nnz;
1184 Accolind(nnz) = Acj;
1185 Acvals(nnz) = Rik * Akl * Plj;
1188 Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1195 if (nnz + n > nnzAllocated) {
1197 Accolind = u_lno_nnz_view_t((
typename u_lno_nnz_view_t::data_type)realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1198 Acvals = u_scalar_view_t((
typename u_scalar_view_t::data_type)realloc((
void*)Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1202 thread_total_nnz(tid) = nnz;
1203 tl_colind(tid) = Accolind;
1204 tl_values(tid) = Acvals;
1206 #ifdef HAVE_TPETRA_MMM_TIMINGS
1208 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix copy from thread local"))));
1211 copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1213 #ifdef HAVE_TPETRA_MMM_TIMINGS
1215 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
1219 Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1221 Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1223 #ifdef HAVE_TPETRA_MMM_TIMINGS
1225 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
1236 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1237 labelList->set(
"Timer Label", label);
1238 if (!params.is_null()) labelList->set(
"compute global constants", params->get(
"compute global constants",
true));
1239 RCP<const Export<LO, GO, NO> > dummyExport;
1240 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1241 Rview.origMatrix->getRangeMap(),
1248 template <
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class LocalOrdinalViewType>
1249 static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1250 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1251 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1252 const LocalOrdinalViewType& Acol2PRow,
1253 const LocalOrdinalViewType& Acol2PRowImport,
1254 const LocalOrdinalViewType& Pcol2Accol,
1255 const LocalOrdinalViewType& PIcol2Accol,
1256 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1257 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1258 const std::string& label,
1259 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1261 using Tpetra::MatrixMatrix::UnmanagedView;
1262 #ifdef HAVE_TPETRA_MMM_TIMINGS
1263 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1265 using Teuchos::TimeMonitor;
1266 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse LTGCore"))));
1269 typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1271 typedef LocalOrdinal LO;
1272 typedef GlobalOrdinal GO;
1274 typedef Map<LO, GO, NO> map_type;
1276 typedef typename KCRS::StaticCrsGraphType graph_t;
1277 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1278 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1279 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1280 typedef typename KCRS::device_type device_t;
1281 typedef typename device_t::execution_space execution_space;
1282 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1284 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1285 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1286 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1289 RCP<const map_type> Accolmap = Ac.getColMap();
1290 size_t m = Rview.origMatrix->getLocalNumRows();
1291 size_t n = Accolmap->getLocalNumElements();
1294 const KCRS& Rmat = Rview.origMatrix->getLocalMatrixDevice();
1295 const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
1296 const KCRS& Pmat = Pview.origMatrix->getLocalMatrixDevice();
1297 const KCRS& Cmat = Ac.getLocalMatrixDevice();
1299 c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1300 const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1301 lno_nnz_view_t Icolind;
1302 const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1303 scalar_view_t Cvals = Cmat.values;
1304 scalar_view_t Ivals;
1306 if (!Pview.importMatrix.is_null()) {
1307 const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1308 Irowptr = Imat.graph.row_map;
1309 Icolind = Imat.graph.entries;
1310 Ivals = Imat.values;
1314 size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1315 if (!params.is_null()) {
1316 if (params->isParameter(
"openmp: ltg thread max"))
1317 thread_max = std::max((
size_t)1, std::min(thread_max, params->get(
"openmp: ltg thread max", thread_max)));
1320 double thread_chunk = (double)(m) / thread_max;
1323 Kokkos::parallel_for(
"MMM::RAP::Reuse::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](
const size_t tid) {
1325 size_t my_thread_start = tid * thread_chunk;
1326 size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
1328 std::vector<size_t> c_status(n, INVALID);
1331 size_t OLD_ip = 0, CSR_ip = 0;
1333 for (
size_t i = my_thread_start; i < my_thread_stop; i++) {
1336 OLD_ip = Crowptr(i);
1337 CSR_ip = Crowptr(i + 1);
1338 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
1339 c_status[Ccolind(k)] = k;
1345 for (
size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1347 const SC Rik = Rvals(kk);
1351 for (
size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1353 const SC Akl = Avals(ll);
1357 if (Acol2PRow[l] != LO_INVALID) {
1364 size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1367 for (
size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1369 LO Cij = Pcol2Accol(j);
1371 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1372 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
1373 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1375 Cvals(c_status[Cij]) += Rik * Akl * Plj;
1384 size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1385 for (
size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1387 LO Cij = PIcol2Accol(j);
1389 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1390 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
1391 <<
"(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1393 Cvals(c_status[Cij]) += Rik * Akl * Plj;
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...