10 #ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
11 #define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
14 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
15 #include "Teuchos_VerboseObject.hpp"
16 #include "Teuchos_Array.hpp"
18 #include "Tpetra_ConfigDefs.hpp"
19 #include "Tpetra_CrsMatrix.hpp"
21 #include "Tpetra_RowMatrixTransposer.hpp"
22 #include "Tpetra_ConfigDefs.hpp"
23 #include "Tpetra_Map.hpp"
24 #include "Tpetra_Export.hpp"
29 #include "Teuchos_FancyOStream.hpp"
40 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
41 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
42 #include "TpetraExt_MatrixMatrix_HIP.hpp"
43 #include "TpetraExt_MatrixMatrix_SYCL.hpp"
47 namespace TripleMatrixMultiply {
55 template <
class Scalar,
67 bool call_FillComplete_on_result,
68 const std::string& label,
69 const Teuchos::RCP<Teuchos::ParameterList>& params) {
73 typedef LocalOrdinal LO;
74 typedef GlobalOrdinal GO;
83 #ifdef HAVE_TPETRA_MMM_TIMINGS
84 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
85 using Teuchos::TimeMonitor;
86 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All Setup"))));
89 const std::string prefix =
"TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
94 TEUCHOS_TEST_FOR_EXCEPTION(!R.
isFillComplete(), std::runtime_error, prefix <<
"Matrix R is not fill complete.");
95 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
96 TEUCHOS_TEST_FOR_EXCEPTION(!P.
isFillComplete(), std::runtime_error, prefix <<
"Matrix P is not fill complete.");
101 RCP<const crs_matrix_type> Rprime = null;
105 RCP<const crs_matrix_type> Aprime = null;
109 RCP<const crs_matrix_type> Pprime = null;
119 const bool newFlag = !Ac.
getGraph()->isLocallyIndexed() && !Ac.
getGraph()->isGloballyIndexed();
121 using Teuchos::ParameterList;
122 RCP<ParameterList> transposeParams(
new ParameterList);
123 transposeParams->set(
"sort",
false);
125 if (transposeR && &R != &P) {
126 transposer_type transposer(rcpFromRef(R));
127 Rprime = transposer.createTranspose(transposeParams);
129 Rprime = rcpFromRef(R);
133 transposer_type transposer(rcpFromRef(A));
134 Aprime = transposer.createTranspose(transposeParams);
136 Aprime = rcpFromRef(A);
140 transposer_type transposer(rcpFromRef(P));
141 Pprime = transposer.createTranspose(transposeParams);
143 Pprime = rcpFromRef(P);
156 TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
157 prefix <<
"ERROR, inner dimensions of op(R) and op(A) "
158 "must match for matrix-matrix product. op(R) is "
159 << Rleft <<
"x" << Rright <<
", op(A) is " << Aleft <<
"x" << Aright);
161 TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
162 prefix <<
"ERROR, inner dimensions of op(A) and op(P) "
163 "must match for matrix-matrix product. op(A) is "
164 << Aleft <<
"x" << Aright <<
", op(P) is " << Pleft <<
"x" << Pright);
170 TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.
getGlobalNumRows(), std::runtime_error,
171 prefix <<
"ERROR, dimensions of result Ac must "
172 "match dimensions of op(R) * op(A) * op(P). Ac has "
174 <<
" rows, should have at least " << Rleft << std::endl);
186 int numProcs = P.
getComm()->getSize();
190 crs_matrix_struct_type Rview;
191 crs_matrix_struct_type Aview;
192 crs_matrix_struct_type Pview;
194 RCP<const map_type> targetMap_R = Rprime->getRowMap();
195 RCP<const map_type> targetMap_A = Aprime->getRowMap();
196 RCP<const map_type> targetMap_P = Pprime->getRowMap();
198 #ifdef HAVE_TPETRA_MMM_TIMINGS
200 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All I&X"))));
206 RCP<const import_type> dummyImporter;
208 if (!(transposeR && &R == &P))
209 MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter,
true, label, params);
211 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label, params);
216 targetMap_P = Aprime->getColMap();
219 MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(),
false, label, params);
221 RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
223 bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
224 if (needs_final_export)
227 Actemp = rcp(&Ac,
false);
229 #ifdef HAVE_TPETRA_MMM_TIMINGS
231 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All Multiply"))));
235 if (call_FillComplete_on_result && newFlag) {
236 if (transposeR && &R == &P)
237 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
239 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
240 }
else if (call_FillComplete_on_result) {
241 if (transposeR && &R == &P)
242 MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
244 MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
267 if (transposeR && &R == &P)
268 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
270 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
273 if (needs_final_export) {
274 #ifdef HAVE_TPETRA_MMM_TIMINGS
276 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP exportAndFillComplete"))));
278 Teuchos::ParameterList labelList;
279 labelList.set(
"Timer Label", label);
280 Teuchos::ParameterList& labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
282 RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
284 bool overrideAllreduce =
false;
286 if (!params.is_null()) {
287 Teuchos::ParameterList& params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
289 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
290 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
291 if (mm_optimization_core_count2 < mm_optimization_core_count) mm_optimization_core_count = mm_optimization_core_count2;
292 isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
293 overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
295 labelList.set(
"compute global constants", params->get(
"compute global constants",
true));
297 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count,
"Core Count above which the optimized neighbor discovery is used");
299 labelList_subList.set(
"isMatrixMatrix_TransferAndFillComplete", isMM,
300 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
301 labelList_subList.set(
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
303 export_type exporter = export_type(*Pprime->getGraph()->getImporter());
304 Actemp->exportAndFillComplete(Acprime,
306 Acprime->getDomainMap(),
307 Acprime->getRangeMap(),
308 rcp(&labelList,
false));
310 #ifdef HAVE_TPETRA_MMM_STATISTICS
311 printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label + std::string(
" RAP MMM"));
317 namespace MMdetails {
320 template <
class Scalar,
324 void mult_R_A_P_newmatrix(
325 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
326 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
327 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
328 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
329 const std::string& label,
330 const Teuchos::RCP<Teuchos::ParameterList>& params) {
331 using Teuchos::Array;
332 using Teuchos::ArrayRCP;
333 using Teuchos::ArrayView;
338 typedef LocalOrdinal LO;
339 typedef GlobalOrdinal GO;
342 typedef Import<LO, GO, NO> import_type;
343 typedef Map<LO, GO, NO> map_type;
346 typedef typename map_type::local_map_type local_map_type;
348 typedef typename KCRS::StaticCrsGraphType graph_t;
349 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
350 typedef typename NO::execution_space execution_space;
351 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
352 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
354 #ifdef HAVE_TPETRA_MMM_TIMINGS
355 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
356 using Teuchos::TimeMonitor;
357 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
359 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
362 RCP<const import_type> Cimport;
363 RCP<const map_type> Ccolmap;
364 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
365 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
366 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
367 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
368 local_map_type Irowmap_local;
369 if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
370 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
371 local_map_type Icolmap_local;
372 if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
379 lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Pcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
381 if (Pview.importMatrix.is_null()) {
384 Ccolmap = Pview.colMap;
385 const LO colMapSize =
static_cast<LO
>(Pview.colMap->getLocalNumElements());
387 Kokkos::parallel_for(
388 "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
389 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
390 KOKKOS_LAMBDA(
const LO i) {
402 if (!Pimport.is_null() && !Iimport.is_null()) {
403 Cimport = Pimport->setUnion(*Iimport);
404 }
else if (!Pimport.is_null() && Iimport.is_null()) {
405 Cimport = Pimport->setUnion();
406 }
else if (Pimport.is_null() && !Iimport.is_null()) {
407 Cimport = Iimport->setUnion();
409 throw std::runtime_error(
"TpetraExt::RAP status of matrix importers is nonsensical");
411 Ccolmap = Cimport->getTargetMap();
416 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
417 std::runtime_error,
"Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
424 Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
425 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
426 Kokkos::parallel_for(
427 "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement", range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
428 Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
430 Kokkos::parallel_for(
431 "Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement", range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
432 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
440 Ac.replaceColMap(Ccolmap);
458 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
459 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"), Aview.colMap->getLocalNumElements());
460 Kokkos::parallel_for(
461 "Tpetra::mult_R_A_P_newmatrix::construct_tables", range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(
const LO i) {
462 GO aidx = Acolmap_local.getGlobalElement(i);
463 LO P_LID = Prowmap_local.getLocalElement(aidx);
464 if (P_LID != LO_INVALID) {
465 targetMapToOrigRow(i) = P_LID;
466 targetMapToImportRow(i) = LO_INVALID;
468 LO I_LID = Irowmap_local.getLocalElement(aidx);
469 targetMapToOrigRow(i) = LO_INVALID;
470 targetMapToImportRow(i) = I_LID;
476 KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
477 mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
478 targetMapToOrigRow, targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
479 Ac, Cimport, label, params);
483 template <
class Scalar,
487 void mult_R_A_P_reuse(
488 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
489 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
490 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
491 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
492 const std::string& label,
493 const Teuchos::RCP<Teuchos::ParameterList>& params) {
494 using Teuchos::Array;
495 using Teuchos::ArrayRCP;
496 using Teuchos::ArrayView;
501 typedef LocalOrdinal LO;
502 typedef GlobalOrdinal GO;
505 typedef Import<LO, GO, NO> import_type;
506 typedef Map<LO, GO, NO> map_type;
509 typedef typename map_type::local_map_type local_map_type;
511 typedef typename KCRS::StaticCrsGraphType graph_t;
512 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
513 typedef typename NO::execution_space execution_space;
514 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
515 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
517 #ifdef HAVE_TPETRA_MMM_TIMINGS
518 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
519 using Teuchos::TimeMonitor;
520 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
522 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
525 RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
526 RCP<const map_type> Ccolmap = Ac.getColMap();
527 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
528 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
529 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
530 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
531 local_map_type Irowmap_local;
532 if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
533 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
534 local_map_type Icolmap_local;
535 if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
536 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
539 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
543 Kokkos::parallel_for(
544 range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
545 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
548 if (!Pview.importMatrix.is_null()) {
549 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
550 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
552 Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
553 Kokkos::parallel_for(
554 range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
555 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
561 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
562 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"), Aview.colMap->getLocalNumElements());
563 Kokkos::parallel_for(
564 range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(
const LO i) {
565 GO aidx = Acolmap_local.getGlobalElement(i);
566 LO B_LID = Prowmap_local.getLocalElement(aidx);
567 if (B_LID != LO_INVALID) {
568 targetMapToOrigRow(i) = B_LID;
569 targetMapToImportRow(i) = LO_INVALID;
571 LO I_LID = Irowmap_local.getLocalElement(aidx);
572 targetMapToOrigRow(i) = LO_INVALID;
573 targetMapToImportRow(i) = I_LID;
579 KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
580 mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
581 targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
582 Ac, Cimport, label, params);
586 template <
class Scalar,
590 void mult_PT_A_P_newmatrix(
591 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
592 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
593 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
594 const std::string& label,
595 const Teuchos::RCP<Teuchos::ParameterList>& params) {
596 using Teuchos::Array;
597 using Teuchos::ArrayRCP;
598 using Teuchos::ArrayView;
603 typedef LocalOrdinal LO;
604 typedef GlobalOrdinal GO;
607 typedef Import<LO, GO, NO> import_type;
608 typedef Map<LO, GO, NO> map_type;
611 typedef typename map_type::local_map_type local_map_type;
613 typedef typename KCRS::StaticCrsGraphType graph_t;
614 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
615 typedef typename NO::execution_space execution_space;
616 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
617 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
619 #ifdef HAVE_TPETRA_MMM_TIMINGS
620 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
621 using Teuchos::TimeMonitor;
622 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP M5 Cmap")))));
624 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
627 RCP<const import_type> Cimport;
628 RCP<const map_type> Ccolmap;
629 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
630 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
631 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
632 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
633 local_map_type Irowmap_local;
634 if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
635 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
636 local_map_type Icolmap_local;
637 if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
644 lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Pcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
646 if (Pview.importMatrix.is_null()) {
649 Ccolmap = Pview.colMap;
650 const LO colMapSize =
static_cast<LO
>(Pview.colMap->getLocalNumElements());
652 Kokkos::parallel_for(
653 "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
654 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
655 KOKKOS_LAMBDA(
const LO i) {
667 if (!Pimport.is_null() && !Iimport.is_null()) {
668 Cimport = Pimport->setUnion(*Iimport);
669 }
else if (!Pimport.is_null() && Iimport.is_null()) {
670 Cimport = Pimport->setUnion();
671 }
else if (Pimport.is_null() && !Iimport.is_null()) {
672 Cimport = Iimport->setUnion();
674 throw std::runtime_error(
"TpetraExt::RAP status of matrix importers is nonsensical");
676 Ccolmap = Cimport->getTargetMap();
681 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
682 std::runtime_error,
"Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
689 Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
690 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
691 Kokkos::parallel_for(
692 "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement", range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
693 Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
695 Kokkos::parallel_for(
696 "Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement", range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
697 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
705 Ac.replaceColMap(Ccolmap);
723 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
724 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"), Aview.colMap->getLocalNumElements());
726 Kokkos::parallel_for(
727 "Tpetra::mult_R_A_P_newmatrix::construct_tables", range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(
const LO i) {
728 GO aidx = Acolmap_local.getGlobalElement(i);
729 LO P_LID = Prowmap_local.getLocalElement(aidx);
730 if (P_LID != LO_INVALID) {
731 targetMapToOrigRow(i) = P_LID;
732 targetMapToImportRow(i) = LO_INVALID;
734 LO I_LID = Irowmap_local.getLocalElement(aidx);
735 targetMapToOrigRow(i) = LO_INVALID;
736 targetMapToImportRow(i) = I_LID;
742 KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
743 mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
744 targetMapToOrigRow, targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
745 Ac, Cimport, label, params);
749 template <
class Scalar,
753 void mult_PT_A_P_reuse(
754 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
755 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
756 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
757 const std::string& label,
758 const Teuchos::RCP<Teuchos::ParameterList>& params) {
759 using Teuchos::Array;
760 using Teuchos::ArrayRCP;
761 using Teuchos::ArrayView;
766 typedef LocalOrdinal LO;
767 typedef GlobalOrdinal GO;
770 typedef Import<LO, GO, NO> import_type;
771 typedef Map<LO, GO, NO> map_type;
774 typedef typename map_type::local_map_type local_map_type;
776 typedef typename KCRS::StaticCrsGraphType graph_t;
777 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
778 typedef typename NO::execution_space execution_space;
779 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
780 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
782 #ifdef HAVE_TPETRA_MMM_TIMINGS
783 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
784 using Teuchos::TimeMonitor;
785 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
787 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
790 RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
791 RCP<const map_type> Ccolmap = Ac.getColMap();
792 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
793 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
794 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
795 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
796 local_map_type Irowmap_local;
797 if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
798 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
799 local_map_type Icolmap_local;
800 if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
801 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
804 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
808 Kokkos::parallel_for(
809 range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
810 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
813 if (!Pview.importMatrix.is_null()) {
814 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
815 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
817 Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
818 Kokkos::parallel_for(
819 range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(
const LO i) {
820 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
826 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
827 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"), Aview.colMap->getLocalNumElements());
828 Kokkos::parallel_for(
829 range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(
const LO i) {
830 GO aidx = Acolmap_local.getGlobalElement(i);
831 LO B_LID = Prowmap_local.getLocalElement(aidx);
832 if (B_LID != LO_INVALID) {
833 targetMapToOrigRow(i) = B_LID;
834 targetMapToImportRow(i) = LO_INVALID;
836 LO I_LID = Irowmap_local.getLocalElement(aidx);
837 targetMapToOrigRow(i) = LO_INVALID;
838 targetMapToImportRow(i) = I_LID;
844 KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
845 mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
846 targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
847 Ac, Cimport, label, params);
853 template <
class Scalar,
857 class LocalOrdinalViewType>
858 void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
859 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
860 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
861 const LocalOrdinalViewType& Acol2Prow_dev,
862 const LocalOrdinalViewType& Acol2PIrow_dev,
863 const LocalOrdinalViewType& Pcol2Accol_dev,
864 const LocalOrdinalViewType& PIcol2Accol_dev,
865 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
866 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
867 const std::string& label,
868 const Teuchos::RCP<Teuchos::ParameterList>& params) {
869 #ifdef HAVE_TPETRA_MMM_TIMINGS
870 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
871 using Teuchos::TimeMonitor;
872 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore"))));
875 using Teuchos::Array;
876 using Teuchos::ArrayRCP;
877 using Teuchos::ArrayView;
883 typedef typename KCRS::StaticCrsGraphType graph_t;
884 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
885 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
886 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
887 typedef typename KCRS::values_type::non_const_type scalar_view_t;
890 typedef LocalOrdinal LO;
891 typedef GlobalOrdinal GO;
893 typedef Map<LO, GO, NO> map_type;
894 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
895 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
896 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
899 RCP<const map_type> Accolmap = Ac.getColMap();
900 size_t m = Rview.origMatrix->getLocalNumRows();
901 size_t n = Accolmap->getLocalNumElements();
902 size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
905 auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
907 auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
909 auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
911 auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
915 const auto Amat = Aview.origMatrix->getLocalMatrixHost();
916 const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
917 const auto Rmat = Rview.origMatrix->getLocalMatrixHost();
919 auto Arowptr = Amat.graph.row_map;
920 auto Prowptr = Pmat.graph.row_map;
921 auto Rrowptr = Rmat.graph.row_map;
922 const auto Acolind = Amat.graph.entries;
923 const auto Pcolind = Pmat.graph.entries;
924 const auto Rcolind = Rmat.graph.entries;
925 const auto Avals = Amat.values;
926 const auto Pvals = Pmat.values;
927 const auto Rvals = Rmat.values;
929 typename c_lno_view_t::host_mirror_type::const_type Irowptr;
930 typename lno_nnz_view_t::host_mirror_type Icolind;
931 typename scalar_view_t::host_mirror_type Ivals;
932 if (!Pview.importMatrix.is_null()) {
933 auto lclP = Pview.importMatrix->getLocalMatrixHost();
934 Irowptr = lclP.graph.row_map;
935 Icolind = lclP.graph.entries;
937 p_max_nnz_per_row = std::max(p_max_nnz_per_row, Pview.importMatrix->getLocalMaxNumRowEntries());
940 #ifdef HAVE_TPETRA_MMM_TIMINGS
941 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore - Compare"))));
951 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
952 typename lno_view_t::host_mirror_type Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"), m + 1);
953 typename lno_nnz_view_t::host_mirror_type Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"), CSR_alloc);
954 typename scalar_view_t::host_mirror_type Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"), CSR_alloc);
964 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
965 Array<size_t> ac_status(n, ST_INVALID);
975 size_t nnz = 0, nnz_old = 0;
976 for (
size_t i = 0; i < m; i++) {
982 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
984 const SC Rik = Rvals[kk];
988 for (
size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
990 const SC Akl = Avals[ll];
994 if (Acol2Prow[l] != LO_INVALID) {
1001 size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1004 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
1006 LO Acj = Pcol2Accol[j];
1009 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1010 #ifdef HAVE_TPETRA_DEBUG
1012 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1014 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1017 ac_status[Acj] = nnz;
1019 Cvals[nnz] = Rik * Akl * Plj;
1022 Cvals[ac_status[Acj]] += Rik * Akl * Plj;
1032 size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1033 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
1035 LO Acj = PIcol2Accol[j];
1038 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1039 #ifdef HAVE_TPETRA_DEBUG
1041 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1043 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1046 ac_status[Acj] = nnz;
1048 Cvals[nnz] = Rik * Akl * Plj;
1051 Cvals[ac_status[Acj]] += Rik * Akl * Plj;
1058 if (nnz + n > CSR_alloc) {
1060 Kokkos::resize(Ccolind, CSR_alloc);
1061 Kokkos::resize(Cvals, CSR_alloc);
1069 Kokkos::resize(Ccolind, nnz);
1070 Kokkos::resize(Cvals, nnz);
1072 #ifdef HAVE_TPETRA_MMM_TIMINGS
1074 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
1076 auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
1077 typename KCRS::device_type(), Crowptr);
1078 auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
1079 typename KCRS::device_type(), Ccolind);
1080 auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
1081 typename KCRS::device_type(), Cvals);
1084 if (params.is_null() || params->get(
"sort entries",
true))
1085 Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
1086 Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);
1088 #ifdef HAVE_TPETRA_MMM_TIMINGS
1090 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
1101 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1102 labelList->set(
"Timer Label", label);
1103 if (!params.is_null()) labelList->set(
"compute global constants", params->get(
"compute global constants",
true));
1104 RCP<const Export<LO, GO, NO> > dummyExport;
1105 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1106 Rview.origMatrix->getRangeMap(),
1115 template <
class Scalar,
1117 class GlobalOrdinal,
1119 class LocalOrdinalViewType>
1120 void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
1121 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1122 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1123 const LocalOrdinalViewType& Acol2Prow_dev,
1124 const LocalOrdinalViewType& Acol2PIrow_dev,
1125 const LocalOrdinalViewType& Pcol2Accol_dev,
1126 const LocalOrdinalViewType& PIcol2Accol_dev,
1127 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1128 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
1129 const std::string& label,
1130 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1131 #ifdef HAVE_TPETRA_MMM_TIMINGS
1132 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1133 using Teuchos::TimeMonitor;
1134 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse SerialCore"))));
1137 using Teuchos::Array;
1138 using Teuchos::ArrayRCP;
1139 using Teuchos::ArrayView;
1144 typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
1145 typedef typename KCRS::StaticCrsGraphType graph_t;
1146 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1147 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1148 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1151 typedef LocalOrdinal LO;
1152 typedef GlobalOrdinal GO;
1154 typedef Map<LO, GO, NO> map_type;
1155 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1156 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1157 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1160 RCP<const map_type> Accolmap = Ac.getColMap();
1161 size_t m = Rview.origMatrix->getLocalNumRows();
1162 size_t n = Accolmap->getLocalNumElements();
1163 size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
1166 auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1168 auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1170 auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1172 auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1176 const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
1177 const KCRS& Pmat = Pview.origMatrix->getLocalMatrixHost();
1178 const KCRS& Rmat = Rview.origMatrix->getLocalMatrixHost();
1179 const KCRS& Cmat = Ac.getLocalMatrixHost();
1181 c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
1182 const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
1183 const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
1184 scalar_view_t Cvals = Cmat.values;
1186 c_lno_view_t Irowptr;
1187 lno_nnz_view_t Icolind;
1188 scalar_view_t Ivals;
1189 if (!Pview.importMatrix.is_null()) {
1190 auto lclP = Pview.importMatrix->getLocalMatrixHost();
1191 Irowptr = lclP.graph.row_map;
1192 Icolind = lclP.graph.entries;
1193 Ivals = lclP.values;
1194 p_max_nnz_per_row = std::max(p_max_nnz_per_row, Pview.importMatrix->getLocalMaxNumRowEntries());
1197 #ifdef HAVE_TPETRA_MMM_TIMINGS
1198 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse SerialCore - Compare"))));
1209 Array<size_t> ac_status(n, ST_INVALID);
1223 size_t OLD_ip = 0, CSR_ip = 0;
1224 for (
size_t i = 0; i < m; i++) {
1227 OLD_ip = Crowptr[i];
1228 CSR_ip = Crowptr[i + 1];
1229 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
1230 ac_status[Ccolind[k]] = k;
1237 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
1239 const SC Rik = Rvals[kk];
1243 for (
size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
1245 const SC Akl = Avals[ll];
1249 if (Acol2Prow[l] != LO_INVALID) {
1256 size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1259 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
1261 LO Cij = Pcol2Accol[j];
1264 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1265 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
1266 <<
"(c_status = " << ac_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1268 Cvals[ac_status[Cij]] += Rik * Akl * Plj;
1277 size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1278 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
1280 LO Cij = PIcol2Accol[j];
1283 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1284 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph "
1285 <<
"(c_status = " << ac_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1287 Cvals[ac_status[Cij]] += Rik * Akl * Plj;
1294 #ifdef HAVE_TPETRA_MMM_TIMINGS
1295 auto MM3 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse ESFC"))));
1298 Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());
1304 template <
class Scalar,
1306 class GlobalOrdinal,
1308 class LocalOrdinalViewType>
1309 void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1310 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1311 const LocalOrdinalViewType& Acol2Prow,
1312 const LocalOrdinalViewType& Acol2PIrow,
1313 const LocalOrdinalViewType& Pcol2Accol,
1314 const LocalOrdinalViewType& PIcol2Accol,
1315 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1316 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
1317 const std::string& label,
1318 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1319 #ifdef HAVE_TPETRA_MMM_TIMINGS
1320 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1321 using Teuchos::TimeMonitor;
1322 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose")));
1326 typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
1327 transposer_type transposer(Pview.origMatrix, label + std::string(
"XP: "));
1329 using Teuchos::ParameterList;
1331 RCP<ParameterList> transposeParams(
new ParameterList);
1332 transposeParams->set(
"sort",
false);
1334 if (!params.is_null()) {
1335 transposeParams->set(
"compute global constants",
1336 params->get(
"compute global constants: temporaries",
1339 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1340 transposer.createTransposeLocal(transposeParams);
1341 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1342 Rview.origMatrix = Ptrans;
1344 mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview, Acol2Prow, Acol2PIrow, Pcol2Accol, PIcol2Accol, Ac, Acimport, label, params);
1350 template <
class Scalar,
1352 class GlobalOrdinal,
1354 class LocalOrdinalViewType>
1355 void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1356 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1357 const LocalOrdinalViewType& Acol2Prow,
1358 const LocalOrdinalViewType& Acol2PIrow,
1359 const LocalOrdinalViewType& Pcol2Accol,
1360 const LocalOrdinalViewType& PIcol2Accol,
1361 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1362 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
1363 const std::string& label,
1364 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1365 #ifdef HAVE_TPETRA_MMM_TIMINGS
1366 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1367 using Teuchos::TimeMonitor;
1368 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose")));
1372 typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
1373 transposer_type transposer(Pview.origMatrix, label + std::string(
"XP: "));
1375 using Teuchos::ParameterList;
1377 RCP<ParameterList> transposeParams(
new ParameterList);
1378 transposeParams->set(
"sort",
false);
1380 if (!params.is_null()) {
1381 transposeParams->set(
"compute global constants",
1382 params->get(
"compute global constants: temporaries",
1385 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1386 transposer.createTransposeLocal(transposeParams);
1387 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1388 Rview.origMatrix = Ptrans;
1390 mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview, Acol2Prow, Acol2PIrow, Pcol2Accol, PIcol2Accol, Ac, Acimport, label, params);
1398 template <
class Scalar,
1400 class GlobalOrdinal,
1402 void KernelWrappers3MMM<Scalar, LocalOrdinal, GlobalOrdinal, Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1403 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1404 const Teuchos::Array<LocalOrdinal>& Acol2PRow,
1405 const Teuchos::Array<LocalOrdinal>& Acol2PRowImport,
1406 const Teuchos::Array<LocalOrdinal>& Pcol2Accol,
1407 const Teuchos::Array<LocalOrdinal>& PIcol2Accol,
1408 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1409 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
1410 const std::string& label,
1411 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1412 #ifdef HAVE_TPETRA_MMM_TIMINGS
1413 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1414 using Teuchos::TimeMonitor;
1415 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix SerialCore"))));
1418 using Teuchos::Array;
1419 using Teuchos::ArrayRCP;
1420 using Teuchos::ArrayView;
1425 typedef LocalOrdinal LO;
1426 typedef GlobalOrdinal GO;
1428 typedef RowMatrixTransposer<SC, LO, GO, NO> transposer_type;
1429 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1430 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1435 size_t n = Ac.getRowMap()->getLocalNumElements();
1436 LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1439 ArrayRCP<size_t> Acrowptr_RCP;
1440 ArrayRCP<LO> Accolind_RCP;
1441 ArrayRCP<SC> Acvals_RCP;
1448 auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1449 auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1450 auto Avals = Aview.origMatrix->getLocalValuesHost(
1451 Tpetra::Access::ReadOnly);
1452 auto Prowptr = Pview.origMatrix->getLocalRowPtrsHost();
1453 auto Pcolind = Pview.origMatrix->getLocalIndicesHost();
1454 auto Pvals = Pview.origMatrix->getLocalValuesHost(
1455 Tpetra::Access::ReadOnly);
1456 decltype(Prowptr) Irowptr;
1457 decltype(Pcolind) Icolind;
1458 decltype(Pvals) Ivals;
1460 if (!Pview.importMatrix.is_null()) {
1461 Irowptr = Pview.importMatrix->getLocalRowPtrsHost();
1462 Icolind = Pview.importMatrix->getLocalIndicesHost();
1463 Ivals = Pview.importMatrix->getLocalValuesHost(
1464 Tpetra::Access::ReadOnly);
1472 ArrayView<size_t> Acrowptr;
1473 ArrayView<LO> Accolind;
1474 ArrayView<SC> Acvals;
1485 #ifdef HAVE_TPETRA_MMM_TIMINGS
1486 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose"))));
1493 transposer_type transposer(Pview.origMatrix, label + std::string(
"XP: "));
1495 using Teuchos::ParameterList;
1496 RCP<ParameterList> transposeParams(
new ParameterList);
1497 transposeParams->set(
"sort",
false);
1498 if (!params.is_null()) {
1499 transposeParams->set(
"compute global constants",
1500 params->get(
"compute global constants: temporaries",
1503 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1504 transposer.createTransposeLocal(transposeParams);
1506 auto Rrowptr = Ptrans->getLocalRowPtrsHost();
1507 auto Rcolind = Ptrans->getLocalIndicesHost();
1508 auto Rvals = Ptrans->getLocalValuesHost(Tpetra::Access::ReadOnly);
1513 #ifdef HAVE_TPETRA_MMM_TIMINGS
1514 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP graph"))));
1517 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1518 Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1520 size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1521 size_t nnzPerRowA = 100;
1522 if (Aview.origMatrix->getLocalNumEntries() > 0)
1523 nnzPerRowA = Aview.origMatrix->getLocalNumEntries() / Aview.origMatrix->getLocalNumRows();
1524 Acrowptr_RCP.resize(n + 1);
1525 Acrowptr = Acrowptr_RCP();
1526 Accolind_RCP.resize(nnz_alloc);
1527 Accolind = Accolind_RCP();
1529 size_t nnz = 0, nnz_old = 0;
1530 for (
size_t i = 0; i < n; i++) {
1536 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
1539 for (
size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
1542 if (Acol2PRow[l] != LO_INVALID) {
1549 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1552 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
1554 LO Acj = Pcol2Accol[j];
1556 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1558 ac_status[Acj] = nnz;
1559 Accolind[nnz] = Acj;
1570 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1571 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
1573 LO Acj = PIcol2Accol[j];
1575 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1577 ac_status[Acj] = nnz;
1578 Accolind[nnz] = Acj;
1588 if (nnz + std::max(5 * nnzPerRowA, n) > nnz_alloc) {
1590 nnz_alloc = std::max(nnz_alloc, nnz + std::max(5 * nnzPerRowA, n));
1591 Accolind_RCP.resize(nnz_alloc);
1592 Accolind = Accolind_RCP();
1593 Acvals_RCP.resize(nnz_alloc);
1594 Acvals = Acvals_RCP();
1601 Accolind_RCP.resize(nnz);
1602 Accolind = Accolind_RCP();
1605 Acvals_RCP.resize(nnz, SC_ZERO);
1606 Acvals = Acvals_RCP();
1612 #ifdef HAVE_TPETRA_MMM_TIMINGS
1613 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Fill Matrix"))));
1616 for (
size_t k = 0; k < n; k++) {
1617 for (
size_t ii = Prowptr[k]; ii < Prowptr[k + 1]; ii++) {
1619 const SC Pki = Pvals[ii];
1620 for (
size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
1622 const SC Akl = Avals[ll];
1625 if (Acol2PRow[l] != LO_INVALID) {
1632 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1633 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
1635 LO Acj = Pcol2Accol[j];
1637 for (pp = Acrowptr[i]; pp < Acrowptr[i + 1]; pp++)
1638 if (Accolind[pp] == Acj)
1642 Acvals[pp] += Pki * Akl * Pvals[jj];
1651 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1652 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
1654 LO Acj = PIcol2Accol[j];
1656 for (pp = Acrowptr[i]; pp < Acrowptr[i + 1]; pp++)
1657 if (Accolind[pp] == Acj)
1661 Acvals[pp] += Pki * Akl * Ivals[jj];
1668 #ifdef HAVE_TPETRA_MMM_TIMINGS
1669 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP sort"))));
1676 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1679 Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1681 #ifdef HAVE_TPETRA_MMM_TIMINGS
1682 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix ESFC"))));
1693 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1694 labelList->set(
"Timer Label", label);
1696 if (!params.is_null()) labelList->set(
"compute global constants", params->get(
"compute global constants",
true));
1697 RCP<const Export<LO, GO, NO> > dummyExport;
1698 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1699 Pview.origMatrix->getDomainMap(),
1701 dummyExport, labelList);
1713 #define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR, LO, GO, NODE) \
1715 template void TripleMatrixMultiply::MultiplyRAP( \
1716 const CrsMatrix<SCALAR, LO, GO, NODE>& R, \
1718 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
1720 const CrsMatrix<SCALAR, LO, GO, NODE>& P, \
1722 CrsMatrix<SCALAR, LO, GO, NODE>& Ac, \
1723 bool call_FillComplete_on_result, \
1724 const std::string& label, \
1725 const Teuchos::RCP<Teuchos::ParameterList>& params);
1727 #endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isFillActive() const
Whether the matrix is not fill complete.
size_t global_size_t
Global size_t object.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
Utility functions for packing and unpacking sparse matrix entries.
void MultiplyRAP(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &R, bool transposeR, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &P, bool transposeP, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Ac, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Sparse matrix-matrix multiply.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.