41 #ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
42 #define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
45 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
46 #include "Teuchos_VerboseObject.hpp"
47 #include "Teuchos_Array.hpp"
49 #include "Tpetra_ConfigDefs.hpp"
50 #include "Tpetra_CrsMatrix.hpp"
52 #include "Tpetra_RowMatrixTransposer.hpp"
53 #include "Tpetra_ConfigDefs.hpp"
54 #include "Tpetra_Map.hpp"
55 #include "Tpetra_Export.hpp"
60 #include "Teuchos_FancyOStream.hpp"
74 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
75 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
76 #include "TpetraExt_MatrixMatrix_HIP.hpp"
77 #include "TpetraExt_MatrixMatrix_SYCL.hpp"
81 namespace TripleMatrixMultiply{
89 template <
class Scalar,
101 bool call_FillComplete_on_result,
102 const std::string& label,
103 const Teuchos::RCP<Teuchos::ParameterList>& params)
108 typedef LocalOrdinal LO;
109 typedef GlobalOrdinal GO;
118 #ifdef HAVE_TPETRA_MMM_TIMINGS
119 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
120 using Teuchos::TimeMonitor;
121 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All Setup"))));
124 const std::string prefix =
"TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
129 TEUCHOS_TEST_FOR_EXCEPTION(!R.
isFillComplete(), std::runtime_error, prefix <<
"Matrix R is not fill complete.");
130 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
131 TEUCHOS_TEST_FOR_EXCEPTION(!P.
isFillComplete(), std::runtime_error, prefix <<
"Matrix P is not fill complete.");
136 RCP<const crs_matrix_type> Rprime = null;
140 RCP<const crs_matrix_type> Aprime = null;
144 RCP<const crs_matrix_type> Pprime = null;
154 const bool newFlag = !Ac.
getGraph()->isLocallyIndexed() && !Ac.
getGraph()->isGloballyIndexed();
156 using Teuchos::ParameterList;
157 RCP<ParameterList> transposeParams (
new ParameterList);
158 transposeParams->set (
"sort",
false);
160 if (transposeR && &R != &P) {
161 transposer_type transposer(rcpFromRef (R));
162 Rprime = transposer.createTranspose (transposeParams);
164 Rprime = rcpFromRef(R);
168 transposer_type transposer(rcpFromRef (A));
169 Aprime = transposer.createTranspose (transposeParams);
171 Aprime = rcpFromRef(A);
175 transposer_type transposer(rcpFromRef (P));
176 Pprime = transposer.createTranspose (transposeParams);
178 Pprime = rcpFromRef(P);
191 TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
192 prefix <<
"ERROR, inner dimensions of op(R) and op(A) "
193 "must match for matrix-matrix product. op(R) is "
194 << Rleft <<
"x" << Rright <<
", op(A) is "<< Aleft <<
"x" << Aright);
196 TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
197 prefix <<
"ERROR, inner dimensions of op(A) and op(P) "
198 "must match for matrix-matrix product. op(A) is "
199 << Aleft <<
"x" << Aright <<
", op(P) is "<< Pleft <<
"x" << Pright);
205 TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.
getGlobalNumRows(), std::runtime_error,
206 prefix <<
"ERROR, dimensions of result Ac must "
207 "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.
getGlobalNumRows()
208 <<
" rows, should have at least " << Rleft << std::endl);
220 int numProcs = P.
getComm()->getSize();
224 crs_matrix_struct_type Rview;
225 crs_matrix_struct_type Aview;
226 crs_matrix_struct_type Pview;
228 RCP<const map_type> targetMap_R = Rprime->getRowMap();
229 RCP<const map_type> targetMap_A = Aprime->getRowMap();
230 RCP<const map_type> targetMap_P = Pprime->getRowMap();
232 #ifdef HAVE_TPETRA_MMM_TIMINGS
233 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All I&X"))));
239 RCP<const import_type> dummyImporter;
241 if (!(transposeR && &R == &P))
242 MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter,
true, label, params);
244 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label, params);
249 targetMap_P = Aprime->getColMap();
252 MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(),
false, label, params);
255 RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
257 bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
258 if (needs_final_export)
261 Actemp = rcp(&Ac,
false);
263 #ifdef HAVE_TPETRA_MMM_TIMINGS
264 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All Multiply"))));
268 if (call_FillComplete_on_result && newFlag) {
269 if (transposeR && &R == &P)
270 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
272 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
273 }
else if (call_FillComplete_on_result) {
274 if (transposeR && &R == &P)
275 MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
277 MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
300 if (transposeR && &R == &P)
301 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
303 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
306 if (needs_final_export) {
307 #ifdef HAVE_TPETRA_MMM_TIMINGS
308 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP exportAndFillComplete"))));
310 Teuchos::ParameterList labelList;
311 labelList.set(
"Timer Label", label);
312 Teuchos::ParameterList& labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
314 RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
316 bool overrideAllreduce =
false;
318 if(!params.is_null()) {
319 Teuchos::ParameterList& params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
321 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
322 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
323 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
324 isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
325 overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
327 labelList.set(
"compute global constants",params->get(
"compute global constants",
true));
329 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,
"Core Count above which the optimized neighbor discovery is used");
331 labelList_subList.set(
"isMatrixMatrix_TransferAndFillComplete",isMM,
332 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
333 labelList_subList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
335 export_type exporter = export_type(*Pprime->getGraph()->getImporter());
336 Actemp->exportAndFillComplete(Acprime,
338 Acprime->getDomainMap(),
339 Acprime->getRangeMap(),
340 rcp(&labelList,
false));
343 #ifdef HAVE_TPETRA_MMM_STATISTICS
344 printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label+std::string(
" RAP MMM"));
355 template<
class Scalar,
359 void mult_R_A_P_newmatrix(
360 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
361 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
362 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
363 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
364 const std::string& label,
365 const Teuchos::RCP<Teuchos::ParameterList>& params)
367 using Teuchos::Array;
368 using Teuchos::ArrayRCP;
369 using Teuchos::ArrayView;
374 typedef LocalOrdinal LO;
375 typedef GlobalOrdinal GO;
378 typedef Import<LO,GO,NO> import_type;
379 typedef Map<LO,GO,NO> map_type;
382 typedef typename map_type::local_map_type local_map_type;
384 typedef typename KCRS::StaticCrsGraphType graph_t;
385 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
386 typedef typename NO::execution_space execution_space;
387 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
388 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
390 #ifdef HAVE_TPETRA_MMM_TIMINGS
391 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
392 using Teuchos::TimeMonitor;
393 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
395 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
398 RCP<const import_type> Cimport;
399 RCP<const map_type> Ccolmap;
400 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
401 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
402 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
403 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
404 local_map_type Irowmap_local;
if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
405 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
406 local_map_type Icolmap_local;
if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
414 lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Pcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
416 if (Pview.importMatrix.is_null()) {
419 Ccolmap = Pview.colMap;
420 const LO colMapSize =
static_cast<LO
>(Pview.colMap->getLocalNumElements());
422 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
423 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
424 KOKKOS_LAMBDA(
const LO i) {
437 if (!Pimport.is_null() && !Iimport.is_null()) {
438 Cimport = Pimport->setUnion(*Iimport);
440 else if (!Pimport.is_null() && Iimport.is_null()) {
441 Cimport = Pimport->setUnion();
443 else if (Pimport.is_null() && !Iimport.is_null()) {
444 Cimport = Iimport->setUnion();
447 throw std::runtime_error(
"TpetraExt::RAP status of matrix importers is nonsensical");
449 Ccolmap = Cimport->getTargetMap();
454 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
455 std::runtime_error,
"Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
462 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
463 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
464 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
465 Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
467 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
468 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
477 Ac.replaceColMap(Ccolmap);
495 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
496 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
497 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
498 GO aidx = Acolmap_local.getGlobalElement(i);
499 LO P_LID = Prowmap_local.getLocalElement(aidx);
500 if (P_LID != LO_INVALID) {
501 targetMapToOrigRow(i) = P_LID;
502 targetMapToImportRow(i) = LO_INVALID;
504 LO I_LID = Irowmap_local.getLocalElement(aidx);
505 targetMapToOrigRow(i) = LO_INVALID;
506 targetMapToImportRow(i) = I_LID;
512 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
513 mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
514 targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
515 Ac, Cimport, label, params);
520 template<
class Scalar,
524 void mult_R_A_P_reuse(
525 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
526 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
527 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
528 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
529 const std::string& label,
530 const Teuchos::RCP<Teuchos::ParameterList>& params)
532 using Teuchos::Array;
533 using Teuchos::ArrayRCP;
534 using Teuchos::ArrayView;
539 typedef LocalOrdinal LO;
540 typedef GlobalOrdinal GO;
543 typedef Import<LO,GO,NO> import_type;
544 typedef Map<LO,GO,NO> map_type;
547 typedef typename map_type::local_map_type local_map_type;
549 typedef typename KCRS::StaticCrsGraphType graph_t;
550 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
551 typedef typename NO::execution_space execution_space;
552 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
553 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
555 #ifdef HAVE_TPETRA_MMM_TIMINGS
556 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
557 using Teuchos::TimeMonitor;
558 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
560 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
563 RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
564 RCP<const map_type> Ccolmap = Ac.getColMap();
565 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
566 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
567 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
568 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
569 local_map_type Irowmap_local;
if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
570 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
571 local_map_type Icolmap_local;
if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
572 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
575 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
579 Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
580 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
583 if (!Pview.importMatrix.is_null()) {
584 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
585 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
587 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
588 Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
589 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
595 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
596 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
597 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
598 GO aidx = Acolmap_local.getGlobalElement(i);
599 LO B_LID = Prowmap_local.getLocalElement(aidx);
600 if (B_LID != LO_INVALID) {
601 targetMapToOrigRow(i) = B_LID;
602 targetMapToImportRow(i) = LO_INVALID;
604 LO I_LID = Irowmap_local.getLocalElement(aidx);
605 targetMapToOrigRow(i) = LO_INVALID;
606 targetMapToImportRow(i) = I_LID;
613 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
614 mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
615 targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
616 Ac, Cimport, label, params);
621 template<
class Scalar,
625 void mult_PT_A_P_newmatrix(
626 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
627 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
628 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
629 const std::string& label,
630 const Teuchos::RCP<Teuchos::ParameterList>& params)
632 using Teuchos::Array;
633 using Teuchos::ArrayRCP;
634 using Teuchos::ArrayView;
639 typedef LocalOrdinal LO;
640 typedef GlobalOrdinal GO;
643 typedef Import<LO,GO,NO> import_type;
644 typedef Map<LO,GO,NO> map_type;
647 typedef typename map_type::local_map_type local_map_type;
649 typedef typename KCRS::StaticCrsGraphType graph_t;
650 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
651 typedef typename NO::execution_space execution_space;
652 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
653 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
655 #ifdef HAVE_TPETRA_MMM_TIMINGS
656 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
657 using Teuchos::TimeMonitor;
658 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP M5 Cmap")))));
660 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
663 RCP<const import_type> Cimport;
664 RCP<const map_type> Ccolmap;
665 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
666 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
667 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
668 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
669 local_map_type Irowmap_local;
if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
670 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
671 local_map_type Icolmap_local;
if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
679 lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Pcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
681 if (Pview.importMatrix.is_null()) {
684 Ccolmap = Pview.colMap;
685 const LO colMapSize =
static_cast<LO
>(Pview.colMap->getLocalNumElements());
687 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
688 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
689 KOKKOS_LAMBDA(
const LO i) {
702 if (!Pimport.is_null() && !Iimport.is_null()) {
703 Cimport = Pimport->setUnion(*Iimport);
705 else if (!Pimport.is_null() && Iimport.is_null()) {
706 Cimport = Pimport->setUnion();
708 else if (Pimport.is_null() && !Iimport.is_null()) {
709 Cimport = Iimport->setUnion();
712 throw std::runtime_error(
"TpetraExt::RAP status of matrix importers is nonsensical");
714 Ccolmap = Cimport->getTargetMap();
719 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
720 std::runtime_error,
"Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
727 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
728 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
729 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
730 Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
732 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
733 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
742 Ac.replaceColMap(Ccolmap);
760 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
761 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
763 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
764 GO aidx = Acolmap_local.getGlobalElement(i);
765 LO P_LID = Prowmap_local.getLocalElement(aidx);
766 if (P_LID != LO_INVALID) {
767 targetMapToOrigRow(i) = P_LID;
768 targetMapToImportRow(i) = LO_INVALID;
770 LO I_LID = Irowmap_local.getLocalElement(aidx);
771 targetMapToOrigRow(i) = LO_INVALID;
772 targetMapToImportRow(i) = I_LID;
778 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
779 mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
780 targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
781 Ac, Cimport, label, params);
785 template<
class Scalar,
789 void mult_PT_A_P_reuse(
790 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
791 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
792 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
793 const std::string& label,
794 const Teuchos::RCP<Teuchos::ParameterList>& params)
796 using Teuchos::Array;
797 using Teuchos::ArrayRCP;
798 using Teuchos::ArrayView;
803 typedef LocalOrdinal LO;
804 typedef GlobalOrdinal GO;
807 typedef Import<LO,GO,NO> import_type;
808 typedef Map<LO,GO,NO> map_type;
811 typedef typename map_type::local_map_type local_map_type;
813 typedef typename KCRS::StaticCrsGraphType graph_t;
814 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
815 typedef typename NO::execution_space execution_space;
816 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
817 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
819 #ifdef HAVE_TPETRA_MMM_TIMINGS
820 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
821 using Teuchos::TimeMonitor;
822 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
824 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
827 RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
828 RCP<const map_type> Ccolmap = Ac.getColMap();
829 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
830 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
831 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
832 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
833 local_map_type Irowmap_local;
if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
834 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
835 local_map_type Icolmap_local;
if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
836 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
839 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
843 Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
844 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
847 if (!Pview.importMatrix.is_null()) {
848 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
849 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
851 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
852 Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
853 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
859 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
860 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
861 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
862 GO aidx = Acolmap_local.getGlobalElement(i);
863 LO B_LID = Prowmap_local.getLocalElement(aidx);
864 if (B_LID != LO_INVALID) {
865 targetMapToOrigRow(i) = B_LID;
866 targetMapToImportRow(i) = LO_INVALID;
868 LO I_LID = Irowmap_local.getLocalElement(aidx);
869 targetMapToOrigRow(i) = LO_INVALID;
870 targetMapToImportRow(i) = I_LID;
877 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
878 mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
879 targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
880 Ac, Cimport, label, params);
887 template<
class Scalar,
891 class LocalOrdinalViewType>
892 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
893 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
894 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
895 const LocalOrdinalViewType & Acol2Prow_dev,
896 const LocalOrdinalViewType & Acol2PIrow_dev,
897 const LocalOrdinalViewType & Pcol2Accol_dev,
898 const LocalOrdinalViewType & PIcol2Accol_dev,
899 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
900 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
901 const std::string& label,
902 const Teuchos::RCP<Teuchos::ParameterList>& params) {
903 #ifdef HAVE_TPETRA_MMM_TIMINGS
904 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
905 using Teuchos::TimeMonitor;
906 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore"))));
909 using Teuchos::Array;
910 using Teuchos::ArrayRCP;
911 using Teuchos::ArrayView;
917 typedef typename KCRS::StaticCrsGraphType graph_t;
918 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
919 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
920 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
921 typedef typename KCRS::values_type::non_const_type scalar_view_t;
924 typedef LocalOrdinal LO;
925 typedef GlobalOrdinal GO;
927 typedef Map<LO,GO,NO> map_type;
928 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
929 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
930 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
933 RCP<const map_type> Accolmap = Ac.getColMap();
934 size_t m = Rview.origMatrix->getLocalNumRows();
935 size_t n = Accolmap->getLocalNumElements();
936 size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
939 auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
941 auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
943 auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
945 auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
949 const auto Amat = Aview.origMatrix->getLocalMatrixHost();
950 const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
951 const auto Rmat = Rview.origMatrix->getLocalMatrixHost();
953 auto Arowptr = Amat.graph.row_map;
954 auto Prowptr = Pmat.graph.row_map;
955 auto Rrowptr = Rmat.graph.row_map;
956 const auto Acolind = Amat.graph.entries;
957 const auto Pcolind = Pmat.graph.entries;
958 const auto Rcolind = Rmat.graph.entries;
959 const auto Avals = Amat.values;
960 const auto Pvals = Pmat.values;
961 const auto Rvals = Rmat.values;
963 typename c_lno_view_t::HostMirror::const_type Irowptr;
964 typename lno_nnz_view_t::HostMirror Icolind;
965 typename scalar_view_t::HostMirror Ivals;
966 if(!Pview.importMatrix.is_null()) {
967 auto lclP = Pview.importMatrix->getLocalMatrixHost();
968 Irowptr = lclP.graph.row_map;
969 Icolind = lclP.graph.entries;
971 p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getLocalMaxNumRowEntries());
974 #ifdef HAVE_TPETRA_MMM_TIMINGS
975 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore - Compare"))));
985 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
986 typename lno_view_t::HostMirror Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
987 typename lno_nnz_view_t::HostMirror Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
988 typename scalar_view_t::HostMirror Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
998 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
999 Array<size_t> ac_status(n, ST_INVALID);
1009 size_t nnz = 0, nnz_old = 0;
1010 for (
size_t i = 0; i < m; i++) {
1016 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1018 const SC Rik = Rvals[kk];
1022 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1024 const SC Akl = Avals[ll];
1029 if (Acol2Prow[l] != LO_INVALID) {
1036 size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1039 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1041 LO Acj = Pcol2Accol[j];
1044 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1045 #ifdef HAVE_TPETRA_DEBUG
1047 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1049 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1052 ac_status[Acj] = nnz;
1054 Cvals[nnz] = Rik*Akl*Plj;
1057 Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1067 size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1068 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1070 LO Acj = PIcol2Accol[j];
1073 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1074 #ifdef HAVE_TPETRA_DEBUG
1076 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1078 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1081 ac_status[Acj] = nnz;
1083 Cvals[nnz] = Rik*Akl*Plj;
1086 Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1093 if (nnz + n > CSR_alloc) {
1095 Kokkos::resize(Ccolind,CSR_alloc);
1096 Kokkos::resize(Cvals,CSR_alloc);
1104 Kokkos::resize(Ccolind,nnz);
1105 Kokkos::resize(Cvals,nnz);
1107 #ifdef HAVE_TPETRA_MMM_TIMINGS
1108 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
1110 auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
1111 typename KCRS::device_type(), Crowptr);
1112 auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
1113 typename KCRS::device_type(), Ccolind);
1114 auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
1115 typename KCRS::device_type(), Cvals);
1118 if (params.is_null() || params->get(
"sort entries",
true))
1119 Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
1120 Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);
1122 #ifdef HAVE_TPETRA_MMM_TIMINGS
1123 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
1134 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1135 labelList->set(
"Timer Label",label);
1136 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1137 RCP<const Export<LO,GO,NO> > dummyExport;
1138 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1139 Rview.origMatrix->getRangeMap(),
1149 template<
class Scalar,
1151 class GlobalOrdinal,
1153 class LocalOrdinalViewType>
1154 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
1155 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1156 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1157 const LocalOrdinalViewType & Acol2Prow_dev,
1158 const LocalOrdinalViewType & Acol2PIrow_dev,
1159 const LocalOrdinalViewType & Pcol2Accol_dev,
1160 const LocalOrdinalViewType & PIcol2Accol_dev,
1161 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1162 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1163 const std::string& label,
1164 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1165 #ifdef HAVE_TPETRA_MMM_TIMINGS
1166 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1167 using Teuchos::TimeMonitor;
1168 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse SerialCore"))));
1171 using Teuchos::Array;
1172 using Teuchos::ArrayRCP;
1173 using Teuchos::ArrayView;
1178 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
1179 typedef typename KCRS::StaticCrsGraphType graph_t;
1180 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1181 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1182 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1185 typedef LocalOrdinal LO;
1186 typedef GlobalOrdinal GO;
1188 typedef Map<LO,GO,NO> map_type;
1189 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1190 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1191 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1194 RCP<const map_type> Accolmap = Ac.getColMap();
1195 size_t m = Rview.origMatrix->getLocalNumRows();
1196 size_t n = Accolmap->getLocalNumElements();
1197 size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
1200 auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1202 auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1204 auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1206 auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1210 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
1211 const KCRS & Pmat = Pview.origMatrix->getLocalMatrixHost();
1212 const KCRS & Rmat = Rview.origMatrix->getLocalMatrixHost();
1213 const KCRS & Cmat = Ac.getLocalMatrixHost();
1215 c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
1216 const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
1217 const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
1218 scalar_view_t Cvals = Cmat.values;
1220 c_lno_view_t Irowptr;
1221 lno_nnz_view_t Icolind;
1222 scalar_view_t Ivals;
1223 if(!Pview.importMatrix.is_null()) {
1224 auto lclP = Pview.importMatrix->getLocalMatrixHost();
1225 Irowptr = lclP.graph.row_map;
1226 Icolind = lclP.graph.entries;
1227 Ivals = lclP.values;
1228 p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getLocalMaxNumRowEntries());
1231 #ifdef HAVE_TPETRA_MMM_TIMINGS
1232 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse SerialCore - Compare"))));
1243 Array<size_t> ac_status(n, ST_INVALID);
1257 size_t OLD_ip = 0, CSR_ip = 0;
1258 for (
size_t i = 0; i < m; i++) {
1261 OLD_ip = Crowptr[i];
1262 CSR_ip = Crowptr[i+1];
1263 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
1264 ac_status[Ccolind[k]] = k;
1271 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1273 const SC Rik = Rvals[kk];
1277 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1279 const SC Akl = Avals[ll];
1284 if (Acol2Prow[l] != LO_INVALID) {
1291 size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1294 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1296 LO Cij = Pcol2Accol[j];
1299 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1300 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1301 "(c_status = " << ac_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1303 Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1312 size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1313 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1315 LO Cij = PIcol2Accol[j];
1318 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1319 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1320 "(c_status = " << ac_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1322 Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1329 #ifdef HAVE_TPETRA_MMM_TIMINGS
1330 auto MM3 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse ESFC"))));
1333 Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());
1341 template<
class Scalar,
1343 class GlobalOrdinal,
1345 class LocalOrdinalViewType>
1346 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1347 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1348 const LocalOrdinalViewType & Acol2Prow,
1349 const LocalOrdinalViewType & Acol2PIrow,
1350 const LocalOrdinalViewType & Pcol2Accol,
1351 const LocalOrdinalViewType & PIcol2Accol,
1352 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1353 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1354 const std::string& label,
1355 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1356 #ifdef HAVE_TPETRA_MMM_TIMINGS
1357 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1358 using Teuchos::TimeMonitor;
1359 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose")));
1363 typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1364 transposer_type transposer (Pview.origMatrix,label+std::string(
"XP: "));
1366 using Teuchos::ParameterList;
1368 RCP<ParameterList> transposeParams (
new ParameterList);
1369 transposeParams->set (
"sort",
false);
1371 if (! params.is_null ()) {
1372 transposeParams->set (
"compute global constants",
1373 params->get (
"compute global constants: temporaries",
1376 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1377 transposer.createTransposeLocal (transposeParams);
1378 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1379 Rview.origMatrix = Ptrans;
1381 mult_R_A_P_newmatrix_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1387 template<
class Scalar,
1389 class GlobalOrdinal,
1391 class LocalOrdinalViewType>
1392 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1393 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1394 const LocalOrdinalViewType & Acol2Prow,
1395 const LocalOrdinalViewType & Acol2PIrow,
1396 const LocalOrdinalViewType & Pcol2Accol,
1397 const LocalOrdinalViewType & PIcol2Accol,
1398 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1399 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1400 const std::string& label,
1401 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1402 #ifdef HAVE_TPETRA_MMM_TIMINGS
1403 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1404 using Teuchos::TimeMonitor;
1405 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose")));
1409 typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1410 transposer_type transposer (Pview.origMatrix,label+std::string(
"XP: "));
1412 using Teuchos::ParameterList;
1414 RCP<ParameterList> transposeParams (
new ParameterList);
1415 transposeParams->set (
"sort",
false);
1417 if (! params.is_null ()) {
1418 transposeParams->set (
"compute global constants",
1419 params->get (
"compute global constants: temporaries",
1422 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1423 transposer.createTransposeLocal (transposeParams);
1424 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1425 Rview.origMatrix = Ptrans;
1427 mult_R_A_P_reuse_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1435 template<
class Scalar,
1437 class GlobalOrdinal,
1439 void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1440 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1441 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1442 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1443 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1444 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1445 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1446 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1447 const std::string& label,
1448 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1449 #ifdef HAVE_TPETRA_MMM_TIMINGS
1450 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1451 using Teuchos::TimeMonitor;
1452 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix SerialCore"))));
1455 using Teuchos::Array;
1456 using Teuchos::ArrayRCP;
1457 using Teuchos::ArrayView;
1462 typedef LocalOrdinal LO;
1463 typedef GlobalOrdinal GO;
1465 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1466 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1467 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1472 size_t n = Ac.getRowMap()->getLocalNumElements();
1473 LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1476 ArrayRCP<size_t> Acrowptr_RCP;
1477 ArrayRCP<LO> Accolind_RCP;
1478 ArrayRCP<SC> Acvals_RCP;
1485 auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1486 auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1487 auto Avals = Aview.origMatrix->getLocalValuesHost(
1488 Tpetra::Access::ReadOnly);
1489 auto Prowptr = Pview.origMatrix->getLocalRowPtrsHost();
1490 auto Pcolind = Pview.origMatrix->getLocalIndicesHost();
1491 auto Pvals = Pview.origMatrix->getLocalValuesHost(
1492 Tpetra::Access::ReadOnly);
1493 decltype(Prowptr) Irowptr;
1494 decltype(Pcolind) Icolind;
1495 decltype(Pvals) Ivals;
1497 if (!Pview.importMatrix.is_null()) {
1498 Irowptr = Pview.importMatrix->getLocalRowPtrsHost();
1499 Icolind = Pview.importMatrix->getLocalIndicesHost();
1500 Ivals = Pview.importMatrix->getLocalValuesHost(
1501 Tpetra::Access::ReadOnly);
1509 ArrayView<size_t> Acrowptr;
1510 ArrayView<LO> Accolind;
1511 ArrayView<SC> Acvals;
1522 #ifdef HAVE_TPETRA_MMM_TIMINGS
1523 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose"))));
1530 transposer_type transposer (Pview.origMatrix,label+std::string(
"XP: "));
1532 using Teuchos::ParameterList;
1533 RCP<ParameterList> transposeParams (
new ParameterList);
1534 transposeParams->set (
"sort",
false);
1535 if (! params.is_null ()) {
1536 transposeParams->set (
"compute global constants",
1537 params->get (
"compute global constants: temporaries",
1540 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1541 transposer.createTransposeLocal (transposeParams);
1543 auto Rrowptr = Ptrans->getLocalRowPtrsHost();
1544 auto Rcolind = Ptrans->getLocalIndicesHost();
1545 auto Rvals = Ptrans->getLocalValuesHost(Tpetra::Access::ReadOnly);
1550 #ifdef HAVE_TPETRA_MMM_TIMINGS
1551 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP graph"))));
1554 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1555 Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1557 size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1558 size_t nnzPerRowA = 100;
1559 if (Aview.origMatrix->getLocalNumEntries() > 0)
1560 nnzPerRowA = Aview.origMatrix->getLocalNumEntries()/Aview.origMatrix->getLocalNumRows();
1561 Acrowptr_RCP.resize(n+1);
1562 Acrowptr = Acrowptr_RCP();
1563 Accolind_RCP.resize(nnz_alloc);
1564 Accolind = Accolind_RCP();
1566 size_t nnz = 0, nnz_old = 0;
1567 for (
size_t i = 0; i < n; i++) {
1573 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1576 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1579 if (Acol2PRow[l] != LO_INVALID) {
1586 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1589 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1591 LO Acj = Pcol2Accol[j];
1593 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1595 ac_status[Acj] = nnz;
1596 Accolind[nnz] = Acj;
1607 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1608 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1610 LO Acj = PIcol2Accol[j];
1612 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1614 ac_status[Acj] = nnz;
1615 Accolind[nnz] = Acj;
1625 if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1627 nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1628 Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1629 Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1636 Accolind_RCP.resize(nnz);
1637 Accolind = Accolind_RCP();
1640 Acvals_RCP.resize(nnz, SC_ZERO);
1641 Acvals = Acvals_RCP();
1648 #ifdef HAVE_TPETRA_MMM_TIMINGS
1649 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Fill Matrix"))));
1653 for (
size_t k = 0; k < n; k++) {
1654 for (
size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
1656 const SC Pki = Pvals[ii];
1657 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1659 const SC Akl = Avals[ll];
1662 if (Acol2PRow[l] != LO_INVALID) {
1669 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1670 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1672 LO Acj = Pcol2Accol[j];
1674 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1675 if (Accolind[pp] == Acj)
1679 Acvals[pp] += Pki*Akl*Pvals[jj];
1688 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1689 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1691 LO Acj = PIcol2Accol[j];
1693 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1694 if (Accolind[pp] == Acj)
1698 Acvals[pp] += Pki*Akl*Ivals[jj];
1706 #ifdef HAVE_TPETRA_MMM_TIMINGS
1707 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP sort"))));
1714 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1717 Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1719 #ifdef HAVE_TPETRA_MMM_TIMINGS
1720 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix ESFC"))));
1731 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1732 labelList->set(
"Timer Label",label);
1734 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1735 RCP<const Export<LO,GO,NO> > dummyExport;
1736 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1737 Pview.origMatrix->getDomainMap(),
1739 dummyExport, labelList);
1753 #define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
1756 void TripleMatrixMultiply::MultiplyRAP( \
1757 const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
1759 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
1761 const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
1763 CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
1764 bool call_FillComplete_on_result, \
1765 const std::string & label, \
1766 const Teuchos::RCP<Teuchos::ParameterList>& params); \
1770 #endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isFillActive() const
Whether the matrix is not fill complete.
size_t global_size_t
Global size_t object.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
Utility functions for packing and unpacking sparse matrix entries.
void MultiplyRAP(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &R, bool transposeR, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &P, bool transposeP, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Ac, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Sparse matrix-matrix multiply.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.