41 #ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
42 #define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
45 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
46 #include "Teuchos_VerboseObject.hpp"
47 #include "Teuchos_Array.hpp"
49 #include "Tpetra_ConfigDefs.hpp"
50 #include "Tpetra_CrsMatrix.hpp"
52 #include "Tpetra_RowMatrixTransposer.hpp"
53 #include "Tpetra_ConfigDefs.hpp"
54 #include "Tpetra_Map.hpp"
55 #include "Tpetra_Export.hpp"
60 #include "Teuchos_FancyOStream.hpp"
74 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
75 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
79 namespace TripleMatrixMultiply{
87 template <
class Scalar,
99 bool call_FillComplete_on_result,
100 const std::string& label,
101 const Teuchos::RCP<Teuchos::ParameterList>& params)
106 typedef LocalOrdinal LO;
107 typedef GlobalOrdinal GO;
116 #ifdef HAVE_TPETRA_MMM_TIMINGS
117 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
118 using Teuchos::TimeMonitor;
119 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All Setup"))));
122 const std::string prefix =
"TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
127 TEUCHOS_TEST_FOR_EXCEPTION(!R.
isFillComplete(), std::runtime_error, prefix <<
"Matrix R is not fill complete.");
128 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
129 TEUCHOS_TEST_FOR_EXCEPTION(!P.
isFillComplete(), std::runtime_error, prefix <<
"Matrix P is not fill complete.");
134 RCP<const crs_matrix_type> Rprime = null;
138 RCP<const crs_matrix_type> Aprime = null;
142 RCP<const crs_matrix_type> Pprime = null;
152 const bool newFlag = !Ac.
getGraph()->isLocallyIndexed() && !Ac.
getGraph()->isGloballyIndexed();
154 using Teuchos::ParameterList;
155 RCP<ParameterList> transposeParams (
new ParameterList);
156 transposeParams->set (
"sort",
false);
158 if (transposeR && &R != &P) {
159 transposer_type transposer(rcpFromRef (R));
160 Rprime = transposer.createTranspose (transposeParams);
162 Rprime = rcpFromRef(R);
166 transposer_type transposer(rcpFromRef (A));
167 Aprime = transposer.createTranspose (transposeParams);
169 Aprime = rcpFromRef(A);
173 transposer_type transposer(rcpFromRef (P));
174 Pprime = transposer.createTranspose (transposeParams);
176 Pprime = rcpFromRef(P);
189 TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
190 prefix <<
"ERROR, inner dimensions of op(R) and op(A) "
191 "must match for matrix-matrix product. op(R) is "
192 << Rleft <<
"x" << Rright <<
", op(A) is "<< Aleft <<
"x" << Aright);
194 TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
195 prefix <<
"ERROR, inner dimensions of op(A) and op(P) "
196 "must match for matrix-matrix product. op(A) is "
197 << Aleft <<
"x" << Aright <<
", op(P) is "<< Pleft <<
"x" << Pright);
203 TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.
getGlobalNumRows(), std::runtime_error,
204 prefix <<
"ERROR, dimensions of result Ac must "
205 "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.
getGlobalNumRows()
206 <<
" rows, should have at least " << Rleft << std::endl);
218 int numProcs = P.
getComm()->getSize();
222 crs_matrix_struct_type Rview;
223 crs_matrix_struct_type Aview;
224 crs_matrix_struct_type Pview;
226 RCP<const map_type> targetMap_R = Rprime->getRowMap();
227 RCP<const map_type> targetMap_A = Aprime->getRowMap();
228 RCP<const map_type> targetMap_P = Pprime->getRowMap();
230 #ifdef HAVE_TPETRA_MMM_TIMINGS
231 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All I&X"))));
237 RCP<const import_type> dummyImporter;
239 if (!(transposeR && &R == &P))
240 MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter,
true, label, params);
242 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label, params);
247 targetMap_P = Aprime->getColMap();
250 MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(),
false, label, params);
253 RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
255 bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
256 if (needs_final_export)
259 Actemp = rcp(&Ac,
false);
261 #ifdef HAVE_TPETRA_MMM_TIMINGS
262 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP All Multiply"))));
266 if (call_FillComplete_on_result && newFlag) {
267 if (transposeR && &R == &P)
268 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
270 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
271 }
else if (call_FillComplete_on_result) {
272 if (transposeR && &R == &P)
273 MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
275 MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
298 if (transposeR && &R == &P)
299 MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
301 MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
304 if (needs_final_export) {
305 #ifdef HAVE_TPETRA_MMM_TIMINGS
306 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP exportAndFillComplete"))));
308 Teuchos::ParameterList labelList;
309 labelList.set(
"Timer Label", label);
310 Teuchos::ParameterList& labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
312 RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
314 bool overrideAllreduce =
false;
316 if(!params.is_null()) {
317 Teuchos::ParameterList& params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
319 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
320 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
321 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
322 isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
323 overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
325 labelList.set(
"compute global constants",params->get(
"compute global constants",
true));
327 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,
"Core Count above which the optimized neighbor discovery is used");
329 labelList_subList.set(
"isMatrixMatrix_TransferAndFillComplete",isMM,
330 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
331 labelList_subList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
333 export_type exporter = export_type(*Pprime->getGraph()->getImporter());
334 Actemp->exportAndFillComplete(Acprime,
336 Acprime->getDomainMap(),
337 Acprime->getRangeMap(),
338 rcp(&labelList,
false));
341 #ifdef HAVE_TPETRA_MMM_STATISTICS
342 printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label+std::string(
" RAP MMM"));
353 template<
class Scalar,
357 void mult_R_A_P_newmatrix(
358 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
359 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
360 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
361 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
362 const std::string& label,
363 const Teuchos::RCP<Teuchos::ParameterList>& params)
365 using Teuchos::Array;
366 using Teuchos::ArrayRCP;
367 using Teuchos::ArrayView;
372 typedef LocalOrdinal LO;
373 typedef GlobalOrdinal GO;
376 typedef Import<LO,GO,NO> import_type;
377 typedef Map<LO,GO,NO> map_type;
380 typedef typename map_type::local_map_type local_map_type;
382 typedef typename KCRS::StaticCrsGraphType graph_t;
383 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
384 typedef typename NO::execution_space execution_space;
385 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
386 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
388 #ifdef HAVE_TPETRA_MMM_TIMINGS
389 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
390 using Teuchos::TimeMonitor;
391 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
393 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
396 RCP<const import_type> Cimport;
397 RCP<const map_type> Ccolmap;
398 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
399 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
400 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
401 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
402 local_map_type Irowmap_local;
if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
403 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
404 local_map_type Icolmap_local;
if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
412 lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Pcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
414 if (Pview.importMatrix.is_null()) {
417 Ccolmap = Pview.colMap;
418 const LO colMapSize =
static_cast<LO
>(Pview.colMap->getNodeNumElements());
420 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
421 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
422 KOKKOS_LAMBDA(
const LO i) {
435 if (!Pimport.is_null() && !Iimport.is_null()) {
436 Cimport = Pimport->setUnion(*Iimport);
438 else if (!Pimport.is_null() && Iimport.is_null()) {
439 Cimport = Pimport->setUnion();
441 else if (Pimport.is_null() && !Iimport.is_null()) {
442 Cimport = Iimport->setUnion();
445 throw std::runtime_error(
"TpetraExt::RAP status of matrix importers is nonsensical");
447 Ccolmap = Cimport->getTargetMap();
452 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
453 std::runtime_error,
"Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
460 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
461 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
462 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
463 Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
465 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
466 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
475 Ac.replaceColMap(Ccolmap);
493 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
494 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
495 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
496 GO aidx = Acolmap_local.getGlobalElement(i);
497 LO P_LID = Prowmap_local.getLocalElement(aidx);
498 if (P_LID != LO_INVALID) {
499 targetMapToOrigRow(i) = P_LID;
500 targetMapToImportRow(i) = LO_INVALID;
502 LO I_LID = Irowmap_local.getLocalElement(aidx);
503 targetMapToOrigRow(i) = LO_INVALID;
504 targetMapToImportRow(i) = I_LID;
510 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
511 mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
512 targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
513 Ac, Cimport, label, params);
518 template<
class Scalar,
522 void mult_R_A_P_reuse(
523 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
524 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
525 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
526 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
527 const std::string& label,
528 const Teuchos::RCP<Teuchos::ParameterList>& params)
530 using Teuchos::Array;
531 using Teuchos::ArrayRCP;
532 using Teuchos::ArrayView;
537 typedef LocalOrdinal LO;
538 typedef GlobalOrdinal GO;
541 typedef Import<LO,GO,NO> import_type;
542 typedef Map<LO,GO,NO> map_type;
545 typedef typename map_type::local_map_type local_map_type;
547 typedef typename KCRS::StaticCrsGraphType graph_t;
548 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
549 typedef typename NO::execution_space execution_space;
550 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
551 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
553 #ifdef HAVE_TPETRA_MMM_TIMINGS
554 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
555 using Teuchos::TimeMonitor;
556 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
558 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
561 RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
562 RCP<const map_type> Ccolmap = Ac.getColMap();
563 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
564 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
565 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
566 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
567 local_map_type Irowmap_local;
if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
568 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
569 local_map_type Icolmap_local;
if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
570 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
573 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
577 Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
578 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
581 if (!Pview.importMatrix.is_null()) {
582 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
583 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
585 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
586 Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
587 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
593 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
594 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
595 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
596 GO aidx = Acolmap_local.getGlobalElement(i);
597 LO B_LID = Prowmap_local.getLocalElement(aidx);
598 if (B_LID != LO_INVALID) {
599 targetMapToOrigRow(i) = B_LID;
600 targetMapToImportRow(i) = LO_INVALID;
602 LO I_LID = Irowmap_local.getLocalElement(aidx);
603 targetMapToOrigRow(i) = LO_INVALID;
604 targetMapToImportRow(i) = I_LID;
611 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
612 mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
613 targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
614 Ac, Cimport, label, params);
619 template<
class Scalar,
623 void mult_PT_A_P_newmatrix(
624 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
625 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
626 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
627 const std::string& label,
628 const Teuchos::RCP<Teuchos::ParameterList>& params)
630 using Teuchos::Array;
631 using Teuchos::ArrayRCP;
632 using Teuchos::ArrayView;
637 typedef LocalOrdinal LO;
638 typedef GlobalOrdinal GO;
641 typedef Import<LO,GO,NO> import_type;
642 typedef Map<LO,GO,NO> map_type;
645 typedef typename map_type::local_map_type local_map_type;
647 typedef typename KCRS::StaticCrsGraphType graph_t;
648 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
649 typedef typename NO::execution_space execution_space;
650 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
651 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
653 #ifdef HAVE_TPETRA_MMM_TIMINGS
654 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
655 using Teuchos::TimeMonitor;
656 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP M5 Cmap")))));
658 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
661 RCP<const import_type> Cimport;
662 RCP<const map_type> Ccolmap;
663 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
664 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
665 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
666 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
667 local_map_type Irowmap_local;
if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
668 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
669 local_map_type Icolmap_local;
if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
677 lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Pcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
679 if (Pview.importMatrix.is_null()) {
682 Ccolmap = Pview.colMap;
683 const LO colMapSize =
static_cast<LO
>(Pview.colMap->getNodeNumElements());
685 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
686 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
687 KOKKOS_LAMBDA(
const LO i) {
700 if (!Pimport.is_null() && !Iimport.is_null()) {
701 Cimport = Pimport->setUnion(*Iimport);
703 else if (!Pimport.is_null() && Iimport.is_null()) {
704 Cimport = Pimport->setUnion();
706 else if (Pimport.is_null() && !Iimport.is_null()) {
707 Cimport = Iimport->setUnion();
710 throw std::runtime_error(
"TpetraExt::RAP status of matrix importers is nonsensical");
712 Ccolmap = Cimport->getTargetMap();
717 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
718 std::runtime_error,
"Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
725 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
726 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
727 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
728 Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
730 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
731 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
740 Ac.replaceColMap(Ccolmap);
758 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
759 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
761 Kokkos::parallel_for(
"Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
762 GO aidx = Acolmap_local.getGlobalElement(i);
763 LO P_LID = Prowmap_local.getLocalElement(aidx);
764 if (P_LID != LO_INVALID) {
765 targetMapToOrigRow(i) = P_LID;
766 targetMapToImportRow(i) = LO_INVALID;
768 LO I_LID = Irowmap_local.getLocalElement(aidx);
769 targetMapToOrigRow(i) = LO_INVALID;
770 targetMapToImportRow(i) = I_LID;
776 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
777 mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
778 targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
779 Ac, Cimport, label, params);
783 template<
class Scalar,
787 void mult_PT_A_P_reuse(
788 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
789 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
790 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
791 const std::string& label,
792 const Teuchos::RCP<Teuchos::ParameterList>& params)
794 using Teuchos::Array;
795 using Teuchos::ArrayRCP;
796 using Teuchos::ArrayView;
801 typedef LocalOrdinal LO;
802 typedef GlobalOrdinal GO;
805 typedef Import<LO,GO,NO> import_type;
806 typedef Map<LO,GO,NO> map_type;
809 typedef typename map_type::local_map_type local_map_type;
811 typedef typename KCRS::StaticCrsGraphType graph_t;
812 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
813 typedef typename NO::execution_space execution_space;
814 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
815 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
817 #ifdef HAVE_TPETRA_MMM_TIMINGS
818 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
819 using Teuchos::TimeMonitor;
820 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP M5 Cmap")))));
822 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
825 RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
826 RCP<const map_type> Ccolmap = Ac.getColMap();
827 RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
828 RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
829 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
830 local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
831 local_map_type Irowmap_local;
if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
832 local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
833 local_map_type Icolmap_local;
if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
834 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
837 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
841 Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
842 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
845 if (!Pview.importMatrix.is_null()) {
846 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
847 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
849 Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
850 Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
851 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
857 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
858 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
859 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
860 GO aidx = Acolmap_local.getGlobalElement(i);
861 LO B_LID = Prowmap_local.getLocalElement(aidx);
862 if (B_LID != LO_INVALID) {
863 targetMapToOrigRow(i) = B_LID;
864 targetMapToImportRow(i) = LO_INVALID;
866 LO I_LID = Irowmap_local.getLocalElement(aidx);
867 targetMapToOrigRow(i) = LO_INVALID;
868 targetMapToImportRow(i) = I_LID;
875 KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
876 mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
877 targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
878 Ac, Cimport, label, params);
885 template<
class Scalar,
889 class LocalOrdinalViewType>
890 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
891 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
892 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
893 const LocalOrdinalViewType & Acol2Prow,
894 const LocalOrdinalViewType & Acol2PIrow,
895 const LocalOrdinalViewType & Pcol2Accol,
896 const LocalOrdinalViewType & PIcol2Accol,
897 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
898 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
899 const std::string& label,
900 const Teuchos::RCP<Teuchos::ParameterList>& params) {
901 #ifdef HAVE_TPETRA_MMM_TIMINGS
902 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
903 using Teuchos::TimeMonitor;
904 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore"))));
907 using Teuchos::Array;
908 using Teuchos::ArrayRCP;
909 using Teuchos::ArrayView;
915 typedef typename KCRS::StaticCrsGraphType graph_t;
916 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
917 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
918 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
919 typedef typename KCRS::values_type::non_const_type scalar_view_t;
922 typedef LocalOrdinal LO;
923 typedef GlobalOrdinal GO;
925 typedef Map<LO,GO,NO> map_type;
926 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
927 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
928 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
931 RCP<const map_type> Accolmap = Ac.getColMap();
932 size_t m = Rview.origMatrix->getNodeNumRows();
933 size_t n = Accolmap->getNodeNumElements();
934 size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();
937 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
938 const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
939 const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
941 c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map;
942 const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries;
943 const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
945 c_lno_view_t Irowptr;
946 lno_nnz_view_t Icolind;
948 if(!Pview.importMatrix.is_null()) {
949 Irowptr = Pview.importMatrix->getLocalMatrix().graph.row_map;
950 Icolind = Pview.importMatrix->getLocalMatrix().graph.entries;
951 Ivals = Pview.importMatrix->getLocalMatrix().values;
952 p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getNodeMaxNumRowEntries());
955 #ifdef HAVE_TPETRA_MMM_TIMINGS
956 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix SerialCore - Compare"))));
966 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
967 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
968 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
969 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
979 const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
980 Array<size_t> ac_status(n, ST_INVALID);
990 size_t nnz = 0, nnz_old = 0;
991 for (
size_t i = 0; i < m; i++) {
997 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
999 const SC Rik = Rvals[kk];
1003 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1005 const SC Akl = Avals[ll];
1010 if (Acol2Prow[l] != LO_INVALID) {
1017 size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1020 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1022 LO Acj = Pcol2Accol[j];
1025 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1026 #ifdef HAVE_TPETRA_DEBUG
1028 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1030 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1033 ac_status[Acj] = nnz;
1035 Cvals[nnz] = Rik*Akl*Plj;
1038 Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1048 size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1049 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1051 LO Acj = PIcol2Accol[j];
1054 if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1055 #ifdef HAVE_TPETRA_DEBUG
1057 TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1059 label <<
" ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1062 ac_status[Acj] = nnz;
1064 Cvals[nnz] = Rik*Akl*Plj;
1067 Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1074 if (nnz + n > CSR_alloc) {
1076 Kokkos::resize(Ccolind,CSR_alloc);
1077 Kokkos::resize(Cvals,CSR_alloc);
1085 Kokkos::resize(Ccolind,nnz);
1086 Kokkos::resize(Cvals,nnz);
1088 #ifdef HAVE_TPETRA_MMM_TIMINGS
1089 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix Final Sort"))));
1093 if (params.is_null() || params->get(
"sort entries",
true))
1094 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
1095 Ac.setAllValues(Crowptr, Ccolind, Cvals);
1097 #ifdef HAVE_TPETRA_MMM_TIMINGS
1098 MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Newmatrix ESFC"))));
1109 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1110 labelList->set(
"Timer Label",label);
1111 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1112 RCP<const Export<LO,GO,NO> > dummyExport;
1113 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1114 Rview.origMatrix->getRangeMap(),
1124 template<
class Scalar,
1126 class GlobalOrdinal,
1128 class LocalOrdinalViewType>
1129 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
1130 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1131 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1132 const LocalOrdinalViewType & Acol2Prow,
1133 const LocalOrdinalViewType & Acol2PIrow,
1134 const LocalOrdinalViewType & Pcol2Accol,
1135 const LocalOrdinalViewType & PIcol2Accol,
1136 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1137 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1138 const std::string& label,
1139 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1140 #ifdef HAVE_TPETRA_MMM_TIMINGS
1141 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1142 using Teuchos::TimeMonitor;
1143 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse SerialCore"))));
1146 using Teuchos::Array;
1147 using Teuchos::ArrayRCP;
1148 using Teuchos::ArrayView;
1154 typedef typename KCRS::StaticCrsGraphType graph_t;
1155 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1156 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1157 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1160 typedef LocalOrdinal LO;
1161 typedef GlobalOrdinal GO;
1163 typedef Map<LO,GO,NO> map_type;
1164 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1165 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1166 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1169 RCP<const map_type> Accolmap = Ac.getColMap();
1170 size_t m = Rview.origMatrix->getNodeNumRows();
1171 size_t n = Accolmap->getNodeNumElements();
1172 size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();
1175 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1176 const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
1177 const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
1178 const KCRS & Cmat = Ac.getLocalMatrix();
1180 c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
1181 const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
1182 const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
1183 scalar_view_t Cvals = Cmat.values;
1185 c_lno_view_t Irowptr;
1186 lno_nnz_view_t Icolind;
1187 scalar_view_t Ivals;
1188 if(!Pview.importMatrix.is_null()) {
1189 Irowptr = Pview.importMatrix->getLocalMatrix().graph.row_map;
1190 Icolind = Pview.importMatrix->getLocalMatrix().graph.entries;
1191 Ivals = Pview.importMatrix->getLocalMatrix().values;
1192 p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getNodeMaxNumRowEntries());
1195 #ifdef HAVE_TPETRA_MMM_TIMINGS
1196 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse SerialCore - Compare"))));
1207 Array<size_t> ac_status(n, ST_INVALID);
1221 size_t OLD_ip = 0, CSR_ip = 0;
1222 for (
size_t i = 0; i < m; i++) {
1225 OLD_ip = Crowptr[i];
1226 CSR_ip = Crowptr[i+1];
1227 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
1228 ac_status[Ccolind[k]] = k;
1235 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1237 const SC Rik = Rvals[kk];
1241 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1243 const SC Akl = Avals[ll];
1248 if (Acol2Prow[l] != LO_INVALID) {
1255 size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1258 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1260 LO Cij = Pcol2Accol[j];
1263 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1264 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1265 "(c_status = " << ac_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1267 Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1276 size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1277 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1279 LO Cij = PIcol2Accol[j];
1282 TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1283 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
1284 "(c_status = " << ac_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
1286 Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1293 #ifdef HAVE_TPETRA_MMM_TIMINGS
1294 auto MM3 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"RAP Reuse ESFC"))));
1297 Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());
1305 template<
class Scalar,
1307 class GlobalOrdinal,
1309 class LocalOrdinalViewType>
1310 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1311 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1312 const LocalOrdinalViewType & Acol2Prow,
1313 const LocalOrdinalViewType & Acol2PIrow,
1314 const LocalOrdinalViewType & Pcol2Accol,
1315 const LocalOrdinalViewType & PIcol2Accol,
1316 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1317 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1318 const std::string& label,
1319 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1320 #ifdef HAVE_TPETRA_MMM_TIMINGS
1321 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1322 using Teuchos::TimeMonitor;
1323 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose")));
1327 typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1328 transposer_type transposer (Pview.origMatrix,label+std::string(
"XP: "));
1330 using Teuchos::ParameterList;
1332 RCP<ParameterList> transposeParams (
new ParameterList);
1333 transposeParams->set (
"sort",
false);
1335 if (! params.is_null ()) {
1336 transposeParams->set (
"compute global constants",
1337 params->get (
"compute global constants: temporaries",
1340 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1341 transposer.createTransposeLocal (transposeParams);
1342 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1343 Rview.origMatrix = Ptrans;
1345 mult_R_A_P_newmatrix_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1351 template<
class Scalar,
1353 class GlobalOrdinal,
1355 class LocalOrdinalViewType>
1356 void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1357 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1358 const LocalOrdinalViewType & Acol2Prow,
1359 const LocalOrdinalViewType & Acol2PIrow,
1360 const LocalOrdinalViewType & Pcol2Accol,
1361 const LocalOrdinalViewType & PIcol2Accol,
1362 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1363 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1364 const std::string& label,
1365 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1366 #ifdef HAVE_TPETRA_MMM_TIMINGS
1367 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1368 using Teuchos::TimeMonitor;
1369 Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose")));
1373 typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1374 transposer_type transposer (Pview.origMatrix,label+std::string(
"XP: "));
1376 using Teuchos::ParameterList;
1378 RCP<ParameterList> transposeParams (
new ParameterList);
1379 transposeParams->set (
"sort",
false);
1381 if (! params.is_null ()) {
1382 transposeParams->set (
"compute global constants",
1383 params->get (
"compute global constants: temporaries",
1386 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1387 transposer.createTransposeLocal (transposeParams);
1388 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1389 Rview.origMatrix = Ptrans;
1391 mult_R_A_P_reuse_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1399 template<
class Scalar,
1401 class GlobalOrdinal,
1403 void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1404 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1405 const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1406 const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1407 const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1408 const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1409 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1410 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1411 const std::string& label,
1412 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1413 #ifdef HAVE_TPETRA_MMM_TIMINGS
1414 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1415 using Teuchos::TimeMonitor;
1416 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix SerialCore"))));
1419 using Teuchos::Array;
1420 using Teuchos::ArrayRCP;
1421 using Teuchos::ArrayView;
1426 typedef LocalOrdinal LO;
1427 typedef GlobalOrdinal GO;
1429 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1430 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1431 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1436 size_t n = Ac.getRowMap()->getNodeNumElements();
1437 LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1440 ArrayRCP<const size_t> Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
1441 ArrayRCP<size_t> Acrowptr_RCP;
1442 ArrayRCP<const LO> Acolind_RCP, Pcolind_RCP, Icolind_RCP;
1443 ArrayRCP<LO> Accolind_RCP;
1444 ArrayRCP<const Scalar> Avals_RCP, Pvals_RCP, Ivals_RCP;
1445 ArrayRCP<SC> Acvals_RCP;
1452 Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1453 Pview.origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);
1455 if (!Pview.importMatrix.is_null())
1456 Pview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1463 ArrayView<const size_t> Arowptr, Prowptr, Irowptr;
1464 ArrayView<const LO> Acolind, Pcolind, Icolind;
1465 ArrayView<const SC> Avals, Pvals, Ivals;
1466 ArrayView<size_t> Acrowptr;
1467 ArrayView<LO> Accolind;
1468 ArrayView<SC> Acvals;
1469 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1470 Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
1471 if (!Pview.importMatrix.is_null()) {
1472 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1484 #ifdef HAVE_TPETRA_MMM_TIMINGS
1485 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP local transpose"))));
1492 ArrayRCP<const size_t> Rrowptr_RCP;
1493 ArrayRCP<const LO> Rcolind_RCP;
1494 ArrayRCP<const Scalar> Rvals_RCP;
1495 ArrayView<const size_t> Rrowptr;
1496 ArrayView<const LO> Rcolind;
1497 ArrayView<const SC> Rvals;
1499 transposer_type transposer (Pview.origMatrix,label+std::string(
"XP: "));
1501 using Teuchos::ParameterList;
1502 RCP<ParameterList> transposeParams (
new ParameterList);
1503 transposeParams->set (
"sort",
false);
1504 if (! params.is_null ()) {
1505 transposeParams->set (
"compute global constants",
1506 params->get (
"compute global constants: temporaries",
1509 RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1510 transposer.createTransposeLocal (transposeParams);
1512 Ptrans->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
1513 Rrowptr = Rrowptr_RCP();
1514 Rcolind = Rcolind_RCP();
1515 Rvals = Rvals_RCP();
1520 #ifdef HAVE_TPETRA_MMM_TIMINGS
1521 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP graph"))));
1524 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1525 Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1527 size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1528 size_t nnzPerRowA = 100;
1529 if (Aview.origMatrix->getNodeNumEntries() > 0)
1530 nnzPerRowA = Aview.origMatrix->getNodeNumEntries()/Aview.origMatrix->getNodeNumRows();
1531 Acrowptr_RCP.resize(n+1);
1532 Acrowptr = Acrowptr_RCP();
1533 Accolind_RCP.resize(nnz_alloc);
1534 Accolind = Accolind_RCP();
1536 size_t nnz = 0, nnz_old = 0;
1537 for (
size_t i = 0; i < n; i++) {
1543 for (
size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1546 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1549 if (Acol2PRow[l] != LO_INVALID) {
1556 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1559 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1561 LO Acj = Pcol2Accol[j];
1563 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1565 ac_status[Acj] = nnz;
1566 Accolind[nnz] = Acj;
1577 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1578 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1580 LO Acj = PIcol2Accol[j];
1582 if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1584 ac_status[Acj] = nnz;
1585 Accolind[nnz] = Acj;
1595 if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1597 nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1598 Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1599 Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1606 Accolind_RCP.resize(nnz);
1607 Accolind = Accolind_RCP();
1610 Acvals_RCP.resize(nnz, SC_ZERO);
1611 Acvals = Acvals_RCP();
1618 #ifdef HAVE_TPETRA_MMM_TIMINGS
1619 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix Fill Matrix"))));
1623 for (
size_t k = 0; k < n; k++) {
1624 for (
size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
1626 const SC Pki = Pvals[ii];
1627 for (
size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1629 const SC Akl = Avals[ll];
1632 if (Acol2PRow[l] != LO_INVALID) {
1639 size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1640 for (
size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1642 LO Acj = Pcol2Accol[j];
1644 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1645 if (Accolind[pp] == Acj)
1649 Acvals[pp] += Pki*Akl*Pvals[jj];
1658 size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1659 for (
size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1661 LO Acj = PIcol2Accol[j];
1663 for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1664 if (Accolind[pp] == Acj)
1668 Acvals[pp] += Pki*Akl*Ivals[jj];
1676 #ifdef HAVE_TPETRA_MMM_TIMINGS
1677 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP sort"))));
1684 Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1687 Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1689 #ifdef HAVE_TPETRA_MMM_TIMINGS
1690 MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"PTAP Newmatrix ESFC"))));
1701 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1702 labelList->set(
"Timer Label",label);
1704 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1705 RCP<const Export<LO,GO,NO> > dummyExport;
1706 Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1707 Pview.origMatrix->getDomainMap(),
1709 dummyExport, labelList);
1723 #define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
1726 void TripleMatrixMultiply::MultiplyRAP( \
1727 const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
1729 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
1731 const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
1733 CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
1734 bool call_FillComplete_on_result, \
1735 const std::string & label, \
1736 const Teuchos::RCP<Teuchos::ParameterList>& params); \
1740 #endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
bool isFillActive() const
Whether the matrix is not fill complete.
size_t global_size_t
Global size_t object.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
Utility functions for packing and unpacking sparse matrix entries.
void MultiplyRAP(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &R, bool transposeR, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &P, bool transposeP, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Ac, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Sparse matrix-matrix multiply.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.