41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 #include "Tpetra_ConfigDefs.hpp"
45 #include "Teuchos_VerboseObject.hpp"
46 #include "Teuchos_Array.hpp"
48 #include "Tpetra_CrsMatrix.hpp"
50 #include "Tpetra_RowMatrixTransposer.hpp"
53 #include "Tpetra_Details_makeColMap.hpp"
54 #include "Tpetra_ConfigDefs.hpp"
55 #include "Tpetra_Map.hpp"
56 #include "Tpetra_Export.hpp"
60 #include <type_traits>
61 #include "Teuchos_FancyOStream.hpp"
63 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
66 #include "KokkosSparse_spgemm.hpp"
67 #include "KokkosSparse_spadd.hpp"
68 #include "Kokkos_Bitset.hpp"
82 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
83 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
88 namespace MatrixMatrix{
96 template <
class Scalar,
106 bool call_FillComplete_on_result,
107 const std::string& label,
108 const Teuchos::RCP<Teuchos::ParameterList>& params)
114 typedef LocalOrdinal LO;
115 typedef GlobalOrdinal GO;
123 #ifdef HAVE_TPETRA_MMM_TIMINGS
124 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
125 using Teuchos::TimeMonitor;
128 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Setup"))));
131 const std::string prefix =
"TpetraExt::MatrixMatrix::Multiply(): ";
136 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
137 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
142 RCP<const crs_matrix_type> Aprime = null;
146 RCP<const crs_matrix_type> Bprime = null;
156 const bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
158 bool use_optimized_ATB =
false;
159 if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
160 use_optimized_ATB =
true;
162 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
163 use_optimized_ATB =
false;
166 using Teuchos::ParameterList;
167 RCP<ParameterList> transposeParams (
new ParameterList);
168 transposeParams->set (
"sort",
false);
170 if (!use_optimized_ATB && transposeA) {
171 transposer_type transposer (rcpFromRef (A));
172 Aprime = transposer.createTranspose (transposeParams);
175 Aprime = rcpFromRef(A);
179 transposer_type transposer (rcpFromRef (B));
180 Bprime = transposer.createTranspose (transposeParams);
183 Bprime = rcpFromRef(B);
193 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
194 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
195 "must match for matrix-matrix product. op(A) is "
196 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
202 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
203 prefix <<
"ERROR, dimensions of result C must "
205 <<
" rows, should have at least " << Aouter << std::endl);
217 int numProcs = A.
getComm()->getSize();
221 crs_matrix_struct_type Aview;
222 crs_matrix_struct_type Bview;
224 RCP<const map_type> targetMap_A = Aprime->getRowMap();
225 RCP<const map_type> targetMap_B = Bprime->getRowMap();
227 #ifdef HAVE_TPETRA_MMM_TIMINGS
229 TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All I&X")));
235 if (!use_optimized_ATB) {
236 RCP<const import_type> dummyImporter;
237 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label, params);
243 targetMap_B = Aprime->getColMap();
246 if (!use_optimized_ATB)
247 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(),
false, label, params);
249 #ifdef HAVE_TPETRA_MMM_TIMINGS
252 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Multiply"))));
256 if (use_optimized_ATB) {
257 MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
259 }
else if (call_FillComplete_on_result && newFlag) {
260 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
262 }
else if (call_FillComplete_on_result) {
263 MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
269 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
271 MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
274 #ifdef HAVE_TPETRA_MMM_TIMINGS
275 TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All FillComplete")));
284 C.
fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
289 template <
class Scalar,
298 bool call_FillComplete_on_result,
299 const std::string& label,
300 const Teuchos::RCP<Teuchos::ParameterList>& params)
304 typedef LocalOrdinal LO;
305 typedef GlobalOrdinal GO;
312 #ifdef HAVE_TPETRA_MMM_TIMINGS
313 std::string prefix_mmm = std::string(
"TpetraExt ")+ label + std::string(
": ");
314 using Teuchos::TimeMonitor;
315 TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string(
"Jacobi All Setup")));
318 const std::string prefix =
"TpetraExt::MatrixMatrix::Jacobi(): ";
323 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
324 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
326 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
327 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
336 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
337 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
338 "must match for matrix-matrix product. op(A) is "
339 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
345 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
346 prefix <<
"ERROR, dimensions of result C must "
348 <<
" rows, should have at least "<< Aouter << std::endl);
360 int numProcs = A.
getComm()->getSize();
364 crs_matrix_struct_type Aview;
365 crs_matrix_struct_type Bview;
367 RCP<const map_type> targetMap_A = Aprime->getRowMap();
368 RCP<const map_type> targetMap_B = Bprime->getRowMap();
370 #ifdef HAVE_TPETRA_MMM_TIMINGS
371 TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All I&X")));
377 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(
new Teuchos::ParameterList);
378 if(!params.is_null()) {
379 importParams1->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
380 int mm_optimization_core_count=0;
381 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
383 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
384 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
385 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
386 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
387 auto & ip1slist = importParams1->sublist(
"matrixmatrix: kernel params",
false);
388 ip1slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
389 ip1slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
390 ip1slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
394 RCP<const import_type> dummyImporter;
395 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label,importParams1);
400 targetMap_B = Aprime->getColMap();
405 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(
new Teuchos::ParameterList);
406 if(!params.is_null()) {
407 importParams2->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
409 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
411 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
412 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
413 auto & ip2slist = importParams2->sublist(
"matrixmatrix: kernel params",
false);
414 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
415 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
416 ip2slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
417 ip2slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
418 ip2slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
421 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(),
false, label,importParams2);
423 #ifdef HAVE_TPETRA_MMM_TIMINGS
425 TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All Multiply")));
429 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
432 bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
434 if (call_FillComplete_on_result && newFlag) {
435 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
437 }
else if (call_FillComplete_on_result) {
438 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
441 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::runtime_error,
"jacobi_A_B_general not implemented");
464 template <
class Scalar,
475 using Teuchos::Array;
479 typedef LocalOrdinal LO;
480 typedef GlobalOrdinal GO;
485 const std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
487 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error,
488 prefix <<
"ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
489 "(Result matrix B is not required to be isFillComplete()).");
490 TEUCHOS_TEST_FOR_EXCEPTION(B.
isFillComplete() , std::runtime_error,
491 prefix <<
"ERROR, input matrix B must not be fill complete!");
492 TEUCHOS_TEST_FOR_EXCEPTION(B.
isStaticGraph() , std::runtime_error,
493 prefix <<
"ERROR, input matrix B must not have static graph!");
495 prefix <<
"ERROR, input matrix B must not be locally indexed!");
497 using Teuchos::ParameterList;
498 RCP<ParameterList> transposeParams (
new ParameterList);
499 transposeParams->set (
"sort",
false);
501 RCP<const crs_matrix_type> Aprime = null;
503 transposer_type transposer (rcpFromRef (A));
504 Aprime = transposer.createTranspose (transposeParams);
507 Aprime = rcpFromRef(A);
515 if (scalarB != Teuchos::ScalarTraits<SC>::one())
520 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
521 for (LO i = 0; (size_t)i < numMyRows; ++i) {
522 row = B.
getRowMap()->getGlobalElement(i);
523 Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
525 if (scalarA != Teuchos::ScalarTraits<SC>::one())
526 for (
size_t j = 0; j < a_numEntries; ++j)
527 a_vals[j] *= scalarA;
537 template <
class Scalar,
541 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
543 const bool transposeA,
546 const bool transposeB,
550 const Teuchos::RCP<Teuchos::ParameterList>& params)
553 using Teuchos::rcpFromRef;
555 using Teuchos::ParameterList;
557 if(!params.is_null())
559 TEUCHOS_TEST_FOR_EXCEPTION(
560 params->isParameter(
"Call fillComplete") && !params->get<
bool>(
"Call fillComplete"),
561 std::invalid_argument,
562 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
563 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
564 params->set(
"Call fillComplete",
true);
568 RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
575 TEUCHOS_TEST_FOR_EXCEPTION
576 (! A.
isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
577 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
578 RCP<crs_matrix_type> C = rcp(
new crs_matrix_type(Brcp->getRowMap(), 0));
580 add(alpha, transposeA, A, beta,
false, *Brcp, *C, domainMap, rangeMap, params);
588 template<
class LO,
class GO,
class LOView,
class GOView,
class LocalMap>
589 struct ConvertGlobalToLocalFunctor
591 ConvertGlobalToLocalFunctor(LOView& lids_,
const GOView& gids_,
const LocalMap localColMap_)
592 : lids(lids_), gids(gids_), localColMap(localColMap_)
595 KOKKOS_FUNCTION
void operator() (
const GO i)
const
597 lids(i) = localColMap.getLocalElement(gids(i));
602 const LocalMap localColMap;
605 template <
class Scalar,
611 const bool transposeA,
614 const bool transposeB,
619 const Teuchos::RCP<Teuchos::ParameterList>& params)
623 using Teuchos::rcpFromRef;
624 using Teuchos::rcp_implicit_cast;
625 using Teuchos::rcp_dynamic_cast;
626 using Teuchos::TimeMonitor;
628 using LO = LocalOrdinal;
629 using GO = GlobalOrdinal;
637 using exec_space =
typename crs_graph_type::execution_space;
638 using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
639 const char* prefix_mmm =
"TpetraExt::MatrixMatrix::add: ";
640 constexpr
bool debug =
false;
642 #ifdef HAVE_TPETRA_MMM_TIMINGS
643 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"transpose"))));
647 std::ostringstream os;
648 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
649 <<
"TpetraExt::MatrixMatrix::add" << std::endl;
650 std::cerr << os.str ();
653 TEUCHOS_TEST_FOR_EXCEPTION
655 prefix_mmm <<
"C must be a 'new' matrix (neither locally nor globally indexed).");
656 TEUCHOS_TEST_FOR_EXCEPTION
658 prefix_mmm <<
"A and B must both be fill complete.");
659 #ifdef HAVE_TPETRA_DEBUG
662 const bool domainMapsSame =
663 (! transposeA && ! transposeB &&
665 (! transposeA && transposeB &&
667 ( transposeA && ! transposeB &&
669 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
670 prefix_mmm <<
"The domain Maps of Op(A) and Op(B) are not the same.");
672 const bool rangeMapsSame =
673 (! transposeA && ! transposeB &&
675 (! transposeA && transposeB &&
677 ( transposeA && ! transposeB &&
679 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
680 prefix_mmm <<
"The range Maps of Op(A) and Op(B) are not the same.");
682 #endif // HAVE_TPETRA_DEBUG
684 using Teuchos::ParameterList;
686 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
688 transposer_type transposer (Aprime);
689 Aprime = transposer.createTranspose ();
692 #ifdef HAVE_TPETRA_DEBUG
693 TEUCHOS_TEST_FOR_EXCEPTION
694 (Aprime.is_null (), std::logic_error,
695 prefix_mmm <<
"Failed to compute Op(A). "
696 "Please report this bug to the Tpetra developers.");
697 #endif // HAVE_TPETRA_DEBUG
700 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
703 std::ostringstream os;
704 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
705 <<
"Form explicit xpose of B" << std::endl;
706 std::cerr << os.str ();
708 transposer_type transposer (Bprime);
709 Bprime = transposer.createTranspose ();
711 #ifdef HAVE_TPETRA_DEBUG
712 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
713 prefix_mmm <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
714 TEUCHOS_TEST_FOR_EXCEPTION(
715 !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
716 prefix_mmm <<
"Aprime and Bprime must both be fill complete. "
717 "Please report this bug to the Tpetra developers.");
718 #endif // HAVE_TPETRA_DEBUG
719 RCP<const map_type> CDomainMap = domainMap;
720 RCP<const map_type> CRangeMap = rangeMap;
721 if(CDomainMap.is_null())
723 CDomainMap = Bprime->getDomainMap();
725 if(CRangeMap.is_null())
727 CRangeMap = Bprime->getRangeMap();
729 assert(!(CDomainMap.is_null()));
730 assert(!(CRangeMap.is_null()));
731 typedef typename AddKern::values_array values_array;
732 typedef typename AddKern::row_ptrs_array row_ptrs_array;
733 typedef typename AddKern::col_inds_array col_inds_array;
734 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
735 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
737 row_ptrs_array rowptrs;
738 col_inds_array colinds;
739 #ifdef HAVE_TPETRA_MMM_TIMINGS
741 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"rowmap check/import"))));
743 if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
746 auto import = rcp(
new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
751 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *
import, Bprime->getDomainMap(), Bprime->getRangeMap());
753 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
754 bool sorted = AGraphSorted && BGraphSorted;
755 RCP<const import_type> Cimport = Teuchos::null;
756 RCP<export_type> Cexport = Teuchos::null;
757 bool doFillComplete =
true;
758 if(Teuchos::nonnull(params) && params->isParameter(
"Call fillComplete"))
760 doFillComplete = params->get<
bool>(
"Call fillComplete");
762 auto Alocal = Aprime->getLocalMatrix();
763 auto Blocal = Bprime->getLocalMatrix();
764 LO numLocalRows = Alocal.numRows();
765 if(numLocalRows == 0)
772 rowptrs = row_ptrs_array(
"C rowptrs", 0);
774 auto Acolmap = Aprime->getColMap();
775 auto Bcolmap = Bprime->getColMap();
778 using global_col_inds_array =
typename AddKern::global_col_inds_array;
779 #ifdef HAVE_TPETRA_MMM_TIMINGS
781 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mismatched col map full kernel"))));
784 auto AlocalColmap = Acolmap->getLocalMap();
785 auto BlocalColmap = Bcolmap->getLocalMap();
786 global_col_inds_array globalColinds;
788 std::ostringstream os;
789 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
790 <<
"Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
791 std::cerr << os.str ();
793 AddKern::convertToGlobalAndAdd(
794 Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
795 vals, rowptrs, globalColinds);
797 std::ostringstream os;
798 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
799 <<
"Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
800 std::cerr << os.str ();
802 #ifdef HAVE_TPETRA_MMM_TIMINGS
804 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Constructing graph"))));
806 RCP<const map_type> CcolMap;
807 Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
808 (CcolMap, CDomainMap, globalColinds);
810 col_inds_array localColinds(
"C colinds", globalColinds.extent(0));
811 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
812 ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
813 col_inds_array, global_col_inds_array,
814 typename map_type::local_map_type>
815 (localColinds, globalColinds, CcolMap->getLocalMap()));
816 KokkosKernels::Impl::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
825 auto Avals = Alocal.values;
826 auto Bvals = Blocal.values;
827 auto Arowptrs = Alocal.graph.row_map;
828 auto Browptrs = Blocal.graph.row_map;
829 auto Acolinds = Alocal.graph.entries;
830 auto Bcolinds = Blocal.graph.entries;
834 #ifdef HAVE_TPETRA_MMM_TIMINGS
836 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"sorted entries full kernel"))));
839 std::ostringstream os;
840 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
841 <<
"Call AddKern::addSorted(...)" << std::endl;
842 std::cerr << os.str ();
844 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
849 #ifdef HAVE_TPETRA_MMM_TIMINGS
851 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mm add unsorted entries full kernel"))));
854 std::ostringstream os;
855 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
856 <<
"Call AddKern::addUnsorted(...)" << std::endl;
857 std::cerr << os.str ();
859 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
862 RCP<const map_type> Ccolmap = Bcolmap;
867 #ifdef HAVE_TPETRA_MMM_TIMINGS
869 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Tpetra::Crs expertStaticFillComplete"))));
871 if(!CDomainMap->isSameAs(*Ccolmap))
874 std::ostringstream os;
875 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
876 <<
"Create Cimport" << std::endl;
877 std::cerr << os.str ();
879 Cimport = rcp(
new import_type(CDomainMap, Ccolmap));
884 std::ostringstream os;
885 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
886 <<
"Create Cexport" << std::endl;
887 std::cerr << os.str ();
889 Cexport = rcp(
new export_type(C.
getRowMap(), CRangeMap));
893 std::ostringstream os;
894 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
895 <<
"Call C->expertStaticFillComplete(...)" << std::endl;
896 std::cerr << os.str ();
903 template <
class Scalar,
916 using Teuchos::Array;
917 using Teuchos::ArrayRCP;
918 using Teuchos::ArrayView;
921 using Teuchos::rcp_dynamic_cast;
922 using Teuchos::rcpFromRef;
923 using Teuchos::tuple;
926 typedef Teuchos::ScalarTraits<Scalar> STS;
934 std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
936 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
937 prefix <<
"The case C == null does not actually work. Fixing this will require an interface change.");
939 TEUCHOS_TEST_FOR_EXCEPTION(
941 prefix <<
"Both input matrices must be fill complete before calling this function.");
943 #ifdef HAVE_TPETRA_DEBUG
945 const bool domainMapsSame =
949 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
950 prefix <<
"The domain Maps of Op(A) and Op(B) are not the same.");
952 const bool rangeMapsSame =
956 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
957 prefix <<
"The range Maps of Op(A) and Op(B) are not the same.");
959 #endif // HAVE_TPETRA_DEBUG
961 using Teuchos::ParameterList;
962 RCP<ParameterList> transposeParams (
new ParameterList);
963 transposeParams->set (
"sort",
false);
966 RCP<const crs_matrix_type> Aprime;
968 transposer_type theTransposer (rcpFromRef (A));
969 Aprime = theTransposer.createTranspose (transposeParams);
972 Aprime = rcpFromRef (A);
975 #ifdef HAVE_TPETRA_DEBUG
976 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
977 prefix <<
"Failed to compute Op(A). Please report this bug to the Tpetra developers.");
978 #endif // HAVE_TPETRA_DEBUG
981 RCP<const crs_matrix_type> Bprime;
983 transposer_type theTransposer (rcpFromRef (B));
984 Bprime = theTransposer.createTranspose (transposeParams);
987 Bprime = rcpFromRef (B);
990 #ifdef HAVE_TPETRA_DEBUG
991 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
992 prefix <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
993 #endif // HAVE_TPETRA_DEBUG
996 if (! C.is_null ()) {
997 C->setAllToScalar (STS::zero ());
1002 C = rcp (
new crs_matrix_type (Aprime->getRowMap (), 0));
1005 #ifdef HAVE_TPETRA_DEBUG
1006 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1007 prefix <<
"At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1008 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1009 prefix <<
"At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1010 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1011 prefix <<
"At this point, C is null. Please report this bug to the Tpetra developers.");
1012 #endif // HAVE_TPETRA_DEBUG
1014 Array<RCP<const crs_matrix_type> > Mat =
1015 tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1016 Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1019 for (
int k = 0; k < 2; ++k) {
1020 Array<GlobalOrdinal> Indices;
1021 Array<Scalar> Values;
1029 #ifdef HAVE_TPETRA_DEBUG
1030 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1031 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1032 #endif // HAVE_TPETRA_DEBUG
1033 RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1034 #ifdef HAVE_TPETRA_DEBUG
1035 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1036 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1037 #endif // HAVE_TPETRA_DEBUG
1039 const size_t localNumRows = Mat[k]->getNodeNumRows ();
1040 for (
size_t i = 0; i < localNumRows; ++i) {
1041 const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1042 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1043 if (numEntries > 0) {
1044 Indices.resize (numEntries);
1045 Values.resize (numEntries);
1046 Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);
1048 if (scalar[k] != STS::one ()) {
1049 for (
size_t j = 0; j < numEntries; ++j) {
1050 Values[j] *= scalar[k];
1054 if (C->isFillComplete ()) {
1055 C->sumIntoGlobalValues (globalRow, Indices, Values);
1057 C->insertGlobalValues (globalRow, Indices, Values);
1066 namespace MMdetails{
1070 template <
class TransferType>
1071 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer,
const std::string &label) {
1072 if (Transfer.is_null())
1075 const Distributor & Distor = Transfer->getDistributor();
1076 Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1078 size_t rows_send = Transfer->getNumExportIDs();
1079 size_t rows_recv = Transfer->getNumRemoteIDs();
1081 size_t round1_send = Transfer->getNumExportIDs() *
sizeof(size_t);
1082 size_t round1_recv = Transfer->getNumRemoteIDs() *
sizeof(size_t);
1083 size_t num_send_neighbors = Distor.getNumSends();
1084 size_t num_recv_neighbors = Distor.getNumReceives();
1085 size_t round2_send, round2_recv;
1086 Distor.getLastDoStatistics(round2_send,round2_recv);
1088 int myPID = Comm->getRank();
1089 int NumProcs = Comm->getSize();
1096 size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1097 size_t gstats_min[8], gstats_max[8];
1099 double lstats_avg[8], gstats_avg[8];
1100 for(
int i=0; i<8; i++)
1101 lstats_avg[i] = ((
double)lstats[i])/NumProcs;
1103 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1104 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1105 Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1108 printf(
"%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1109 (int)gstats_min[0],gstats_avg[0],(
int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(
int)gstats_max[2],
1110 (
int)gstats_min[4],gstats_avg[4],(
int)gstats_max[4], (
int)gstats_min[6],gstats_avg[6],(
int)gstats_max[6]);
1111 printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1112 (
int)gstats_min[1],gstats_avg[1],(
int)gstats_max[1], (
int)gstats_min[3],gstats_avg[3],(
int)gstats_max[3],
1113 (
int)gstats_min[5],gstats_avg[5],(
int)gstats_max[5], (
int)gstats_min[7],gstats_avg[7],(
int)gstats_max[7]);
1118 template<class Scalar,
1120 class GlobalOrdinal,
1122 void mult_AT_B_newmatrix(
1123 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1124 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1125 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1126 const std::
string & label,
1127 const Teuchos::RCP<Teuchos::ParameterList>& params)
1132 typedef LocalOrdinal LO;
1133 typedef GlobalOrdinal GO;
1135 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1136 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1138 #ifdef HAVE_TPETRA_MMM_TIMINGS
1139 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1140 using Teuchos::TimeMonitor;
1141 RCP<TimeMonitor> MM = rcp (
new TimeMonitor
1142 (*TimeMonitor::getNewTimer (prefix_mmm +
"MMM-T Transpose")));
1148 transposer_type transposer (rcpFromRef (A), label + std::string(
"XP: "));
1150 using Teuchos::ParameterList;
1151 RCP<ParameterList> transposeParams (
new ParameterList);
1152 transposeParams->set (
"sort",
false);
1153 if(! params.is_null ()) {
1154 transposeParams->set (
"compute global constants",
1155 params->get (
"compute global constants: temporaries",
1158 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1159 transposer.createTransposeLocal (transposeParams);
1164 #ifdef HAVE_TPETRA_MMM_TIMINGS
1166 MM = rcp (
new TimeMonitor
1167 (*TimeMonitor::getNewTimer (prefix_mmm + std::string (
"MMM-T I&X"))));
1171 crs_matrix_struct_type Aview;
1172 crs_matrix_struct_type Bview;
1173 RCP<const Import<LO, GO, NO> > dummyImporter;
1176 RCP<Teuchos::ParameterList> importParams1 (
new ParameterList);
1177 if (! params.is_null ()) {
1178 importParams1->set (
"compute global constants",
1179 params->get (
"compute global constants: temporaries",
1181 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1182 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1183 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1184 int mm_optimization_core_count =
1186 mm_optimization_core_count =
1187 slist.get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1188 int mm_optimization_core_count2 =
1189 params->get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1190 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1191 mm_optimization_core_count = mm_optimization_core_count2;
1193 auto & sip1 = importParams1->sublist (
"matrixmatrix: kernel params",
false);
1194 sip1.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1195 sip1.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1196 sip1.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1199 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1200 Aview, dummyImporter,
true,
1201 label, importParams1);
1203 RCP<ParameterList> importParams2 (
new ParameterList);
1204 if (! params.is_null ()) {
1205 importParams2->set (
"compute global constants",
1206 params->get (
"compute global constants: temporaries",
1208 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1209 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1210 bool overrideAllreduce = slist.get (
"MM_TAFC_OverrideAllreduceCheck",
false);
1211 int mm_optimization_core_count =
1213 mm_optimization_core_count =
1214 slist.get (
"MM_TAFC_OptimizationCoreCount",
1215 mm_optimization_core_count);
1216 int mm_optimization_core_count2 =
1217 params->get (
"MM_TAFC_OptimizationCoreCount",
1218 mm_optimization_core_count);
1219 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1220 mm_optimization_core_count = mm_optimization_core_count2;
1222 auto & sip2 = importParams2->sublist (
"matrixmatrix: kernel params",
false);
1223 sip2.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1224 sip2.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1225 sip2.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1228 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1229 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,
true, label,importParams2);
1232 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,
false, label,importParams2);
1235 #ifdef HAVE_TPETRA_MMM_TIMINGS
1237 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T AB-core"))));
1240 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1243 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1244 if (needs_final_export) {
1248 Ctemp = rcp (&C,
false);
1251 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1256 #ifdef HAVE_TPETRA_MMM_TIMINGS
1258 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T exportAndFillComplete"))));
1261 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C,
false);
1263 if (needs_final_export) {
1264 ParameterList labelList;
1265 labelList.set(
"Timer Label", label);
1266 if(!params.is_null()) {
1267 ParameterList& params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
1268 ParameterList& labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
1270 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1271 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1272 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1273 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,
"Core Count above which the optimized neighbor discovery is used");
1274 bool isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
1275 bool overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1277 labelList_subList.set (
"isMatrixMatrix_TransferAndFillComplete", isMM,
1278 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1279 labelList.set(
"compute global constants",params->get(
"compute global constants",
true));
1280 labelList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1282 Ctemp->exportAndFillComplete (Crcp,
1283 *Ctemp->getGraph ()->getExporter (),
1286 rcp (&labelList,
false));
1288 #ifdef HAVE_TPETRA_MMM_STATISTICS
1289 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(
" AT_B MMM"));
1295 template<
class Scalar,
1297 class GlobalOrdinal,
1300 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1301 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1302 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1303 const std::string& ,
1304 const Teuchos::RCP<Teuchos::ParameterList>& )
1306 using Teuchos::Array;
1307 using Teuchos::ArrayRCP;
1308 using Teuchos::ArrayView;
1309 using Teuchos::OrdinalTraits;
1310 using Teuchos::null;
1312 typedef Teuchos::ScalarTraits<Scalar> STS;
1314 LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1315 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1317 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1318 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1320 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1321 ArrayView<const GlobalOrdinal> bcols_import = null;
1322 if (Bview.importColMap != null) {
1323 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1324 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1326 bcols_import = Bview.importColMap->getNodeElementList();
1329 size_t C_numCols = C_lastCol - C_firstCol +
1330 OrdinalTraits<LocalOrdinal>::one();
1331 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1332 OrdinalTraits<LocalOrdinal>::one();
1334 if (C_numCols_import > C_numCols)
1335 C_numCols = C_numCols_import;
1337 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1338 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1339 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1341 Array<Scalar> C_row_i = dwork;
1342 Array<GlobalOrdinal> C_cols = iwork;
1343 Array<size_t> c_index = iwork2;
1344 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1345 Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1347 size_t C_row_i_length, j, k, last_index;
1350 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1351 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1352 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1353 if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1355 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1356 Aview.colMap->getMaxLocalIndex(); i++)
1361 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1362 Aview.colMap->getMaxLocalIndex(); i++) {
1363 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1364 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1365 if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1366 else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1376 ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1377 ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1378 ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1379 ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1380 ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1381 ArrayView<const Scalar> Avals, Bvals, Ivals;
1382 Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1383 Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1384 Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1385 Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1386 if(!Bview.importMatrix.is_null()) {
1387 Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1388 Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1391 bool C_filled = C.isFillComplete();
1393 for (
size_t i = 0; i < C_numCols; i++)
1394 c_index[i] = OrdinalTraits<size_t>::invalid();
1397 size_t Arows = Aview.rowMap->getNodeNumElements();
1398 for(
size_t i=0; i<Arows; ++i) {
1402 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1408 C_row_i_length = OrdinalTraits<size_t>::zero();
1410 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1411 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1412 const Scalar Aval = Avals[k];
1413 if (Aval == STS::zero())
1416 if (Ak == LO_INVALID)
1419 for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1420 LocalOrdinal col = Bcolind[j];
1423 if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1426 C_row_i[C_row_i_length] = Aval*Bvals[j];
1427 C_cols[C_row_i_length] = col;
1428 c_index[col] = C_row_i_length;
1432 C_row_i[c_index[col]] += Aval*Bvals[j];
1437 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1438 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1439 C_cols[ii] = bcols[C_cols[ii]];
1440 combined_index[ii] = C_cols[ii];
1441 combined_values[ii] = C_row_i[ii];
1443 last_index = C_row_i_length;
1449 C_row_i_length = OrdinalTraits<size_t>::zero();
1451 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1452 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1453 const Scalar Aval = Avals[k];
1454 if (Aval == STS::zero())
1457 if (Ak!=LO_INVALID)
continue;
1459 Ak = Acol2Irow[Acolind[k]];
1460 for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1461 LocalOrdinal col = Icolind[j];
1464 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1467 C_row_i[C_row_i_length] = Aval*Ivals[j];
1468 C_cols[C_row_i_length] = col;
1469 c_index[col] = C_row_i_length;
1474 C_row_i[c_index[col]] += Aval*Ivals[j];
1479 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1480 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1481 C_cols[ii] = bcols_import[C_cols[ii]];
1482 combined_index[last_index] = C_cols[ii];
1483 combined_values[last_index] = C_row_i[ii];
1490 C.sumIntoGlobalValues(
1492 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1493 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1495 C.insertGlobalValues(
1497 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1498 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1504 template<
class Scalar,
1506 class GlobalOrdinal,
1508 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1509 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1510 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1512 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1513 Mview.maxNumRowEntries = Mview.indices[0].size();
1515 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1516 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1517 Mview.maxNumRowEntries = Mview.indices[i].size();
1522 template<
class CrsMatrixType>
1523 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1525 size_t Aest = 100, Best=100;
1526 if (A.getNodeNumEntries() >= A.getNodeNumRows())
1527 Aest = (A.getNodeNumRows() > 0) ? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
1528 if (B.getNodeNumEntries() >= B.getNodeNumRows())
1529 Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
1531 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
1532 nnzperrow *= nnzperrow;
1534 return (
size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
1544 template<
class Scalar,
1546 class GlobalOrdinal,
1548 void mult_A_B_newmatrix(
1549 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1550 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1551 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1552 const std::string& label,
1553 const Teuchos::RCP<Teuchos::ParameterList>& params)
1555 using Teuchos::Array;
1556 using Teuchos::ArrayRCP;
1557 using Teuchos::ArrayView;
1562 typedef LocalOrdinal LO;
1563 typedef GlobalOrdinal GO;
1565 typedef Import<LO,GO,NO> import_type;
1566 typedef Map<LO,GO,NO> map_type;
1569 typedef typename map_type::local_map_type local_map_type;
1571 typedef typename KCRS::StaticCrsGraphType graph_t;
1572 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1573 typedef typename NO::execution_space execution_space;
1574 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1575 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1577 #ifdef HAVE_TPETRA_MMM_TIMINGS
1578 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1579 using Teuchos::TimeMonitor;
1580 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM M5 Cmap")))));
1582 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1585 RCP<const import_type> Cimport;
1586 RCP<const map_type> Ccolmap;
1587 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1588 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1589 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1590 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1591 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1592 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1593 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1594 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1601 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1603 if (Bview.importMatrix.is_null()) {
1606 Ccolmap = Bview.colMap;
1607 const LO colMapSize =
static_cast<LO
>(Bview.colMap->getNodeNumElements());
1609 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1610 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1611 KOKKOS_LAMBDA(
const LO i) {
1624 if (!Bimport.is_null() && !Iimport.is_null()) {
1625 Cimport = Bimport->setUnion(*Iimport);
1627 else if (!Bimport.is_null() && Iimport.is_null()) {
1628 Cimport = Bimport->setUnion();
1630 else if (Bimport.is_null() && !Iimport.is_null()) {
1631 Cimport = Iimport->setUnion();
1634 throw std::runtime_error(
"TpetraExt::MMM status of matrix importers is nonsensical");
1636 Ccolmap = Cimport->getTargetMap();
1641 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1642 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1649 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1650 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1651 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1652 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1654 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1655 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1664 C.replaceColMap(Ccolmap);
1682 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
1683 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
1685 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
1686 GO aidx = Acolmap_local.getGlobalElement(i);
1687 LO B_LID = Browmap_local.getLocalElement(aidx);
1688 if (B_LID != LO_INVALID) {
1689 targetMapToOrigRow(i) = B_LID;
1690 targetMapToImportRow(i) = LO_INVALID;
1692 LO I_LID = Irowmap_local.getLocalElement(aidx);
1693 targetMapToOrigRow(i) = LO_INVALID;
1694 targetMapToImportRow(i) = I_LID;
1701 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1708 template<
class Scalar,
1710 class GlobalOrdinal,
1712 class LocalOrdinalViewType>
1713 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1714 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1715 const LocalOrdinalViewType & targetMapToOrigRow,
1716 const LocalOrdinalViewType & targetMapToImportRow,
1717 const LocalOrdinalViewType & Bcol2Ccol,
1718 const LocalOrdinalViewType & Icol2Ccol,
1719 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1720 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
1721 const std::string& label,
1722 const Teuchos::RCP<Teuchos::ParameterList>& params) {
1723 #ifdef HAVE_TPETRA_MMM_TIMINGS
1724 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1725 using Teuchos::TimeMonitor;
1726 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore"))));
1729 using Teuchos::Array;
1730 using Teuchos::ArrayRCP;
1731 using Teuchos::ArrayView;
1737 typedef typename KCRS::StaticCrsGraphType graph_t;
1738 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1739 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1740 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1741 typedef typename KCRS::values_type::non_const_type scalar_view_t;
1744 typedef LocalOrdinal LO;
1745 typedef GlobalOrdinal GO;
1747 typedef Map<LO,GO,NO> map_type;
1748 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1749 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1750 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1753 RCP<const map_type> Ccolmap = C.getColMap();
1754 size_t m = Aview.origMatrix->getNodeNumRows();
1755 size_t n = Ccolmap->getNodeNumElements();
1756 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
1759 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1760 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
1762 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
1763 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
1764 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
1766 c_lno_view_t Irowptr;
1767 lno_nnz_view_t Icolind;
1768 scalar_view_t Ivals;
1769 if(!Bview.importMatrix.is_null()) {
1770 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
1771 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
1772 Ivals = Bview.importMatrix->getLocalMatrix().values;
1773 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
1776 #ifdef HAVE_TPETRA_MMM_TIMINGS
1777 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
1787 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1788 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
1789 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
1790 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
1800 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1801 std::vector<size_t> c_status(n, ST_INVALID);
1811 size_t CSR_ip = 0, OLD_ip = 0;
1812 for (
size_t i = 0; i < m; i++) {
1815 Crowptr[i] = CSR_ip;
1818 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1819 LO Aik = Acolind[k];
1820 const SC Aval = Avals[k];
1821 if (Aval == SC_ZERO)
1824 if (targetMapToOrigRow[Aik] != LO_INVALID) {
1831 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
1834 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
1835 LO Bkj = Bcolind[j];
1836 LO Cij = Bcol2Ccol[Bkj];
1838 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
1840 c_status[Cij] = CSR_ip;
1841 Ccolind[CSR_ip] = Cij;
1842 Cvals[CSR_ip] = Aval*Bvals[j];
1846 Cvals[c_status[Cij]] += Aval*Bvals[j];
1857 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
1858 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
1859 LO Ikj = Icolind[j];
1860 LO Cij = Icol2Ccol[Ikj];
1862 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
1864 c_status[Cij] = CSR_ip;
1865 Ccolind[CSR_ip] = Cij;
1866 Cvals[CSR_ip] = Aval*Ivals[j];
1869 Cvals[c_status[Cij]] += Aval*Ivals[j];
1876 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
1878 Kokkos::resize(Ccolind,CSR_alloc);
1879 Kokkos::resize(Cvals,CSR_alloc);
1884 Crowptr[m] = CSR_ip;
1887 Kokkos::resize(Ccolind,CSR_ip);
1888 Kokkos::resize(Cvals,CSR_ip);
1890 #ifdef HAVE_TPETRA_MMM_TIMINGS
1892 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix Final Sort")));
1896 if (params.is_null() || params->get(
"sort entries",
true))
1897 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
1898 C.setAllValues(Crowptr,Ccolind, Cvals);
1901 #ifdef HAVE_TPETRA_MMM_TIMINGS
1903 auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix ESFC")));
1915 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
1916 labelList->set(
"Timer Label",label);
1917 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
1918 RCP<const Export<LO,GO,NO> > dummyExport;
1919 C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
1920 #ifdef HAVE_TPETRA_MMM_TIMINGS
1922 MM2 = Teuchos::null;
1932 template<
class Scalar,
1934 class GlobalOrdinal,
1936 void mult_A_B_reuse(
1937 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1938 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1939 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1940 const std::string& label,
1941 const Teuchos::RCP<Teuchos::ParameterList>& params)
1943 using Teuchos::Array;
1944 using Teuchos::ArrayRCP;
1945 using Teuchos::ArrayView;
1950 typedef LocalOrdinal LO;
1951 typedef GlobalOrdinal GO;
1953 typedef Import<LO,GO,NO> import_type;
1954 typedef Map<LO,GO,NO> map_type;
1957 typedef typename map_type::local_map_type local_map_type;
1959 typedef typename KCRS::StaticCrsGraphType graph_t;
1960 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1961 typedef typename NO::execution_space execution_space;
1962 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1963 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1965 #ifdef HAVE_TPETRA_MMM_TIMINGS
1966 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1967 using Teuchos::TimeMonitor;
1968 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse Cmap"))));
1970 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1973 RCP<const import_type> Cimport = C.getGraph()->getImporter();
1974 RCP<const map_type> Ccolmap = C.getColMap();
1975 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1976 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1977 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1978 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1979 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1980 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1983 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1987 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1988 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1991 if (!Bview.importMatrix.is_null()) {
1992 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1993 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1995 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1996 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
1997 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2003 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2004 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2005 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2006 GO aidx = Acolmap_local.getGlobalElement(i);
2007 LO B_LID = Browmap_local.getLocalElement(aidx);
2008 if (B_LID != LO_INVALID) {
2009 targetMapToOrigRow(i) = B_LID;
2010 targetMapToImportRow(i) = LO_INVALID;
2012 LO I_LID = Irowmap_local.getLocalElement(aidx);
2013 targetMapToOrigRow(i) = LO_INVALID;
2014 targetMapToImportRow(i) = I_LID;
2021 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2025 template<
class Scalar,
2027 class GlobalOrdinal,
2029 class LocalOrdinalViewType>
2030 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2031 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2032 const LocalOrdinalViewType & targetMapToOrigRow,
2033 const LocalOrdinalViewType & targetMapToImportRow,
2034 const LocalOrdinalViewType & Bcol2Ccol,
2035 const LocalOrdinalViewType & Icol2Ccol,
2036 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2037 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2038 const std::string& label,
2039 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2040 #ifdef HAVE_TPETRA_MMM_TIMINGS
2041 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2042 using Teuchos::TimeMonitor;
2043 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
2044 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2054 typedef typename KCRS::StaticCrsGraphType graph_t;
2055 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2056 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2057 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2060 typedef LocalOrdinal LO;
2061 typedef GlobalOrdinal GO;
2063 typedef Map<LO,GO,NO> map_type;
2064 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2065 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2066 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2069 RCP<const map_type> Ccolmap = C.getColMap();
2070 size_t m = Aview.origMatrix->getNodeNumRows();
2071 size_t n = Ccolmap->getNodeNumElements();
2074 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2075 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2076 const KCRS & Cmat = C.getLocalMatrix();
2078 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2079 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2080 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2081 scalar_view_t Cvals = Cmat.values;
2083 c_lno_view_t Irowptr;
2084 lno_nnz_view_t Icolind;
2085 scalar_view_t Ivals;
2086 if(!Bview.importMatrix.is_null()) {
2087 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2088 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2089 Ivals = Bview.importMatrix->getLocalMatrix().values;
2092 #ifdef HAVE_TPETRA_MMM_TIMINGS
2093 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
2105 std::vector<size_t> c_status(n, ST_INVALID);
2108 size_t CSR_ip = 0, OLD_ip = 0;
2109 for (
size_t i = 0; i < m; i++) {
2112 OLD_ip = Crowptr[i];
2113 CSR_ip = Crowptr[i+1];
2114 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
2115 c_status[Ccolind[k]] = k;
2121 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2122 LO Aik = Acolind[k];
2123 const SC Aval = Avals[k];
2124 if (Aval == SC_ZERO)
2127 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2129 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2131 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2132 LO Bkj = Bcolind[j];
2133 LO Cij = Bcol2Ccol[Bkj];
2135 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2136 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2137 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2139 Cvals[c_status[Cij]] += Aval * Bvals[j];
2144 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2145 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2146 LO Ikj = Icolind[j];
2147 LO Cij = Icol2Ccol[Ikj];
2149 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2150 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2151 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2153 Cvals[c_status[Cij]] += Aval * Ivals[j];
2159 #ifdef HAVE_TPETRA_MMM_TIMINGS
2160 auto MM3 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse ESFC"))));
2163 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2169 template<
class Scalar,
2171 class GlobalOrdinal,
2173 void jacobi_A_B_newmatrix(
2175 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2176 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2177 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2178 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2179 const std::string& label,
2180 const Teuchos::RCP<Teuchos::ParameterList>& params)
2182 using Teuchos::Array;
2183 using Teuchos::ArrayRCP;
2184 using Teuchos::ArrayView;
2188 typedef LocalOrdinal LO;
2189 typedef GlobalOrdinal GO;
2192 typedef Import<LO,GO,NO> import_type;
2193 typedef Map<LO,GO,NO> map_type;
2194 typedef typename map_type::local_map_type local_map_type;
2198 typedef typename KCRS::StaticCrsGraphType graph_t;
2199 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2200 typedef typename NO::execution_space execution_space;
2201 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2202 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2205 #ifdef HAVE_TPETRA_MMM_TIMINGS
2206 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2207 using Teuchos::TimeMonitor;
2208 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi M5 Cmap"))));
2210 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2213 RCP<const import_type> Cimport;
2214 RCP<const map_type> Ccolmap;
2215 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2216 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2217 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2218 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2219 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2220 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2221 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2222 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2229 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2231 if (Bview.importMatrix.is_null()) {
2234 Ccolmap = Bview.colMap;
2238 Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getNodeNumElements ()));
2239 Kokkos::parallel_for (range, KOKKOS_LAMBDA (
const size_t i) {
2240 Bcol2Ccol(i) =
static_cast<LO
> (i);
2251 if (!Bimport.is_null() && !Iimport.is_null()){
2252 Cimport = Bimport->setUnion(*Iimport);
2253 Ccolmap = Cimport->getTargetMap();
2255 }
else if (!Bimport.is_null() && Iimport.is_null()) {
2256 Cimport = Bimport->setUnion();
2258 }
else if(Bimport.is_null() && !Iimport.is_null()) {
2259 Cimport = Iimport->setUnion();
2262 throw std::runtime_error(
"TpetraExt::Jacobi status of matrix importers is nonsensical");
2264 Ccolmap = Cimport->getTargetMap();
2266 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2267 std::runtime_error,
"Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2274 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2275 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2276 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2277 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2279 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2280 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2289 C.replaceColMap(Ccolmap);
2307 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2308 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2309 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2310 GO aidx = Acolmap_local.getGlobalElement(i);
2311 LO B_LID = Browmap_local.getLocalElement(aidx);
2312 if (B_LID != LO_INVALID) {
2313 targetMapToOrigRow(i) = B_LID;
2314 targetMapToImportRow(i) = LO_INVALID;
2316 LO I_LID = Irowmap_local.getLocalElement(aidx);
2317 targetMapToOrigRow(i) = LO_INVALID;
2318 targetMapToImportRow(i) = I_LID;
2325 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2334 template<
class Scalar,
2336 class GlobalOrdinal,
2338 class LocalOrdinalViewType>
2339 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2340 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2341 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2342 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2343 const LocalOrdinalViewType & targetMapToOrigRow,
2344 const LocalOrdinalViewType & targetMapToImportRow,
2345 const LocalOrdinalViewType & Bcol2Ccol,
2346 const LocalOrdinalViewType & Icol2Ccol,
2347 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2348 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2349 const std::string& label,
2350 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2352 #ifdef HAVE_TPETRA_MMM_TIMINGS
2353 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2354 using Teuchos::TimeMonitor;
2355 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Nemwmatrix SerialCore")));
2358 using Teuchos::Array;
2359 using Teuchos::ArrayRCP;
2360 using Teuchos::ArrayView;
2366 typedef typename KCRS::StaticCrsGraphType graph_t;
2367 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2368 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2369 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2370 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2373 typedef typename scalar_view_t::memory_space scalar_memory_space;
2376 typedef LocalOrdinal LO;
2377 typedef GlobalOrdinal GO;
2380 typedef Map<LO,GO,NO> map_type;
2381 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2382 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2385 RCP<const map_type> Ccolmap = C.getColMap();
2386 size_t m = Aview.origMatrix->getNodeNumRows();
2387 size_t n = Ccolmap->getNodeNumElements();
2388 size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2391 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2392 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2394 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2395 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2396 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2398 c_lno_view_t Irowptr;
2399 lno_nnz_view_t Icolind;
2400 scalar_view_t Ivals;
2401 if(!Bview.importMatrix.is_null()) {
2402 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2403 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2404 Ivals = Bview.importMatrix->getLocalMatrix().values;
2405 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2409 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
2417 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2418 Array<size_t> c_status(n, ST_INVALID);
2427 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2428 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
2429 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
2430 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
2431 size_t CSR_ip = 0, OLD_ip = 0;
2433 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2447 for (
size_t i = 0; i < m; i++) {
2450 Crowptr[i] = CSR_ip;
2451 SC minusOmegaDval = -omega*Dvals(i,0);
2454 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2455 Scalar Bval = Bvals[j];
2456 if (Bval == SC_ZERO)
2458 LO Bij = Bcolind[j];
2459 LO Cij = Bcol2Ccol[Bij];
2462 c_status[Cij] = CSR_ip;
2463 Ccolind[CSR_ip] = Cij;
2464 Cvals[CSR_ip] = Bvals[j];
2469 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2470 LO Aik = Acolind[k];
2471 const SC Aval = Avals[k];
2472 if (Aval == SC_ZERO)
2475 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2477 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2479 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2480 LO Bkj = Bcolind[j];
2481 LO Cij = Bcol2Ccol[Bkj];
2483 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2485 c_status[Cij] = CSR_ip;
2486 Ccolind[CSR_ip] = Cij;
2487 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2491 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2497 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2498 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2499 LO Ikj = Icolind[j];
2500 LO Cij = Icol2Ccol[Ikj];
2502 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2504 c_status[Cij] = CSR_ip;
2505 Ccolind[CSR_ip] = Cij;
2506 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2509 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2516 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2518 Kokkos::resize(Ccolind,CSR_alloc);
2519 Kokkos::resize(Cvals,CSR_alloc);
2523 Crowptr[m] = CSR_ip;
2526 Kokkos::resize(Ccolind,CSR_ip);
2527 Kokkos::resize(Cvals,CSR_ip);
2530 #ifdef HAVE_TPETRA_MMM_TIMINGS
2531 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix Final Sort")));
2538 C.replaceColMap(Ccolmap);
2545 if (params.is_null() || params->get(
"sort entries",
true))
2546 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2547 C.setAllValues(Crowptr,Ccolind, Cvals);
2550 #ifdef HAVE_TPETRA_MMM_TIMINGS
2551 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix ESFC")));
2562 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2563 labelList->set(
"Timer Label",label);
2564 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2565 RCP<const Export<LO,GO,NO> > dummyExport;
2566 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2574 template<
class Scalar,
2576 class GlobalOrdinal,
2578 void jacobi_A_B_reuse(
2580 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2581 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2582 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2583 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2584 const std::string& label,
2585 const Teuchos::RCP<Teuchos::ParameterList>& params)
2587 using Teuchos::Array;
2588 using Teuchos::ArrayRCP;
2589 using Teuchos::ArrayView;
2593 typedef LocalOrdinal LO;
2594 typedef GlobalOrdinal GO;
2597 typedef Import<LO,GO,NO> import_type;
2598 typedef Map<LO,GO,NO> map_type;
2601 typedef typename map_type::local_map_type local_map_type;
2603 typedef typename KCRS::StaticCrsGraphType graph_t;
2604 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2605 typedef typename NO::execution_space execution_space;
2606 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2607 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2609 #ifdef HAVE_TPETRA_MMM_TIMINGS
2610 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2611 using Teuchos::TimeMonitor;
2612 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse Cmap"))));
2614 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2617 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2618 RCP<const map_type> Ccolmap = C.getColMap();
2619 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2620 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2621 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2622 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2623 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2624 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2627 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2631 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2632 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2635 if (!Bview.importMatrix.is_null()) {
2636 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2637 std::runtime_error,
"Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2639 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2640 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(
const LO i) {
2641 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2647 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2648 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2649 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2650 GO aidx = Acolmap_local.getGlobalElement(i);
2651 LO B_LID = Browmap_local.getLocalElement(aidx);
2652 if (B_LID != LO_INVALID) {
2653 targetMapToOrigRow(i) = B_LID;
2654 targetMapToImportRow(i) = LO_INVALID;
2656 LO I_LID = Irowmap_local.getLocalElement(aidx);
2657 targetMapToOrigRow(i) = LO_INVALID;
2658 targetMapToImportRow(i) = I_LID;
2663 #ifdef HAVE_TPETRA_MMM_TIMINGS
2669 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2675 template<
class Scalar,
2677 class GlobalOrdinal,
2679 class LocalOrdinalViewType>
2680 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2681 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2682 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2683 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2684 const LocalOrdinalViewType & targetMapToOrigRow,
2685 const LocalOrdinalViewType & targetMapToImportRow,
2686 const LocalOrdinalViewType & Bcol2Ccol,
2687 const LocalOrdinalViewType & Icol2Ccol,
2688 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2689 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2690 const std::string& label,
2691 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2692 #ifdef HAVE_TPETRA_MMM_TIMINGS
2693 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2694 using Teuchos::TimeMonitor;
2695 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore"))));
2696 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2705 typedef typename KCRS::StaticCrsGraphType graph_t;
2706 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2707 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2708 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2709 typedef typename scalar_view_t::memory_space scalar_memory_space;
2712 typedef LocalOrdinal LO;
2713 typedef GlobalOrdinal GO;
2715 typedef Map<LO,GO,NO> map_type;
2716 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2717 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2718 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2721 RCP<const map_type> Ccolmap = C.getColMap();
2722 size_t m = Aview.origMatrix->getNodeNumRows();
2723 size_t n = Ccolmap->getNodeNumElements();
2726 const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2727 const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2728 const KCRS & Cmat = C.getLocalMatrix();
2730 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2731 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2732 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2733 scalar_view_t Cvals = Cmat.values;
2735 c_lno_view_t Irowptr;
2736 lno_nnz_view_t Icolind;
2737 scalar_view_t Ivals;
2738 if(!Bview.importMatrix.is_null()) {
2739 Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2740 Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2741 Ivals = Bview.importMatrix->getLocalMatrix().values;
2745 auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
2747 #ifdef HAVE_TPETRA_MMM_TIMINGS
2748 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore - Compare"))));
2756 std::vector<size_t> c_status(n, ST_INVALID);
2759 size_t CSR_ip = 0, OLD_ip = 0;
2760 for (
size_t i = 0; i < m; i++) {
2764 OLD_ip = Crowptr[i];
2765 CSR_ip = Crowptr[i+1];
2766 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
2767 c_status[Ccolind[k]] = k;
2773 SC minusOmegaDval = -omega*Dvals(i,0);
2776 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2777 Scalar Bval = Bvals[j];
2778 if (Bval == SC_ZERO)
2780 LO Bij = Bcolind[j];
2781 LO Cij = Bcol2Ccol[Bij];
2783 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2784 std::runtime_error,
"Trying to insert a new entry into a static graph");
2786 Cvals[c_status[Cij]] = Bvals[j];
2790 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2791 LO Aik = Acolind[k];
2792 const SC Aval = Avals[k];
2793 if (Aval == SC_ZERO)
2796 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2798 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2800 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2801 LO Bkj = Bcolind[j];
2802 LO Cij = Bcol2Ccol[Bkj];
2804 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2805 std::runtime_error,
"Trying to insert a new entry into a static graph");
2807 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
2812 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2813 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2814 LO Ikj = Icolind[j];
2815 LO Cij = Icol2Ccol[Ikj];
2817 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2818 std::runtime_error,
"Trying to insert a new entry into a static graph");
2820 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
2826 #ifdef HAVE_TPETRA_MMM_TIMINGS
2827 MM2 = Teuchos::null;
2828 MM2 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
2831 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2832 #ifdef HAVE_TPETRA_MMM_TIMINGS
2833 MM2 = Teuchos::null;
2842 template<
class Scalar,
2844 class GlobalOrdinal,
2846 void import_and_extract_views(
2847 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
2848 Teuchos::RCP<
const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
2849 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2850 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
2851 bool userAssertsThereAreNoRemotes,
2852 const std::string& label,
2853 const Teuchos::RCP<Teuchos::ParameterList>& params)
2855 using Teuchos::Array;
2856 using Teuchos::ArrayView;
2859 using Teuchos::null;
2862 typedef LocalOrdinal LO;
2863 typedef GlobalOrdinal GO;
2866 typedef Map<LO,GO,NO> map_type;
2867 typedef Import<LO,GO,NO> import_type;
2868 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
2870 #ifdef HAVE_TPETRA_MMM_TIMINGS
2871 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2872 using Teuchos::TimeMonitor;
2873 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Alloc"))));
2881 Aview.deleteContents();
2883 Aview.origMatrix = rcp(&A,
false);
2884 Aview.origRowMap = A.getRowMap();
2885 Aview.rowMap = targetMap;
2886 Aview.colMap = A.getColMap();
2887 Aview.domainMap = A.getDomainMap();
2888 Aview.importColMap = null;
2889 RCP<const map_type> rowMap = A.getRowMap();
2890 const int numProcs = rowMap->getComm()->getSize();
2893 if (userAssertsThereAreNoRemotes || numProcs < 2)
2896 RCP<const import_type> importer;
2897 if (params != null && params->isParameter(
"importer")) {
2898 importer = params->get<RCP<const import_type> >(
"importer");
2901 #ifdef HAVE_TPETRA_MMM_TIMINGS
2903 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap"))));
2908 RCP<const map_type> remoteRowMap;
2909 size_t numRemote = 0;
2911 if (!prototypeImporter.is_null() &&
2912 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
2913 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
2915 #ifdef HAVE_TPETRA_MMM_TIMINGS
2916 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode1"));
2918 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
2919 numRemote = prototypeImporter->getNumRemoteIDs();
2921 Array<GO> remoteRows(numRemote);
2922 for (
size_t i = 0; i < numRemote; i++)
2923 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
2925 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2926 rowMap->getIndexBase(), rowMap->getComm()));
2929 }
else if (prototypeImporter.is_null()) {
2931 #ifdef HAVE_TPETRA_MMM_TIMINGS
2932 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode2"));
2934 ArrayView<const GO> rows = targetMap->getNodeElementList();
2935 size_t numRows = targetMap->getNodeNumElements();
2937 Array<GO> remoteRows(numRows);
2938 for(
size_t i = 0; i < numRows; ++i) {
2939 const LO mlid = rowMap->getLocalElement(rows[i]);
2941 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
2942 remoteRows[numRemote++] = rows[i];
2944 remoteRows.resize(numRemote);
2945 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2946 rowMap->getIndexBase(), rowMap->getComm()));
2955 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
2956 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
2965 #ifdef HAVE_TPETRA_MMM_TIMINGS
2967 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Collective-0"))));
2971 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (
global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
2973 if (globalMaxNumRemote > 0) {
2974 #ifdef HAVE_TPETRA_MMM_TIMINGS
2976 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-2"))));
2980 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
2982 importer = rcp(
new import_type(rowMap, remoteRowMap));
2984 throw std::runtime_error(
"prototypeImporter->SourceMap() does not match A.getRowMap()!");
2988 params->set(
"importer", importer);
2991 if (importer != null) {
2992 #ifdef HAVE_TPETRA_MMM_TIMINGS
2994 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-3"))));
2998 Teuchos::ParameterList labelList;
2999 labelList.set(
"Timer Label", label);
3000 auto & labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
3003 bool overrideAllreduce =
false;
3006 Teuchos::ParameterList params_sublist;
3007 if(!params.is_null()) {
3008 labelList.set(
"compute global constants", params->get(
"compute global constants",
false));
3009 params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
3010 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3011 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3012 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3013 isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
3014 overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
3016 labelList_subList.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
3017 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3018 labelList_subList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3020 Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3021 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3027 sprintf(str,
"import_matrix.%d.dat",count);
3032 #ifdef HAVE_TPETRA_MMM_STATISTICS
3033 printMultiplicationStatistics(importer, label + std::string(
" I&X MMM"));
3037 #ifdef HAVE_TPETRA_MMM_TIMINGS
3039 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-4"))));
3043 Aview.importColMap = Aview.importMatrix->getColMap();
3044 #ifdef HAVE_TPETRA_MMM_TIMINGS
3056 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
3058 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3059 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3060 const LocalOrdinalViewType & Acol2Brow,
3061 const LocalOrdinalViewType & Acol2Irow,
3062 const LocalOrdinalViewType & Bcol2Ccol,
3063 const LocalOrdinalViewType & Icol2Ccol,
3064 const size_t mergedNodeNumCols) {
3068 typedef typename KCRS::StaticCrsGraphType graph_t;
3069 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3070 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3071 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3073 const KCRS & Ak = Aview.origMatrix->getLocalMatrix();
3074 const KCRS & Bk = Bview.origMatrix->getLocalMatrix();
3077 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3081 RCP<const KCRS> Ik_;
3082 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrix());
3083 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3085 if(Ik!=0) Iks = *Ik;
3086 size_t merge_numrows = Ak.numCols();
3088 lno_view_t Mrowptr(
"Mrowptr", merge_numrows + 1);
3090 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3093 typedef typename Node::execution_space execution_space;
3094 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3095 Kokkos::parallel_scan (
"Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3096 KOKKOS_LAMBDA(
const size_t i,
size_t& update,
const bool final) {
3097 if(
final) Mrowptr(i) = update;
3100 if(Acol2Brow(i)!=LO_INVALID)
3101 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3103 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3106 if(
final && i+1==merge_numrows)
3107 Mrowptr(i+1)=update;
3111 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3112 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing(
"Mcolind"),merge_nnz);
3113 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing(
"Mvals"),merge_nnz);
3116 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3117 Kokkos::parallel_for (
"Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(
const size_t i) {
3118 if(Acol2Brow(i)!=LO_INVALID) {
3119 size_t row = Acol2Brow(i);
3120 size_t start = Bk.graph.row_map(row);
3121 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3122 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3123 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3127 size_t row = Acol2Irow(i);
3128 size_t start = Iks.graph.row_map(row);
3129 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3130 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3131 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3136 KCRS newmat(
"CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3145 template<
typename SC,
typename LO,
typename GO,
typename NO>
3146 void AddKernels<SC, LO, GO, NO>::
3148 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3149 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3150 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3151 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3152 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3153 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3154 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3155 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3156 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3157 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3158 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3160 using Teuchos::TimeMonitor;
3161 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3162 auto nrows = Arowptrs.extent(0) - 1;
3163 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3164 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
typename col_inds_array::size_type, LO, impl_scalar_type,
3165 execution_space, memory_space, memory_space> KKH;
3167 handle.create_spadd_handle(
true);
3168 auto addHandle = handle.get_spadd_handle();
3169 #ifdef HAVE_TPETRA_MMM_TIMINGS
3170 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted symbolic")));
3172 KokkosSparse::Experimental::spadd_symbolic
3174 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3175 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3176 row_ptrs_array, col_inds_array>
3177 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3179 Cvals = values_array(
"C values", addHandle->get_max_result_nnz());
3180 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_max_result_nnz());
3181 #ifdef HAVE_TPETRA_MMM_TIMINGS
3183 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted numeric")));
3185 KokkosSparse::Experimental::spadd_numeric(&handle,
3186 Arowptrs, Acolinds, Avals, scalarA,
3187 Browptrs, Bcolinds, Bvals, scalarB,
3188 Crowptrs, Ccolinds, Cvals);
3191 template<
typename SC,
typename LO,
typename GO,
typename NO>
3192 void AddKernels<SC, LO, GO, NO>::
3194 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3195 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3196 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3197 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3198 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3199 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3200 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3201 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3203 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3204 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3205 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3207 using Teuchos::TimeMonitor;
3208 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3209 auto nrows = Arowptrs.extent(0) - 1;
3210 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3211 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3212 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
typename col_inds_array::size_type, LO, AddKern::impl_scalar_type,
3213 AddKern::execution_space, AddKern::memory_space, AddKern::memory_space> KKH;
3215 handle.create_spadd_handle(
false);
3216 auto addHandle = handle.get_spadd_handle();
3217 #ifdef HAVE_TPETRA_MMM_TIMINGS
3218 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted symbolic")));
3220 KokkosSparse::Experimental::spadd_symbolic
3222 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3223 typename row_ptrs_array::const_type,
typename col_inds_array::const_type,
3224 row_ptrs_array, col_inds_array>
3225 (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3227 Cvals = values_array(
"C values", addHandle->get_max_result_nnz());
3228 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_max_result_nnz());
3229 #ifdef HAVE_TPETRA_MMM_TIMINGS
3231 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted kernel: sorted numeric")));
3233 KokkosSparse::Experimental::spadd_numeric(&handle,
3234 Arowptrs, Acolinds, Avals, scalarA,
3235 Browptrs, Bcolinds, Bvals, scalarB,
3236 Crowptrs, Ccolinds, Cvals);
3239 template<
typename GO,
3240 typename LocalIndicesType,
3241 typename GlobalIndicesType,
3242 typename ColMapType>
3243 struct ConvertLocalToGlobalFunctor
3245 ConvertLocalToGlobalFunctor(
3246 const LocalIndicesType& colindsOrig_,
3247 const GlobalIndicesType& colindsConverted_,
3248 const ColMapType& colmap_) :
3249 colindsOrig (colindsOrig_),
3250 colindsConverted (colindsConverted_),
3253 KOKKOS_INLINE_FUNCTION
void
3254 operator() (
const GO i)
const
3256 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3258 LocalIndicesType colindsOrig;
3259 GlobalIndicesType colindsConverted;
3263 template<
typename SC,
typename LO,
typename GO,
typename NO>
3264 void MMdetails::AddKernels<SC, LO, GO, NO>::
3265 convertToGlobalAndAdd(
3266 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3267 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3268 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3269 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3270 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3271 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3272 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3273 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3274 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3276 using Teuchos::TimeMonitor;
3278 const values_array Avals = A.values;
3279 const values_array Bvals = B.values;
3280 const col_inds_array Acolinds = A.graph.entries;
3281 const col_inds_array Bcolinds = B.graph.entries;
3282 auto Arowptrs = A.graph.row_map;
3283 auto Browptrs = B.graph.row_map;
3284 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"A colinds (converted)"), Acolinds.extent(0));
3285 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"B colinds (converted)"), Bcolinds.extent(0));
3286 #ifdef HAVE_TPETRA_MMM_TIMINGS
3287 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string(
"column map conversion"))));
3289 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3290 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3291 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3292 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3293 typedef KokkosKernels::Experimental::KokkosKernelsHandle<
typename col_inds_array::size_type, GO, impl_scalar_type,
3294 execution_space, memory_space, memory_space> KKH;
3296 handle.create_spadd_handle(
false);
3297 auto addHandle = handle.get_spadd_handle();
3298 #ifdef HAVE_TPETRA_MMM_TIMINGS
3300 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3302 auto nrows = Arowptrs.extent(0) - 1;
3303 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3304 KokkosSparse::Experimental::spadd_symbolic
3305 <KKH,
typename row_ptrs_array::const_type,
typename global_col_inds_array::const_type,
typename row_ptrs_array::const_type,
typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
3306 (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3307 Cvals = values_array(
"C values", addHandle->get_max_result_nnz());
3308 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_max_result_nnz());
3309 #ifdef HAVE_TPETRA_MMM_TIMINGS
3311 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3313 KokkosSparse::Experimental::spadd_numeric(&handle,
3314 Arowptrs, AcolindsConverted, Avals, scalarA,
3315 Browptrs, BcolindsConverted, Bvals, scalarB,
3316 Crowptrs, Ccolinds, Cvals);
3332 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3334 void MatrixMatrix::Multiply( \
3335 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3337 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3339 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3340 bool call_FillComplete_on_result, \
3341 const std::string & label, \
3342 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3345 void MatrixMatrix::Jacobi( \
3347 const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3348 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3349 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3350 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3351 bool call_FillComplete_on_result, \
3352 const std::string & label, \
3353 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3356 void MatrixMatrix::Add( \
3357 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3360 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3363 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
3366 void MatrixMatrix::Add( \
3367 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3370 CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3374 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3375 MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3376 (const SCALAR & alpha, \
3377 const bool transposeA, \
3378 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3379 const SCALAR & beta, \
3380 const bool transposeB, \
3381 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3382 const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3383 const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3384 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3388 MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3389 (const SCALAR & alpha, \
3390 const bool transposeA, \
3391 const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3392 const SCALAR& beta, \
3393 const bool transposeB, \
3394 const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3395 CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3396 const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3397 const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3398 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3400 template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>;
3404 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
bool isFillActive() const
Whether the matrix is not fill complete.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Sparse matrix-matrix multiply.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void setAllValues(const typename local_matrix_type::row_map_type &ptr, const typename local_graph_type::entries_type::non_const_type &ind, const typename local_matrix_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Matrix Market file readers and writers for Tpetra objects.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.