10 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_DEF_HPP
13 #include "KokkosSparse_Utils.hpp"
14 #include "Tpetra_ConfigDefs.hpp"
16 #include "Teuchos_VerboseObject.hpp"
17 #include "Teuchos_Array.hpp"
19 #include "Tpetra_CrsMatrix.hpp"
20 #include "Tpetra_BlockCrsMatrix.hpp"
22 #include "Tpetra_RowMatrixTransposer.hpp"
25 #include "Tpetra_Details_makeColMap.hpp"
26 #include "Tpetra_ConfigDefs.hpp"
27 #include "Tpetra_Map.hpp"
28 #include "Tpetra_Export.hpp"
32 #include <type_traits>
33 #include "Teuchos_FancyOStream.hpp"
35 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
38 #include "KokkosSparse_spgemm.hpp"
39 #include "KokkosSparse_spadd.hpp"
40 #include "Kokkos_Bitset.hpp"
54 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
55 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
56 #include "TpetraExt_MatrixMatrix_HIP.hpp"
57 #include "TpetraExt_MatrixMatrix_SYCL.hpp"
61 namespace MatrixMatrix{
69 template <
class Scalar,
79 bool call_FillComplete_on_result,
80 const std::string& label,
81 const Teuchos::RCP<Teuchos::ParameterList>& params)
87 typedef LocalOrdinal LO;
88 typedef GlobalOrdinal GO;
96 #ifdef HAVE_TPETRA_MMM_TIMINGS
97 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
98 using Teuchos::TimeMonitor;
101 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Setup"))));
104 const std::string prefix =
"TpetraExt::MatrixMatrix::Multiply(): ";
109 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
110 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
115 RCP<const crs_matrix_type> Aprime = null;
119 RCP<const crs_matrix_type> Bprime = null;
129 const bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
131 bool use_optimized_ATB =
false;
132 if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
133 use_optimized_ATB =
true;
135 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
136 use_optimized_ATB =
false;
139 using Teuchos::ParameterList;
140 RCP<ParameterList> transposeParams (
new ParameterList);
141 transposeParams->set (
"sort",
true);
143 if (!use_optimized_ATB && transposeA) {
144 transposer_type transposer (rcpFromRef (A));
145 Aprime = transposer.createTranspose (transposeParams);
148 Aprime = rcpFromRef(A);
152 transposer_type transposer (rcpFromRef (B));
153 Bprime = transposer.createTranspose (transposeParams);
156 Bprime = rcpFromRef(B);
166 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
167 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
168 "must match for matrix-matrix product. op(A) is "
169 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
175 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
176 prefix <<
"ERROR, dimensions of result C must "
178 <<
" rows, should have at least " << Aouter << std::endl);
190 int numProcs = A.
getComm()->getSize();
194 crs_matrix_struct_type Aview;
195 crs_matrix_struct_type Bview;
197 RCP<const map_type> targetMap_A = Aprime->getRowMap();
198 RCP<const map_type> targetMap_B = Bprime->getRowMap();
200 #ifdef HAVE_TPETRA_MMM_TIMINGS
202 TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All I&X")));
208 if (!use_optimized_ATB) {
209 RCP<const import_type> dummyImporter;
210 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label, params);
216 targetMap_B = Aprime->getColMap();
219 if (!use_optimized_ATB)
220 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
222 #ifdef HAVE_TPETRA_MMM_TIMINGS
225 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Multiply"))));
229 if (use_optimized_ATB) {
230 MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
232 }
else if (call_FillComplete_on_result && newFlag) {
233 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
235 }
else if (call_FillComplete_on_result) {
236 MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
242 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
244 MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
247 #ifdef HAVE_TPETRA_MMM_TIMINGS
248 TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All FillComplete")));
257 C.
fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
265 template <
class Scalar,
275 const std::string& label)
281 typedef LocalOrdinal LO;
282 typedef GlobalOrdinal GO;
288 std::string prefix = std::string(
"TpetraExt ") + label + std::string(
": ");
290 TEUCHOS_TEST_FOR_EXCEPTION(transposeA==
true, std::runtime_error, prefix <<
"Matrix A cannot be transposed.");
291 TEUCHOS_TEST_FOR_EXCEPTION(transposeB==
true, std::runtime_error, prefix <<
"Matrix B cannot be transposed.");
303 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
304 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
305 "must match for matrix-matrix product. op(A) is "
306 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
310 int numProcs = A->getComm()->getSize();
312 const LO blocksize = A->getBlockSize();
313 TEUCHOS_TEST_FOR_EXCEPTION(blocksize != B->getBlockSize(), std::runtime_error,
314 prefix <<
"ERROR, Blocksizes do not match. A.blocksize = " <<
315 blocksize <<
", B.blocksize = " << B->getBlockSize() );
319 blockcrs_matrix_struct_type Aview(blocksize);
320 blockcrs_matrix_struct_type Bview(blocksize);
322 RCP<const map_type> targetMap_A = A->getRowMap();
323 RCP<const map_type> targetMap_B = B->getRowMap();
326 RCP<const import_type> dummyImporter;
327 MMdetails::import_and_extract_views(*A, targetMap_A, Aview, dummyImporter,
true);
332 targetMap_B = A->getColMap();
335 MMdetails::import_and_extract_views(*B, targetMap_B, Bview, A->getGraph()->getImporter(),
336 A->getGraph()->getImporter().is_null());
339 MMdetails::mult_A_B_newmatrix(Aview, Bview, C);
342 template <
class Scalar,
351 bool call_FillComplete_on_result,
352 const std::string& label,
353 const Teuchos::RCP<Teuchos::ParameterList>& params)
357 typedef LocalOrdinal LO;
358 typedef GlobalOrdinal GO;
365 #ifdef HAVE_TPETRA_MMM_TIMINGS
366 std::string prefix_mmm = std::string(
"TpetraExt ")+ label + std::string(
": ");
367 using Teuchos::TimeMonitor;
368 TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string(
"Jacobi All Setup")));
371 const std::string prefix =
"TpetraExt::MatrixMatrix::Jacobi(): ";
376 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
377 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
379 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
380 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
389 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
390 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
391 "must match for matrix-matrix product. op(A) is "
392 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
398 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
399 prefix <<
"ERROR, dimensions of result C must "
401 <<
" rows, should have at least "<< Aouter << std::endl);
413 int numProcs = A.
getComm()->getSize();
417 crs_matrix_struct_type Aview;
418 crs_matrix_struct_type Bview;
420 RCP<const map_type> targetMap_A = Aprime->getRowMap();
421 RCP<const map_type> targetMap_B = Bprime->getRowMap();
423 #ifdef HAVE_TPETRA_MMM_TIMINGS
424 TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All I&X")));
430 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(
new Teuchos::ParameterList);
431 if(!params.is_null()) {
432 importParams1->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
433 int mm_optimization_core_count=0;
434 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
436 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
437 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
438 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
439 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
440 auto & ip1slist = importParams1->sublist(
"matrixmatrix: kernel params",
false);
441 ip1slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
442 ip1slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
443 ip1slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
447 RCP<const import_type> dummyImporter;
448 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label,importParams1);
453 targetMap_B = Aprime->getColMap();
458 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(
new Teuchos::ParameterList);
459 if(!params.is_null()) {
460 importParams2->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
462 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
464 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
465 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
466 auto & ip2slist = importParams2->sublist(
"matrixmatrix: kernel params",
false);
467 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
468 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
469 ip2slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
470 ip2slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
471 ip2slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
474 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
476 #ifdef HAVE_TPETRA_MMM_TIMINGS
478 TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All Multiply")));
482 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
485 bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
487 if (call_FillComplete_on_result && newFlag) {
488 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
490 }
else if (call_FillComplete_on_result) {
491 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
494 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::runtime_error,
"jacobi_A_B_general not implemented");
497 if(!params.is_null()) {
498 bool removeZeroEntries = params->get(
"remove zeros",
false);
499 if (removeZeroEntries) {
500 typedef Teuchos::ScalarTraits<Scalar> STS;
501 typename STS::magnitudeType threshold = params->get(
"remove zeros threshold", STS::magnitude(STS::zero()));
508 template <
class Scalar,
519 using Teuchos::Array;
523 typedef LocalOrdinal LO;
524 typedef GlobalOrdinal GO;
529 const std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
531 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error,
532 prefix <<
"ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
533 "(Result matrix B is not required to be isFillComplete()).");
534 TEUCHOS_TEST_FOR_EXCEPTION(B.
isFillComplete() , std::runtime_error,
535 prefix <<
"ERROR, input matrix B must not be fill complete!");
536 TEUCHOS_TEST_FOR_EXCEPTION(B.
isStaticGraph() , std::runtime_error,
537 prefix <<
"ERROR, input matrix B must not have static graph!");
539 prefix <<
"ERROR, input matrix B must not be locally indexed!");
541 using Teuchos::ParameterList;
542 RCP<ParameterList> transposeParams (
new ParameterList);
543 transposeParams->set (
"sort",
false);
545 RCP<const crs_matrix_type> Aprime = null;
547 transposer_type transposer (rcpFromRef (A));
548 Aprime = transposer.createTranspose (transposeParams);
551 Aprime = rcpFromRef(A);
559 if (scalarB != Teuchos::ScalarTraits<SC>::one())
563 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
564 for (LO i = 0; (size_t)i < numMyRows; ++i) {
565 row = B.
getRowMap()->getGlobalElement(i);
566 Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
568 if (scalarA != Teuchos::ScalarTraits<SC>::one()) {
569 for (
size_t j = 0; j < a_numEntries; ++j)
570 a_vals[j] *= scalarA;
572 B.
insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
577 template <
class Scalar,
581 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
583 const bool transposeA,
586 const bool transposeB,
590 const Teuchos::RCP<Teuchos::ParameterList>& params)
593 using Teuchos::rcpFromRef;
595 using Teuchos::ParameterList;
597 if(!params.is_null())
599 TEUCHOS_TEST_FOR_EXCEPTION(
600 params->isParameter(
"Call fillComplete") && !params->get<
bool>(
"Call fillComplete"),
601 std::invalid_argument,
602 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
603 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
604 params->set(
"Call fillComplete",
true);
608 RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
615 TEUCHOS_TEST_FOR_EXCEPTION
616 (! A.
isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
617 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
618 RCP<crs_matrix_type> C = rcp(
new crs_matrix_type(Brcp->getRowMap(), 0));
620 add(alpha, transposeA, A, beta,
false, *Brcp, *C, domainMap, rangeMap, params);
628 template<
class LO,
class GO,
class LOView,
class GOView,
class LocalMap>
629 struct ConvertGlobalToLocalFunctor
631 ConvertGlobalToLocalFunctor(LOView& lids_,
const GOView& gids_,
const LocalMap localColMap_)
632 : lids(lids_), gids(gids_), localColMap(localColMap_)
635 KOKKOS_FUNCTION
void operator() (
const GO i)
const
637 lids(i) = localColMap.getLocalElement(gids(i));
642 const LocalMap localColMap;
645 template <
class Scalar,
651 const bool transposeA,
654 const bool transposeB,
659 const Teuchos::RCP<Teuchos::ParameterList>& params)
663 using Teuchos::rcpFromRef;
664 using Teuchos::rcp_implicit_cast;
665 using Teuchos::rcp_dynamic_cast;
666 using Teuchos::TimeMonitor;
668 using LO = LocalOrdinal;
669 using GO = GlobalOrdinal;
677 using exec_space =
typename crs_graph_type::execution_space;
678 using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
679 const char* prefix_mmm =
"TpetraExt::MatrixMatrix::add: ";
680 constexpr
bool debug =
false;
682 #ifdef HAVE_TPETRA_MMM_TIMINGS
683 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"transpose"))));
687 std::ostringstream os;
688 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
689 <<
"TpetraExt::MatrixMatrix::add" << std::endl;
690 std::cerr << os.str ();
693 TEUCHOS_TEST_FOR_EXCEPTION
695 prefix_mmm <<
"C must be a 'new' matrix (neither locally nor globally indexed).");
696 TEUCHOS_TEST_FOR_EXCEPTION
698 prefix_mmm <<
"A and B must both be fill complete.");
699 #ifdef HAVE_TPETRA_DEBUG
702 const bool domainMapsSame =
703 (! transposeA && ! transposeB &&
705 (! transposeA && transposeB &&
707 ( transposeA && ! transposeB &&
709 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
710 prefix_mmm <<
"The domain Maps of Op(A) and Op(B) are not the same.");
712 const bool rangeMapsSame =
713 (! transposeA && ! transposeB &&
715 (! transposeA && transposeB &&
717 ( transposeA && ! transposeB &&
719 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
720 prefix_mmm <<
"The range Maps of Op(A) and Op(B) are not the same.");
722 #endif // HAVE_TPETRA_DEBUG
724 using Teuchos::ParameterList;
726 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
728 transposer_type transposer (Aprime);
729 Aprime = transposer.createTranspose ();
732 #ifdef HAVE_TPETRA_DEBUG
733 TEUCHOS_TEST_FOR_EXCEPTION
734 (Aprime.is_null (), std::logic_error,
735 prefix_mmm <<
"Failed to compute Op(A). "
736 "Please report this bug to the Tpetra developers.");
737 #endif // HAVE_TPETRA_DEBUG
740 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
743 std::ostringstream os;
744 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
745 <<
"Form explicit xpose of B" << std::endl;
746 std::cerr << os.str ();
748 transposer_type transposer (Bprime);
749 Bprime = transposer.createTranspose ();
751 #ifdef HAVE_TPETRA_DEBUG
752 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
753 prefix_mmm <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
754 TEUCHOS_TEST_FOR_EXCEPTION(
755 !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
756 prefix_mmm <<
"Aprime and Bprime must both be fill complete. "
757 "Please report this bug to the Tpetra developers.");
758 #endif // HAVE_TPETRA_DEBUG
759 RCP<const map_type> CDomainMap = domainMap;
760 RCP<const map_type> CRangeMap = rangeMap;
761 if(CDomainMap.is_null())
763 CDomainMap = Bprime->getDomainMap();
765 if(CRangeMap.is_null())
767 CRangeMap = Bprime->getRangeMap();
769 assert(!(CDomainMap.is_null()));
770 assert(!(CRangeMap.is_null()));
771 typedef typename AddKern::values_array values_array;
772 typedef typename AddKern::row_ptrs_array row_ptrs_array;
773 typedef typename AddKern::col_inds_array col_inds_array;
774 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
775 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
777 row_ptrs_array rowptrs;
778 col_inds_array colinds;
779 #ifdef HAVE_TPETRA_MMM_TIMINGS
781 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"rowmap check/import"))));
783 if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
786 auto import = rcp(
new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
791 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *
import, Bprime->getDomainMap(), Bprime->getRangeMap());
793 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
794 bool sorted = AGraphSorted && BGraphSorted;
795 RCP<const import_type> Cimport = Teuchos::null;
796 RCP<export_type> Cexport = Teuchos::null;
797 bool doFillComplete =
true;
798 if(Teuchos::nonnull(params) && params->isParameter(
"Call fillComplete"))
800 doFillComplete = params->get<
bool>(
"Call fillComplete");
802 auto Alocal = Aprime->getLocalMatrixDevice();
803 auto Blocal = Bprime->getLocalMatrixDevice();
804 LO numLocalRows = Alocal.numRows();
805 if(numLocalRows == 0)
812 rowptrs = row_ptrs_array(
"C rowptrs", 0);
814 auto Acolmap = Aprime->getColMap();
815 auto Bcolmap = Bprime->getColMap();
818 using global_col_inds_array =
typename AddKern::global_col_inds_array;
819 #ifdef HAVE_TPETRA_MMM_TIMINGS
821 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mismatched col map full kernel"))));
824 auto AlocalColmap = Acolmap->getLocalMap();
825 auto BlocalColmap = Bcolmap->getLocalMap();
826 global_col_inds_array globalColinds;
828 std::ostringstream os;
829 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
830 <<
"Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
831 std::cerr << os.str ();
833 AddKern::convertToGlobalAndAdd(
834 Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
835 vals, rowptrs, globalColinds);
837 std::ostringstream os;
838 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
839 <<
"Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
840 std::cerr << os.str ();
842 #ifdef HAVE_TPETRA_MMM_TIMINGS
844 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Constructing graph"))));
846 RCP<const map_type> CcolMap;
847 Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
848 (CcolMap, CDomainMap, globalColinds);
850 col_inds_array localColinds(
"C colinds", globalColinds.extent(0));
851 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
852 ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
853 col_inds_array, global_col_inds_array,
854 typename map_type::local_map_type>
855 (localColinds, globalColinds, CcolMap->getLocalMap()));
856 KokkosSparse::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
865 auto Avals = Alocal.values;
866 auto Bvals = Blocal.values;
867 auto Arowptrs = Alocal.graph.row_map;
868 auto Browptrs = Blocal.graph.row_map;
869 auto Acolinds = Alocal.graph.entries;
870 auto Bcolinds = Blocal.graph.entries;
874 #ifdef HAVE_TPETRA_MMM_TIMINGS
876 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"sorted entries full kernel"))));
879 std::ostringstream os;
880 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
881 <<
"Call AddKern::addSorted(...)" << std::endl;
882 std::cerr << os.str ();
884 #if KOKKOSKERNELS_VERSION >= 40299
885 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
887 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
893 #ifdef HAVE_TPETRA_MMM_TIMINGS
895 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mm add unsorted entries full kernel"))));
898 std::ostringstream os;
899 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
900 <<
"Call AddKern::addUnsorted(...)" << std::endl;
901 std::cerr << os.str ();
903 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
906 RCP<const map_type> Ccolmap = Bcolmap;
911 #ifdef HAVE_TPETRA_MMM_TIMINGS
913 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Tpetra::Crs expertStaticFillComplete"))));
915 if(!CDomainMap->isSameAs(*Ccolmap))
918 std::ostringstream os;
919 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
920 <<
"Create Cimport" << std::endl;
921 std::cerr << os.str ();
923 Cimport = rcp(
new import_type(CDomainMap, Ccolmap));
928 std::ostringstream os;
929 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
930 <<
"Create Cexport" << std::endl;
931 std::cerr << os.str ();
933 Cexport = rcp(
new export_type(C.
getRowMap(), CRangeMap));
937 std::ostringstream os;
938 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
939 <<
"Call C->expertStaticFillComplete(...)" << std::endl;
940 std::cerr << os.str ();
949 template <
class Scalar,
962 using Teuchos::Array;
963 using Teuchos::ArrayRCP;
964 using Teuchos::ArrayView;
967 using Teuchos::rcp_dynamic_cast;
968 using Teuchos::rcpFromRef;
969 using Teuchos::tuple;
972 typedef Teuchos::ScalarTraits<Scalar> STS;
980 std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
982 TEUCHOS_TEST_FOR_EXCEPTION(
984 prefix <<
"A and B must both be fill complete before calling this function.");
988 prefix <<
"C is null (must be allocated), but A.haveGlobalConstants() is false. "
989 "Please report this bug to the Tpetra developers.");
991 prefix <<
"C is null (must be allocated), but B.haveGlobalConstants() is false. "
992 "Please report this bug to the Tpetra developers.");
995 #ifdef HAVE_TPETRA_DEBUG
997 const bool domainMapsSame =
1001 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
1002 prefix <<
"The domain Maps of Op(A) and Op(B) are not the same.");
1004 const bool rangeMapsSame =
1008 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
1009 prefix <<
"The range Maps of Op(A) and Op(B) are not the same.");
1011 #endif // HAVE_TPETRA_DEBUG
1013 using Teuchos::ParameterList;
1014 RCP<ParameterList> transposeParams (
new ParameterList);
1015 transposeParams->set (
"sort",
false);
1018 RCP<const crs_matrix_type> Aprime;
1020 transposer_type theTransposer (rcpFromRef (A));
1021 Aprime = theTransposer.createTranspose (transposeParams);
1024 Aprime = rcpFromRef (A);
1027 #ifdef HAVE_TPETRA_DEBUG
1028 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1029 prefix <<
"Failed to compute Op(A). Please report this bug to the Tpetra developers.");
1030 #endif // HAVE_TPETRA_DEBUG
1033 RCP<const crs_matrix_type> Bprime;
1035 transposer_type theTransposer (rcpFromRef (B));
1036 Bprime = theTransposer.createTranspose (transposeParams);
1039 Bprime = rcpFromRef (B);
1042 #ifdef HAVE_TPETRA_DEBUG
1043 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1044 prefix <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
1045 #endif // HAVE_TPETRA_DEBUG
1047 bool CwasFillComplete =
false;
1050 if (! C.is_null ()) {
1051 CwasFillComplete = C->isFillComplete();
1052 if(CwasFillComplete)
1054 C->setAllToScalar (STS::zero ());
1064 if(Aprime->getRowMap()->isSameAs(*Bprime->getRowMap())) {
1065 LocalOrdinal numLocalRows = Aprime->getLocalNumRows();
1066 Array<size_t> CmaxEntriesPerRow(numLocalRows);
1067 for(LocalOrdinal i = 0; i < numLocalRows; i++) {
1068 CmaxEntriesPerRow[i] = Aprime->getNumEntriesInLocalRow(i) + Bprime->getNumEntriesInLocalRow(i);
1070 C = rcp (
new crs_matrix_type (Aprime->getRowMap (), CmaxEntriesPerRow()));
1074 C = rcp (
new crs_matrix_type (Aprime->getRowMap (), Aprime->getGlobalMaxNumRowEntries() + Bprime->getGlobalMaxNumRowEntries()));
1078 #ifdef HAVE_TPETRA_DEBUG
1079 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1080 prefix <<
"At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1081 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1082 prefix <<
"At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1083 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1084 prefix <<
"At this point, C is null. Please report this bug to the Tpetra developers.");
1085 #endif // HAVE_TPETRA_DEBUG
1087 Array<RCP<const crs_matrix_type> > Mat =
1088 tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1089 Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1092 for (
int k = 0; k < 2; ++k) {
1093 typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1094 typename crs_matrix_type::nonconst_values_host_view_type Values;
1102 #ifdef HAVE_TPETRA_DEBUG
1103 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1104 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1105 #endif // HAVE_TPETRA_DEBUG
1106 RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1107 #ifdef HAVE_TPETRA_DEBUG
1108 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1109 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1110 #endif // HAVE_TPETRA_DEBUG
1112 const size_t localNumRows = Mat[k]->getLocalNumRows ();
1113 for (
size_t i = 0; i < localNumRows; ++i) {
1114 const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1115 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1116 if (numEntries > 0) {
1117 if(numEntries > Indices.extent(0)) {
1118 Kokkos::resize(Indices, numEntries);
1119 Kokkos::resize(Values, numEntries);
1121 Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1123 if (scalar[k] != STS::one ()) {
1124 for (
size_t j = 0; j < numEntries; ++j) {
1125 Values[j] *= scalar[k];
1129 if (CwasFillComplete) {
1130 size_t result = C->sumIntoGlobalValues (globalRow, numEntries,
1131 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1132 TEUCHOS_TEST_FOR_EXCEPTION(result != numEntries, std::logic_error,
1133 prefix <<
"sumIntoGlobalValues failed to add entries from A or B into C.");
1135 C->insertGlobalValues (globalRow, numEntries,
1136 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1141 if(CwasFillComplete) {
1142 C->fillComplete(C->getDomainMap (),
1149 template <
class Scalar,
1151 class GlobalOrdinal,
1162 std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
1164 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::invalid_argument,
1165 prefix <<
"C must not be null");
1167 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > C_ = C;
1168 Add(A, transposeA, scalarA, B, transposeB, scalarB, C_);
1173 namespace MMdetails{
1225 template<
class Scalar,
1227 class GlobalOrdinal,
1229 void mult_AT_B_newmatrix(
1230 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1231 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1232 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1233 const std::string & label,
1234 const Teuchos::RCP<Teuchos::ParameterList>& params)
1239 typedef LocalOrdinal LO;
1240 typedef GlobalOrdinal GO;
1242 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1243 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1245 #ifdef HAVE_TPETRA_MMM_TIMINGS
1246 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1247 using Teuchos::TimeMonitor;
1248 RCP<TimeMonitor> MM = rcp (
new TimeMonitor
1249 (*TimeMonitor::getNewTimer (prefix_mmm +
"MMM-T Transpose")));
1255 transposer_type transposer (rcpFromRef (A), label + std::string(
"XP: "));
1257 using Teuchos::ParameterList;
1258 RCP<ParameterList> transposeParams (
new ParameterList);
1259 transposeParams->set (
"sort",
true);
1260 if(! params.is_null ()) {
1261 transposeParams->set (
"compute global constants",
1262 params->get (
"compute global constants: temporaries",
1265 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1266 transposer.createTransposeLocal (transposeParams);
1271 #ifdef HAVE_TPETRA_MMM_TIMINGS
1273 MM = rcp (
new TimeMonitor
1274 (*TimeMonitor::getNewTimer (prefix_mmm + std::string (
"MMM-T I&X"))));
1278 crs_matrix_struct_type Aview;
1279 crs_matrix_struct_type Bview;
1280 RCP<const Import<LO, GO, NO> > dummyImporter;
1283 RCP<Teuchos::ParameterList> importParams1 (
new ParameterList);
1284 if (! params.is_null ()) {
1285 importParams1->set (
"compute global constants",
1286 params->get (
"compute global constants: temporaries",
1288 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1289 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1290 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1291 int mm_optimization_core_count =
1293 mm_optimization_core_count =
1294 slist.get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1295 int mm_optimization_core_count2 =
1296 params->get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1297 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1298 mm_optimization_core_count = mm_optimization_core_count2;
1300 auto & sip1 = importParams1->sublist (
"matrixmatrix: kernel params",
false);
1301 sip1.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1302 sip1.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1303 sip1.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1306 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1307 Aview, dummyImporter,
true,
1308 label, importParams1);
1310 RCP<ParameterList> importParams2 (
new ParameterList);
1311 if (! params.is_null ()) {
1312 importParams2->set (
"compute global constants",
1313 params->get (
"compute global constants: temporaries",
1315 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1316 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1317 bool overrideAllreduce = slist.get (
"MM_TAFC_OverrideAllreduceCheck",
false);
1318 int mm_optimization_core_count =
1320 mm_optimization_core_count =
1321 slist.get (
"MM_TAFC_OptimizationCoreCount",
1322 mm_optimization_core_count);
1323 int mm_optimization_core_count2 =
1324 params->get (
"MM_TAFC_OptimizationCoreCount",
1325 mm_optimization_core_count);
1326 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1327 mm_optimization_core_count = mm_optimization_core_count2;
1329 auto & sip2 = importParams2->sublist (
"matrixmatrix: kernel params",
false);
1330 sip2.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1331 sip2.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1332 sip2.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1335 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1336 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,
true, label,importParams2);
1339 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,
false, label,importParams2);
1342 #ifdef HAVE_TPETRA_MMM_TIMINGS
1344 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T AB-core"))));
1347 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1350 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1351 if (needs_final_export) {
1355 Ctemp = rcp (&C,
false);
1358 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1363 #ifdef HAVE_TPETRA_MMM_TIMINGS
1365 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T exportAndFillComplete"))));
1368 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C,
false);
1370 if (needs_final_export) {
1371 ParameterList labelList;
1372 labelList.set(
"Timer Label", label);
1373 if(!params.is_null()) {
1374 ParameterList& params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
1375 ParameterList& labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
1377 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1378 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1379 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1380 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,
"Core Count above which the optimized neighbor discovery is used");
1381 bool isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
1382 bool overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1384 labelList_subList.set (
"isMatrixMatrix_TransferAndFillComplete", isMM,
1385 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1386 labelList.set(
"compute global constants",params->get(
"compute global constants",
true));
1387 labelList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1389 Ctemp->exportAndFillComplete (Crcp,
1390 *Ctemp->getGraph ()->getExporter (),
1393 rcp (&labelList,
false));
1395 #ifdef HAVE_TPETRA_MMM_STATISTICS
1396 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(
" AT_B MMM"));
1402 template<
class Scalar,
1404 class GlobalOrdinal,
1407 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1408 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1409 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1410 const std::string& ,
1411 const Teuchos::RCP<Teuchos::ParameterList>& )
1413 using Teuchos::Array;
1414 using Teuchos::ArrayRCP;
1415 using Teuchos::ArrayView;
1416 using Teuchos::OrdinalTraits;
1417 using Teuchos::null;
1419 typedef Teuchos::ScalarTraits<Scalar> STS;
1421 LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1422 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1424 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1425 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1427 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getLocalElementList();
1428 ArrayView<const GlobalOrdinal> bcols_import = null;
1429 if (Bview.importColMap != null) {
1430 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1431 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1433 bcols_import = Bview.importColMap->getLocalElementList();
1436 size_t C_numCols = C_lastCol - C_firstCol +
1437 OrdinalTraits<LocalOrdinal>::one();
1438 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1439 OrdinalTraits<LocalOrdinal>::one();
1441 if (C_numCols_import > C_numCols)
1442 C_numCols = C_numCols_import;
1444 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1445 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1446 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1448 Array<Scalar> C_row_i = dwork;
1449 Array<GlobalOrdinal> C_cols = iwork;
1450 Array<size_t> c_index = iwork2;
1451 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1452 Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1454 size_t C_row_i_length, j, k, last_index;
1457 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1458 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1459 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1460 if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1462 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1463 Aview.colMap->getMaxLocalIndex(); i++)
1468 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1469 Aview.colMap->getMaxLocalIndex(); i++) {
1470 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1471 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1472 if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1473 else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1483 auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1484 auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1485 auto Avals = Aview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1486 auto Browptr = Bview.origMatrix->getLocalRowPtrsHost();
1487 auto Bcolind = Bview.origMatrix->getLocalIndicesHost();
1488 auto Bvals = Bview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1489 decltype(Browptr) Irowptr;
1490 decltype(Bcolind) Icolind;
1491 decltype(Bvals) Ivals;
1492 if(!Bview.importMatrix.is_null()) {
1493 Irowptr = Bview.importMatrix->getLocalRowPtrsHost();
1494 Icolind = Bview.importMatrix->getLocalIndicesHost();
1495 Ivals = Bview.importMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1498 bool C_filled = C.isFillComplete();
1500 for (
size_t i = 0; i < C_numCols; i++)
1501 c_index[i] = OrdinalTraits<size_t>::invalid();
1504 size_t Arows = Aview.rowMap->getLocalNumElements();
1505 for(
size_t i=0; i<Arows; ++i) {
1509 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1515 C_row_i_length = OrdinalTraits<size_t>::zero();
1517 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1518 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1519 const Scalar Aval = Avals[k];
1520 if (Aval == STS::zero())
1523 if (Ak == LO_INVALID)
1526 for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1527 LocalOrdinal col = Bcolind[j];
1530 if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1533 C_row_i[C_row_i_length] = Aval*Bvals[j];
1534 C_cols[C_row_i_length] = col;
1535 c_index[col] = C_row_i_length;
1540 C_row_i[c_index[col]] += Aval *
static_cast<Scalar
>(Bvals[j]);
1545 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1546 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1547 C_cols[ii] = bcols[C_cols[ii]];
1548 combined_index[ii] = C_cols[ii];
1549 combined_values[ii] = C_row_i[ii];
1551 last_index = C_row_i_length;
1557 C_row_i_length = OrdinalTraits<size_t>::zero();
1559 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1560 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1561 const Scalar Aval = Avals[k];
1562 if (Aval == STS::zero())
1565 if (Ak!=LO_INVALID)
continue;
1567 Ak = Acol2Irow[Acolind[k]];
1568 for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1569 LocalOrdinal col = Icolind[j];
1572 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1575 C_row_i[C_row_i_length] = Aval*Ivals[j];
1576 C_cols[C_row_i_length] = col;
1577 c_index[col] = C_row_i_length;
1583 C_row_i[c_index[col]] += Aval *
static_cast<Scalar
>(Ivals[j]);
1588 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1589 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1590 C_cols[ii] = bcols_import[C_cols[ii]];
1591 combined_index[last_index] = C_cols[ii];
1592 combined_values[last_index] = C_row_i[ii];
1599 C.sumIntoGlobalValues(
1601 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1602 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1604 C.insertGlobalValues(
1606 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1607 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1613 template<
class Scalar,
1615 class GlobalOrdinal,
1617 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1618 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1619 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1621 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1622 Mview.maxNumRowEntries = Mview.indices[0].size();
1624 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1625 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1626 Mview.maxNumRowEntries = Mview.indices[i].size();
1631 template<
class CrsMatrixType>
1632 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1634 size_t Aest = 100, Best=100;
1635 if (A.getLocalNumEntries() >= A.getLocalNumRows())
1636 Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
1637 if (B.getLocalNumEntries() >= B.getLocalNumRows())
1638 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
1640 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
1641 nnzperrow *= nnzperrow;
1643 return (
size_t)(A.getLocalNumRows()*nnzperrow*0.75 + 100);
1653 template<
class Scalar,
1655 class GlobalOrdinal,
1657 void mult_A_B_newmatrix(
1658 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1659 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1660 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1661 const std::string& label,
1662 const Teuchos::RCP<Teuchos::ParameterList>& params)
1664 using Teuchos::Array;
1665 using Teuchos::ArrayRCP;
1666 using Teuchos::ArrayView;
1671 typedef LocalOrdinal LO;
1672 typedef GlobalOrdinal GO;
1674 typedef Import<LO,GO,NO> import_type;
1675 typedef Map<LO,GO,NO> map_type;
1678 typedef typename map_type::local_map_type local_map_type;
1680 typedef typename KCRS::StaticCrsGraphType graph_t;
1681 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1682 typedef typename NO::execution_space execution_space;
1683 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1684 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1686 #ifdef HAVE_TPETRA_MMM_TIMINGS
1687 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1688 using Teuchos::TimeMonitor;
1689 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM M5 Cmap")))));
1691 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1694 RCP<const import_type> Cimport;
1695 RCP<const map_type> Ccolmap;
1696 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1697 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1698 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1699 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1700 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1701 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1702 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1703 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1710 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1712 if (Bview.importMatrix.is_null()) {
1715 Ccolmap = Bview.colMap;
1716 const LO colMapSize =
static_cast<LO
>(Bview.colMap->getLocalNumElements());
1718 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1719 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1720 KOKKOS_LAMBDA(
const LO i) {
1733 if (!Bimport.is_null() && !Iimport.is_null()) {
1734 Cimport = Bimport->setUnion(*Iimport);
1736 else if (!Bimport.is_null() && Iimport.is_null()) {
1737 Cimport = Bimport->setUnion();
1739 else if (Bimport.is_null() && !Iimport.is_null()) {
1740 Cimport = Iimport->setUnion();
1743 throw std::runtime_error(
"TpetraExt::MMM status of matrix importers is nonsensical");
1745 Ccolmap = Cimport->getTargetMap();
1750 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1751 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1758 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1759 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1760 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
1761 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1763 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
1764 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1773 C.replaceColMap(Ccolmap);
1791 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
1792 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
1794 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
1795 GO aidx = Acolmap_local.getGlobalElement(i);
1796 LO B_LID = Browmap_local.getLocalElement(aidx);
1797 if (B_LID != LO_INVALID) {
1798 targetMapToOrigRow(i) = B_LID;
1799 targetMapToImportRow(i) = LO_INVALID;
1801 LO I_LID = Irowmap_local.getLocalElement(aidx);
1802 targetMapToOrigRow(i) = LO_INVALID;
1803 targetMapToImportRow(i) = I_LID;
1810 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1816 template<
class Scalar,
1818 class GlobalOrdinal,
1820 void mult_A_B_newmatrix(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1821 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1822 Teuchos::RCP<BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > &C)
1824 using Teuchos::null;
1825 using Teuchos::Array;
1826 using Teuchos::ArrayRCP;
1827 using Teuchos::ArrayView;
1832 typedef LocalOrdinal LO;
1833 typedef GlobalOrdinal GO;
1835 typedef Import<LO,GO,NO> import_type;
1836 typedef Map<LO,GO,NO> map_type;
1837 typedef BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> block_crs_matrix_type;
1838 typedef typename block_crs_matrix_type::crs_graph_type graph_t;
1841 typedef typename map_type::local_map_type local_map_type;
1842 typedef typename block_crs_matrix_type::local_matrix_device_type KBSR;
1843 typedef typename KBSR::device_type device_t;
1844 typedef typename KBSR::StaticCrsGraphType static_graph_t;
1845 typedef typename static_graph_t::row_map_type::non_const_type lno_view_t;
1846 typedef typename static_graph_t::entries_type::non_const_type lno_nnz_view_t;
1847 typedef typename KBSR::values_type::non_const_type scalar_view_t;
1848 typedef typename NO::execution_space execution_space;
1849 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1850 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1852 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1855 RCP<const import_type> Cimport;
1856 RCP<const map_type> Ccolmap;
1857 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1858 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1859 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1860 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1861 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1862 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1863 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1864 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1871 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1873 if (Bview.importMatrix.is_null()) {
1876 Ccolmap = Bview.colMap;
1877 const LO colMapSize =
static_cast<LO
>(Bview.colMap->getLocalNumElements());
1879 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1880 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1881 KOKKOS_LAMBDA(
const LO i) {
1894 if (!Bimport.is_null() && !Iimport.is_null()) {
1895 Cimport = Bimport->setUnion(*Iimport);
1897 else if (!Bimport.is_null() && Iimport.is_null()) {
1898 Cimport = Bimport->setUnion();
1900 else if (Bimport.is_null() && !Iimport.is_null()) {
1901 Cimport = Iimport->setUnion();
1904 throw std::runtime_error(
"TpetraExt::MMM status of matrix importers is nonsensical");
1906 Ccolmap = Cimport->getTargetMap();
1913 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1914 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1915 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",
1916 range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),
1917 KOKKOS_LAMBDA(
const LO i) {
1918 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1920 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",
1921 range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),
1922 KOKKOS_LAMBDA(
const LO i) {
1923 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1943 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),
1944 Aview.colMap->getLocalNumElements());
1945 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),
1946 Aview.colMap->getLocalNumElements());
1948 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::construct_tables",
1949 range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),
1950 KOKKOS_LAMBDA(
const LO i) {
1951 GO aidx = Acolmap_local.getGlobalElement(i);
1952 LO B_LID = Browmap_local.getLocalElement(aidx);
1953 if (B_LID != LO_INVALID) {
1954 targetMapToOrigRow(i) = B_LID;
1955 targetMapToImportRow(i) = LO_INVALID;
1957 LO I_LID = Irowmap_local.getLocalElement(aidx);
1958 targetMapToOrigRow(i) = LO_INVALID;
1959 targetMapToImportRow(i) = I_LID;
1964 using KernelHandle =
1965 KokkosKernels::Experimental::KokkosKernelsHandle<
typename lno_view_t::const_value_type,
1966 typename lno_nnz_view_t::const_value_type,
1967 typename scalar_view_t::const_value_type,
1968 typename device_t::execution_space,
1969 typename device_t::memory_space,
1970 typename device_t::memory_space>;
1971 int team_work_size = 16;
1972 std::string myalg(
"SPGEMM_KK_MEMORY");
1973 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
1976 kh.create_spgemm_handle(alg_enum);
1977 kh.set_team_work_size(team_work_size);
1980 const KBSR Amat = Aview.origMatrix->getLocalMatrixDevice();
1981 const KBSR Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,
1982 targetMapToOrigRow,targetMapToImportRow,
1983 Bcol2Ccol,Icol2Ccol,
1984 Ccolmap.getConst()->getLocalNumElements());
1986 RCP<graph_t> graphC;
1987 typename KBSR::values_type values;
1992 KokkosSparse::block_spgemm_symbolic(kh, Amat,
false, Bmerged,
false, Cmat);
1993 KokkosSparse::block_spgemm_numeric (kh, Amat,
false, Bmerged,
false, Cmat);
1994 kh.destroy_spgemm_handle();
1997 graphC = rcp(
new graph_t(Cmat.graph, Aview.origMatrix->getRowMap(), Ccolmap.getConst()));
1998 values = Cmat.values;
2000 C = rcp (
new block_crs_matrix_type (*graphC, values, Aview.blocksize));
2006 template<
class Scalar,
2008 class GlobalOrdinal,
2010 class LocalOrdinalViewType>
2011 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2012 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2013 const LocalOrdinalViewType & targetMapToOrigRow,
2014 const LocalOrdinalViewType & targetMapToImportRow,
2015 const LocalOrdinalViewType & Bcol2Ccol,
2016 const LocalOrdinalViewType & Icol2Ccol,
2017 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2018 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2019 const std::string& label,
2020 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2021 #ifdef HAVE_TPETRA_MMM_TIMINGS
2022 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2023 using Teuchos::TimeMonitor;
2024 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore"))));
2027 using Teuchos::Array;
2028 using Teuchos::ArrayRCP;
2029 using Teuchos::ArrayView;
2034 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2035 typedef typename KCRS::StaticCrsGraphType graph_t;
2036 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2037 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2038 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2039 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2042 typedef LocalOrdinal LO;
2043 typedef GlobalOrdinal GO;
2045 typedef Map<LO,GO,NO> map_type;
2046 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2047 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2048 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2051 RCP<const map_type> Ccolmap = C.getColMap();
2052 size_t m = Aview.origMatrix->getLocalNumRows();
2053 size_t n = Ccolmap->getLocalNumElements();
2054 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2057 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2058 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2060 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2061 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2062 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2064 c_lno_view_t Irowptr;
2065 lno_nnz_view_t Icolind;
2066 scalar_view_t Ivals;
2067 if(!Bview.importMatrix.is_null()) {
2068 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2069 Irowptr = lclB.graph.row_map;
2070 Icolind = lclB.graph.entries;
2071 Ivals = lclB.values;
2072 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2075 #ifdef HAVE_TPETRA_MMM_TIMINGS
2076 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
2086 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2087 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
2088 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
2089 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
2099 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2100 std::vector<size_t> c_status(n, ST_INVALID);
2110 size_t CSR_ip = 0, OLD_ip = 0;
2111 for (
size_t i = 0; i < m; i++) {
2114 Crowptr[i] = CSR_ip;
2117 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2118 LO Aik = Acolind[k];
2119 const SC Aval = Avals[k];
2120 if (Aval == SC_ZERO)
2123 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2130 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2133 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2134 LO Bkj = Bcolind[j];
2135 LO Cij = Bcol2Ccol[Bkj];
2137 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2139 c_status[Cij] = CSR_ip;
2140 Ccolind[CSR_ip] = Cij;
2141 Cvals[CSR_ip] = Aval*Bvals[j];
2145 Cvals[c_status[Cij]] += Aval*Bvals[j];
2156 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2157 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2158 LO Ikj = Icolind[j];
2159 LO Cij = Icol2Ccol[Ikj];
2161 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
2163 c_status[Cij] = CSR_ip;
2164 Ccolind[CSR_ip] = Cij;
2165 Cvals[CSR_ip] = Aval*Ivals[j];
2168 Cvals[c_status[Cij]] += Aval*Ivals[j];
2175 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
2177 Kokkos::resize(Ccolind,CSR_alloc);
2178 Kokkos::resize(Cvals,CSR_alloc);
2183 Crowptr[m] = CSR_ip;
2186 Kokkos::resize(Ccolind,CSR_ip);
2187 Kokkos::resize(Cvals,CSR_ip);
2189 #ifdef HAVE_TPETRA_MMM_TIMINGS
2191 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix Final Sort")));
2195 if (params.is_null() || params->get(
"sort entries",
true))
2196 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2197 C.setAllValues(Crowptr,Ccolind, Cvals);
2200 #ifdef HAVE_TPETRA_MMM_TIMINGS
2202 auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix ESFC")));
2214 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2215 labelList->set(
"Timer Label",label);
2216 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2217 RCP<const Export<LO,GO,NO> > dummyExport;
2218 C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2219 #ifdef HAVE_TPETRA_MMM_TIMINGS
2221 MM2 = Teuchos::null;
2227 template<
class Scalar,
2229 class GlobalOrdinal,
2231 void mult_A_B_reuse(
2232 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2233 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2234 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2235 const std::string& label,
2236 const Teuchos::RCP<Teuchos::ParameterList>& params)
2238 using Teuchos::Array;
2239 using Teuchos::ArrayRCP;
2240 using Teuchos::ArrayView;
2245 typedef LocalOrdinal LO;
2246 typedef GlobalOrdinal GO;
2248 typedef Import<LO,GO,NO> import_type;
2249 typedef Map<LO,GO,NO> map_type;
2252 typedef typename map_type::local_map_type local_map_type;
2254 typedef typename KCRS::StaticCrsGraphType graph_t;
2255 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2256 typedef typename NO::execution_space execution_space;
2257 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2258 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2260 #ifdef HAVE_TPETRA_MMM_TIMINGS
2261 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2262 using Teuchos::TimeMonitor;
2263 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse Cmap"))));
2265 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2268 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2269 RCP<const map_type> Ccolmap = C.getColMap();
2270 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2271 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2272 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2273 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2274 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2275 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2278 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2282 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2283 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2286 if (!Bview.importMatrix.is_null()) {
2287 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2288 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2290 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2291 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2292 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2298 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2299 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2300 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2301 GO aidx = Acolmap_local.getGlobalElement(i);
2302 LO B_LID = Browmap_local.getLocalElement(aidx);
2303 if (B_LID != LO_INVALID) {
2304 targetMapToOrigRow(i) = B_LID;
2305 targetMapToImportRow(i) = LO_INVALID;
2307 LO I_LID = Irowmap_local.getLocalElement(aidx);
2308 targetMapToOrigRow(i) = LO_INVALID;
2309 targetMapToImportRow(i) = I_LID;
2316 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2320 template<
class Scalar,
2322 class GlobalOrdinal,
2324 class LocalOrdinalViewType>
2325 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2326 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2327 const LocalOrdinalViewType & targetMapToOrigRow,
2328 const LocalOrdinalViewType & targetMapToImportRow,
2329 const LocalOrdinalViewType & Bcol2Ccol,
2330 const LocalOrdinalViewType & Icol2Ccol,
2331 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2332 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2333 const std::string& label,
2334 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2335 #ifdef HAVE_TPETRA_MMM_TIMINGS
2336 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2337 using Teuchos::TimeMonitor;
2338 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
2339 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2348 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2349 typedef typename KCRS::StaticCrsGraphType graph_t;
2350 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2351 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2352 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2355 typedef LocalOrdinal LO;
2356 typedef GlobalOrdinal GO;
2358 typedef Map<LO,GO,NO> map_type;
2359 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2360 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2361 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2364 RCP<const map_type> Ccolmap = C.getColMap();
2365 size_t m = Aview.origMatrix->getLocalNumRows();
2366 size_t n = Ccolmap->getLocalNumElements();
2369 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2370 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2371 const KCRS & Cmat = C.getLocalMatrixHost();
2373 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2374 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2375 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2376 scalar_view_t Cvals = Cmat.values;
2378 c_lno_view_t Irowptr;
2379 lno_nnz_view_t Icolind;
2380 scalar_view_t Ivals;
2381 if(!Bview.importMatrix.is_null()) {
2382 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2383 Irowptr = lclB.graph.row_map;
2384 Icolind = lclB.graph.entries;
2385 Ivals = lclB.values;
2388 #ifdef HAVE_TPETRA_MMM_TIMINGS
2389 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
2401 std::vector<size_t> c_status(n, ST_INVALID);
2404 size_t CSR_ip = 0, OLD_ip = 0;
2405 for (
size_t i = 0; i < m; i++) {
2408 OLD_ip = Crowptr[i];
2409 CSR_ip = Crowptr[i+1];
2410 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
2411 c_status[Ccolind[k]] = k;
2417 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2418 LO Aik = Acolind[k];
2419 const SC Aval = Avals[k];
2420 if (Aval == SC_ZERO)
2423 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2425 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2427 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2428 LO Bkj = Bcolind[j];
2429 LO Cij = Bcol2Ccol[Bkj];
2431 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2432 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2433 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2435 Cvals[c_status[Cij]] += Aval * Bvals[j];
2440 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2441 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2442 LO Ikj = Icolind[j];
2443 LO Cij = Icol2Ccol[Ikj];
2445 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2446 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2447 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2449 Cvals[c_status[Cij]] += Aval * Ivals[j];
2455 #ifdef HAVE_TPETRA_MMM_TIMINGS
2456 auto MM3 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse ESFC"))));
2459 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2465 template<
class Scalar,
2467 class GlobalOrdinal,
2469 void jacobi_A_B_newmatrix(
2471 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2472 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2473 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2474 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2475 const std::string& label,
2476 const Teuchos::RCP<Teuchos::ParameterList>& params)
2478 using Teuchos::Array;
2479 using Teuchos::ArrayRCP;
2480 using Teuchos::ArrayView;
2484 typedef LocalOrdinal LO;
2485 typedef GlobalOrdinal GO;
2488 typedef Import<LO,GO,NO> import_type;
2489 typedef Map<LO,GO,NO> map_type;
2490 typedef typename map_type::local_map_type local_map_type;
2494 typedef typename KCRS::StaticCrsGraphType graph_t;
2495 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2496 typedef typename NO::execution_space execution_space;
2497 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2498 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2501 #ifdef HAVE_TPETRA_MMM_TIMINGS
2502 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2503 using Teuchos::TimeMonitor;
2504 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi M5 Cmap"))));
2506 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2509 RCP<const import_type> Cimport;
2510 RCP<const map_type> Ccolmap;
2511 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2512 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2513 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2514 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2515 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2516 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2517 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2518 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2525 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2527 if (Bview.importMatrix.is_null()) {
2530 Ccolmap = Bview.colMap;
2534 Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getLocalNumElements ()));
2535 Kokkos::parallel_for (range, KOKKOS_LAMBDA (
const size_t i) {
2536 Bcol2Ccol(i) =
static_cast<LO
> (i);
2547 if (!Bimport.is_null() && !Iimport.is_null()){
2548 Cimport = Bimport->setUnion(*Iimport);
2549 Ccolmap = Cimport->getTargetMap();
2551 }
else if (!Bimport.is_null() && Iimport.is_null()) {
2552 Cimport = Bimport->setUnion();
2554 }
else if(Bimport.is_null() && !Iimport.is_null()) {
2555 Cimport = Iimport->setUnion();
2558 throw std::runtime_error(
"TpetraExt::Jacobi status of matrix importers is nonsensical");
2560 Ccolmap = Cimport->getTargetMap();
2562 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2563 std::runtime_error,
"Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2570 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2571 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2572 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2573 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2575 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2576 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2585 C.replaceColMap(Ccolmap);
2603 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2604 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2605 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2606 GO aidx = Acolmap_local.getGlobalElement(i);
2607 LO B_LID = Browmap_local.getLocalElement(aidx);
2608 if (B_LID != LO_INVALID) {
2609 targetMapToOrigRow(i) = B_LID;
2610 targetMapToImportRow(i) = LO_INVALID;
2612 LO I_LID = Irowmap_local.getLocalElement(aidx);
2613 targetMapToOrigRow(i) = LO_INVALID;
2614 targetMapToImportRow(i) = I_LID;
2621 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2630 template<
class Scalar,
2632 class GlobalOrdinal,
2634 class LocalOrdinalViewType>
2635 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2636 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2637 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2638 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2639 const LocalOrdinalViewType & targetMapToOrigRow,
2640 const LocalOrdinalViewType & targetMapToImportRow,
2641 const LocalOrdinalViewType & Bcol2Ccol,
2642 const LocalOrdinalViewType & Icol2Ccol,
2643 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2644 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2645 const std::string& label,
2646 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2648 #ifdef HAVE_TPETRA_MMM_TIMINGS
2649 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2650 using Teuchos::TimeMonitor;
2651 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Nemwmatrix SerialCore")));
2654 using Teuchos::Array;
2655 using Teuchos::ArrayRCP;
2656 using Teuchos::ArrayView;
2661 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2662 typedef typename KCRS::StaticCrsGraphType graph_t;
2663 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2664 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2665 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2666 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2669 typedef typename scalar_view_t::memory_space scalar_memory_space;
2672 typedef LocalOrdinal LO;
2673 typedef GlobalOrdinal GO;
2676 typedef Map<LO,GO,NO> map_type;
2677 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2678 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2681 RCP<const map_type> Ccolmap = C.getColMap();
2682 size_t m = Aview.origMatrix->getLocalNumRows();
2683 size_t n = Ccolmap->getLocalNumElements();
2684 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2687 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2688 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2690 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2691 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2692 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2694 c_lno_view_t Irowptr;
2695 lno_nnz_view_t Icolind;
2696 scalar_view_t Ivals;
2697 if(!Bview.importMatrix.is_null()) {
2698 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2699 Irowptr = lclB.graph.row_map;
2700 Icolind = lclB.graph.entries;
2701 Ivals = lclB.values;
2702 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2707 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2715 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2716 Array<size_t> c_status(n, ST_INVALID);
2725 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2726 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
2727 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
2728 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
2729 size_t CSR_ip = 0, OLD_ip = 0;
2731 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2745 for (
size_t i = 0; i < m; i++) {
2748 Crowptr[i] = CSR_ip;
2749 SC minusOmegaDval = -omega*Dvals(i,0);
2752 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2753 Scalar Bval = Bvals[j];
2754 if (Bval == SC_ZERO)
2756 LO Bij = Bcolind[j];
2757 LO Cij = Bcol2Ccol[Bij];
2760 c_status[Cij] = CSR_ip;
2761 Ccolind[CSR_ip] = Cij;
2762 Cvals[CSR_ip] = Bvals[j];
2767 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2768 LO Aik = Acolind[k];
2769 const SC Aval = Avals[k];
2770 if (Aval == SC_ZERO)
2773 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2775 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2777 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2778 LO Bkj = Bcolind[j];
2779 LO Cij = Bcol2Ccol[Bkj];
2781 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2783 c_status[Cij] = CSR_ip;
2784 Ccolind[CSR_ip] = Cij;
2785 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2789 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2795 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2796 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2797 LO Ikj = Icolind[j];
2798 LO Cij = Icol2Ccol[Ikj];
2800 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2802 c_status[Cij] = CSR_ip;
2803 Ccolind[CSR_ip] = Cij;
2804 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2807 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2814 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2816 Kokkos::resize(Ccolind,CSR_alloc);
2817 Kokkos::resize(Cvals,CSR_alloc);
2821 Crowptr[m] = CSR_ip;
2824 Kokkos::resize(Ccolind,CSR_ip);
2825 Kokkos::resize(Cvals,CSR_ip);
2828 #ifdef HAVE_TPETRA_MMM_TIMINGS
2829 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix Final Sort")));
2836 C.replaceColMap(Ccolmap);
2843 if (params.is_null() || params->get(
"sort entries",
true))
2844 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2845 C.setAllValues(Crowptr,Ccolind, Cvals);
2848 #ifdef HAVE_TPETRA_MMM_TIMINGS
2849 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix ESFC")));
2860 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2861 labelList->set(
"Timer Label",label);
2862 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2863 RCP<const Export<LO,GO,NO> > dummyExport;
2864 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2872 template<
class Scalar,
2874 class GlobalOrdinal,
2876 void jacobi_A_B_reuse(
2878 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2879 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2880 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2881 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2882 const std::string& label,
2883 const Teuchos::RCP<Teuchos::ParameterList>& params)
2885 using Teuchos::Array;
2886 using Teuchos::ArrayRCP;
2887 using Teuchos::ArrayView;
2891 typedef LocalOrdinal LO;
2892 typedef GlobalOrdinal GO;
2895 typedef Import<LO,GO,NO> import_type;
2896 typedef Map<LO,GO,NO> map_type;
2899 typedef typename map_type::local_map_type local_map_type;
2901 typedef typename KCRS::StaticCrsGraphType graph_t;
2902 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2903 typedef typename NO::execution_space execution_space;
2904 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2905 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2907 #ifdef HAVE_TPETRA_MMM_TIMINGS
2908 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2909 using Teuchos::TimeMonitor;
2910 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse Cmap"))));
2912 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2915 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2916 RCP<const map_type> Ccolmap = C.getColMap();
2917 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2918 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2919 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2920 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2921 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2922 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2925 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2929 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2930 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2933 if (!Bview.importMatrix.is_null()) {
2934 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2935 std::runtime_error,
"Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2937 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2938 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2939 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2945 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2946 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2947 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2948 GO aidx = Acolmap_local.getGlobalElement(i);
2949 LO B_LID = Browmap_local.getLocalElement(aidx);
2950 if (B_LID != LO_INVALID) {
2951 targetMapToOrigRow(i) = B_LID;
2952 targetMapToImportRow(i) = LO_INVALID;
2954 LO I_LID = Irowmap_local.getLocalElement(aidx);
2955 targetMapToOrigRow(i) = LO_INVALID;
2956 targetMapToImportRow(i) = I_LID;
2961 #ifdef HAVE_TPETRA_MMM_TIMINGS
2967 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2973 template<
class Scalar,
2975 class GlobalOrdinal,
2977 class LocalOrdinalViewType>
2978 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2979 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2980 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2981 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2982 const LocalOrdinalViewType & targetMapToOrigRow,
2983 const LocalOrdinalViewType & targetMapToImportRow,
2984 const LocalOrdinalViewType & Bcol2Ccol,
2985 const LocalOrdinalViewType & Icol2Ccol,
2986 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2987 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2988 const std::string& label,
2989 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2990 #ifdef HAVE_TPETRA_MMM_TIMINGS
2991 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2992 using Teuchos::TimeMonitor;
2993 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore"))));
2994 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
3002 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
3003 typedef typename KCRS::StaticCrsGraphType graph_t;
3004 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
3005 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3006 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3007 typedef typename scalar_view_t::memory_space scalar_memory_space;
3010 typedef LocalOrdinal LO;
3011 typedef GlobalOrdinal GO;
3013 typedef Map<LO,GO,NO> map_type;
3014 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3015 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3016 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
3019 RCP<const map_type> Ccolmap = C.getColMap();
3020 size_t m = Aview.origMatrix->getLocalNumRows();
3021 size_t n = Ccolmap->getLocalNumElements();
3024 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
3025 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
3026 const KCRS & Cmat = C.getLocalMatrixHost();
3028 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
3029 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
3030 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
3031 scalar_view_t Cvals = Cmat.values;
3033 c_lno_view_t Irowptr;
3034 lno_nnz_view_t Icolind;
3035 scalar_view_t Ivals;
3036 if(!Bview.importMatrix.is_null()) {
3037 auto lclB = Bview.importMatrix->getLocalMatrixHost();
3038 Irowptr = lclB.graph.row_map;
3039 Icolind = lclB.graph.entries;
3040 Ivals = lclB.values;
3045 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
3047 #ifdef HAVE_TPETRA_MMM_TIMINGS
3048 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore - Compare"))));
3056 std::vector<size_t> c_status(n, ST_INVALID);
3059 size_t CSR_ip = 0, OLD_ip = 0;
3060 for (
size_t i = 0; i < m; i++) {
3064 OLD_ip = Crowptr[i];
3065 CSR_ip = Crowptr[i+1];
3066 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
3067 c_status[Ccolind[k]] = k;
3073 SC minusOmegaDval = -omega*Dvals(i,0);
3076 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
3077 Scalar Bval = Bvals[j];
3078 if (Bval == SC_ZERO)
3080 LO Bij = Bcolind[j];
3081 LO Cij = Bcol2Ccol[Bij];
3083 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3084 std::runtime_error,
"Trying to insert a new entry into a static graph");
3086 Cvals[c_status[Cij]] = Bvals[j];
3090 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
3091 LO Aik = Acolind[k];
3092 const SC Aval = Avals[k];
3093 if (Aval == SC_ZERO)
3096 if (targetMapToOrigRow[Aik] != LO_INVALID) {
3098 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
3100 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
3101 LO Bkj = Bcolind[j];
3102 LO Cij = Bcol2Ccol[Bkj];
3104 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3105 std::runtime_error,
"Trying to insert a new entry into a static graph");
3107 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
3112 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
3113 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
3114 LO Ikj = Icolind[j];
3115 LO Cij = Icol2Ccol[Ikj];
3117 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3118 std::runtime_error,
"Trying to insert a new entry into a static graph");
3120 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
3126 #ifdef HAVE_TPETRA_MMM_TIMINGS
3127 MM2 = Teuchos::null;
3128 MM2 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
3131 C.fillComplete(C.getDomainMap(), C.getRangeMap());
3132 #ifdef HAVE_TPETRA_MMM_TIMINGS
3133 MM2 = Teuchos::null;
3142 template<
class Scalar,
3144 class GlobalOrdinal,
3146 void import_and_extract_views(
3147 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3148 Teuchos::RCP<
const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3149 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3150 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3151 bool userAssertsThereAreNoRemotes,
3152 const std::string& label,
3153 const Teuchos::RCP<Teuchos::ParameterList>& params)
3155 using Teuchos::Array;
3156 using Teuchos::ArrayView;
3159 using Teuchos::null;
3162 typedef LocalOrdinal LO;
3163 typedef GlobalOrdinal GO;
3166 typedef Map<LO,GO,NO> map_type;
3167 typedef Import<LO,GO,NO> import_type;
3168 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
3170 #ifdef HAVE_TPETRA_MMM_TIMINGS
3171 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
3172 using Teuchos::TimeMonitor;
3173 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Alloc"))));
3181 Aview.deleteContents();
3183 Aview.origMatrix = rcp(&A,
false);
3185 Aview.origMatrix->getApplyHelper();
3186 Aview.origRowMap = A.getRowMap();
3187 Aview.rowMap = targetMap;
3188 Aview.colMap = A.getColMap();
3189 Aview.domainMap = A.getDomainMap();
3190 Aview.importColMap = null;
3191 RCP<const map_type> rowMap = A.getRowMap();
3192 const int numProcs = rowMap->getComm()->getSize();
3195 if (userAssertsThereAreNoRemotes || numProcs < 2)
3198 RCP<const import_type> importer;
3199 if (params != null && params->isParameter(
"importer")) {
3200 importer = params->get<RCP<const import_type> >(
"importer");
3203 #ifdef HAVE_TPETRA_MMM_TIMINGS
3205 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap"))));
3210 RCP<const map_type> remoteRowMap;
3211 size_t numRemote = 0;
3213 if (!prototypeImporter.is_null() &&
3214 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3215 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3217 #ifdef HAVE_TPETRA_MMM_TIMINGS
3218 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode1"));
3220 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3221 numRemote = prototypeImporter->getNumRemoteIDs();
3223 Array<GO> remoteRows(numRemote);
3224 for (
size_t i = 0; i < numRemote; i++)
3225 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3227 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3228 rowMap->getIndexBase(), rowMap->getComm()));
3231 }
else if (prototypeImporter.is_null()) {
3233 #ifdef HAVE_TPETRA_MMM_TIMINGS
3234 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode2"));
3236 ArrayView<const GO> rows = targetMap->getLocalElementList();
3237 size_t numRows = targetMap->getLocalNumElements();
3239 Array<GO> remoteRows(numRows);
3240 for(
size_t i = 0; i < numRows; ++i) {
3241 const LO mlid = rowMap->getLocalElement(rows[i]);
3243 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3244 remoteRows[numRemote++] = rows[i];
3246 remoteRows.resize(numRemote);
3247 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3248 rowMap->getIndexBase(), rowMap->getComm()));
3257 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3258 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3267 #ifdef HAVE_TPETRA_MMM_TIMINGS
3269 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Collective-0"))));
3273 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (
global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3275 if (globalMaxNumRemote > 0) {
3276 #ifdef HAVE_TPETRA_MMM_TIMINGS
3278 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-2"))));
3282 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3284 importer = rcp(
new import_type(rowMap, remoteRowMap));
3286 throw std::runtime_error(
"prototypeImporter->SourceMap() does not match A.getRowMap()!");
3290 params->set(
"importer", importer);
3293 if (importer != null) {
3294 #ifdef HAVE_TPETRA_MMM_TIMINGS
3296 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-3"))));
3300 Teuchos::ParameterList labelList;
3301 labelList.set(
"Timer Label", label);
3302 auto & labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
3305 bool overrideAllreduce =
false;
3308 Teuchos::ParameterList params_sublist;
3309 if(!params.is_null()) {
3310 labelList.set(
"compute global constants", params->get(
"compute global constants",
false));
3311 params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
3312 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3313 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3314 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3315 isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
3316 overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
3318 labelList_subList.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
3319 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3320 labelList_subList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3322 Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3323 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3325 Aview.importMatrix->getApplyHelper();
3331 sprintf(str,
"import_matrix.%d.dat",count);
3336 #ifdef HAVE_TPETRA_MMM_STATISTICS
3337 printMultiplicationStatistics(importer, label + std::string(
" I&X MMM"));
3341 #ifdef HAVE_TPETRA_MMM_TIMINGS
3343 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-4"))));
3347 Aview.importColMap = Aview.importMatrix->getColMap();
3348 #ifdef HAVE_TPETRA_MMM_TIMINGS
3355 template<
class Scalar,
3357 class GlobalOrdinal,
3359 void import_and_extract_views(
3360 const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& M,
3361 Teuchos::RCP<
const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3362 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview,
3363 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3364 bool userAssertsThereAreNoRemotes)
3366 using Teuchos::Array;
3367 using Teuchos::ArrayView;
3370 using Teuchos::null;
3373 typedef LocalOrdinal LO;
3374 typedef GlobalOrdinal GO;
3377 typedef Map<LO,GO,NO> map_type;
3378 typedef Import<LO,GO,NO> import_type;
3379 typedef BlockCrsMatrix<SC,LO,GO,NO> blockcrs_matrix_type;
3387 Mview.deleteContents();
3389 Mview.origMatrix = rcp(&M,
false);
3391 Mview.origMatrix->getApplyHelper();
3392 Mview.origRowMap = M.getRowMap();
3393 Mview.rowMap = targetMap;
3394 Mview.colMap = M.getColMap();
3395 Mview.importColMap = null;
3396 RCP<const map_type> rowMap = M.getRowMap();
3397 const int numProcs = rowMap->getComm()->getSize();
3400 if (userAssertsThereAreNoRemotes || numProcs < 2)
return;
3404 RCP<const map_type> remoteRowMap;
3405 size_t numRemote = 0;
3407 if (!prototypeImporter.is_null() &&
3408 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3409 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3412 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3413 numRemote = prototypeImporter->getNumRemoteIDs();
3415 Array<GO> remoteRows(numRemote);
3416 for (
size_t i = 0; i < numRemote; i++)
3417 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3419 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3420 rowMap->getIndexBase(), rowMap->getComm()));
3423 }
else if (prototypeImporter.is_null()) {
3426 ArrayView<const GO> rows = targetMap->getLocalElementList();
3427 size_t numRows = targetMap->getLocalNumElements();
3429 Array<GO> remoteRows(numRows);
3430 for(
size_t i = 0; i < numRows; ++i) {
3431 const LO mlid = rowMap->getLocalElement(rows[i]);
3433 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3434 remoteRows[numRemote++] = rows[i];
3436 remoteRows.resize(numRemote);
3437 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3438 rowMap->getIndexBase(), rowMap->getComm()));
3447 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3448 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3456 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (
global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3458 RCP<const import_type> importer;
3460 if (globalMaxNumRemote > 0) {
3463 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3465 importer = rcp(
new import_type(rowMap, remoteRowMap));
3467 throw std::runtime_error(
"prototypeImporter->SourceMap() does not match M.getRowMap()!");
3470 if (importer != null) {
3473 Mview.importMatrix = Tpetra::importAndFillCompleteBlockCrsMatrix<blockcrs_matrix_type>(rcpFromRef(M), *importer);
3475 Mview.importMatrix->getApplyHelper();
3478 Mview.importColMap = Mview.importMatrix->getColMap();
3484 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
3486 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3487 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3488 const LocalOrdinalViewType & Acol2Brow,
3489 const LocalOrdinalViewType & Acol2Irow,
3490 const LocalOrdinalViewType & Bcol2Ccol,
3491 const LocalOrdinalViewType & Icol2Ccol,
3492 const size_t mergedNodeNumCols) {
3496 typedef typename KCRS::StaticCrsGraphType graph_t;
3497 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3498 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3499 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3501 const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3502 const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3505 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3509 RCP<const KCRS> Ik_;
3510 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3511 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3513 if(Ik!=0) Iks = *Ik;
3514 size_t merge_numrows = Ak.numCols();
3516 lno_view_t Mrowptr(
"Mrowptr", merge_numrows + 1);
3518 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3521 typedef typename Node::execution_space execution_space;
3522 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3523 Kokkos::parallel_scan (
"Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3524 KOKKOS_LAMBDA(
const size_t i,
size_t& update,
const bool final) {
3525 if(
final) Mrowptr(i) = update;
3528 if(Acol2Brow(i)!=LO_INVALID)
3529 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3531 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3534 if(
final && i+1==merge_numrows)
3535 Mrowptr(i+1)=update;
3539 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3540 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing(
"Mcolind"),merge_nnz);
3541 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing(
"Mvals"),merge_nnz);
3544 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3545 Kokkos::parallel_for (
"Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(
const size_t i) {
3546 if(Acol2Brow(i)!=LO_INVALID) {
3547 size_t row = Acol2Brow(i);
3548 size_t start = Bk.graph.row_map(row);
3549 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3550 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3551 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3555 size_t row = Acol2Irow(i);
3556 size_t start = Iks.graph.row_map(row);
3557 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3558 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3559 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3564 KCRS newmat(
"CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3575 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
3576 const typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type
3577 merge_matrices(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3578 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3579 const LocalOrdinalViewType & Acol2Brow,
3580 const LocalOrdinalViewType & Acol2Irow,
3581 const LocalOrdinalViewType & Bcol2Ccol,
3582 const LocalOrdinalViewType & Icol2Ccol,
3583 const size_t mergedNodeNumCols)
3586 typedef typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KBCRS;
3587 typedef typename KBCRS::StaticCrsGraphType graph_t;
3588 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3589 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3590 typedef typename KBCRS::values_type::non_const_type scalar_view_t;
3593 const KBCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3594 const KBCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3597 if(!Bview.importMatrix.is_null() ||
3598 (Bview.importMatrix.is_null() &&
3599 (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3604 RCP<const KBCRS> Ik_;
3605 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KBCRS>(Bview.importMatrix->getLocalMatrixDevice());
3606 const KBCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3608 if(Ik!=0) Iks = *Ik;
3609 size_t merge_numrows = Ak.numCols();
3612 lno_view_t Mrowptr(
"Mrowptr", merge_numrows + 1);
3614 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3617 typedef typename Node::execution_space execution_space;
3618 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3619 Kokkos::parallel_scan (
"Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3620 KOKKOS_LAMBDA(
const size_t i,
size_t& update,
const bool final) {
3621 if(
final) Mrowptr(i) = update;
3624 if(Acol2Brow(i)!=LO_INVALID)
3625 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3627 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3630 if(
final && i+1==merge_numrows)
3631 Mrowptr(i+1)=update;
3635 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3636 const int blocksize = Ak.blockDim();
3637 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing(
"Mcolind"),merge_nnz);
3638 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing(
"Mvals"),merge_nnz*blocksize*blocksize);
3641 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3642 Kokkos::parallel_for (
"Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(
const size_t i) {
3643 if(Acol2Brow(i)!=LO_INVALID) {
3644 size_t row = Acol2Brow(i);
3645 size_t start = Bk.graph.row_map(row);
3646 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3647 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3649 for (
int b=0; b<blocksize*blocksize; ++b) {
3650 const int val_indx = j*blocksize*blocksize + b;
3651 const int b_val_indx = (j-Mrowptr(i)+
start)*blocksize*blocksize + b;
3652 Mvalues(val_indx) = Bk.values(b_val_indx);
3657 size_t row = Acol2Irow(i);
3658 size_t start = Iks.graph.row_map(row);
3659 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3660 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3662 for (
int b=0; b<blocksize*blocksize; ++b) {
3663 const int val_indx = j*blocksize*blocksize + b;
3664 const int b_val_indx = (j-Mrowptr(i)+
start)*blocksize*blocksize + b;
3665 Mvalues(val_indx) = Iks.values(b_val_indx);
3672 KBCRS newmat(
"CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind, blocksize);
3682 template<
typename SC,
typename LO,
typename GO,
typename NO>
3683 void AddKernels<SC, LO, GO, NO>::
3685 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3686 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3687 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3688 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3689 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3690 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3691 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3692 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3693 #
if KOKKOSKERNELS_VERSION >= 40299
3696 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3697 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3698 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3700 using Teuchos::TimeMonitor;
3701 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3702 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3703 auto nrows = Arowptrs.extent(0) - 1;
3704 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3705 typename AddKern::KKH handle;
3706 handle.create_spadd_handle(
true);
3707 auto addHandle = handle.get_spadd_handle();
3708 #ifdef HAVE_TPETRA_MMM_TIMINGS
3709 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted symbolic")));
3711 KokkosSparse::Experimental::spadd_symbolic
3713 #
if KOKKOSKERNELS_VERSION >= 40299
3714 nrows, numGlobalCols,
3716 Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3718 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3719 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3720 #ifdef HAVE_TPETRA_MMM_TIMINGS
3722 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted numeric")));
3724 KokkosSparse::Experimental::spadd_numeric(&handle,
3725 #
if KOKKOSKERNELS_VERSION >= 40299
3726 nrows, numGlobalCols,
3728 Arowptrs, Acolinds, Avals, scalarA,
3729 Browptrs, Bcolinds, Bvals, scalarB,
3730 Crowptrs, Ccolinds, Cvals);
3733 template<
typename SC,
typename LO,
typename GO,
typename NO>
3734 void AddKernels<SC, LO, GO, NO>::
3736 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3737 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3738 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3739 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3740 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3741 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3742 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3743 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3744 #
if KOKKOSKERNELS_VERSION >= 40299
3749 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3750 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3751 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3753 using Teuchos::TimeMonitor;
3754 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3755 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3756 auto nrows = Arowptrs.extent(0) - 1;
3757 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3758 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3759 typename AddKern::KKH handle;
3760 handle.create_spadd_handle(
false);
3761 auto addHandle = handle.get_spadd_handle();
3762 #ifdef HAVE_TPETRA_MMM_TIMINGS
3763 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3765 KokkosSparse::Experimental::spadd_symbolic
3767 #
if KOKKOSKERNELS_VERSION >= 40299
3768 nrows, numGlobalCols,
3770 Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3772 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3773 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3774 #ifdef HAVE_TPETRA_MMM_TIMINGS
3776 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3778 KokkosSparse::Experimental::spadd_numeric(&handle,
3779 #
if KOKKOSKERNELS_VERSION >= 40299
3780 nrows, numGlobalCols,
3782 Arowptrs, Acolinds, Avals, scalarA,
3783 Browptrs, Bcolinds, Bvals, scalarB,
3784 Crowptrs, Ccolinds, Cvals);
3787 template<
typename GO,
3788 typename LocalIndicesType,
3789 typename GlobalIndicesType,
3790 typename ColMapType>
3791 struct ConvertLocalToGlobalFunctor
3793 ConvertLocalToGlobalFunctor(
3794 const LocalIndicesType& colindsOrig_,
3795 const GlobalIndicesType& colindsConverted_,
3796 const ColMapType& colmap_) :
3797 colindsOrig (colindsOrig_),
3798 colindsConverted (colindsConverted_),
3801 KOKKOS_INLINE_FUNCTION
void
3802 operator() (
const GO i)
const
3804 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3806 LocalIndicesType colindsOrig;
3807 GlobalIndicesType colindsConverted;
3811 template<
typename SC,
typename LO,
typename GO,
typename NO>
3812 void MMdetails::AddKernels<SC, LO, GO, NO>::
3813 convertToGlobalAndAdd(
3814 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3815 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3816 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3817 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3818 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3819 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3820 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3821 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3822 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3824 using Teuchos::TimeMonitor;
3827 using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3828 typename NO::execution_space,
typename NO::memory_space,
typename NO::memory_space>;
3830 const values_array Avals = A.values;
3831 const values_array Bvals = B.values;
3832 const col_inds_array Acolinds = A.graph.entries;
3833 const col_inds_array Bcolinds = B.graph.entries;
3834 auto Arowptrs = A.graph.row_map;
3835 auto Browptrs = B.graph.row_map;
3836 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"A colinds (converted)"), Acolinds.extent(0));
3837 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"B colinds (converted)"), Bcolinds.extent(0));
3838 #ifdef HAVE_TPETRA_MMM_TIMINGS
3839 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string(
"column map conversion"))));
3841 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3842 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3843 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3844 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3846 handle.create_spadd_handle(
false);
3847 auto addHandle = handle.get_spadd_handle();
3848 #ifdef HAVE_TPETRA_MMM_TIMINGS
3850 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3852 auto nrows = Arowptrs.extent(0) - 1;
3853 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3854 KokkosSparse::Experimental::spadd_symbolic
3856 #
if KOKKOSKERNELS_VERSION >= 40299
3859 Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3860 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3861 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3862 #ifdef HAVE_TPETRA_MMM_TIMINGS
3864 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3866 KokkosSparse::Experimental::spadd_numeric(&handle,
3867 #
if KOKKOSKERNELS_VERSION >= 40299
3870 Arowptrs, AcolindsConverted, Avals, scalarA,
3871 Browptrs, BcolindsConverted, Bvals, scalarB,
3872 Crowptrs, Ccolinds, Cvals);
3888 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3890 void MatrixMatrix::Multiply( \
3891 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3893 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3895 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3896 bool call_FillComplete_on_result, \
3897 const std::string & label, \
3898 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3901 void MatrixMatrix::Multiply( \
3902 const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& A, \
3904 const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& B, \
3906 Teuchos::RCP<BlockCrsMatrix< SCALAR , LO , GO , NODE > >& C, \
3907 const std::string & label); \
3910 void MatrixMatrix::Jacobi( \
3912 const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3913 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3914 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3915 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3916 bool call_FillComplete_on_result, \
3917 const std::string & label, \
3918 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3921 void MatrixMatrix::Add( \
3922 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3925 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3928 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3931 void MatrixMatrix::Add( \
3932 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3935 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3938 const Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3941 void MatrixMatrix::Add( \
3942 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3945 CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3949 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3950 MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3951 (const SCALAR & alpha, \
3952 const bool transposeA, \
3953 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3954 const SCALAR & beta, \
3955 const bool transposeB, \
3956 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3957 const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3958 const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3959 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3963 MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3964 (const SCALAR & alpha, \
3965 const bool transposeA, \
3966 const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3967 const SCALAR& beta, \
3968 const bool transposeB, \
3969 const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3970 CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3971 const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3972 const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3973 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3975 template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>; \
3977 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const CrsMatrix<SCALAR, LO, GO, NODE>& M, \
3978 Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3979 CrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3980 Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3981 bool userAssertsThereAreNoRemotes, \
3982 const std::string& label, \
3983 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3985 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const BlockCrsMatrix<SCALAR, LO, GO, NODE>& M, \
3986 Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3987 BlockCrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3988 Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3989 bool userAssertsThereAreNoRemotes);
3992 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isFillActive() const
Whether the matrix is not fill complete.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Struct that holds views of the contents of a BlockCrsMatrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void start()
Start the deep_copy counter.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Sparse matrix-matrix multiply.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Matrix Market file readers and writers for Tpetra objects.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void removeCrsMatrixZeros(CrsMatrixType &matrix, typename Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitudeType const &threshold=Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitude(Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::zero()))
Remove zero entries from a matrix.
Forward declaration of some Tpetra Matrix Matrix objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.