41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 #include "KokkosSparse_Utils.hpp"
44 #include "Tpetra_ConfigDefs.hpp"
46 #include "Teuchos_VerboseObject.hpp"
47 #include "Teuchos_Array.hpp"
49 #include "Tpetra_CrsMatrix.hpp"
50 #include "Tpetra_BlockCrsMatrix.hpp"
52 #include "Tpetra_RowMatrixTransposer.hpp"
55 #include "Tpetra_Details_makeColMap.hpp"
56 #include "Tpetra_ConfigDefs.hpp"
57 #include "Tpetra_Map.hpp"
58 #include "Tpetra_Export.hpp"
62 #include <type_traits>
63 #include "Teuchos_FancyOStream.hpp"
65 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
68 #include "KokkosSparse_spgemm.hpp"
69 #include "KokkosSparse_spadd.hpp"
70 #include "Kokkos_Bitset.hpp"
84 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
85 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
86 #include "TpetraExt_MatrixMatrix_HIP.hpp"
87 #include "TpetraExt_MatrixMatrix_SYCL.hpp"
91 namespace MatrixMatrix{
99 template <
class Scalar,
109 bool call_FillComplete_on_result,
110 const std::string& label,
111 const Teuchos::RCP<Teuchos::ParameterList>& params)
117 typedef LocalOrdinal LO;
118 typedef GlobalOrdinal GO;
126 #ifdef HAVE_TPETRA_MMM_TIMINGS
127 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
128 using Teuchos::TimeMonitor;
131 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Setup"))));
134 const std::string prefix =
"TpetraExt::MatrixMatrix::Multiply(): ";
139 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
140 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
145 RCP<const crs_matrix_type> Aprime = null;
149 RCP<const crs_matrix_type> Bprime = null;
159 const bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
161 bool use_optimized_ATB =
false;
162 if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
163 use_optimized_ATB =
true;
165 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
166 use_optimized_ATB =
false;
169 using Teuchos::ParameterList;
170 RCP<ParameterList> transposeParams (
new ParameterList);
171 transposeParams->set (
"sort",
true);
173 if (!use_optimized_ATB && transposeA) {
174 transposer_type transposer (rcpFromRef (A));
175 Aprime = transposer.createTranspose (transposeParams);
178 Aprime = rcpFromRef(A);
182 transposer_type transposer (rcpFromRef (B));
183 Bprime = transposer.createTranspose (transposeParams);
186 Bprime = rcpFromRef(B);
196 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
197 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
198 "must match for matrix-matrix product. op(A) is "
199 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
205 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
206 prefix <<
"ERROR, dimensions of result C must "
208 <<
" rows, should have at least " << Aouter << std::endl);
220 int numProcs = A.
getComm()->getSize();
224 crs_matrix_struct_type Aview;
225 crs_matrix_struct_type Bview;
227 RCP<const map_type> targetMap_A = Aprime->getRowMap();
228 RCP<const map_type> targetMap_B = Bprime->getRowMap();
230 #ifdef HAVE_TPETRA_MMM_TIMINGS
232 TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All I&X")));
238 if (!use_optimized_ATB) {
239 RCP<const import_type> dummyImporter;
240 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label, params);
246 targetMap_B = Aprime->getColMap();
249 if (!use_optimized_ATB)
250 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
252 #ifdef HAVE_TPETRA_MMM_TIMINGS
255 MM = Teuchos::null; MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All Multiply"))));
259 if (use_optimized_ATB) {
260 MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
262 }
else if (call_FillComplete_on_result && newFlag) {
263 MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
265 }
else if (call_FillComplete_on_result) {
266 MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
272 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
274 MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
277 #ifdef HAVE_TPETRA_MMM_TIMINGS
278 TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM All FillComplete")));
287 C.
fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
295 template <
class Scalar,
305 const std::string& label)
311 typedef LocalOrdinal LO;
312 typedef GlobalOrdinal GO;
318 std::string prefix = std::string(
"TpetraExt ") + label + std::string(
": ");
320 TEUCHOS_TEST_FOR_EXCEPTION(transposeA==
true, std::runtime_error, prefix <<
"Matrix A cannot be transposed.");
321 TEUCHOS_TEST_FOR_EXCEPTION(transposeB==
true, std::runtime_error, prefix <<
"Matrix B cannot be transposed.");
333 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
334 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
335 "must match for matrix-matrix product. op(A) is "
336 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
340 int numProcs = A->getComm()->getSize();
342 const LO blocksize = A->getBlockSize();
343 TEUCHOS_TEST_FOR_EXCEPTION(blocksize != B->getBlockSize(), std::runtime_error,
344 prefix <<
"ERROR, Blocksizes do not match. A.blocksize = " <<
345 blocksize <<
", B.blocksize = " << B->getBlockSize() );
349 blockcrs_matrix_struct_type Aview(blocksize);
350 blockcrs_matrix_struct_type Bview(blocksize);
352 RCP<const map_type> targetMap_A = A->getRowMap();
353 RCP<const map_type> targetMap_B = B->getRowMap();
356 RCP<const import_type> dummyImporter;
357 MMdetails::import_and_extract_views(*A, targetMap_A, Aview, dummyImporter,
true);
362 targetMap_B = A->getColMap();
365 MMdetails::import_and_extract_views(*B, targetMap_B, Bview, A->getGraph()->getImporter(),
366 A->getGraph()->getImporter().is_null());
369 MMdetails::mult_A_B_newmatrix(Aview, Bview, C);
372 template <
class Scalar,
381 bool call_FillComplete_on_result,
382 const std::string& label,
383 const Teuchos::RCP<Teuchos::ParameterList>& params)
387 typedef LocalOrdinal LO;
388 typedef GlobalOrdinal GO;
395 #ifdef HAVE_TPETRA_MMM_TIMINGS
396 std::string prefix_mmm = std::string(
"TpetraExt ")+ label + std::string(
": ");
397 using Teuchos::TimeMonitor;
398 TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string(
"Jacobi All Setup")));
401 const std::string prefix =
"TpetraExt::MatrixMatrix::Jacobi(): ";
406 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error, prefix <<
"Matrix A is not fill complete.");
407 TEUCHOS_TEST_FOR_EXCEPTION(!B.
isFillComplete(), std::runtime_error, prefix <<
"Matrix B is not fill complete.");
409 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
410 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
419 TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
420 prefix <<
"ERROR, inner dimensions of op(A) and op(B) "
421 "must match for matrix-matrix product. op(A) is "
422 << Aouter <<
"x" << Ainner <<
", op(B) is "<< Binner <<
"x" << Bouter);
428 TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.
getGlobalNumRows(), std::runtime_error,
429 prefix <<
"ERROR, dimensions of result C must "
431 <<
" rows, should have at least "<< Aouter << std::endl);
443 int numProcs = A.
getComm()->getSize();
447 crs_matrix_struct_type Aview;
448 crs_matrix_struct_type Bview;
450 RCP<const map_type> targetMap_A = Aprime->getRowMap();
451 RCP<const map_type> targetMap_B = Bprime->getRowMap();
453 #ifdef HAVE_TPETRA_MMM_TIMINGS
454 TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All I&X")));
460 RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(
new Teuchos::ParameterList);
461 if(!params.is_null()) {
462 importParams1->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
463 int mm_optimization_core_count=0;
464 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
466 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
467 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
468 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
469 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
470 auto & ip1slist = importParams1->sublist(
"matrixmatrix: kernel params",
false);
471 ip1slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
472 ip1slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
473 ip1slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
477 RCP<const import_type> dummyImporter;
478 MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter,
true, label,importParams1);
483 targetMap_B = Aprime->getColMap();
488 RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(
new Teuchos::ParameterList);
489 if(!params.is_null()) {
490 importParams2->set(
"compute global constants",params->get(
"compute global constants: temporaries",
false));
492 auto slist = params->sublist(
"matrixmatrix: kernel params",
false);
494 bool isMM = slist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
495 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
496 auto & ip2slist = importParams2->sublist(
"matrixmatrix: kernel params",
false);
497 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
498 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
499 ip2slist.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
500 ip2slist.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
501 ip2slist.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
504 MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
506 #ifdef HAVE_TPETRA_MMM_TIMINGS
508 TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi All Multiply")));
512 CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
515 bool newFlag = !C.
getGraph()->isLocallyIndexed() && !C.
getGraph()->isGloballyIndexed();
517 if (call_FillComplete_on_result && newFlag) {
518 MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
520 }
else if (call_FillComplete_on_result) {
521 MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
524 TEUCHOS_TEST_FOR_EXCEPTION(
true, std::runtime_error,
"jacobi_A_B_general not implemented");
527 if(!params.is_null()) {
528 bool removeZeroEntries = params->get(
"remove zeros",
false);
529 if (removeZeroEntries) {
530 typedef Teuchos::ScalarTraits<Scalar> STS;
531 typename STS::magnitudeType threshold = params->get(
"remove zeros threshold", STS::magnitude(STS::zero()));
538 template <
class Scalar,
549 using Teuchos::Array;
553 typedef LocalOrdinal LO;
554 typedef GlobalOrdinal GO;
559 const std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
561 TEUCHOS_TEST_FOR_EXCEPTION(!A.
isFillComplete(), std::runtime_error,
562 prefix <<
"ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
563 "(Result matrix B is not required to be isFillComplete()).");
564 TEUCHOS_TEST_FOR_EXCEPTION(B.
isFillComplete() , std::runtime_error,
565 prefix <<
"ERROR, input matrix B must not be fill complete!");
566 TEUCHOS_TEST_FOR_EXCEPTION(B.
isStaticGraph() , std::runtime_error,
567 prefix <<
"ERROR, input matrix B must not have static graph!");
569 prefix <<
"ERROR, input matrix B must not be locally indexed!");
571 using Teuchos::ParameterList;
572 RCP<ParameterList> transposeParams (
new ParameterList);
573 transposeParams->set (
"sort",
false);
575 RCP<const crs_matrix_type> Aprime = null;
577 transposer_type transposer (rcpFromRef (A));
578 Aprime = transposer.createTranspose (transposeParams);
581 Aprime = rcpFromRef(A);
589 if (scalarB != Teuchos::ScalarTraits<SC>::one())
593 if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
594 for (LO i = 0; (size_t)i < numMyRows; ++i) {
595 row = B.
getRowMap()->getGlobalElement(i);
596 Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
598 if (scalarA != Teuchos::ScalarTraits<SC>::one()) {
599 for (
size_t j = 0; j < a_numEntries; ++j)
600 a_vals[j] *= scalarA;
602 B.
insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
607 template <
class Scalar,
611 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
613 const bool transposeA,
616 const bool transposeB,
620 const Teuchos::RCP<Teuchos::ParameterList>& params)
623 using Teuchos::rcpFromRef;
625 using Teuchos::ParameterList;
627 if(!params.is_null())
629 TEUCHOS_TEST_FOR_EXCEPTION(
630 params->isParameter(
"Call fillComplete") && !params->get<
bool>(
"Call fillComplete"),
631 std::invalid_argument,
632 "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
633 "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
634 params->set(
"Call fillComplete",
true);
638 RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
645 TEUCHOS_TEST_FOR_EXCEPTION
646 (! A.
isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
647 "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
648 RCP<crs_matrix_type> C = rcp(
new crs_matrix_type(Brcp->getRowMap(), 0));
650 add(alpha, transposeA, A, beta,
false, *Brcp, *C, domainMap, rangeMap, params);
658 template<
class LO,
class GO,
class LOView,
class GOView,
class LocalMap>
659 struct ConvertGlobalToLocalFunctor
661 ConvertGlobalToLocalFunctor(LOView& lids_,
const GOView& gids_,
const LocalMap localColMap_)
662 : lids(lids_), gids(gids_), localColMap(localColMap_)
665 KOKKOS_FUNCTION
void operator() (
const GO i)
const
667 lids(i) = localColMap.getLocalElement(gids(i));
672 const LocalMap localColMap;
675 template <
class Scalar,
681 const bool transposeA,
684 const bool transposeB,
689 const Teuchos::RCP<Teuchos::ParameterList>& params)
693 using Teuchos::rcpFromRef;
694 using Teuchos::rcp_implicit_cast;
695 using Teuchos::rcp_dynamic_cast;
696 using Teuchos::TimeMonitor;
698 using LO = LocalOrdinal;
699 using GO = GlobalOrdinal;
707 using exec_space =
typename crs_graph_type::execution_space;
708 using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
709 const char* prefix_mmm =
"TpetraExt::MatrixMatrix::add: ";
710 constexpr
bool debug =
false;
712 #ifdef HAVE_TPETRA_MMM_TIMINGS
713 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"transpose"))));
717 std::ostringstream os;
718 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
719 <<
"TpetraExt::MatrixMatrix::add" << std::endl;
720 std::cerr << os.str ();
723 TEUCHOS_TEST_FOR_EXCEPTION
725 prefix_mmm <<
"C must be a 'new' matrix (neither locally nor globally indexed).");
726 TEUCHOS_TEST_FOR_EXCEPTION
728 prefix_mmm <<
"A and B must both be fill complete.");
729 #ifdef HAVE_TPETRA_DEBUG
732 const bool domainMapsSame =
733 (! transposeA && ! transposeB &&
735 (! transposeA && transposeB &&
737 ( transposeA && ! transposeB &&
739 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
740 prefix_mmm <<
"The domain Maps of Op(A) and Op(B) are not the same.");
742 const bool rangeMapsSame =
743 (! transposeA && ! transposeB &&
745 (! transposeA && transposeB &&
747 ( transposeA && ! transposeB &&
749 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
750 prefix_mmm <<
"The range Maps of Op(A) and Op(B) are not the same.");
752 #endif // HAVE_TPETRA_DEBUG
754 using Teuchos::ParameterList;
756 RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
758 transposer_type transposer (Aprime);
759 Aprime = transposer.createTranspose ();
762 #ifdef HAVE_TPETRA_DEBUG
763 TEUCHOS_TEST_FOR_EXCEPTION
764 (Aprime.is_null (), std::logic_error,
765 prefix_mmm <<
"Failed to compute Op(A). "
766 "Please report this bug to the Tpetra developers.");
767 #endif // HAVE_TPETRA_DEBUG
770 RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
773 std::ostringstream os;
774 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
775 <<
"Form explicit xpose of B" << std::endl;
776 std::cerr << os.str ();
778 transposer_type transposer (Bprime);
779 Bprime = transposer.createTranspose ();
781 #ifdef HAVE_TPETRA_DEBUG
782 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
783 prefix_mmm <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
784 TEUCHOS_TEST_FOR_EXCEPTION(
785 !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
786 prefix_mmm <<
"Aprime and Bprime must both be fill complete. "
787 "Please report this bug to the Tpetra developers.");
788 #endif // HAVE_TPETRA_DEBUG
789 RCP<const map_type> CDomainMap = domainMap;
790 RCP<const map_type> CRangeMap = rangeMap;
791 if(CDomainMap.is_null())
793 CDomainMap = Bprime->getDomainMap();
795 if(CRangeMap.is_null())
797 CRangeMap = Bprime->getRangeMap();
799 assert(!(CDomainMap.is_null()));
800 assert(!(CRangeMap.is_null()));
801 typedef typename AddKern::values_array values_array;
802 typedef typename AddKern::row_ptrs_array row_ptrs_array;
803 typedef typename AddKern::col_inds_array col_inds_array;
804 bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
805 bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
807 row_ptrs_array rowptrs;
808 col_inds_array colinds;
809 #ifdef HAVE_TPETRA_MMM_TIMINGS
811 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"rowmap check/import"))));
813 if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
816 auto import = rcp(
new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
821 Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *
import, Bprime->getDomainMap(), Bprime->getRangeMap());
823 bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
824 bool sorted = AGraphSorted && BGraphSorted;
825 RCP<const import_type> Cimport = Teuchos::null;
826 RCP<export_type> Cexport = Teuchos::null;
827 bool doFillComplete =
true;
828 if(Teuchos::nonnull(params) && params->isParameter(
"Call fillComplete"))
830 doFillComplete = params->get<
bool>(
"Call fillComplete");
832 auto Alocal = Aprime->getLocalMatrixDevice();
833 auto Blocal = Bprime->getLocalMatrixDevice();
834 LO numLocalRows = Alocal.numRows();
835 if(numLocalRows == 0)
842 rowptrs = row_ptrs_array(
"C rowptrs", 0);
844 auto Acolmap = Aprime->getColMap();
845 auto Bcolmap = Bprime->getColMap();
848 using global_col_inds_array =
typename AddKern::global_col_inds_array;
849 #ifdef HAVE_TPETRA_MMM_TIMINGS
851 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mismatched col map full kernel"))));
854 auto AlocalColmap = Acolmap->getLocalMap();
855 auto BlocalColmap = Bcolmap->getLocalMap();
856 global_col_inds_array globalColinds;
858 std::ostringstream os;
859 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
860 <<
"Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
861 std::cerr << os.str ();
863 AddKern::convertToGlobalAndAdd(
864 Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
865 vals, rowptrs, globalColinds);
867 std::ostringstream os;
868 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
869 <<
"Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
870 std::cerr << os.str ();
872 #ifdef HAVE_TPETRA_MMM_TIMINGS
874 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Constructing graph"))));
876 RCP<const map_type> CcolMap;
877 Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
878 (CcolMap, CDomainMap, globalColinds);
880 col_inds_array localColinds(
"C colinds", globalColinds.extent(0));
881 Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
882 ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
883 col_inds_array, global_col_inds_array,
884 typename map_type::local_map_type>
885 (localColinds, globalColinds, CcolMap->getLocalMap()));
886 KokkosSparse::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
895 auto Avals = Alocal.values;
896 auto Bvals = Blocal.values;
897 auto Arowptrs = Alocal.graph.row_map;
898 auto Browptrs = Blocal.graph.row_map;
899 auto Acolinds = Alocal.graph.entries;
900 auto Bcolinds = Blocal.graph.entries;
904 #ifdef HAVE_TPETRA_MMM_TIMINGS
906 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"sorted entries full kernel"))));
909 std::ostringstream os;
910 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
911 <<
"Call AddKern::addSorted(...)" << std::endl;
912 std::cerr << os.str ();
914 #if KOKKOSKERNELS_VERSION >= 40299
915 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
917 AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
923 #ifdef HAVE_TPETRA_MMM_TIMINGS
925 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"mm add unsorted entries full kernel"))));
928 std::ostringstream os;
929 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
930 <<
"Call AddKern::addUnsorted(...)" << std::endl;
931 std::cerr << os.str ();
933 AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
936 RCP<const map_type> Ccolmap = Bcolmap;
941 #ifdef HAVE_TPETRA_MMM_TIMINGS
943 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Tpetra::Crs expertStaticFillComplete"))));
945 if(!CDomainMap->isSameAs(*Ccolmap))
948 std::ostringstream os;
949 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
950 <<
"Create Cimport" << std::endl;
951 std::cerr << os.str ();
953 Cimport = rcp(
new import_type(CDomainMap, Ccolmap));
958 std::ostringstream os;
959 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
960 <<
"Create Cexport" << std::endl;
961 std::cerr << os.str ();
963 Cexport = rcp(
new export_type(C.
getRowMap(), CRangeMap));
967 std::ostringstream os;
968 os <<
"Proc " << A.
getMap ()->getComm ()->getRank () <<
": "
969 <<
"Call C->expertStaticFillComplete(...)" << std::endl;
970 std::cerr << os.str ();
979 template <
class Scalar,
992 using Teuchos::Array;
993 using Teuchos::ArrayRCP;
994 using Teuchos::ArrayView;
997 using Teuchos::rcp_dynamic_cast;
998 using Teuchos::rcpFromRef;
999 using Teuchos::tuple;
1002 typedef Teuchos::ScalarTraits<Scalar> STS;
1010 std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
1012 TEUCHOS_TEST_FOR_EXCEPTION(
1014 prefix <<
"A and B must both be fill complete before calling this function.");
1018 prefix <<
"C is null (must be allocated), but A.haveGlobalConstants() is false. "
1019 "Please report this bug to the Tpetra developers.");
1021 prefix <<
"C is null (must be allocated), but B.haveGlobalConstants() is false. "
1022 "Please report this bug to the Tpetra developers.");
1025 #ifdef HAVE_TPETRA_DEBUG
1027 const bool domainMapsSame =
1031 TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
1032 prefix <<
"The domain Maps of Op(A) and Op(B) are not the same.");
1034 const bool rangeMapsSame =
1038 TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
1039 prefix <<
"The range Maps of Op(A) and Op(B) are not the same.");
1041 #endif // HAVE_TPETRA_DEBUG
1043 using Teuchos::ParameterList;
1044 RCP<ParameterList> transposeParams (
new ParameterList);
1045 transposeParams->set (
"sort",
false);
1048 RCP<const crs_matrix_type> Aprime;
1050 transposer_type theTransposer (rcpFromRef (A));
1051 Aprime = theTransposer.createTranspose (transposeParams);
1054 Aprime = rcpFromRef (A);
1057 #ifdef HAVE_TPETRA_DEBUG
1058 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1059 prefix <<
"Failed to compute Op(A). Please report this bug to the Tpetra developers.");
1060 #endif // HAVE_TPETRA_DEBUG
1063 RCP<const crs_matrix_type> Bprime;
1065 transposer_type theTransposer (rcpFromRef (B));
1066 Bprime = theTransposer.createTranspose (transposeParams);
1069 Bprime = rcpFromRef (B);
1072 #ifdef HAVE_TPETRA_DEBUG
1073 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1074 prefix <<
"Failed to compute Op(B). Please report this bug to the Tpetra developers.");
1075 #endif // HAVE_TPETRA_DEBUG
1077 bool CwasFillComplete =
false;
1080 if (! C.is_null ()) {
1081 CwasFillComplete = C->isFillComplete();
1082 if(CwasFillComplete)
1084 C->setAllToScalar (STS::zero ());
1094 if(Aprime->getRowMap()->isSameAs(*Bprime->getRowMap())) {
1095 LocalOrdinal numLocalRows = Aprime->getLocalNumRows();
1096 Array<size_t> CmaxEntriesPerRow(numLocalRows);
1097 for(LocalOrdinal i = 0; i < numLocalRows; i++) {
1098 CmaxEntriesPerRow[i] = Aprime->getNumEntriesInLocalRow(i) + Bprime->getNumEntriesInLocalRow(i);
1100 C = rcp (
new crs_matrix_type (Aprime->getRowMap (), CmaxEntriesPerRow()));
1104 C = rcp (
new crs_matrix_type (Aprime->getRowMap (), Aprime->getGlobalMaxNumRowEntries() + Bprime->getGlobalMaxNumRowEntries()));
1108 #ifdef HAVE_TPETRA_DEBUG
1109 TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1110 prefix <<
"At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1111 TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1112 prefix <<
"At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1113 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1114 prefix <<
"At this point, C is null. Please report this bug to the Tpetra developers.");
1115 #endif // HAVE_TPETRA_DEBUG
1117 Array<RCP<const crs_matrix_type> > Mat =
1118 tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1119 Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1122 for (
int k = 0; k < 2; ++k) {
1123 typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1124 typename crs_matrix_type::nonconst_values_host_view_type Values;
1132 #ifdef HAVE_TPETRA_DEBUG
1133 TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1134 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1135 #endif // HAVE_TPETRA_DEBUG
1136 RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1137 #ifdef HAVE_TPETRA_DEBUG
1138 TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1139 prefix <<
"At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1140 #endif // HAVE_TPETRA_DEBUG
1142 const size_t localNumRows = Mat[k]->getLocalNumRows ();
1143 for (
size_t i = 0; i < localNumRows; ++i) {
1144 const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1145 size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1146 if (numEntries > 0) {
1147 if(numEntries > Indices.extent(0)) {
1148 Kokkos::resize(Indices, numEntries);
1149 Kokkos::resize(Values, numEntries);
1151 Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1153 if (scalar[k] != STS::one ()) {
1154 for (
size_t j = 0; j < numEntries; ++j) {
1155 Values[j] *= scalar[k];
1159 if (CwasFillComplete) {
1160 size_t result = C->sumIntoGlobalValues (globalRow, numEntries,
1161 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1162 TEUCHOS_TEST_FOR_EXCEPTION(result != numEntries, std::logic_error,
1163 prefix <<
"sumIntoGlobalValues failed to add entries from A or B into C.");
1165 C->insertGlobalValues (globalRow, numEntries,
1166 reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1171 if(CwasFillComplete) {
1172 C->fillComplete(C->getDomainMap (),
1179 template <
class Scalar,
1181 class GlobalOrdinal,
1192 std::string prefix =
"TpetraExt::MatrixMatrix::Add(): ";
1194 TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::invalid_argument,
1195 prefix <<
"C must not be null");
1197 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > C_ = C;
1198 Add(A, transposeA, scalarA, B, transposeB, scalarB, C_);
1203 namespace MMdetails{
1255 template<
class Scalar,
1257 class GlobalOrdinal,
1259 void mult_AT_B_newmatrix(
1260 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1261 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1262 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1263 const std::string & label,
1264 const Teuchos::RCP<Teuchos::ParameterList>& params)
1269 typedef LocalOrdinal LO;
1270 typedef GlobalOrdinal GO;
1272 typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1273 typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1275 #ifdef HAVE_TPETRA_MMM_TIMINGS
1276 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1277 using Teuchos::TimeMonitor;
1278 RCP<TimeMonitor> MM = rcp (
new TimeMonitor
1279 (*TimeMonitor::getNewTimer (prefix_mmm +
"MMM-T Transpose")));
1285 transposer_type transposer (rcpFromRef (A), label + std::string(
"XP: "));
1287 using Teuchos::ParameterList;
1288 RCP<ParameterList> transposeParams (
new ParameterList);
1289 transposeParams->set (
"sort",
true);
1290 if(! params.is_null ()) {
1291 transposeParams->set (
"compute global constants",
1292 params->get (
"compute global constants: temporaries",
1295 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1296 transposer.createTransposeLocal (transposeParams);
1301 #ifdef HAVE_TPETRA_MMM_TIMINGS
1303 MM = rcp (
new TimeMonitor
1304 (*TimeMonitor::getNewTimer (prefix_mmm + std::string (
"MMM-T I&X"))));
1308 crs_matrix_struct_type Aview;
1309 crs_matrix_struct_type Bview;
1310 RCP<const Import<LO, GO, NO> > dummyImporter;
1313 RCP<Teuchos::ParameterList> importParams1 (
new ParameterList);
1314 if (! params.is_null ()) {
1315 importParams1->set (
"compute global constants",
1316 params->get (
"compute global constants: temporaries",
1318 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1319 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1320 bool overrideAllreduce = slist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1321 int mm_optimization_core_count =
1323 mm_optimization_core_count =
1324 slist.get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1325 int mm_optimization_core_count2 =
1326 params->get (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1327 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1328 mm_optimization_core_count = mm_optimization_core_count2;
1330 auto & sip1 = importParams1->sublist (
"matrixmatrix: kernel params",
false);
1331 sip1.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1332 sip1.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1333 sip1.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1336 MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1337 Aview, dummyImporter,
true,
1338 label, importParams1);
1340 RCP<ParameterList> importParams2 (
new ParameterList);
1341 if (! params.is_null ()) {
1342 importParams2->set (
"compute global constants",
1343 params->get (
"compute global constants: temporaries",
1345 auto slist = params->sublist (
"matrixmatrix: kernel params",
false);
1346 bool isMM = slist.get (
"isMatrixMatrix_TransferAndFillComplete",
false);
1347 bool overrideAllreduce = slist.get (
"MM_TAFC_OverrideAllreduceCheck",
false);
1348 int mm_optimization_core_count =
1350 mm_optimization_core_count =
1351 slist.get (
"MM_TAFC_OptimizationCoreCount",
1352 mm_optimization_core_count);
1353 int mm_optimization_core_count2 =
1354 params->get (
"MM_TAFC_OptimizationCoreCount",
1355 mm_optimization_core_count);
1356 if (mm_optimization_core_count2 < mm_optimization_core_count) {
1357 mm_optimization_core_count = mm_optimization_core_count2;
1359 auto & sip2 = importParams2->sublist (
"matrixmatrix: kernel params",
false);
1360 sip2.set (
"MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1361 sip2.set (
"isMatrixMatrix_TransferAndFillComplete", isMM);
1362 sip2.set (
"MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1365 if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1366 MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,
true, label,importParams2);
1369 MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,
false, label,importParams2);
1372 #ifdef HAVE_TPETRA_MMM_TIMINGS
1374 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T AB-core"))));
1377 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1380 bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1381 if (needs_final_export) {
1385 Ctemp = rcp (&C,
false);
1388 mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1393 #ifdef HAVE_TPETRA_MMM_TIMINGS
1395 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM-T exportAndFillComplete"))));
1398 RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C,
false);
1400 if (needs_final_export) {
1401 ParameterList labelList;
1402 labelList.set(
"Timer Label", label);
1403 if(!params.is_null()) {
1404 ParameterList& params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
1405 ParameterList& labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
1407 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1408 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1409 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1410 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,
"Core Count above which the optimized neighbor discovery is used");
1411 bool isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
1412 bool overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
1414 labelList_subList.set (
"isMatrixMatrix_TransferAndFillComplete", isMM,
1415 "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1416 labelList.set(
"compute global constants",params->get(
"compute global constants",
true));
1417 labelList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1419 Ctemp->exportAndFillComplete (Crcp,
1420 *Ctemp->getGraph ()->getExporter (),
1423 rcp (&labelList,
false));
1425 #ifdef HAVE_TPETRA_MMM_STATISTICS
1426 printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(
" AT_B MMM"));
1432 template<
class Scalar,
1434 class GlobalOrdinal,
1437 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1438 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1439 CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1440 const std::string& ,
1441 const Teuchos::RCP<Teuchos::ParameterList>& )
1443 using Teuchos::Array;
1444 using Teuchos::ArrayRCP;
1445 using Teuchos::ArrayView;
1446 using Teuchos::OrdinalTraits;
1447 using Teuchos::null;
1449 typedef Teuchos::ScalarTraits<Scalar> STS;
1451 LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1452 LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1454 LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1455 LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1457 ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getLocalElementList();
1458 ArrayView<const GlobalOrdinal> bcols_import = null;
1459 if (Bview.importColMap != null) {
1460 C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1461 C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1463 bcols_import = Bview.importColMap->getLocalElementList();
1466 size_t C_numCols = C_lastCol - C_firstCol +
1467 OrdinalTraits<LocalOrdinal>::one();
1468 size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1469 OrdinalTraits<LocalOrdinal>::one();
1471 if (C_numCols_import > C_numCols)
1472 C_numCols = C_numCols_import;
1474 Array<Scalar> dwork = Array<Scalar>(C_numCols);
1475 Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1476 Array<size_t> iwork2 = Array<size_t>(C_numCols);
1478 Array<Scalar> C_row_i = dwork;
1479 Array<GlobalOrdinal> C_cols = iwork;
1480 Array<size_t> c_index = iwork2;
1481 Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1482 Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1484 size_t C_row_i_length, j, k, last_index;
1487 LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1488 Array<LocalOrdinal> Acol2Brow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1489 Array<LocalOrdinal> Acol2Irow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1490 if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1492 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1493 Aview.colMap->getMaxLocalIndex(); i++)
1498 for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1499 Aview.colMap->getMaxLocalIndex(); i++) {
1500 GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1501 LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1502 if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1503 else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1513 auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1514 auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1515 auto Avals = Aview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1516 auto Browptr = Bview.origMatrix->getLocalRowPtrsHost();
1517 auto Bcolind = Bview.origMatrix->getLocalIndicesHost();
1518 auto Bvals = Bview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1519 decltype(Browptr) Irowptr;
1520 decltype(Bcolind) Icolind;
1521 decltype(Bvals) Ivals;
1522 if(!Bview.importMatrix.is_null()) {
1523 Irowptr = Bview.importMatrix->getLocalRowPtrsHost();
1524 Icolind = Bview.importMatrix->getLocalIndicesHost();
1525 Ivals = Bview.importMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1528 bool C_filled = C.isFillComplete();
1530 for (
size_t i = 0; i < C_numCols; i++)
1531 c_index[i] = OrdinalTraits<size_t>::invalid();
1534 size_t Arows = Aview.rowMap->getLocalNumElements();
1535 for(
size_t i=0; i<Arows; ++i) {
1539 GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1545 C_row_i_length = OrdinalTraits<size_t>::zero();
1547 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1548 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1549 const Scalar Aval = Avals[k];
1550 if (Aval == STS::zero())
1553 if (Ak == LO_INVALID)
1556 for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1557 LocalOrdinal col = Bcolind[j];
1560 if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1563 C_row_i[C_row_i_length] = Aval*Bvals[j];
1564 C_cols[C_row_i_length] = col;
1565 c_index[col] = C_row_i_length;
1570 C_row_i[c_index[col]] += Aval *
static_cast<Scalar
>(Bvals[j]);
1575 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1576 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1577 C_cols[ii] = bcols[C_cols[ii]];
1578 combined_index[ii] = C_cols[ii];
1579 combined_values[ii] = C_row_i[ii];
1581 last_index = C_row_i_length;
1587 C_row_i_length = OrdinalTraits<size_t>::zero();
1589 for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1590 LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1591 const Scalar Aval = Avals[k];
1592 if (Aval == STS::zero())
1595 if (Ak!=LO_INVALID)
continue;
1597 Ak = Acol2Irow[Acolind[k]];
1598 for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1599 LocalOrdinal col = Icolind[j];
1602 if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1605 C_row_i[C_row_i_length] = Aval*Ivals[j];
1606 C_cols[C_row_i_length] = col;
1607 c_index[col] = C_row_i_length;
1613 C_row_i[c_index[col]] += Aval *
static_cast<Scalar
>(Ivals[j]);
1618 for (
size_t ii = 0; ii < C_row_i_length; ii++) {
1619 c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1620 C_cols[ii] = bcols_import[C_cols[ii]];
1621 combined_index[last_index] = C_cols[ii];
1622 combined_values[last_index] = C_row_i[ii];
1629 C.sumIntoGlobalValues(
1631 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1632 combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1634 C.insertGlobalValues(
1636 combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1637 combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1643 template<
class Scalar,
1645 class GlobalOrdinal,
1647 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1648 typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1649 Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1651 if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1652 Mview.maxNumRowEntries = Mview.indices[0].size();
1654 for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1655 if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1656 Mview.maxNumRowEntries = Mview.indices[i].size();
1661 template<
class CrsMatrixType>
1662 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1664 size_t Aest = 100, Best=100;
1665 if (A.getLocalNumEntries() >= A.getLocalNumRows())
1666 Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
1667 if (B.getLocalNumEntries() >= B.getLocalNumRows())
1668 Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
1670 size_t nnzperrow = (size_t)(sqrt((
double)Aest) + sqrt((
double)Best) - 1);
1671 nnzperrow *= nnzperrow;
1673 return (
size_t)(A.getLocalNumRows()*nnzperrow*0.75 + 100);
1683 template<
class Scalar,
1685 class GlobalOrdinal,
1687 void mult_A_B_newmatrix(
1688 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1689 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1690 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1691 const std::string& label,
1692 const Teuchos::RCP<Teuchos::ParameterList>& params)
1694 using Teuchos::Array;
1695 using Teuchos::ArrayRCP;
1696 using Teuchos::ArrayView;
1701 typedef LocalOrdinal LO;
1702 typedef GlobalOrdinal GO;
1704 typedef Import<LO,GO,NO> import_type;
1705 typedef Map<LO,GO,NO> map_type;
1708 typedef typename map_type::local_map_type local_map_type;
1710 typedef typename KCRS::StaticCrsGraphType graph_t;
1711 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1712 typedef typename NO::execution_space execution_space;
1713 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1714 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1716 #ifdef HAVE_TPETRA_MMM_TIMINGS
1717 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
1718 using Teuchos::TimeMonitor;
1719 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM M5 Cmap")))));
1721 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1724 RCP<const import_type> Cimport;
1725 RCP<const map_type> Ccolmap;
1726 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1727 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1728 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1729 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1730 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1731 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1732 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1733 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1740 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1742 if (Bview.importMatrix.is_null()) {
1745 Ccolmap = Bview.colMap;
1746 const LO colMapSize =
static_cast<LO
>(Bview.colMap->getLocalNumElements());
1748 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1749 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1750 KOKKOS_LAMBDA(
const LO i) {
1763 if (!Bimport.is_null() && !Iimport.is_null()) {
1764 Cimport = Bimport->setUnion(*Iimport);
1766 else if (!Bimport.is_null() && Iimport.is_null()) {
1767 Cimport = Bimport->setUnion();
1769 else if (Bimport.is_null() && !Iimport.is_null()) {
1770 Cimport = Iimport->setUnion();
1773 throw std::runtime_error(
"TpetraExt::MMM status of matrix importers is nonsensical");
1775 Ccolmap = Cimport->getTargetMap();
1780 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1781 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1788 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1789 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1790 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
1791 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1793 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
1794 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1803 C.replaceColMap(Ccolmap);
1821 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
1822 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
1824 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
1825 GO aidx = Acolmap_local.getGlobalElement(i);
1826 LO B_LID = Browmap_local.getLocalElement(aidx);
1827 if (B_LID != LO_INVALID) {
1828 targetMapToOrigRow(i) = B_LID;
1829 targetMapToImportRow(i) = LO_INVALID;
1831 LO I_LID = Irowmap_local.getLocalElement(aidx);
1832 targetMapToOrigRow(i) = LO_INVALID;
1833 targetMapToImportRow(i) = I_LID;
1840 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1846 template<
class Scalar,
1848 class GlobalOrdinal,
1850 void mult_A_B_newmatrix(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1851 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1852 Teuchos::RCP<BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > &C)
1854 using Teuchos::null;
1855 using Teuchos::Array;
1856 using Teuchos::ArrayRCP;
1857 using Teuchos::ArrayView;
1862 typedef LocalOrdinal LO;
1863 typedef GlobalOrdinal GO;
1865 typedef Import<LO,GO,NO> import_type;
1866 typedef Map<LO,GO,NO> map_type;
1867 typedef BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> block_crs_matrix_type;
1868 typedef typename block_crs_matrix_type::crs_graph_type graph_t;
1871 typedef typename map_type::local_map_type local_map_type;
1872 typedef typename block_crs_matrix_type::local_matrix_device_type KBSR;
1873 typedef typename KBSR::device_type device_t;
1874 typedef typename KBSR::StaticCrsGraphType static_graph_t;
1875 typedef typename static_graph_t::row_map_type::non_const_type lno_view_t;
1876 typedef typename static_graph_t::entries_type::non_const_type lno_nnz_view_t;
1877 typedef typename KBSR::values_type::non_const_type scalar_view_t;
1878 typedef typename NO::execution_space execution_space;
1879 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1880 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1882 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1885 RCP<const import_type> Cimport;
1886 RCP<const map_type> Ccolmap;
1887 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1888 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1889 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1890 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1891 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1892 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1893 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1894 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1901 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1903 if (Bview.importMatrix.is_null()) {
1906 Ccolmap = Bview.colMap;
1907 const LO colMapSize =
static_cast<LO
>(Bview.colMap->getLocalNumElements());
1909 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1910 Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1911 KOKKOS_LAMBDA(
const LO i) {
1924 if (!Bimport.is_null() && !Iimport.is_null()) {
1925 Cimport = Bimport->setUnion(*Iimport);
1927 else if (!Bimport.is_null() && Iimport.is_null()) {
1928 Cimport = Bimport->setUnion();
1930 else if (Bimport.is_null() && !Iimport.is_null()) {
1931 Cimport = Iimport->setUnion();
1934 throw std::runtime_error(
"TpetraExt::MMM status of matrix importers is nonsensical");
1936 Ccolmap = Cimport->getTargetMap();
1943 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1944 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1945 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",
1946 range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),
1947 KOKKOS_LAMBDA(
const LO i) {
1948 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1950 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",
1951 range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),
1952 KOKKOS_LAMBDA(
const LO i) {
1953 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1973 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),
1974 Aview.colMap->getLocalNumElements());
1975 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),
1976 Aview.colMap->getLocalNumElements());
1978 Kokkos::parallel_for(
"Tpetra::mult_A_B_newmatrix::construct_tables",
1979 range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),
1980 KOKKOS_LAMBDA(
const LO i) {
1981 GO aidx = Acolmap_local.getGlobalElement(i);
1982 LO B_LID = Browmap_local.getLocalElement(aidx);
1983 if (B_LID != LO_INVALID) {
1984 targetMapToOrigRow(i) = B_LID;
1985 targetMapToImportRow(i) = LO_INVALID;
1987 LO I_LID = Irowmap_local.getLocalElement(aidx);
1988 targetMapToOrigRow(i) = LO_INVALID;
1989 targetMapToImportRow(i) = I_LID;
1994 using KernelHandle =
1995 KokkosKernels::Experimental::KokkosKernelsHandle<
typename lno_view_t::const_value_type,
1996 typename lno_nnz_view_t::const_value_type,
1997 typename scalar_view_t::const_value_type,
1998 typename device_t::execution_space,
1999 typename device_t::memory_space,
2000 typename device_t::memory_space>;
2001 int team_work_size = 16;
2002 std::string myalg(
"SPGEMM_KK_MEMORY");
2003 KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
2006 kh.create_spgemm_handle(alg_enum);
2007 kh.set_team_work_size(team_work_size);
2010 const KBSR Amat = Aview.origMatrix->getLocalMatrixDevice();
2011 const KBSR Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,
2012 targetMapToOrigRow,targetMapToImportRow,
2013 Bcol2Ccol,Icol2Ccol,
2014 Ccolmap.getConst()->getLocalNumElements());
2016 RCP<graph_t> graphC;
2017 typename KBSR::values_type values;
2022 KokkosSparse::block_spgemm_symbolic(kh, Amat,
false, Bmerged,
false, Cmat);
2023 KokkosSparse::block_spgemm_numeric (kh, Amat,
false, Bmerged,
false, Cmat);
2024 kh.destroy_spgemm_handle();
2027 graphC = rcp(
new graph_t(Cmat.graph, Aview.origMatrix->getRowMap(), Ccolmap.getConst()));
2028 values = Cmat.values;
2030 C = rcp (
new block_crs_matrix_type (*graphC, values, Aview.blocksize));
2036 template<
class Scalar,
2038 class GlobalOrdinal,
2040 class LocalOrdinalViewType>
2041 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2042 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2043 const LocalOrdinalViewType & targetMapToOrigRow,
2044 const LocalOrdinalViewType & targetMapToImportRow,
2045 const LocalOrdinalViewType & Bcol2Ccol,
2046 const LocalOrdinalViewType & Icol2Ccol,
2047 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2048 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2049 const std::string& label,
2050 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2051 #ifdef HAVE_TPETRA_MMM_TIMINGS
2052 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2053 using Teuchos::TimeMonitor;
2054 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore"))));
2057 using Teuchos::Array;
2058 using Teuchos::ArrayRCP;
2059 using Teuchos::ArrayView;
2064 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2065 typedef typename KCRS::StaticCrsGraphType graph_t;
2066 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2067 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2068 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2069 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2072 typedef LocalOrdinal LO;
2073 typedef GlobalOrdinal GO;
2075 typedef Map<LO,GO,NO> map_type;
2076 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2077 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2078 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2081 RCP<const map_type> Ccolmap = C.getColMap();
2082 size_t m = Aview.origMatrix->getLocalNumRows();
2083 size_t n = Ccolmap->getLocalNumElements();
2084 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2087 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2088 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2090 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2091 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2092 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2094 c_lno_view_t Irowptr;
2095 lno_nnz_view_t Icolind;
2096 scalar_view_t Ivals;
2097 if(!Bview.importMatrix.is_null()) {
2098 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2099 Irowptr = lclB.graph.row_map;
2100 Icolind = lclB.graph.entries;
2101 Ivals = lclB.values;
2102 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2105 #ifdef HAVE_TPETRA_MMM_TIMINGS
2106 RCP<TimeMonitor> MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
2116 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2117 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
2118 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
2119 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
2129 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2130 std::vector<size_t> c_status(n, ST_INVALID);
2140 size_t CSR_ip = 0, OLD_ip = 0;
2141 for (
size_t i = 0; i < m; i++) {
2144 Crowptr[i] = CSR_ip;
2147 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2148 LO Aik = Acolind[k];
2149 const SC Aval = Avals[k];
2150 if (Aval == SC_ZERO)
2153 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2160 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2163 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2164 LO Bkj = Bcolind[j];
2165 LO Cij = Bcol2Ccol[Bkj];
2167 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2169 c_status[Cij] = CSR_ip;
2170 Ccolind[CSR_ip] = Cij;
2171 Cvals[CSR_ip] = Aval*Bvals[j];
2175 Cvals[c_status[Cij]] += Aval*Bvals[j];
2186 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2187 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2188 LO Ikj = Icolind[j];
2189 LO Cij = Icol2Ccol[Ikj];
2191 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
2193 c_status[Cij] = CSR_ip;
2194 Ccolind[CSR_ip] = Cij;
2195 Cvals[CSR_ip] = Aval*Ivals[j];
2198 Cvals[c_status[Cij]] += Aval*Ivals[j];
2205 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
2207 Kokkos::resize(Ccolind,CSR_alloc);
2208 Kokkos::resize(Cvals,CSR_alloc);
2213 Crowptr[m] = CSR_ip;
2216 Kokkos::resize(Ccolind,CSR_ip);
2217 Kokkos::resize(Cvals,CSR_ip);
2219 #ifdef HAVE_TPETRA_MMM_TIMINGS
2221 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix Final Sort")));
2225 if (params.is_null() || params->get(
"sort entries",
true))
2226 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2227 C.setAllValues(Crowptr,Ccolind, Cvals);
2230 #ifdef HAVE_TPETRA_MMM_TIMINGS
2232 auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix ESFC")));
2244 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2245 labelList->set(
"Timer Label",label);
2246 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2247 RCP<const Export<LO,GO,NO> > dummyExport;
2248 C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2249 #ifdef HAVE_TPETRA_MMM_TIMINGS
2251 MM2 = Teuchos::null;
2257 template<
class Scalar,
2259 class GlobalOrdinal,
2261 void mult_A_B_reuse(
2262 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2263 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2264 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2265 const std::string& label,
2266 const Teuchos::RCP<Teuchos::ParameterList>& params)
2268 using Teuchos::Array;
2269 using Teuchos::ArrayRCP;
2270 using Teuchos::ArrayView;
2275 typedef LocalOrdinal LO;
2276 typedef GlobalOrdinal GO;
2278 typedef Import<LO,GO,NO> import_type;
2279 typedef Map<LO,GO,NO> map_type;
2282 typedef typename map_type::local_map_type local_map_type;
2284 typedef typename KCRS::StaticCrsGraphType graph_t;
2285 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2286 typedef typename NO::execution_space execution_space;
2287 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2288 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2290 #ifdef HAVE_TPETRA_MMM_TIMINGS
2291 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2292 using Teuchos::TimeMonitor;
2293 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse Cmap"))));
2295 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2298 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2299 RCP<const map_type> Ccolmap = C.getColMap();
2300 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2301 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2302 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2303 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2304 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2305 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2308 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2312 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2313 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2316 if (!Bview.importMatrix.is_null()) {
2317 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2318 std::runtime_error,
"Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2320 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2321 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2322 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2328 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2329 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2330 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2331 GO aidx = Acolmap_local.getGlobalElement(i);
2332 LO B_LID = Browmap_local.getLocalElement(aidx);
2333 if (B_LID != LO_INVALID) {
2334 targetMapToOrigRow(i) = B_LID;
2335 targetMapToImportRow(i) = LO_INVALID;
2337 LO I_LID = Irowmap_local.getLocalElement(aidx);
2338 targetMapToOrigRow(i) = LO_INVALID;
2339 targetMapToImportRow(i) = I_LID;
2346 KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2350 template<
class Scalar,
2352 class GlobalOrdinal,
2354 class LocalOrdinalViewType>
2355 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2356 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2357 const LocalOrdinalViewType & targetMapToOrigRow,
2358 const LocalOrdinalViewType & targetMapToImportRow,
2359 const LocalOrdinalViewType & Bcol2Ccol,
2360 const LocalOrdinalViewType & Icol2Ccol,
2361 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2362 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
2363 const std::string& label,
2364 const Teuchos::RCP<Teuchos::ParameterList>& ) {
2365 #ifdef HAVE_TPETRA_MMM_TIMINGS
2366 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2367 using Teuchos::TimeMonitor;
2368 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
2369 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2378 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2379 typedef typename KCRS::StaticCrsGraphType graph_t;
2380 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2381 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2382 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2385 typedef LocalOrdinal LO;
2386 typedef GlobalOrdinal GO;
2388 typedef Map<LO,GO,NO> map_type;
2389 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2390 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2391 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2394 RCP<const map_type> Ccolmap = C.getColMap();
2395 size_t m = Aview.origMatrix->getLocalNumRows();
2396 size_t n = Ccolmap->getLocalNumElements();
2399 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2400 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2401 const KCRS & Cmat = C.getLocalMatrixHost();
2403 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2404 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2405 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2406 scalar_view_t Cvals = Cmat.values;
2408 c_lno_view_t Irowptr;
2409 lno_nnz_view_t Icolind;
2410 scalar_view_t Ivals;
2411 if(!Bview.importMatrix.is_null()) {
2412 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2413 Irowptr = lclB.graph.row_map;
2414 Icolind = lclB.graph.entries;
2415 Ivals = lclB.values;
2418 #ifdef HAVE_TPETRA_MMM_TIMINGS
2419 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
2431 std::vector<size_t> c_status(n, ST_INVALID);
2434 size_t CSR_ip = 0, OLD_ip = 0;
2435 for (
size_t i = 0; i < m; i++) {
2438 OLD_ip = Crowptr[i];
2439 CSR_ip = Crowptr[i+1];
2440 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
2441 c_status[Ccolind[k]] = k;
2447 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2448 LO Aik = Acolind[k];
2449 const SC Aval = Avals[k];
2450 if (Aval == SC_ZERO)
2453 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2455 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2457 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2458 LO Bkj = Bcolind[j];
2459 LO Cij = Bcol2Ccol[Bkj];
2461 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2462 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2463 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2465 Cvals[c_status[Cij]] += Aval * Bvals[j];
2470 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2471 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2472 LO Ikj = Icolind[j];
2473 LO Cij = Icol2Ccol[Ikj];
2475 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2476 std::runtime_error,
"Trying to insert a new entry (" << i <<
"," << Cij <<
") into a static graph " <<
2477 "(c_status = " << c_status[Cij] <<
" of [" << OLD_ip <<
"," << CSR_ip <<
"))");
2479 Cvals[c_status[Cij]] += Aval * Ivals[j];
2485 #ifdef HAVE_TPETRA_MMM_TIMINGS
2486 auto MM3 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse ESFC"))));
2489 C.fillComplete(C.getDomainMap(), C.getRangeMap());
2495 template<
class Scalar,
2497 class GlobalOrdinal,
2499 void jacobi_A_B_newmatrix(
2501 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2502 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2503 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2504 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2505 const std::string& label,
2506 const Teuchos::RCP<Teuchos::ParameterList>& params)
2508 using Teuchos::Array;
2509 using Teuchos::ArrayRCP;
2510 using Teuchos::ArrayView;
2514 typedef LocalOrdinal LO;
2515 typedef GlobalOrdinal GO;
2518 typedef Import<LO,GO,NO> import_type;
2519 typedef Map<LO,GO,NO> map_type;
2520 typedef typename map_type::local_map_type local_map_type;
2524 typedef typename KCRS::StaticCrsGraphType graph_t;
2525 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2526 typedef typename NO::execution_space execution_space;
2527 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2528 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2531 #ifdef HAVE_TPETRA_MMM_TIMINGS
2532 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2533 using Teuchos::TimeMonitor;
2534 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi M5 Cmap"))));
2536 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2539 RCP<const import_type> Cimport;
2540 RCP<const map_type> Ccolmap;
2541 RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2542 RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2543 Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2544 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2545 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2546 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2547 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2548 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2555 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2557 if (Bview.importMatrix.is_null()) {
2560 Ccolmap = Bview.colMap;
2564 Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getLocalNumElements ()));
2565 Kokkos::parallel_for (range, KOKKOS_LAMBDA (
const size_t i) {
2566 Bcol2Ccol(i) =
static_cast<LO
> (i);
2577 if (!Bimport.is_null() && !Iimport.is_null()){
2578 Cimport = Bimport->setUnion(*Iimport);
2579 Ccolmap = Cimport->getTargetMap();
2581 }
else if (!Bimport.is_null() && Iimport.is_null()) {
2582 Cimport = Bimport->setUnion();
2584 }
else if(Bimport.is_null() && !Iimport.is_null()) {
2585 Cimport = Iimport->setUnion();
2588 throw std::runtime_error(
"TpetraExt::Jacobi status of matrix importers is nonsensical");
2590 Ccolmap = Cimport->getTargetMap();
2592 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2593 std::runtime_error,
"Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2600 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2601 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2602 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2603 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2605 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2606 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2615 C.replaceColMap(Ccolmap);
2633 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2634 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2635 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2636 GO aidx = Acolmap_local.getGlobalElement(i);
2637 LO B_LID = Browmap_local.getLocalElement(aidx);
2638 if (B_LID != LO_INVALID) {
2639 targetMapToOrigRow(i) = B_LID;
2640 targetMapToImportRow(i) = LO_INVALID;
2642 LO I_LID = Irowmap_local.getLocalElement(aidx);
2643 targetMapToOrigRow(i) = LO_INVALID;
2644 targetMapToImportRow(i) = I_LID;
2651 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2660 template<
class Scalar,
2662 class GlobalOrdinal,
2664 class LocalOrdinalViewType>
2665 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2666 const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2667 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2668 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2669 const LocalOrdinalViewType & targetMapToOrigRow,
2670 const LocalOrdinalViewType & targetMapToImportRow,
2671 const LocalOrdinalViewType & Bcol2Ccol,
2672 const LocalOrdinalViewType & Icol2Ccol,
2673 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2674 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2675 const std::string& label,
2676 const Teuchos::RCP<Teuchos::ParameterList>& params) {
2678 #ifdef HAVE_TPETRA_MMM_TIMINGS
2679 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2680 using Teuchos::TimeMonitor;
2681 auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Nemwmatrix SerialCore")));
2684 using Teuchos::Array;
2685 using Teuchos::ArrayRCP;
2686 using Teuchos::ArrayView;
2691 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2692 typedef typename KCRS::StaticCrsGraphType graph_t;
2693 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2694 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2695 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2696 typedef typename KCRS::values_type::non_const_type scalar_view_t;
2699 typedef typename scalar_view_t::memory_space scalar_memory_space;
2702 typedef LocalOrdinal LO;
2703 typedef GlobalOrdinal GO;
2706 typedef Map<LO,GO,NO> map_type;
2707 size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2708 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2711 RCP<const map_type> Ccolmap = C.getColMap();
2712 size_t m = Aview.origMatrix->getLocalNumRows();
2713 size_t n = Ccolmap->getLocalNumElements();
2714 size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2717 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2718 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2720 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2721 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2722 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2724 c_lno_view_t Irowptr;
2725 lno_nnz_view_t Icolind;
2726 scalar_view_t Ivals;
2727 if(!Bview.importMatrix.is_null()) {
2728 auto lclB = Bview.importMatrix->getLocalMatrixHost();
2729 Irowptr = lclB.graph.row_map;
2730 Icolind = lclB.graph.entries;
2731 Ivals = lclB.values;
2732 b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2737 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2745 size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2746 Array<size_t> c_status(n, ST_INVALID);
2755 size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2756 lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing(
"Crowptr"),m+1);
2757 lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing(
"Ccolind"),CSR_alloc);
2758 scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing(
"Cvals"),CSR_alloc);
2759 size_t CSR_ip = 0, OLD_ip = 0;
2761 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2775 for (
size_t i = 0; i < m; i++) {
2778 Crowptr[i] = CSR_ip;
2779 SC minusOmegaDval = -omega*Dvals(i,0);
2782 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2783 Scalar Bval = Bvals[j];
2784 if (Bval == SC_ZERO)
2786 LO Bij = Bcolind[j];
2787 LO Cij = Bcol2Ccol[Bij];
2790 c_status[Cij] = CSR_ip;
2791 Ccolind[CSR_ip] = Cij;
2792 Cvals[CSR_ip] = Bvals[j];
2797 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2798 LO Aik = Acolind[k];
2799 const SC Aval = Avals[k];
2800 if (Aval == SC_ZERO)
2803 if (targetMapToOrigRow[Aik] != LO_INVALID) {
2805 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
2807 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2808 LO Bkj = Bcolind[j];
2809 LO Cij = Bcol2Ccol[Bkj];
2811 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2813 c_status[Cij] = CSR_ip;
2814 Ccolind[CSR_ip] = Cij;
2815 Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2819 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2825 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
2826 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2827 LO Ikj = Icolind[j];
2828 LO Cij = Icol2Ccol[Ikj];
2830 if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2832 c_status[Cij] = CSR_ip;
2833 Ccolind[CSR_ip] = Cij;
2834 Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2837 Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2844 if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2846 Kokkos::resize(Ccolind,CSR_alloc);
2847 Kokkos::resize(Cvals,CSR_alloc);
2851 Crowptr[m] = CSR_ip;
2854 Kokkos::resize(Ccolind,CSR_ip);
2855 Kokkos::resize(Cvals,CSR_ip);
2858 #ifdef HAVE_TPETRA_MMM_TIMINGS
2859 auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix Final Sort")));
2866 C.replaceColMap(Ccolmap);
2873 if (params.is_null() || params->get(
"sort entries",
true))
2874 Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2875 C.setAllValues(Crowptr,Ccolind, Cvals);
2878 #ifdef HAVE_TPETRA_MMM_TIMINGS
2879 auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix ESFC")));
2890 RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
2891 labelList->set(
"Timer Label",label);
2892 if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
2893 RCP<const Export<LO,GO,NO> > dummyExport;
2894 C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2902 template<
class Scalar,
2904 class GlobalOrdinal,
2906 void jacobi_A_B_reuse(
2908 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2909 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2910 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2911 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2912 const std::string& label,
2913 const Teuchos::RCP<Teuchos::ParameterList>& params)
2915 using Teuchos::Array;
2916 using Teuchos::ArrayRCP;
2917 using Teuchos::ArrayView;
2921 typedef LocalOrdinal LO;
2922 typedef GlobalOrdinal GO;
2925 typedef Import<LO,GO,NO> import_type;
2926 typedef Map<LO,GO,NO> map_type;
2929 typedef typename map_type::local_map_type local_map_type;
2931 typedef typename KCRS::StaticCrsGraphType graph_t;
2932 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2933 typedef typename NO::execution_space execution_space;
2934 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2935 typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2937 #ifdef HAVE_TPETRA_MMM_TIMINGS
2938 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
2939 using Teuchos::TimeMonitor;
2940 RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse Cmap"))));
2942 LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2945 RCP<const import_type> Cimport = C.getGraph()->getImporter();
2946 RCP<const map_type> Ccolmap = C.getColMap();
2947 local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2948 local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2949 local_map_type Irowmap_local;
if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2950 local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2951 local_map_type Icolmap_local;
if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2952 local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2955 lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing(
"Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2959 Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2960 Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2963 if (!Bview.importMatrix.is_null()) {
2964 TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2965 std::runtime_error,
"Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2967 Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2968 Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(
const LO i) {
2969 Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2975 lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2976 lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing(
"targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2977 Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(
const LO i) {
2978 GO aidx = Acolmap_local.getGlobalElement(i);
2979 LO B_LID = Browmap_local.getLocalElement(aidx);
2980 if (B_LID != LO_INVALID) {
2981 targetMapToOrigRow(i) = B_LID;
2982 targetMapToImportRow(i) = LO_INVALID;
2984 LO I_LID = Irowmap_local.getLocalElement(aidx);
2985 targetMapToOrigRow(i) = LO_INVALID;
2986 targetMapToImportRow(i) = I_LID;
2991 #ifdef HAVE_TPETRA_MMM_TIMINGS
2997 KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
3003 template<
class Scalar,
3005 class GlobalOrdinal,
3007 class LocalOrdinalViewType>
3008 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
3009 const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
3010 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3011 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3012 const LocalOrdinalViewType & targetMapToOrigRow,
3013 const LocalOrdinalViewType & targetMapToImportRow,
3014 const LocalOrdinalViewType & Bcol2Ccol,
3015 const LocalOrdinalViewType & Icol2Ccol,
3016 CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
3017 Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Node> > ,
3018 const std::string& label,
3019 const Teuchos::RCP<Teuchos::ParameterList>& ) {
3020 #ifdef HAVE_TPETRA_MMM_TIMINGS
3021 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
3022 using Teuchos::TimeMonitor;
3023 Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore"))));
3024 Teuchos::RCP<Teuchos::TimeMonitor> MM2;
3032 typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
3033 typedef typename KCRS::StaticCrsGraphType graph_t;
3034 typedef typename graph_t::row_map_type::const_type c_lno_view_t;
3035 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3036 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3037 typedef typename scalar_view_t::memory_space scalar_memory_space;
3040 typedef LocalOrdinal LO;
3041 typedef GlobalOrdinal GO;
3043 typedef Map<LO,GO,NO> map_type;
3044 const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3045 const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3046 const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
3049 RCP<const map_type> Ccolmap = C.getColMap();
3050 size_t m = Aview.origMatrix->getLocalNumRows();
3051 size_t n = Ccolmap->getLocalNumElements();
3054 const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
3055 const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
3056 const KCRS & Cmat = C.getLocalMatrixHost();
3058 c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
3059 const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
3060 const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
3061 scalar_view_t Cvals = Cmat.values;
3063 c_lno_view_t Irowptr;
3064 lno_nnz_view_t Icolind;
3065 scalar_view_t Ivals;
3066 if(!Bview.importMatrix.is_null()) {
3067 auto lclB = Bview.importMatrix->getLocalMatrixHost();
3068 Irowptr = lclB.graph.row_map;
3069 Icolind = lclB.graph.entries;
3070 Ivals = lclB.values;
3075 Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
3077 #ifdef HAVE_TPETRA_MMM_TIMINGS
3078 MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse SerialCore - Compare"))));
3086 std::vector<size_t> c_status(n, ST_INVALID);
3089 size_t CSR_ip = 0, OLD_ip = 0;
3090 for (
size_t i = 0; i < m; i++) {
3094 OLD_ip = Crowptr[i];
3095 CSR_ip = Crowptr[i+1];
3096 for (
size_t k = OLD_ip; k < CSR_ip; k++) {
3097 c_status[Ccolind[k]] = k;
3103 SC minusOmegaDval = -omega*Dvals(i,0);
3106 for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
3107 Scalar Bval = Bvals[j];
3108 if (Bval == SC_ZERO)
3110 LO Bij = Bcolind[j];
3111 LO Cij = Bcol2Ccol[Bij];
3113 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3114 std::runtime_error,
"Trying to insert a new entry into a static graph");
3116 Cvals[c_status[Cij]] = Bvals[j];
3120 for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
3121 LO Aik = Acolind[k];
3122 const SC Aval = Avals[k];
3123 if (Aval == SC_ZERO)
3126 if (targetMapToOrigRow[Aik] != LO_INVALID) {
3128 size_t Bk =
static_cast<size_t> (targetMapToOrigRow[Aik]);
3130 for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
3131 LO Bkj = Bcolind[j];
3132 LO Cij = Bcol2Ccol[Bkj];
3134 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3135 std::runtime_error,
"Trying to insert a new entry into a static graph");
3137 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
3142 size_t Ik =
static_cast<size_t> (targetMapToImportRow[Aik]);
3143 for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
3144 LO Ikj = Icolind[j];
3145 LO Cij = Icol2Ccol[Ikj];
3147 TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3148 std::runtime_error,
"Trying to insert a new entry into a static graph");
3150 Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
3156 #ifdef HAVE_TPETRA_MMM_TIMINGS
3157 MM2 = Teuchos::null;
3158 MM2 = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
3161 C.fillComplete(C.getDomainMap(), C.getRangeMap());
3162 #ifdef HAVE_TPETRA_MMM_TIMINGS
3163 MM2 = Teuchos::null;
3172 template<
class Scalar,
3174 class GlobalOrdinal,
3176 void import_and_extract_views(
3177 const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3178 Teuchos::RCP<
const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3179 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3180 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3181 bool userAssertsThereAreNoRemotes,
3182 const std::string& label,
3183 const Teuchos::RCP<Teuchos::ParameterList>& params)
3185 using Teuchos::Array;
3186 using Teuchos::ArrayView;
3189 using Teuchos::null;
3192 typedef LocalOrdinal LO;
3193 typedef GlobalOrdinal GO;
3196 typedef Map<LO,GO,NO> map_type;
3197 typedef Import<LO,GO,NO> import_type;
3198 typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
3200 #ifdef HAVE_TPETRA_MMM_TIMINGS
3201 std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
3202 using Teuchos::TimeMonitor;
3203 RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Alloc"))));
3211 Aview.deleteContents();
3213 Aview.origMatrix = rcp(&A,
false);
3214 Aview.origRowMap = A.getRowMap();
3215 Aview.rowMap = targetMap;
3216 Aview.colMap = A.getColMap();
3217 Aview.domainMap = A.getDomainMap();
3218 Aview.importColMap = null;
3219 RCP<const map_type> rowMap = A.getRowMap();
3220 const int numProcs = rowMap->getComm()->getSize();
3223 if (userAssertsThereAreNoRemotes || numProcs < 2)
3226 RCP<const import_type> importer;
3227 if (params != null && params->isParameter(
"importer")) {
3228 importer = params->get<RCP<const import_type> >(
"importer");
3231 #ifdef HAVE_TPETRA_MMM_TIMINGS
3233 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap"))));
3238 RCP<const map_type> remoteRowMap;
3239 size_t numRemote = 0;
3241 if (!prototypeImporter.is_null() &&
3242 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3243 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3245 #ifdef HAVE_TPETRA_MMM_TIMINGS
3246 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode1"));
3248 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3249 numRemote = prototypeImporter->getNumRemoteIDs();
3251 Array<GO> remoteRows(numRemote);
3252 for (
size_t i = 0; i < numRemote; i++)
3253 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3255 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3256 rowMap->getIndexBase(), rowMap->getComm()));
3259 }
else if (prototypeImporter.is_null()) {
3261 #ifdef HAVE_TPETRA_MMM_TIMINGS
3262 TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X RemoteMap-Mode2"));
3264 ArrayView<const GO> rows = targetMap->getLocalElementList();
3265 size_t numRows = targetMap->getLocalNumElements();
3267 Array<GO> remoteRows(numRows);
3268 for(
size_t i = 0; i < numRows; ++i) {
3269 const LO mlid = rowMap->getLocalElement(rows[i]);
3271 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3272 remoteRows[numRemote++] = rows[i];
3274 remoteRows.resize(numRemote);
3275 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3276 rowMap->getIndexBase(), rowMap->getComm()));
3285 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3286 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3295 #ifdef HAVE_TPETRA_MMM_TIMINGS
3297 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Collective-0"))));
3301 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (
global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3303 if (globalMaxNumRemote > 0) {
3304 #ifdef HAVE_TPETRA_MMM_TIMINGS
3306 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-2"))));
3310 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3312 importer = rcp(
new import_type(rowMap, remoteRowMap));
3314 throw std::runtime_error(
"prototypeImporter->SourceMap() does not match A.getRowMap()!");
3318 params->set(
"importer", importer);
3321 if (importer != null) {
3322 #ifdef HAVE_TPETRA_MMM_TIMINGS
3324 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-3"))));
3328 Teuchos::ParameterList labelList;
3329 labelList.set(
"Timer Label", label);
3330 auto & labelList_subList = labelList.sublist(
"matrixmatrix: kernel params",
false);
3333 bool overrideAllreduce =
false;
3336 Teuchos::ParameterList params_sublist;
3337 if(!params.is_null()) {
3338 labelList.set(
"compute global constants", params->get(
"compute global constants",
false));
3339 params_sublist = params->sublist(
"matrixmatrix: kernel params",
false);
3340 mm_optimization_core_count = params_sublist.get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3341 int mm_optimization_core_count2 = params->get(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3342 if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3343 isMM = params_sublist.get(
"isMatrixMatrix_TransferAndFillComplete",
false);
3344 overrideAllreduce = params_sublist.get(
"MM_TAFC_OverrideAllreduceCheck",
false);
3346 labelList_subList.set(
"isMatrixMatrix_TransferAndFillComplete",isMM);
3347 labelList_subList.set(
"MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3348 labelList_subList.set(
"MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3350 Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3351 A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3357 sprintf(str,
"import_matrix.%d.dat",count);
3362 #ifdef HAVE_TPETRA_MMM_STATISTICS
3363 printMultiplicationStatistics(importer, label + std::string(
" I&X MMM"));
3367 #ifdef HAVE_TPETRA_MMM_TIMINGS
3369 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM I&X Import-4"))));
3373 Aview.importColMap = Aview.importMatrix->getColMap();
3374 #ifdef HAVE_TPETRA_MMM_TIMINGS
3381 template<
class Scalar,
3383 class GlobalOrdinal,
3385 void import_and_extract_views(
3386 const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& M,
3387 Teuchos::RCP<
const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3388 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview,
3389 Teuchos::RCP<
const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3390 bool userAssertsThereAreNoRemotes)
3392 using Teuchos::Array;
3393 using Teuchos::ArrayView;
3396 using Teuchos::null;
3399 typedef LocalOrdinal LO;
3400 typedef GlobalOrdinal GO;
3403 typedef Map<LO,GO,NO> map_type;
3404 typedef Import<LO,GO,NO> import_type;
3405 typedef BlockCrsMatrix<SC,LO,GO,NO> blockcrs_matrix_type;
3413 Mview.deleteContents();
3415 Mview.origMatrix = rcp(&M,
false);
3416 Mview.origRowMap = M.getRowMap();
3417 Mview.rowMap = targetMap;
3418 Mview.colMap = M.getColMap();
3419 Mview.importColMap = null;
3420 RCP<const map_type> rowMap = M.getRowMap();
3421 const int numProcs = rowMap->getComm()->getSize();
3424 if (userAssertsThereAreNoRemotes || numProcs < 2)
return;
3428 RCP<const map_type> remoteRowMap;
3429 size_t numRemote = 0;
3431 if (!prototypeImporter.is_null() &&
3432 prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3433 prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3436 ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3437 numRemote = prototypeImporter->getNumRemoteIDs();
3439 Array<GO> remoteRows(numRemote);
3440 for (
size_t i = 0; i < numRemote; i++)
3441 remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3443 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3444 rowMap->getIndexBase(), rowMap->getComm()));
3447 }
else if (prototypeImporter.is_null()) {
3450 ArrayView<const GO> rows = targetMap->getLocalElementList();
3451 size_t numRows = targetMap->getLocalNumElements();
3453 Array<GO> remoteRows(numRows);
3454 for(
size_t i = 0; i < numRows; ++i) {
3455 const LO mlid = rowMap->getLocalElement(rows[i]);
3457 if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3458 remoteRows[numRemote++] = rows[i];
3460 remoteRows.resize(numRemote);
3461 remoteRowMap = rcp(
new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3462 rowMap->getIndexBase(), rowMap->getComm()));
3471 TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3472 "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3480 Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (
global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3482 RCP<const import_type> importer;
3484 if (globalMaxNumRemote > 0) {
3487 importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3489 importer = rcp(
new import_type(rowMap, remoteRowMap));
3491 throw std::runtime_error(
"prototypeImporter->SourceMap() does not match M.getRowMap()!");
3494 if (importer != null) {
3496 Mview.importMatrix = Tpetra::importAndFillCompleteBlockCrsMatrix<blockcrs_matrix_type>(rcpFromRef(M), *importer);
3500 Mview.importColMap = Mview.importMatrix->getColMap();
3506 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
3508 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3509 CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3510 const LocalOrdinalViewType & Acol2Brow,
3511 const LocalOrdinalViewType & Acol2Irow,
3512 const LocalOrdinalViewType & Bcol2Ccol,
3513 const LocalOrdinalViewType & Icol2Ccol,
3514 const size_t mergedNodeNumCols) {
3518 typedef typename KCRS::StaticCrsGraphType graph_t;
3519 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3520 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3521 typedef typename KCRS::values_type::non_const_type scalar_view_t;
3523 const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3524 const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3527 if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3531 RCP<const KCRS> Ik_;
3532 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3533 const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3535 if(Ik!=0) Iks = *Ik;
3536 size_t merge_numrows = Ak.numCols();
3538 lno_view_t Mrowptr(
"Mrowptr", merge_numrows + 1);
3540 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3543 typedef typename Node::execution_space execution_space;
3544 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3545 Kokkos::parallel_scan (
"Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3546 KOKKOS_LAMBDA(
const size_t i,
size_t& update,
const bool final) {
3547 if(
final) Mrowptr(i) = update;
3550 if(Acol2Brow(i)!=LO_INVALID)
3551 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3553 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3556 if(
final && i+1==merge_numrows)
3557 Mrowptr(i+1)=update;
3561 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3562 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing(
"Mcolind"),merge_nnz);
3563 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing(
"Mvals"),merge_nnz);
3566 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3567 Kokkos::parallel_for (
"Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(
const size_t i) {
3568 if(Acol2Brow(i)!=LO_INVALID) {
3569 size_t row = Acol2Brow(i);
3570 size_t start = Bk.graph.row_map(row);
3571 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3572 Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3573 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3577 size_t row = Acol2Irow(i);
3578 size_t start = Iks.graph.row_map(row);
3579 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3580 Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3581 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3586 KCRS newmat(
"CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3597 template<
class Scalar,
class LocalOrdinal,
class GlobalOrdinal,
class Node,
class LocalOrdinalViewType>
3598 const typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type
3599 merge_matrices(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3600 BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3601 const LocalOrdinalViewType & Acol2Brow,
3602 const LocalOrdinalViewType & Acol2Irow,
3603 const LocalOrdinalViewType & Bcol2Ccol,
3604 const LocalOrdinalViewType & Icol2Ccol,
3605 const size_t mergedNodeNumCols)
3608 typedef typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KBCRS;
3609 typedef typename KBCRS::StaticCrsGraphType graph_t;
3610 typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3611 typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3612 typedef typename KBCRS::values_type::non_const_type scalar_view_t;
3615 const KBCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3616 const KBCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3619 if(!Bview.importMatrix.is_null() ||
3620 (Bview.importMatrix.is_null() &&
3621 (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3626 RCP<const KBCRS> Ik_;
3627 if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KBCRS>(Bview.importMatrix->getLocalMatrixDevice());
3628 const KBCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3630 if(Ik!=0) Iks = *Ik;
3631 size_t merge_numrows = Ak.numCols();
3634 lno_view_t Mrowptr(
"Mrowptr", merge_numrows + 1);
3636 const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3639 typedef typename Node::execution_space execution_space;
3640 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3641 Kokkos::parallel_scan (
"Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3642 KOKKOS_LAMBDA(
const size_t i,
size_t& update,
const bool final) {
3643 if(
final) Mrowptr(i) = update;
3646 if(Acol2Brow(i)!=LO_INVALID)
3647 ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3649 ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3652 if(
final && i+1==merge_numrows)
3653 Mrowptr(i+1)=update;
3657 size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3658 const int blocksize = Ak.blockDim();
3659 lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing(
"Mcolind"),merge_nnz);
3660 scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing(
"Mvals"),merge_nnz*blocksize*blocksize);
3663 typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3664 Kokkos::parallel_for (
"Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(
const size_t i) {
3665 if(Acol2Brow(i)!=LO_INVALID) {
3666 size_t row = Acol2Brow(i);
3667 size_t start = Bk.graph.row_map(row);
3668 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3669 Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3671 for (
int b=0; b<blocksize*blocksize; ++b) {
3672 const int val_indx = j*blocksize*blocksize + b;
3673 const int b_val_indx = (j-Mrowptr(i)+
start)*blocksize*blocksize + b;
3674 Mvalues(val_indx) = Bk.values(b_val_indx);
3679 size_t row = Acol2Irow(i);
3680 size_t start = Iks.graph.row_map(row);
3681 for(
size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3682 Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3684 for (
int b=0; b<blocksize*blocksize; ++b) {
3685 const int val_indx = j*blocksize*blocksize + b;
3686 const int b_val_indx = (j-Mrowptr(i)+
start)*blocksize*blocksize + b;
3687 Mvalues(val_indx) = Iks.values(b_val_indx);
3694 KBCRS newmat(
"CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind, blocksize);
3704 template<
typename SC,
typename LO,
typename GO,
typename NO>
3705 void AddKernels<SC, LO, GO, NO>::
3707 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3708 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3709 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3710 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3711 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3712 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3713 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3714 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3715 #
if KOKKOSKERNELS_VERSION >= 40299
3718 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3719 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3720 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3722 using Teuchos::TimeMonitor;
3723 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3724 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3725 auto nrows = Arowptrs.extent(0) - 1;
3726 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3727 typename AddKern::KKH handle;
3728 handle.create_spadd_handle(
true);
3729 auto addHandle = handle.get_spadd_handle();
3730 #ifdef HAVE_TPETRA_MMM_TIMINGS
3731 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted symbolic")));
3733 KokkosSparse::Experimental::spadd_symbolic
3735 #
if KOKKOSKERNELS_VERSION >= 40299
3736 nrows, numGlobalCols,
3738 Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3740 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3741 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3742 #ifdef HAVE_TPETRA_MMM_TIMINGS
3744 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() sorted numeric")));
3746 KokkosSparse::Experimental::spadd_numeric(&handle,
3747 #
if KOKKOSKERNELS_VERSION >= 40299
3748 nrows, numGlobalCols,
3750 Arowptrs, Acolinds, Avals, scalarA,
3751 Browptrs, Bcolinds, Bvals, scalarB,
3752 Crowptrs, Ccolinds, Cvals);
3755 template<
typename SC,
typename LO,
typename GO,
typename NO>
3756 void AddKernels<SC, LO, GO, NO>::
3758 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3759 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3760 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3761 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3762 const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3763 const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3764 const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3765 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3766 #
if KOKKOSKERNELS_VERSION >= 40299
3771 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3772 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3773 typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3775 using Teuchos::TimeMonitor;
3776 using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3777 TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error,
"Can't add matrices with different numbers of rows.");
3778 auto nrows = Arowptrs.extent(0) - 1;
3779 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3780 typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3781 typename AddKern::KKH handle;
3782 handle.create_spadd_handle(
false);
3783 auto addHandle = handle.get_spadd_handle();
3784 #ifdef HAVE_TPETRA_MMM_TIMINGS
3785 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3787 KokkosSparse::Experimental::spadd_symbolic
3789 #
if KOKKOSKERNELS_VERSION >= 40299
3790 nrows, numGlobalCols,
3792 Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3794 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3795 Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3796 #ifdef HAVE_TPETRA_MMM_TIMINGS
3798 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3800 KokkosSparse::Experimental::spadd_numeric(&handle,
3801 #
if KOKKOSKERNELS_VERSION >= 40299
3802 nrows, numGlobalCols,
3804 Arowptrs, Acolinds, Avals, scalarA,
3805 Browptrs, Bcolinds, Bvals, scalarB,
3806 Crowptrs, Ccolinds, Cvals);
3809 template<
typename GO,
3810 typename LocalIndicesType,
3811 typename GlobalIndicesType,
3812 typename ColMapType>
3813 struct ConvertLocalToGlobalFunctor
3815 ConvertLocalToGlobalFunctor(
3816 const LocalIndicesType& colindsOrig_,
3817 const GlobalIndicesType& colindsConverted_,
3818 const ColMapType& colmap_) :
3819 colindsOrig (colindsOrig_),
3820 colindsConverted (colindsConverted_),
3823 KOKKOS_INLINE_FUNCTION
void
3824 operator() (
const GO i)
const
3826 colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3828 LocalIndicesType colindsOrig;
3829 GlobalIndicesType colindsConverted;
3833 template<
typename SC,
typename LO,
typename GO,
typename NO>
3834 void MMdetails::AddKernels<SC, LO, GO, NO>::
3835 convertToGlobalAndAdd(
3836 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3837 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3838 const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3839 const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3840 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3841 const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3842 typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3843 typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3844 typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3846 using Teuchos::TimeMonitor;
3849 using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3850 typename NO::execution_space,
typename NO::memory_space,
typename NO::memory_space>;
3852 const values_array Avals = A.values;
3853 const values_array Bvals = B.values;
3854 const col_inds_array Acolinds = A.graph.entries;
3855 const col_inds_array Bcolinds = B.graph.entries;
3856 auto Arowptrs = A.graph.row_map;
3857 auto Browptrs = B.graph.row_map;
3858 global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"A colinds (converted)"), Acolinds.extent(0));
3859 global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing(
"B colinds (converted)"), Bcolinds.extent(0));
3860 #ifdef HAVE_TPETRA_MMM_TIMINGS
3861 auto MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string(
"column map conversion"))));
3863 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3864 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3865 ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3866 Kokkos::parallel_for(
"Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3868 handle.create_spadd_handle(
false);
3869 auto addHandle = handle.get_spadd_handle();
3870 #ifdef HAVE_TPETRA_MMM_TIMINGS
3872 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3874 auto nrows = Arowptrs.extent(0) - 1;
3875 Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing(
"C row ptrs"), nrows + 1);
3876 KokkosSparse::Experimental::spadd_symbolic
3878 #
if KOKKOSKERNELS_VERSION >= 40299
3881 Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3882 Cvals = values_array(
"C values", addHandle->get_c_nnz());
3883 Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing(
"C colinds"), addHandle->get_c_nnz());
3884 #ifdef HAVE_TPETRA_MMM_TIMINGS
3886 MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(
"TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3888 KokkosSparse::Experimental::spadd_numeric(&handle,
3889 #
if KOKKOSKERNELS_VERSION >= 40299
3892 Arowptrs, AcolindsConverted, Avals, scalarA,
3893 Browptrs, BcolindsConverted, Bvals, scalarB,
3894 Crowptrs, Ccolinds, Cvals);
3910 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3912 void MatrixMatrix::Multiply( \
3913 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3915 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3917 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3918 bool call_FillComplete_on_result, \
3919 const std::string & label, \
3920 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3923 void MatrixMatrix::Multiply( \
3924 const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& A, \
3926 const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& B, \
3928 Teuchos::RCP<BlockCrsMatrix< SCALAR , LO , GO , NODE > >& C, \
3929 const std::string & label); \
3932 void MatrixMatrix::Jacobi( \
3934 const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3935 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3936 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3937 CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3938 bool call_FillComplete_on_result, \
3939 const std::string & label, \
3940 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3943 void MatrixMatrix::Add( \
3944 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3947 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3950 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3953 void MatrixMatrix::Add( \
3954 const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3957 const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3960 const Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3963 void MatrixMatrix::Add( \
3964 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3967 CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3971 Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3972 MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3973 (const SCALAR & alpha, \
3974 const bool transposeA, \
3975 const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3976 const SCALAR & beta, \
3977 const bool transposeB, \
3978 const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3979 const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3980 const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3981 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3985 MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3986 (const SCALAR & alpha, \
3987 const bool transposeA, \
3988 const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3989 const SCALAR& beta, \
3990 const bool transposeB, \
3991 const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3992 CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3993 const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3994 const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3995 const Teuchos::RCP<Teuchos::ParameterList>& params); \
3997 template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>; \
3999 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const CrsMatrix<SCALAR, LO, GO, NODE>& M, \
4000 Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
4001 CrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
4002 Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
4003 bool userAssertsThereAreNoRemotes, \
4004 const std::string& label, \
4005 const Teuchos::RCP<Teuchos::ParameterList>& params); \
4007 template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const BlockCrsMatrix<SCALAR, LO, GO, NODE>& M, \
4008 Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
4009 BlockCrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
4010 Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
4011 bool userAssertsThereAreNoRemotes);
4014 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isFillActive() const
Whether the matrix is not fill complete.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Struct that holds views of the contents of a BlockCrsMatrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void start()
Start the deep_copy counter.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Sparse matrix-matrix multiply.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Matrix Market file readers and writers for Tpetra objects.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > ¶ms=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void removeCrsMatrixZeros(CrsMatrixType &matrix, typename Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitudeType const &threshold=Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitude(Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::zero()))
Remove zero entries from a matrix.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.