42 #ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP 
   43 #define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP 
   45 #ifdef HAVE_TPETRA_INST_CUDA 
   51 template<
class Scalar,
 
   54          class LocalOrdinalViewType>
 
   55 struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
 
   56     static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
 
   57                                                   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
 
   58                                                   const LocalOrdinalViewType & Acol2Brow,
 
   59                                                   const LocalOrdinalViewType & Acol2Irow,
 
   60                                                   const LocalOrdinalViewType & Bcol2Ccol,
 
   61                                                   const LocalOrdinalViewType & Icol2Ccol,
 
   62                                                   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
 
   63                                                   Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
 
   64                                                   const std::string& label = std::string(),
 
   65                                                   const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
 
   69    static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
 
   70                                                   CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
 
   71                                                   const LocalOrdinalViewType & Acol2Brow,
 
   72                                                   const LocalOrdinalViewType & Acol2Irow,
 
   73                                                   const LocalOrdinalViewType & Bcol2Ccol,
 
   74                                                   const LocalOrdinalViewType & Icol2Ccol,
 
   75                                                   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
 
   76                                                   Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
 
   77                                                   const std::string& label = std::string(),
 
   78                                                   const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
 
   83 template<
class Scalar,
 
   85          class GlobalOrdinal, 
class LocalOrdinalViewType>
 
   86 struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
 
   87     static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
 
   88                                                            const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
 
   89                                                            CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
 
   90                                                            CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
 
   91                                                            const LocalOrdinalViewType & Acol2Brow,
 
   92                                                            const LocalOrdinalViewType & Acol2Irow,
 
   93                                                            const LocalOrdinalViewType & Bcol2Ccol,
 
   94                                                            const LocalOrdinalViewType & Icol2Ccol,
 
   95                                                            CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
 
   96                                                            Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
 
   97                                                            const std::string& label = std::string(),
 
   98                                                            const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
 
  100     static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
 
  101                                                        const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
 
  102                                                        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
 
  103                                                        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
 
  104                                                        const LocalOrdinalViewType & Acol2Brow,
 
  105                                                        const LocalOrdinalViewType & Acol2Irow,
 
  106                                                        const LocalOrdinalViewType & Bcol2Ccol,
 
  107                                                        const LocalOrdinalViewType & Icol2Ccol,
 
  108                                                        CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
 
  109                                                        Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
 
  110                                                        const std::string& label = std::string(),
 
  111                                                        const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
 
  113     static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
 
  114                                                           const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
 
  115                                                           CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
 
  116                                                           CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
 
  117                                                           const LocalOrdinalViewType & Acol2Brow,
 
  118                                                           const LocalOrdinalViewType & Acol2Irow,
 
  119                                                           const LocalOrdinalViewType & Bcol2Ccol,
 
  120                                                           const LocalOrdinalViewType & Icol2Ccol,
 
  121                                                           CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
 
  122                                                           Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
 
  123                                                           const std::string& label = std::string(),
 
  124                                                           const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
 
  130 template<
class Scalar,
 
  133          class LocalOrdinalViewType>
 
  134 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
 
  135                                                                                                CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
 
  136                                                                                                const LocalOrdinalViewType & Acol2Brow,
 
  137                                                                                                const LocalOrdinalViewType & Acol2Irow,
 
  138                                                                                                const LocalOrdinalViewType & Bcol2Ccol,
 
  139                                                                                                const LocalOrdinalViewType & Icol2Ccol,
 
  140                                                                                                CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
 
  141                                                                                                Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
 
  142                                                                                                const std::string& label,
 
  143                                                                                                const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  146 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  147   std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  148   using Teuchos::TimeMonitor;
 
  149   Teuchos::RCP<TimeMonitor> MM = rcp(
new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaWrapper")))));
 
  152   typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
 
  153   std::string nodename(
"Cuda");
 
  158   typedef typename KCRS::device_type device_t;
 
  159   typedef typename KCRS::StaticCrsGraphType graph_t;
 
  160   typedef typename graph_t::row_map_type::non_const_type lno_view_t;
 
  161   typedef typename graph_t::row_map_type::const_type  c_lno_view_t;
 
  162   typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
 
  163   typedef typename KCRS::values_type::non_const_type scalar_view_t;
 
  167   int team_work_size = 16;  
 
  168   std::string myalg(
"SPGEMM_KK_MEMORY");
 
  169   if(!params.is_null()) {
 
  170     if(params->isParameter(
"cuda: algorithm"))
 
  171       myalg = params->get(
"cuda: algorithm",myalg);
 
  172     if(params->isParameter(
"cuda: team work size"))
 
  173       team_work_size = params->get(
"cuda: team work size",team_work_size);
 
  177   typedef KokkosKernels::Experimental::KokkosKernelsHandle<
 
  178        typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type, 
typename scalar_view_t::const_value_type,
 
  179        typename device_t::execution_space, 
typename device_t::memory_space,
typename device_t::memory_space > KernelHandle;
 
  182   const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
 
  183   const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
 
  185   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
 
  186   const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
 
  187   const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
 
  189   c_lno_view_t  Irowptr;
 
  190   lno_nnz_view_t  Icolind;
 
  192   if(!Bview.importMatrix.is_null()) {
 
  193     Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
 
  194     Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
 
  195     Ivals   = Bview.importMatrix->getLocalMatrix().values;
 
  200   std::string alg = nodename+std::string(
" algorithm");
 
  202   if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
 
  203   KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
 
  206   const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getNodeNumElements());
 
  208 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  209   MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaCore"))));
 
  213   typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
 
  214   typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
 
  215   typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
 
  217   lno_view_t      row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), AnumRows + 1);
 
  218   lno_nnz_view_t  entriesC;
 
  219   scalar_view_t   valuesC;
 
  221   kh.create_spgemm_handle(alg_enum);
 
  222   kh.set_team_work_size(team_work_size);
 
  224   KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,
false,Bmerged.graph.row_map,Bmerged.graph.entries,
false,row_mapC);
 
  226   size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
 
  228     entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
 
  229     valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
 
  231   KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,
false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,
false,row_mapC,entriesC,valuesC);
 
  232   kh.destroy_spgemm_handle();
 
  234 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  235   MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaSort"))));
 
  239   if (params.is_null() || params->get(
"sort entries",
true))
 
  240     Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
 
  241   C.setAllValues(row_mapC,entriesC,valuesC);
 
  243 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  244   MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix CudaESFC"))));
 
  248   RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
 
  249   labelList->set(
"Timer Label",label);
 
  250   if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
 
  251   RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
 
  252   C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
 
  257 template<
class Scalar,
 
  260          class LocalOrdinalViewType>
 
  261 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
 
  262                                                                                                CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
 
  263                                                                                                const LocalOrdinalViewType & targetMapToOrigRow,
 
  264                                                                                                const LocalOrdinalViewType & targetMapToImportRow,
 
  265                                                                                                const LocalOrdinalViewType & Bcol2Ccol,
 
  266                                                                                                const LocalOrdinalViewType & Icol2Ccol,
 
  267                                                                                                CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
 
  268                                                                                                Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
 
  269                                                                                                const std::string& label,
 
  270                                                                                                const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  273   typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
 
  275 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  276   std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  277   using Teuchos::TimeMonitor;
 
  278   Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Reuse SerialCore"))));
 
  279   Teuchos::RCP<Teuchos::TimeMonitor> MM2;
 
  287   typedef typename KCRS::StaticCrsGraphType graph_t;
 
  288   typedef typename graph_t::row_map_type::const_type c_lno_view_t;
 
  289   typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
 
  290   typedef typename KCRS::values_type::non_const_type scalar_view_t;
 
  293   typedef LocalOrdinal      LO;
 
  294   typedef GlobalOrdinal     GO;
 
  296   typedef Map<LO,GO,NO>     map_type;
 
  297   const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
  298   const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
  299   const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
 
  302   typename graph_t::execution_space().fence();
 
  305   RCP<const map_type> Ccolmap = C.getColMap();
 
  306   size_t m = Aview.origMatrix->getNodeNumRows();
 
  307   size_t n = Ccolmap->getNodeNumElements();
 
  310   const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
 
  311   const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
 
  312   const KCRS & Cmat = C.getLocalMatrix();
 
  314   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
 
  315   const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
 
  316   const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
 
  317   scalar_view_t Cvals = Cmat.values;
 
  319   c_lno_view_t  Irowptr;
 
  320   lno_nnz_view_t  Icolind;
 
  322   if(!Bview.importMatrix.is_null()) {
 
  323     Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
 
  324     Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
 
  325     Ivals   = Bview.importMatrix->getLocalMatrix().values;
 
  328 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  329   MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"MMM Newmatrix SerialCore - Compare"))));
 
  341   std::vector<size_t> c_status(n, ST_INVALID);
 
  344   size_t CSR_ip = 0, OLD_ip = 0;
 
  345   for (
size_t i = 0; i < m; i++) {
 
  349     CSR_ip = Crowptr[i+1];
 
  350     for (
size_t k = OLD_ip; k < CSR_ip; k++) {
 
  351       c_status[Ccolind[k]] = k;
 
  357     for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
 
  359       const SC Aval = Avals[k];
 
  363       if (targetMapToOrigRow[Aik] != LO_INVALID) {
 
  365         size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
 
  367         for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
 
  369           LO Cij = Bcol2Ccol[Bkj];
 
  371           TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
 
  372             std::runtime_error, 
"Trying to insert a new entry (" << i << 
"," << Cij << 
") into a static graph " <<
 
  373             "(c_status = " << c_status[Cij] << 
" of [" << OLD_ip << 
"," << CSR_ip << 
"))");
 
  375           Cvals[c_status[Cij]] += Aval * Bvals[j];
 
  380         size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
 
  381         for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
 
  383           LO Cij = Icol2Ccol[Ikj];
 
  385           TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
 
  386             std::runtime_error, 
"Trying to insert a new entry (" << i << 
"," << Cij << 
") into a static graph " <<
 
  387             "(c_status = " << c_status[Cij] << 
" of [" << OLD_ip << 
"," << CSR_ip << 
"))");
 
  389           Cvals[c_status[Cij]] += Aval * Ivals[j];
 
  395   C.fillComplete(C.getDomainMap(), C.getRangeMap());
 
  399 template<
class Scalar,
 
  402          class LocalOrdinalViewType>
 
  403 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
 
  404                                                                                                const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
 
  405                                                                                                CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
 
  406                                                                                                CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
 
  407                                                                                                const LocalOrdinalViewType & Acol2Brow,
 
  408                                                                                                const LocalOrdinalViewType & Acol2Irow,
 
  409                                                                                                const LocalOrdinalViewType & Bcol2Ccol,
 
  410                                                                                                const LocalOrdinalViewType & Icol2Ccol,
 
  411                                                                                                CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
 
  412                                                                                                Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
 
  413                                                                                                const std::string& label,
 
  414                                                                                                const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  416 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  417     std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  418     using Teuchos::TimeMonitor;
 
  419     Teuchos::RCP<TimeMonitor> MM;
 
  427   std::string myalg(
"MSAK");
 
  428   if(!params.is_null()) {
 
  429     if(params->isParameter(
"cuda: jacobi algorithm"))
 
  430       myalg = params->get(
"cuda: jacobi algorithm",myalg);
 
  433   if(myalg == 
"MSAK") {
 
  434     ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
 
  436   else if(myalg == 
"KK") {
 
  437     jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
 
  440     throw std::runtime_error(
"Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
 
  443 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  444   MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
 
  448   RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
 
  449   labelList->set(
"Timer Label",label);
 
  450   if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
 
  453   if(!C.isFillComplete()) {
 
  454     RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
 
  455     C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
 
  463 template<
class Scalar,
 
  466          class LocalOrdinalViewType>
 
  467 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
 
  468                                                                                                const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
 
  469                                                                                                CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
 
  470                                                                                                CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
 
  471                                                                                                const LocalOrdinalViewType & targetMapToOrigRow,
 
  472                                                                                                const LocalOrdinalViewType & targetMapToImportRow,
 
  473                                                                                                const LocalOrdinalViewType & Bcol2Ccol,
 
  474                                                                                                const LocalOrdinalViewType & Icol2Ccol,
 
  475                                                                                                CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
 
  476                                                                                                Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
 
  477                                                                                                const std::string& label,
 
  478                                                                                                const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  481   typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
 
  483 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  484   std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  485   using Teuchos::TimeMonitor;
 
  486   Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore"))));
 
  487   Teuchos::RCP<Teuchos::TimeMonitor> MM2;
 
  494   typedef typename KCRS::StaticCrsGraphType graph_t;
 
  495   typedef typename graph_t::row_map_type::const_type c_lno_view_t;
 
  496   typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
 
  497   typedef typename KCRS::values_type::non_const_type scalar_view_t;
 
  498   typedef typename scalar_view_t::memory_space scalar_memory_space;
 
  501   typedef LocalOrdinal      LO;
 
  502   typedef GlobalOrdinal     GO;
 
  504   typedef Map<LO,GO,NO>     map_type;
 
  505   const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
  506   const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
 
  507   const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
 
  510   typename graph_t::execution_space().fence();
 
  513   RCP<const map_type> Ccolmap = C.getColMap();
 
  514   size_t m = Aview.origMatrix->getNodeNumRows();
 
  515   size_t n = Ccolmap->getNodeNumElements();
 
  518   const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
 
  519   const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
 
  520   const KCRS & Cmat = C.getLocalMatrix();
 
  522   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
 
  523   const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
 
  524   const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
 
  525   scalar_view_t Cvals = Cmat.values;
 
  527   c_lno_view_t  Irowptr;
 
  528   lno_nnz_view_t  Icolind;
 
  530   if(!Bview.importMatrix.is_null()) {
 
  531     Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
 
  532     Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
 
  533     Ivals   = Bview.importMatrix->getLocalMatrix().values;
 
  537   auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
 
  539 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  540   MM2 = Teuchos::null; MM2 = rcp(
new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse CudaCore - Compare"))));
 
  548   std::vector<size_t> c_status(n, ST_INVALID);
 
  551   size_t CSR_ip = 0, OLD_ip = 0;
 
  552   for (
size_t i = 0; i < m; i++) {
 
  557     CSR_ip = Crowptr[i+1];
 
  558     for (
size_t k = OLD_ip; k < CSR_ip; k++) {
 
  559       c_status[Ccolind[k]] = k;
 
  565     SC minusOmegaDval = -omega*Dvals(i,0);
 
  568     for (
size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
 
  569       Scalar Bval = Bvals[j];
 
  573       LO Cij = Bcol2Ccol[Bij];
 
  575       TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
 
  576         std::runtime_error, 
"Trying to insert a new entry into a static graph");
 
  578       Cvals[c_status[Cij]] = Bvals[j];
 
  582     for (
size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
 
  584       const SC Aval = Avals[k];
 
  588       if (targetMapToOrigRow[Aik] != LO_INVALID) {
 
  590         size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
 
  592         for (
size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
 
  594           LO Cij = Bcol2Ccol[Bkj];
 
  596           TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
 
  597             std::runtime_error, 
"Trying to insert a new entry into a static graph");
 
  599           Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
 
  604         size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
 
  605         for (
size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
 
  607           LO Cij = Icol2Ccol[Ikj];
 
  609           TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
 
  610             std::runtime_error, 
"Trying to insert a new entry into a static graph");
 
  612           Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
 
  618 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  620   MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Reuse ESFC"))));
 
  623   C.fillComplete(C.getDomainMap(), C.getRangeMap());
 
  628 template<
class Scalar,
 
  631          class LocalOrdinalViewType>
 
  632 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
 
  633                         const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
 
  634                               CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
 
  635                         CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
 
  636                         const LocalOrdinalViewType & Acol2Brow,
 
  637                         const LocalOrdinalViewType & Acol2Irow,
 
  638                         const LocalOrdinalViewType & Bcol2Ccol,
 
  639                         const LocalOrdinalViewType & Icol2Ccol,
 
  640                         CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
 
  641                         Teuchos::RCP<
const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
 
  642                         const std::string& label,
 
  643                         const Teuchos::RCP<Teuchos::ParameterList>& params) {
 
  645 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  646   std::string prefix_mmm = std::string(
"TpetraExt ") + label + std::string(
": ");
 
  647   using Teuchos::TimeMonitor;
 
  648   Teuchos::RCP<TimeMonitor> MM;
 
  655     auto rowMap = Aview.origMatrix->getRowMap();
 
  657     Aview.origMatrix->getLocalDiagCopy(diags);
 
  658     size_t diagLength = rowMap->getNodeNumElements();
 
  659     Teuchos::Array<Scalar> diagonal(diagLength);
 
  660     diags.get1dCopy(diagonal());
 
  662     for(
size_t i = 0; i < diagLength; ++i) {
 
  663       TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(), 
 
  665          "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
 
  666          "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
 
  671   using device_t = 
typename Kokkos::Compat::KokkosCudaWrapperNode::device_type;
 
  673   using graph_t = 
typename matrix_t::StaticCrsGraphType;
 
  674   using lno_view_t = 
typename graph_t::row_map_type::non_const_type;
 
  675   using c_lno_view_t = 
typename graph_t::row_map_type::const_type;
 
  676   using lno_nnz_view_t = 
typename graph_t::entries_type::non_const_type;
 
  677   using scalar_view_t = 
typename matrix_t::values_type::non_const_type;
 
  680   using handle_t = 
typename KokkosKernels::Experimental::KokkosKernelsHandle<
 
  681     typename lno_view_t::const_value_type,
typename lno_nnz_view_t::const_value_type, 
typename scalar_view_t::const_value_type,
 
  682     typename device_t::execution_space, 
typename device_t::memory_space,
typename device_t::memory_space >;
 
  685   c_lno_view_t Irowptr;
 
  686   lno_nnz_view_t Icolind;
 
  688   if(!Bview.importMatrix.is_null()) {
 
  689     Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
 
  690     Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
 
  691     Ivals   = Bview.importMatrix->getLocalMatrix().values;
 
  695   const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getNodeNumElements());
 
  698   const matrix_t & Amat = Aview.origMatrix->getLocalMatrix();
 
  699   const matrix_t & Bmat = Bview.origMatrix->getLocalMatrix();
 
  701   typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
 
  702   typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
 
  703   typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
 
  705   c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmerged.graph.row_map;
 
  706   const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmerged.graph.entries;
 
  707   const scalar_view_t Avals = Amat.values, Bvals = Bmerged.values;
 
  710   lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing(
"non_const_lnow_row"), AnumRows + 1);
 
  711   lno_nnz_view_t entriesC;
 
  712   scalar_view_t valuesC;
 
  715   int team_work_size = 16; 
 
  716   std::string myalg(
"SPGEMM_KK_MEMORY");
 
  717   if(!params.is_null()) {
 
  718     if(params->isParameter(
"cuda: algorithm"))
 
  719       myalg = params->get(
"cuda: algorithm",myalg);
 
  720     if(params->isParameter(
"cuda: team work size"))
 
  721       team_work_size = params->get(
"cuda: team work size",team_work_size);
 
  725   std::string nodename(
"Cuda");
 
  726   std::string alg = nodename + std::string(
" algorithm");
 
  727   if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
 
  728   KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
 
  733   kh.create_spgemm_handle(alg_enum);
 
  734   kh.set_team_work_size(team_work_size);
 
  736   KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
 
  737                 Arowptr, Acolind, 
false,
 
  738                 Browptr, Bcolind, 
false,
 
  741   size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
 
  743     entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing(
"entriesC"), c_nnz_size);
 
  744     valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing(
"valuesC"), c_nnz_size);
 
  747   KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
 
  748               Arowptr, Acolind, Avals, 
false,
 
  749               Browptr, Bcolind, Bvals, 
false,
 
  750               row_mapC, entriesC, valuesC,
 
  751               omega, Dinv.getLocalViewDevice());
 
  752   kh.destroy_spgemm_handle();
 
  754 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  755   MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaSort"))));
 
  759   if (params.is_null() || params->get(
"sort entries",
true))
 
  760     Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
 
  761   C.setAllValues(row_mapC,entriesC,valuesC);
 
  763 #ifdef HAVE_TPETRA_MMM_TIMINGS 
  764   MM = Teuchos::null; MM = rcp(
new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string(
"Jacobi Newmatrix CudaESFC"))));
 
  768   Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(
new Teuchos::ParameterList);
 
  769   labelList->set(
"Timer Label",label);
 
  770   if(!params.is_null()) labelList->set(
"compute global constants",params->get(
"compute global constants",
true));
 
  771   Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
 
  772   C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
 
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
 
static bool debug()
Whether Tpetra is in debug mode. 
 
A distributed dense vector.