Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_MatrixMatrix_Cuda.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
44 
45 #ifdef HAVE_TPETRA_INST_CUDA
46 namespace Tpetra {
47 namespace MMdetails {
48 
49 /*********************************************************************************************************/
50 // MMM KernelWrappers for Partial Specialization to CUDA
51 template<class Scalar,
52  class LocalOrdinal,
53  class GlobalOrdinal,
54  class LocalOrdinalViewType>
55 struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
56  static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
57  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
58  const LocalOrdinalViewType & Acol2Brow,
59  const LocalOrdinalViewType & Acol2Irow,
60  const LocalOrdinalViewType & Bcol2Ccol,
61  const LocalOrdinalViewType & Icol2Ccol,
62  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
63  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
64  const std::string& label = std::string(),
65  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
66 
67 
68 
69  static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
70  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
71  const LocalOrdinalViewType & Acol2Brow,
72  const LocalOrdinalViewType & Acol2Irow,
73  const LocalOrdinalViewType & Bcol2Ccol,
74  const LocalOrdinalViewType & Icol2Ccol,
75  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
76  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
77  const std::string& label = std::string(),
78  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
79 
80 };
81 
82 // Jacobi KernelWrappers for Partial Specialization to Cuda
83 template<class Scalar,
84  class LocalOrdinal,
85  class GlobalOrdinal, class LocalOrdinalViewType>
86 struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
87  static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
88  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
89  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
90  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
91  const LocalOrdinalViewType & Acol2Brow,
92  const LocalOrdinalViewType & Acol2Irow,
93  const LocalOrdinalViewType & Bcol2Ccol,
94  const LocalOrdinalViewType & Icol2Ccol,
95  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
96  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
97  const std::string& label = std::string(),
98  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
99 
100  static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
101  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
102  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
103  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
104  const LocalOrdinalViewType & Acol2Brow,
105  const LocalOrdinalViewType & Acol2Irow,
106  const LocalOrdinalViewType & Bcol2Ccol,
107  const LocalOrdinalViewType & Icol2Ccol,
108  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
109  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
110  const std::string& label = std::string(),
111  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
112 
113  static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
114  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
115  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
116  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
117  const LocalOrdinalViewType & Acol2Brow,
118  const LocalOrdinalViewType & Acol2Irow,
119  const LocalOrdinalViewType & Bcol2Ccol,
120  const LocalOrdinalViewType & Icol2Ccol,
121  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
122  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
123  const std::string& label = std::string(),
124  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
125 };
126 
127 
128 /*********************************************************************************************************/
129 // AB NewMatrix Kernel wrappers (KokkosKernels/CUDA Version)
130 template<class Scalar,
131  class LocalOrdinal,
132  class GlobalOrdinal,
133  class LocalOrdinalViewType>
134 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
135  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
136  const LocalOrdinalViewType & Acol2Brow,
137  const LocalOrdinalViewType & Acol2Irow,
138  const LocalOrdinalViewType & Bcol2Ccol,
139  const LocalOrdinalViewType & Icol2Ccol,
140  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
141  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
142  const std::string& label,
143  const Teuchos::RCP<Teuchos::ParameterList>& params) {
144 
145 
146 #ifdef HAVE_TPETRA_MMM_TIMINGS
147  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
148  using Teuchos::TimeMonitor;
149  Teuchos::RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaWrapper")))));
150 #endif
151  // Node-specific code
152  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
153  std::string nodename("Cuda");
154 
155  // Lots and lots of typedefs
156  using Teuchos::RCP;
158  typedef typename KCRS::device_type device_t;
159  typedef typename KCRS::StaticCrsGraphType graph_t;
160  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
161  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
162  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
163  typedef typename KCRS::values_type::non_const_type scalar_view_t;
164  //typedef typename graph_t::row_map_type::const_type lno_view_t_const;
165 
166  // Options
167  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
168  std::string myalg("SPGEMM_KK_MEMORY");
169  if(!params.is_null()) {
170  if(params->isParameter("cuda: algorithm"))
171  myalg = params->get("cuda: algorithm",myalg);
172  if(params->isParameter("cuda: team work size"))
173  team_work_size = params->get("cuda: team work size",team_work_size);
174  }
175 
176  // KokkosKernelsHandle
177  typedef KokkosKernels::Experimental::KokkosKernelsHandle<
178  typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
179  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
180 
181  // Grab the Kokkos::SparseCrsMatrices
182  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
183  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
184 
185  c_lno_view_t Arowptr = Amat.graph.row_map,
186  Browptr = Bmat.graph.row_map;
187  const lno_nnz_view_t Acolind = Amat.graph.entries,
188  Bcolind = Bmat.graph.entries;
189  const scalar_view_t Avals = Amat.values,
190  Bvals = Bmat.values;
191 
192  c_lno_view_t Irowptr;
193  lno_nnz_view_t Icolind;
194  scalar_view_t Ivals;
195  if(!Bview.importMatrix.is_null()) {
196  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
197  Irowptr = lclB.graph.row_map;
198  Icolind = lclB.graph.entries;
199  Ivals = lclB.values;
200  }
201 
202 
203  // Get the algorithm mode
204  std::string alg = nodename+std::string(" algorithm");
205  // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
206  if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
207  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
208 
209  // Merge the B and Bimport matrices
210  const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
211 
212 #ifdef HAVE_TPETRA_MMM_TIMINGS
213  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaCore"))));
214 #endif
215 
216  // Do the multiply on whatever we've got
217  typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
218  typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
219  typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
220 
221  lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
222  lno_nnz_view_t entriesC;
223  scalar_view_t valuesC;
224  KernelHandle kh;
225  kh.create_spgemm_handle(alg_enum);
226  kh.set_team_work_size(team_work_size);
227 
228  KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,false,Bmerged.graph.row_map,Bmerged.graph.entries,false,row_mapC);
229 
230  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
231  if (c_nnz_size){
232  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
233  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
234  }
235  KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,false,row_mapC,entriesC,valuesC);
236  kh.destroy_spgemm_handle();
237 
238 #ifdef HAVE_TPETRA_MMM_TIMINGS
239  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaSort"))));
240 #endif
241 
242  // Sort & set values
243  if (params.is_null() || params->get("sort entries",true))
244  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
245  C.setAllValues(row_mapC,entriesC,valuesC);
246 
247 #ifdef HAVE_TPETRA_MMM_TIMINGS
248  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaESFC"))));
249 #endif
250 
251  // Final Fillcomplete
252  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
253  labelList->set("Timer Label",label);
254  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
255  RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
256  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
257 }
258 
259 
260 /*********************************************************************************************************/
261 template<class Scalar,
262  class LocalOrdinal,
263  class GlobalOrdinal,
264  class LocalOrdinalViewType>
265 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(
266  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
267  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
268  const LocalOrdinalViewType & targetMapToOrigRow_dev,
269  const LocalOrdinalViewType & targetMapToImportRow_dev,
270  const LocalOrdinalViewType & Bcol2Ccol_dev,
271  const LocalOrdinalViewType & Icol2Ccol_dev,
272  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
273  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
274  const std::string& label,
275  const Teuchos::RCP<Teuchos::ParameterList>& params) {
276 
277  // FIXME: Right now, this is a cut-and-paste of the serial kernel
278  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
279 
280 #ifdef HAVE_TPETRA_MMM_TIMINGS
281  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
282  using Teuchos::TimeMonitor;
283  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
284  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
285 #endif
286  using Teuchos::RCP;
287  using Teuchos::rcp;
288 
289 
290  // Lots and lots of typedefs
291  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
292  typedef typename KCRS::StaticCrsGraphType graph_t;
293  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
294  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
295  typedef typename KCRS::values_type::non_const_type scalar_view_t;
296 
297  typedef Scalar SC;
298  typedef LocalOrdinal LO;
299  typedef GlobalOrdinal GO;
300  typedef Node NO;
301  typedef Map<LO,GO,NO> map_type;
302  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
303  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
304  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
305 
306  // Since this is being run on Cuda, we need to fence because the below code will use UVM
307  // typename graph_t::execution_space().fence();
308 
309  // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
310  // KDDKDD UVM Ideally, this function would run on device and use
311  // KDDKDD UVM KokkosKernels instead of this host implementation.
312  auto targetMapToOrigRow =
313  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
314  targetMapToOrigRow_dev);
315  auto targetMapToImportRow =
316  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
317  targetMapToImportRow_dev);
318  auto Bcol2Ccol =
319  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
320  Bcol2Ccol_dev);
321  auto Icol2Ccol =
322  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
323  Icol2Ccol_dev);
324 
325  // Sizes
326  RCP<const map_type> Ccolmap = C.getColMap();
327  size_t m = Aview.origMatrix->getLocalNumRows();
328  size_t n = Ccolmap->getLocalNumElements();
329 
330  // Grab the Kokkos::SparseCrsMatrices & inner stuff
331  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
332  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
333  const KCRS & Cmat = C.getLocalMatrixHost();
334 
335  c_lno_view_t Arowptr = Amat.graph.row_map,
336  Browptr = Bmat.graph.row_map,
337  Crowptr = Cmat.graph.row_map;
338  const lno_nnz_view_t Acolind = Amat.graph.entries,
339  Bcolind = Bmat.graph.entries,
340  Ccolind = Cmat.graph.entries;
341  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
342  scalar_view_t Cvals = Cmat.values;
343 
344  c_lno_view_t Irowptr;
345  lno_nnz_view_t Icolind;
346  scalar_view_t Ivals;
347  if(!Bview.importMatrix.is_null()) {
348  auto lclB = Bview.importMatrix->getLocalMatrixHost();
349  Irowptr = lclB.graph.row_map;
350  Icolind = lclB.graph.entries;
351  Ivals = lclB.values;
352  }
353 
354 #ifdef HAVE_TPETRA_MMM_TIMINGS
355  MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
356 #endif
357 
358  // Classic csr assembly (low memory edition)
359  // mfh 27 Sep 2016: The c_status array is an implementation detail
360  // of the local sparse matrix-matrix multiply routine.
361 
362  // The status array will contain the index into colind where this entry was last deposited.
363  // c_status[i] < CSR_ip - not in the row yet
364  // c_status[i] >= CSR_ip - this is the entry where you can find the data
365  // We start with this filled with INVALID's indicating that there are no entries yet.
366  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
367  std::vector<size_t> c_status(n, ST_INVALID);
368 
369  // For each row of A/C
370  size_t CSR_ip = 0, OLD_ip = 0;
371  for (size_t i = 0; i < m; i++) {
372  // First fill the c_status array w/ locations where we're allowed to
373  // generate nonzeros for this row
374  OLD_ip = Crowptr[i];
375  CSR_ip = Crowptr[i+1];
376  for (size_t k = OLD_ip; k < CSR_ip; k++) {
377  c_status[Ccolind[k]] = k;
378 
379  // Reset values in the row of C
380  Cvals[k] = SC_ZERO;
381  }
382 
383  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
384  LO Aik = Acolind[k];
385  const SC Aval = Avals[k];
386  if (Aval == SC_ZERO)
387  continue;
388 
389  if (targetMapToOrigRow[Aik] != LO_INVALID) {
390  // Local matrix
391  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
392 
393  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
394  LO Bkj = Bcolind[j];
395  LO Cij = Bcol2Ccol[Bkj];
396 
397  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
398  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
399  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
400 
401  Cvals[c_status[Cij]] += Aval * Bvals[j];
402  }
403 
404  } else {
405  // Remote matrix
406  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
407  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
408  LO Ikj = Icolind[j];
409  LO Cij = Icol2Ccol[Ikj];
410 
411  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
412  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
413  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
414 
415  Cvals[c_status[Cij]] += Aval * Ivals[j];
416  }
417  }
418  }
419  }
420 
421  C.fillComplete(C.getDomainMap(), C.getRangeMap());
422 }
423 
424 /*********************************************************************************************************/
425 template<class Scalar,
426  class LocalOrdinal,
427  class GlobalOrdinal,
428  class LocalOrdinalViewType>
429 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
430  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
431  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
432  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
433  const LocalOrdinalViewType & Acol2Brow,
434  const LocalOrdinalViewType & Acol2Irow,
435  const LocalOrdinalViewType & Bcol2Ccol,
436  const LocalOrdinalViewType & Icol2Ccol,
437  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
438  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
439  const std::string& label,
440  const Teuchos::RCP<Teuchos::ParameterList>& params) {
441 
442 #ifdef HAVE_TPETRA_MMM_TIMINGS
443  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
444  using Teuchos::TimeMonitor;
445  Teuchos::RCP<TimeMonitor> MM;
446 #endif
447 
448  // Node-specific code
449  using Teuchos::RCP;
450 
451  // Options
452  //int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
453  std::string myalg("KK");
454  if(!params.is_null()) {
455  if(params->isParameter("cuda: jacobi algorithm"))
456  myalg = params->get("cuda: jacobi algorithm",myalg);
457  }
458 
459  if(myalg == "MSAK") {
460  ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
461  }
462  else if(myalg == "KK") {
463  jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
464  }
465  else {
466  throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
467  }
468 
469 #ifdef HAVE_TPETRA_MMM_TIMINGS
470  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
471 #endif
472 
473  // Final Fillcomplete
474  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
475  labelList->set("Timer Label",label);
476  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
477 
478  // NOTE: MSAK already fillCompletes, so we have to check here
479  if(!C.isFillComplete()) {
480  RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
481  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
482  }
483 
484 }
485 
486 
487 
488 /*********************************************************************************************************/
489 template<class Scalar,
490  class LocalOrdinal,
491  class GlobalOrdinal,
492  class LocalOrdinalViewType>
493 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
494  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
495  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
496  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
497  const LocalOrdinalViewType & targetMapToOrigRow_dev,
498  const LocalOrdinalViewType & targetMapToImportRow_dev,
499  const LocalOrdinalViewType & Bcol2Ccol_dev,
500  const LocalOrdinalViewType & Icol2Ccol_dev,
501  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
502  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
503  const std::string& label,
504  const Teuchos::RCP<Teuchos::ParameterList>& params) {
505 
506  // FIXME: Right now, this is a cut-and-paste of the serial kernel
507  typedef Tpetra::KokkosCompat::KokkosCudaWrapperNode Node;
508 
509 #ifdef HAVE_TPETRA_MMM_TIMINGS
510  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
511  using Teuchos::TimeMonitor;
512  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore"))));
513  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
514 #endif
515  using Teuchos::RCP;
516  using Teuchos::rcp;
517 
518  // Lots and lots of typedefs
519  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
520  typedef typename KCRS::StaticCrsGraphType graph_t;
521  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
522  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
523  typedef typename KCRS::values_type::non_const_type scalar_view_t;
524  typedef typename scalar_view_t::memory_space scalar_memory_space;
525 
526  typedef Scalar SC;
527  typedef LocalOrdinal LO;
528  typedef GlobalOrdinal GO;
529  typedef Node NO;
530  typedef Map<LO,GO,NO> map_type;
531  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
532  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
533  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
534 
535  // Since this is being run on Cuda, we need to fence because the below host code will use UVM
536  // KDDKDD typename graph_t::execution_space().fence();
537 
538  // KDDKDD UVM Without UVM, need to copy targetMap arrays to host.
539  // KDDKDD UVM Ideally, this function would run on device and use
540  // KDDKDD UVM KokkosKernels instead of this host implementation.
541  auto targetMapToOrigRow =
542  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
543  targetMapToOrigRow_dev);
544  auto targetMapToImportRow =
545  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
546  targetMapToImportRow_dev);
547  auto Bcol2Ccol =
548  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
549  Bcol2Ccol_dev);
550  auto Icol2Ccol =
551  Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
552  Icol2Ccol_dev);
553 
554 
555  // Sizes
556  RCP<const map_type> Ccolmap = C.getColMap();
557  size_t m = Aview.origMatrix->getLocalNumRows();
558  size_t n = Ccolmap->getLocalNumElements();
559 
560  // Grab the Kokkos::SparseCrsMatrices & inner stuff
561  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
562  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
563  const KCRS & Cmat = C.getLocalMatrixHost();
564 
565  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
566  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
567  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
568  scalar_view_t Cvals = Cmat.values;
569 
570  c_lno_view_t Irowptr;
571  lno_nnz_view_t Icolind;
572  scalar_view_t Ivals;
573  if(!Bview.importMatrix.is_null()) {
574  auto lclB = Bview.importMatrix->getLocalMatrixHost();
575  Irowptr = lclB.graph.row_map;
576  Icolind = lclB.graph.entries;
577  Ivals = lclB.values;
578  }
579 
580  // Jacobi-specific inner stuff
581  auto Dvals =
582  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
583 
584 #ifdef HAVE_TPETRA_MMM_TIMINGS
585  MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore - Compare"))));
586 #endif
587 
588  // The status array will contain the index into colind where this entry was last deposited.
589  // c_status[i] < CSR_ip - not in the row yet
590  // c_status[i] >= CSR_ip - this is the entry where you can find the data
591  // We start with this filled with INVALID's indicating that there are no entries yet.
592  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
593  std::vector<size_t> c_status(n, ST_INVALID);
594 
595  // For each row of A/C
596  size_t CSR_ip = 0, OLD_ip = 0;
597  for (size_t i = 0; i < m; i++) {
598 
599  // First fill the c_status array w/ locations where we're allowed to
600  // generate nonzeros for this row
601  OLD_ip = Crowptr[i];
602  CSR_ip = Crowptr[i+1];
603  for (size_t k = OLD_ip; k < CSR_ip; k++) {
604  c_status[Ccolind[k]] = k;
605 
606  // Reset values in the row of C
607  Cvals[k] = SC_ZERO;
608  }
609 
610  SC minusOmegaDval = -omega*Dvals(i,0);
611 
612  // Entries of B
613  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
614  Scalar Bval = Bvals[j];
615  if (Bval == SC_ZERO)
616  continue;
617  LO Bij = Bcolind[j];
618  LO Cij = Bcol2Ccol[Bij];
619 
620  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
621  std::runtime_error, "Trying to insert a new entry into a static graph");
622 
623  Cvals[c_status[Cij]] = Bvals[j];
624  }
625 
626  // Entries of -omega * Dinv * A * B
627  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
628  LO Aik = Acolind[k];
629  const SC Aval = Avals[k];
630  if (Aval == SC_ZERO)
631  continue;
632 
633  if (targetMapToOrigRow[Aik] != LO_INVALID) {
634  // Local matrix
635  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
636 
637  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
638  LO Bkj = Bcolind[j];
639  LO Cij = Bcol2Ccol[Bkj];
640 
641  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
642  std::runtime_error, "Trying to insert a new entry into a static graph");
643 
644  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
645  }
646 
647  } else {
648  // Remote matrix
649  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
650  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
651  LO Ikj = Icolind[j];
652  LO Cij = Icol2Ccol[Ikj];
653 
654  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
655  std::runtime_error, "Trying to insert a new entry into a static graph");
656 
657  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
658  }
659  }
660  }
661  }
662 
663 #ifdef HAVE_TPETRA_MMM_TIMINGS
664  MM2= Teuchos::null;
665  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
666 #endif
667 
668  C.fillComplete(C.getDomainMap(), C.getRangeMap());
669 
670 }
671 
672 /*********************************************************************************************************/
673 template<class Scalar,
674  class LocalOrdinal,
675  class GlobalOrdinal,
676  class LocalOrdinalViewType>
677 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
678  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> & Dinv,
679  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Aview,
680  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& Bview,
681  const LocalOrdinalViewType & Acol2Brow,
682  const LocalOrdinalViewType & Acol2Irow,
683  const LocalOrdinalViewType & Bcol2Ccol,
684  const LocalOrdinalViewType & Icol2Ccol,
685  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosCudaWrapperNode>& C,
686  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > Cimport,
687  const std::string& label,
688  const Teuchos::RCP<Teuchos::ParameterList>& params) {
689 
690 #ifdef HAVE_TPETRA_MMM_TIMINGS
691  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
692  using Teuchos::TimeMonitor;
693  Teuchos::RCP<TimeMonitor> MM;
694 #endif
695 
696  // Check if the diagonal entries exist in debug mode
697  const bool debug = Tpetra::Details::Behavior::debug();
698  if(debug) {
699 
700  auto rowMap = Aview.origMatrix->getRowMap();
702  Aview.origMatrix->getLocalDiagCopy(diags);
703  size_t diagLength = rowMap->getLocalNumElements();
704  Teuchos::Array<Scalar> diagonal(diagLength);
705  diags.get1dCopy(diagonal());
706 
707  for(size_t i = 0; i < diagLength; ++i) {
708  TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
709  std::runtime_error,
710  "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
711  "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
712  }
713  }
714 
715  // Usings
716  using device_t = typename Tpetra::KokkosCompat::KokkosCudaWrapperNode::device_type;
718  using graph_t = typename matrix_t::StaticCrsGraphType;
719  using lno_view_t = typename graph_t::row_map_type::non_const_type;
720  using c_lno_view_t = typename graph_t::row_map_type::const_type;
721  using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
722  using scalar_view_t = typename matrix_t::values_type::non_const_type;
723 
724  // KokkosKernels handle
725  using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
726  typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
727  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
728 
729  // Get the rowPtr, colInd and vals of importMatrix
730  c_lno_view_t Irowptr;
731  lno_nnz_view_t Icolind;
732  scalar_view_t Ivals;
733  if(!Bview.importMatrix.is_null()) {
734  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
735  Irowptr = lclB.graph.row_map;
736  Icolind = lclB.graph.entries;
737  Ivals = lclB.values;
738  }
739 
740  // Merge the B and Bimport matrices
741  const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getLocalNumElements());
742 
743  // Get the properties and arrays of input matrices
744  const matrix_t & Amat = Aview.origMatrix->getLocalMatrixDevice();
745  const matrix_t & Bmat = Bview.origMatrix->getLocalMatrixDevice();
746 
747  typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
748  typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
749  typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
750 
751  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmerged.graph.row_map;
752  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmerged.graph.entries;
753  const scalar_view_t Avals = Amat.values, Bvals = Bmerged.values;
754 
755  // Arrays of the output matrix
756  lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
757  lno_nnz_view_t entriesC;
758  scalar_view_t valuesC;
759 
760  // Options
761  int team_work_size = 16;
762  std::string myalg("SPGEMM_KK_MEMORY");
763  if(!params.is_null()) {
764  if(params->isParameter("cuda: algorithm"))
765  myalg = params->get("cuda: algorithm",myalg);
766  if(params->isParameter("cuda: team work size"))
767  team_work_size = params->get("cuda: team work size",team_work_size);
768  }
769 
770  // Get the algorithm mode
771  std::string nodename("Cuda");
772  std::string alg = nodename + std::string(" algorithm");
773  if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
774  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
775 
776 
777  // KokkosKernels call
778  handle_t kh;
779  kh.create_spgemm_handle(alg_enum);
780  kh.set_team_work_size(team_work_size);
781 
782  KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
783  Arowptr, Acolind, false,
784  Browptr, Bcolind, false,
785  row_mapC);
786 
787  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
788  if (c_nnz_size){
789  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
790  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
791  }
792 
793  KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
794  Arowptr, Acolind, Avals, false,
795  Browptr, Bcolind, Bvals, false,
796  row_mapC, entriesC, valuesC,
797  omega, Dinv.getLocalViewDevice(Access::ReadOnly));
798  kh.destroy_spgemm_handle();
799 
800 #ifdef HAVE_TPETRA_MMM_TIMINGS
801  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaSort"))));
802 #endif
803 
804  // Sort & set values
805  if (params.is_null() || params->get("sort entries",true))
806  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
807  C.setAllValues(row_mapC,entriesC,valuesC);
808 
809 #ifdef HAVE_TPETRA_MMM_TIMINGS
810  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
811 #endif
812 
813  // Final Fillcomplete
814  Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
815  labelList->set("Timer Label",label);
816  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
817  Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosCudaWrapperNode> > dummyExport;
818  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
819 }
820 
821  }//MMdetails
822 }//Tpetra
823 
824 #endif//CUDA
825 
826 #endif
static bool debug()
Whether Tpetra is in debug mode.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
A distributed dense vector.