Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
TpetraExt_MatrixMatrix_Cuda.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
44 
45 #ifdef HAVE_TPETRA_INST_CUDA
46 namespace Tpetra {
47 namespace MMdetails {
48 
49 /*********************************************************************************************************/
50 // MMM KernelWrappers for Partial Specialization to CUDA
51 template<class Scalar,
52  class LocalOrdinal,
53  class GlobalOrdinal,
54  class LocalOrdinalViewType>
55 struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
56  static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
57  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
58  const LocalOrdinalViewType & Acol2Brow,
59  const LocalOrdinalViewType & Acol2Irow,
60  const LocalOrdinalViewType & Bcol2Ccol,
61  const LocalOrdinalViewType & Icol2Ccol,
62  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
63  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
64  const std::string& label = std::string(),
65  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
66 
67 
68 
69  static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
70  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
71  const LocalOrdinalViewType & Acol2Brow,
72  const LocalOrdinalViewType & Acol2Irow,
73  const LocalOrdinalViewType & Bcol2Ccol,
74  const LocalOrdinalViewType & Icol2Ccol,
75  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
76  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
77  const std::string& label = std::string(),
78  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
79 
80 };
81 
82 // Jacobi KernelWrappers for Partial Specialization to Cuda
83 template<class Scalar,
84  class LocalOrdinal,
85  class GlobalOrdinal, class LocalOrdinalViewType>
86 struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
87  static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
88  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
89  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
90  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
91  const LocalOrdinalViewType & Acol2Brow,
92  const LocalOrdinalViewType & Acol2Irow,
93  const LocalOrdinalViewType & Bcol2Ccol,
94  const LocalOrdinalViewType & Icol2Ccol,
95  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
96  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
97  const std::string& label = std::string(),
98  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
99 
100  static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
101  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
102  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
103  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
104  const LocalOrdinalViewType & Acol2Brow,
105  const LocalOrdinalViewType & Acol2Irow,
106  const LocalOrdinalViewType & Bcol2Ccol,
107  const LocalOrdinalViewType & Icol2Ccol,
108  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
109  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
110  const std::string& label = std::string(),
111  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
112 
113 };
114 
115 
116 /*********************************************************************************************************/
117 // AB NewMatrix Kernel wrappers (KokkosKernels/CUDA Version)
118 template<class Scalar,
119  class LocalOrdinal,
120  class GlobalOrdinal,
121  class LocalOrdinalViewType>
122 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
123  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
124  const LocalOrdinalViewType & Acol2Brow,
125  const LocalOrdinalViewType & Acol2Irow,
126  const LocalOrdinalViewType & Bcol2Ccol,
127  const LocalOrdinalViewType & Icol2Ccol,
128  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
129  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
130  const std::string& label,
131  const Teuchos::RCP<Teuchos::ParameterList>& params) {
132 
133 
134 #ifdef HAVE_TPETRA_MMM_TIMINGS
135  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
136  using Teuchos::TimeMonitor;
137  Teuchos::RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaWrapper")))));
138 #endif
139  // Node-specific code
140  typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
141  std::string nodename("Cuda");
142 
143  // Lots and lots of typedefs
144  using Teuchos::RCP;
146  typedef typename KCRS::device_type device_t;
147  typedef typename KCRS::StaticCrsGraphType graph_t;
148  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
149  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
150  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
151  typedef typename KCRS::values_type::non_const_type scalar_view_t;
152  //typedef typename graph_t::row_map_type::const_type lno_view_t_const;
153 
154  // Options
155  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
156  std::string myalg("SPGEMM_KK_MEMORY");
157  if(!params.is_null()) {
158  if(params->isParameter("cuda: algorithm"))
159  myalg = params->get("cuda: algorithm",myalg);
160  if(params->isParameter("cuda: team work size"))
161  team_work_size = params->get("cuda: team work size",team_work_size);
162  }
163 
164  // KokkosKernelsHandle
165  typedef KokkosKernels::Experimental::KokkosKernelsHandle<
166  typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
167  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
168 
169  // Grab the Kokkos::SparseCrsMatrices
170  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
171  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
172 
173  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
174  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
175  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
176 
177  c_lno_view_t Irowptr;
178  lno_nnz_view_t Icolind;
179  scalar_view_t Ivals;
180  if(!Bview.importMatrix.is_null()) {
181  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
182  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
183  Ivals = Bview.importMatrix->getLocalMatrix().values;
184  }
185 
186 
187  // Get the algorithm mode
188  std::string alg = nodename+std::string(" algorithm");
189  // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
190  if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
191  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
192 
193  // Merge the B and Bimport matrices
194  const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getNodeNumElements());
195 
196 #ifdef HAVE_TPETRA_MMM_TIMINGS
197  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaCore"))));
198 #endif
199 
200  // Do the multiply on whatever we've got
201  typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
202  typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
203  typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
204 
205  lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
206  lno_nnz_view_t entriesC;
207  scalar_view_t valuesC;
208  KernelHandle kh;
209  kh.create_spgemm_handle(alg_enum);
210  kh.set_team_work_size(team_work_size);
211 
212  KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,false,Bmerged.graph.row_map,Bmerged.graph.entries,false,row_mapC);
213 
214  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
215  if (c_nnz_size){
216  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
217  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
218  }
219  KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,false,row_mapC,entriesC,valuesC);
220  kh.destroy_spgemm_handle();
221 
222 #ifdef HAVE_TPETRA_MMM_TIMINGS
223  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaSort"))));
224 #endif
225 
226  // Sort & set values
227  if (params.is_null() || params->get("sort entries",true))
228  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
229  C.setAllValues(row_mapC,entriesC,valuesC);
230 
231 #ifdef HAVE_TPETRA_MMM_TIMINGS
232  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaESFC"))));
233 #endif
234 
235  // Final Fillcomplete
236  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
237  labelList->set("Timer Label",label);
238  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
239  RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
240  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
241 }
242 
243 
244 /*********************************************************************************************************/
245 template<class Scalar,
246  class LocalOrdinal,
247  class GlobalOrdinal,
248  class LocalOrdinalViewType>
249 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
250  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
251  const LocalOrdinalViewType & targetMapToOrigRow,
252  const LocalOrdinalViewType & targetMapToImportRow,
253  const LocalOrdinalViewType & Bcol2Ccol,
254  const LocalOrdinalViewType & Icol2Ccol,
255  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
256  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
257  const std::string& label,
258  const Teuchos::RCP<Teuchos::ParameterList>& params) {
259 
260  // FIXME: Right now, this is a cut-and-paste of the serial kernel
261  typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
262 
263 #ifdef HAVE_TPETRA_MMM_TIMINGS
264  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
265  using Teuchos::TimeMonitor;
266  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
267  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
268 #endif
269  using Teuchos::RCP;
270  using Teuchos::rcp;
271 
272 
273  // Lots and lots of typedefs
275  typedef typename KCRS::StaticCrsGraphType graph_t;
276  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
277  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
278  typedef typename KCRS::values_type::non_const_type scalar_view_t;
279 
280  typedef Scalar SC;
281  typedef LocalOrdinal LO;
282  typedef GlobalOrdinal GO;
283  typedef Node NO;
284  typedef Map<LO,GO,NO> map_type;
285  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
286  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
287  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
288 
289  // Sizes
290  RCP<const map_type> Ccolmap = C.getColMap();
291  size_t m = Aview.origMatrix->getNodeNumRows();
292  size_t n = Ccolmap->getNodeNumElements();
293 
294  // Grab the Kokkos::SparseCrsMatrices & inner stuff
295  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
296  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
297  const KCRS & Cmat = C.getLocalMatrix();
298 
299  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
300  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
301  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
302  scalar_view_t Cvals = Cmat.values;
303 
304  c_lno_view_t Irowptr;
305  lno_nnz_view_t Icolind;
306  scalar_view_t Ivals;
307  if(!Bview.importMatrix.is_null()) {
308  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
309  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
310  Ivals = Bview.importMatrix->getLocalMatrix().values;
311  }
312 
313 #ifdef HAVE_TPETRA_MMM_TIMINGS
314  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
315 #endif
316 
317  // Classic csr assembly (low memory edition)
318  // mfh 27 Sep 2016: The c_status array is an implementation detail
319  // of the local sparse matrix-matrix multiply routine.
320 
321  // The status array will contain the index into colind where this entry was last deposited.
322  // c_status[i] < CSR_ip - not in the row yet
323  // c_status[i] >= CSR_ip - this is the entry where you can find the data
324  // We start with this filled with INVALID's indicating that there are no entries yet.
325  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
326  std::vector<size_t> c_status(n, ST_INVALID);
327 
328  // For each row of A/C
329  size_t CSR_ip = 0, OLD_ip = 0;
330  for (size_t i = 0; i < m; i++) {
331  // First fill the c_status array w/ locations where we're allowed to
332  // generate nonzeros for this row
333  OLD_ip = Crowptr[i];
334  CSR_ip = Crowptr[i+1];
335  for (size_t k = OLD_ip; k < CSR_ip; k++) {
336  c_status[Ccolind[k]] = k;
337 
338  // Reset values in the row of C
339  Cvals[k] = SC_ZERO;
340  }
341 
342  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
343  LO Aik = Acolind[k];
344  const SC Aval = Avals[k];
345  if (Aval == SC_ZERO)
346  continue;
347 
348  if (targetMapToOrigRow[Aik] != LO_INVALID) {
349  // Local matrix
350  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
351 
352  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
353  LO Bkj = Bcolind[j];
354  LO Cij = Bcol2Ccol[Bkj];
355 
356  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
357  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
358  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
359 
360  Cvals[c_status[Cij]] += Aval * Bvals[j];
361  }
362 
363  } else {
364  // Remote matrix
365  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
366  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
367  LO Ikj = Icolind[j];
368  LO Cij = Icol2Ccol[Ikj];
369 
370  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
371  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
372  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
373 
374  Cvals[c_status[Cij]] += Aval * Ivals[j];
375  }
376  }
377  }
378  }
379 
380  C.fillComplete(C.getDomainMap(), C.getRangeMap());
381 }
382 
383 /*********************************************************************************************************/
384 template<class Scalar,
385  class LocalOrdinal,
386  class GlobalOrdinal,
387  class LocalOrdinalViewType>
388 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
389  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
390  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
391  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
392  const LocalOrdinalViewType & Acol2Brow,
393  const LocalOrdinalViewType & Acol2Irow,
394  const LocalOrdinalViewType & Bcol2Ccol,
395  const LocalOrdinalViewType & Icol2Ccol,
396  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
397  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
398  const std::string& label,
399  const Teuchos::RCP<Teuchos::ParameterList>& params) {
400 
401 #ifdef HAVE_TPETRA_MMM_TIMINGS
402  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
403  using Teuchos::TimeMonitor;
404  Teuchos::RCP<TimeMonitor> MM;
405 #endif
406 
407  // Node-specific code
408  using Teuchos::RCP;
409 
410  // Options
411  //int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
412  std::string myalg("MSAK");
413  if(!params.is_null()) {
414  if(params->isParameter("cuda: jacobi algorithm"))
415  myalg = params->get("cuda: jacobi algorithm",myalg);
416  }
417 
418  if(myalg == "MSAK") {
419  ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
420  }
421  else {
422  throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
423  }
424 
425 #ifdef HAVE_TPETRA_MMM_TIMINGS
426  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
427 #endif
428 
429  // Final Fillcomplete
430  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
431  labelList->set("Timer Label",label);
432  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
433 
434  // NOTE: MSAK already fillCompletes, so we have to check here
435  if(!C.isFillComplete()) {
436  RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
437  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
438  }
439 
440 }
441 
442 
443 
444 /*********************************************************************************************************/
445 template<class Scalar,
446  class LocalOrdinal,
447  class GlobalOrdinal,
448  class LocalOrdinalViewType>
449 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
450  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
451  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
452  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
453  const LocalOrdinalViewType & targetMapToOrigRow,
454  const LocalOrdinalViewType & targetMapToImportRow,
455  const LocalOrdinalViewType & Bcol2Ccol,
456  const LocalOrdinalViewType & Icol2Ccol,
457  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
458  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
459  const std::string& label,
460  const Teuchos::RCP<Teuchos::ParameterList>& params) {
461 
462  // FIXME: Right now, this is a cut-and-paste of the serial kernel
463  typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
464 
465 #ifdef HAVE_TPETRA_MMM_TIMINGS
466  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
467  using Teuchos::TimeMonitor;
468  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore"))));
469  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
470 #endif
471  using Teuchos::RCP;
472  using Teuchos::rcp;
473 
474  // Lots and lots of typedefs
476  typedef typename KCRS::StaticCrsGraphType graph_t;
477  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
478  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
479  typedef typename KCRS::values_type::non_const_type scalar_view_t;
480  typedef typename scalar_view_t::memory_space scalar_memory_space;
481 
482  typedef Scalar SC;
483  typedef LocalOrdinal LO;
484  typedef GlobalOrdinal GO;
485  typedef Node NO;
486  typedef Map<LO,GO,NO> map_type;
487  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
488  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
489  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
490 
491  // Sizes
492  RCP<const map_type> Ccolmap = C.getColMap();
493  size_t m = Aview.origMatrix->getNodeNumRows();
494  size_t n = Ccolmap->getNodeNumElements();
495 
496  // Grab the Kokkos::SparseCrsMatrices & inner stuff
497  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
498  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
499  const KCRS & Cmat = C.getLocalMatrix();
500 
501  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
502  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
503  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
504  scalar_view_t Cvals = Cmat.values;
505 
506  c_lno_view_t Irowptr;
507  lno_nnz_view_t Icolind;
508  scalar_view_t Ivals;
509  if(!Bview.importMatrix.is_null()) {
510  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
511  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
512  Ivals = Bview.importMatrix->getLocalMatrix().values;
513  }
514 
515  // Jacobi-specific inner stuff
516  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
517 
518 #ifdef HAVE_TPETRA_MMM_TIMINGS
519  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore - Compare"))));
520 #endif
521 
522  // The status array will contain the index into colind where this entry was last deposited.
523  // c_status[i] < CSR_ip - not in the row yet
524  // c_status[i] >= CSR_ip - this is the entry where you can find the data
525  // We start with this filled with INVALID's indicating that there are no entries yet.
526  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
527  std::vector<size_t> c_status(n, ST_INVALID);
528 
529  // For each row of A/C
530  size_t CSR_ip = 0, OLD_ip = 0;
531  for (size_t i = 0; i < m; i++) {
532 
533  // First fill the c_status array w/ locations where we're allowed to
534  // generate nonzeros for this row
535  OLD_ip = Crowptr[i];
536  CSR_ip = Crowptr[i+1];
537  for (size_t k = OLD_ip; k < CSR_ip; k++) {
538  c_status[Ccolind[k]] = k;
539 
540  // Reset values in the row of C
541  Cvals[k] = SC_ZERO;
542  }
543 
544  SC minusOmegaDval = -omega*Dvals(i,0);
545 
546  // Entries of B
547  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
548  Scalar Bval = Bvals[j];
549  if (Bval == SC_ZERO)
550  continue;
551  LO Bij = Bcolind[j];
552  LO Cij = Bcol2Ccol[Bij];
553 
554  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
555  std::runtime_error, "Trying to insert a new entry into a static graph");
556 
557  Cvals[c_status[Cij]] = Bvals[j];
558  }
559 
560  // Entries of -omega * Dinv * A * B
561  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
562  LO Aik = Acolind[k];
563  const SC Aval = Avals[k];
564  if (Aval == SC_ZERO)
565  continue;
566 
567  if (targetMapToOrigRow[Aik] != LO_INVALID) {
568  // Local matrix
569  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
570 
571  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
572  LO Bkj = Bcolind[j];
573  LO Cij = Bcol2Ccol[Bkj];
574 
575  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
576  std::runtime_error, "Trying to insert a new entry into a static graph");
577 
578  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
579  }
580 
581  } else {
582  // Remote matrix
583  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
584  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
585  LO Ikj = Icolind[j];
586  LO Cij = Icol2Ccol[Ikj];
587 
588  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
589  std::runtime_error, "Trying to insert a new entry into a static graph");
590 
591  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
592  }
593  }
594  }
595  }
596 
597 #ifdef HAVE_TPETRA_MMM_TIMINGS
598  MM2= Teuchos::null;
599  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
600 #endif
601 
602  C.fillComplete(C.getDomainMap(), C.getRangeMap());
603 
604 }
605 
606  }//MMdetails
607 }//Tpetra
608 
609 #endif//CUDA
610 
611 #endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...