Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_MatrixMatrix_Cuda.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_CUDA_DEF_HPP
44 
45 #ifdef HAVE_TPETRA_INST_CUDA
46 namespace Tpetra {
47 namespace MMdetails {
48 
49 /*********************************************************************************************************/
50 // MMM KernelWrappers for Partial Specialization to CUDA
51 template<class Scalar,
52  class LocalOrdinal,
53  class GlobalOrdinal,
54  class LocalOrdinalViewType>
55 struct KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
56  static inline void mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
57  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
58  const LocalOrdinalViewType & Acol2Brow,
59  const LocalOrdinalViewType & Acol2Irow,
60  const LocalOrdinalViewType & Bcol2Ccol,
61  const LocalOrdinalViewType & Icol2Ccol,
62  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
63  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
64  const std::string& label = std::string(),
65  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
66 
67 
68 
69  static inline void mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
70  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
71  const LocalOrdinalViewType & Acol2Brow,
72  const LocalOrdinalViewType & Acol2Irow,
73  const LocalOrdinalViewType & Bcol2Ccol,
74  const LocalOrdinalViewType & Icol2Ccol,
75  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
76  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
77  const std::string& label = std::string(),
78  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
79 
80 };
81 
82 // Jacobi KernelWrappers for Partial Specialization to Cuda
83 template<class Scalar,
84  class LocalOrdinal,
85  class GlobalOrdinal, class LocalOrdinalViewType>
86 struct KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType> {
87  static inline void jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
88  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
89  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
90  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
91  const LocalOrdinalViewType & Acol2Brow,
92  const LocalOrdinalViewType & Acol2Irow,
93  const LocalOrdinalViewType & Bcol2Ccol,
94  const LocalOrdinalViewType & Icol2Ccol,
95  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
96  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
97  const std::string& label = std::string(),
98  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
99 
100  static inline void jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
101  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
102  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
103  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
104  const LocalOrdinalViewType & Acol2Brow,
105  const LocalOrdinalViewType & Acol2Irow,
106  const LocalOrdinalViewType & Bcol2Ccol,
107  const LocalOrdinalViewType & Icol2Ccol,
108  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
109  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
110  const std::string& label = std::string(),
111  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
112 
113  static inline void jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
114  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
115  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
116  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
117  const LocalOrdinalViewType & Acol2Brow,
118  const LocalOrdinalViewType & Acol2Irow,
119  const LocalOrdinalViewType & Bcol2Ccol,
120  const LocalOrdinalViewType & Icol2Ccol,
121  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
122  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
123  const std::string& label = std::string(),
124  const Teuchos::RCP<Teuchos::ParameterList>& params = Teuchos::null);
125 };
126 
127 
128 /*********************************************************************************************************/
129 // AB NewMatrix Kernel wrappers (KokkosKernels/CUDA Version)
130 template<class Scalar,
131  class LocalOrdinal,
132  class GlobalOrdinal,
133  class LocalOrdinalViewType>
134 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
135  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
136  const LocalOrdinalViewType & Acol2Brow,
137  const LocalOrdinalViewType & Acol2Irow,
138  const LocalOrdinalViewType & Bcol2Ccol,
139  const LocalOrdinalViewType & Icol2Ccol,
140  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
141  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
142  const std::string& label,
143  const Teuchos::RCP<Teuchos::ParameterList>& params) {
144 
145 
146 #ifdef HAVE_TPETRA_MMM_TIMINGS
147  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
148  using Teuchos::TimeMonitor;
149  Teuchos::RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaWrapper")))));
150 #endif
151  // Node-specific code
152  typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
153  std::string nodename("Cuda");
154 
155  // Lots and lots of typedefs
156  using Teuchos::RCP;
158  typedef typename KCRS::device_type device_t;
159  typedef typename KCRS::StaticCrsGraphType graph_t;
160  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
161  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
162  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
163  typedef typename KCRS::values_type::non_const_type scalar_view_t;
164  //typedef typename graph_t::row_map_type::const_type lno_view_t_const;
165 
166  // Options
167  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
168  std::string myalg("SPGEMM_KK_MEMORY");
169  if(!params.is_null()) {
170  if(params->isParameter("cuda: algorithm"))
171  myalg = params->get("cuda: algorithm",myalg);
172  if(params->isParameter("cuda: team work size"))
173  team_work_size = params->get("cuda: team work size",team_work_size);
174  }
175 
176  // KokkosKernelsHandle
177  typedef KokkosKernels::Experimental::KokkosKernelsHandle<
178  typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
179  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space > KernelHandle;
180 
181  // Grab the Kokkos::SparseCrsMatrices
182  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
183  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
184 
185  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
186  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
187  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
188 
189  c_lno_view_t Irowptr;
190  lno_nnz_view_t Icolind;
191  scalar_view_t Ivals;
192  if(!Bview.importMatrix.is_null()) {
193  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
194  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
195  Ivals = Bview.importMatrix->getLocalMatrix().values;
196  }
197 
198 
199  // Get the algorithm mode
200  std::string alg = nodename+std::string(" algorithm");
201  // printf("DEBUG: Using kernel: %s\n",myalg.c_str());
202  if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
203  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
204 
205  // Merge the B and Bimport matrices
206  const KCRS Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getNodeNumElements());
207 
208 #ifdef HAVE_TPETRA_MMM_TIMINGS
209  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaCore"))));
210 #endif
211 
212  // Do the multiply on whatever we've got
213  typename KernelHandle::nnz_lno_t AnumRows = Amat.numRows();
214  typename KernelHandle::nnz_lno_t BnumRows = Bmerged.numRows();
215  typename KernelHandle::nnz_lno_t BnumCols = Bmerged.numCols();
216 
217  lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
218  lno_nnz_view_t entriesC;
219  scalar_view_t valuesC;
220  KernelHandle kh;
221  kh.create_spgemm_handle(alg_enum);
222  kh.set_team_work_size(team_work_size);
223 
224  KokkosSparse::Experimental::spgemm_symbolic(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,false,Bmerged.graph.row_map,Bmerged.graph.entries,false,row_mapC);
225 
226  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
227  if (c_nnz_size){
228  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
229  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
230  }
231  KokkosSparse::Experimental::spgemm_numeric(&kh,AnumRows,BnumRows,BnumCols,Amat.graph.row_map,Amat.graph.entries,Amat.values,false,Bmerged.graph.row_map,Bmerged.graph.entries,Bmerged.values,false,row_mapC,entriesC,valuesC);
232  kh.destroy_spgemm_handle();
233 
234 #ifdef HAVE_TPETRA_MMM_TIMINGS
235  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaSort"))));
236 #endif
237 
238  // Sort & set values
239  if (params.is_null() || params->get("sort entries",true))
240  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
241  C.setAllValues(row_mapC,entriesC,valuesC);
242 
243 #ifdef HAVE_TPETRA_MMM_TIMINGS
244  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix CudaESFC"))));
245 #endif
246 
247  // Final Fillcomplete
248  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
249  labelList->set("Timer Label",label);
250  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
251  RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
252  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
253 }
254 
255 
256 /*********************************************************************************************************/
257 template<class Scalar,
258  class LocalOrdinal,
259  class GlobalOrdinal,
260  class LocalOrdinalViewType>
261 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
262  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
263  const LocalOrdinalViewType & targetMapToOrigRow,
264  const LocalOrdinalViewType & targetMapToImportRow,
265  const LocalOrdinalViewType & Bcol2Ccol,
266  const LocalOrdinalViewType & Icol2Ccol,
267  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
268  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
269  const std::string& label,
270  const Teuchos::RCP<Teuchos::ParameterList>& params) {
271 
272  // FIXME: Right now, this is a cut-and-paste of the serial kernel
273  typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
274 
275 #ifdef HAVE_TPETRA_MMM_TIMINGS
276  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
277  using Teuchos::TimeMonitor;
278  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
279  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
280 #endif
281  using Teuchos::RCP;
282  using Teuchos::rcp;
283 
284 
285  // Lots and lots of typedefs
287  typedef typename KCRS::StaticCrsGraphType graph_t;
288  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
289  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
290  typedef typename KCRS::values_type::non_const_type scalar_view_t;
291 
292  typedef Scalar SC;
293  typedef LocalOrdinal LO;
294  typedef GlobalOrdinal GO;
295  typedef Node NO;
296  typedef Map<LO,GO,NO> map_type;
297  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
298  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
299  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
300 
301  // Since this is being run on Cuda, we need to fence because the below host code will use UVM
302  typename graph_t::execution_space().fence();
303 
304  // Sizes
305  RCP<const map_type> Ccolmap = C.getColMap();
306  size_t m = Aview.origMatrix->getNodeNumRows();
307  size_t n = Ccolmap->getNodeNumElements();
308 
309  // Grab the Kokkos::SparseCrsMatrices & inner stuff
310  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
311  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
312  const KCRS & Cmat = C.getLocalMatrix();
313 
314  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
315  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
316  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
317  scalar_view_t Cvals = Cmat.values;
318 
319  c_lno_view_t Irowptr;
320  lno_nnz_view_t Icolind;
321  scalar_view_t Ivals;
322  if(!Bview.importMatrix.is_null()) {
323  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
324  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
325  Ivals = Bview.importMatrix->getLocalMatrix().values;
326  }
327 
328 #ifdef HAVE_TPETRA_MMM_TIMINGS
329  MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
330 #endif
331 
332  // Classic csr assembly (low memory edition)
333  // mfh 27 Sep 2016: The c_status array is an implementation detail
334  // of the local sparse matrix-matrix multiply routine.
335 
336  // The status array will contain the index into colind where this entry was last deposited.
337  // c_status[i] < CSR_ip - not in the row yet
338  // c_status[i] >= CSR_ip - this is the entry where you can find the data
339  // We start with this filled with INVALID's indicating that there are no entries yet.
340  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
341  std::vector<size_t> c_status(n, ST_INVALID);
342 
343  // For each row of A/C
344  size_t CSR_ip = 0, OLD_ip = 0;
345  for (size_t i = 0; i < m; i++) {
346  // First fill the c_status array w/ locations where we're allowed to
347  // generate nonzeros for this row
348  OLD_ip = Crowptr[i];
349  CSR_ip = Crowptr[i+1];
350  for (size_t k = OLD_ip; k < CSR_ip; k++) {
351  c_status[Ccolind[k]] = k;
352 
353  // Reset values in the row of C
354  Cvals[k] = SC_ZERO;
355  }
356 
357  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
358  LO Aik = Acolind[k];
359  const SC Aval = Avals[k];
360  if (Aval == SC_ZERO)
361  continue;
362 
363  if (targetMapToOrigRow[Aik] != LO_INVALID) {
364  // Local matrix
365  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
366 
367  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
368  LO Bkj = Bcolind[j];
369  LO Cij = Bcol2Ccol[Bkj];
370 
371  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
372  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
373  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
374 
375  Cvals[c_status[Cij]] += Aval * Bvals[j];
376  }
377 
378  } else {
379  // Remote matrix
380  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
381  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
382  LO Ikj = Icolind[j];
383  LO Cij = Icol2Ccol[Ikj];
384 
385  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
386  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
387  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
388 
389  Cvals[c_status[Cij]] += Aval * Ivals[j];
390  }
391  }
392  }
393  }
394 
395  C.fillComplete(C.getDomainMap(), C.getRangeMap());
396 }
397 
398 /*********************************************************************************************************/
399 template<class Scalar,
400  class LocalOrdinal,
401  class GlobalOrdinal,
402  class LocalOrdinalViewType>
403 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
404  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
405  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
406  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
407  const LocalOrdinalViewType & Acol2Brow,
408  const LocalOrdinalViewType & Acol2Irow,
409  const LocalOrdinalViewType & Bcol2Ccol,
410  const LocalOrdinalViewType & Icol2Ccol,
411  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
412  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
413  const std::string& label,
414  const Teuchos::RCP<Teuchos::ParameterList>& params) {
415 
416 #ifdef HAVE_TPETRA_MMM_TIMINGS
417  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
418  using Teuchos::TimeMonitor;
419  Teuchos::RCP<TimeMonitor> MM;
420 #endif
421 
422  // Node-specific code
423  using Teuchos::RCP;
424 
425  // Options
426  //int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer // unreferenced
427  std::string myalg("MSAK");
428  if(!params.is_null()) {
429  if(params->isParameter("cuda: jacobi algorithm"))
430  myalg = params->get("cuda: jacobi algorithm",myalg);
431  }
432 
433  if(myalg == "MSAK") {
434  ::Tpetra::MatrixMatrix::ExtraKernels::jacobi_A_B_newmatrix_MultiplyScaleAddKernel(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
435  }
436  else if(myalg == "KK") {
437  jacobi_A_B_newmatrix_KokkosKernels(omega,Dinv,Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
438  }
439  else {
440  throw std::runtime_error("Tpetra::MatrixMatrix::Jacobi newmatrix unknown kernel");
441  }
442 
443 #ifdef HAVE_TPETRA_MMM_TIMINGS
444  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
445 #endif
446 
447  // Final Fillcomplete
448  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
449  labelList->set("Timer Label",label);
450  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
451 
452  // NOTE: MSAK already fillCompletes, so we have to check here
453  if(!C.isFillComplete()) {
454  RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
455  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
456  }
457 
458 }
459 
460 
461 
462 /*********************************************************************************************************/
463 template<class Scalar,
464  class LocalOrdinal,
465  class GlobalOrdinal,
466  class LocalOrdinalViewType>
467 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
468  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
469  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
470  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
471  const LocalOrdinalViewType & targetMapToOrigRow,
472  const LocalOrdinalViewType & targetMapToImportRow,
473  const LocalOrdinalViewType & Bcol2Ccol,
474  const LocalOrdinalViewType & Icol2Ccol,
475  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
476  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
477  const std::string& label,
478  const Teuchos::RCP<Teuchos::ParameterList>& params) {
479 
480  // FIXME: Right now, this is a cut-and-paste of the serial kernel
481  typedef Kokkos::Compat::KokkosCudaWrapperNode Node;
482 
483 #ifdef HAVE_TPETRA_MMM_TIMINGS
484  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
485  using Teuchos::TimeMonitor;
486  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore"))));
487  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
488 #endif
489  using Teuchos::RCP;
490  using Teuchos::rcp;
491 
492  // Lots and lots of typedefs
494  typedef typename KCRS::StaticCrsGraphType graph_t;
495  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
496  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
497  typedef typename KCRS::values_type::non_const_type scalar_view_t;
498  typedef typename scalar_view_t::memory_space scalar_memory_space;
499 
500  typedef Scalar SC;
501  typedef LocalOrdinal LO;
502  typedef GlobalOrdinal GO;
503  typedef Node NO;
504  typedef Map<LO,GO,NO> map_type;
505  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
506  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
507  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
508 
509  // Since this is being run on Cuda, we need to fence because the below host code will use UVM
510  typename graph_t::execution_space().fence();
511 
512  // Sizes
513  RCP<const map_type> Ccolmap = C.getColMap();
514  size_t m = Aview.origMatrix->getNodeNumRows();
515  size_t n = Ccolmap->getNodeNumElements();
516 
517  // Grab the Kokkos::SparseCrsMatrices & inner stuff
518  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
519  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
520  const KCRS & Cmat = C.getLocalMatrix();
521 
522  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
523  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
524  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
525  scalar_view_t Cvals = Cmat.values;
526 
527  c_lno_view_t Irowptr;
528  lno_nnz_view_t Icolind;
529  scalar_view_t Ivals;
530  if(!Bview.importMatrix.is_null()) {
531  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
532  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
533  Ivals = Bview.importMatrix->getLocalMatrix().values;
534  }
535 
536  // Jacobi-specific inner stuff
537  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
538 
539 #ifdef HAVE_TPETRA_MMM_TIMINGS
540  MM2 = Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse CudaCore - Compare"))));
541 #endif
542 
543  // The status array will contain the index into colind where this entry was last deposited.
544  // c_status[i] < CSR_ip - not in the row yet
545  // c_status[i] >= CSR_ip - this is the entry where you can find the data
546  // We start with this filled with INVALID's indicating that there are no entries yet.
547  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
548  std::vector<size_t> c_status(n, ST_INVALID);
549 
550  // For each row of A/C
551  size_t CSR_ip = 0, OLD_ip = 0;
552  for (size_t i = 0; i < m; i++) {
553 
554  // First fill the c_status array w/ locations where we're allowed to
555  // generate nonzeros for this row
556  OLD_ip = Crowptr[i];
557  CSR_ip = Crowptr[i+1];
558  for (size_t k = OLD_ip; k < CSR_ip; k++) {
559  c_status[Ccolind[k]] = k;
560 
561  // Reset values in the row of C
562  Cvals[k] = SC_ZERO;
563  }
564 
565  SC minusOmegaDval = -omega*Dvals(i,0);
566 
567  // Entries of B
568  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
569  Scalar Bval = Bvals[j];
570  if (Bval == SC_ZERO)
571  continue;
572  LO Bij = Bcolind[j];
573  LO Cij = Bcol2Ccol[Bij];
574 
575  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
576  std::runtime_error, "Trying to insert a new entry into a static graph");
577 
578  Cvals[c_status[Cij]] = Bvals[j];
579  }
580 
581  // Entries of -omega * Dinv * A * B
582  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
583  LO Aik = Acolind[k];
584  const SC Aval = Avals[k];
585  if (Aval == SC_ZERO)
586  continue;
587 
588  if (targetMapToOrigRow[Aik] != LO_INVALID) {
589  // Local matrix
590  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow[Aik]);
591 
592  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
593  LO Bkj = Bcolind[j];
594  LO Cij = Bcol2Ccol[Bkj];
595 
596  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
597  std::runtime_error, "Trying to insert a new entry into a static graph");
598 
599  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
600  }
601 
602  } else {
603  // Remote matrix
604  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow[Aik]);
605  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
606  LO Ikj = Icolind[j];
607  LO Cij = Icol2Ccol[Ikj];
608 
609  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
610  std::runtime_error, "Trying to insert a new entry into a static graph");
611 
612  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
613  }
614  }
615  }
616  }
617 
618 #ifdef HAVE_TPETRA_MMM_TIMINGS
619  MM2= Teuchos::null;
620  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
621 #endif
622 
623  C.fillComplete(C.getDomainMap(), C.getRangeMap());
624 
625 }
626 
627 /*********************************************************************************************************/
628 template<class Scalar,
629  class LocalOrdinal,
630  class GlobalOrdinal,
631  class LocalOrdinalViewType>
632 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode,LocalOrdinalViewType>::jacobi_A_B_newmatrix_KokkosKernels(Scalar omega,
633  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> & Dinv,
634  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Aview,
635  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& Bview,
636  const LocalOrdinalViewType & Acol2Brow,
637  const LocalOrdinalViewType & Acol2Irow,
638  const LocalOrdinalViewType & Bcol2Ccol,
639  const LocalOrdinalViewType & Icol2Ccol,
640  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosCudaWrapperNode>& C,
641  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > Cimport,
642  const std::string& label,
643  const Teuchos::RCP<Teuchos::ParameterList>& params) {
644 
645 #ifdef HAVE_TPETRA_MMM_TIMINGS
646  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
647  using Teuchos::TimeMonitor;
648  Teuchos::RCP<TimeMonitor> MM;
649 #endif
650 
651  // Check if the diagonal entries exist in debug mode
652  const bool debug = Tpetra::Details::Behavior::debug();
653  if(debug) {
654 
655  auto rowMap = Aview.origMatrix->getRowMap();
656  Tpetra::Vector<Scalar> diags(rowMap);
657  Aview.origMatrix->getLocalDiagCopy(diags);
658  size_t diagLength = rowMap->getNodeNumElements();
659  Teuchos::Array<Scalar> diagonal(diagLength);
660  diags.get1dCopy(diagonal());
661 
662  for(size_t i = 0; i < diagLength; ++i) {
663  TEUCHOS_TEST_FOR_EXCEPTION(diagonal[i] == Teuchos::ScalarTraits<Scalar>::zero(),
664  std::runtime_error,
665  "Matrix A has a zero/missing diagonal: " << diagonal[i] << std::endl <<
666  "KokkosKernels Jacobi-fused SpGEMM requires nonzero diagonal entries in A" << std::endl);
667  }
668  }
669 
670  // Usings
671  using device_t = typename Kokkos::Compat::KokkosCudaWrapperNode::device_type;
673  using graph_t = typename matrix_t::StaticCrsGraphType;
674  using lno_view_t = typename graph_t::row_map_type::non_const_type;
675  using c_lno_view_t = typename graph_t::row_map_type::const_type;
676  using lno_nnz_view_t = typename graph_t::entries_type::non_const_type;
677  using scalar_view_t = typename matrix_t::values_type::non_const_type;
678 
679  // KokkosKernels handle
680  using handle_t = typename KokkosKernels::Experimental::KokkosKernelsHandle<
681  typename lno_view_t::const_value_type,typename lno_nnz_view_t::const_value_type, typename scalar_view_t::const_value_type,
682  typename device_t::execution_space, typename device_t::memory_space,typename device_t::memory_space >;
683 
684  // Get the rowPtr, colInd and vals of importMatrix
685  c_lno_view_t Irowptr;
686  lno_nnz_view_t Icolind;
687  scalar_view_t Ivals;
688  if(!Bview.importMatrix.is_null()) {
689  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
690  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
691  Ivals = Bview.importMatrix->getLocalMatrix().values;
692  }
693 
694  // Merge the B and Bimport matrices
695  const matrix_t Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,Acol2Brow,Acol2Irow,Bcol2Ccol,Icol2Ccol,C.getColMap()->getNodeNumElements());
696 
697  // Get the properties and arrays of input matrices
698  const matrix_t & Amat = Aview.origMatrix->getLocalMatrix();
699  const matrix_t & Bmat = Bview.origMatrix->getLocalMatrix();
700 
701  typename handle_t::nnz_lno_t AnumRows = Amat.numRows();
702  typename handle_t::nnz_lno_t BnumRows = Bmerged.numRows();
703  typename handle_t::nnz_lno_t BnumCols = Bmerged.numCols();
704 
705  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmerged.graph.row_map;
706  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmerged.graph.entries;
707  const scalar_view_t Avals = Amat.values, Bvals = Bmerged.values;
708 
709  // Arrays of the output matrix
710  lno_view_t row_mapC (Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), AnumRows + 1);
711  lno_nnz_view_t entriesC;
712  scalar_view_t valuesC;
713 
714  // Options
715  int team_work_size = 16;
716  std::string myalg("SPGEMM_KK_MEMORY");
717  if(!params.is_null()) {
718  if(params->isParameter("cuda: algorithm"))
719  myalg = params->get("cuda: algorithm",myalg);
720  if(params->isParameter("cuda: team work size"))
721  team_work_size = params->get("cuda: team work size",team_work_size);
722  }
723 
724  // Get the algorithm mode
725  std::string nodename("Cuda");
726  std::string alg = nodename + std::string(" algorithm");
727  if(!params.is_null() && params->isParameter(alg)) myalg = params->get(alg,myalg);
728  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
729 
730 
731  // KokkosKernels call
732  handle_t kh;
733  kh.create_spgemm_handle(alg_enum);
734  kh.set_team_work_size(team_work_size);
735 
736  KokkosSparse::Experimental::spgemm_symbolic(&kh, AnumRows, BnumRows, BnumCols,
737  Arowptr, Acolind, false,
738  Browptr, Bcolind, false,
739  row_mapC);
740 
741  size_t c_nnz_size = kh.get_spgemm_handle()->get_c_nnz();
742  if (c_nnz_size){
743  entriesC = lno_nnz_view_t (Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
744  valuesC = scalar_view_t (Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
745  }
746 
747  KokkosSparse::Experimental::spgemm_jacobi(&kh, AnumRows, BnumRows, BnumCols,
748  Arowptr, Acolind, Avals, false,
749  Browptr, Bcolind, Bvals, false,
750  row_mapC, entriesC, valuesC,
751  omega, Dinv.getLocalViewDevice());
752  kh.destroy_spgemm_handle();
753 
754 #ifdef HAVE_TPETRA_MMM_TIMINGS
755  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaSort"))));
756 #endif
757 
758  // Sort & set values
759  if (params.is_null() || params->get("sort entries",true))
760  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
761  C.setAllValues(row_mapC,entriesC,valuesC);
762 
763 #ifdef HAVE_TPETRA_MMM_TIMINGS
764  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix CudaESFC"))));
765 #endif
766 
767  // Final Fillcomplete
768  Teuchos::RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
769  labelList->set("Timer Label",label);
770  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
771  Teuchos::RCP<const Export<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosCudaWrapperNode> > dummyExport;
772  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
773 }
774 
775  }//MMdetails
776 }//Tpetra
777 
778 #endif//CUDA
779 
780 #endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
static bool debug()
Whether Tpetra is in debug mode.
A distributed dense vector.