Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
12 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
13 #include "Tpetra_RowMatrixTransposer.hpp"
14 
15 namespace Tpetra {
16 
17 namespace MatrixMatrix{
18 
19 namespace ExtraKernels{
20 
21 
22 // Double product version
23 template<class CrsMatrixType>
24 size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
25  // Follows the NZ estimate in ML's ml_matmatmult.c
26  size_t Aest = 100, Best=100;
27  if (A.getLocalNumEntries() > 0)
28  Aest = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
29  if (B.getLocalNumEntries() > 0)
30  Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
31 
32  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
33  nnzperrow *= nnzperrow;
34 
35  return nnzperrow;
36 }
37 
38 
39 // Triple product version
40 template<class CrsMatrixType>
41 size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
42  size_t nnzPerRowA = 100, Pcols = 100;
43  if (A.getLocalNumEntries() > 0)
44  nnzPerRowA = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 9;
45  if (P.getLocalNumEntries() > 0)
46  Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
47  return (size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
48 }
49 
50 #if defined (HAVE_TPETRA_INST_OPENMP)
51 /*********************************************************************************************************/
52 template<class Scalar,
53  class LocalOrdinal,
54  class GlobalOrdinal,
55  class LocalOrdinalViewType>
56 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
57  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
58  const LocalOrdinalViewType & targetMapToOrigRow,
59  const LocalOrdinalViewType & targetMapToImportRow,
60  const LocalOrdinalViewType & Bcol2Ccol,
61  const LocalOrdinalViewType & Icol2Ccol,
62  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
63  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
64  const std::string& label,
65  const Teuchos::RCP<Teuchos::ParameterList>& params) {
66 #ifdef HAVE_TPETRA_MMM_TIMINGS
67  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
68  using Teuchos::TimeMonitor;
69  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix LTGCore"))));
70 #endif
71  using Teuchos::Array;
72  using Teuchos::ArrayRCP;
73  using Teuchos::ArrayView;
74  using Teuchos::RCP;
75  using Teuchos::rcp;
76 
77 
78  // Lots and lots of typedefs
80  typedef typename KCRS::device_type device_t;
81  typedef typename KCRS::StaticCrsGraphType graph_t;
82  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
83  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
84  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
85  typedef typename KCRS::values_type::non_const_type scalar_view_t;
86 
87  // Unmanaged versions of the above
88  //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
89  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
90  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
91 
92  typedef Scalar SC;
93  typedef LocalOrdinal LO;
94  typedef GlobalOrdinal GO;
95  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
96  typedef Map<LO,GO,NO> map_type;
97 
98  // NOTE (mfh 15 Sep 2017) This is specifically only for
99  // execution_space = Kokkos::OpenMP, so we neither need nor want
100  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
101  typedef NO::execution_space execution_space;
102  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
103 
104  // All of the invalid guys
105  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
106  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
107  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
108 
109  // Grab the Kokkos::SparseCrsMatrices & inner stuff
110  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
111  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
112 
113  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
114  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
115  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
116  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
117 
118  c_lno_view_t Irowptr;
119  lno_nnz_view_t Icolind;
120  scalar_view_t Ivals;
121  if(!Bview.importMatrix.is_null()) {
122  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
123  Irowptr = lclB.graph.row_map;
124  Icolind = lclB.graph.entries;
125  Ivals = lclB.values;
126  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
127  }
128 
129  // Sizes
130  RCP<const map_type> Ccolmap = C.getColMap();
131  size_t m = Aview.origMatrix->getLocalNumRows();
132  size_t n = Ccolmap->getLocalNumElements();
133  size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
134 
135  // Get my node / thread info (right from openmp or parameter list)
136  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
137  if(!params.is_null()) {
138  if(params->isParameter("openmp: ltg thread max"))
139  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
140  }
141 
142  // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
143  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
144  // we will not touch these until the final copyout step
145  lno_nnz_view_t entriesC;
146  scalar_view_t valuesC;
147 
148  // add this, since we do the rowptr in place, we could figure this out
149  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
150  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
151 
152  // Thread-local memory
153  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind",thread_max);
154  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values",thread_max);
155 
156  double thread_chunk = (double)(m) / thread_max;
157 
158  // Run chunks of the matrix independently
159  Kokkos::parallel_for("MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
160  {
161  // Thread coordination stuff
162  size_t my_thread_start = tid * thread_chunk;
163  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
164  size_t my_thread_m = my_thread_stop - my_thread_start;
165 
166  // Size estimate
167  size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
168 
169  // Allocations
170  std::vector<size_t> c_status(n,INVALID);
171 
172  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
173  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
174 
175  // For each row of A/C
176  size_t CSR_ip = 0, OLD_ip = 0;
177  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
178  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
179  // on the calling process.
180  // JJE 10 Apr 2019 index directly into the rowptr
181  row_mapC(i) = CSR_ip;
182 
183  // mfh 27 Sep 2016: For each entry of A in the current row of A
184  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
185  LO Aik = Acolind(k); // local column index of current entry of A
186  const SC Aval = Avals(k); // value of current entry of A
187  if (Aval == SC_ZERO)
188  continue; // skip explicitly stored zero values in A
189 
190  if (targetMapToOrigRow(Aik) != LO_INVALID) {
191  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
192  // corresponding to the current entry of A is populated, then
193  // the corresponding row of B is in B_local (i.e., it lives on
194  // the calling process).
195 
196  // Local matrix
197  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
198 
199  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
200  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
201  LO Bkj = Bcolind(j);
202  LO Cij = Bcol2Ccol(Bkj);
203 
204  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
205  // New entry
206  c_status[Cij] = CSR_ip;
207  Ccolind(CSR_ip) = Cij;
208  Cvals(CSR_ip) = Aval*Bvals(j);
209  CSR_ip++;
210 
211  } else {
212  Cvals(c_status[Cij]) += Aval*Bvals(j);
213  }
214  }
215 
216  } else {
217  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
218  // corresponding to the current entry of A NOT populated (has
219  // a flag "invalid" value), then the corresponding row of B is
220  // in B_local (i.e., it lives on the calling process).
221 
222  // Remote matrix
223  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
224  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
225  LO Ikj = Icolind(j);
226  LO Cij = Icol2Ccol(Ikj);
227 
228  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
229  // New entry
230  c_status[Cij] = CSR_ip;
231  Ccolind(CSR_ip) = Cij;
232  Cvals(CSR_ip) = Aval*Ivals(j);
233  CSR_ip++;
234 
235  } else {
236  Cvals(c_status[Cij]) += Aval*Ivals(j);
237  }
238  }
239  }
240  }
241 
242  // Resize for next pass if needed
243  if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
244  CSR_alloc *= 2;
245  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
246  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*) Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
247  }
248  OLD_ip = CSR_ip;
249  }
250  thread_total_nnz(tid) = CSR_ip;
251  tl_colind(tid) = Ccolind;
252  tl_values(tid) = Cvals;
253  });
254 
255  // Do the copy out
256  copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
257 
258 #ifdef HAVE_TPETRA_MMM_TIMINGS
259  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort"))));
260 #endif
261  // Sort & set values
262  if (params.is_null() || params->get("sort entries",true))
263  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
264  C.setAllValues(row_mapC,entriesC,valuesC);
265 
266 }
267 
268 /*********************************************************************************************************/
269 template<class Scalar,
270  class LocalOrdinal,
271  class GlobalOrdinal,
272  class LocalOrdinalViewType>
273 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
274  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
275  const LocalOrdinalViewType & targetMapToOrigRow,
276  const LocalOrdinalViewType & targetMapToImportRow,
277  const LocalOrdinalViewType & Bcol2Ccol,
278  const LocalOrdinalViewType & Icol2Ccol,
279  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
280  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
281  const std::string& label,
282  const Teuchos::RCP<Teuchos::ParameterList>& params) {
283 #ifdef HAVE_TPETRA_MMM_TIMINGS
284  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
285  using Teuchos::TimeMonitor;
286  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse LTGCore"))));
287 #endif
288 
289  using Teuchos::Array;
290  using Teuchos::ArrayRCP;
291  using Teuchos::ArrayView;
292  using Teuchos::RCP;
293  using Teuchos::rcp;
294 
295  // Lots and lots of typedefs
297  // typedef typename KCRS::device_type device_t;
298  typedef typename KCRS::StaticCrsGraphType graph_t;
299  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
300  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
301  typedef typename KCRS::values_type::non_const_type scalar_view_t;
302 
303  typedef Scalar SC;
304  typedef LocalOrdinal LO;
305  typedef GlobalOrdinal GO;
306  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
307  typedef Map<LO,GO,NO> map_type;
308 
309  // NOTE (mfh 15 Sep 2017) This is specifically only for
310  // execution_space = Kokkos::OpenMP, so we neither need nor want
311  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
312  typedef NO::execution_space execution_space;
313  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
314 
315  // All of the invalid guys
316  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
317  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
318  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
319 
320  // Grab the Kokkos::SparseCrsMatrices & inner stuff
321  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
322  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
323  const KCRS & Cmat = C.getLocalMatrixDevice();
324 
325  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
326  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
327  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
328  scalar_view_t Cvals = Cmat.values;
329 
330  c_lno_view_t Irowptr;
331  c_lno_nnz_view_t Icolind;
332  scalar_view_t Ivals;
333  if(!Bview.importMatrix.is_null()) {
334  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
335  Irowptr = lclB.graph.row_map;
336  Icolind = lclB.graph.entries;
337  Ivals = lclB.values;
338  }
339 
340  // Sizes
341  RCP<const map_type> Ccolmap = C.getColMap();
342  size_t m = Aview.origMatrix->getLocalNumRows();
343  size_t n = Ccolmap->getLocalNumElements();
344 
345  // Get my node / thread info (right from openmp or parameter list)
346  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
347  if(!params.is_null()) {
348  if(params->isParameter("openmp: ltg thread max"))
349  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
350  }
351 
352  double thread_chunk = (double)(m) / thread_max;
353 
354  // Run chunks of the matrix independently
355  Kokkos::parallel_for("MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
356  {
357  // Thread coordination stuff
358  size_t my_thread_start = tid * thread_chunk;
359  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
360 
361  // Allocations
362  std::vector<size_t> c_status(n,INVALID);
363 
364  // For each row of A/C
365  size_t CSR_ip = 0, OLD_ip = 0;
366  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
367  // First fill the c_status array w/ locations where we're allowed to
368  // generate nonzeros for this row
369  OLD_ip = Crowptr(i);
370  CSR_ip = Crowptr(i+1);
371  for (size_t k = OLD_ip; k < CSR_ip; k++) {
372  c_status[Ccolind(k)] = k;
373  // Reset values in the row of C
374  Cvals(k) = SC_ZERO;
375  }
376 
377  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
378  LO Aik = Acolind(k);
379  const SC Aval = Avals(k);
380  if (Aval == SC_ZERO)
381  continue;
382 
383  if (targetMapToOrigRow(Aik) != LO_INVALID) {
384  // Local matrix
385  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
386 
387  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
388  LO Bkj = Bcolind(j);
389  LO Cij = Bcol2Ccol(Bkj);
390 
391  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
392  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
393  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
394 
395  Cvals(c_status[Cij]) += Aval * Bvals(j);
396  }
397  } else {
398  // Remote matrix
399  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
400  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
401  LO Ikj = Icolind(j);
402  LO Cij = Icol2Ccol(Ikj);
403 
404  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
405  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
406  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
407 
408  Cvals(c_status[Cij]) += Aval * Ivals(j);
409  }
410  }
411  }
412  }
413  });
414 
415  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
416 }
417 
418 /*********************************************************************************************************/
419 template<class Scalar,
420  class LocalOrdinal,
421  class GlobalOrdinal,
422  class LocalOrdinalViewType>
423 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
424  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> & Dinv,
425  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
426  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
427  const LocalOrdinalViewType & targetMapToOrigRow,
428  const LocalOrdinalViewType & targetMapToImportRow,
429  const LocalOrdinalViewType & Bcol2Ccol,
430  const LocalOrdinalViewType & Icol2Ccol,
431  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
432  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
433  const std::string& label,
434  const Teuchos::RCP<Teuchos::ParameterList>& params) {
435 #ifdef HAVE_TPETRA_MMM_TIMINGS
436  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
437  using Teuchos::TimeMonitor;
438  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix LTGCore"))));
439 #endif
440 
441  using Teuchos::Array;
442  using Teuchos::ArrayRCP;
443  using Teuchos::ArrayView;
444  using Teuchos::RCP;
445  using Teuchos::rcp;
446 
447  // Lots and lots of typedefs
448  typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
450  typedef typename KCRS::device_type device_t;
451  typedef typename KCRS::StaticCrsGraphType graph_t;
452  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
453  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
454  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
455  typedef typename KCRS::values_type::non_const_type scalar_view_t;
456 
457  // Unmanaged versions of the above
458  //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
459  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
460  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
461 
462  // Jacobi-specific
463  typedef typename scalar_view_t::memory_space scalar_memory_space;
464 
465  typedef Scalar SC;
466  typedef LocalOrdinal LO;
467  typedef GlobalOrdinal GO;
468  typedef Node NO;
469  typedef Map<LO,GO,NO> map_type;
470 
471  // NOTE (mfh 15 Sep 2017) This is specifically only for
472  // execution_space = Kokkos::OpenMP, so we neither need nor want
473  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
474  typedef NO::execution_space execution_space;
475  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
476 
477  // All of the invalid guys
478  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
479  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
480  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
481 
482  // Grab the Kokkos::SparseCrsMatrices & inner stuff
483  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
484  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
485 
486  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
487  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
488  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
489  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
490 
491  c_lno_view_t Irowptr;
492  lno_nnz_view_t Icolind;
493  scalar_view_t Ivals;
494  if(!Bview.importMatrix.is_null()) {
495  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
496  Irowptr = lclB.graph.row_map;
497  Icolind = lclB.graph.entries;
498  Ivals = lclB.values;
499  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
500  }
501 
502  // Jacobi-specific inner stuff
503  auto Dvals =
504  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
505 
506  // Sizes
507  RCP<const map_type> Ccolmap = C.getColMap();
508  size_t m = Aview.origMatrix->getLocalNumRows();
509  size_t n = Ccolmap->getLocalNumElements();
510  size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
511 
512  // Get my node / thread info (right from openmp)
513  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
514  if(!params.is_null()) {
515  if(params->isParameter("openmp: ltg thread max"))
516  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
517  }
518 
519  // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
520  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
521  // we will not touch these until the final copyout step
522  lno_nnz_view_t entriesC;
523  scalar_view_t valuesC;
524 
525  // add this, since we do the rowptr in place, we could figure this out
526  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
527  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
528 
529  // Thread-local memory
530  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind",thread_max);
531  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values",thread_max);
532 
533  double thread_chunk = (double)(m) / thread_max;
534 
535  // Run chunks of the matrix independently
536  Kokkos::parallel_for("Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
537  {
538  // Thread coordination stuff
539  size_t my_thread_start = tid * thread_chunk;
540  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
541  size_t my_thread_m = my_thread_stop - my_thread_start;
542 
543  // Size estimate
544  size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
545 
546  // Allocations
547  std::vector<size_t> c_status(n,INVALID);
548 
549  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
550  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
551 
552  // For each row of A/C
553  size_t CSR_ip = 0, OLD_ip = 0;
554  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
555  // printf("CMS: row %d CSR_alloc = %d\n",(int)i,(int)CSR_alloc);fflush(stdout);
556  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
557  // on the calling process.
558  // JJE: directly access the rowptr here (indexed using our loop var)
559  row_mapC(i) = CSR_ip;
560  // NOTE: Vector::getLocalView returns a rank 2 view here
561  SC minusOmegaDval = -omega*Dvals(i,0);
562 
563  // Entries of B
564  for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
565  const SC Bval = Bvals(j);
566  if (Bval == SC_ZERO)
567  continue;
568  LO Bij = Bcolind(j);
569  LO Cij = Bcol2Ccol(Bij);
570 
571  // Assume no repeated entries in B
572  c_status[Cij] = CSR_ip;
573  Ccolind(CSR_ip) = Cij;
574  Cvals(CSR_ip) = Bvals[j];
575  CSR_ip++;
576  }
577 
578  // Entries of -omega * Dinv * A * B
579  // mfh 27 Sep 2016: For each entry of A in the current row of A
580  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
581  LO Aik = Acolind(k); // local column index of current entry of A
582  const SC Aval = Avals(k); // value of current entry of A
583  if (Aval == SC_ZERO)
584  continue; // skip explicitly stored zero values in A
585 
586  if (targetMapToOrigRow(Aik) != LO_INVALID) {
587  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
588  // corresponding to the current entry of A is populated, then
589  // the corresponding row of B is in B_local (i.e., it lives on
590  // the calling process).
591 
592  // Local matrix
593  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
594 
595  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
596  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
597  LO Bkj = Bcolind(j);
598  LO Cij = Bcol2Ccol(Bkj);
599 
600  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
601  // New entry
602  c_status[Cij] = CSR_ip;
603  Ccolind(CSR_ip) = Cij;
604  Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
605  CSR_ip++;
606 
607  } else {
608  Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
609  }
610  }
611 
612  } else {
613  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
614  // corresponding to the current entry of A NOT populated (has
615  // a flag "invalid" value), then the corresponding row of B is
616  // in B_local (i.e., it lives on the calling process).
617 
618  // Remote matrix
619  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
620  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
621  LO Ikj = Icolind(j);
622  LO Cij = Icol2Ccol(Ikj);
623 
624  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
625  // New entry
626  c_status[Cij] = CSR_ip;
627  Ccolind(CSR_ip) = Cij;
628  Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
629  CSR_ip++;
630 
631  } else {
632  Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
633  }
634  }
635  }
636  }
637 
638  // Resize for next pass if needed
639  if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
640  CSR_alloc *= 2;
641  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
642  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*) Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
643  }
644  OLD_ip = CSR_ip;
645  }
646  thread_total_nnz(tid) = CSR_ip;
647  tl_colind(tid) = Ccolind;
648  tl_values(tid) = Cvals;
649  });
650 
651 
652  // Do the copy out (removed the tl_rowptr!)
653  copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
654 
655 
656 #ifdef HAVE_TPETRA_MMM_TIMINGS
657  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix OpenMPSort"))));
658 #endif
659  // Sort & set values
660  if (params.is_null() || params->get("sort entries",true))
661  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
662  C.setAllValues(row_mapC,entriesC,valuesC);
663 
664 }
665 
666 
667 
668 /*********************************************************************************************************/
669 template<class Scalar,
670  class LocalOrdinal,
671  class GlobalOrdinal,
672  class LocalOrdinalViewType>
673 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
674  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> & Dinv,
675  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
676  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
677  const LocalOrdinalViewType & targetMapToOrigRow,
678  const LocalOrdinalViewType & targetMapToImportRow,
679  const LocalOrdinalViewType & Bcol2Ccol,
680  const LocalOrdinalViewType & Icol2Ccol,
681  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
682  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
683  const std::string& label,
684  const Teuchos::RCP<Teuchos::ParameterList>& params) {
685 #ifdef HAVE_TPETRA_MMM_TIMINGS
686  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
687  using Teuchos::TimeMonitor;
688  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse LTGCore"))));
689 #endif
690  using Teuchos::Array;
691  using Teuchos::ArrayRCP;
692  using Teuchos::ArrayView;
693  using Teuchos::RCP;
694  using Teuchos::rcp;
695 
696  // Lots and lots of typedefs
697  typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
699  // typedef typename KCRS::device_type device_t;
700  typedef typename KCRS::StaticCrsGraphType graph_t;
701  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
702  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
703  typedef typename KCRS::values_type::non_const_type scalar_view_t;
704 
705  // Jacobi-specific
706  typedef typename scalar_view_t::memory_space scalar_memory_space;
707 
708  typedef Scalar SC;
709  typedef LocalOrdinal LO;
710  typedef GlobalOrdinal GO;
711  typedef Node NO;
712  typedef Map<LO,GO,NO> map_type;
713 
714  // NOTE (mfh 15 Sep 2017) This is specifically only for
715  // execution_space = Kokkos::OpenMP, so we neither need nor want
716  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
717  typedef NO::execution_space execution_space;
718  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
719 
720  // All of the invalid guys
721  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
722  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
723  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
724 
725  // Grab the Kokkos::SparseCrsMatrices & inner stuff
726  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
727  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
728  const KCRS & Cmat = C.getLocalMatrixDevice();
729 
730  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
731  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
732  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
733  scalar_view_t Cvals = Cmat.values;
734 
735  c_lno_view_t Irowptr;
736  c_lno_nnz_view_t Icolind;
737  scalar_view_t Ivals;
738  if(!Bview.importMatrix.is_null()) {
739  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
740  Irowptr = lclB.graph.row_map;
741  Icolind = lclB.graph.entries;
742  Ivals = lclB.values;
743  }
744 
745  // Jacobi-specific inner stuff
746  auto Dvals =
747  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
748 
749  // Sizes
750  RCP<const map_type> Ccolmap = C.getColMap();
751  size_t m = Aview.origMatrix->getLocalNumRows();
752  size_t n = Ccolmap->getLocalNumElements();
753 
754  // Get my node / thread info (right from openmp or parameter list)
755  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
756  if(!params.is_null()) {
757  if(params->isParameter("openmp: ltg thread max"))
758  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
759  }
760 
761  double thread_chunk = (double)(m) / thread_max;
762 
763  // Run chunks of the matrix independently
764  Kokkos::parallel_for("Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
765  {
766  // Thread coordination stuff
767  size_t my_thread_start = tid * thread_chunk;
768  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
769 
770  // Allocations
771  std::vector<size_t> c_status(n,INVALID);
772 
773  // For each row of A/C
774  size_t CSR_ip = 0, OLD_ip = 0;
775  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
776  // First fill the c_status array w/ locations where we're allowed to
777  // generate nonzeros for this row
778  OLD_ip = Crowptr(i);
779  CSR_ip = Crowptr(i+1);
780  // NOTE: Vector::getLocalView returns a rank 2 view here
781  SC minusOmegaDval = -omega*Dvals(i,0);
782 
783  for (size_t k = OLD_ip; k < CSR_ip; k++) {
784  c_status[Ccolind(k)] = k;
785  // Reset values in the row of C
786  Cvals(k) = SC_ZERO;
787  }
788 
789  // Entries of B
790  for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
791  const SC Bval = Bvals(j);
792  if (Bval == SC_ZERO)
793  continue;
794  LO Bij = Bcolind(j);
795  LO Cij = Bcol2Ccol(Bij);
796 
797  // Assume no repeated entries in B
798  Cvals(c_status[Cij]) += Bvals(j);
799  CSR_ip++;
800  }
801 
802 
803  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
804  LO Aik = Acolind(k);
805  const SC Aval = Avals(k);
806  if (Aval == SC_ZERO)
807  continue;
808 
809  if (targetMapToOrigRow(Aik) != LO_INVALID) {
810  // Local matrix
811  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
812 
813  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
814  LO Bkj = Bcolind(j);
815  LO Cij = Bcol2Ccol(Bkj);
816 
817  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
818  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
819  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
820 
821  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
822  }
823  } else {
824  // Remote matrix
825  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
826  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
827  LO Ikj = Icolind(j);
828  LO Cij = Icol2Ccol(Ikj);
829 
830  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
831  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
832  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
833 
834  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
835  }
836  }
837  }
838  }
839  });
840 
841  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
842 }
843 
844 
845 /*********************************************************************************************************/
846 template<class InColindArrayType, class InValsArrayType,
847  class OutRowptrType, class OutColindType, class OutValsType>
848 void copy_out_from_thread_memory(const OutColindType& thread_total_nnz,
849  const InColindArrayType& Incolind,
850  const InValsArrayType& Invalues,
851  const size_t m,
852  const double thread_chunk,
853  OutRowptrType& row_mapC,
854  OutColindType& entriesC,
855  OutValsType& valuesC ) {
856  typedef OutRowptrType lno_view_t;
857  typedef OutColindType lno_nnz_view_t;
858  typedef OutValsType scalar_view_t;
859  typedef typename lno_view_t::execution_space execution_space;
860  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
861 
862  // Generate the starting nnz number per thread
863  size_t thread_max = Incolind.size();
864  size_t c_nnz_size=0;
865  // since this kernel is 'low thread count' is very likely, that we could
866  // sum over the thread_total_nnz and not go parallel, but since it is a view
867  // we don't know where the memory actually lives... so we kinda need to go parallel
868 
869  lno_view_t thread_start_nnz("thread_nnz",thread_max+1);
870  Kokkos::parallel_scan("LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (const size_t i, size_t& update, const bool final) {
871  size_t mynnz = thread_total_nnz(i);
872  if(final) thread_start_nnz(i) = update;
873  update+=mynnz;
874  if(final && i+1==thread_max) thread_start_nnz(i+1)=update;
875  });
876  c_nnz_size = thread_start_nnz(thread_max);
877 
878  // 2019 Apr 10 JJE: update the rowptr's final entry here
879  row_mapC(m) = thread_start_nnz(thread_max);
880 
881  // Allocate output
882  lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size); entriesC = entriesC_;
883  scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size); valuesC = valuesC_;
884 
885  // Copy out
886  Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid) {
887  const size_t my_thread_start = tid * thread_chunk;
888  const size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
889  const size_t nnz_thread_start = thread_start_nnz(tid);
890  // we need this info, since we did the rowptr in place
891  const size_t nnz_thread_stop = thread_start_nnz(tid+1);
892 
893  // There are two fundamental operations:
894  // * Updateing the rowptr with the correct offset
895  // * copying entries and values
896 
897  // First, update the rowptr, it since we did it inplace, it is a += operation
898  // in the paper, I experimented with a coloring scheme that had threads do
899  // do their copies in different orders. It wasn't obvious it was beneficial
900  // but it can be replicated, by choosing the array to copy first based on your
901  // thread id % 3
902  if (my_thread_start != 0 ) {
903  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
904  row_mapC(i) += nnz_thread_start;
905  }
906  }
907 
908  // try to Kokkos::single() the alloc here. It should implicitly barrier
909  // thread 0 doesn't copy the rowptr, so it could hit the single first
910  // in the paper, I played a game that effectively got LTG down to a single
911  // OpenMP barrier. But doing that requires the ability to spawn a single
912  // parallel region. The Scan above was implemented using atomic adds
913  // and the barrier was only needed so you could allocate
914  //
915  // Since we can't spawn a single region, we could move the allocations
916  // here, and using 'single'. Most likely, thread 0 will hit it first allowing
917  // the other threads to update the rowptr while it allocates.
918 
919 
920  // next, bulk copy the vals/colind arrays
921  const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
922  for (size_t i = 0; i < my_num_nnz; ++i) {
923  entriesC(nnz_thread_start + i) = Incolind(tid)(i);
924  valuesC(nnz_thread_start + i) = Invalues(tid)(i);
925  }
926 
927  //Free the unamanged views, let each thread deallocate its memory
928  // May need to cast away const here..
929  if(Incolind(tid).data()) free(Incolind(tid).data());
930  if(Invalues(tid).data()) free(Invalues(tid).data());
931  });
932 }//end copy_out
933 
934 #endif // OpenMP
935 
936 
937 /*********************************************************************************************************/
938 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
939 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
940  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
941  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
942  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
943  const LocalOrdinalViewType & Acol2Brow,
944  const LocalOrdinalViewType & Acol2Irow,
945  const LocalOrdinalViewType & Bcol2Ccol,
946  const LocalOrdinalViewType & Icol2Ccol,
947  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
948  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
949  const std::string& label,
950  const Teuchos::RCP<Teuchos::ParameterList>& params) {
951 #ifdef HAVE_TPETRA_MMM_TIMINGS
952  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
953  using Teuchos::TimeMonitor;
954  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK"))));
955  Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Multiply"))));
956  using Teuchos::rcp;
957 #endif
958  typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
959 
960  // This kernel computes (I-omega Dinv A) B the slow way (for generality's sake, you must understand)
961 
962  // 1) Multiply A*B
963  Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(new Matrix_t(C.getRowMap(),0));
964  Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(" MSAK"),params);
965 
966 #ifdef HAVE_TPETRA_MMM_TIMINGS
967  MM2=Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Scale"))));
968 #endif
969 
970  // 2) Scale A by Dinv
971  AB->leftScale(Dinv);
972 
973 #ifdef HAVE_TPETRA_MMM_TIMINGS
974  MM2=Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Add"))));
975 #endif
976 
977  // 3) Add [-omega Dinv A] + B
978  Teuchos::ParameterList jparams;
979  if(!params.is_null()) {
980  jparams = *params;
981  jparams.set("label",label+std::string(" MSAK Add"));
982  }
983  Scalar one = Teuchos::ScalarTraits<Scalar>::one();
984  Tpetra::MatrixMatrix::add(one,false,*Bview.origMatrix,Scalar(-omega),false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,false));
985 #ifdef HAVE_TPETRA_MMM_TIMINGS
986  MM2=Teuchos::null;
987 #endif
988  }// jacobi_A_B_newmatrix_MultiplyScaleAddKernel
989 
990 
991 
992 #if defined (HAVE_TPETRA_INST_OPENMP)
993 /*********************************************************************************************************/
994 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
995 static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
996  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
997  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
998  const LocalOrdinalViewType & Acol2PRow,
999  const LocalOrdinalViewType & Acol2PRowImport,
1000  const LocalOrdinalViewType & Pcol2Accol,
1001  const LocalOrdinalViewType & PIcol2Accol,
1002  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1003  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1004  const std::string& label,
1005  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1006 
1007  using Teuchos::RCP;
1008  using Tpetra::MatrixMatrix::UnmanagedView;
1009  #ifdef HAVE_TPETRA_MMM_TIMINGS
1010  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1011  using Teuchos::rcp;
1012  using Teuchos::TimeMonitor;
1013  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix LTGCore"))));
1014  #endif
1015 
1016  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1017  typedef Scalar SC;
1018  typedef LocalOrdinal LO;
1019  typedef GlobalOrdinal GO;
1020  typedef Node NO;
1021  typedef Map<LO,GO,NO> map_type;
1023  typedef typename KCRS::StaticCrsGraphType graph_t;
1024  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1025  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1026  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1027  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1028  typedef typename KCRS::device_type device_t;
1029  typedef typename device_t::execution_space execution_space;
1030  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1031 
1032  // Unmanaged versions of the above
1033  typedef UnmanagedView<lno_view_t> u_lno_view_t;
1034  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1035  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1036 
1037  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1038  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1039 
1040  // Sizes
1041  RCP<const map_type> Accolmap = Ac.getColMap();
1042  size_t m = Rview.origMatrix->getLocalNumRows();
1043  size_t n = Accolmap->getLocalNumElements();
1044 
1045  // Get raw Kokkos matrices, and the raw CSR views
1046  const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1047  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1048  const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1049 
1050  c_lno_view_t Rrowptr = Rmat.graph.row_map,
1051  Arowptr = Amat.graph.row_map,
1052  Prowptr = Pmat.graph.row_map, Irowptr;
1053  const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1054  Acolind = Amat.graph.entries,
1055  Pcolind = Pmat.graph.entries;
1056  lno_nnz_view_t Icolind;
1057  const scalar_view_t Rvals = Rmat.values,
1058  Avals = Amat.values,
1059  Pvals = Pmat.values;
1060  scalar_view_t Ivals;
1061 
1062  if (!Pview.importMatrix.is_null())
1063  {
1064  const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1065  Irowptr = Imat.graph.row_map;
1066  Icolind = Imat.graph.entries;
1067  Ivals = Imat.values;
1068  }
1069 
1070  // Classic csr assembly (low memory edition)
1071  //
1072  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
1073  // The method loops over rows of R, and may resize after processing
1074  // each row. Chris Siefert says that this reflects experience in
1075  // ML; for the non-threaded case, ML found it faster to spend less
1076  // effort on estimation and risk an occasional reallocation.
1077 
1078  size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1079  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1080 
1081  // Get my node / thread info (right from openmp or parameter list)
1082  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1083  if(!params.is_null()) {
1084  if(params->isParameter("openmp: ltg thread max"))
1085  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1086  }
1087 
1088  double thread_chunk = (double)(m) / thread_max;
1089 
1090  // we can construct the rowptr inplace, allocate here and fault in parallel
1091  lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
1092  // we do not touch these until copy out
1093  lno_nnz_view_t entriesAc;
1094  scalar_view_t valuesAc;
1095 
1096  // add this, since we do the rowptr in place, we could figure this out
1097  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
1098  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
1099 
1100  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1101  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1102  //
1103  // For column index Aik in row i of A, Acol2PRow[Aik] tells
1104  // you whether the corresponding row of P belongs to P_local
1105  // ("orig") or P_remote ("Import").
1106 
1107  // Thread-local memory
1108  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
1109  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
1110 
1111  // For each row of R
1112  Kokkos::parallel_for("MMM::RAP::NewMatrix::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1113  {
1114  // Thread coordination stuff
1115  size_t my_thread_start = tid * thread_chunk;
1116  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1117  size_t my_thread_m = my_thread_stop - my_thread_start;
1118 
1119  size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
1120 
1121  std::vector<size_t> ac_status(n, INVALID);
1122 
1123  //manually allocate the thread-local storage for Ac
1124  u_lno_view_t Acrowptr((typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
1125  u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1126  u_scalar_view_t Acvals((typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1127 
1128  //count entries as they are added to Ac
1129  size_t nnz = 0, nnz_old = 0;
1130  // bmk: loop over the rows of R which are assigned to thread tid
1131  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1132  // directly index into the rowptr
1133  rowmapAc(i) = nnz;
1134  // mfh 27 Sep 2016: For each entry of R in the current row of R
1135  for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1136  LO k = Rcolind(kk); // local column index of current entry of R
1137  const SC Rik = Rvals(kk); // value of current entry of R
1138  if (Rik == SC_ZERO)
1139  continue; // skip explicitly stored zero values in R
1140  // For each entry of A in the current row of A
1141  for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1142  LO l = Acolind(ll); // local column index of current entry of A
1143  const SC Akl = Avals(ll); // value of current entry of A
1144  if (Akl == SC_ZERO)
1145  continue; // skip explicitly stored zero values in A
1146 
1147  if (Acol2PRow[l] != LO_INVALID) {
1148  // mfh 27 Sep 2016: If the entry of Acol2PRow
1149  // corresponding to the current entry of A is populated, then
1150  // the corresponding row of P is in P_local (i.e., it lives on
1151  // the calling process).
1152 
1153  // Local matrix
1154  size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1155 
1156  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1157  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1158  LO j = Pcolind(jj);
1159  LO Acj = Pcol2Accol(j);
1160  SC Plj = Pvals(jj);
1161 
1162  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1163  #ifdef HAVE_TPETRA_DEBUG
1164  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1165  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1166  std::runtime_error,
1167  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1168  #endif
1169  // New entry
1170  ac_status[Acj] = nnz;
1171  Accolind(nnz) = Acj;
1172  Acvals(nnz) = Rik*Akl*Plj;
1173  nnz++;
1174  } else {
1175  Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1176  }
1177  }
1178  } else {
1179  // mfh 27 Sep 2016: If the entry of Acol2PRow
1180  // corresponding to the current entry of A is NOT populated (has
1181  // a flag "invalid" value), then the corresponding row of P is
1182  // in P_remote (i.e., it does not live on the calling process).
1183 
1184  // Remote matrix
1185  size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1186  for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1187  LO j = Icolind(jj);
1188  LO Acj = PIcol2Accol(j);
1189  SC Plj = Ivals(jj);
1190 
1191  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1192  #ifdef HAVE_TPETRA_DEBUG
1193  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1194  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1195  std::runtime_error,
1196  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1197  #endif
1198  // New entry
1199  ac_status[Acj] = nnz;
1200  Accolind(nnz) = Acj;
1201  Acvals(nnz) = Rik*Akl*Plj;
1202  nnz++;
1203  } else {
1204  Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1205  }
1206  }
1207  }
1208  }
1209  }
1210  // Resize for next pass if needed
1211  if (nnz + n > nnzAllocated) {
1212  nnzAllocated *= 2;
1213  Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1214  Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type) realloc((void*) Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1215  }
1216  nnz_old = nnz;
1217  }
1218  thread_total_nnz(tid) = nnz;
1219  tl_colind(tid) = Accolind;
1220  tl_values(tid) = Acvals;
1221  });
1222  #ifdef HAVE_TPETRA_MMM_TIMINGS
1223  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix copy from thread local"))));
1224  #endif
1225 
1226  copy_out_from_thread_memory(thread_total_nnz,tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1227 
1228  #ifdef HAVE_TPETRA_MMM_TIMINGS
1229  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1230  #endif
1231 
1232  // Final sort & set of CRS arrays
1233  Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1234  // mfh 27 Sep 2016: This just sets pointers.
1235  Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1236 
1237  #ifdef HAVE_TPETRA_MMM_TIMINGS
1238  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1239  #endif
1240 
1241  // Final FillComplete
1242  //
1243  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1244  // Import (from domain Map to column Map) construction (which costs
1245  // lots of communication) by taking the previously constructed
1246  // Import object. We should be able to do this without interfering
1247  // with the implementation of the local part of sparse matrix-matrix
1248  // multiply above.
1249  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1250  labelList->set("Timer Label",label);
1251  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1252  RCP<const Export<LO,GO,NO> > dummyExport;
1253  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1254  Rview.origMatrix->getRangeMap(),
1255  Acimport,
1256  dummyExport,
1257  labelList);
1258 }
1259 
1260 
1261 
1262 /*********************************************************************************************************/
1263 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1264 static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1265  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1266  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1267  const LocalOrdinalViewType & Acol2PRow,
1268  const LocalOrdinalViewType & Acol2PRowImport,
1269  const LocalOrdinalViewType & Pcol2Accol,
1270  const LocalOrdinalViewType & PIcol2Accol,
1271  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1272  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1273  const std::string& label,
1274  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1275 
1276  using Teuchos::RCP;
1277  using Tpetra::MatrixMatrix::UnmanagedView;
1278  #ifdef HAVE_TPETRA_MMM_TIMINGS
1279  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1280  using Teuchos::TimeMonitor;
1281  using Teuchos::rcp;
1282  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse LTGCore"))));
1283  #endif
1284 
1285  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1286  typedef Scalar SC;
1287  typedef LocalOrdinal LO;
1288  typedef GlobalOrdinal GO;
1289  typedef Node NO;
1290  typedef Map<LO,GO,NO> map_type;
1292  typedef typename KCRS::StaticCrsGraphType graph_t;
1293  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1294  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1295  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1296  typedef typename KCRS::device_type device_t;
1297  typedef typename device_t::execution_space execution_space;
1298  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1299 
1300  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1301  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1302  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1303 
1304  // Sizes
1305  RCP<const map_type> Accolmap = Ac.getColMap();
1306  size_t m = Rview.origMatrix->getLocalNumRows();
1307  size_t n = Accolmap->getLocalNumElements();
1308 
1309  // Get raw Kokkos matrices, and the raw CSR views
1310  const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1311  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1312  const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1313  const KCRS & Cmat = Ac.getLocalMatrixDevice();
1314 
1315  c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1316  const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1317  lno_nnz_view_t Icolind;
1318  const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1319  scalar_view_t Cvals = Cmat.values;
1320  scalar_view_t Ivals;
1321 
1322  if (!Pview.importMatrix.is_null())
1323  {
1324  const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1325  Irowptr = Imat.graph.row_map;
1326  Icolind = Imat.graph.entries;
1327  Ivals = Imat.values;
1328  }
1329 
1330  // Get my node / thread info (right from openmp or parameter list)
1331  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1332  if(!params.is_null()) {
1333  if(params->isParameter("openmp: ltg thread max"))
1334  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1335  }
1336 
1337  double thread_chunk = (double)(m) / thread_max;
1338 
1339  // For each row of R
1340  Kokkos::parallel_for("MMM::RAP::Reuse::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1341  {
1342  // Thread coordination stuff
1343  size_t my_thread_start = tid * thread_chunk;
1344  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1345 
1346  std::vector<size_t> c_status(n, INVALID);
1347 
1348  //count entries as they are added to Ac
1349  size_t OLD_ip = 0, CSR_ip = 0;
1350  // bmk: loop over the rows of R which are assigned to thread tid
1351  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1352  // First fill the c_status array w/ locations where we're allowed to
1353  // generate nonzeros for this row
1354  OLD_ip = Crowptr(i);
1355  CSR_ip = Crowptr(i+1);
1356  for (size_t k = OLD_ip; k < CSR_ip; k++) {
1357  c_status[Ccolind(k)] = k;
1358  // Reset values in the row of C
1359  Cvals(k) = SC_ZERO;
1360  }
1361 
1362  // mfh 27 Sep 2016: For each entry of R in the current row of R
1363  for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1364  LO k = Rcolind(kk); // local column index of current entry of R
1365  const SC Rik = Rvals(kk); // value of current entry of R
1366  if (Rik == SC_ZERO)
1367  continue; // skip explicitly stored zero values in R
1368  // For each entry of A in the current row of A
1369  for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1370  LO l = Acolind(ll); // local column index of current entry of A
1371  const SC Akl = Avals(ll); // value of current entry of A
1372  if (Akl == SC_ZERO)
1373  continue; // skip explicitly stored zero values in A
1374 
1375  if (Acol2PRow[l] != LO_INVALID) {
1376  // mfh 27 Sep 2016: If the entry of Acol2PRow
1377  // corresponding to the current entry of A is populated, then
1378  // the corresponding row of P is in P_local (i.e., it lives on
1379  // the calling process).
1380 
1381  // Local matrix
1382  size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1383 
1384  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1385  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1386  LO j = Pcolind(jj);
1387  LO Cij = Pcol2Accol(j);
1388  SC Plj = Pvals(jj);
1389  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1390  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1391  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1392 
1393 
1394  Cvals(c_status[Cij]) += Rik*Akl*Plj;
1395  }
1396  } else {
1397  // mfh 27 Sep 2016: If the entry of Acol2PRow
1398  // corresponding to the current entry of A is NOT populated (has
1399  // a flag "invalid" value), then the corresponding row of P is
1400  // in P_remote (i.e., it does not live on the calling process).
1401 
1402  // Remote matrix
1403  size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1404  for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1405  LO j = Icolind(jj);
1406  LO Cij = PIcol2Accol(j);
1407  SC Plj = Ivals(jj);
1408  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1409  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1410  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1411 
1412  Cvals(c_status[Cij]) += Rik*Akl*Plj;
1413  }
1414  }
1415  }
1416  }
1417  }
1418  });
1419  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
1420 }
1421 
1422 
1423 
1424 #endif
1425 
1426 
1427 }//ExtraKernels
1428 }//MatrixMatrix
1429 }//Tpetra
1430 
1431 
1432 #endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...