TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
12 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
13 #include "Tpetra_RowMatrixTransposer.hpp"
14 
15 namespace Tpetra {
16 
17 namespace MatrixMatrix {
18 
19 namespace ExtraKernels {
20 
21 // Double product version
22 template <class CrsMatrixType>
23 size_t C_estimate_nnz_per_row(CrsMatrixType& A, CrsMatrixType& B) {
24  // Follows the NZ estimate in ML's ml_matmatmult.c
25  size_t Aest = 100, Best = 100;
26  if (A.getLocalNumEntries() > 0)
27  Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 100;
28  if (B.getLocalNumEntries() > 0)
29  Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries() / B.getLocalNumRows() : 100;
30 
31  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
32  nnzperrow *= nnzperrow;
33 
34  return nnzperrow;
35 }
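// For instance, if A averages 9 nonzeros per row and B averages 16, this
// returns (sqrt(9) + sqrt(16) - 1)^2 = (3 + 4 - 1)^2 = 36 nonzeros per row of C.
// It is only a sizing heuristic; the kernels below double their thread-local
// buffers whenever a row overflows the estimate.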
36 
37 // Triple product version
38 template <class CrsMatrixType>
39 size_t Ac_estimate_nnz(CrsMatrixType& A, CrsMatrixType& P) {
40  size_t nnzPerRowA = 100, Pcols = 100;
41  if (A.getLocalNumEntries() > 0)
42  nnzPerRowA = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 9;
43  if (P.getLocalNumEntries() > 0)
44  Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
45  return (size_t)(Pcols * nnzPerRowA + 5 * nnzPerRowA + 300);
46 }
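// For instance, with nnzPerRowA = 9 and Pcols = 50 this returns
// 50*9 + 5*9 + 300 = 795 nonzeros for the whole local Ac; the RAP kernel below
// divides this by the local row count to get a per-row allocation guess.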
47 
48 #if defined(HAVE_TPETRA_INST_OPENMP)
49 /*********************************************************************************************************/
50 template <class Scalar,
51  class LocalOrdinal,
52  class GlobalOrdinal,
53  class LocalOrdinalViewType>
54 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
55  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
56  const LocalOrdinalViewType& targetMapToOrigRow,
57  const LocalOrdinalViewType& targetMapToImportRow,
58  const LocalOrdinalViewType& Bcol2Ccol,
59  const LocalOrdinalViewType& Icol2Ccol,
60  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
61  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
62  const std::string& label,
63  const Teuchos::RCP<Teuchos::ParameterList>& params) {
64 #ifdef HAVE_TPETRA_MMM_TIMINGS
65  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
66  using Teuchos::TimeMonitor;
67  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix LTGCore"))));
68 #endif
69  using Teuchos::Array;
70  using Teuchos::ArrayRCP;
71  using Teuchos::ArrayView;
72  using Teuchos::RCP;
73  using Teuchos::rcp;
74 
75  // Lots and lots of typedefs
76  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>::local_matrix_device_type KCRS;
77  typedef typename KCRS::device_type device_t;
78  typedef typename KCRS::StaticCrsGraphType graph_t;
79  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
80  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
81  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
82  typedef typename KCRS::values_type::non_const_type scalar_view_t;
83 
84  // Unmanaged versions of the above
85  // typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
86  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
87  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
88 
89  typedef Scalar SC;
90  typedef LocalOrdinal LO;
91  typedef GlobalOrdinal GO;
92  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
93  typedef Map<LO, GO, NO> map_type;
94 
95  // NOTE (mfh 15 Sep 2017) This is specifically only for
96  // execution_space = Kokkos::OpenMP, so we neither need nor want
97  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
98  typedef NO::execution_space execution_space;
99  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
100 
101  // All of the invalid guys
102  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
103  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
104  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
105 
106  // Grab the Kokkos::SparseCrsMatrices & inner stuff
107  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
108  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
109 
110  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
111  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
112  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
113  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
114 
115  c_lno_view_t Irowptr;
116  lno_nnz_view_t Icolind;
117  scalar_view_t Ivals;
118  if (!Bview.importMatrix.is_null()) {
119  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
120  Irowptr = lclB.graph.row_map;
121  Icolind = lclB.graph.entries;
122  Ivals = lclB.values;
123  b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
124  }
125 
126  // Sizes
127  RCP<const map_type> Ccolmap = C.getColMap();
128  size_t m = Aview.origMatrix->getLocalNumRows();
129  size_t n = Ccolmap->getLocalNumElements();
130  size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
131 
132  // Get my node / thread info (right from openmp or parameter list)
133  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
134  if (!params.is_null()) {
135  if (params->isParameter("openmp: ltg thread max"))
136  thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
137  }
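// The "openmp: ltg thread max" parameter can only lower the thread count (it is
// clamped to [1, concurrency()]). For example, a caller could pass a ParameterList
// with params->set("openmp: ltg thread max", (size_t) 4) to restrict this kernel
// to four threads while leaving the rest of the application unconstrained.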
138 
139  // 2019 Apr 10 jje: We can do rowptr in place, and no need to initialize since we can fault as we go
140  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
141  // we will not touch these until the final copyout step
142  lno_nnz_view_t entriesC;
143  scalar_view_t valuesC;
144 
145  // Track each thread's total nnz here. Since we do the rowptr in place, we could
146  // recover this from the rowptr itself, but that requires an unusual loop (that jumps all over memory)
147  lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);
148 
149  // Thread-local memory
150  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
151  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
152 
153  double thread_chunk = (double)(m) / thread_max;
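// Each thread owns a contiguous block of roughly m / thread_max rows. For example,
// with m = 1000 rows and thread_max = 8, thread_chunk = 125, so thread 0 handles
// rows [0,125) and thread 7 handles rows [875,1000); the last thread always runs
// through row m-1 to absorb any rounding remainder.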
154 
155  // Run chunks of the matrix independently
156  Kokkos::parallel_for("MMM::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
157  // Thread coordination stuff
158  size_t my_thread_start = tid * thread_chunk;
159  size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
160  size_t my_thread_m = my_thread_stop - my_thread_start;
161 
162  // Size estimate
163  size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
164 
165  // Allocations
166  std::vector<size_t> c_status(n, INVALID);
167 
168  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
169  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
170 
171  // For each row of A/C
172  size_t CSR_ip = 0, OLD_ip = 0;
173  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
174  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
175  // on the calling process.
176  // JJE 10 Apr 2019 index directly into the rowptr
177  row_mapC(i) = CSR_ip;
178 
179  // mfh 27 Sep 2016: For each entry of A in the current row of A
180  for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
181  LO Aik = Acolind(k); // local column index of current entry of A
182  const SC Aval = Avals(k); // value of current entry of A
183  if (Aval == SC_ZERO)
184  continue; // skip explicitly stored zero values in A
185 
186  if (targetMapToOrigRow(Aik) != LO_INVALID) {
187  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
188  // corresponding to the current entry of A is populated, then
189  // the corresponding row of B is in B_local (i.e., it lives on
190  // the calling process).
191 
192  // Local matrix
193  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
194 
195  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
196  for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
197  LO Bkj = Bcolind(j);
198  LO Cij = Bcol2Ccol(Bkj);
199 
200  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
201  // New entry
202  c_status[Cij] = CSR_ip;
203  Ccolind(CSR_ip) = Cij;
204  Cvals(CSR_ip) = Aval * Bvals(j);
205  CSR_ip++;
206 
207  } else {
208  Cvals(c_status[Cij]) += Aval * Bvals(j);
209  }
210  }
211 
212  } else {
213  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
214  // corresponding to the current entry of A is NOT populated (has
215  // a flag "invalid" value), then the corresponding row of B is
216  // in B_remote (i.e., it does not live on the calling process).
217 
218  // Remote matrix
219  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
220  for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
221  LO Ikj = Icolind(j);
222  LO Cij = Icol2Ccol(Ikj);
223 
224  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
225  // New entry
226  c_status[Cij] = CSR_ip;
227  Ccolind(CSR_ip) = Cij;
228  Cvals(CSR_ip) = Aval * Ivals(j);
229  CSR_ip++;
230 
231  } else {
232  Cvals(c_status[Cij]) += Aval * Ivals(j);
233  }
234  }
235  }
236  }
237 
238  // Resize for next pass if needed
239  if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1)) * b_max_nnz_per_row) > CSR_alloc) {
240  CSR_alloc *= 2;
241  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
242  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
243  }
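// Doubling CSR_alloc keeps the reallocation cost amortized; the check above uses
// the nnz of the *next* row of A times B's max row length (capped at n) as a
// worst-case bound on what the next pass could append.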
244  OLD_ip = CSR_ip;
245  }
246  thread_total_nnz(tid) = CSR_ip;
247  tl_colind(tid) = Ccolind;
248  tl_values(tid) = Cvals;
249  });
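// At this point each row_mapC(i) holds the row's offset relative to the start of
// its own thread's buffer, and the column indices / values still live in the
// per-thread malloc'd arrays. copy_out_from_thread_memory() below prefix-sums
// thread_total_nnz, shifts row_mapC by each thread's starting offset, and
// concatenates the buffers into entriesC / valuesC.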
250 
251  // Do the copy out
252  copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
253 
254 #ifdef HAVE_TPETRA_MMM_TIMINGS
255  MM = Teuchos::null;
256  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort"))));
257 #endif
258  // Sort & set values
259  if (params.is_null() || params->get("sort entries", true))
260  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
261  C.setAllValues(row_mapC, entriesC, valuesC);
262 }
263 
264 /*********************************************************************************************************/
265 template <class Scalar,
266  class LocalOrdinal,
267  class GlobalOrdinal,
268  class LocalOrdinalViewType>
269 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
270  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
271  const LocalOrdinalViewType& targetMapToOrigRow,
272  const LocalOrdinalViewType& targetMapToImportRow,
273  const LocalOrdinalViewType& Bcol2Ccol,
274  const LocalOrdinalViewType& Icol2Ccol,
275  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
276  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
277  const std::string& label,
278  const Teuchos::RCP<Teuchos::ParameterList>& params) {
279 #ifdef HAVE_TPETRA_MMM_TIMINGS
280  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
281  using Teuchos::TimeMonitor;
282  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse LTGCore"))));
283 #endif
284 
285  using Teuchos::Array;
286  using Teuchos::ArrayRCP;
287  using Teuchos::ArrayView;
288  using Teuchos::RCP;
289  using Teuchos::rcp;
290 
291  // Lots and lots of typedefs
292  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>::local_matrix_device_type KCRS;
293  // typedef typename KCRS::device_type device_t;
294  typedef typename KCRS::StaticCrsGraphType graph_t;
295  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
296  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
297  typedef typename KCRS::values_type::non_const_type scalar_view_t;
298 
299  typedef Scalar SC;
300  typedef LocalOrdinal LO;
301  typedef GlobalOrdinal GO;
302  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
303  typedef Map<LO, GO, NO> map_type;
304 
305  // NOTE (mfh 15 Sep 2017) This is specifically only for
306  // execution_space = Kokkos::OpenMP, so we neither need nor want
307  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
308  typedef NO::execution_space execution_space;
309  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
310 
311  // All of the invalid guys
312  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
313  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
314  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
315 
316  // Grab the Kokkos::SparseCrsMatrices & inner stuff
317  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
318  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
319  const KCRS& Cmat = C.getLocalMatrixDevice();
320 
321  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
322  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
323  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
324  scalar_view_t Cvals = Cmat.values;
325 
326  c_lno_view_t Irowptr;
327  c_lno_nnz_view_t Icolind;
328  scalar_view_t Ivals;
329  if (!Bview.importMatrix.is_null()) {
330  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
331  Irowptr = lclB.graph.row_map;
332  Icolind = lclB.graph.entries;
333  Ivals = lclB.values;
334  }
335 
336  // Sizes
337  RCP<const map_type> Ccolmap = C.getColMap();
338  size_t m = Aview.origMatrix->getLocalNumRows();
339  size_t n = Ccolmap->getLocalNumElements();
340 
341  // Get my node / thread info (right from openmp or parameter list)
342  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
343  if (!params.is_null()) {
344  if (params->isParameter("openmp: ltg thread max"))
345  thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
346  }
347 
348  double thread_chunk = (double)(m) / thread_max;
349 
350  // Run chunks of the matrix independently
351  Kokkos::parallel_for("MMM::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
352  // Thread coordination stuff
353  size_t my_thread_start = tid * thread_chunk;
354  size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
355 
356  // Allocations
357  std::vector<size_t> c_status(n, INVALID);
358 
359  // For each row of A/C
360  size_t CSR_ip = 0, OLD_ip = 0;
361  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
362  // First fill the c_status array w/ locations where we're allowed to
363  // generate nonzeros for this row
364  OLD_ip = Crowptr(i);
365  CSR_ip = Crowptr(i + 1);
366  for (size_t k = OLD_ip; k < CSR_ip; k++) {
367  c_status[Ccolind(k)] = k;
368  // Reset values in the row of C
369  Cvals(k) = SC_ZERO;
370  }
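// c_status now maps each admissible column index of row i to its position in
// Cvals, so the loops below can only accumulate into C's existing sparsity
// pattern; any product term that falls outside [OLD_ip, CSR_ip) trips the
// static-graph exception instead of silently growing the graph.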
371 
372  for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
373  LO Aik = Acolind(k);
374  const SC Aval = Avals(k);
375  if (Aval == SC_ZERO)
376  continue;
377 
378  if (targetMapToOrigRow(Aik) != LO_INVALID) {
379  // Local matrix
380  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
381 
382  for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
383  LO Bkj = Bcolind(j);
384  LO Cij = Bcol2Ccol(Bkj);
385 
386  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
387  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
388  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
389 
390  Cvals(c_status[Cij]) += Aval * Bvals(j);
391  }
392  } else {
393  // Remote matrix
394  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
395  for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
396  LO Ikj = Icolind(j);
397  LO Cij = Icol2Ccol(Ikj);
398 
399  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
400  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
401  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
402 
403  Cvals(c_status[Cij]) += Aval * Ivals(j);
404  }
405  }
406  }
407  }
408  });
409 
410  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
411 }
412 
413 /*********************************************************************************************************/
414 template <class Scalar,
415  class LocalOrdinal,
416  class GlobalOrdinal,
417  class LocalOrdinalViewType>
418 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
419  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
420  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
421  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
422  const LocalOrdinalViewType& targetMapToOrigRow,
423  const LocalOrdinalViewType& targetMapToImportRow,
424  const LocalOrdinalViewType& Bcol2Ccol,
425  const LocalOrdinalViewType& Icol2Ccol,
426  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
427  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
428  const std::string& label,
429  const Teuchos::RCP<Teuchos::ParameterList>& params) {
430 #ifdef HAVE_TPETRA_MMM_TIMINGS
431  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
432  using Teuchos::TimeMonitor;
433  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix LTGCore"))));
434 #endif
435 
436  using Teuchos::Array;
437  using Teuchos::ArrayRCP;
438  using Teuchos::ArrayView;
439  using Teuchos::RCP;
440  using Teuchos::rcp;
441 
442  // Lots and lots of typedefs
443  typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
444  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KCRS;
445  typedef typename KCRS::device_type device_t;
446  typedef typename KCRS::StaticCrsGraphType graph_t;
447  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
448  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
449  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
450  typedef typename KCRS::values_type::non_const_type scalar_view_t;
451 
452  // Unmanaged versions of the above
453  // typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
454  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
455  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
456 
457  // Jacobi-specific
458  typedef typename scalar_view_t::memory_space scalar_memory_space;
459 
460  typedef Scalar SC;
461  typedef LocalOrdinal LO;
462  typedef GlobalOrdinal GO;
463  typedef Node NO;
464  typedef Map<LO, GO, NO> map_type;
465 
466  // NOTE (mfh 15 Sep 2017) This is specifically only for
467  // execution_space = Kokkos::OpenMP, so we neither need nor want
468  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
469  typedef NO::execution_space execution_space;
470  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
471 
472  // All of the invalid guys
473  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
474  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
475  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
476 
477  // Grab the Kokkos::SparseCrsMatrices & inner stuff
478  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
479  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
480 
481  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
482  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
483  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
484  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
485 
486  c_lno_view_t Irowptr;
487  lno_nnz_view_t Icolind;
488  scalar_view_t Ivals;
489  if (!Bview.importMatrix.is_null()) {
490  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
491  Irowptr = lclB.graph.row_map;
492  Icolind = lclB.graph.entries;
493  Ivals = lclB.values;
494  b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
495  }
496 
497  // Jacobi-specific inner stuff
498  auto Dvals =
499  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
500 
501  // Sizes
502  RCP<const map_type> Ccolmap = C.getColMap();
503  size_t m = Aview.origMatrix->getLocalNumRows();
504  size_t n = Ccolmap->getLocalNumElements();
505  size_t Cest_nnz_per_row = 2 * C_estimate_nnz_per_row(*Aview.origMatrix, *Bview.origMatrix);
506 
507  // Get my node / thread info (right from openmp)
508  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
509  if (!params.is_null()) {
510  if (params->isParameter("openmp: ltg thread max"))
511  thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
512  }
513 
514  // 2019 Apr 10 jje: We can do rowptr in place, and no need to initialize since we can fault as we go
515  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
516  // we will not touch these until the final copyout step
517  lno_nnz_view_t entriesC;
518  scalar_view_t valuesC;
519 
520  // Track each thread's total nnz here. Since we do the rowptr in place, we could
521  // recover this from the rowptr itself, but that requires an unusual loop (that jumps all over memory)
522  lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);
523 
524  // Thread-local memory
525  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
526  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
527 
528  double thread_chunk = (double)(m) / thread_max;
529 
530  // Run chunks of the matrix independently
531  Kokkos::parallel_for("Jacobi::LTG::NewMatrix::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
532  // Thread coordination stuff
533  size_t my_thread_start = tid * thread_chunk;
534  size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
535  size_t my_thread_m = my_thread_stop - my_thread_start;
536 
537  // Size estimate
538  size_t CSR_alloc = (size_t)(my_thread_m * Cest_nnz_per_row * 0.75 + 100);
539 
540  // Allocations
541  std::vector<size_t> c_status(n, INVALID);
542 
543  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
544  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
545 
546  // For each row of A/C
547  size_t CSR_ip = 0, OLD_ip = 0;
548  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
549  // printf("CMS: row %d CSR_alloc = %d\n",(int)i,(int)CSR_alloc);fflush(stdout);
550  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
551  // on the calling process.
552  // JJE: directly access the rowptr here (indexed using our loop var)
553  row_mapC(i) = CSR_ip;
554  // NOTE: Vector::getLocalView returns a rank 2 view here
555  SC minusOmegaDval = -omega * Dvals(i, 0);
556 
557  // Entries of B
558  for (size_t j = Browptr(i); j < Browptr(i + 1); j++) {
559  const SC Bval = Bvals(j);
560  if (Bval == SC_ZERO)
561  continue;
562  LO Bij = Bcolind(j);
563  LO Cij = Bcol2Ccol(Bij);
564 
565  // Assume no repeated entries in B
566  c_status[Cij] = CSR_ip;
567  Ccolind(CSR_ip) = Cij;
568  Cvals(CSR_ip) = Bval;
569  CSR_ip++;
570  }
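// Row i of C now holds a copy of row i of B (the identity term of
// (I - omega*Dinv*A)*B); the loop below merges in the -omega * Dinv * A * B
// contributions, adding into entries whose column is already present and
// appending new entries otherwise.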
571 
572  // Entries of -omega * Dinv * A * B
573  // mfh 27 Sep 2016: For each entry of A in the current row of A
574  for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
575  LO Aik = Acolind(k); // local column index of current entry of A
576  const SC Aval = Avals(k); // value of current entry of A
577  if (Aval == SC_ZERO)
578  continue; // skip explicitly stored zero values in A
579 
580  if (targetMapToOrigRow(Aik) != LO_INVALID) {
581  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
582  // corresponding to the current entry of A is populated, then
583  // the corresponding row of B is in B_local (i.e., it lives on
584  // the calling process).
585 
586  // Local matrix
587  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
588 
589  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
590  for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
591  LO Bkj = Bcolind(j);
592  LO Cij = Bcol2Ccol(Bkj);
593 
594  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
595  // New entry
596  c_status[Cij] = CSR_ip;
597  Ccolind(CSR_ip) = Cij;
598  Cvals(CSR_ip) = minusOmegaDval * Aval * Bvals(j);
599  CSR_ip++;
600 
601  } else {
602  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
603  }
604  }
605 
606  } else {
607  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
608  // corresponding to the current entry of A is NOT populated (has
609  // a flag "invalid" value), then the corresponding row of B is
610  // in B_remote (i.e., it does not live on the calling process).
611 
612  // Remote matrix
613  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
614  for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
615  LO Ikj = Icolind(j);
616  LO Cij = Icol2Ccol(Ikj);
617 
618  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
619  // New entry
620  c_status[Cij] = CSR_ip;
621  Ccolind(CSR_ip) = Cij;
622  Cvals(CSR_ip) = minusOmegaDval * Aval * Ivals(j);
623  CSR_ip++;
624 
625  } else {
626  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
627  }
628  }
629  }
630  }
631 
632  // Resize for next pass if needed
633  if (i + 1 < my_thread_stop && CSR_ip + std::min(n, (Arowptr(i + 2) - Arowptr(i + 1) + 1) * b_max_nnz_per_row) > CSR_alloc) {
634  CSR_alloc *= 2;
635  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(), u_lno_nnz_view_t::shmem_size(CSR_alloc)), CSR_alloc);
636  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Cvals.data(), u_scalar_view_t::shmem_size(CSR_alloc)), CSR_alloc);
637  }
638  OLD_ip = CSR_ip;
639  }
640  thread_total_nnz(tid) = CSR_ip;
641  tl_colind(tid) = Ccolind;
642  tl_values(tid) = Cvals;
643  });
644 
645  // Do the copy out (removed the tl_rowptr!)
646  copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, row_mapC, entriesC, valuesC);
647 
648 #ifdef HAVE_TPETRA_MMM_TIMINGS
649  MM = Teuchos::null;
650  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix OpenMPSort"))));
651 #endif
652  // Sort & set values
653  if (params.is_null() || params->get("sort entries", true))
654  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
655  C.setAllValues(row_mapC, entriesC, valuesC);
656 }
657 
658 /*********************************************************************************************************/
659 template <class Scalar,
660  class LocalOrdinal,
661  class GlobalOrdinal,
662  class LocalOrdinalViewType>
663 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
664  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Dinv,
665  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
666  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
667  const LocalOrdinalViewType& targetMapToOrigRow,
668  const LocalOrdinalViewType& targetMapToImportRow,
669  const LocalOrdinalViewType& Bcol2Ccol,
670  const LocalOrdinalViewType& Icol2Ccol,
671  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
672  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
673  const std::string& label,
674  const Teuchos::RCP<Teuchos::ParameterList>& params) {
675 #ifdef HAVE_TPETRA_MMM_TIMINGS
676  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
677  using Teuchos::TimeMonitor;
678  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse LTGCore"))));
679 #endif
680  using Teuchos::Array;
681  using Teuchos::ArrayRCP;
682  using Teuchos::ArrayView;
683  using Teuchos::RCP;
684  using Teuchos::rcp;
685 
686  // Lots and lots of typedefs
687  typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
688  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KCRS;
689  // typedef typename KCRS::device_type device_t;
690  typedef typename KCRS::StaticCrsGraphType graph_t;
691  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
692  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
693  typedef typename KCRS::values_type::non_const_type scalar_view_t;
694 
695  // Jacobi-specific
696  typedef typename scalar_view_t::memory_space scalar_memory_space;
697 
698  typedef Scalar SC;
699  typedef LocalOrdinal LO;
700  typedef GlobalOrdinal GO;
701  typedef Node NO;
702  typedef Map<LO, GO, NO> map_type;
703 
704  // NOTE (mfh 15 Sep 2017) This is specifically only for
705  // execution_space = Kokkos::OpenMP, so we neither need nor want
706  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
707  typedef NO::execution_space execution_space;
708  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
709 
710  // All of the invalid guys
711  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
712  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
713  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
714 
715  // Grab the Kokkos::SparseCrsMatrices & inner stuff
716  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
717  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixDevice();
718  const KCRS& Cmat = C.getLocalMatrixDevice();
719 
720  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
721  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
722  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
723  scalar_view_t Cvals = Cmat.values;
724 
725  c_lno_view_t Irowptr;
726  c_lno_nnz_view_t Icolind;
727  scalar_view_t Ivals;
728  if (!Bview.importMatrix.is_null()) {
729  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
730  Irowptr = lclB.graph.row_map;
731  Icolind = lclB.graph.entries;
732  Ivals = lclB.values;
733  }
734 
735  // Jacobi-specific inner stuff
736  auto Dvals =
737  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
738 
739  // Sizes
740  RCP<const map_type> Ccolmap = C.getColMap();
741  size_t m = Aview.origMatrix->getLocalNumRows();
742  size_t n = Ccolmap->getLocalNumElements();
743 
744  // Get my node / thread info (right from openmp or parameter list)
745  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
746  if (!params.is_null()) {
747  if (params->isParameter("openmp: ltg thread max"))
748  thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
749  }
750 
751  double thread_chunk = (double)(m) / thread_max;
752 
753  // Run chunks of the matrix independently
754  Kokkos::parallel_for("Jacobi::LTG::Reuse::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
755  // Thread coordination stuff
756  size_t my_thread_start = tid * thread_chunk;
757  size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
758 
759  // Allocations
760  std::vector<size_t> c_status(n, INVALID);
761 
762  // For each row of A/C
763  size_t CSR_ip = 0, OLD_ip = 0;
764  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
765  // First fill the c_status array w/ locations where we're allowed to
766  // generate nonzeros for this row
767  OLD_ip = Crowptr(i);
768  CSR_ip = Crowptr(i + 1);
769  // NOTE: Vector::getLocalView returns a rank 2 view here
770  SC minusOmegaDval = -omega * Dvals(i, 0);
771 
772  for (size_t k = OLD_ip; k < CSR_ip; k++) {
773  c_status[Ccolind(k)] = k;
774  // Reset values in the row of C
775  Cvals(k) = SC_ZERO;
776  }
777 
778  // Entries of B
779  for (size_t j = Browptr(i); j < Browptr(i + 1); j++) {
780  const SC Bval = Bvals(j);
781  if (Bval == SC_ZERO)
782  continue;
783  LO Bij = Bcolind(j);
784  LO Cij = Bcol2Ccol(Bij);
785 
786  // Assume no repeated entries in B
787  Cvals(c_status[Cij]) += Bvals(j);
788  CSR_ip++;
789  }
790 
791  for (size_t k = Arowptr(i); k < Arowptr(i + 1); k++) {
792  LO Aik = Acolind(k);
793  const SC Aval = Avals(k);
794  if (Aval == SC_ZERO)
795  continue;
796 
797  if (targetMapToOrigRow(Aik) != LO_INVALID) {
798  // Local matrix
799  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
800 
801  for (size_t j = Browptr(Bk); j < Browptr(Bk + 1); ++j) {
802  LO Bkj = Bcolind(j);
803  LO Cij = Bcol2Ccol(Bkj);
804 
805  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
806  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
807  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
808 
809  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
810  }
811  } else {
812  // Remote matrix
813  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
814  for (size_t j = Irowptr(Ik); j < Irowptr(Ik + 1); ++j) {
815  LO Ikj = Icolind(j);
816  LO Cij = Icol2Ccol(Ikj);
817 
818  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
819  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
820  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
821 
822  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
823  }
824  }
825  }
826  }
827  });
828 
829  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
830 }
831 
832 /*********************************************************************************************************/
833 template <class InColindArrayType, class InValsArrayType,
834  class OutRowptrType, class OutColindType, class OutValsType>
835 void copy_out_from_thread_memory(const OutColindType& thread_total_nnz,
836  const InColindArrayType& Incolind,
837  const InValsArrayType& Invalues,
838  const size_t m,
839  const double thread_chunk,
840  OutRowptrType& row_mapC,
841  OutColindType& entriesC,
842  OutValsType& valuesC) {
843  typedef OutRowptrType lno_view_t;
844  typedef OutColindType lno_nnz_view_t;
845  typedef OutValsType scalar_view_t;
846  typedef typename lno_view_t::execution_space execution_space;
847  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
848 
849  // Generate the starting nnz number per thread
850  size_t thread_max = Incolind.size();
851  size_t c_nnz_size = 0;
852  // Since this kernel is 'low thread count', it is very likely that we could
853  // sum over thread_total_nnz serially rather than in parallel, but since it is a view
854  // we don't know where the memory actually lives... so we need to go parallel
855 
856  lno_view_t thread_start_nnz("thread_nnz", thread_max + 1);
857  Kokkos::parallel_scan("LTG::Scan", range_type(0, thread_max).set_chunk_size(1), [=](const size_t i, size_t& update, const bool final) {
858  size_t mynnz = thread_total_nnz(i);
859  if (final) thread_start_nnz(i) = update;
860  update += mynnz;
861  if (final && i + 1 == thread_max) thread_start_nnz(i + 1) = update;
862  });
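// The scan above is an exclusive prefix sum over per-thread counts. For example,
// if three threads produced {10, 7, 12} entries, thread_start_nnz becomes
// {0, 10, 17, 29} and c_nnz_size = 29, the total nnz of C and hence the length
// of entriesC and valuesC.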
863  c_nnz_size = thread_start_nnz(thread_max);
864 
865  // 2019 Apr 10 JJE: update the rowptr's final entry here
866  row_mapC(m) = thread_start_nnz(thread_max);
867 
868  // Allocate output
869  lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size);
870  entriesC = entriesC_;
871  scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size);
872  valuesC = valuesC_;
873 
874  // Copy out
875  Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
876  const size_t my_thread_start = tid * thread_chunk;
877  const size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
878  const size_t nnz_thread_start = thread_start_nnz(tid);
879  // we need this info, since we did the rowptr in place
880  const size_t nnz_thread_stop = thread_start_nnz(tid + 1);
881 
882  // There are two fundamental operations:
883  // * Updating the rowptr with the correct offset
884  // * copying entries and values
885 
886  // First, update the rowptr; since we did it in place, this is a += operation.
887  // In the paper, I experimented with a coloring scheme that had threads do
888  // their copies in different orders. It wasn't obvious it was beneficial
889  // but it can be replicated, by choosing the array to copy first based on your
890  // thread id % 3
891  if (my_thread_start != 0) {
892  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
893  row_mapC(i) += nnz_thread_start;
894  }
895  }
896 
897  // try to Kokkos::single() the alloc here. It should implicitly barrier
898  // thread 0 doesn't copy the rowptr, so it could hit the single first
899  // in the paper, I played a game that effectively got LTG down to a single
900  // OpenMP barrier. But doing that requires the ability to spawn a single
901  // parallel region. The Scan above was implemented using atomic adds
902  // and the barrier was only needed so you could allocate
903  //
904  // Since we can't spawn a single region, we could move the allocations
905  // here, and using 'single'. Most likely, thread 0 will hit it first allowing
906  // the other threads to update the rowptr while it allocates.
907 
908  // next, bulk copy the vals/colind arrays
909  const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
910  for (size_t i = 0; i < my_num_nnz; ++i) {
911  entriesC(nnz_thread_start + i) = Incolind(tid)(i);
912  valuesC(nnz_thread_start + i) = Invalues(tid)(i);
913  }
914 
916  // Free the unmanaged views; let each thread deallocate its own memory
916  // May need to cast away const here..
917  if (Incolind(tid).data()) free(Incolind(tid).data());
918  if (Invalues(tid).data()) free(Invalues(tid).data());
919  });
920 } // end copy_out
921 
922 #endif // OpenMP
923 
924 /*********************************************************************************************************/
925 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
926 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
927  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
928  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
929  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
930  const LocalOrdinalViewType& Acol2Brow,
931  const LocalOrdinalViewType& Acol2Irow,
932  const LocalOrdinalViewType& Bcol2Ccol,
933  const LocalOrdinalViewType& Icol2Ccol,
934  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
935  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Cimport,
936  const std::string& label,
937  const Teuchos::RCP<Teuchos::ParameterList>& params) {
938 #ifdef HAVE_TPETRA_MMM_TIMINGS
939  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
940  using Teuchos::TimeMonitor;
941  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK"))));
942  Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Multiply"))));
943  using Teuchos::rcp;
944 #endif
945  typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
946 
948  // This kernel computes (I - omega Dinv A) B the slow way, trading speed for generality.
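// A quick algebra check of the three steps below: since
//   (I - omega*Dinv*A) * B = B - omega * (Dinv*(A*B)),
// forming AB = A*B, left-scaling AB by Dinv, and then adding B + (-omega)*AB
// produces the same damped-Jacobi product as the fused LTG kernel.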
948 
949  // 1) Multiply A*B
950  Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(new Matrix_t(C.getRowMap(), 0));
951  Tpetra::MMdetails::mult_A_B_newmatrix(Aview, Bview, *AB, label + std::string(" MSAK"), params);
952 
953 #ifdef HAVE_TPETRA_MMM_TIMINGS
954  MM2 = Teuchos::null;
955  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Scale"))));
956 #endif
957 
959  // 2) Scale AB by Dinv
959  AB->leftScale(Dinv);
960 
961 #ifdef HAVE_TPETRA_MMM_TIMINGS
962  MM2 = Teuchos::null;
963  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Add"))));
964 #endif
965 
967  // 3) Compute C = B + (-omega) * (Dinv A B)
967  Teuchos::ParameterList jparams;
968  if (!params.is_null()) {
969  jparams = *params;
970  jparams.set("label", label + std::string(" MSAK Add"));
971  }
972  Scalar one = Teuchos::ScalarTraits<Scalar>::one();
973  Tpetra::MatrixMatrix::add(one, false, *Bview.origMatrix, Scalar(-omega), false, *AB, C, AB->getDomainMap(), AB->getRangeMap(), Teuchos::rcp(&jparams, false));
974 #ifdef HAVE_TPETRA_MMM_TIMINGS
975  MM2 = Teuchos::null;
976 #endif
977 } // jacobi_A_B_newmatrix_MultiplyScaleAddKernel
978 
979 #if defined(HAVE_TPETRA_INST_OPENMP)
980 /*********************************************************************************************************/
981 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
982 static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
983  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
984  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
985  const LocalOrdinalViewType& Acol2PRow,
986  const LocalOrdinalViewType& Acol2PRowImport,
987  const LocalOrdinalViewType& Pcol2Accol,
988  const LocalOrdinalViewType& PIcol2Accol,
989  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
990  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
991  const std::string& label,
992  const Teuchos::RCP<Teuchos::ParameterList>& params) {
993  using Teuchos::RCP;
994  using Tpetra::MatrixMatrix::UnmanagedView;
995 #ifdef HAVE_TPETRA_MMM_TIMINGS
996  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
997  using Teuchos::rcp;
998  using Teuchos::TimeMonitor;
999  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix LTGCore"))));
1000 #endif
1001 
1002  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1003  typedef Scalar SC;
1004  typedef LocalOrdinal LO;
1005  typedef GlobalOrdinal GO;
1006  typedef Node NO;
1007  typedef Map<LO, GO, NO> map_type;
1008  typedef typename Tpetra::CrsMatrix<SC, LO, GO, NO>::local_matrix_device_type KCRS;
1009  typedef typename KCRS::StaticCrsGraphType graph_t;
1010  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1011  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1012  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1013  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1014  typedef typename KCRS::device_type device_t;
1015  typedef typename device_t::execution_space execution_space;
1016  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1017 
1018  // Unmanaged versions of the above
1019  typedef UnmanagedView<lno_view_t> u_lno_view_t;
1020  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1021  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1022 
1023  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1024  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1025 
1026  // Sizes
1027  RCP<const map_type> Accolmap = Ac.getColMap();
1028  size_t m = Rview.origMatrix->getLocalNumRows();
1029  size_t n = Accolmap->getLocalNumElements();
1030 
1031  // Get raw Kokkos matrices, and the raw CSR views
1032  const KCRS& Rmat = Rview.origMatrix->getLocalMatrixDevice();
1033  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
1034  const KCRS& Pmat = Pview.origMatrix->getLocalMatrixDevice();
1035 
1036  c_lno_view_t Rrowptr = Rmat.graph.row_map,
1037  Arowptr = Amat.graph.row_map,
1038  Prowptr = Pmat.graph.row_map, Irowptr;
1039  const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1040  Acolind = Amat.graph.entries,
1041  Pcolind = Pmat.graph.entries;
1042  lno_nnz_view_t Icolind;
1043  const scalar_view_t Rvals = Rmat.values,
1044  Avals = Amat.values,
1045  Pvals = Pmat.values;
1046  scalar_view_t Ivals;
1047 
1048  if (!Pview.importMatrix.is_null()) {
1049  const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1050  Irowptr = Imat.graph.row_map;
1051  Icolind = Imat.graph.entries;
1052  Ivals = Imat.values;
1053  }
1054 
1055  // Classic csr assembly (low memory edition)
1056  //
1057  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
1058  // The method loops over rows of R, and may resize after processing
1059  // each row. Chris Siefert says that this reflects experience in
1060  // ML; for the non-threaded case, ML found it faster to spend less
1061  // effort on estimation and risk an occasional reallocation.
1062 
1063  size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1064  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1065 
1066  // Get my node / thread info (right from openmp or parameter list)
1067  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1068  if (!params.is_null()) {
1069  if (params->isParameter("openmp: ltg thread max"))
1070  thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
1071  }
1072 
1073  double thread_chunk = (double)(m) / thread_max;
1074 
1075  // we can construct the rowptr inplace, allocate here and fault in parallel
1076  lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
1077  // we do not touch these until copy out
1078  lno_nnz_view_t entriesAc;
1079  scalar_view_t valuesAc;
1080 
1081  // Track each thread's total nnz here. Since we do the rowptr in place, we could
1082  // recover this from the rowptr itself, but that requires an unusual loop (that jumps all over memory)
1083  lno_nnz_view_t thread_total_nnz("thread_total_nnz", thread_max + 1);
1084 
1085  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1086  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1087  //
1088  // For column index Aik in row i of A, Acol2PRow[Aik] tells
1089  // you whether the corresponding row of P belongs to P_local
1090  // ("orig") or P_remote ("Import").
1091 
1092  // Thread-local memory
1093  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
1094  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
1095 
1096  // For each row of R
1097  Kokkos::parallel_for("MMM::RAP::NewMatrix::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
1098  // Thread coordination stuff
1099  size_t my_thread_start = tid * thread_chunk;
1100  size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
1101  size_t my_thread_m = my_thread_stop - my_thread_start;
1102 
1103  size_t nnzAllocated = (size_t)(my_thread_m * Acest_nnz_per_row + 100);
1104 
1105  std::vector<size_t> ac_status(n, INVALID);
1106 
1107  // manually allocate the thread-local storage for Ac
1108  u_lno_view_t Acrowptr((typename u_lno_view_t::data_type)malloc(u_lno_view_t::shmem_size(my_thread_m + 1)), my_thread_m + 1);
1109  u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1110  u_scalar_view_t Acvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1111 
1112  // count entries as they are added to Ac
1113  size_t nnz = 0, nnz_old = 0;
1114  // bmk: loop over the rows of R which are assigned to thread tid
1115  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1116  // directly index into the rowptr
1117  rowmapAc(i) = nnz;
1118  // mfh 27 Sep 2016: For each entry of R in the current row of R
1119  for (size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1120  LO k = Rcolind(kk); // local column index of current entry of R
1121  const SC Rik = Rvals(kk); // value of current entry of R
1122  if (Rik == SC_ZERO)
1123  continue; // skip explicitly stored zero values in R
1124  // For each entry of A in the current row of A
1125  for (size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1126  LO l = Acolind(ll); // local column index of current entry of A
1127  const SC Akl = Avals(ll); // value of current entry of A
1128  if (Akl == SC_ZERO)
1129  continue; // skip explicitly stored zero values in A
1130 
1131  if (Acol2PRow[l] != LO_INVALID) {
1132  // mfh 27 Sep 2016: If the entry of Acol2PRow
1133  // corresponding to the current entry of A is populated, then
1134  // the corresponding row of P is in P_local (i.e., it lives on
1135  // the calling process).
1136 
1137  // Local matrix
1138  size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1139 
1140  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1141  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1142  LO j = Pcolind(jj);
1143  LO Acj = Pcol2Accol(j);
1144  SC Plj = Pvals(jj);
1145 
1146  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1147 #ifdef HAVE_TPETRA_DEBUG
1148  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1149  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1150  std::runtime_error,
1151  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1152 #endif
1153  // New entry
1154  ac_status[Acj] = nnz;
1155  Accolind(nnz) = Acj;
1156  Acvals(nnz) = Rik * Akl * Plj;
1157  nnz++;
1158  } else {
1159  Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1160  }
1161  }
1162  } else {
1163  // mfh 27 Sep 2016: If the entry of Acol2PRow
1164  // corresponding to the current entry of A is NOT populated (has
1165  // a flag "invalid" value), then the corresponding row of P is
1166  // in P_remote (i.e., it does not live on the calling process).
1167 
1168  // Remote matrix
1169  size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1170  for (size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1171  LO j = Icolind(jj);
1172  LO Acj = PIcol2Accol(j);
1173  SC Plj = Ivals(jj);
1174 
1175  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1176 #ifdef HAVE_TPETRA_DEBUG
1177  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1178  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1179  std::runtime_error,
1180  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1181 #endif
1182  // New entry
1183  ac_status[Acj] = nnz;
1184  Accolind(nnz) = Acj;
1185  Acvals(nnz) = Rik * Akl * Plj;
1186  nnz++;
1187  } else {
1188  Acvals(ac_status[Acj]) += Rik * Akl * Plj;
1189  }
1190  }
1191  }
1192  }
1193  }
1194  // Resize for next pass if needed
1195  if (nnz + n > nnzAllocated) {
1196  nnzAllocated *= 2;
1197  Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1198  Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc((void*)Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1199  }
1200  nnz_old = nnz;
1201  }
1202  thread_total_nnz(tid) = nnz;
1203  tl_colind(tid) = Accolind;
1204  tl_values(tid) = Acvals;
1205  });
1206 #ifdef HAVE_TPETRA_MMM_TIMINGS
1207  MM = Teuchos::null;
1208  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix copy from thread local"))));
1209 #endif
1210 
1211  copy_out_from_thread_memory(thread_total_nnz, tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1212 
1213 #ifdef HAVE_TPETRA_MMM_TIMINGS
1214  MM = Teuchos::null;
1215  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1216 #endif
1217 
1218  // Final sort & set of CRS arrays
1219  Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1220  // mfh 27 Sep 2016: This just sets pointers.
1221  Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1222 
1223 #ifdef HAVE_TPETRA_MMM_TIMINGS
1224  MM = Teuchos::null;
1225  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1226 #endif
1227 
1228  // Final FillComplete
1229  //
1230  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1231  // Import (from domain Map to column Map) construction (which costs
1232  // lots of communication) by taking the previously constructed
1233  // Import object. We should be able to do this without interfering
1234  // with the implementation of the local part of sparse matrix-matrix
1235  // multiply above.
1236  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1237  labelList->set("Timer Label", label);
1238  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
1239  RCP<const Export<LO, GO, NO> > dummyExport;
1240  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1241  Rview.origMatrix->getRangeMap(),
1242  Acimport,
1243  dummyExport,
1244  labelList);
1245 }
1246 
1247 /*********************************************************************************************************/
1248 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1249 static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1250  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1251  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1252  const LocalOrdinalViewType& Acol2PRow,
1253  const LocalOrdinalViewType& Acol2PRowImport,
1254  const LocalOrdinalViewType& Pcol2Accol,
1255  const LocalOrdinalViewType& PIcol2Accol,
1256  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1257  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1258  const std::string& label,
1259  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1260  using Teuchos::RCP;
1261  using Tpetra::MatrixMatrix::UnmanagedView;
1262 #ifdef HAVE_TPETRA_MMM_TIMINGS
1263  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1264  using Teuchos::rcp;
1265  using Teuchos::TimeMonitor;
1266  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse LTGCore"))));
1267 #endif
1268 
1269  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1270  typedef Scalar SC;
1271  typedef LocalOrdinal LO;
1272  typedef GlobalOrdinal GO;
1273  typedef Node NO;
1274  typedef Map<LO, GO, NO> map_type;
1275  typedef typename Tpetra::CrsMatrix<SC, LO, GO, NO>::local_matrix_device_type KCRS;
1276  typedef typename KCRS::StaticCrsGraphType graph_t;
1277  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1278  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1279  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1280  typedef typename KCRS::device_type device_t;
1281  typedef typename device_t::execution_space execution_space;
1282  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1283 
1284  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1285  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1286  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1287 
1288  // Sizes
1289  RCP<const map_type> Accolmap = Ac.getColMap();
1290  size_t m = Rview.origMatrix->getLocalNumRows();
1291  size_t n = Accolmap->getLocalNumElements();
1292 
1293  // Get raw Kokkos matrices, and the raw CSR views
1294  const KCRS& Rmat = Rview.origMatrix->getLocalMatrixDevice();
1295  const KCRS& Amat = Aview.origMatrix->getLocalMatrixDevice();
1296  const KCRS& Pmat = Pview.origMatrix->getLocalMatrixDevice();
1297  const KCRS& Cmat = Ac.getLocalMatrixDevice();
1298 
1299  c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1300  const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1301  lno_nnz_view_t Icolind;
1302  const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1303  scalar_view_t Cvals = Cmat.values;
1304  scalar_view_t Ivals;
1305 
1306  if (!Pview.importMatrix.is_null()) {
1307  const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1308  Irowptr = Imat.graph.row_map;
1309  Icolind = Imat.graph.entries;
1310  Ivals = Imat.values;
1311  }
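 // Pview.importMatrix ("P_remote") holds the rows of P that live on other
 // processes but are referenced by columns of A on this process; its CSR
 // views (Irowptr / Icolind / Ivals) are only touched in the LO_INVALID
 // branch of the inner loop below.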
1312 
1313  // Get my node / thread info (right from openmp or parameter list)
1314  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1315  if (!params.is_null()) {
1316  if (params->isParameter("openmp: ltg thread max"))
1317  thread_max = std::max((size_t)1, std::min(thread_max, params->get("openmp: ltg thread max", thread_max)));
1318  }
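 // Hypothetical caller-side usage: the thread count used by this kernel can be
 // capped through the parameter list, e.g.
 //   RCP<Teuchos::ParameterList> p = rcp(new Teuchos::ParameterList);
 //   p->set("openmp: ltg thread max", size_t(8));
 // The std::max/std::min above then clamp the result to [1, concurrency()].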
1319 
1320  double thread_chunk = (double)(m) / thread_max;
1321 
1322  // For each row of R
1323  Kokkos::parallel_for("MMM::RAP::Reuse::LTG::ThreadLocal", range_type(0, thread_max).set_chunk_size(1), [=](const size_t tid) {
1324  // Thread coordination stuff
1325  size_t my_thread_start = tid * thread_chunk;
1326  size_t my_thread_stop = tid == thread_max - 1 ? m : (tid + 1) * thread_chunk;
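 // For example, with m = 1000 rows and thread_max = 4, thread_chunk = 250, so
 // thread 0 handles rows [0,250), thread 1 rows [250,500), and the last thread
 // always runs through row m-1 regardless of rounding in thread_chunk.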
1327 
1328  std::vector<size_t> c_status(n, INVALID);
1329 
1330  // count entries as they are added to Ac
1331  size_t OLD_ip = 0, CSR_ip = 0;
1332  // bmk: loop over the rows of R which are assigned to thread tid
1333  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1334  // First fill the c_status array w/ locations where we're allowed to
1335  // generate nonzeros for this row
1336  OLD_ip = Crowptr(i);
1337  CSR_ip = Crowptr(i + 1);
1338  for (size_t k = OLD_ip; k < CSR_ip; k++) {
1339  c_status[Ccolind(k)] = k;
1340  // Reset values in the row of C
1341  Cvals(k) = SC_ZERO;
1342  }
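 // c_status[j] now records where column j of row i lives inside Ac's fixed
 // value array; any Cij whose c_status falls outside [OLD_ip, CSR_ip) is not
 // part of Ac's static graph, and the TEUCHOS_TEST_FOR_EXCEPTION checks below
 // treat such an attempted insertion as an error.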
1343 
1344  // mfh 27 Sep 2016: For each entry of R in the current row of R
1345  for (size_t kk = Rrowptr(i); kk < Rrowptr(i + 1); kk++) {
1346  LO k = Rcolind(kk); // local column index of current entry of R
1347  const SC Rik = Rvals(kk); // value of current entry of R
1348  if (Rik == SC_ZERO)
1349  continue; // skip explicitly stored zero values in R
1350  // For each entry of A in the current row of A
1351  for (size_t ll = Arowptr(k); ll < Arowptr(k + 1); ll++) {
1352  LO l = Acolind(ll); // local column index of current entry of A
1353  const SC Akl = Avals(ll); // value of current entry of A
1354  if (Akl == SC_ZERO)
1355  continue; // skip explicitly stored zero values in A
1356 
1357  if (Acol2PRow[l] != LO_INVALID) {
1358  // mfh 27 Sep 2016: If the entry of Acol2PRow
1359  // corresponding to the current entry of A is populated, then
1360  // the corresponding row of P is in P_local (i.e., it lives on
1361  // the calling process).
1362 
1363  // Local matrix
1364  size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1365 
1366  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1367  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl + 1); jj++) {
1368  LO j = Pcolind(jj);
1369  LO Cij = Pcol2Accol(j);
1370  SC Plj = Pvals(jj);
1371  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1372  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
1373  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1374 
1375  Cvals(c_status[Cij]) += Rik * Akl * Plj;
1376  }
1377  } else {
1378  // mfh 27 Sep 2016: If the entry of Acol2PRow
1379  // corresponding to the current entry of A is NOT populated (has
1380  // a flag "invalid" value), then the corresponding row of P is
1381  // in P_remote (i.e., it does not live on the calling process).
1382 
1383  // Remote matrix
1384  size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1385  for (size_t jj = Irowptr(Il); jj < Irowptr(Il + 1); jj++) {
1386  LO j = Icolind(jj);
1387  LO Cij = PIcol2Accol(j);
1388  SC Plj = Ivals(jj);
1389  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1390  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
1391  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1392 
1393  Cvals(c_status[Cij]) += Rik * Akl * Plj;
1394  }
1395  }
1396  }
1397  }
1398  }
1399  });
1400  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
1401 }
1402 
1403 #endif
1404 
1405 } // namespace ExtraKernels
1406 } // namespace MatrixMatrix
1407 } // namespace Tpetra
1408 
1409 #endif