Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
44 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
45 #include "Tpetra_RowMatrixTransposer.hpp"
46 
47 namespace Tpetra {
48 
49 namespace MatrixMatrix{
50 
51 namespace ExtraKernels{
52 
53 
54 // Double product version
55 template<class CrsMatrixType>
56 size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
57  // Follows the NZ estimate in ML's ml_matmatmult.c
58  size_t Aest = 100, Best=100;
59  if (A.getLocalNumEntries() > 0)
60  Aest = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
61  if (B.getLocalNumEntries() > 0)
62  Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
63 
64  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
65  nnzperrow *= nnzperrow;
66 
67  return nnzperrow;
68 }
69 
70 
71 // Triple product version
72 template<class CrsMatrixType>
73 size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
74  size_t nnzPerRowA = 100, Pcols = 100;
75  if (A.getLocalNumEntries() > 0)
76  nnzPerRowA = (A.getLocalNumRows() > 0)? A.getLocalNumEntries()/A.getLocalNumRows() : 9;
77  if (P.getLocalNumEntries() > 0)
78  Pcols = (P.getLocalNumCols() > 0) ? P.getLocalNumCols() : 100;
79  return (size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
80 }
81 
82 #if defined (HAVE_TPETRA_INST_OPENMP)
83 /*********************************************************************************************************/
84 template<class Scalar,
85  class LocalOrdinal,
86  class GlobalOrdinal,
87  class LocalOrdinalViewType>
88 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
89  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
90  const LocalOrdinalViewType & targetMapToOrigRow,
91  const LocalOrdinalViewType & targetMapToImportRow,
92  const LocalOrdinalViewType & Bcol2Ccol,
93  const LocalOrdinalViewType & Icol2Ccol,
94  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
95  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
96  const std::string& label,
97  const Teuchos::RCP<Teuchos::ParameterList>& params) {
98 #ifdef HAVE_TPETRA_MMM_TIMINGS
99  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
100  using Teuchos::TimeMonitor;
101  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix LTGCore"))));
102 #endif
103  using Teuchos::Array;
104  using Teuchos::ArrayRCP;
105  using Teuchos::ArrayView;
106  using Teuchos::RCP;
107  using Teuchos::rcp;
108 
109 
110  // Lots and lots of typedefs
112  typedef typename KCRS::device_type device_t;
113  typedef typename KCRS::StaticCrsGraphType graph_t;
114  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
115  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
116  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
117  typedef typename KCRS::values_type::non_const_type scalar_view_t;
118 
119  // Unmanaged versions of the above
120  //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
121  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
122  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
123 
124  typedef Scalar SC;
125  typedef LocalOrdinal LO;
126  typedef GlobalOrdinal GO;
127  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
128  typedef Map<LO,GO,NO> map_type;
129 
130  // NOTE (mfh 15 Sep 2017) This is specifically only for
131  // execution_space = Kokkos::OpenMP, so we neither need nor want
132  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
133  typedef NO::execution_space execution_space;
134  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
135 
136  // All of the invalid guys
137  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
138  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
139  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
140 
141  // Grab the Kokkos::SparseCrsMatrices & inner stuff
142  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
143  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
144 
145  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
146  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
147  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
148  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
149 
150  c_lno_view_t Irowptr;
151  lno_nnz_view_t Icolind;
152  scalar_view_t Ivals;
153  if(!Bview.importMatrix.is_null()) {
154  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
155  Irowptr = lclB.graph.row_map;
156  Icolind = lclB.graph.entries;
157  Ivals = lclB.values;
158  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
159  }
160 
161  // Sizes
162  RCP<const map_type> Ccolmap = C.getColMap();
163  size_t m = Aview.origMatrix->getLocalNumRows();
164  size_t n = Ccolmap->getLocalNumElements();
165  size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
166 
167  // Get my node / thread info (right from openmp or parameter list)
168  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
169  if(!params.is_null()) {
170  if(params->isParameter("openmp: ltg thread max"))
171  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
172  }
173 
174  // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
175  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
176  // we will not touch these until the final copyout step
177  lno_nnz_view_t entriesC;
178  scalar_view_t valuesC;
179 
180  // add this, since we do the rowptr in place, we could figure this out
181  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
182  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
183 
184  // Thread-local memory
185  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind",thread_max);
186  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values",thread_max);
187 
188  double thread_chunk = (double)(m) / thread_max;
189 
190  // Run chunks of the matrix independently
191  Kokkos::parallel_for("MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
192  {
193  // Thread coordination stuff
194  size_t my_thread_start = tid * thread_chunk;
195  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
196  size_t my_thread_m = my_thread_stop - my_thread_start;
197 
198  // Size estimate
199  size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
200 
201  // Allocations
202  std::vector<size_t> c_status(n,INVALID);
203 
204  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
205  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
206 
207  // For each row of A/C
208  size_t CSR_ip = 0, OLD_ip = 0;
209  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
210  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
211  // on the calling process.
212  // JJE 10 Apr 2019 index directly into the rowptr
213  row_mapC(i) = CSR_ip;
214 
215  // mfh 27 Sep 2016: For each entry of A in the current row of A
216  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
217  LO Aik = Acolind(k); // local column index of current entry of A
218  const SC Aval = Avals(k); // value of current entry of A
219  if (Aval == SC_ZERO)
220  continue; // skip explicitly stored zero values in A
221 
222  if (targetMapToOrigRow(Aik) != LO_INVALID) {
223  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
224  // corresponding to the current entry of A is populated, then
225  // the corresponding row of B is in B_local (i.e., it lives on
226  // the calling process).
227 
228  // Local matrix
229  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
230 
231  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
232  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
233  LO Bkj = Bcolind(j);
234  LO Cij = Bcol2Ccol(Bkj);
235 
236  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
237  // New entry
238  c_status[Cij] = CSR_ip;
239  Ccolind(CSR_ip) = Cij;
240  Cvals(CSR_ip) = Aval*Bvals(j);
241  CSR_ip++;
242 
243  } else {
244  Cvals(c_status[Cij]) += Aval*Bvals(j);
245  }
246  }
247 
248  } else {
249  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
250  // corresponding to the current entry of A NOT populated (has
251  // a flag "invalid" value), then the corresponding row of B is
252  // in B_local (i.e., it lives on the calling process).
253 
254  // Remote matrix
255  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
256  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
257  LO Ikj = Icolind(j);
258  LO Cij = Icol2Ccol(Ikj);
259 
260  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
261  // New entry
262  c_status[Cij] = CSR_ip;
263  Ccolind(CSR_ip) = Cij;
264  Cvals(CSR_ip) = Aval*Ivals(j);
265  CSR_ip++;
266 
267  } else {
268  Cvals(c_status[Cij]) += Aval*Ivals(j);
269  }
270  }
271  }
272  }
273 
274  // Resize for next pass if needed
275  if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
276  CSR_alloc *= 2;
277  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
278  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
279  }
280  OLD_ip = CSR_ip;
281  }
282  thread_total_nnz(tid) = CSR_ip;
283  tl_colind(tid) = Ccolind;
284  tl_values(tid) = Cvals;
285  });
286 
287  // Do the copy out
288  copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
289 
290 #ifdef HAVE_TPETRA_MMM_TIMINGS
291  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort"))));
292 #endif
293  // Sort & set values
294  if (params.is_null() || params->get("sort entries",true))
295  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
296  C.setAllValues(row_mapC,entriesC,valuesC);
297 
298 }
299 
300 /*********************************************************************************************************/
301 template<class Scalar,
302  class LocalOrdinal,
303  class GlobalOrdinal,
304  class LocalOrdinalViewType>
305 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
306  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
307  const LocalOrdinalViewType & targetMapToOrigRow,
308  const LocalOrdinalViewType & targetMapToImportRow,
309  const LocalOrdinalViewType & Bcol2Ccol,
310  const LocalOrdinalViewType & Icol2Ccol,
311  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
312  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
313  const std::string& label,
314  const Teuchos::RCP<Teuchos::ParameterList>& params) {
315 #ifdef HAVE_TPETRA_MMM_TIMINGS
316  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
317  using Teuchos::TimeMonitor;
318  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse LTGCore"))));
319 #endif
320 
321  using Teuchos::Array;
322  using Teuchos::ArrayRCP;
323  using Teuchos::ArrayView;
324  using Teuchos::RCP;
325  using Teuchos::rcp;
326 
327  // Lots and lots of typedefs
329  // typedef typename KCRS::device_type device_t;
330  typedef typename KCRS::StaticCrsGraphType graph_t;
331  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
332  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
333  typedef typename KCRS::values_type::non_const_type scalar_view_t;
334 
335  typedef Scalar SC;
336  typedef LocalOrdinal LO;
337  typedef GlobalOrdinal GO;
338  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode NO;
339  typedef Map<LO,GO,NO> map_type;
340 
341  // NOTE (mfh 15 Sep 2017) This is specifically only for
342  // execution_space = Kokkos::OpenMP, so we neither need nor want
343  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
344  typedef NO::execution_space execution_space;
345  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
346 
347  // All of the invalid guys
348  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
349  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
350  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
351 
352  // Grab the Kokkos::SparseCrsMatrices & inner stuff
353  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
354  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
355  const KCRS & Cmat = C.getLocalMatrixDevice();
356 
357  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
358  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
359  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
360  scalar_view_t Cvals = Cmat.values;
361 
362  c_lno_view_t Irowptr;
363  c_lno_nnz_view_t Icolind;
364  scalar_view_t Ivals;
365  if(!Bview.importMatrix.is_null()) {
366  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
367  Irowptr = lclB.graph.row_map;
368  Icolind = lclB.graph.entries;
369  Ivals = lclB.values;
370  }
371 
372  // Sizes
373  RCP<const map_type> Ccolmap = C.getColMap();
374  size_t m = Aview.origMatrix->getLocalNumRows();
375  size_t n = Ccolmap->getLocalNumElements();
376 
377  // Get my node / thread info (right from openmp or parameter list)
378  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
379  if(!params.is_null()) {
380  if(params->isParameter("openmp: ltg thread max"))
381  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
382  }
383 
384  double thread_chunk = (double)(m) / thread_max;
385 
386  // Run chunks of the matrix independently
387  Kokkos::parallel_for("MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
388  {
389  // Thread coordination stuff
390  size_t my_thread_start = tid * thread_chunk;
391  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
392 
393  // Allocations
394  std::vector<size_t> c_status(n,INVALID);
395 
396  // For each row of A/C
397  size_t CSR_ip = 0, OLD_ip = 0;
398  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
399  // First fill the c_status array w/ locations where we're allowed to
400  // generate nonzeros for this row
401  OLD_ip = Crowptr(i);
402  CSR_ip = Crowptr(i+1);
403  for (size_t k = OLD_ip; k < CSR_ip; k++) {
404  c_status[Ccolind(k)] = k;
405  // Reset values in the row of C
406  Cvals(k) = SC_ZERO;
407  }
408 
409  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
410  LO Aik = Acolind(k);
411  const SC Aval = Avals(k);
412  if (Aval == SC_ZERO)
413  continue;
414 
415  if (targetMapToOrigRow(Aik) != LO_INVALID) {
416  // Local matrix
417  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
418 
419  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
420  LO Bkj = Bcolind(j);
421  LO Cij = Bcol2Ccol(Bkj);
422 
423  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
424  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
425  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
426 
427  Cvals(c_status[Cij]) += Aval * Bvals(j);
428  }
429  } else {
430  // Remote matrix
431  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
432  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
433  LO Ikj = Icolind(j);
434  LO Cij = Icol2Ccol(Ikj);
435 
436  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
437  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
438  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
439 
440  Cvals(c_status[Cij]) += Aval * Ivals(j);
441  }
442  }
443  }
444  }
445  });
446 
447  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
448 }
449 
450 /*********************************************************************************************************/
451 template<class Scalar,
452  class LocalOrdinal,
453  class GlobalOrdinal,
454  class LocalOrdinalViewType>
455 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
456  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> & Dinv,
457  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
458  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
459  const LocalOrdinalViewType & targetMapToOrigRow,
460  const LocalOrdinalViewType & targetMapToImportRow,
461  const LocalOrdinalViewType & Bcol2Ccol,
462  const LocalOrdinalViewType & Icol2Ccol,
463  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
464  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
465  const std::string& label,
466  const Teuchos::RCP<Teuchos::ParameterList>& params) {
467 #ifdef HAVE_TPETRA_MMM_TIMINGS
468  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
469  using Teuchos::TimeMonitor;
470  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix LTGCore"))));
471 #endif
472 
473  using Teuchos::Array;
474  using Teuchos::ArrayRCP;
475  using Teuchos::ArrayView;
476  using Teuchos::RCP;
477  using Teuchos::rcp;
478 
479  // Lots and lots of typedefs
480  typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
482  typedef typename KCRS::device_type device_t;
483  typedef typename KCRS::StaticCrsGraphType graph_t;
484  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
485  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
486  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
487  typedef typename KCRS::values_type::non_const_type scalar_view_t;
488 
489  // Unmanaged versions of the above
490  //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
491  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
492  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
493 
494  // Jacobi-specific
495  typedef typename scalar_view_t::memory_space scalar_memory_space;
496 
497  typedef Scalar SC;
498  typedef LocalOrdinal LO;
499  typedef GlobalOrdinal GO;
500  typedef Node NO;
501  typedef Map<LO,GO,NO> map_type;
502 
503  // NOTE (mfh 15 Sep 2017) This is specifically only for
504  // execution_space = Kokkos::OpenMP, so we neither need nor want
505  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
506  typedef NO::execution_space execution_space;
507  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
508 
509  // All of the invalid guys
510  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
511  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
512  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
513 
514  // Grab the Kokkos::SparseCrsMatrices & inner stuff
515  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
516  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
517 
518  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
519  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
520  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
521  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
522 
523  c_lno_view_t Irowptr;
524  lno_nnz_view_t Icolind;
525  scalar_view_t Ivals;
526  if(!Bview.importMatrix.is_null()) {
527  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
528  Irowptr = lclB.graph.row_map;
529  Icolind = lclB.graph.entries;
530  Ivals = lclB.values;
531  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
532  }
533 
534  // Jacobi-specific inner stuff
535  auto Dvals =
536  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
537 
538  // Sizes
539  RCP<const map_type> Ccolmap = C.getColMap();
540  size_t m = Aview.origMatrix->getLocalNumRows();
541  size_t n = Ccolmap->getLocalNumElements();
542  size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
543 
544  // Get my node / thread info (right from openmp)
545  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
546  if(!params.is_null()) {
547  if(params->isParameter("openmp: ltg thread max"))
548  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
549  }
550 
551  // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
552  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
553  // we will not touch these until the final copyout step
554  lno_nnz_view_t entriesC;
555  scalar_view_t valuesC;
556 
557  // add this, since we do the rowptr in place, we could figure this out
558  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
559  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
560 
561  // Thread-local memory
562  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind",thread_max);
563  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values",thread_max);
564 
565  double thread_chunk = (double)(m) / thread_max;
566 
567  // Run chunks of the matrix independently
568  Kokkos::parallel_for("Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
569  {
570  // Thread coordination stuff
571  size_t my_thread_start = tid * thread_chunk;
572  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
573  size_t my_thread_m = my_thread_stop - my_thread_start;
574 
575  // Size estimate
576  size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
577 
578  // Allocations
579  std::vector<size_t> c_status(n,INVALID);
580 
581  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
582  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
583 
584  // For each row of A/C
585  size_t CSR_ip = 0, OLD_ip = 0;
586  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
587  // printf("CMS: row %d CSR_alloc = %d\n",(int)i,(int)CSR_alloc);fflush(stdout);
588  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
589  // on the calling process.
590  // JJE: directly access the rowptr here (indexed using our loop var)
591  row_mapC(i) = CSR_ip;
592  // NOTE: Vector::getLocalView returns a rank 2 view here
593  SC minusOmegaDval = -omega*Dvals(i,0);
594 
595  // Entries of B
596  for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
597  const SC Bval = Bvals(j);
598  if (Bval == SC_ZERO)
599  continue;
600  LO Bij = Bcolind(j);
601  LO Cij = Bcol2Ccol(Bij);
602 
603  // Assume no repeated entries in B
604  c_status[Cij] = CSR_ip;
605  Ccolind(CSR_ip) = Cij;
606  Cvals(CSR_ip) = Bvals[j];
607  CSR_ip++;
608  }
609 
610  // Entries of -omega * Dinv * A * B
611  // mfh 27 Sep 2016: For each entry of A in the current row of A
612  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
613  LO Aik = Acolind(k); // local column index of current entry of A
614  const SC Aval = Avals(k); // value of current entry of A
615  if (Aval == SC_ZERO)
616  continue; // skip explicitly stored zero values in A
617 
618  if (targetMapToOrigRow(Aik) != LO_INVALID) {
619  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
620  // corresponding to the current entry of A is populated, then
621  // the corresponding row of B is in B_local (i.e., it lives on
622  // the calling process).
623 
624  // Local matrix
625  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
626 
627  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
628  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
629  LO Bkj = Bcolind(j);
630  LO Cij = Bcol2Ccol(Bkj);
631 
632  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
633  // New entry
634  c_status[Cij] = CSR_ip;
635  Ccolind(CSR_ip) = Cij;
636  Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
637  CSR_ip++;
638 
639  } else {
640  Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
641  }
642  }
643 
644  } else {
645  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
646  // corresponding to the current entry of A NOT populated (has
647  // a flag "invalid" value), then the corresponding row of B is
648  // in B_local (i.e., it lives on the calling process).
649 
650  // Remote matrix
651  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
652  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
653  LO Ikj = Icolind(j);
654  LO Cij = Icol2Ccol(Ikj);
655 
656  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
657  // New entry
658  c_status[Cij] = CSR_ip;
659  Ccolind(CSR_ip) = Cij;
660  Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
661  CSR_ip++;
662 
663  } else {
664  Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
665  }
666  }
667  }
668  }
669 
670  // Resize for next pass if needed
671  if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
672  CSR_alloc *= 2;
673  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
674  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
675  }
676  OLD_ip = CSR_ip;
677  }
678  thread_total_nnz(tid) = CSR_ip;
679  tl_colind(tid) = Ccolind;
680  tl_values(tid) = Cvals;
681  });
682 
683 
684  // Do the copy out (removed the tl_rowptr!)
685  copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
686 
687 
688 #ifdef HAVE_TPETRA_MMM_TIMINGS
689  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix OpenMPSort"))));
690 #endif
691  // Sort & set values
692  if (params.is_null() || params->get("sort entries",true))
693  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
694  C.setAllValues(row_mapC,entriesC,valuesC);
695 
696 }
697 
698 
699 
700 /*********************************************************************************************************/
701 template<class Scalar,
702  class LocalOrdinal,
703  class GlobalOrdinal,
704  class LocalOrdinalViewType>
705 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
706  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> & Dinv,
707  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
708  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Bview,
709  const LocalOrdinalViewType & targetMapToOrigRow,
710  const LocalOrdinalViewType & targetMapToImportRow,
711  const LocalOrdinalViewType & Bcol2Ccol,
712  const LocalOrdinalViewType & Icol2Ccol,
713  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& C,
714  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Cimport,
715  const std::string& label,
716  const Teuchos::RCP<Teuchos::ParameterList>& params) {
717 #ifdef HAVE_TPETRA_MMM_TIMINGS
718  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
719  using Teuchos::TimeMonitor;
720  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse LTGCore"))));
721 #endif
722  using Teuchos::Array;
723  using Teuchos::ArrayRCP;
724  using Teuchos::ArrayView;
725  using Teuchos::RCP;
726  using Teuchos::rcp;
727 
728  // Lots and lots of typedefs
729  typedef typename Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
731  // typedef typename KCRS::device_type device_t;
732  typedef typename KCRS::StaticCrsGraphType graph_t;
733  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
734  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
735  typedef typename KCRS::values_type::non_const_type scalar_view_t;
736 
737  // Jacobi-specific
738  typedef typename scalar_view_t::memory_space scalar_memory_space;
739 
740  typedef Scalar SC;
741  typedef LocalOrdinal LO;
742  typedef GlobalOrdinal GO;
743  typedef Node NO;
744  typedef Map<LO,GO,NO> map_type;
745 
746  // NOTE (mfh 15 Sep 2017) This is specifically only for
747  // execution_space = Kokkos::OpenMP, so we neither need nor want
748  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
749  typedef NO::execution_space execution_space;
750  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
751 
752  // All of the invalid guys
753  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
754  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
755  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
756 
757  // Grab the Kokkos::SparseCrsMatrices & inner stuff
758  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
759  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixDevice();
760  const KCRS & Cmat = C.getLocalMatrixDevice();
761 
762  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
763  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
764  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
765  scalar_view_t Cvals = Cmat.values;
766 
767  c_lno_view_t Irowptr;
768  c_lno_nnz_view_t Icolind;
769  scalar_view_t Ivals;
770  if(!Bview.importMatrix.is_null()) {
771  auto lclB = Bview.importMatrix->getLocalMatrixDevice();
772  Irowptr = lclB.graph.row_map;
773  Icolind = lclB.graph.entries;
774  Ivals = lclB.values;
775  }
776 
777  // Jacobi-specific inner stuff
778  auto Dvals =
779  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
780 
781  // Sizes
782  RCP<const map_type> Ccolmap = C.getColMap();
783  size_t m = Aview.origMatrix->getLocalNumRows();
784  size_t n = Ccolmap->getLocalNumElements();
785 
786  // Get my node / thread info (right from openmp or parameter list)
787  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
788  if(!params.is_null()) {
789  if(params->isParameter("openmp: ltg thread max"))
790  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
791  }
792 
793  double thread_chunk = (double)(m) / thread_max;
794 
795  // Run chunks of the matrix independently
796  Kokkos::parallel_for("Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
797  {
798  // Thread coordination stuff
799  size_t my_thread_start = tid * thread_chunk;
800  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
801 
802  // Allocations
803  std::vector<size_t> c_status(n,INVALID);
804 
805  // For each row of A/C
806  size_t CSR_ip = 0, OLD_ip = 0;
807  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
808  // First fill the c_status array w/ locations where we're allowed to
809  // generate nonzeros for this row
810  OLD_ip = Crowptr(i);
811  CSR_ip = Crowptr(i+1);
812  // NOTE: Vector::getLocalView returns a rank 2 view here
813  SC minusOmegaDval = -omega*Dvals(i,0);
814 
815  for (size_t k = OLD_ip; k < CSR_ip; k++) {
816  c_status[Ccolind(k)] = k;
817  // Reset values in the row of C
818  Cvals(k) = SC_ZERO;
819  }
820 
821  // Entries of B
822  for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
823  const SC Bval = Bvals(j);
824  if (Bval == SC_ZERO)
825  continue;
826  LO Bij = Bcolind(j);
827  LO Cij = Bcol2Ccol(Bij);
828 
829  // Assume no repeated entries in B
830  Cvals(c_status[Cij]) += Bvals(j);
831  CSR_ip++;
832  }
833 
834 
835  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
836  LO Aik = Acolind(k);
837  const SC Aval = Avals(k);
838  if (Aval == SC_ZERO)
839  continue;
840 
841  if (targetMapToOrigRow(Aik) != LO_INVALID) {
842  // Local matrix
843  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
844 
845  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
846  LO Bkj = Bcolind(j);
847  LO Cij = Bcol2Ccol(Bkj);
848 
849  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
850  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
851  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
852 
853  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
854  }
855  } else {
856  // Remote matrix
857  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
858  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
859  LO Ikj = Icolind(j);
860  LO Cij = Icol2Ccol(Ikj);
861 
862  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
863  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
864  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
865 
866  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
867  }
868  }
869  }
870  }
871  });
872 
873  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
874 }
875 
876 
877 /*********************************************************************************************************/
878 template<class InColindArrayType, class InValsArrayType,
879  class OutRowptrType, class OutColindType, class OutValsType>
880 void copy_out_from_thread_memory(const OutColindType& thread_total_nnz,
881  const InColindArrayType& Incolind,
882  const InValsArrayType& Invalues,
883  const size_t m,
884  const double thread_chunk,
885  OutRowptrType& row_mapC,
886  OutColindType& entriesC,
887  OutValsType& valuesC ) {
888  typedef OutRowptrType lno_view_t;
889  typedef OutColindType lno_nnz_view_t;
890  typedef OutValsType scalar_view_t;
891  typedef typename lno_view_t::execution_space execution_space;
892  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
893 
894  // Generate the starting nnz number per thread
895  size_t thread_max = Incolind.size();
896  size_t c_nnz_size=0;
897  // since this kernel is 'low thread count' is very likely, that we could
898  // sum over the thread_total_nnz and not go parallel, but since it is a view
899  // we don't know where the memory actually lives... so we kinda need to go parallel
900 
901  lno_view_t thread_start_nnz("thread_nnz",thread_max+1);
902  Kokkos::parallel_scan("LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (const size_t i, size_t& update, const bool final) {
903  size_t mynnz = thread_total_nnz(i);
904  if(final) thread_start_nnz(i) = update;
905  update+=mynnz;
906  if(final && i+1==thread_max) thread_start_nnz(i+1)=update;
907  });
908  c_nnz_size = thread_start_nnz(thread_max);
909 
910  // 2019 Apr 10 JJE: update the rowptr's final entry here
911  row_mapC(m) = thread_start_nnz(thread_max);
912 
913  // Allocate output
914  lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size); entriesC = entriesC_;
915  scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size); valuesC = valuesC_;
916 
917  // Copy out
918  Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid) {
919  const size_t my_thread_start = tid * thread_chunk;
920  const size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
921  const size_t nnz_thread_start = thread_start_nnz(tid);
922  // we need this info, since we did the rowptr in place
923  const size_t nnz_thread_stop = thread_start_nnz(tid+1);
924 
925  // There are two fundamental operations:
926  // * Updateing the rowptr with the correct offset
927  // * copying entries and values
928 
929  // First, update the rowptr, it since we did it inplace, it is a += operation
930  // in the paper, I experimented with a coloring scheme that had threads do
931  // do their copies in different orders. It wasn't obvious it was beneficial
932  // but it can be replicated, by choosing the array to copy first based on your
933  // thread id % 3
934  if (my_thread_start != 0 ) {
935  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
936  row_mapC(i) += nnz_thread_start;
937  }
938  }
939 
940  // try to Kokkos::single() the alloc here. It should implicitly barrier
941  // thread 0 doesn't copy the rowptr, so it could hit the single first
942  // in the paper, I played a game that effectively got LTG down to a single
943  // OpenMP barrier. But doing that requires the ability to spawn a single
944  // parallel region. The Scan above was implemented using atomic adds
945  // and the barrier was only needed so you could allocate
946  //
947  // Since we can't spawn a single region, we could move the allocations
948  // here, and using 'single'. Most likely, thread 0 will hit it first allowing
949  // the other threads to update the rowptr while it allocates.
950 
951 
952  // next, bulk copy the vals/colind arrays
953  const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
954  for (size_t i = 0; i < my_num_nnz; ++i) {
955  entriesC(nnz_thread_start + i) = Incolind(tid)(i);
956  valuesC(nnz_thread_start + i) = Invalues(tid)(i);
957  }
958 
959  //Free the unamanged views, let each thread deallocate its memory
960  // May need to cast away const here..
961  if(Incolind(tid).data()) free(Incolind(tid).data());
962  if(Invalues(tid).data()) free(Invalues(tid).data());
963  });
964 }//end copy_out
965 
966 #endif // OpenMP
967 
968 
969 /*********************************************************************************************************/
970 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
971 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
972  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
973  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
974  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
975  const LocalOrdinalViewType & Acol2Brow,
976  const LocalOrdinalViewType & Acol2Irow,
977  const LocalOrdinalViewType & Bcol2Ccol,
978  const LocalOrdinalViewType & Icol2Ccol,
979  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
980  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
981  const std::string& label,
982  const Teuchos::RCP<Teuchos::ParameterList>& params) {
983 #ifdef HAVE_TPETRA_MMM_TIMINGS
984  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
985  using Teuchos::TimeMonitor;
986  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK"))));
987  Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Multiply"))));
988  using Teuchos::rcp;
989 #endif
990  typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
991 
992  // This kernel computes (I-omega Dinv A) B the slow way (for generality's sake, you must understand)
993 
994  // 1) Multiply A*B
995  Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(new Matrix_t(C.getRowMap(),0));
996  Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(" MSAK"),params);
997 
998 #ifdef HAVE_TPETRA_MMM_TIMINGS
999  MM2=Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Scale"))));
1000 #endif
1001 
1002  // 2) Scale A by Dinv
1003  AB->leftScale(Dinv);
1004 
1005 #ifdef HAVE_TPETRA_MMM_TIMINGS
1006  MM2=Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Add"))));
1007 #endif
1008 
1009  // 3) Add [-omega Dinv A] + B
1010  Teuchos::ParameterList jparams;
1011  if(!params.is_null()) {
1012  jparams = *params;
1013  jparams.set("label",label+std::string(" MSAK Add"));
1014  }
1015  Scalar one = Teuchos::ScalarTraits<Scalar>::one();
1016  Tpetra::MatrixMatrix::add(one,false,*Bview.origMatrix,Scalar(-omega),false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,false));
1017 #ifdef HAVE_TPETRA_MMM_TIMINGS
1018  MM2=Teuchos::null;
1019 #endif
1020  }// jacobi_A_B_newmatrix_MultiplyScaleAddKernel
1021 
1022 
1023 
1024 #if defined (HAVE_TPETRA_INST_OPENMP)
1025 /*********************************************************************************************************/
1026 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1027 static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1028  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1029  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1030  const LocalOrdinalViewType & Acol2PRow,
1031  const LocalOrdinalViewType & Acol2PRowImport,
1032  const LocalOrdinalViewType & Pcol2Accol,
1033  const LocalOrdinalViewType & PIcol2Accol,
1034  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1035  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1036  const std::string& label,
1037  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1038 
1039  using Teuchos::RCP;
1040  using Tpetra::MatrixMatrix::UnmanagedView;
1041  #ifdef HAVE_TPETRA_MMM_TIMINGS
1042  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1043  using Teuchos::rcp;
1044  using Teuchos::TimeMonitor;
1045  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix LTGCore"))));
1046  #endif
1047 
1048  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1049  typedef Scalar SC;
1050  typedef LocalOrdinal LO;
1051  typedef GlobalOrdinal GO;
1052  typedef Node NO;
1053  typedef Map<LO,GO,NO> map_type;
1055  typedef typename KCRS::StaticCrsGraphType graph_t;
1056  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1057  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1058  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1059  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1060  typedef typename KCRS::device_type device_t;
1061  typedef typename device_t::execution_space execution_space;
1062  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1063 
1064  // Unmanaged versions of the above
1065  typedef UnmanagedView<lno_view_t> u_lno_view_t;
1066  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1067  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1068 
1069  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1070  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1071 
1072  // Sizes
1073  RCP<const map_type> Accolmap = Ac.getColMap();
1074  size_t m = Rview.origMatrix->getLocalNumRows();
1075  size_t n = Accolmap->getLocalNumElements();
1076 
1077  // Get raw Kokkos matrices, and the raw CSR views
1078  const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1079  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1080  const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1081 
1082  c_lno_view_t Rrowptr = Rmat.graph.row_map,
1083  Arowptr = Amat.graph.row_map,
1084  Prowptr = Pmat.graph.row_map, Irowptr;
1085  const lno_nnz_view_t Rcolind = Rmat.graph.entries,
1086  Acolind = Amat.graph.entries,
1087  Pcolind = Pmat.graph.entries;
1088  lno_nnz_view_t Icolind;
1089  const scalar_view_t Rvals = Rmat.values,
1090  Avals = Amat.values,
1091  Pvals = Pmat.values;
1092  scalar_view_t Ivals;
1093 
1094  if (!Pview.importMatrix.is_null())
1095  {
1096  const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1097  Irowptr = Imat.graph.row_map;
1098  Icolind = Imat.graph.entries;
1099  Ivals = Imat.values;
1100  }
1101 
1102  // Classic csr assembly (low memory edition)
1103  //
1104  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
1105  // The method loops over rows of R, and may resize after processing
1106  // each row. Chris Siefert says that this reflects experience in
1107  // ML; for the non-threaded case, ML found it faster to spend less
1108  // effort on estimation and risk an occasional reallocation.
1109 
1110  size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1111  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1112 
1113  // Get my node / thread info (right from openmp or parameter list)
1114  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1115  if(!params.is_null()) {
1116  if(params->isParameter("openmp: ltg thread max"))
1117  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1118  }
1119 
1120  double thread_chunk = (double)(m) / thread_max;
1121 
1122  // we can construct the rowptr inplace, allocate here and fault in parallel
1123  lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
1124  // we do not touch these until copy out
1125  lno_nnz_view_t entriesAc;
1126  scalar_view_t valuesAc;
1127 
1128  // add this, since we do the rowptr in place, we could figure this out
1129  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
1130  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
1131 
1132  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1133  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1134  //
1135  // For column index Aik in row i of A, Acol2PRow[Aik] tells
1136  // you whether the corresponding row of P belongs to P_local
1137  // ("orig") or P_remote ("Import").
1138 
1139  // Thread-local memory
1140  Kokkos::View<u_lno_nnz_view_t*, device_t> tl_colind("top_colind", thread_max);
1141  Kokkos::View<u_scalar_view_t*, device_t> tl_values("top_values", thread_max);
1142 
1143  // For each row of R
1144  Kokkos::parallel_for("MMM::RAP::NewMatrix::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1145  {
1146  // Thread coordination stuff
1147  size_t my_thread_start = tid * thread_chunk;
1148  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1149  size_t my_thread_m = my_thread_stop - my_thread_start;
1150 
1151  size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
1152 
1153  std::vector<size_t> ac_status(n, INVALID);
1154 
1155  //manually allocate the thread-local storage for Ac
1156  u_lno_view_t Acrowptr((typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
1157  u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1158  u_scalar_view_t Acvals((typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1159 
1160  //count entries as they are added to Ac
1161  size_t nnz = 0, nnz_old = 0;
1162  // bmk: loop over the rows of R which are assigned to thread tid
1163  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1164  // directly index into the rowptr
1165  rowmapAc(i) = nnz;
1166  // mfh 27 Sep 2016: For each entry of R in the current row of R
1167  for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1168  LO k = Rcolind(kk); // local column index of current entry of R
1169  const SC Rik = Rvals(kk); // value of current entry of R
1170  if (Rik == SC_ZERO)
1171  continue; // skip explicitly stored zero values in R
1172  // For each entry of A in the current row of A
1173  for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1174  LO l = Acolind(ll); // local column index of current entry of A
1175  const SC Akl = Avals(ll); // value of current entry of A
1176  if (Akl == SC_ZERO)
1177  continue; // skip explicitly stored zero values in A
1178 
1179  if (Acol2PRow[l] != LO_INVALID) {
1180  // mfh 27 Sep 2016: If the entry of Acol2PRow
1181  // corresponding to the current entry of A is populated, then
1182  // the corresponding row of P is in P_local (i.e., it lives on
1183  // the calling process).
1184 
1185  // Local matrix
1186  size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1187 
1188  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1189  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1190  LO j = Pcolind(jj);
1191  LO Acj = Pcol2Accol(j);
1192  SC Plj = Pvals(jj);
1193 
1194  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1195  #ifdef HAVE_TPETRA_DEBUG
1196  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1197  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1198  std::runtime_error,
1199  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1200  #endif
1201  // New entry
1202  ac_status[Acj] = nnz;
1203  Accolind(nnz) = Acj;
1204  Acvals(nnz) = Rik*Akl*Plj;
1205  nnz++;
1206  } else {
1207  Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1208  }
1209  }
1210  } else {
1211  // mfh 27 Sep 2016: If the entry of Acol2PRow
1212  // corresponding to the current entry of A is NOT populated (has
1213  // a flag "invalid" value), then the corresponding row of P is
1214  // in P_remote (i.e., it does not live on the calling process).
1215 
1216  // Remote matrix
1217  size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1218  for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1219  LO j = Icolind(jj);
1220  LO Acj = PIcol2Accol(j);
1221  SC Plj = Ivals(jj);
1222 
1223  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1224  #ifdef HAVE_TPETRA_DEBUG
1225  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1226  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1227  std::runtime_error,
1228  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1229  #endif
1230  // New entry
1231  ac_status[Acj] = nnz;
1232  Accolind(nnz) = Acj;
1233  Acvals(nnz) = Rik*Akl*Plj;
1234  nnz++;
1235  } else {
1236  Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1237  }
1238  }
1239  }
1240  }
1241  }
1242  // Resize for next pass if needed
1243  if (nnz + n > nnzAllocated) {
1244  nnzAllocated *= 2;
1245  Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1246  Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1247  }
1248  nnz_old = nnz;
1249  }
1250  thread_total_nnz(tid) = nnz;
1251  tl_colind(tid) = Accolind;
1252  tl_values(tid) = Acvals;
1253  });
1254  #ifdef HAVE_TPETRA_MMM_TIMINGS
1255  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix copy from thread local"))));
1256  #endif
1257 
1258  copy_out_from_thread_memory(thread_total_nnz,tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1259 
1260  #ifdef HAVE_TPETRA_MMM_TIMINGS
1261  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1262  #endif
1263 
1264  // Final sort & set of CRS arrays
1265  Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1266  // mfh 27 Sep 2016: This just sets pointers.
1267  Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1268 
1269  #ifdef HAVE_TPETRA_MMM_TIMINGS
1270  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1271  #endif
1272 
1273  // Final FillComplete
1274  //
1275  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1276  // Import (from domain Map to column Map) construction (which costs
1277  // lots of communication) by taking the previously constructed
1278  // Import object. We should be able to do this without interfering
1279  // with the implementation of the local part of sparse matrix-matrix
1280  // multiply above.
1281  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1282  labelList->set("Timer Label",label);
1283  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1284  RCP<const Export<LO,GO,NO> > dummyExport;
1285  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1286  Rview.origMatrix->getRangeMap(),
1287  Acimport,
1288  dummyExport,
1289  labelList);
1290 }
1291 
1292 
1293 
1294 /*********************************************************************************************************/
1295 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1296 static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Rview,
1297  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Aview,
1298  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Pview,
1299  const LocalOrdinalViewType & Acol2PRow,
1300  const LocalOrdinalViewType & Acol2PRowImport,
1301  const LocalOrdinalViewType & Pcol2Accol,
1302  const LocalOrdinalViewType & PIcol2Accol,
1303  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode>& Ac,
1304  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Tpetra::KokkosCompat::KokkosOpenMPWrapperNode> > Acimport,
1305  const std::string& label,
1306  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1307 
1308  using Teuchos::RCP;
1309  using Tpetra::MatrixMatrix::UnmanagedView;
1310  #ifdef HAVE_TPETRA_MMM_TIMINGS
1311  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1312  using Teuchos::TimeMonitor;
1313  using Teuchos::rcp;
1314  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse LTGCore"))));
1315  #endif
1316 
1317  typedef Tpetra::KokkosCompat::KokkosOpenMPWrapperNode Node;
1318  typedef Scalar SC;
1319  typedef LocalOrdinal LO;
1320  typedef GlobalOrdinal GO;
1321  typedef Node NO;
1322  typedef Map<LO,GO,NO> map_type;
1324  typedef typename KCRS::StaticCrsGraphType graph_t;
1325  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1326  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1327  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1328  typedef typename KCRS::device_type device_t;
1329  typedef typename device_t::execution_space execution_space;
1330  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1331 
1332  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1333  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1334  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1335 
1336  // Sizes
1337  RCP<const map_type> Accolmap = Ac.getColMap();
1338  size_t m = Rview.origMatrix->getLocalNumRows();
1339  size_t n = Accolmap->getLocalNumElements();
1340 
1341  // Get raw Kokkos matrices, and the raw CSR views
1342  const KCRS & Rmat = Rview.origMatrix->getLocalMatrixDevice();
1343  const KCRS & Amat = Aview.origMatrix->getLocalMatrixDevice();
1344  const KCRS & Pmat = Pview.origMatrix->getLocalMatrixDevice();
1345  const KCRS & Cmat = Ac.getLocalMatrixDevice();
1346 
1347  c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1348  const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1349  lno_nnz_view_t Icolind;
1350  const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1351  scalar_view_t Cvals = Cmat.values;
1352  scalar_view_t Ivals;
1353 
1354  if (!Pview.importMatrix.is_null())
1355  {
1356  const KCRS& Imat = Pview.importMatrix->getLocalMatrixDevice();
1357  Irowptr = Imat.graph.row_map;
1358  Icolind = Imat.graph.entries;
1359  Ivals = Imat.values;
1360  }
1361 
1362  // Get my node / thread info (right from openmp or parameter list)
1363  size_t thread_max = Tpetra::KokkosCompat::KokkosOpenMPWrapperNode::execution_space().concurrency();
1364  if(!params.is_null()) {
1365  if(params->isParameter("openmp: ltg thread max"))
1366  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1367  }
1368 
1369  double thread_chunk = (double)(m) / thread_max;
1370 
1371  // For each row of R
1372  Kokkos::parallel_for("MMM::RAP::Reuse::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1373  {
1374  // Thread coordination stuff
1375  size_t my_thread_start = tid * thread_chunk;
1376  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1377 
1378  std::vector<size_t> c_status(n, INVALID);
1379 
1380  //count entries as they are added to Ac
1381  size_t OLD_ip = 0, CSR_ip = 0;
1382  // bmk: loop over the rows of R which are assigned to thread tid
1383  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1384  // First fill the c_status array w/ locations where we're allowed to
1385  // generate nonzeros for this row
1386  OLD_ip = Crowptr(i);
1387  CSR_ip = Crowptr(i+1);
1388  for (size_t k = OLD_ip; k < CSR_ip; k++) {
1389  c_status[Ccolind(k)] = k;
1390  // Reset values in the row of C
1391  Cvals(k) = SC_ZERO;
1392  }
1393 
1394  // mfh 27 Sep 2016: For each entry of R in the current row of R
1395  for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1396  LO k = Rcolind(kk); // local column index of current entry of R
1397  const SC Rik = Rvals(kk); // value of current entry of R
1398  if (Rik == SC_ZERO)
1399  continue; // skip explicitly stored zero values in R
1400  // For each entry of A in the current row of A
1401  for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1402  LO l = Acolind(ll); // local column index of current entry of A
1403  const SC Akl = Avals(ll); // value of current entry of A
1404  if (Akl == SC_ZERO)
1405  continue; // skip explicitly stored zero values in A
1406 
1407  if (Acol2PRow[l] != LO_INVALID) {
1408  // mfh 27 Sep 2016: If the entry of Acol2PRow
1409  // corresponding to the current entry of A is populated, then
1410  // the corresponding row of P is in P_local (i.e., it lives on
1411  // the calling process).
1412 
1413  // Local matrix
1414  size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1415 
1416  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1417  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1418  LO j = Pcolind(jj);
1419  LO Cij = Pcol2Accol(j);
1420  SC Plj = Pvals(jj);
1421  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1422  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1423  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1424 
1425 
1426  Cvals(c_status[Cij]) += Rik*Akl*Plj;
1427  }
1428  } else {
1429  // mfh 27 Sep 2016: If the entry of Acol2PRow
1430  // corresponding to the current entry of A is NOT populated (has
1431  // a flag "invalid" value), then the corresponding row of P is
1432  // in P_remote (i.e., it does not live on the calling process).
1433 
1434  // Remote matrix
1435  size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1436  for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1437  LO j = Icolind(jj);
1438  LO Cij = PIcol2Accol(j);
1439  SC Plj = Ivals(jj);
1440  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1441  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1442  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1443 
1444  Cvals(c_status[Cij]) += Rik*Akl*Plj;
1445  }
1446  }
1447  }
1448  }
1449  }
1450  });
1451  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
1452 }
1453 
1454 
1455 
1456 #endif
1457 
1458 
1459 }//ExtraKernels
1460 }//MatrixMatrix
1461 }//Tpetra
1462 
1463 
1464 #endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...