Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
44 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
45 #include "Tpetra_RowMatrixTransposer.hpp"
46 
47 namespace Tpetra {
48 
49 namespace MatrixMatrix{
50 
51 namespace ExtraKernels{
52 
53 
54 // Double product version
55 template<class CrsMatrixType>
56 size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
57  // Follows the NZ estimate in ML's ml_matmatmult.c
58  size_t Aest = 100, Best=100;
59  if (A.getNodeNumEntries() > 0)
60  Aest = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
61  if (B.getNodeNumEntries() > 0)
62  Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
63 
64  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
65  nnzperrow *= nnzperrow;
66 
67  return nnzperrow;
68 }
69 
70 
71 // Triple product version
72 template<class CrsMatrixType>
73 size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
74  size_t nnzPerRowA = 100, Pcols = 100;
75  if (A.getNodeNumEntries() > 0)
76  nnzPerRowA = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 9;
77  if (P.getNodeNumEntries() > 0)
78  Pcols = (P.getNodeNumCols() > 0) ? P.getNodeNumCols() : 100;
79  return (size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
80 }
81 
82 #if defined (HAVE_TPETRA_INST_OPENMP)
83 /*********************************************************************************************************/
84 template<class Scalar,
85  class LocalOrdinal,
86  class GlobalOrdinal,
87  class LocalOrdinalViewType>
88 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
89  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
90  const LocalOrdinalViewType & targetMapToOrigRow,
91  const LocalOrdinalViewType & targetMapToImportRow,
92  const LocalOrdinalViewType & Bcol2Ccol,
93  const LocalOrdinalViewType & Icol2Ccol,
94  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
95  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
96  const std::string& label,
97  const Teuchos::RCP<Teuchos::ParameterList>& params) {
98 #ifdef HAVE_TPETRA_MMM_TIMINGS
99  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
100  using Teuchos::TimeMonitor;
101  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix LTGCore"))));
102 #endif
103  using Teuchos::Array;
104  using Teuchos::ArrayRCP;
105  using Teuchos::ArrayView;
106  using Teuchos::RCP;
107  using Teuchos::rcp;
108 
109 
110  // Lots and lots of typedefs
112  // typedef typename KCRS::device_type device_t;
113  typedef typename KCRS::StaticCrsGraphType graph_t;
114  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
115  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
116  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
117  typedef typename KCRS::values_type::non_const_type scalar_view_t;
118 
119  // Unmanaged versions of the above
120  //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
121  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
122  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
123 
124  typedef Scalar SC;
125  typedef LocalOrdinal LO;
126  typedef GlobalOrdinal GO;
127  typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
128  typedef Map<LO,GO,NO> map_type;
129 
130  // NOTE (mfh 15 Sep 2017) This is specifically only for
131  // execution_space = Kokkos::OpenMP, so we neither need nor want
132  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
133  typedef NO::execution_space execution_space;
134  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
135 
136  // All of the invalid guys
137  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
138  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
139  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
140 
141  // Grab the Kokkos::SparseCrsMatrices & inner stuff
142  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
143  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
144 
145  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
146  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
147  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
148  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
149 
150  c_lno_view_t Irowptr;
151  lno_nnz_view_t Icolind;
152  scalar_view_t Ivals;
153  if(!Bview.importMatrix.is_null()) {
154  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
155  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
156  Ivals = Bview.importMatrix->getLocalMatrix().values;
157  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
158  }
159 
160  // Sizes
161  RCP<const map_type> Ccolmap = C.getColMap();
162  size_t m = Aview.origMatrix->getNodeNumRows();
163  size_t n = Ccolmap->getNodeNumElements();
164  size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
165 
166  // Get my node / thread info (right from openmp or parameter list)
167  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
168  if(!params.is_null()) {
169  if(params->isParameter("openmp: ltg thread max"))
170  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
171  }
172 
173  // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
174  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
175  // we will not touch these until the final copyout step
176  lno_nnz_view_t entriesC;
177  scalar_view_t valuesC;
178 
179  // add this, since we do the rowptr in place, we could figure this out
180  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
181  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
182 
183  // Thread-local memory
184  Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind",thread_max);
185  Kokkos::View<u_scalar_view_t*> tl_values("top_values",thread_max);
186 
187  double thread_chunk = (double)(m) / thread_max;
188 
189  // Run chunks of the matrix independently
190  Kokkos::parallel_for("MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
191  {
192  // Thread coordination stuff
193  size_t my_thread_start = tid * thread_chunk;
194  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
195  size_t my_thread_m = my_thread_stop - my_thread_start;
196 
197  // Size estimate
198  size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
199 
200  // Allocations
201  std::vector<size_t> c_status(n,INVALID);
202 
203  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
204  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
205 
206  // For each row of A/C
207  size_t CSR_ip = 0, OLD_ip = 0;
208  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
209  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
210  // on the calling process.
211  // JJE 10 Apr 2019 index directly into the rowptr
212  row_mapC(i) = CSR_ip;
213 
214  // mfh 27 Sep 2016: For each entry of A in the current row of A
215  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
216  LO Aik = Acolind(k); // local column index of current entry of A
217  const SC Aval = Avals(k); // value of current entry of A
218  if (Aval == SC_ZERO)
219  continue; // skip explicitly stored zero values in A
220 
221  if (targetMapToOrigRow(Aik) != LO_INVALID) {
222  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
223  // corresponding to the current entry of A is populated, then
224  // the corresponding row of B is in B_local (i.e., it lives on
225  // the calling process).
226 
227  // Local matrix
228  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
229 
230  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
231  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
232  LO Bkj = Bcolind(j);
233  LO Cij = Bcol2Ccol(Bkj);
234 
235  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
236  // New entry
237  c_status[Cij] = CSR_ip;
238  Ccolind(CSR_ip) = Cij;
239  Cvals(CSR_ip) = Aval*Bvals(j);
240  CSR_ip++;
241 
242  } else {
243  Cvals(c_status[Cij]) += Aval*Bvals(j);
244  }
245  }
246 
247  } else {
248  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
249  // corresponding to the current entry of A NOT populated (has
250  // a flag "invalid" value), then the corresponding row of B is
251  // in B_local (i.e., it lives on the calling process).
252 
253  // Remote matrix
254  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
255  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
256  LO Ikj = Icolind(j);
257  LO Cij = Icol2Ccol(Ikj);
258 
259  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
260  // New entry
261  c_status[Cij] = CSR_ip;
262  Ccolind(CSR_ip) = Cij;
263  Cvals(CSR_ip) = Aval*Ivals(j);
264  CSR_ip++;
265 
266  } else {
267  Cvals(c_status[Cij]) += Aval*Ivals(j);
268  }
269  }
270  }
271  }
272 
273  // Resize for next pass if needed
274  if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
275  CSR_alloc *= 2;
276  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
277  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
278  }
279  OLD_ip = CSR_ip;
280  }
281  thread_total_nnz(tid) = CSR_ip;
282  tl_colind(tid) = Ccolind;
283  tl_values(tid) = Cvals;
284  });
285 
286  // Do the copy out
287  copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
288 
289 #ifdef HAVE_TPETRA_MMM_TIMINGS
290  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort"))));
291 #endif
292  // Sort & set values
293  if (params.is_null() || params->get("sort entries",true))
294  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
295  C.setAllValues(row_mapC,entriesC,valuesC);
296 
297 }
298 
299 /*********************************************************************************************************/
300 template<class Scalar,
301  class LocalOrdinal,
302  class GlobalOrdinal,
303  class LocalOrdinalViewType>
304 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
305  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
306  const LocalOrdinalViewType & targetMapToOrigRow,
307  const LocalOrdinalViewType & targetMapToImportRow,
308  const LocalOrdinalViewType & Bcol2Ccol,
309  const LocalOrdinalViewType & Icol2Ccol,
310  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
311  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
312  const std::string& label,
313  const Teuchos::RCP<Teuchos::ParameterList>& params) {
314 #ifdef HAVE_TPETRA_MMM_TIMINGS
315  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
316  using Teuchos::TimeMonitor;
317  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse LTGCore"))));
318 #endif
319 
320  using Teuchos::Array;
321  using Teuchos::ArrayRCP;
322  using Teuchos::ArrayView;
323  using Teuchos::RCP;
324  using Teuchos::rcp;
325 
326  // Lots and lots of typedefs
328  // typedef typename KCRS::device_type device_t;
329  typedef typename KCRS::StaticCrsGraphType graph_t;
330  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
331  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
332  typedef typename KCRS::values_type::non_const_type scalar_view_t;
333 
334  typedef Scalar SC;
335  typedef LocalOrdinal LO;
336  typedef GlobalOrdinal GO;
337  typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
338  typedef Map<LO,GO,NO> map_type;
339 
340  // NOTE (mfh 15 Sep 2017) This is specifically only for
341  // execution_space = Kokkos::OpenMP, so we neither need nor want
342  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
343  typedef NO::execution_space execution_space;
344  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
345 
346  // All of the invalid guys
347  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
348  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
349  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
350 
351  // Grab the Kokkos::SparseCrsMatrices & inner stuff
352  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
353  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
354  const KCRS & Cmat = C.getLocalMatrix();
355 
356  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
357  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
358  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
359  scalar_view_t Cvals = Cmat.values;
360 
361  c_lno_view_t Irowptr;
362  c_lno_nnz_view_t Icolind;
363  scalar_view_t Ivals;
364  if(!Bview.importMatrix.is_null()) {
365  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
366  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
367  Ivals = Bview.importMatrix->getLocalMatrix().values;
368  }
369 
370  // Sizes
371  RCP<const map_type> Ccolmap = C.getColMap();
372  size_t m = Aview.origMatrix->getNodeNumRows();
373  size_t n = Ccolmap->getNodeNumElements();
374 
375  // Get my node / thread info (right from openmp or parameter list)
376  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
377  if(!params.is_null()) {
378  if(params->isParameter("openmp: ltg thread max"))
379  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
380  }
381 
382  double thread_chunk = (double)(m) / thread_max;
383 
384  // Run chunks of the matrix independently
385  Kokkos::parallel_for("MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
386  {
387  // Thread coordination stuff
388  size_t my_thread_start = tid * thread_chunk;
389  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
390 
391  // Allocations
392  std::vector<size_t> c_status(n,INVALID);
393 
394  // For each row of A/C
395  size_t CSR_ip = 0, OLD_ip = 0;
396  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
397  // First fill the c_status array w/ locations where we're allowed to
398  // generate nonzeros for this row
399  OLD_ip = Crowptr(i);
400  CSR_ip = Crowptr(i+1);
401  for (size_t k = OLD_ip; k < CSR_ip; k++) {
402  c_status[Ccolind(k)] = k;
403  // Reset values in the row of C
404  Cvals(k) = SC_ZERO;
405  }
406 
407  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
408  LO Aik = Acolind(k);
409  const SC Aval = Avals(k);
410  if (Aval == SC_ZERO)
411  continue;
412 
413  if (targetMapToOrigRow(Aik) != LO_INVALID) {
414  // Local matrix
415  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
416 
417  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
418  LO Bkj = Bcolind(j);
419  LO Cij = Bcol2Ccol(Bkj);
420 
421  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
422  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
423  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
424 
425  Cvals(c_status[Cij]) += Aval * Bvals(j);
426  }
427  } else {
428  // Remote matrix
429  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
430  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
431  LO Ikj = Icolind(j);
432  LO Cij = Icol2Ccol(Ikj);
433 
434  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
435  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
436  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
437 
438  Cvals(c_status[Cij]) += Aval * Ivals(j);
439  }
440  }
441  }
442  }
443  });
444 
445  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
446 }
447 
448 /*********************************************************************************************************/
449 template<class Scalar,
450  class LocalOrdinal,
451  class GlobalOrdinal,
452  class LocalOrdinalViewType>
453 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
454  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
455  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
456  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
457  const LocalOrdinalViewType & targetMapToOrigRow,
458  const LocalOrdinalViewType & targetMapToImportRow,
459  const LocalOrdinalViewType & Bcol2Ccol,
460  const LocalOrdinalViewType & Icol2Ccol,
461  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
462  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
463  const std::string& label,
464  const Teuchos::RCP<Teuchos::ParameterList>& params) {
465 #ifdef HAVE_TPETRA_MMM_TIMINGS
466  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
467  using Teuchos::TimeMonitor;
468  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix LTGCore"))));
469 #endif
470 
471  using Teuchos::Array;
472  using Teuchos::ArrayRCP;
473  using Teuchos::ArrayView;
474  using Teuchos::RCP;
475  using Teuchos::rcp;
476 
477  // Lots and lots of typedefs
478  typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
480  // typedef typename KCRS::device_type device_t;
481  typedef typename KCRS::StaticCrsGraphType graph_t;
482  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
483  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
484  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
485  typedef typename KCRS::values_type::non_const_type scalar_view_t;
486 
487  // Unmanaged versions of the above
488  //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
489  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
490  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
491 
492  // Jacobi-specific
493  typedef typename scalar_view_t::memory_space scalar_memory_space;
494 
495  typedef Scalar SC;
496  typedef LocalOrdinal LO;
497  typedef GlobalOrdinal GO;
498  typedef Node NO;
499  typedef Map<LO,GO,NO> map_type;
500 
501  // NOTE (mfh 15 Sep 2017) This is specifically only for
502  // execution_space = Kokkos::OpenMP, so we neither need nor want
503  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
504  typedef NO::execution_space execution_space;
505  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
506 
507  // All of the invalid guys
508  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
509  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
510  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
511 
512  // Grab the Kokkos::SparseCrsMatrices & inner stuff
513  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
514  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
515 
516  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
517  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
518  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
519  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
520 
521  c_lno_view_t Irowptr;
522  lno_nnz_view_t Icolind;
523  scalar_view_t Ivals;
524  if(!Bview.importMatrix.is_null()) {
525  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
526  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
527  Ivals = Bview.importMatrix->getLocalMatrix().values;
528  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
529  }
530 
531  // Jacobi-specific inner stuff
532  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
533 
534  // Sizes
535  RCP<const map_type> Ccolmap = C.getColMap();
536  size_t m = Aview.origMatrix->getNodeNumRows();
537  size_t n = Ccolmap->getNodeNumElements();
538  size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
539 
540  // Get my node / thread info (right from openmp)
541  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
542  if(!params.is_null()) {
543  if(params->isParameter("openmp: ltg thread max"))
544  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
545  }
546 
547  // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
548  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
549  // we will not touch these until the final copyout step
550  lno_nnz_view_t entriesC;
551  scalar_view_t valuesC;
552 
553  // add this, since we do the rowptr in place, we could figure this out
554  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
555  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
556 
557  // Thread-local memory
558  Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind",thread_max);
559  Kokkos::View<u_scalar_view_t*> tl_values("top_values",thread_max);
560 
561  double thread_chunk = (double)(m) / thread_max;
562 
563  // Run chunks of the matrix independently
564  Kokkos::parallel_for("Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
565  {
566  // Thread coordination stuff
567  size_t my_thread_start = tid * thread_chunk;
568  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
569  size_t my_thread_m = my_thread_stop - my_thread_start;
570 
571  // Size estimate
572  size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
573 
574  // Allocations
575  std::vector<size_t> c_status(n,INVALID);
576 
577  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
578  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
579 
580  // For each row of A/C
581  size_t CSR_ip = 0, OLD_ip = 0;
582  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
583  // printf("CMS: row %d CSR_alloc = %d\n",(int)i,(int)CSR_alloc);fflush(stdout);
584  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
585  // on the calling process.
586  // JJE: directly access the rowptr here (indexed using our loop var)
587  row_mapC(i) = CSR_ip;
588  // NOTE: Vector::getLocalView returns a rank 2 view here
589  SC minusOmegaDval = -omega*Dvals(i,0);
590 
591  // Entries of B
592  for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
593  const SC Bval = Bvals(j);
594  if (Bval == SC_ZERO)
595  continue;
596  LO Bij = Bcolind(j);
597  LO Cij = Bcol2Ccol(Bij);
598 
599  // Assume no repeated entries in B
600  c_status[Cij] = CSR_ip;
601  Ccolind(CSR_ip) = Cij;
602  Cvals(CSR_ip) = Bvals[j];
603  CSR_ip++;
604  }
605 
606  // Entries of -omega * Dinv * A * B
607  // mfh 27 Sep 2016: For each entry of A in the current row of A
608  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
609  LO Aik = Acolind(k); // local column index of current entry of A
610  const SC Aval = Avals(k); // value of current entry of A
611  if (Aval == SC_ZERO)
612  continue; // skip explicitly stored zero values in A
613 
614  if (targetMapToOrigRow(Aik) != LO_INVALID) {
615  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
616  // corresponding to the current entry of A is populated, then
617  // the corresponding row of B is in B_local (i.e., it lives on
618  // the calling process).
619 
620  // Local matrix
621  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
622 
623  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
624  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
625  LO Bkj = Bcolind(j);
626  LO Cij = Bcol2Ccol(Bkj);
627 
628  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
629  // New entry
630  c_status[Cij] = CSR_ip;
631  Ccolind(CSR_ip) = Cij;
632  Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
633  CSR_ip++;
634 
635  } else {
636  Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
637  }
638  }
639 
640  } else {
641  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
642  // corresponding to the current entry of A NOT populated (has
643  // a flag "invalid" value), then the corresponding row of B is
644  // in B_local (i.e., it lives on the calling process).
645 
646  // Remote matrix
647  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
648  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
649  LO Ikj = Icolind(j);
650  LO Cij = Icol2Ccol(Ikj);
651 
652  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
653  // New entry
654  c_status[Cij] = CSR_ip;
655  Ccolind(CSR_ip) = Cij;
656  Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
657  CSR_ip++;
658 
659  } else {
660  Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
661  }
662  }
663  }
664  }
665 
666  // Resize for next pass if needed
667  if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
668  CSR_alloc *= 2;
669  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
670  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
671  }
672  OLD_ip = CSR_ip;
673  }
674  thread_total_nnz(tid) = CSR_ip;
675  tl_colind(tid) = Ccolind;
676  tl_values(tid) = Cvals;
677  });
678 
679 
680  // Do the copy out (removed the tl_rowptr!)
681  copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
682 
683 
684 #ifdef HAVE_TPETRA_MMM_TIMINGS
685  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix OpenMPSort"))));
686 #endif
687  // Sort & set values
688  if (params.is_null() || params->get("sort entries",true))
689  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
690  C.setAllValues(row_mapC,entriesC,valuesC);
691 
692 }
693 
694 
695 
696 /*********************************************************************************************************/
697 template<class Scalar,
698  class LocalOrdinal,
699  class GlobalOrdinal,
700  class LocalOrdinalViewType>
701 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
702  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
703  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
704  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
705  const LocalOrdinalViewType & targetMapToOrigRow,
706  const LocalOrdinalViewType & targetMapToImportRow,
707  const LocalOrdinalViewType & Bcol2Ccol,
708  const LocalOrdinalViewType & Icol2Ccol,
709  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
710  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
711  const std::string& label,
712  const Teuchos::RCP<Teuchos::ParameterList>& params) {
713 #ifdef HAVE_TPETRA_MMM_TIMINGS
714  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
715  using Teuchos::TimeMonitor;
716  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse LTGCore"))));
717 #endif
718  using Teuchos::Array;
719  using Teuchos::ArrayRCP;
720  using Teuchos::ArrayView;
721  using Teuchos::RCP;
722  using Teuchos::rcp;
723 
724  // Lots and lots of typedefs
725  typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
727  // typedef typename KCRS::device_type device_t;
728  typedef typename KCRS::StaticCrsGraphType graph_t;
729  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
730  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
731  typedef typename KCRS::values_type::non_const_type scalar_view_t;
732 
733  // Jacobi-specific
734  typedef typename scalar_view_t::memory_space scalar_memory_space;
735 
736  typedef Scalar SC;
737  typedef LocalOrdinal LO;
738  typedef GlobalOrdinal GO;
739  typedef Node NO;
740  typedef Map<LO,GO,NO> map_type;
741 
742  // NOTE (mfh 15 Sep 2017) This is specifically only for
743  // execution_space = Kokkos::OpenMP, so we neither need nor want
744  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
745  typedef NO::execution_space execution_space;
746  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
747 
748  // All of the invalid guys
749  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
750  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
751  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
752 
753  // Grab the Kokkos::SparseCrsMatrices & inner stuff
754  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
755  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
756  const KCRS & Cmat = C.getLocalMatrix();
757 
758  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
759  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
760  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
761  scalar_view_t Cvals = Cmat.values;
762 
763  c_lno_view_t Irowptr;
764  c_lno_nnz_view_t Icolind;
765  scalar_view_t Ivals;
766  if(!Bview.importMatrix.is_null()) {
767  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
768  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
769  Ivals = Bview.importMatrix->getLocalMatrix().values;
770  }
771 
772  // Jacobi-specific inner stuff
773  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
774 
775  // Sizes
776  RCP<const map_type> Ccolmap = C.getColMap();
777  size_t m = Aview.origMatrix->getNodeNumRows();
778  size_t n = Ccolmap->getNodeNumElements();
779 
780  // Get my node / thread info (right from openmp or parameter list)
781  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
782  if(!params.is_null()) {
783  if(params->isParameter("openmp: ltg thread max"))
784  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
785  }
786 
787  double thread_chunk = (double)(m) / thread_max;
788 
789  // Run chunks of the matrix independently
790  Kokkos::parallel_for("Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
791  {
792  // Thread coordination stuff
793  size_t my_thread_start = tid * thread_chunk;
794  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
795 
796  // Allocations
797  std::vector<size_t> c_status(n,INVALID);
798 
799  // For each row of A/C
800  size_t CSR_ip = 0, OLD_ip = 0;
801  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
802  // First fill the c_status array w/ locations where we're allowed to
803  // generate nonzeros for this row
804  OLD_ip = Crowptr(i);
805  CSR_ip = Crowptr(i+1);
806  // NOTE: Vector::getLocalView returns a rank 2 view here
807  SC minusOmegaDval = -omega*Dvals(i,0);
808 
809  for (size_t k = OLD_ip; k < CSR_ip; k++) {
810  c_status[Ccolind(k)] = k;
811  // Reset values in the row of C
812  Cvals(k) = SC_ZERO;
813  }
814 
815  // Entries of B
816  for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
817  const SC Bval = Bvals(j);
818  if (Bval == SC_ZERO)
819  continue;
820  LO Bij = Bcolind(j);
821  LO Cij = Bcol2Ccol(Bij);
822 
823  // Assume no repeated entries in B
824  Cvals(c_status[Cij]) += Bvals(j);
825  CSR_ip++;
826  }
827 
828 
829  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
830  LO Aik = Acolind(k);
831  const SC Aval = Avals(k);
832  if (Aval == SC_ZERO)
833  continue;
834 
835  if (targetMapToOrigRow(Aik) != LO_INVALID) {
836  // Local matrix
837  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
838 
839  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
840  LO Bkj = Bcolind(j);
841  LO Cij = Bcol2Ccol(Bkj);
842 
843  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
844  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
845  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
846 
847  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
848  }
849  } else {
850  // Remote matrix
851  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
852  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
853  LO Ikj = Icolind(j);
854  LO Cij = Icol2Ccol(Ikj);
855 
856  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
857  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
858  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
859 
860  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
861  }
862  }
863  }
864  }
865  });
866 
867  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
868 }
869 
870 
871 /*********************************************************************************************************/
872 template<class InColindArrayType, class InValsArrayType,
873  class OutRowptrType, class OutColindType, class OutValsType>
874 void copy_out_from_thread_memory(const OutColindType& thread_total_nnz,
875  const InColindArrayType& Incolind,
876  const InValsArrayType& Invalues,
877  const size_t m,
878  const double thread_chunk,
879  OutRowptrType& row_mapC,
880  OutColindType& entriesC,
881  OutValsType& valuesC ) {
882  typedef OutRowptrType lno_view_t;
883  typedef OutColindType lno_nnz_view_t;
884  typedef OutValsType scalar_view_t;
885  typedef typename lno_view_t::execution_space execution_space;
886  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
887 
888  // Generate the starting nnz number per thread
889  size_t thread_max = Incolind.size();
890  size_t c_nnz_size=0;
891  // since this kernel is 'low thread count' is very likely, that we could
892  // sum over the thread_total_nnz and not go parallel, but since it is a view
893  // we don't know where the memory actually lives... so we kinda need to go parallel
894 
895  lno_view_t thread_start_nnz("thread_nnz",thread_max+1);
896  Kokkos::parallel_scan("LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (const size_t i, size_t& update, const bool final) {
897  size_t mynnz = thread_total_nnz(i);
898  if(final) thread_start_nnz(i) = update;
899  update+=mynnz;
900  if(final && i+1==thread_max) thread_start_nnz(i+1)=update;
901  });
902  c_nnz_size = thread_start_nnz(thread_max);
903 
904  // 2019 Apr 10 JJE: update the rowptr's final entry here
905  row_mapC(m) = thread_start_nnz(thread_max);
906 
907  // Allocate output
908  lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size); entriesC = entriesC_;
909  scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size); valuesC = valuesC_;
910 
911  // Copy out
912  Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid) {
913  const size_t my_thread_start = tid * thread_chunk;
914  const size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
915  const size_t nnz_thread_start = thread_start_nnz(tid);
916  // we need this info, since we did the rowptr in place
917  const size_t nnz_thread_stop = thread_start_nnz(tid+1);
918 
919  // There are two fundamental operations:
920  // * Updateing the rowptr with the correct offset
921  // * copying entries and values
922 
923  // First, update the rowptr, it since we did it inplace, it is a += operation
924  // in the paper, I experimented with a coloring scheme that had threads do
925  // do their copies in different orders. It wasn't obvious it was beneficial
926  // but it can be replicated, by choosing the array to copy first based on your
927  // thread id % 3
928  if (my_thread_start != 0 ) {
929  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
930  row_mapC(i) += nnz_thread_start;
931  }
932  }
933 
934  // try to Kokkos::single() the alloc here. It should implicitly barrier
935  // thread 0 doesn't copy the rowptr, so it could hit the single first
936  // in the paper, I played a game that effectively got LTG down to a single
937  // OpenMP barrier. But doing that requires the ability to spawn a single
938  // parallel region. The Scan above was implemented using atomic adds
939  // and the barrier was only needed so you could allocate
940  //
941  // Since we can't spawn a single region, we could move the allocations
942  // here, and using 'single'. Most likely, thread 0 will hit it first allowing
943  // the other threads to update the rowptr while it allocates.
944 
945 
946  // next, bulk copy the vals/colind arrays
947  const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
948  for (size_t i = 0; i < my_num_nnz; ++i) {
949  entriesC(nnz_thread_start + i) = Incolind(tid)(i);
950  valuesC(nnz_thread_start + i) = Invalues(tid)(i);
951  }
952 
953  //Free the unamanged views, let each thread deallocate its memory
954  // May need to cast away const here..
955  if(Incolind(tid).data()) free(Incolind(tid).data());
956  if(Invalues(tid).data()) free(Invalues(tid).data());
957  });
958 }//end copy_out
959 
960 #endif // OpenMP
961 
962 
963 /*********************************************************************************************************/
964 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
965 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
966  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
967  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
968  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
969  const LocalOrdinalViewType & Acol2Brow,
970  const LocalOrdinalViewType & Acol2Irow,
971  const LocalOrdinalViewType & Bcol2Ccol,
972  const LocalOrdinalViewType & Icol2Ccol,
973  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
974  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
975  const std::string& label,
976  const Teuchos::RCP<Teuchos::ParameterList>& params) {
977 #ifdef HAVE_TPETRA_MMM_TIMINGS
978  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
979  using Teuchos::TimeMonitor;
980  Teuchos::RCP<TimeMonitor> MM = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK"))));
981  Teuchos::RCP<TimeMonitor> MM2 = Teuchos::rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Multiply"))));
982  using Teuchos::rcp;
983 #endif
984  typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
985 
986  // This kernel computes (I-omega Dinv A) B the slow way (for generality's sake, you must understand)
987 
988  // 1) Multiply A*B
989  Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(new Matrix_t(C.getRowMap(),0));
990  Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(" MSAK"),params);
991 
992 #ifdef HAVE_TPETRA_MMM_TIMINGS
993  MM2=Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Scale"))));
994 #endif
995 
996  // 2) Scale A by Dinv
997  AB->leftScale(Dinv);
998 
999 #ifdef HAVE_TPETRA_MMM_TIMINGS
1000  MM2=Teuchos::null; MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Add"))));
1001 #endif
1002 
1003  // 3) Add [-omega Dinv A] + B
1004  Teuchos::ParameterList jparams;
1005  if(!params.is_null()) {
1006  jparams = *params;
1007  jparams.set("label",label+std::string(" MSAK Add"));
1008  }
1009  Scalar one = Teuchos::ScalarTraits<Scalar>::one();
1010  Tpetra::MatrixMatrix::add(one,false,*Bview.origMatrix,Scalar(-omega),false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,false));
1011 #ifdef HAVE_TPETRA_MMM_TIMINGS
1012  MM2=Teuchos::null;
1013 #endif
1014  }// jacobi_A_B_newmatrix_MultiplyScaleAddKernel
1015 
1016 
1017 
1018 #if defined (HAVE_TPETRA_INST_OPENMP)
1019 /*********************************************************************************************************/
1020 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1021 static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
1022  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1023  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
1024  const LocalOrdinalViewType & Acol2PRow,
1025  const LocalOrdinalViewType & Acol2PRowImport,
1026  const LocalOrdinalViewType & Pcol2Accol,
1027  const LocalOrdinalViewType & PIcol2Accol,
1028  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
1029  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
1030  const std::string& label,
1031  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1032 
1033  using Teuchos::RCP;
1034  using Tpetra::MatrixMatrix::UnmanagedView;
1035  #ifdef HAVE_TPETRA_MMM_TIMINGS
1036  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1037  using Teuchos::rcp;
1038  using Teuchos::TimeMonitor;
1039  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix LTGCore"))));
1040  #endif
1041 
1042  typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1043  typedef Scalar SC;
1044  typedef LocalOrdinal LO;
1045  typedef GlobalOrdinal GO;
1046  typedef Node NO;
1047  typedef Map<LO,GO,NO> map_type;
1049  typedef typename KCRS::StaticCrsGraphType graph_t;
1050  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1051  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1052  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1053  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1054  typedef typename KCRS::device_type device_t;
1055  typedef typename device_t::execution_space execution_space;
1056  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1057 
1058  // Unmanaged versions of the above
1059  typedef UnmanagedView<lno_view_t> u_lno_view_t;
1060  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1061  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1062 
1063  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1064  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1065 
1066  // Sizes
1067  RCP<const map_type> Accolmap = Ac.getColMap();
1068  size_t m = Rview.origMatrix->getNodeNumRows();
1069  size_t n = Accolmap->getNodeNumElements();
1070 
1071  // Get raw Kokkos matrices, and the raw CSR views
1072  const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
1073  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1074  const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
1075 
1076  c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Irowptr;
1077  const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries;
1078  lno_nnz_view_t Icolind;
1079  const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1080  scalar_view_t Ivals;
1081 
1082  if (!Pview.importMatrix.is_null())
1083  {
1084  const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
1085  Irowptr = Imat.graph.row_map;
1086  Icolind = Imat.graph.entries;
1087  Ivals = Imat.values;
1088  }
1089 
1090  // Classic csr assembly (low memory edition)
1091  //
1092  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
1093  // The method loops over rows of R, and may resize after processing
1094  // each row. Chris Siefert says that this reflects experience in
1095  // ML; for the non-threaded case, ML found it faster to spend less
1096  // effort on estimation and risk an occasional reallocation.
1097 
1098  size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1099  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1100 
1101  // Get my node / thread info (right from openmp or parameter list)
1102  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
1103  if(!params.is_null()) {
1104  if(params->isParameter("openmp: ltg thread max"))
1105  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1106  }
1107 
1108  double thread_chunk = (double)(m) / thread_max;
1109 
1110  // we can construct the rowptr inplace, allocate here and fault in parallel
1111  lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
1112  // we do not touch these until copy out
1113  lno_nnz_view_t entriesAc;
1114  scalar_view_t valuesAc;
1115 
1116  // add this, since we do the rowptr in place, we could figure this out
1117  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
1118  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
1119 
1120  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1121  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1122  //
1123  // For column index Aik in row i of A, Acol2PRow[Aik] tells
1124  // you whether the corresponding row of P belongs to P_local
1125  // ("orig") or P_remote ("Import").
1126 
1127  // Thread-local memory
1128  Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind", thread_max);
1129  Kokkos::View<u_scalar_view_t*> tl_values("top_values", thread_max);
1130 
1131  // For each row of R
1132  Kokkos::parallel_for("MMM::RAP::NewMatrix::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1133  {
1134  // Thread coordination stuff
1135  size_t my_thread_start = tid * thread_chunk;
1136  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1137  size_t my_thread_m = my_thread_stop - my_thread_start;
1138 
1139  size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
1140 
1141  std::vector<size_t> ac_status(n, INVALID);
1142 
1143  //manually allocate the thread-local storage for Ac
1144  u_lno_view_t Acrowptr((typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
1145  u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1146  u_scalar_view_t Acvals((typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1147 
1148  //count entries as they are added to Ac
1149  size_t nnz = 0, nnz_old = 0;
1150  // bmk: loop over the rows of R which are assigned to thread tid
1151  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1152  // directly index into the rowptr
1153  rowmapAc(i) = nnz;
1154  // mfh 27 Sep 2016: For each entry of R in the current row of R
1155  for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1156  LO k = Rcolind(kk); // local column index of current entry of R
1157  const SC Rik = Rvals(kk); // value of current entry of R
1158  if (Rik == SC_ZERO)
1159  continue; // skip explicitly stored zero values in R
1160  // For each entry of A in the current row of A
1161  for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1162  LO l = Acolind(ll); // local column index of current entry of A
1163  const SC Akl = Avals(ll); // value of current entry of A
1164  if (Akl == SC_ZERO)
1165  continue; // skip explicitly stored zero values in A
1166 
1167  if (Acol2PRow[l] != LO_INVALID) {
1168  // mfh 27 Sep 2016: If the entry of Acol2PRow
1169  // corresponding to the current entry of A is populated, then
1170  // the corresponding row of P is in P_local (i.e., it lives on
1171  // the calling process).
1172 
1173  // Local matrix
1174  size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1175 
1176  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1177  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1178  LO j = Pcolind(jj);
1179  LO Acj = Pcol2Accol(j);
1180  SC Plj = Pvals(jj);
1181 
1182  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1183  #ifdef HAVE_TPETRA_DEBUG
1184  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1185  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1186  std::runtime_error,
1187  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1188  #endif
1189  // New entry
1190  ac_status[Acj] = nnz;
1191  Accolind(nnz) = Acj;
1192  Acvals(nnz) = Rik*Akl*Plj;
1193  nnz++;
1194  } else {
1195  Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1196  }
1197  }
1198  } else {
1199  // mfh 27 Sep 2016: If the entry of Acol2PRow
1200  // corresponding to the current entry of A is NOT populated (has
1201  // a flag "invalid" value), then the corresponding row of P is
1202  // in P_remote (i.e., it does not live on the calling process).
1203 
1204  // Remote matrix
1205  size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1206  for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1207  LO j = Icolind(jj);
1208  LO Acj = PIcol2Accol(j);
1209  SC Plj = Ivals(jj);
1210 
1211  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1212  #ifdef HAVE_TPETRA_DEBUG
1213  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1214  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1215  std::runtime_error,
1216  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1217  #endif
1218  // New entry
1219  ac_status[Acj] = nnz;
1220  Accolind(nnz) = Acj;
1221  Acvals(nnz) = Rik*Akl*Plj;
1222  nnz++;
1223  } else {
1224  Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1225  }
1226  }
1227  }
1228  }
1229  }
1230  // Resize for next pass if needed
1231  if (nnz + n > nnzAllocated) {
1232  nnzAllocated *= 2;
1233  Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1234  Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1235  }
1236  nnz_old = nnz;
1237  }
1238  thread_total_nnz(tid) = nnz;
1239  tl_colind(tid) = Accolind;
1240  tl_values(tid) = Acvals;
1241  });
1242  #ifdef HAVE_TPETRA_MMM_TIMINGS
1243  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix copy from thread local"))));
1244  #endif
1245 
1246  copy_out_from_thread_memory(thread_total_nnz,tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1247 
1248  #ifdef HAVE_TPETRA_MMM_TIMINGS
1249  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1250  #endif
1251 
1252  // Final sort & set of CRS arrays
1253  Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1254  // mfh 27 Sep 2016: This just sets pointers.
1255  Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1256 
1257  #ifdef HAVE_TPETRA_MMM_TIMINGS
1258  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1259  #endif
1260 
1261  // Final FillComplete
1262  //
1263  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1264  // Import (from domain Map to column Map) construction (which costs
1265  // lots of communication) by taking the previously constructed
1266  // Import object. We should be able to do this without interfering
1267  // with the implementation of the local part of sparse matrix-matrix
1268  // multiply above.
1269  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1270  labelList->set("Timer Label",label);
1271  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1272  RCP<const Export<LO,GO,NO> > dummyExport;
1273  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1274  Rview.origMatrix->getRangeMap(),
1275  Acimport,
1276  dummyExport,
1277  labelList);
1278 }
1279 
1280 
1281 
1282 /*********************************************************************************************************/
1283 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1284 static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
1285  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1286  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
1287  const LocalOrdinalViewType & Acol2PRow,
1288  const LocalOrdinalViewType & Acol2PRowImport,
1289  const LocalOrdinalViewType & Pcol2Accol,
1290  const LocalOrdinalViewType & PIcol2Accol,
1291  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
1292  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
1293  const std::string& label,
1294  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1295 
1296  using Teuchos::RCP;
1297  using Tpetra::MatrixMatrix::UnmanagedView;
1298  #ifdef HAVE_TPETRA_MMM_TIMINGS
1299  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1300  using Teuchos::TimeMonitor;
1301  using Teuchos::rcp;
1302  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse LTGCore"))));
1303  #endif
1304 
1305  typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1306  typedef Scalar SC;
1307  typedef LocalOrdinal LO;
1308  typedef GlobalOrdinal GO;
1309  typedef Node NO;
1310  typedef Map<LO,GO,NO> map_type;
1312  typedef typename KCRS::StaticCrsGraphType graph_t;
1313  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1314  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1315  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1316  typedef typename KCRS::device_type device_t;
1317  typedef typename device_t::execution_space execution_space;
1318  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1319 
1320  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1321  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1322  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1323 
1324  // Sizes
1325  RCP<const map_type> Accolmap = Ac.getColMap();
1326  size_t m = Rview.origMatrix->getNodeNumRows();
1327  size_t n = Accolmap->getNodeNumElements();
1328 
1329  // Get raw Kokkos matrices, and the raw CSR views
1330  const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
1331  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1332  const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
1333  const KCRS & Cmat = Ac.getLocalMatrix();
1334 
1335  c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1336  const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1337  lno_nnz_view_t Icolind;
1338  const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1339  scalar_view_t Cvals = Cmat.values;
1340  scalar_view_t Ivals;
1341 
1342  if (!Pview.importMatrix.is_null())
1343  {
1344  const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
1345  Irowptr = Imat.graph.row_map;
1346  Icolind = Imat.graph.entries;
1347  Ivals = Imat.values;
1348  }
1349 
1350  // Get my node / thread info (right from openmp or parameter list)
1351  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
1352  if(!params.is_null()) {
1353  if(params->isParameter("openmp: ltg thread max"))
1354  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1355  }
1356 
1357  double thread_chunk = (double)(m) / thread_max;
1358 
1359  // For each row of R
1360  Kokkos::parallel_for("MMM::RAP::Reuse::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1361  {
1362  // Thread coordination stuff
1363  size_t my_thread_start = tid * thread_chunk;
1364  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1365 
1366  std::vector<size_t> c_status(n, INVALID);
1367 
1368  //count entries as they are added to Ac
1369  size_t OLD_ip = 0, CSR_ip = 0;
1370  // bmk: loop over the rows of R which are assigned to thread tid
1371  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1372  // First fill the c_status array w/ locations where we're allowed to
1373  // generate nonzeros for this row
1374  OLD_ip = Crowptr(i);
1375  CSR_ip = Crowptr(i+1);
1376  for (size_t k = OLD_ip; k < CSR_ip; k++) {
1377  c_status[Ccolind(k)] = k;
1378  // Reset values in the row of C
1379  Cvals(k) = SC_ZERO;
1380  }
1381 
1382  // mfh 27 Sep 2016: For each entry of R in the current row of R
1383  for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1384  LO k = Rcolind(kk); // local column index of current entry of R
1385  const SC Rik = Rvals(kk); // value of current entry of R
1386  if (Rik == SC_ZERO)
1387  continue; // skip explicitly stored zero values in R
1388  // For each entry of A in the current row of A
1389  for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1390  LO l = Acolind(ll); // local column index of current entry of A
1391  const SC Akl = Avals(ll); // value of current entry of A
1392  if (Akl == SC_ZERO)
1393  continue; // skip explicitly stored zero values in A
1394 
1395  if (Acol2PRow[l] != LO_INVALID) {
1396  // mfh 27 Sep 2016: If the entry of Acol2PRow
1397  // corresponding to the current entry of A is populated, then
1398  // the corresponding row of P is in P_local (i.e., it lives on
1399  // the calling process).
1400 
1401  // Local matrix
1402  size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1403 
1404  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1405  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1406  LO j = Pcolind(jj);
1407  LO Cij = Pcol2Accol(j);
1408  SC Plj = Pvals(jj);
1409  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1410  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1411  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1412 
1413 
1414  Cvals(c_status[Cij]) += Rik*Akl*Plj;
1415  }
1416  } else {
1417  // mfh 27 Sep 2016: If the entry of Acol2PRow
1418  // corresponding to the current entry of A is NOT populated (has
1419  // a flag "invalid" value), then the corresponding row of P is
1420  // in P_remote (i.e., it does not live on the calling process).
1421 
1422  // Remote matrix
1423  size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1424  for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1425  LO j = Icolind(jj);
1426  LO Cij = PIcol2Accol(j);
1427  SC Plj = Ivals(jj);
1428  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1429  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1430  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1431 
1432  Cvals(c_status[Cij]) += Rik*Akl*Plj;
1433  }
1434  }
1435  }
1436  }
1437  }
1438  });
1439  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
1440 }
1441 
1442 
1443 
1444 #endif
1445 
1446 
1447 }//ExtraKernels
1448 }//MatrixMatrix
1449 }//Tpetra
1450 
1451 
1452 #endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...