Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
TpetraExt_MatrixMatrix_ExtraKernels_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 
42 #ifndef TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
43 #define TPETRA_MATRIXMATRIX_EXTRAKERNELS_DEF_HPP
44 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp"
45 #include "Tpetra_RowMatrixTransposer.hpp"
46 
47 namespace Tpetra {
48 
49 namespace MatrixMatrix{
50 
51 namespace ExtraKernels{
52 
53 
54 // Double product version
55 template<class CrsMatrixType>
56 size_t C_estimate_nnz_per_row(CrsMatrixType & A, CrsMatrixType &B){
57  // Follows the NZ estimate in ML's ml_matmatmult.c
58  size_t Aest = 100, Best=100;
59  if (A.getNodeNumEntries() > 0)
60  Aest = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
61  if (B.getNodeNumEntries() > 0)
62  Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
63 
64  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
65  nnzperrow *= nnzperrow;
66 
67  return nnzperrow;
68 }
69 
70 
71 // Triple product version
72 template<class CrsMatrixType>
73 size_t Ac_estimate_nnz(CrsMatrixType & A, CrsMatrixType &P){
74  size_t nnzPerRowA = 100, Pcols = 100;
75  if (A.getNodeNumEntries() > 0)
76  nnzPerRowA = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 9;
77  if (P.getNodeNumEntries() > 0)
78  Pcols = (P.getNodeNumCols() > 0) ? P.getNodeNumCols() : 100;
79  return (size_t)(Pcols*nnzPerRowA + 5*nnzPerRowA + 300);
80 }
81 
82 #if defined (HAVE_TPETRA_INST_OPENMP)
83 /*********************************************************************************************************/
84 template<class Scalar,
85  class LocalOrdinal,
86  class GlobalOrdinal,
87  class LocalOrdinalViewType>
88 void mult_A_B_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
89  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
90  const LocalOrdinalViewType & targetMapToOrigRow,
91  const LocalOrdinalViewType & targetMapToImportRow,
92  const LocalOrdinalViewType & Bcol2Ccol,
93  const LocalOrdinalViewType & Icol2Ccol,
94  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
95  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
96  const std::string& label,
97  const Teuchos::RCP<Teuchos::ParameterList>& params) {
98 #ifdef HAVE_TPETRA_MMM_TIMINGS
99  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
100  using Teuchos::TimeMonitor;
101  // do we need RCPs at all?
102  Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix LTGCore")));
103 #endif
104 
105  using Teuchos::Array;
106  using Teuchos::ArrayRCP;
107  using Teuchos::ArrayView;
108  using Teuchos::RCP;
109  using Teuchos::rcp;
110 
111 
112  // Lots and lots of typedefs
114  // typedef typename KCRS::device_type device_t;
115  typedef typename KCRS::StaticCrsGraphType graph_t;
116  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
117  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
118  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
119  typedef typename KCRS::values_type::non_const_type scalar_view_t;
120 
121  // Unmanaged versions of the above
122  //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
123  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
124  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
125 
126  typedef Scalar SC;
127  typedef LocalOrdinal LO;
128  typedef GlobalOrdinal GO;
129  typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
130  typedef Map<LO,GO,NO> map_type;
131 
132  // NOTE (mfh 15 Sep 2017) This is specifically only for
133  // execution_space = Kokkos::OpenMP, so we neither need nor want
134  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
135  typedef NO::execution_space execution_space;
136  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
137 
138  // All of the invalid guys
139  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
140  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
141  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
142 
143  // Grab the Kokkos::SparseCrsMatrices & inner stuff
144  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
145  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
146 
147  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
148  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
149  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
150  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
151 
152  c_lno_view_t Irowptr;
153  lno_nnz_view_t Icolind;
154  scalar_view_t Ivals;
155  if(!Bview.importMatrix.is_null()) {
156  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
157  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
158  Ivals = Bview.importMatrix->getLocalMatrix().values;
159  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
160  }
161 
162  // Sizes
163  RCP<const map_type> Ccolmap = C.getColMap();
164  size_t m = Aview.origMatrix->getNodeNumRows();
165  size_t n = Ccolmap->getNodeNumElements();
166  size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
167 
168  // Get my node / thread info (right from openmp or parameter list)
169  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
170  if(!params.is_null()) {
171  if(params->isParameter("openmp: ltg thread max"))
172  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
173  }
174 
175  // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
176  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
177  // we will not touch these until the final copyout step
178  lno_nnz_view_t entriesC;
179  scalar_view_t valuesC;
180 
181  // add this, since we do the rowptr in place, we could figure this out
182  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
183  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
184 
185  // Thread-local memory
186  Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind",thread_max);
187  Kokkos::View<u_scalar_view_t*> tl_values("top_values",thread_max);
188 
189  double thread_chunk = (double)(m) / thread_max;
190 
191  // Run chunks of the matrix independently
192  Kokkos::parallel_for("MMM::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
193  {
194  // Thread coordination stuff
195  size_t my_thread_start = tid * thread_chunk;
196  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
197  size_t my_thread_m = my_thread_stop - my_thread_start;
198 
199  // Size estimate
200  size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
201 
202  // Allocations
203  std::vector<size_t> c_status(n,INVALID);
204 
205  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
206  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
207 
208  // For each row of A/C
209  size_t CSR_ip = 0, OLD_ip = 0;
210  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
211  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
212  // on the calling process.
213  // JJE 10 Apr 2019 index directly into the rowptr
214  row_mapC(i) = CSR_ip;
215 
216  // mfh 27 Sep 2016: For each entry of A in the current row of A
217  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
218  LO Aik = Acolind(k); // local column index of current entry of A
219  const SC Aval = Avals(k); // value of current entry of A
220  if (Aval == SC_ZERO)
221  continue; // skip explicitly stored zero values in A
222 
223  if (targetMapToOrigRow(Aik) != LO_INVALID) {
224  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
225  // corresponding to the current entry of A is populated, then
226  // the corresponding row of B is in B_local (i.e., it lives on
227  // the calling process).
228 
229  // Local matrix
230  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
231 
232  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
233  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
234  LO Bkj = Bcolind(j);
235  LO Cij = Bcol2Ccol(Bkj);
236 
237  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
238  // New entry
239  c_status[Cij] = CSR_ip;
240  Ccolind(CSR_ip) = Cij;
241  Cvals(CSR_ip) = Aval*Bvals(j);
242  CSR_ip++;
243 
244  } else {
245  Cvals(c_status[Cij]) += Aval*Bvals(j);
246  }
247  }
248 
249  } else {
250  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
251  // corresponding to the current entry of A NOT populated (has
252  // a flag "invalid" value), then the corresponding row of B is
253  // in B_local (i.e., it lives on the calling process).
254 
255  // Remote matrix
256  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
257  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
258  LO Ikj = Icolind(j);
259  LO Cij = Icol2Ccol(Ikj);
260 
261  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
262  // New entry
263  c_status[Cij] = CSR_ip;
264  Ccolind(CSR_ip) = Cij;
265  Cvals(CSR_ip) = Aval*Ivals(j);
266  CSR_ip++;
267 
268  } else {
269  Cvals(c_status[Cij]) += Aval*Ivals(j);
270  }
271  }
272  }
273  }
274 
275  // Resize for next pass if needed
276  if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1))*b_max_nnz_per_row) > CSR_alloc) {
277  CSR_alloc *= 2;
278  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
279  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
280  }
281  OLD_ip = CSR_ip;
282  }
283  thread_total_nnz(tid) = CSR_ip;
284  tl_colind(tid) = Ccolind;
285  tl_values(tid) = Cvals;
286  });
287 
288  // Do the copy out
289  copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
290 
291 #ifdef HAVE_TPETRA_MMM_TIMINGS
292  MM.~(); // destruct the 'Core'
293  Teuchos::TimeMonitor MMsort (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix OpenMPSort")));
294 #endif
295  // Sort & set values
296  if (params.is_null() || params->get("sort entries",true))
297  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
298  C.setAllValues(row_mapC,entriesC,valuesC);
299 
300 }
301 
302 /*********************************************************************************************************/
303 template<class Scalar,
304  class LocalOrdinal,
305  class GlobalOrdinal,
306  class LocalOrdinalViewType>
307 void mult_A_B_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
308  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
309  const LocalOrdinalViewType & targetMapToOrigRow,
310  const LocalOrdinalViewType & targetMapToImportRow,
311  const LocalOrdinalViewType & Bcol2Ccol,
312  const LocalOrdinalViewType & Icol2Ccol,
313  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
314  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
315  const std::string& label,
316  const Teuchos::RCP<Teuchos::ParameterList>& params) {
317 #ifdef HAVE_TPETRA_MMM_TIMINGS
318  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
319  Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse LTGCore")));
320 #endif
321 
322  using Teuchos::Array;
323  using Teuchos::ArrayRCP;
324  using Teuchos::ArrayView;
325  using Teuchos::RCP;
326  using Teuchos::rcp;
327 
328  // Lots and lots of typedefs
330  // typedef typename KCRS::device_type device_t;
331  typedef typename KCRS::StaticCrsGraphType graph_t;
332  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
333  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
334  typedef typename KCRS::values_type::non_const_type scalar_view_t;
335 
336  typedef Scalar SC;
337  typedef LocalOrdinal LO;
338  typedef GlobalOrdinal GO;
339  typedef Kokkos::Compat::KokkosOpenMPWrapperNode NO;
340  typedef Map<LO,GO,NO> map_type;
341 
342  // NOTE (mfh 15 Sep 2017) This is specifically only for
343  // execution_space = Kokkos::OpenMP, so we neither need nor want
344  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
345  typedef NO::execution_space execution_space;
346  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
347 
348  // All of the invalid guys
349  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
350  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
351  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
352 
353  // Grab the Kokkos::SparseCrsMatrices & inner stuff
354  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
355  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
356  const KCRS & Cmat = C.getLocalMatrix();
357 
358  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
359  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
360  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
361  scalar_view_t Cvals = Cmat.values;
362 
363  c_lno_view_t Irowptr;
364  c_lno_nnz_view_t Icolind;
365  scalar_view_t Ivals;
366  if(!Bview.importMatrix.is_null()) {
367  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
368  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
369  Ivals = Bview.importMatrix->getLocalMatrix().values;
370  }
371 
372  // Sizes
373  RCP<const map_type> Ccolmap = C.getColMap();
374  size_t m = Aview.origMatrix->getNodeNumRows();
375  size_t n = Ccolmap->getNodeNumElements();
376 
377  // Get my node / thread info (right from openmp or parameter list)
378  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
379  if(!params.is_null()) {
380  if(params->isParameter("openmp: ltg thread max"))
381  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
382  }
383 
384  double thread_chunk = (double)(m) / thread_max;
385 
386  // Run chunks of the matrix independently
387  Kokkos::parallel_for("MMM::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
388  {
389  // Thread coordination stuff
390  size_t my_thread_start = tid * thread_chunk;
391  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
392 
393  // Allocations
394  std::vector<size_t> c_status(n,INVALID);
395 
396  // For each row of A/C
397  size_t CSR_ip = 0, OLD_ip = 0;
398  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
399  // First fill the c_status array w/ locations where we're allowed to
400  // generate nonzeros for this row
401  OLD_ip = Crowptr(i);
402  CSR_ip = Crowptr(i+1);
403  for (size_t k = OLD_ip; k < CSR_ip; k++) {
404  c_status[Ccolind(k)] = k;
405  // Reset values in the row of C
406  Cvals(k) = SC_ZERO;
407  }
408 
409  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
410  LO Aik = Acolind(k);
411  const SC Aval = Avals(k);
412  if (Aval == SC_ZERO)
413  continue;
414 
415  if (targetMapToOrigRow(Aik) != LO_INVALID) {
416  // Local matrix
417  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
418 
419  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
420  LO Bkj = Bcolind(j);
421  LO Cij = Bcol2Ccol(Bkj);
422 
423  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
424  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
425  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
426 
427  Cvals(c_status[Cij]) += Aval * Bvals(j);
428  }
429  } else {
430  // Remote matrix
431  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
432  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
433  LO Ikj = Icolind(j);
434  LO Cij = Icol2Ccol(Ikj);
435 
436  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
437  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
438  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
439 
440  Cvals(c_status[Cij]) += Aval * Ivals(j);
441  }
442  }
443  }
444  }
445  });
446 
447  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
448 }
449 
450 /*********************************************************************************************************/
451 template<class Scalar,
452  class LocalOrdinal,
453  class GlobalOrdinal,
454  class LocalOrdinalViewType>
455 void jacobi_A_B_newmatrix_LowThreadGustavsonKernel(Scalar omega,
456  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
457  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
458  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
459  const LocalOrdinalViewType & targetMapToOrigRow,
460  const LocalOrdinalViewType & targetMapToImportRow,
461  const LocalOrdinalViewType & Bcol2Ccol,
462  const LocalOrdinalViewType & Icol2Ccol,
463  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
464  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
465  const std::string& label,
466  const Teuchos::RCP<Teuchos::ParameterList>& params) {
467 #ifdef HAVE_TPETRA_MMM_TIMINGS
468  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
469  Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix LTGCore")));
470 #endif
471 
472  using Teuchos::Array;
473  using Teuchos::ArrayRCP;
474  using Teuchos::ArrayView;
475  using Teuchos::RCP;
476  using Teuchos::rcp;
477 
478  // Lots and lots of typedefs
479  typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
481  // typedef typename KCRS::device_type device_t;
482  typedef typename KCRS::StaticCrsGraphType graph_t;
483  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
484  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
485  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
486  typedef typename KCRS::values_type::non_const_type scalar_view_t;
487 
488  // Unmanaged versions of the above
489  //typedef UnmanagedView<lno_view_t> u_lno_view_t; // unused
490  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
491  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
492 
493  // Jacobi-specific
494  typedef typename scalar_view_t::memory_space scalar_memory_space;
495 
496  typedef Scalar SC;
497  typedef LocalOrdinal LO;
498  typedef GlobalOrdinal GO;
499  typedef Node NO;
500  typedef Map<LO,GO,NO> map_type;
501 
502  // NOTE (mfh 15 Sep 2017) This is specifically only for
503  // execution_space = Kokkos::OpenMP, so we neither need nor want
504  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
505  typedef NO::execution_space execution_space;
506  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
507 
508  // All of the invalid guys
509  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
510  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
511  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
512 
513  // Grab the Kokkos::SparseCrsMatrices & inner stuff
514  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
515  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
516 
517  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
518  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
519  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
520  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
521 
522  c_lno_view_t Irowptr;
523  lno_nnz_view_t Icolind;
524  scalar_view_t Ivals;
525  if(!Bview.importMatrix.is_null()) {
526  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
527  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
528  Ivals = Bview.importMatrix->getLocalMatrix().values;
529  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
530  }
531 
532  // Jacobi-specific inner stuff
533  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
534 
535  // Sizes
536  RCP<const map_type> Ccolmap = C.getColMap();
537  size_t m = Aview.origMatrix->getNodeNumRows();
538  size_t n = Ccolmap->getNodeNumElements();
539  size_t Cest_nnz_per_row = 2*C_estimate_nnz_per_row(*Aview.origMatrix,*Bview.origMatrix);
540 
541  // Get my node / thread info (right from openmp)
542  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
543  if(!params.is_null()) {
544  if(params->isParameter("openmp: ltg thread max"))
545  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
546  }
547 
548  // 2019 Apr 10 jje: We can do rowptr in place, and no need to inialize since we can fault as we go
549  lno_view_t row_mapC(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
550  // we will not touch these until the final copyout step
551  lno_nnz_view_t entriesC;
552  scalar_view_t valuesC;
553 
554  // add this, since we do the rowptr in place, we could figure this out
555  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
556  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
557 
558  // Thread-local memory
559  Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind",thread_max);
560  Kokkos::View<u_scalar_view_t*> tl_values("top_values",thread_max);
561 
562  double thread_chunk = (double)(m) / thread_max;
563 
564  // Run chunks of the matrix independently
565  Kokkos::parallel_for("Jacobi::LTG::NewMatrix::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
566  {
567  // Thread coordination stuff
568  size_t my_thread_start = tid * thread_chunk;
569  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
570  size_t my_thread_m = my_thread_stop - my_thread_start;
571 
572  // Size estimate
573  size_t CSR_alloc = (size_t) (my_thread_m*Cest_nnz_per_row*0.75 + 100);
574 
575  // Allocations
576  std::vector<size_t> c_status(n,INVALID);
577 
578  u_lno_nnz_view_t Ccolind((typename u_lno_nnz_view_t::data_type)malloc(u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
579  u_scalar_view_t Cvals((typename u_scalar_view_t::data_type)malloc(u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
580 
581  // For each row of A/C
582  size_t CSR_ip = 0, OLD_ip = 0;
583  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
584  // printf("CMS: row %d CSR_alloc = %d\n",(int)i,(int)CSR_alloc);fflush(stdout);
585  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
586  // on the calling process.
587  // JJE: directly access the rowptr here (indexed using our loop var)
588  row_mapC(i) = CSR_ip;
589  // NOTE: Vector::getLocalView returns a rank 2 view here
590  SC minusOmegaDval = -omega*Dvals(i,0);
591 
592  // Entries of B
593  for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
594  const SC Bval = Bvals(j);
595  if (Bval == SC_ZERO)
596  continue;
597  LO Bij = Bcolind(j);
598  LO Cij = Bcol2Ccol(Bij);
599 
600  // Assume no repeated entries in B
601  c_status[Cij] = CSR_ip;
602  Ccolind(CSR_ip) = Cij;
603  Cvals(CSR_ip) = Bvals[j];
604  CSR_ip++;
605  }
606 
607  // Entries of -omega * Dinv * A * B
608  // mfh 27 Sep 2016: For each entry of A in the current row of A
609  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
610  LO Aik = Acolind(k); // local column index of current entry of A
611  const SC Aval = Avals(k); // value of current entry of A
612  if (Aval == SC_ZERO)
613  continue; // skip explicitly stored zero values in A
614 
615  if (targetMapToOrigRow(Aik) != LO_INVALID) {
616  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
617  // corresponding to the current entry of A is populated, then
618  // the corresponding row of B is in B_local (i.e., it lives on
619  // the calling process).
620 
621  // Local matrix
622  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
623 
624  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
625  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
626  LO Bkj = Bcolind(j);
627  LO Cij = Bcol2Ccol(Bkj);
628 
629  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
630  // New entry
631  c_status[Cij] = CSR_ip;
632  Ccolind(CSR_ip) = Cij;
633  Cvals(CSR_ip) = minusOmegaDval*Aval*Bvals(j);
634  CSR_ip++;
635 
636  } else {
637  Cvals(c_status[Cij]) += minusOmegaDval*Aval*Bvals(j);
638  }
639  }
640 
641  } else {
642  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
643  // corresponding to the current entry of A NOT populated (has
644  // a flag "invalid" value), then the corresponding row of B is
645  // in B_local (i.e., it lives on the calling process).
646 
647  // Remote matrix
648  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
649  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
650  LO Ikj = Icolind(j);
651  LO Cij = Icol2Ccol(Ikj);
652 
653  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
654  // New entry
655  c_status[Cij] = CSR_ip;
656  Ccolind(CSR_ip) = Cij;
657  Cvals(CSR_ip) = minusOmegaDval*Aval*Ivals(j);
658  CSR_ip++;
659 
660  } else {
661  Cvals(c_status[Cij]) += minusOmegaDval*Aval*Ivals(j);
662  }
663  }
664  }
665  }
666 
667  // Resize for next pass if needed
668  if (i+1 < my_thread_stop && CSR_ip + std::min(n,(Arowptr(i+2)-Arowptr(i+1)+1)*b_max_nnz_per_row) > CSR_alloc) {
669  CSR_alloc *= 2;
670  Ccolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type)realloc(Ccolind.data(),u_lno_nnz_view_t::shmem_size(CSR_alloc)),CSR_alloc);
671  Cvals = u_scalar_view_t((typename u_scalar_view_t::data_type)realloc(Cvals.data(),u_scalar_view_t::shmem_size(CSR_alloc)),CSR_alloc);
672  }
673  OLD_ip = CSR_ip;
674  }
675  thread_total_nnz(tid) = CSR_ip;
676  tl_colind(tid) = Ccolind;
677  tl_values(tid) = Cvals;
678  });
679 
680 
681  // Do the copy out (removed the tl_rowptr!)
682  copy_out_from_thread_memory(thread_total_nnz,tl_colind,tl_values,m,thread_chunk,row_mapC,entriesC,valuesC);
683 
684 
685 #ifdef HAVE_TPETRA_MMM_TIMINGS
686  MM.~();
687  Teuchos::TimeMonitor MMsort (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix OpenMPSort")));
688 #endif
689  // Sort & set values
690  if (params.is_null() || params->get("sort entries",true))
691  Import_Util::sortCrsEntries(row_mapC, entriesC, valuesC);
692  C.setAllValues(row_mapC,entriesC,valuesC);
693 
694 }
695 
696 
697 
698 /*********************************************************************************************************/
699 template<class Scalar,
700  class LocalOrdinal,
701  class GlobalOrdinal,
702  class LocalOrdinalViewType>
703 void jacobi_A_B_reuse_LowThreadGustavsonKernel(Scalar omega,
704  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> & Dinv,
705  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
706  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Bview,
707  const LocalOrdinalViewType & targetMapToOrigRow,
708  const LocalOrdinalViewType & targetMapToImportRow,
709  const LocalOrdinalViewType & Bcol2Ccol,
710  const LocalOrdinalViewType & Icol2Ccol,
711  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& C,
712  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Kokkos::Compat::KokkosOpenMPWrapperNode> > Cimport,
713  const std::string& label,
714  const Teuchos::RCP<Teuchos::ParameterList>& params) {
715 #ifdef HAVE_TPETRA_MMM_TIMINGS
716  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
717  Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse LTGCore")));
718 #endif
719  using Teuchos::Array;
720  using Teuchos::ArrayRCP;
721  using Teuchos::ArrayView;
722  using Teuchos::RCP;
723  using Teuchos::rcp;
724 
725  // Lots and lots of typedefs
726  typedef typename Kokkos::Compat::KokkosOpenMPWrapperNode Node;
728  // typedef typename KCRS::device_type device_t;
729  typedef typename KCRS::StaticCrsGraphType graph_t;
730  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
731  typedef typename graph_t::entries_type::const_type c_lno_nnz_view_t;
732  typedef typename KCRS::values_type::non_const_type scalar_view_t;
733 
734  // Jacobi-specific
735  typedef typename scalar_view_t::memory_space scalar_memory_space;
736 
737  typedef Scalar SC;
738  typedef LocalOrdinal LO;
739  typedef GlobalOrdinal GO;
740  typedef Node NO;
741  typedef Map<LO,GO,NO> map_type;
742 
743  // NOTE (mfh 15 Sep 2017) This is specifically only for
744  // execution_space = Kokkos::OpenMP, so we neither need nor want
745  // KOKKOS_LAMBDA (with its mandatory __device__ marking).
746  typedef NO::execution_space execution_space;
747  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
748 
749  // All of the invalid guys
750  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
751  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
752  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
753 
754  // Grab the Kokkos::SparseCrsMatrices & inner stuff
755  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
756  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
757  const KCRS & Cmat = C.getLocalMatrix();
758 
759  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
760  const c_lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
761  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
762  scalar_view_t Cvals = Cmat.values;
763 
764  c_lno_view_t Irowptr;
765  c_lno_nnz_view_t Icolind;
766  scalar_view_t Ivals;
767  if(!Bview.importMatrix.is_null()) {
768  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
769  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
770  Ivals = Bview.importMatrix->getLocalMatrix().values;
771  }
772 
773  // Jacobi-specific inner stuff
774  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
775 
776  // Sizes
777  RCP<const map_type> Ccolmap = C.getColMap();
778  size_t m = Aview.origMatrix->getNodeNumRows();
779  size_t n = Ccolmap->getNodeNumElements();
780 
781  // Get my node / thread info (right from openmp or parameter list)
782  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
783  if(!params.is_null()) {
784  if(params->isParameter("openmp: ltg thread max"))
785  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
786  }
787 
788  double thread_chunk = (double)(m) / thread_max;
789 
790  // Run chunks of the matrix independently
791  Kokkos::parallel_for("Jacobi::LTG::Reuse::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
792  {
793  // Thread coordination stuff
794  size_t my_thread_start = tid * thread_chunk;
795  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
796 
797  // Allocations
798  std::vector<size_t> c_status(n,INVALID);
799 
800  // For each row of A/C
801  size_t CSR_ip = 0, OLD_ip = 0;
802  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
803  // First fill the c_status array w/ locations where we're allowed to
804  // generate nonzeros for this row
805  OLD_ip = Crowptr(i);
806  CSR_ip = Crowptr(i+1);
807  // NOTE: Vector::getLocalView returns a rank 2 view here
808  SC minusOmegaDval = -omega*Dvals(i,0);
809 
810  for (size_t k = OLD_ip; k < CSR_ip; k++) {
811  c_status[Ccolind(k)] = k;
812  // Reset values in the row of C
813  Cvals(k) = SC_ZERO;
814  }
815 
816  // Entries of B
817  for (size_t j = Browptr(i); j < Browptr(i+1); j++) {
818  const SC Bval = Bvals(j);
819  if (Bval == SC_ZERO)
820  continue;
821  LO Bij = Bcolind(j);
822  LO Cij = Bcol2Ccol(Bij);
823 
824  // Assume no repeated entries in B
825  Cvals(c_status[Cij]) += Bvals(j);
826  CSR_ip++;
827  }
828 
829 
830  for (size_t k = Arowptr(i); k < Arowptr(i+1); k++) {
831  LO Aik = Acolind(k);
832  const SC Aval = Avals(k);
833  if (Aval == SC_ZERO)
834  continue;
835 
836  if (targetMapToOrigRow(Aik) != LO_INVALID) {
837  // Local matrix
838  size_t Bk = Teuchos::as<size_t>(targetMapToOrigRow(Aik));
839 
840  for (size_t j = Browptr(Bk); j < Browptr(Bk+1); ++j) {
841  LO Bkj = Bcolind(j);
842  LO Cij = Bcol2Ccol(Bkj);
843 
844  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
845  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
846  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
847 
848  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Bvals(j);
849  }
850  } else {
851  // Remote matrix
852  size_t Ik = Teuchos::as<size_t>(targetMapToImportRow(Aik));
853  for (size_t j = Irowptr(Ik); j < Irowptr(Ik+1); ++j) {
854  LO Ikj = Icolind(j);
855  LO Cij = Icol2Ccol(Ikj);
856 
857  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
858  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
859  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
860 
861  Cvals(c_status[Cij]) += minusOmegaDval * Aval * Ivals(j);
862  }
863  }
864  }
865  }
866  });
867 
868  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
869 }
870 
871 
872 /*********************************************************************************************************/
873 template<class InColindArrayType, class InValsArrayType,
874  class OutRowptrType, class OutColindType, class OutValsType>
875 void copy_out_from_thread_memory(const OutColindType& thread_total_nnz,
876  const InColindArrayType& Incolind,
877  const InValsArrayType& Invalues,
878  const size_t m,
879  const double thread_chunk,
880  OutRowptrType& row_mapC,
881  OutColindType& entriesC,
882  OutValsType& valuesC ) {
883  typedef OutRowptrType lno_view_t;
884  typedef OutColindType lno_nnz_view_t;
885  typedef OutValsType scalar_view_t;
886  typedef typename lno_view_t::execution_space execution_space;
887  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
888 
889  // Generate the starting nnz number per thread
890  size_t thread_max = Incolind.size();
891  size_t c_nnz_size=0;
892  // since this kernel is 'low thread count' is very likely, that we could
893  // sum over the thread_total_nnz and not go parallel, but since it is a view
894  // we don't know where the memory actually lives... so we kinda need to go parallel
895 
896  lno_view_t thread_start_nnz("thread_nnz",thread_max+1);
897  Kokkos::parallel_scan("LTG::Scan",range_type(0,thread_max).set_chunk_size(1), [=] (const size_t i, size_t& update, const bool final) {
898  size_t mynnz = thread_total_nnz(i);
899  if(final) thread_start_nnz(i) = update;
900  update+=mynnz;
901  if(final && i+1==thread_max) thread_start_nnz(i+1)=update;
902  });
903  c_nnz_size = thread_start_nnz(thread_max);
904 
905  // 2019 Apr 10 JJE: update the rowptr's final entry here
906  row_mapC(m) = thread_start_nnz(thread_max);
907 
908  // Allocate output
909  lno_nnz_view_t entriesC_(Kokkos::ViewAllocateWithoutInitializing("entriesC"), c_nnz_size); entriesC = entriesC_;
910  scalar_view_t valuesC_(Kokkos::ViewAllocateWithoutInitializing("valuesC"), c_nnz_size); valuesC = valuesC_;
911 
912  // Copy out
913  Kokkos::parallel_for("LTG::CopyOut", range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid) {
914  const size_t my_thread_start = tid * thread_chunk;
915  const size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
916  const size_t nnz_thread_start = thread_start_nnz(tid);
917  // we need this info, since we did the rowptr in place
918  const size_t nnz_thread_stop = thread_start_nnz(tid+1);
919 
920  // There are two fundamental operations:
921  // * Updateing the rowptr with the correct offset
922  // * copying entries and values
923 
924  // First, update the rowptr, it since we did it inplace, it is a += operation
925  // in the paper, I experimented with a coloring scheme that had threads do
926  // do their copies in different orders. It wasn't obvious it was beneficial
927  // but it can be replicated, by choosing the array to copy first based on your
928  // thread id % 3
929  if (my_thread_start != 0 ) {
930  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
931  row_mapC(i) += nnz_thread_start;
932  }
933  }
934 
935  // try to Kokkos::single() the alloc here. It should implicitly barrier
936  // thread 0 doesn't copy the rowptr, so it could hit the single first
937  // in the paper, I played a game that effectively got LTG down to a single
938  // OpenMP barrier. But doing that requires the ability to spawn a single
939  // parallel region. The Scan above was implemented using atomic adds
940  // and the barrier was only needed so you could allocate
941  //
942  // Since we can't spawn a single region, we could move the allocations
943  // here, and using 'single'. Most likely, thread 0 will hit it first allowing
944  // the other threads to update the rowptr while it allocates.
945 
946 
947  // next, bulk copy the vals/colind arrays
948  const size_t my_num_nnz = nnz_thread_stop - nnz_thread_start;
949  for (size_t i = 0; i < my_num_nnz; ++i) {
950  entriesC(nnz_thread_start + i) = Incolind(tid)(i);
951  valuesC(nnz_thread_start + i) = Invalues(tid)(i);
952  }
953 
954  //Free the unamanged views, let each thread deallocate its memory
955  // May need to cast away const here..
956  if(Incolind(tid).data()) free(Incolind(tid).data());
957  if(Invalues(tid).data()) free(Invalues(tid).data());
958  });
959 }//end copy_out
960 
961 #endif // OpenMP
962 
963 
964 /*********************************************************************************************************/
965 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
966 void jacobi_A_B_newmatrix_MultiplyScaleAddKernel(Scalar omega,
967  const Vector<Scalar,LocalOrdinal,GlobalOrdinal, Node> & Dinv,
968  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
969  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
970  const LocalOrdinalViewType & Acol2Brow,
971  const LocalOrdinalViewType & Acol2Irow,
972  const LocalOrdinalViewType & Bcol2Ccol,
973  const LocalOrdinalViewType & Icol2Ccol,
974  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
975  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
976  const std::string& label,
977  const Teuchos::RCP<Teuchos::ParameterList>& params) {
978 #ifdef HAVE_TPETRA_MMM_TIMINGS
979  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
980  using Teuchos::TimeMonitor;
981  Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK")));
982  Teuchos::TimeMonitor MMmult (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Multiply")));
983 #endif
984  typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> Matrix_t;
985 
986  // This kernel computes (I-omega Dinv A) B the slow way (for generality's sake, you must understand)
987 
988  // 1) Multiply A*B
989  Teuchos::RCP<Matrix_t> AB = Teuchos::rcp(new Matrix_t(C.getRowMap(),0));
990  Tpetra::MMdetails::mult_A_B_newmatrix(Aview,Bview,*AB,label+std::string(" MSAK"),params);
991 
992 #ifdef HAVE_TPETRA_MMM_TIMINGS
993  MMmult.~();
994  Teuchos::TimeMonitor MMscale (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Scale")));
995 #endif
996 
997  // 2) Scale A by Dinv
998  AB->leftScale(Dinv);
999 
1000 #ifdef HAVE_TPETRA_MMM_TIMINGS
1001  MMscale.~();
1002  Teuchos::TimeMonitor MMadd (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix MSAK Add")));
1003 #endif
1004 
1005  // 3) Add [-omega Dinv A] + B
1006  Teuchos::ParameterList jparams;
1007  if(!params.is_null()) {
1008  jparams = *params;
1009  jparams.set("label",label+std::string(" MSAK Add"));
1010  }
1011  Scalar one = Teuchos::ScalarTraits<Scalar>::one();
1012  Tpetra::MatrixMatrix::add(one,false,*Bview.origMatrix,Scalar(-omega),false,*AB,C,AB->getDomainMap(),AB->getRangeMap(),Teuchos::rcp(&jparams,false));
1013 
1014  }// jacobi_A_B_newmatrix_MultiplyScaleAddKernel
1015 
1016 
1017 
1018 #if defined (HAVE_TPETRA_INST_OPENMP)
1019 /*********************************************************************************************************/
1020 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1021 static inline void mult_R_A_P_newmatrix_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
1022  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1023  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
1024  const LocalOrdinalViewType & Acol2PRow,
1025  const LocalOrdinalViewType & Acol2PRowImport,
1026  const LocalOrdinalViewType & Pcol2Accol,
1027  const LocalOrdinalViewType & PIcol2Accol,
1028  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
1029  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
1030  const std::string& label,
1031  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1032 
1033  using Teuchos::RCP;
1034  using Tpetra::MatrixMatrix::UnmanagedView;
1035  #ifdef HAVE_TPETRA_MMM_TIMINGS
1036  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1037  Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix LTGCore")));
1038  #endif
1039 
1040  typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1041  typedef Scalar SC;
1042  typedef LocalOrdinal LO;
1043  typedef GlobalOrdinal GO;
1044  typedef Node NO;
1045  typedef Map<LO,GO,NO> map_type;
1047  typedef typename KCRS::StaticCrsGraphType graph_t;
1048  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1049  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1050  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1051  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1052  typedef typename KCRS::device_type device_t;
1053  typedef typename device_t::execution_space execution_space;
1054  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1055 
1056  // Unmanaged versions of the above
1057  typedef UnmanagedView<lno_view_t> u_lno_view_t;
1058  typedef UnmanagedView<lno_nnz_view_t> u_lno_nnz_view_t;
1059  typedef UnmanagedView<scalar_view_t> u_scalar_view_t;
1060 
1061  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1062  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1063 
1064  // Sizes
1065  RCP<const map_type> Accolmap = Ac.getColMap();
1066  size_t m = Rview.origMatrix->getNodeNumRows();
1067  size_t n = Accolmap->getNodeNumElements();
1068 
1069  // Get raw Kokkos matrices, and the raw CSR views
1070  const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
1071  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1072  const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
1073 
1074  c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Irowptr;
1075  const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries;
1076  lno_nnz_view_t Icolind;
1077  const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1078  scalar_view_t Ivals;
1079 
1080  if (!Pview.importMatrix.is_null())
1081  {
1082  const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
1083  Irowptr = Imat.graph.row_map;
1084  Icolind = Imat.graph.entries;
1085  Ivals = Imat.values;
1086  }
1087 
1088  // Classic csr assembly (low memory edition)
1089  //
1090  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
1091  // The method loops over rows of R, and may resize after processing
1092  // each row. Chris Siefert says that this reflects experience in
1093  // ML; for the non-threaded case, ML found it faster to spend less
1094  // effort on estimation and risk an occasional reallocation.
1095 
1096  size_t Acest_nnz_per_row = std::ceil(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix) / m);
1097  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1098 
1099  // Get my node / thread info (right from openmp or parameter list)
1100  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
1101  if(!params.is_null()) {
1102  if(params->isParameter("openmp: ltg thread max"))
1103  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1104  }
1105 
1106  double thread_chunk = (double)(m) / thread_max;
1107 
1108  // we can construct the rowptr inplace, allocate here and fault in parallel
1109  lno_view_t rowmapAc(Kokkos::ViewAllocateWithoutInitializing("non_const_lnow_row"), m + 1);
1110  // we do not touch these until copy out
1111  lno_nnz_view_t entriesAc;
1112  scalar_view_t valuesAc;
1113 
1114  // add this, since we do the rowptr in place, we could figure this out
1115  // using the rowptr, but it requires an unusual loop (that jumps all over memory)
1116  lno_nnz_view_t thread_total_nnz("thread_total_nnz",thread_max+1);
1117 
1118  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1119  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1120  //
1121  // For column index Aik in row i of A, Acol2PRow[Aik] tells
1122  // you whether the corresponding row of P belongs to P_local
1123  // ("orig") or P_remote ("Import").
1124 
1125  // Thread-local memory
1126  Kokkos::View<u_lno_nnz_view_t*> tl_colind("top_colind", thread_max);
1127  Kokkos::View<u_scalar_view_t*> tl_values("top_values", thread_max);
1128 
1129  // For each row of R
1130  Kokkos::parallel_for("MMM::RAP::NewMatrix::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1131  {
1132  // Thread coordination stuff
1133  size_t my_thread_start = tid * thread_chunk;
1134  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1135  size_t my_thread_m = my_thread_stop - my_thread_start;
1136 
1137  size_t nnzAllocated = (size_t) (my_thread_m * Acest_nnz_per_row + 100);
1138 
1139  std::vector<size_t> ac_status(n, INVALID);
1140 
1141  //manually allocate the thread-local storage for Ac
1142  u_lno_view_t Acrowptr((typename u_lno_view_t::data_type) malloc(u_lno_view_t::shmem_size(my_thread_m+1)), my_thread_m + 1);
1143  u_lno_nnz_view_t Accolind((typename u_lno_nnz_view_t::data_type) malloc(u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1144  u_scalar_view_t Acvals((typename u_scalar_view_t::data_type) malloc(u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1145 
1146  //count entries as they are added to Ac
1147  size_t nnz = 0, nnz_old = 0;
1148  // bmk: loop over the rows of R which are assigned to thread tid
1149  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1150  // directly index into the rowptr
1151  rowmapAc(i) = nnz;
1152  // mfh 27 Sep 2016: For each entry of R in the current row of R
1153  for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1154  LO k = Rcolind(kk); // local column index of current entry of R
1155  const SC Rik = Rvals(kk); // value of current entry of R
1156  if (Rik == SC_ZERO)
1157  continue; // skip explicitly stored zero values in R
1158  // For each entry of A in the current row of A
1159  for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1160  LO l = Acolind(ll); // local column index of current entry of A
1161  const SC Akl = Avals(ll); // value of current entry of A
1162  if (Akl == SC_ZERO)
1163  continue; // skip explicitly stored zero values in A
1164 
1165  if (Acol2PRow[l] != LO_INVALID) {
1166  // mfh 27 Sep 2016: If the entry of Acol2PRow
1167  // corresponding to the current entry of A is populated, then
1168  // the corresponding row of P is in P_local (i.e., it lives on
1169  // the calling process).
1170 
1171  // Local matrix
1172  size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1173 
1174  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1175  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1176  LO j = Pcolind(jj);
1177  LO Acj = Pcol2Accol(j);
1178  SC Plj = Pvals(jj);
1179 
1180  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1181  #ifdef HAVE_TPETRA_DEBUG
1182  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1183  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.size()),
1184  std::runtime_error,
1185  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1186  #endif
1187  // New entry
1188  ac_status[Acj] = nnz;
1189  Accolind(nnz) = Acj;
1190  Acvals(nnz) = Rik*Akl*Plj;
1191  nnz++;
1192  } else {
1193  Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1194  }
1195  }
1196  } else {
1197  // mfh 27 Sep 2016: If the entry of Acol2PRow
1198  // corresponding to the current entry of A is NOT populated (has
1199  // a flag "invalid" value), then the corresponding row of P is
1200  // in P_remote (i.e., it does not live on the calling process).
1201 
1202  // Remote matrix
1203  size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1204  for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1205  LO j = Icolind(jj);
1206  LO Acj = PIcol2Accol(j);
1207  SC Plj = Ivals(jj);
1208 
1209  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1210  #ifdef HAVE_TPETRA_DEBUG
1211  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1212  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Accolind.extent(0)),
1213  std::runtime_error,
1214  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Accolind.size() << std::endl);
1215  #endif
1216  // New entry
1217  ac_status[Acj] = nnz;
1218  Accolind(nnz) = Acj;
1219  Acvals(nnz) = Rik*Akl*Plj;
1220  nnz++;
1221  } else {
1222  Acvals(ac_status[Acj]) += Rik*Akl*Plj;
1223  }
1224  }
1225  }
1226  }
1227  }
1228  // Resize for next pass if needed
1229  if (nnz + n > nnzAllocated) {
1230  nnzAllocated *= 2;
1231  Accolind = u_lno_nnz_view_t((typename u_lno_nnz_view_t::data_type) realloc(Accolind.data(), u_lno_nnz_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1232  Acvals = u_scalar_view_t((typename u_scalar_view_t::data_type) realloc(Acvals.data(), u_scalar_view_t::shmem_size(nnzAllocated)), nnzAllocated);
1233  }
1234  nnz_old = nnz;
1235  }
1236  thread_total_nnz(tid) = nnz;
1237  tl_colind(tid) = Accolind;
1238  tl_values(tid) = Acvals;
1239  });
1240  #ifdef HAVE_TPETRA_MMM_TIMINGS
1241  MM.~();
1242  Teuchos::TimeMonitor MMcopy (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix copy from thread local")));
1243  #endif
1244 
1245  copy_out_from_thread_memory(thread_total_nnz,tl_colind, tl_values, m, thread_chunk, rowmapAc, entriesAc, valuesAc);
1246 
1247  #ifdef HAVE_TPETRA_MMM_TIMINGS
1248  MMcopy.~();
1249  Teuchos::TimeMonitor MMsort (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort")));
1250  #endif
1251 
1252  // Final sort & set of CRS arrays
1253  Import_Util::sortCrsEntries(rowmapAc, entriesAc, valuesAc);
1254  // mfh 27 Sep 2016: This just sets pointers.
1255  Ac.setAllValues(rowmapAc, entriesAc, valuesAc);
1256 
1257  #ifdef HAVE_TPETRA_MMM_TIMINGS
1258  MMsort.~();
1259  Teuchos::TimeMonitor MMfill (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC")));
1260  #endif
1261 
1262  // Final FillComplete
1263  //
1264  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1265  // Import (from domain Map to column Map) construction (which costs
1266  // lots of communication) by taking the previously constructed
1267  // Import object. We should be able to do this without interfering
1268  // with the implementation of the local part of sparse matrix-matrix
1269  // multiply above.
1270  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1271  labelList->set("Timer Label",label);
1272  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1273  RCP<const Export<LO,GO,NO> > dummyExport;
1274  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1275  Rview.origMatrix->getRangeMap(),
1276  Acimport,
1277  dummyExport,
1278  labelList);
1279 }
1280 
1281 
1282 
1283 /*********************************************************************************************************/
1284 template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class LocalOrdinalViewType>
1285 static inline void mult_R_A_P_reuse_LowThreadGustavsonKernel(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Rview,
1286  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Aview,
1287  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Pview,
1288  const LocalOrdinalViewType & Acol2PRow,
1289  const LocalOrdinalViewType & Acol2PRowImport,
1290  const LocalOrdinalViewType & Pcol2Accol,
1291  const LocalOrdinalViewType & PIcol2Accol,
1292  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode>& Ac,
1293  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal, Kokkos::Compat::KokkosOpenMPWrapperNode> > Acimport,
1294  const std::string& label,
1295  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1296 
1297  using Teuchos::RCP;
1298  using Tpetra::MatrixMatrix::UnmanagedView;
1299  #ifdef HAVE_TPETRA_MMM_TIMINGS
1300  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1301  Teuchos::TimeMonitor MM (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse LTGCore")));
1302  #endif
1303 
1304  typedef Kokkos::Compat::KokkosOpenMPWrapperNode Node;
1305  typedef Scalar SC;
1306  typedef LocalOrdinal LO;
1307  typedef GlobalOrdinal GO;
1308  typedef Node NO;
1309  typedef Map<LO,GO,NO> map_type;
1311  typedef typename KCRS::StaticCrsGraphType graph_t;
1312  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1313  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1314  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1315  typedef typename KCRS::device_type device_t;
1316  typedef typename device_t::execution_space execution_space;
1317  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1318 
1319  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1320  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1321  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1322 
1323  // Sizes
1324  RCP<const map_type> Accolmap = Ac.getColMap();
1325  size_t m = Rview.origMatrix->getNodeNumRows();
1326  size_t n = Accolmap->getNodeNumElements();
1327 
1328  // Get raw Kokkos matrices, and the raw CSR views
1329  const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
1330  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1331  const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
1332  const KCRS & Cmat = Ac.getLocalMatrix();
1333 
1334  c_lno_view_t Rrowptr = Rmat.graph.row_map, Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Crowptr = Cmat.graph.row_map, Irowptr;
1335  const lno_nnz_view_t Rcolind = Rmat.graph.entries, Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Ccolind = Cmat.graph.entries;
1336  lno_nnz_view_t Icolind;
1337  const scalar_view_t Rvals = Rmat.values, Avals = Amat.values, Pvals = Pmat.values;
1338  scalar_view_t Cvals = Cmat.values;
1339  scalar_view_t Ivals;
1340 
1341  if (!Pview.importMatrix.is_null())
1342  {
1343  const KCRS& Imat = Pview.importMatrix->getLocalMatrix();
1344  Irowptr = Imat.graph.row_map;
1345  Icolind = Imat.graph.entries;
1346  Ivals = Imat.values;
1347  }
1348 
1349  // Get my node / thread info (right from openmp or parameter list)
1350  size_t thread_max = Kokkos::Compat::KokkosOpenMPWrapperNode::execution_space::concurrency();
1351  if(!params.is_null()) {
1352  if(params->isParameter("openmp: ltg thread max"))
1353  thread_max = std::max((size_t)1,std::min(thread_max,params->get("openmp: ltg thread max",thread_max)));
1354  }
1355 
1356  double thread_chunk = (double)(m) / thread_max;
1357 
1358  // For each row of R
1359  Kokkos::parallel_for("MMM::RAP::Reuse::LTG::ThreadLocal",range_type(0, thread_max).set_chunk_size(1),[=](const size_t tid)
1360  {
1361  // Thread coordination stuff
1362  size_t my_thread_start = tid * thread_chunk;
1363  size_t my_thread_stop = tid == thread_max-1 ? m : (tid+1)*thread_chunk;
1364 
1365  std::vector<size_t> c_status(n, INVALID);
1366 
1367  //count entries as they are added to Ac
1368  size_t OLD_ip = 0, CSR_ip = 0;
1369  // bmk: loop over the rows of R which are assigned to thread tid
1370  for (size_t i = my_thread_start; i < my_thread_stop; i++) {
1371  // First fill the c_status array w/ locations where we're allowed to
1372  // generate nonzeros for this row
1373  OLD_ip = Crowptr(i);
1374  CSR_ip = Crowptr(i+1);
1375  for (size_t k = OLD_ip; k < CSR_ip; k++) {
1376  c_status[Ccolind(k)] = k;
1377  // Reset values in the row of C
1378  Cvals(k) = SC_ZERO;
1379  }
1380 
1381  // mfh 27 Sep 2016: For each entry of R in the current row of R
1382  for (size_t kk = Rrowptr(i); kk < Rrowptr(i+1); kk++) {
1383  LO k = Rcolind(kk); // local column index of current entry of R
1384  const SC Rik = Rvals(kk); // value of current entry of R
1385  if (Rik == SC_ZERO)
1386  continue; // skip explicitly stored zero values in R
1387  // For each entry of A in the current row of A
1388  for (size_t ll = Arowptr(k); ll < Arowptr(k+1); ll++) {
1389  LO l = Acolind(ll); // local column index of current entry of A
1390  const SC Akl = Avals(ll); // value of current entry of A
1391  if (Akl == SC_ZERO)
1392  continue; // skip explicitly stored zero values in A
1393 
1394  if (Acol2PRow[l] != LO_INVALID) {
1395  // mfh 27 Sep 2016: If the entry of Acol2PRow
1396  // corresponding to the current entry of A is populated, then
1397  // the corresponding row of P is in P_local (i.e., it lives on
1398  // the calling process).
1399 
1400  // Local matrix
1401  size_t Pl = Teuchos::as<size_t>(Acol2PRow(l));
1402 
1403  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1404  for (size_t jj = Prowptr(Pl); jj < Prowptr(Pl+1); jj++) {
1405  LO j = Pcolind(jj);
1406  LO Cij = Pcol2Accol(j);
1407  SC Plj = Pvals(jj);
1408  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1409  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1410  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1411 
1412 
1413  Cvals(c_status[Cij]) += Rik*Akl*Plj;
1414  }
1415  } else {
1416  // mfh 27 Sep 2016: If the entry of Acol2PRow
1417  // corresponding to the current entry of A is NOT populated (has
1418  // a flag "invalid" value), then the corresponding row of P is
1419  // in P_remote (i.e., it does not live on the calling process).
1420 
1421  // Remote matrix
1422  size_t Il = Teuchos::as<size_t>(Acol2PRowImport(l));
1423  for (size_t jj = Irowptr(Il); jj < Irowptr(Il+1); jj++) {
1424  LO j = Icolind(jj);
1425  LO Cij = PIcol2Accol(j);
1426  SC Plj = Ivals(jj);
1427  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
1428  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1429  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1430 
1431  Cvals(c_status[Cij]) += Rik*Akl*Plj;
1432  }
1433  }
1434  }
1435  }
1436  }
1437  });
1438  // NOTE: No copy out or "set" of data is needed here, since we're working directly with Kokkos::Views
1439 }
1440 
1441 
1442 
1443 #endif
1444 
1445 
1446 }//ExtraKernels
1447 }//MatrixMatrix
1448 }//Tpetra
1449 
1450 
1451 #endif
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...