Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_TripleMatrixMultiply_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
11 #define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
12 
14 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" //for UnmanagedView
15 #include "Teuchos_VerboseObject.hpp"
16 #include "Teuchos_Array.hpp"
17 #include "Tpetra_Util.hpp"
18 #include "Tpetra_ConfigDefs.hpp"
19 #include "Tpetra_CrsMatrix.hpp"
21 #include "Tpetra_RowMatrixTransposer.hpp"
22 #include "Tpetra_ConfigDefs.hpp"
23 #include "Tpetra_Map.hpp"
24 #include "Tpetra_Export.hpp"
25 #include "Tpetra_Import_Util.hpp"
26 #include "Tpetra_Import_Util2.hpp"
27 #include <algorithm>
28 #include <cmath>
29 #include "Teuchos_FancyOStream.hpp"
30 // #include "KokkosSparse_spgemm.hpp"
31 
37 /*********************************************************************************************************/
38 // Include the architecture-specific kernel partial specializations here
39 // NOTE: This needs to be outside all namespaces
40 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
41 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
42 #include "TpetraExt_MatrixMatrix_HIP.hpp"
43 #include "TpetraExt_MatrixMatrix_SYCL.hpp"
44 
45 namespace Tpetra {
46 
47 namespace TripleMatrixMultiply {
48 
49 //
50 // This method forms the matrix-matrix product Ac = op(R) * op(A) * op(P), where
51 // op(A) == A if transposeA is false,
52 // op(A) == A^T if transposeA is true,
53 // and similarly for op(R) and op(P).
54 //
55 template <class Scalar,
56  class LocalOrdinal,
57  class GlobalOrdinal,
58  class Node>
61  bool transposeR,
63  bool transposeA,
65  bool transposeP,
67  bool call_FillComplete_on_result,
68  const std::string& label,
69  const Teuchos::RCP<Teuchos::ParameterList>& params) {
70  using Teuchos::null;
71  using Teuchos::RCP;
72  typedef Scalar SC;
73  typedef LocalOrdinal LO;
74  typedef GlobalOrdinal GO;
75  typedef Node NO;
76  typedef CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
77  typedef Import<LO, GO, NO> import_type;
78  typedef Export<LO, GO, NO> export_type;
79  typedef CrsMatrixStruct<SC, LO, GO, NO> crs_matrix_struct_type;
80  typedef Map<LO, GO, NO> map_type;
81  typedef RowMatrixTransposer<SC, LO, GO, NO> transposer_type;
82 
83 #ifdef HAVE_TPETRA_MMM_TIMINGS
84  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
85  using Teuchos::TimeMonitor;
86  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Setup"))));
87 #endif
88 
89  const std::string prefix = "TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
90 
91  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
92 
93  // The input matrices R, A and P must both be fillComplete.
94  TEUCHOS_TEST_FOR_EXCEPTION(!R.isFillComplete(), std::runtime_error, prefix << "Matrix R is not fill complete.");
95  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
96  TEUCHOS_TEST_FOR_EXCEPTION(!P.isFillComplete(), std::runtime_error, prefix << "Matrix P is not fill complete.");
97 
98  // If transposeA is true, then Rprime will be the transpose of R
99  // (computed explicitly via RowMatrixTransposer). Otherwise, Rprime
100  // will just be a pointer to R.
101  RCP<const crs_matrix_type> Rprime = null;
102  // If transposeA is true, then Aprime will be the transpose of A
103  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
104  // will just be a pointer to A.
105  RCP<const crs_matrix_type> Aprime = null;
106  // If transposeB is true, then Pprime will be the transpose of P
107  // (computed explicitly via RowMatrixTransposer). Otherwise, Pprime
108  // will just be a pointer to P.
109  RCP<const crs_matrix_type> Pprime = null;
110 
111  // Is this a "clean" matrix?
112  //
113  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
114  // locally nor globally indexed, then it was empty. I don't like
115  // this, because the most straightforward implementation presumes
116  // lazy allocation of indices. However, historical precedent
117  // demands that we keep around this predicate as a way to test
118  // whether the matrix is empty.
119  const bool newFlag = !Ac.getGraph()->isLocallyIndexed() && !Ac.getGraph()->isGloballyIndexed();
120 
121  using Teuchos::ParameterList;
122  RCP<ParameterList> transposeParams(new ParameterList);
123  transposeParams->set("sort", false);
124 
125  if (transposeR && &R != &P) {
126  transposer_type transposer(rcpFromRef(R));
127  Rprime = transposer.createTranspose(transposeParams);
128  } else {
129  Rprime = rcpFromRef(R);
130  }
131 
132  if (transposeA) {
133  transposer_type transposer(rcpFromRef(A));
134  Aprime = transposer.createTranspose(transposeParams);
135  } else {
136  Aprime = rcpFromRef(A);
137  }
138 
139  if (transposeP) {
140  transposer_type transposer(rcpFromRef(P));
141  Pprime = transposer.createTranspose(transposeParams);
142  } else {
143  Pprime = rcpFromRef(P);
144  }
145 
146  // Check size compatibility
147  global_size_t numRCols = R.getDomainMap()->getGlobalNumElements();
148  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
149  global_size_t numPCols = P.getDomainMap()->getGlobalNumElements();
150  global_size_t Rleft = transposeR ? numRCols : R.getGlobalNumRows();
151  global_size_t Rright = transposeR ? R.getGlobalNumRows() : numRCols;
152  global_size_t Aleft = transposeA ? numACols : A.getGlobalNumRows();
153  global_size_t Aright = transposeA ? A.getGlobalNumRows() : numACols;
154  global_size_t Pleft = transposeP ? numPCols : P.getGlobalNumRows();
155  global_size_t Pright = transposeP ? P.getGlobalNumRows() : numPCols;
156  TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
157  prefix << "ERROR, inner dimensions of op(R) and op(A) "
158  "must match for matrix-matrix product. op(R) is "
159  << Rleft << "x" << Rright << ", op(A) is " << Aleft << "x" << Aright);
160 
161  TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
162  prefix << "ERROR, inner dimensions of op(A) and op(P) "
163  "must match for matrix-matrix product. op(A) is "
164  << Aleft << "x" << Aright << ", op(P) is " << Pleft << "x" << Pright);
165 
166  // The result matrix Ac must at least have a row-map that reflects the correct
167  // row-size. Don't check the number of columns because rectangular matrices
168  // which were constructed with only one map can still end up having the
169  // correct capacity and dimensions when filled.
170  TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.getGlobalNumRows(), std::runtime_error,
171  prefix << "ERROR, dimensions of result Ac must "
172  "match dimensions of op(R) * op(A) * op(P). Ac has "
173  << Ac.getGlobalNumRows()
174  << " rows, should have at least " << Rleft << std::endl);
175 
176  // It doesn't matter whether Ac is already Filled or not. If it is already
177  // Filled, it must have space allocated for the positions that will be
178  // referenced in forming Ac = op(R)*op(A)*op(P). If it doesn't have enough space,
179  // we'll error out later when trying to store result values.
180 
181  // CGB: However, matrix must be in active-fill
182  TEUCHOS_TEST_FOR_EXCEPT(Ac.isFillActive() == false);
183 
184  // We're going to need to import remotely-owned sections of P if
185  // more than one processor is performing this run, depending on the scenario.
186  int numProcs = P.getComm()->getSize();
187 
188  // Declare a couple of structs that will be used to hold views of the data
189  // of R, A and P, to be used for fast access during the matrix-multiplication.
190  crs_matrix_struct_type Rview;
191  crs_matrix_struct_type Aview;
192  crs_matrix_struct_type Pview;
193 
194  RCP<const map_type> targetMap_R = Rprime->getRowMap();
195  RCP<const map_type> targetMap_A = Aprime->getRowMap();
196  RCP<const map_type> targetMap_P = Pprime->getRowMap();
197 
198 #ifdef HAVE_TPETRA_MMM_TIMINGS
199  MM = Teuchos::null;
200  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All I&X"))));
201 #endif
202 
203  // Now import any needed remote rows and populate the Aview struct
204  // NOTE: We assert that an import isn't needed --- since we do the transpose
205  // above to handle that.
206  RCP<const import_type> dummyImporter;
207 
208  if (!(transposeR && &R == &P))
209  MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter, true, label, params);
210 
211  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
212 
213  // We will also need local access to all rows of P that correspond to the
214  // column-map of op(A).
215  if (numProcs > 1)
216  targetMap_P = Aprime->getColMap();
217 
218  // Import any needed remote rows and populate the Pview struct.
219  MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(), false, label, params);
220 
221  RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
222 
223  bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
224  if (needs_final_export)
225  Actemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Pprime->getColMap(), 0));
226  else
227  Actemp = rcp(&Ac, false); // don't allow deallocation
228 
229 #ifdef HAVE_TPETRA_MMM_TIMINGS
230  MM = Teuchos::null;
231  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Multiply"))));
232 #endif
233 
234  // Call the appropriate method to perform the actual multiplication.
235  if (call_FillComplete_on_result && newFlag) {
236  if (transposeR && &R == &P)
237  MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
238  else
239  MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
240  } else if (call_FillComplete_on_result) {
241  if (transposeR && &R == &P)
242  MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
243  else
244  MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
245  } else {
246  // mfh 27 Sep 2016: Is this the "slow" case? This
247  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
248  // thread-parallel inserts, but that may take some effort.
249  // CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(Ac);
250 
251  // MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
252 
253  // #ifdef HAVE_TPETRA_MMM_TIMINGS
254  // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All FillComplete"))));
255  // #endif
256  // if (call_FillComplete_on_result) {
257  // // We'll call FillComplete on the C matrix before we exit, and give it a
258  // // domain-map and a range-map.
259  // // The domain-map will be the domain-map of B, unless
260  // // op(B)==transpose(B), in which case the range-map of B will be used.
261  // // The range-map will be the range-map of A, unless op(A)==transpose(A),
262  // // in which case the domain-map of A will be used.
263  // if (!C.isFillComplete())
264  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
265  // }
266  // Not implemented
267  if (transposeR && &R == &P)
268  MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
269  else
270  MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
271  }
272 
273  if (needs_final_export) {
274 #ifdef HAVE_TPETRA_MMM_TIMINGS
275  MM = Teuchos::null;
276  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP exportAndFillComplete"))));
277 #endif
278  Teuchos::ParameterList labelList;
279  labelList.set("Timer Label", label);
280  Teuchos::ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params", false);
281 
282  RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
283  bool isMM = true;
284  bool overrideAllreduce = false;
285  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
286  if (!params.is_null()) {
287  Teuchos::ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params", false);
288  mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
289  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
290  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
291  if (mm_optimization_core_count2 < mm_optimization_core_count) mm_optimization_core_count = mm_optimization_core_count2;
292  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete", false);
293  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck", false);
294 
295  labelList.set("compute global constants", params->get("compute global constants", true));
296  }
297  labelList_subList.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count, "Core Count above which the optimized neighbor discovery is used");
298 
299  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete", isMM,
300  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
301  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
302 
303  export_type exporter = export_type(*Pprime->getGraph()->getImporter());
304  Actemp->exportAndFillComplete(Acprime,
305  exporter,
306  Acprime->getDomainMap(),
307  Acprime->getRangeMap(),
308  rcp(&labelList, false));
309  }
310 #ifdef HAVE_TPETRA_MMM_STATISTICS
311  printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label + std::string(" RAP MMM"));
312 #endif
313 }
314 
315 } // End namespace TripleMatrixMultiply
316 
317 namespace MMdetails {
318 
319 // Kernel method for computing the local portion of Ac = R*A*P
320 template <class Scalar,
321  class LocalOrdinal,
322  class GlobalOrdinal,
323  class Node>
324 void mult_R_A_P_newmatrix(
325  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
326  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
327  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
328  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
329  const std::string& label,
330  const Teuchos::RCP<Teuchos::ParameterList>& params) {
331  using Teuchos::Array;
332  using Teuchos::ArrayRCP;
333  using Teuchos::ArrayView;
334  using Teuchos::RCP;
335  using Teuchos::rcp;
336 
337  // typedef Scalar SC; // unused
338  typedef LocalOrdinal LO;
339  typedef GlobalOrdinal GO;
340  typedef Node NO;
341 
342  typedef Import<LO, GO, NO> import_type;
343  typedef Map<LO, GO, NO> map_type;
344 
345  // Kokkos typedefs
346  typedef typename map_type::local_map_type local_map_type;
348  typedef typename KCRS::StaticCrsGraphType graph_t;
349  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
350  typedef typename NO::execution_space execution_space;
351  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
352  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
353 
354 #ifdef HAVE_TPETRA_MMM_TIMINGS
355  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
356  using Teuchos::TimeMonitor;
357  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
358 #endif
359  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
360 
361  // Build the final importer / column map, hash table lookups for Ac
362  RCP<const import_type> Cimport;
363  RCP<const map_type> Ccolmap;
364  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
365  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
366  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
367  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
368  local_map_type Irowmap_local;
369  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
370  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
371  local_map_type Icolmap_local;
372  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
373 
374  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
375  // indices of B, to local column indices of Ac. (B and Ac have the
376  // same number of columns.) The kernel uses this, instead of
377  // copying the entire input matrix B and converting its column
378  // indices to those of C.
379  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
380 
381  if (Pview.importMatrix.is_null()) {
382  // mfh 27 Sep 2016: B has no "remotes," so P and C have the same column Map.
383  Cimport = Pimport;
384  Ccolmap = Pview.colMap;
385  const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
386  // Pcol2Ccol is trivial
387  Kokkos::parallel_for(
388  "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
389  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
390  KOKKOS_LAMBDA(const LO i) {
391  Pcol2Ccol(i) = i;
392  });
393  } else {
394  // mfh 27 Sep 2016: P has "remotes," so we need to build the
395  // column Map of C, as well as C's Import object (from its domain
396  // Map to its column Map). C's column Map is the union of the
397  // column Maps of (the local part of) P, and the "remote" part of
398  // P. Ditto for the Import. We have optimized this "setUnion"
399  // operation on Import objects and Maps.
400 
401  // Choose the right variant of setUnion
402  if (!Pimport.is_null() && !Iimport.is_null()) {
403  Cimport = Pimport->setUnion(*Iimport);
404  } else if (!Pimport.is_null() && Iimport.is_null()) {
405  Cimport = Pimport->setUnion();
406  } else if (Pimport.is_null() && !Iimport.is_null()) {
407  Cimport = Iimport->setUnion();
408  } else {
409  throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
410  }
411  Ccolmap = Cimport->getTargetMap();
412 
413  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
414  // in general. We should get rid of it in order to reduce
415  // communication costs of sparse matrix-matrix multiply.
416  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
417  std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
418 
419  // NOTE: This is not efficient and should be folded into setUnion
420  //
421  // mfh 27 Sep 2016: What the above comment means, is that the
422  // setUnion operation on Import objects could also compute these
423  // local index - to - local index look-up tables.
424  Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
425  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
426  Kokkos::parallel_for(
427  "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement", range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
428  Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
429  });
430  Kokkos::parallel_for(
431  "Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement", range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
432  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
433  });
434  }
435 
436  // Replace the column map
437  //
438  // mfh 27 Sep 2016: We do this because C was originally created
439  // without a column Map. Now we have its column Map.
440  Ac.replaceColMap(Ccolmap);
441 
442  // mfh 27 Sep 2016: Construct tables that map from local column
443  // indices of A, to local row indices of either B_local (the locally
444  // owned part of B), or B_remote (the "imported" remote part of B).
445  //
446  // For column index Aik in row i of A, if the corresponding row of B
447  // exists in the local part of B ("orig") (which I'll call B_local),
448  // then targetMapToOrigRow[Aik] is the local index of that row of B.
449  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
450  //
451  // For column index Aik in row i of A, if the corresponding row of B
452  // exists in the remote part of B ("Import") (which I'll call
453  // B_remote), then targetMapToImportRow[Aik] is the local index of
454  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
455  // (a flag value).
456 
457  // Run through all the hash table lookups once and for all
458  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
459  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
460  Kokkos::parallel_for(
461  "Tpetra::mult_R_A_P_newmatrix::construct_tables", range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
462  GO aidx = Acolmap_local.getGlobalElement(i);
463  LO P_LID = Prowmap_local.getLocalElement(aidx);
464  if (P_LID != LO_INVALID) {
465  targetMapToOrigRow(i) = P_LID;
466  targetMapToImportRow(i) = LO_INVALID;
467  } else {
468  LO I_LID = Irowmap_local.getLocalElement(aidx);
469  targetMapToOrigRow(i) = LO_INVALID;
470  targetMapToImportRow(i) = I_LID;
471  }
472  });
473 
474  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
475  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
476  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
477  mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
478  targetMapToOrigRow, targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
479  Ac, Cimport, label, params);
480 }
481 
482 // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
483 template <class Scalar,
484  class LocalOrdinal,
485  class GlobalOrdinal,
486  class Node>
487 void mult_R_A_P_reuse(
488  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
489  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
490  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
491  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
492  const std::string& label,
493  const Teuchos::RCP<Teuchos::ParameterList>& params) {
494  using Teuchos::Array;
495  using Teuchos::ArrayRCP;
496  using Teuchos::ArrayView;
497  using Teuchos::RCP;
498  using Teuchos::rcp;
499 
500  // typedef Scalar SC; // unused
501  typedef LocalOrdinal LO;
502  typedef GlobalOrdinal GO;
503  typedef Node NO;
504 
505  typedef Import<LO, GO, NO> import_type;
506  typedef Map<LO, GO, NO> map_type;
507 
508  // Kokkos typedefs
509  typedef typename map_type::local_map_type local_map_type;
511  typedef typename KCRS::StaticCrsGraphType graph_t;
512  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
513  typedef typename NO::execution_space execution_space;
514  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
515  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
516 
517 #ifdef HAVE_TPETRA_MMM_TIMINGS
518  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
519  using Teuchos::TimeMonitor;
520  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
521 #endif
522  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
523 
524  // Build the final importer / column map, hash table lookups for Ac
525  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
526  RCP<const map_type> Ccolmap = Ac.getColMap();
527  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
528  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
529  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
530  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
531  local_map_type Irowmap_local;
532  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
533  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
534  local_map_type Icolmap_local;
535  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
536  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
537 
538  // Build the final importer / column map, hash table lookups for C
539  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
540  {
541  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
542  // So, column map of C may be a strict subset of the column map of B
543  Kokkos::parallel_for(
544  range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
545  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
546  });
547 
548  if (!Pview.importMatrix.is_null()) {
549  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
550  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
551 
552  Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
553  Kokkos::parallel_for(
554  range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
555  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
556  });
557  }
558  }
559 
560  // Run through all the hash table lookups once and for all
561  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
562  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
563  Kokkos::parallel_for(
564  range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
565  GO aidx = Acolmap_local.getGlobalElement(i);
566  LO B_LID = Prowmap_local.getLocalElement(aidx);
567  if (B_LID != LO_INVALID) {
568  targetMapToOrigRow(i) = B_LID;
569  targetMapToImportRow(i) = LO_INVALID;
570  } else {
571  LO I_LID = Irowmap_local.getLocalElement(aidx);
572  targetMapToOrigRow(i) = LO_INVALID;
573  targetMapToImportRow(i) = I_LID;
574  }
575  });
576 
577  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
578  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
579  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
580  mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
581  targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
582  Ac, Cimport, label, params);
583 }
584 
585 // Kernel method for computing the local portion of Ac = R*A*P
586 template <class Scalar,
587  class LocalOrdinal,
588  class GlobalOrdinal,
589  class Node>
590 void mult_PT_A_P_newmatrix(
591  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
592  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
593  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
594  const std::string& label,
595  const Teuchos::RCP<Teuchos::ParameterList>& params) {
596  using Teuchos::Array;
597  using Teuchos::ArrayRCP;
598  using Teuchos::ArrayView;
599  using Teuchos::RCP;
600  using Teuchos::rcp;
601 
602  // typedef Scalar SC; // unused
603  typedef LocalOrdinal LO;
604  typedef GlobalOrdinal GO;
605  typedef Node NO;
606 
607  typedef Import<LO, GO, NO> import_type;
608  typedef Map<LO, GO, NO> map_type;
609 
610  // Kokkos typedefs
611  typedef typename map_type::local_map_type local_map_type;
613  typedef typename KCRS::StaticCrsGraphType graph_t;
614  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
615  typedef typename NO::execution_space execution_space;
616  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
617  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
618 
619 #ifdef HAVE_TPETRA_MMM_TIMINGS
620  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
621  using Teuchos::TimeMonitor;
622  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP M5 Cmap")))));
623 #endif
624  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
625 
626  // Build the final importer / column map, hash table lookups for Ac
627  RCP<const import_type> Cimport;
628  RCP<const map_type> Ccolmap;
629  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
630  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
631  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
632  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
633  local_map_type Irowmap_local;
634  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
635  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
636  local_map_type Icolmap_local;
637  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
638 
639  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
640  // indices of B, to local column indices of Ac. (B and Ac have the
641  // same number of columns.) The kernel uses this, instead of
642  // copying the entire input matrix B and converting its column
643  // indices to those of C.
644  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
645 
646  if (Pview.importMatrix.is_null()) {
647  // mfh 27 Sep 2016: B has no "remotes," so P and C have the same column Map.
648  Cimport = Pimport;
649  Ccolmap = Pview.colMap;
650  const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
651  // Pcol2Ccol is trivial
652  Kokkos::parallel_for(
653  "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
654  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
655  KOKKOS_LAMBDA(const LO i) {
656  Pcol2Ccol(i) = i;
657  });
658  } else {
659  // mfh 27 Sep 2016: P has "remotes," so we need to build the
660  // column Map of C, as well as C's Import object (from its domain
661  // Map to its column Map). C's column Map is the union of the
662  // column Maps of (the local part of) P, and the "remote" part of
663  // P. Ditto for the Import. We have optimized this "setUnion"
664  // operation on Import objects and Maps.
665 
666  // Choose the right variant of setUnion
667  if (!Pimport.is_null() && !Iimport.is_null()) {
668  Cimport = Pimport->setUnion(*Iimport);
669  } else if (!Pimport.is_null() && Iimport.is_null()) {
670  Cimport = Pimport->setUnion();
671  } else if (Pimport.is_null() && !Iimport.is_null()) {
672  Cimport = Iimport->setUnion();
673  } else {
674  throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
675  }
676  Ccolmap = Cimport->getTargetMap();
677 
678  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
679  // in general. We should get rid of it in order to reduce
680  // communication costs of sparse matrix-matrix multiply.
681  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
682  std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
683 
684  // NOTE: This is not efficient and should be folded into setUnion
685  //
686  // mfh 27 Sep 2016: What the above comment means, is that the
687  // setUnion operation on Import objects could also compute these
688  // local index - to - local index look-up tables.
689  Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
690  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
691  Kokkos::parallel_for(
692  "Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement", range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
693  Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
694  });
695  Kokkos::parallel_for(
696  "Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement", range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
697  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
698  });
699  }
700 
701  // Replace the column map
702  //
703  // mfh 27 Sep 2016: We do this because C was originally created
704  // without a column Map. Now we have its column Map.
705  Ac.replaceColMap(Ccolmap);
706 
707  // mfh 27 Sep 2016: Construct tables that map from local column
708  // indices of A, to local row indices of either B_local (the locally
709  // owned part of B), or B_remote (the "imported" remote part of B).
710  //
711  // For column index Aik in row i of A, if the corresponding row of B
712  // exists in the local part of B ("orig") (which I'll call B_local),
713  // then targetMapToOrigRow[Aik] is the local index of that row of B.
714  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
715  //
716  // For column index Aik in row i of A, if the corresponding row of B
717  // exists in the remote part of B ("Import") (which I'll call
718  // B_remote), then targetMapToImportRow[Aik] is the local index of
719  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
720  // (a flag value).
721 
722  // Run through all the hash table lookups once and for all
723  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
724  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
725 
726  Kokkos::parallel_for(
727  "Tpetra::mult_R_A_P_newmatrix::construct_tables", range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
728  GO aidx = Acolmap_local.getGlobalElement(i);
729  LO P_LID = Prowmap_local.getLocalElement(aidx);
730  if (P_LID != LO_INVALID) {
731  targetMapToOrigRow(i) = P_LID;
732  targetMapToImportRow(i) = LO_INVALID;
733  } else {
734  LO I_LID = Irowmap_local.getLocalElement(aidx);
735  targetMapToOrigRow(i) = LO_INVALID;
736  targetMapToImportRow(i) = I_LID;
737  }
738  });
739 
740  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
741  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
742  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
743  mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
744  targetMapToOrigRow, targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
745  Ac, Cimport, label, params);
746 }
747 
748 // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
749 template <class Scalar,
750  class LocalOrdinal,
751  class GlobalOrdinal,
752  class Node>
753 void mult_PT_A_P_reuse(
754  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
755  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
756  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
757  const std::string& label,
758  const Teuchos::RCP<Teuchos::ParameterList>& params) {
759  using Teuchos::Array;
760  using Teuchos::ArrayRCP;
761  using Teuchos::ArrayView;
762  using Teuchos::RCP;
763  using Teuchos::rcp;
764 
765  // typedef Scalar SC; // unused
766  typedef LocalOrdinal LO;
767  typedef GlobalOrdinal GO;
768  typedef Node NO;
769 
770  typedef Import<LO, GO, NO> import_type;
771  typedef Map<LO, GO, NO> map_type;
772 
773  // Kokkos typedefs
774  typedef typename map_type::local_map_type local_map_type;
776  typedef typename KCRS::StaticCrsGraphType graph_t;
777  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
778  typedef typename NO::execution_space execution_space;
779  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
780  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
781 
782 #ifdef HAVE_TPETRA_MMM_TIMINGS
783  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
784  using Teuchos::TimeMonitor;
785  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
786 #endif
787  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
788 
789  // Build the final importer / column map, hash table lookups for Ac
790  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
791  RCP<const map_type> Ccolmap = Ac.getColMap();
792  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
793  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
794  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
795  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
796  local_map_type Irowmap_local;
797  if (!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
798  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
799  local_map_type Icolmap_local;
800  if (!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
801  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
802 
803  // Build the final importer / column map, hash table lookups for C
804  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Pview.colMap->getLocalNumElements()), Icol2Ccol;
805  {
806  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
807  // So, column map of C may be a strict subset of the column map of B
808  Kokkos::parallel_for(
809  range_type(0, Pview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
810  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
811  });
812 
813  if (!Pview.importMatrix.is_null()) {
814  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
815  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
816 
817  Kokkos::resize(Icol2Ccol, Pview.importMatrix->getColMap()->getLocalNumElements());
818  Kokkos::parallel_for(
819  range_type(0, Pview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
820  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
821  });
822  }
823  }
824 
825  // Run through all the hash table lookups once and for all
826  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
827  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
828  Kokkos::parallel_for(
829  range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
830  GO aidx = Acolmap_local.getGlobalElement(i);
831  LO B_LID = Prowmap_local.getLocalElement(aidx);
832  if (B_LID != LO_INVALID) {
833  targetMapToOrigRow(i) = B_LID;
834  targetMapToImportRow(i) = LO_INVALID;
835  } else {
836  LO I_LID = Irowmap_local.getLocalElement(aidx);
837  targetMapToOrigRow(i) = LO_INVALID;
838  targetMapToImportRow(i) = I_LID;
839  }
840  });
841 
842  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
843  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
844  KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::
845  mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
846  targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
847  Ac, Cimport, label, params);
848 }
849 
850 /*********************************************************************************************************/
851 // RAP NewMatrix Kernel wrappers (Default non-threaded version)
852 // Computes R * A * P -> Ac using classic Gustavson approach
853 template <class Scalar,
854  class LocalOrdinal,
855  class GlobalOrdinal,
856  class Node,
857  class LocalOrdinalViewType>
858 void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
859  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
860  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
861  const LocalOrdinalViewType& Acol2Prow_dev,
862  const LocalOrdinalViewType& Acol2PIrow_dev,
863  const LocalOrdinalViewType& Pcol2Accol_dev,
864  const LocalOrdinalViewType& PIcol2Accol_dev,
865  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
866  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
867  const std::string& label,
868  const Teuchos::RCP<Teuchos::ParameterList>& params) {
869 #ifdef HAVE_TPETRA_MMM_TIMINGS
870  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
871  using Teuchos::TimeMonitor;
872  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
873 #endif
874 
875  using Teuchos::Array;
876  using Teuchos::ArrayRCP;
877  using Teuchos::ArrayView;
878  using Teuchos::RCP;
879  using Teuchos::rcp;
880 
881  // Lots and lots of typedefs
883  typedef typename KCRS::StaticCrsGraphType graph_t;
884  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
885  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
886  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
887  typedef typename KCRS::values_type::non_const_type scalar_view_t;
888 
889  typedef Scalar SC;
890  typedef LocalOrdinal LO;
891  typedef GlobalOrdinal GO;
892  typedef Node NO;
893  typedef Map<LO, GO, NO> map_type;
894  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
895  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
896  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
897 
898  // Sizes
899  RCP<const map_type> Accolmap = Ac.getColMap();
900  size_t m = Rview.origMatrix->getLocalNumRows();
901  size_t n = Accolmap->getLocalNumElements();
902  size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
903 
904  // Routine runs on host; have to put arguments on host, too
905  auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
906  Acol2Prow_dev);
907  auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
908  Acol2PIrow_dev);
909  auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
910  Pcol2Accol_dev);
911  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
912  PIcol2Accol_dev);
913 
914  // Grab the Kokkos::SparseCrsMatrices & inner stuff
915  const auto Amat = Aview.origMatrix->getLocalMatrixHost();
916  const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
917  const auto Rmat = Rview.origMatrix->getLocalMatrixHost();
918 
919  auto Arowptr = Amat.graph.row_map;
920  auto Prowptr = Pmat.graph.row_map;
921  auto Rrowptr = Rmat.graph.row_map;
922  const auto Acolind = Amat.graph.entries;
923  const auto Pcolind = Pmat.graph.entries;
924  const auto Rcolind = Rmat.graph.entries;
925  const auto Avals = Amat.values;
926  const auto Pvals = Pmat.values;
927  const auto Rvals = Rmat.values;
928 
929  typename c_lno_view_t::host_mirror_type::const_type Irowptr;
930  typename lno_nnz_view_t::host_mirror_type Icolind;
931  typename scalar_view_t::host_mirror_type Ivals;
932  if (!Pview.importMatrix.is_null()) {
933  auto lclP = Pview.importMatrix->getLocalMatrixHost();
934  Irowptr = lclP.graph.row_map;
935  Icolind = lclP.graph.entries;
936  Ivals = lclP.values;
937  p_max_nnz_per_row = std::max(p_max_nnz_per_row, Pview.importMatrix->getLocalMaxNumRowEntries());
938  }
939 
940 #ifdef HAVE_TPETRA_MMM_TIMINGS
941  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore - Compare"))));
942 #endif
943 
944  // Classic csr assembly (low memory edition)
945  //
946  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
947  // The method loops over rows of R, and may resize after processing
948  // each row. Chris Siefert says that this reflects experience in
949  // ML; for the non-threaded case, ML found it faster to spend less
950  // effort on estimation and risk an occasional reallocation.
951  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
952  typename lno_view_t::host_mirror_type Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"), m + 1);
953  typename lno_nnz_view_t::host_mirror_type Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"), CSR_alloc);
954  typename scalar_view_t::host_mirror_type Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"), CSR_alloc);
955 
956  // mfh 27 Sep 2016: The ac_status array is an implementation detail
957  // of the local sparse matrix-matrix multiply routine.
958 
959  // The status array will contain the index into colind where this entry was last deposited.
960  // ac_status[i] < nnz - not in the row yet
961  // ac_status[i] >= nnz - this is the entry where you can find the data
962  // We start with this filled with INVALID's indicating that there are no entries yet.
963  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
964  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
965  Array<size_t> ac_status(n, ST_INVALID);
966 
967  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
968  // routine. The routine computes Ac := R * A * (P_local + P_remote).
969  //
970  // For column index Aik in row i of A, Acol2Prow[Aik] tells
971  // you whether the corresponding row of P belongs to P_local
972  // ("orig") or P_remote ("Import").
973 
974  // For each row of R
975  size_t nnz = 0, nnz_old = 0;
976  for (size_t i = 0; i < m; i++) {
977  // mfh 27 Sep 2016: m is the number of rows in the input matrix R
978  // on the calling process.
979  Crowptr[i] = nnz;
980 
981  // mfh 27 Sep 2016: For each entry of R in the current row of R
982  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
983  LO k = Rcolind[kk]; // local column index of current entry of R
984  const SC Rik = Rvals[kk]; // value of current entry of R
985  if (Rik == SC_ZERO)
986  continue; // skip explicitly stored zero values in R
987  // For each entry of A in the current row of A
988  for (size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
989  LO l = Acolind[ll]; // local column index of current entry of A
990  const SC Akl = Avals[ll]; // value of current entry of A
991  if (Akl == SC_ZERO)
992  continue; // skip explicitly stored zero values in A
993 
994  if (Acol2Prow[l] != LO_INVALID) {
995  // mfh 27 Sep 2016: If the entry of Acol2Prow
996  // corresponding to the current entry of A is populated, then
997  // the corresponding row of P is in P_local (i.e., it lives on
998  // the calling process).
999 
1000  // Local matrix
1001  size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1002 
1003  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1004  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
1005  LO j = Pcolind[jj];
1006  LO Acj = Pcol2Accol[j];
1007  SC Plj = Pvals[jj];
1008 
1009  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1010 #ifdef HAVE_TPETRA_DEBUG
1011  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1012  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1013  std::runtime_error,
1014  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1015 #endif
1016  // New entry
1017  ac_status[Acj] = nnz;
1018  Ccolind[nnz] = Acj;
1019  Cvals[nnz] = Rik * Akl * Plj;
1020  nnz++;
1021  } else {
1022  Cvals[ac_status[Acj]] += Rik * Akl * Plj;
1023  }
1024  }
1025  } else {
1026  // mfh 27 Sep 2016: If the entry of Acol2PRow
1027  // corresponding to the current entry of A is NOT populated (has
1028  // a flag "invalid" value), then the corresponding row of P is
1029  // in P_remote (i.e., it does not live on the calling process).
1030 
1031  // Remote matrix
1032  size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1033  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
1034  LO j = Icolind[jj];
1035  LO Acj = PIcol2Accol[j];
1036  SC Plj = Ivals[jj];
1037 
1038  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1039 #ifdef HAVE_TPETRA_DEBUG
1040  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1041  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1042  std::runtime_error,
1043  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1044 #endif
1045  // New entry
1046  ac_status[Acj] = nnz;
1047  Ccolind[nnz] = Acj;
1048  Cvals[nnz] = Rik * Akl * Plj;
1049  nnz++;
1050  } else {
1051  Cvals[ac_status[Acj]] += Rik * Akl * Plj;
1052  }
1053  }
1054  }
1055  }
1056  }
1057  // Resize for next pass if needed
1058  if (nnz + n > CSR_alloc) {
1059  CSR_alloc *= 2;
1060  Kokkos::resize(Ccolind, CSR_alloc);
1061  Kokkos::resize(Cvals, CSR_alloc);
1062  }
1063  nnz_old = nnz;
1064  }
1065 
1066  Crowptr[m] = nnz;
1067 
1068  // Downward resize
1069  Kokkos::resize(Ccolind, nnz);
1070  Kokkos::resize(Cvals, nnz);
1071 
1072 #ifdef HAVE_TPETRA_MMM_TIMINGS
1073  MM = Teuchos::null;
1074  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1075 #endif
1076  auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
1077  typename KCRS::device_type(), Crowptr);
1078  auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
1079  typename KCRS::device_type(), Ccolind);
1080  auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
1081  typename KCRS::device_type(), Cvals);
1082 
1083  // Final sort & set of CRS arrays
1084  if (params.is_null() || params->get("sort entries", true))
1085  Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
1086  Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);
1087 
1088 #ifdef HAVE_TPETRA_MMM_TIMINGS
1089  MM = Teuchos::null;
1090  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1091 #endif
1092 
1093  // Final FillComplete
1094  //
1095  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1096  // Import (from domain Map to column Map) construction (which costs
1097  // lots of communication) by taking the previously constructed
1098  // Import object. We should be able to do this without interfering
1099  // with the implementation of the local part of sparse matrix-matrix
1100  // multply above.
1101  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1102  labelList->set("Timer Label", label);
1103  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
1104  RCP<const Export<LO, GO, NO> > dummyExport;
1105  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1106  Rview.origMatrix->getRangeMap(),
1107  Acimport,
1108  dummyExport,
1109  labelList);
1110 }
1111 
1112 /*********************************************************************************************************/
1113 // RAP Reuse Kernel wrappers (Default non-threaded version)
1114 // Computes R * A * P -> Ac using reuse Gustavson
1115 template <class Scalar,
1116  class LocalOrdinal,
1117  class GlobalOrdinal,
1118  class Node,
1119  class LocalOrdinalViewType>
1120 void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
1121  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1122  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1123  const LocalOrdinalViewType& Acol2Prow_dev,
1124  const LocalOrdinalViewType& Acol2PIrow_dev,
1125  const LocalOrdinalViewType& Pcol2Accol_dev,
1126  const LocalOrdinalViewType& PIcol2Accol_dev,
1127  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1128  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
1129  const std::string& label,
1130  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1131 #ifdef HAVE_TPETRA_MMM_TIMINGS
1132  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1133  using Teuchos::TimeMonitor;
1134  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore"))));
1135 #endif
1136 
1137  using Teuchos::Array;
1138  using Teuchos::ArrayRCP;
1139  using Teuchos::ArrayView;
1140  using Teuchos::RCP;
1141  using Teuchos::rcp;
1142 
1143  // Lots and lots of typedefs
1144  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
1145  typedef typename KCRS::StaticCrsGraphType graph_t;
1146  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1147  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1148  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1149 
1150  typedef Scalar SC;
1151  typedef LocalOrdinal LO;
1152  typedef GlobalOrdinal GO;
1153  typedef Node NO;
1154  typedef Map<LO, GO, NO> map_type;
1155  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1156  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1157  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1158 
1159  // Sizes
1160  RCP<const map_type> Accolmap = Ac.getColMap();
1161  size_t m = Rview.origMatrix->getLocalNumRows();
1162  size_t n = Accolmap->getLocalNumElements();
1163  size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
1164 
1165  // Routine runs on host; have to put arguments on host, too
1166  auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1167  Acol2Prow_dev);
1168  auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1169  Acol2PIrow_dev);
1170  auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1171  Pcol2Accol_dev);
1172  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1173  PIcol2Accol_dev);
1174 
1175  // Grab the Kokkos::SparseCrsMatrices & inner stuff
1176  const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
1177  const KCRS& Pmat = Pview.origMatrix->getLocalMatrixHost();
1178  const KCRS& Rmat = Rview.origMatrix->getLocalMatrixHost();
1179  const KCRS& Cmat = Ac.getLocalMatrixHost();
1180 
1181  c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
1182  const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
1183  const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
1184  scalar_view_t Cvals = Cmat.values;
1185 
1186  c_lno_view_t Irowptr;
1187  lno_nnz_view_t Icolind;
1188  scalar_view_t Ivals;
1189  if (!Pview.importMatrix.is_null()) {
1190  auto lclP = Pview.importMatrix->getLocalMatrixHost();
1191  Irowptr = lclP.graph.row_map;
1192  Icolind = lclP.graph.entries;
1193  Ivals = lclP.values;
1194  p_max_nnz_per_row = std::max(p_max_nnz_per_row, Pview.importMatrix->getLocalMaxNumRowEntries());
1195  }
1196 
1197 #ifdef HAVE_TPETRA_MMM_TIMINGS
1198  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore - Compare"))));
1199 #endif
1200 
1201  // mfh 27 Sep 2016: The ac_status array is an implementation detail
1202  // of the local sparse matrix-matrix multiply routine.
1203 
1204  // The status array will contain the index into colind where this entry was last deposited.
1205  // ac_status[i] < nnz - not in the row yet
1206  // ac_status[i] >= nnz - this is the entry where you can find the data
1207  // We start with this filled with INVALID's indicating that there are no entries yet.
1208  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1209  Array<size_t> ac_status(n, ST_INVALID);
1210 
1211  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1212  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1213  //
1214  // For column index Aik in row i of A, Acol2Prow[Aik] tells
1215  // you whether the corresponding row of P belongs to P_local
1216  // ("orig") or P_remote ("Import").
1217 
1218  // Necessary until following UVM host accesses are changed - for example Crowptr
1219  // Also probably needed in mult_R_A_P_newmatrix_kernel_wrapper - did not demonstrate this in test failure yet
1220  Kokkos::fence();
1221 
1222  // For each row of R
1223  size_t OLD_ip = 0, CSR_ip = 0;
1224  for (size_t i = 0; i < m; i++) {
1225  // First fill the c_status array w/ locations where we're allowed to
1226  // generate nonzeros for this row
1227  OLD_ip = Crowptr[i];
1228  CSR_ip = Crowptr[i + 1];
1229  for (size_t k = OLD_ip; k < CSR_ip; k++) {
1230  ac_status[Ccolind[k]] = k;
1231 
1232  // Reset values in the row of C
1233  Cvals[k] = SC_ZERO;
1234  }
1235 
1236  // mfh 27 Sep 2016: For each entry of R in the current row of R
1237  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
1238  LO k = Rcolind[kk]; // local column index of current entry of R
1239  const SC Rik = Rvals[kk]; // value of current entry of R
1240  if (Rik == SC_ZERO)
1241  continue; // skip explicitly stored zero values in R
1242  // For each entry of A in the current row of A
1243  for (size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
1244  LO l = Acolind[ll]; // local column index of current entry of A
1245  const SC Akl = Avals[ll]; // value of current entry of A
1246  if (Akl == SC_ZERO)
1247  continue; // skip explicitly stored zero values in A
1248 
1249  if (Acol2Prow[l] != LO_INVALID) {
1250  // mfh 27 Sep 2016: If the entry of Acol2Prow
1251  // corresponding to the current entry of A is populated, then
1252  // the corresponding row of P is in P_local (i.e., it lives on
1253  // the calling process).
1254 
1255  // Local matrix
1256  size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1257 
1258  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1259  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
1260  LO j = Pcolind[jj];
1261  LO Cij = Pcol2Accol[j];
1262  SC Plj = Pvals[jj];
1263 
1264  TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1265  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
1266  << "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1267 
1268  Cvals[ac_status[Cij]] += Rik * Akl * Plj;
1269  }
1270  } else {
1271  // mfh 27 Sep 2016: If the entry of Acol2PRow
1272  // corresponding to the current entry of A is NOT populated (has
1273  // a flag "invalid" value), then the corresponding row of P is
1274  // in P_remote (i.e., it does not live on the calling process).
1275 
1276  // Remote matrix
1277  size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1278  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
1279  LO j = Icolind[jj];
1280  LO Cij = PIcol2Accol[j];
1281  SC Plj = Ivals[jj];
1282 
1283  TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1284  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
1285  << "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1286 
1287  Cvals[ac_status[Cij]] += Rik * Akl * Plj;
1288  }
1289  }
1290  }
1291  }
1292  }
1293 
1294 #ifdef HAVE_TPETRA_MMM_TIMINGS
1295  auto MM3 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse ESFC"))));
1296 #endif
1297 
1298  Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());
1299 }
1300 
1301 /*********************************************************************************************************/
1302 // PT_A_P NewMatrix Kernel wrappers (Default, general, non-threaded version)
1303 // Computes P.T * A * P -> Ac
1304 template <class Scalar,
1305  class LocalOrdinal,
1306  class GlobalOrdinal,
1307  class Node,
1308  class LocalOrdinalViewType>
1309 void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1310  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1311  const LocalOrdinalViewType& Acol2Prow,
1312  const LocalOrdinalViewType& Acol2PIrow,
1313  const LocalOrdinalViewType& Pcol2Accol,
1314  const LocalOrdinalViewType& PIcol2Accol,
1315  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1316  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
1317  const std::string& label,
1318  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1319 #ifdef HAVE_TPETRA_MMM_TIMINGS
1320  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1321  using Teuchos::TimeMonitor;
1322  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1323 #endif
1324 
1325  // We don't need a kernel-level PTAP, we just transpose here
1326  typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
1327  transposer_type transposer(Pview.origMatrix, label + std::string("XP: "));
1328 
1329  using Teuchos::ParameterList;
1330  using Teuchos::RCP;
1331  RCP<ParameterList> transposeParams(new ParameterList);
1332  transposeParams->set("sort", false);
1333 
1334  if (!params.is_null()) {
1335  transposeParams->set("compute global constants",
1336  params->get("compute global constants: temporaries",
1337  false));
1338  }
1339  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1340  transposer.createTransposeLocal(transposeParams);
1341  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1342  Rview.origMatrix = Ptrans;
1343 
1344  mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview, Acol2Prow, Acol2PIrow, Pcol2Accol, PIcol2Accol, Ac, Acimport, label, params);
1345 }
1346 
1347 /*********************************************************************************************************/
1348 // PT_A_P Reuse Kernel wrappers (Default, general, non-threaded version)
1349 // Computes P.T * A * P -> Ac
1350 template <class Scalar,
1351  class LocalOrdinal,
1352  class GlobalOrdinal,
1353  class Node,
1354  class LocalOrdinalViewType>
1355 void KernelWrappers3<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1356  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1357  const LocalOrdinalViewType& Acol2Prow,
1358  const LocalOrdinalViewType& Acol2PIrow,
1359  const LocalOrdinalViewType& Pcol2Accol,
1360  const LocalOrdinalViewType& PIcol2Accol,
1361  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1362  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
1363  const std::string& label,
1364  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1365 #ifdef HAVE_TPETRA_MMM_TIMINGS
1366  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1367  using Teuchos::TimeMonitor;
1368  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1369 #endif
1370 
1371  // We don't need a kernel-level PTAP, we just transpose here
1372  typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
1373  transposer_type transposer(Pview.origMatrix, label + std::string("XP: "));
1374 
1375  using Teuchos::ParameterList;
1376  using Teuchos::RCP;
1377  RCP<ParameterList> transposeParams(new ParameterList);
1378  transposeParams->set("sort", false);
1379 
1380  if (!params.is_null()) {
1381  transposeParams->set("compute global constants",
1382  params->get("compute global constants: temporaries",
1383  false));
1384  }
1385  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1386  transposer.createTransposeLocal(transposeParams);
1387  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1388  Rview.origMatrix = Ptrans;
1389 
1390  mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview, Acol2Prow, Acol2PIrow, Pcol2Accol, PIcol2Accol, Ac, Acimport, label, params);
1391 }
1392 
1393 /*********************************************************************************************************/
1394 // PT_A_P NewMatrix Kernel wrappers (Default non-threaded version)
1395 // Computes P.T * A * P -> Ac using a 2-pass algorithm.
1396 // This turned out to be slower on SerialNode, but it might still be helpful when going to Kokkos, so I left it in.
1397 // Currently, this implementation never gets called.
1398 template <class Scalar,
1399  class LocalOrdinal,
1400  class GlobalOrdinal,
1401  class Node>
1402 void KernelWrappers3MMM<Scalar, LocalOrdinal, GlobalOrdinal, Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1403  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1404  const Teuchos::Array<LocalOrdinal>& Acol2PRow,
1405  const Teuchos::Array<LocalOrdinal>& Acol2PRowImport,
1406  const Teuchos::Array<LocalOrdinal>& Pcol2Accol,
1407  const Teuchos::Array<LocalOrdinal>& PIcol2Accol,
1408  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1409  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > Acimport,
1410  const std::string& label,
1411  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1412 #ifdef HAVE_TPETRA_MMM_TIMINGS
1413  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1414  using Teuchos::TimeMonitor;
1415  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
1416 #endif
1417 
1418  using Teuchos::Array;
1419  using Teuchos::ArrayRCP;
1420  using Teuchos::ArrayView;
1421  using Teuchos::RCP;
1422  using Teuchos::rcp;
1423 
1424  typedef Scalar SC;
1425  typedef LocalOrdinal LO;
1426  typedef GlobalOrdinal GO;
1427  typedef Node NO;
1428  typedef RowMatrixTransposer<SC, LO, GO, NO> transposer_type;
1429  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1430  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1431 
1432  // number of rows on the process of the fine matrix
1433  // size_t m = Pview.origMatrix->getLocalNumRows();
1434  // number of rows on the process of the coarse matrix
1435  size_t n = Ac.getRowMap()->getLocalNumElements();
1436  LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1437 
1438  // Get Data Pointers
1439  ArrayRCP<size_t> Acrowptr_RCP;
1440  ArrayRCP<LO> Accolind_RCP;
1441  ArrayRCP<SC> Acvals_RCP;
1442 
1443  // mfh 27 Sep 2016: get the three CSR arrays
1444  // out of the CrsMatrix. This code computes R * A * (P_local +
1445  // P_remote), where P_local contains the locally owned rows of P,
1446  // and P_remote the (previously Import'ed) remote rows of P.
1447 
1448  auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1449  auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1450  auto Avals = Aview.origMatrix->getLocalValuesHost(
1451  Tpetra::Access::ReadOnly);
1452  auto Prowptr = Pview.origMatrix->getLocalRowPtrsHost();
1453  auto Pcolind = Pview.origMatrix->getLocalIndicesHost();
1454  auto Pvals = Pview.origMatrix->getLocalValuesHost(
1455  Tpetra::Access::ReadOnly);
1456  decltype(Prowptr) Irowptr;
1457  decltype(Pcolind) Icolind;
1458  decltype(Pvals) Ivals;
1459 
1460  if (!Pview.importMatrix.is_null()) {
1461  Irowptr = Pview.importMatrix->getLocalRowPtrsHost();
1462  Icolind = Pview.importMatrix->getLocalIndicesHost();
1463  Ivals = Pview.importMatrix->getLocalValuesHost(
1464  Tpetra::Access::ReadOnly);
1465  }
1466 
1467  // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1468  // where Teuchos::ArrayRCP::operator[] may be slower than
1469  // Teuchos::ArrayView::operator[].
1470 
1471  // For efficiency
1472  ArrayView<size_t> Acrowptr;
1473  ArrayView<LO> Accolind;
1474  ArrayView<SC> Acvals;
1475 
1477  // In a first pass, determine the graph of Ac.
1479 
1481  // Get the graph of Ac. This gets the local transpose of P,
1482  // then loops over R, A, P to get the graph of Ac.
1484 
1485 #ifdef HAVE_TPETRA_MMM_TIMINGS
1486  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
1487 #endif
1488 
1490  // Get the local transpose of the graph of P by locally transposing
1491  // all of P
1492 
1493  transposer_type transposer(Pview.origMatrix, label + std::string("XP: "));
1494 
1495  using Teuchos::ParameterList;
1496  RCP<ParameterList> transposeParams(new ParameterList);
1497  transposeParams->set("sort", false);
1498  if (!params.is_null()) {
1499  transposeParams->set("compute global constants",
1500  params->get("compute global constants: temporaries",
1501  false));
1502  }
1503  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1504  transposer.createTransposeLocal(transposeParams);
1505 
1506  auto Rrowptr = Ptrans->getLocalRowPtrsHost();
1507  auto Rcolind = Ptrans->getLocalIndicesHost();
1508  auto Rvals = Ptrans->getLocalValuesHost(Tpetra::Access::ReadOnly);
1509 
1511  // Construct graph
1512 
1513 #ifdef HAVE_TPETRA_MMM_TIMINGS
1514  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP graph"))));
1515 #endif
1516 
1517  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1518  Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1519 
1520  size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1521  size_t nnzPerRowA = 100;
1522  if (Aview.origMatrix->getLocalNumEntries() > 0)
1523  nnzPerRowA = Aview.origMatrix->getLocalNumEntries() / Aview.origMatrix->getLocalNumRows();
1524  Acrowptr_RCP.resize(n + 1);
1525  Acrowptr = Acrowptr_RCP();
1526  Accolind_RCP.resize(nnz_alloc);
1527  Accolind = Accolind_RCP();
1528 
1529  size_t nnz = 0, nnz_old = 0;
1530  for (size_t i = 0; i < n; i++) {
1531  // mfh 27 Sep 2016: m is the number of rows in the input matrix R
1532  // on the calling process.
1533  Acrowptr[i] = nnz;
1534 
1535  // mfh 27 Sep 2016: For each entry of R in the current row of R
1536  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i + 1]; kk++) {
1537  LO k = Rcolind[kk]; // local column index of current entry of R
1538  // For each entry of A in the current row of A
1539  for (size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
1540  LO l = Acolind[ll]; // local column index of current entry of A
1541 
1542  if (Acol2PRow[l] != LO_INVALID) {
1543  // mfh 27 Sep 2016: If the entry of Acol2PRow
1544  // corresponding to the current entry of A is populated, then
1545  // the corresponding row of P is in P_local (i.e., it lives on
1546  // the calling process).
1547 
1548  // Local matrix
1549  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1550 
1551  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1552  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
1553  LO j = Pcolind[jj];
1554  LO Acj = Pcol2Accol[j];
1555 
1556  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1557  // New entry
1558  ac_status[Acj] = nnz;
1559  Accolind[nnz] = Acj;
1560  nnz++;
1561  }
1562  }
1563  } else {
1564  // mfh 27 Sep 2016: If the entry of Acol2PRow
1565  // corresponding to the current entry of A is NOT populated (has
1566  // a flag "invalid" value), then the corresponding row of P is
1567  // in P_remote (i.e., it does not live on the calling process).
1568 
1569  // Remote matrix
1570  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1571  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
1572  LO j = Icolind[jj];
1573  LO Acj = PIcol2Accol[j];
1574 
1575  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1576  // New entry
1577  ac_status[Acj] = nnz;
1578  Accolind[nnz] = Acj;
1579  nnz++;
1580  }
1581  }
1582  }
1583  }
1584  }
1585  // Resize for next pass if needed
1586  // cag: Maybe we can do something more subtle here, and not double
1587  // the size right away.
1588  if (nnz + std::max(5 * nnzPerRowA, n) > nnz_alloc) {
1589  nnz_alloc *= 2;
1590  nnz_alloc = std::max(nnz_alloc, nnz + std::max(5 * nnzPerRowA, n));
1591  Accolind_RCP.resize(nnz_alloc);
1592  Accolind = Accolind_RCP();
1593  Acvals_RCP.resize(nnz_alloc);
1594  Acvals = Acvals_RCP();
1595  }
1596  nnz_old = nnz;
1597  }
1598  Acrowptr[n] = nnz;
1599 
1600  // Downward resize
1601  Accolind_RCP.resize(nnz);
1602  Accolind = Accolind_RCP();
1603 
1604  // Allocate Acvals
1605  Acvals_RCP.resize(nnz, SC_ZERO);
1606  Acvals = Acvals_RCP();
1607 
1609  // In a second pass, enter the values into Acvals.
1611 
1612 #ifdef HAVE_TPETRA_MMM_TIMINGS
1613  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
1614 #endif
1615 
1616  for (size_t k = 0; k < n; k++) {
1617  for (size_t ii = Prowptr[k]; ii < Prowptr[k + 1]; ii++) {
1618  LO i = Pcolind[ii];
1619  const SC Pki = Pvals[ii];
1620  for (size_t ll = Arowptr[k]; ll < Arowptr[k + 1]; ll++) {
1621  LO l = Acolind[ll];
1622  const SC Akl = Avals[ll];
1623  if (Akl == 0.)
1624  continue;
1625  if (Acol2PRow[l] != LO_INVALID) {
1626  // mfh 27 Sep 2016: If the entry of Acol2PRow
1627  // corresponding to the current entry of A is populated, then
1628  // the corresponding row of P is in P_local (i.e., it lives on
1629  // the calling process).
1630 
1631  // Local matrix
1632  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1633  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl + 1]; jj++) {
1634  LO j = Pcolind[jj];
1635  LO Acj = Pcol2Accol[j];
1636  size_t pp;
1637  for (pp = Acrowptr[i]; pp < Acrowptr[i + 1]; pp++)
1638  if (Accolind[pp] == Acj)
1639  break;
1640  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1641  // std::runtime_error, "problem with Ac column indices");
1642  Acvals[pp] += Pki * Akl * Pvals[jj];
1643  }
1644  } else {
1645  // mfh 27 Sep 2016: If the entry of Acol2PRow
1646  // corresponding to the current entry of A NOT populated (has
1647  // a flag "invalid" value), then the corresponding row of P is
1648  // in P_remote (i.e., it does not live on the calling process).
1649 
1650  // Remote matrix
1651  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1652  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il + 1]; jj++) {
1653  LO j = Icolind[jj];
1654  LO Acj = PIcol2Accol[j];
1655  size_t pp;
1656  for (pp = Acrowptr[i]; pp < Acrowptr[i + 1]; pp++)
1657  if (Accolind[pp] == Acj)
1658  break;
1659  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1660  // std::runtime_error, "problem with Ac column indices");
1661  Acvals[pp] += Pki * Akl * Ivals[jj];
1662  }
1663  }
1664  }
1665  }
1666  }
1667 
1668 #ifdef HAVE_TPETRA_MMM_TIMINGS
1669  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
1670 #endif
1671 
1672  // Final sort & set of CRS arrays
1673  //
1674  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1675  // matrix-matrix multiply routine sort the entries for us?
1676  Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1677 
1678  // mfh 27 Sep 2016: This just sets pointers.
1679  Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1680 
1681 #ifdef HAVE_TPETRA_MMM_TIMINGS
1682  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
1683 #endif
1684 
1685  // Final FillComplete
1686  //
1687  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1688  // Import (from domain Map to column Map) construction (which costs
1689  // lots of communication) by taking the previously constructed
1690  // Import object. We should be able to do this without interfering
1691  // with the implementation of the local part of sparse matrix-matrix
1692  // multply above.
1693  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1694  labelList->set("Timer Label", label);
1695  // labelList->set("Sort column Map ghost GIDs")
1696  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
1697  RCP<const Export<LO, GO, NO> > dummyExport;
1698  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1699  Pview.origMatrix->getDomainMap(),
1700  Acimport,
1701  dummyExport, labelList);
1702 }
1703 
1704 } // namespace MMdetails
1705 
1706 } // End namespace Tpetra
1707 //
1708 // Explicit instantiation macro
1709 //
1710 // Must be expanded from within the Tpetra namespace!
1711 //
1712 
1713 #define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR, LO, GO, NODE) \
1714  \
1715  template void TripleMatrixMultiply::MultiplyRAP( \
1716  const CrsMatrix<SCALAR, LO, GO, NODE>& R, \
1717  bool transposeR, \
1718  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
1719  bool transposeA, \
1720  const CrsMatrix<SCALAR, LO, GO, NODE>& P, \
1721  bool transposeP, \
1722  CrsMatrix<SCALAR, LO, GO, NODE>& Ac, \
1723  bool call_FillComplete_on_result, \
1724  const std::string& label, \
1725  const Teuchos::RCP<Teuchos::ParameterList>& params);
1726 
1727 #endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isFillActive() const
Whether the matrix is not fill complete.
size_t global_size_t
Global size_t object.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
Utility functions for packing and unpacking sparse matrix entries.
void MultiplyRAP(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &R, bool transposeR, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &P, bool transposeP, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Ac, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.