Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_TripleMatrixMultiply_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
11 #define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
12 
14 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" //for UnmanagedView
15 #include "Teuchos_VerboseObject.hpp"
16 #include "Teuchos_Array.hpp"
17 #include "Tpetra_Util.hpp"
18 #include "Tpetra_ConfigDefs.hpp"
19 #include "Tpetra_CrsMatrix.hpp"
21 #include "Tpetra_RowMatrixTransposer.hpp"
22 #include "Tpetra_ConfigDefs.hpp"
23 #include "Tpetra_Map.hpp"
24 #include "Tpetra_Export.hpp"
25 #include "Tpetra_Import_Util.hpp"
26 #include "Tpetra_Import_Util2.hpp"
27 #include <algorithm>
28 #include <cmath>
29 #include "Teuchos_FancyOStream.hpp"
30 // #include "KokkosSparse_spgemm.hpp"
31 
32 
40 /*********************************************************************************************************/
41 // Include the architecture-specific kernel partial specializations here
42 // NOTE: This needs to be outside all namespaces
43 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
44 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
45 #include "TpetraExt_MatrixMatrix_HIP.hpp"
46 #include "TpetraExt_MatrixMatrix_SYCL.hpp"
47 
48 namespace Tpetra {
49 
50  namespace TripleMatrixMultiply{
51 
52  //
53  // This method forms the matrix-matrix product Ac = op(R) * op(A) * op(P), where
54  // op(A) == A if transposeA is false,
55  // op(A) == A^T if transposeA is true,
56  // and similarly for op(R) and op(P).
57  //
58  template <class Scalar,
59  class LocalOrdinal,
60  class GlobalOrdinal,
61  class Node>
64  bool transposeR,
66  bool transposeA,
68  bool transposeP,
70  bool call_FillComplete_on_result,
71  const std::string& label,
72  const Teuchos::RCP<Teuchos::ParameterList>& params)
73  {
74  using Teuchos::null;
75  using Teuchos::RCP;
76  typedef Scalar SC;
77  typedef LocalOrdinal LO;
78  typedef GlobalOrdinal GO;
79  typedef Node NO;
80  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
81  typedef Import<LO,GO,NO> import_type;
82  typedef Export<LO,GO,NO> export_type;
83  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
84  typedef Map<LO,GO,NO> map_type;
85  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
86 
87 #ifdef HAVE_TPETRA_MMM_TIMINGS
88  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
89  using Teuchos::TimeMonitor;
90  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Setup"))));
91 #endif
92 
93  const std::string prefix = "TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
94 
95  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
96 
97  // The input matrices R, A and P must both be fillComplete.
98  TEUCHOS_TEST_FOR_EXCEPTION(!R.isFillComplete(), std::runtime_error, prefix << "Matrix R is not fill complete.");
99  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
100  TEUCHOS_TEST_FOR_EXCEPTION(!P.isFillComplete(), std::runtime_error, prefix << "Matrix P is not fill complete.");
101 
102  // If transposeA is true, then Rprime will be the transpose of R
103  // (computed explicitly via RowMatrixTransposer). Otherwise, Rprime
104  // will just be a pointer to R.
105  RCP<const crs_matrix_type> Rprime = null;
106  // If transposeA is true, then Aprime will be the transpose of A
107  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
108  // will just be a pointer to A.
109  RCP<const crs_matrix_type> Aprime = null;
110  // If transposeB is true, then Pprime will be the transpose of P
111  // (computed explicitly via RowMatrixTransposer). Otherwise, Pprime
112  // will just be a pointer to P.
113  RCP<const crs_matrix_type> Pprime = null;
114 
115  // Is this a "clean" matrix?
116  //
117  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
118  // locally nor globally indexed, then it was empty. I don't like
119  // this, because the most straightforward implementation presumes
120  // lazy allocation of indices. However, historical precedent
121  // demands that we keep around this predicate as a way to test
122  // whether the matrix is empty.
123  const bool newFlag = !Ac.getGraph()->isLocallyIndexed() && !Ac.getGraph()->isGloballyIndexed();
124 
125  using Teuchos::ParameterList;
126  RCP<ParameterList> transposeParams (new ParameterList);
127  transposeParams->set ("sort", false);
128 
129  if (transposeR && &R != &P) {
130  transposer_type transposer(rcpFromRef (R));
131  Rprime = transposer.createTranspose (transposeParams);
132  } else {
133  Rprime = rcpFromRef(R);
134  }
135 
136  if (transposeA) {
137  transposer_type transposer(rcpFromRef (A));
138  Aprime = transposer.createTranspose (transposeParams);
139  } else {
140  Aprime = rcpFromRef(A);
141  }
142 
143  if (transposeP) {
144  transposer_type transposer(rcpFromRef (P));
145  Pprime = transposer.createTranspose (transposeParams);
146  } else {
147  Pprime = rcpFromRef(P);
148  }
149 
150  // Check size compatibility
151  global_size_t numRCols = R.getDomainMap()->getGlobalNumElements();
152  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
153  global_size_t numPCols = P.getDomainMap()->getGlobalNumElements();
154  global_size_t Rleft = transposeR ? numRCols : R.getGlobalNumRows();
155  global_size_t Rright = transposeR ? R.getGlobalNumRows() : numRCols;
156  global_size_t Aleft = transposeA ? numACols : A.getGlobalNumRows();
157  global_size_t Aright = transposeA ? A.getGlobalNumRows() : numACols;
158  global_size_t Pleft = transposeP ? numPCols : P.getGlobalNumRows();
159  global_size_t Pright = transposeP ? P.getGlobalNumRows() : numPCols;
160  TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
161  prefix << "ERROR, inner dimensions of op(R) and op(A) "
162  "must match for matrix-matrix product. op(R) is "
163  << Rleft << "x" << Rright << ", op(A) is "<< Aleft << "x" << Aright);
164 
165  TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
166  prefix << "ERROR, inner dimensions of op(A) and op(P) "
167  "must match for matrix-matrix product. op(A) is "
168  << Aleft << "x" << Aright << ", op(P) is "<< Pleft << "x" << Pright);
169 
170  // The result matrix Ac must at least have a row-map that reflects the correct
171  // row-size. Don't check the number of columns because rectangular matrices
172  // which were constructed with only one map can still end up having the
173  // correct capacity and dimensions when filled.
174  TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.getGlobalNumRows(), std::runtime_error,
175  prefix << "ERROR, dimensions of result Ac must "
176  "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.getGlobalNumRows()
177  << " rows, should have at least " << Rleft << std::endl);
178 
179  // It doesn't matter whether Ac is already Filled or not. If it is already
180  // Filled, it must have space allocated for the positions that will be
181  // referenced in forming Ac = op(R)*op(A)*op(P). If it doesn't have enough space,
182  // we'll error out later when trying to store result values.
183 
184  // CGB: However, matrix must be in active-fill
185  TEUCHOS_TEST_FOR_EXCEPT( Ac.isFillActive() == false );
186 
187  // We're going to need to import remotely-owned sections of P if
188  // more than one processor is performing this run, depending on the scenario.
189  int numProcs = P.getComm()->getSize();
190 
191  // Declare a couple of structs that will be used to hold views of the data
192  // of R, A and P, to be used for fast access during the matrix-multiplication.
193  crs_matrix_struct_type Rview;
194  crs_matrix_struct_type Aview;
195  crs_matrix_struct_type Pview;
196 
197  RCP<const map_type> targetMap_R = Rprime->getRowMap();
198  RCP<const map_type> targetMap_A = Aprime->getRowMap();
199  RCP<const map_type> targetMap_P = Pprime->getRowMap();
200 
201 #ifdef HAVE_TPETRA_MMM_TIMINGS
202  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All I&X"))));
203 #endif
204 
205  // Now import any needed remote rows and populate the Aview struct
206  // NOTE: We assert that an import isn't needed --- since we do the transpose
207  // above to handle that.
208  RCP<const import_type> dummyImporter;
209 
210  if (!(transposeR && &R == &P))
211  MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter, true, label, params);
212 
213  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
214 
215  // We will also need local access to all rows of P that correspond to the
216  // column-map of op(A).
217  if (numProcs > 1)
218  targetMap_P = Aprime->getColMap();
219 
220  // Import any needed remote rows and populate the Pview struct.
221  MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(), false, label, params);
222 
223 
224  RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
225 
226  bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
227  if (needs_final_export)
228  Actemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Pprime->getColMap(),0));
229  else
230  Actemp = rcp(&Ac,false);// don't allow deallocation
231 
232 #ifdef HAVE_TPETRA_MMM_TIMINGS
233  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Multiply"))));
234 #endif
235 
236  // Call the appropriate method to perform the actual multiplication.
237  if (call_FillComplete_on_result && newFlag) {
238  if (transposeR && &R == &P)
239  MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
240  else
241  MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
242  } else if (call_FillComplete_on_result) {
243  if (transposeR && &R == &P)
244  MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
245  else
246  MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
247  } else {
248  // mfh 27 Sep 2016: Is this the "slow" case? This
249  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
250  // thread-parallel inserts, but that may take some effort.
251  // CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(Ac);
252 
253  // MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
254 
255  // #ifdef HAVE_TPETRA_MMM_TIMINGS
256  // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All FillComplete"))));
257  // #endif
258  // if (call_FillComplete_on_result) {
259  // // We'll call FillComplete on the C matrix before we exit, and give it a
260  // // domain-map and a range-map.
261  // // The domain-map will be the domain-map of B, unless
262  // // op(B)==transpose(B), in which case the range-map of B will be used.
263  // // The range-map will be the range-map of A, unless op(A)==transpose(A),
264  // // in which case the domain-map of A will be used.
265  // if (!C.isFillComplete())
266  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
267  // }
268  // Not implemented
269  if (transposeR && &R == &P)
270  MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
271  else
272  MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
273  }
274 
275  if (needs_final_export) {
276 #ifdef HAVE_TPETRA_MMM_TIMINGS
277  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP exportAndFillComplete"))));
278 #endif
279  Teuchos::ParameterList labelList;
280  labelList.set("Timer Label", label);
281  Teuchos::ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
282 
283  RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
284  bool isMM = true;
285  bool overrideAllreduce = false;
286  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
287  if(!params.is_null()) {
288  Teuchos::ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
289  mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
290  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
291  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
292  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
293  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
294  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
295 
296  labelList.set("compute global constants",params->get("compute global constants",true));
297  }
298  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
299 
300  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM,
301  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
302  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
303 
304  export_type exporter = export_type(*Pprime->getGraph()->getImporter());
305  Actemp->exportAndFillComplete(Acprime,
306  exporter,
307  Acprime->getDomainMap(),
308  Acprime->getRangeMap(),
309  rcp(&labelList,false));
310 
311  }
312 #ifdef HAVE_TPETRA_MMM_STATISTICS
313  printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label+std::string(" RAP MMM"));
314 #endif
315 
316  }
317 
318 
319  } //End namespace TripleMatrixMultiply
320 
321  namespace MMdetails{
322 
323  // Kernel method for computing the local portion of Ac = R*A*P
324  template<class Scalar,
325  class LocalOrdinal,
326  class GlobalOrdinal,
327  class Node>
328  void mult_R_A_P_newmatrix(
329  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
330  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
331  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
332  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
333  const std::string& label,
334  const Teuchos::RCP<Teuchos::ParameterList>& params)
335  {
336  using Teuchos::Array;
337  using Teuchos::ArrayRCP;
338  using Teuchos::ArrayView;
339  using Teuchos::RCP;
340  using Teuchos::rcp;
341 
342  //typedef Scalar SC; // unused
343  typedef LocalOrdinal LO;
344  typedef GlobalOrdinal GO;
345  typedef Node NO;
346 
347  typedef Import<LO,GO,NO> import_type;
348  typedef Map<LO,GO,NO> map_type;
349 
350  // Kokkos typedefs
351  typedef typename map_type::local_map_type local_map_type;
353  typedef typename KCRS::StaticCrsGraphType graph_t;
354  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
355  typedef typename NO::execution_space execution_space;
356  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
357  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
358 
359 #ifdef HAVE_TPETRA_MMM_TIMINGS
360  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
361  using Teuchos::TimeMonitor;
362  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
363 #endif
364  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
365 
366  // Build the final importer / column map, hash table lookups for Ac
367  RCP<const import_type> Cimport;
368  RCP<const map_type> Ccolmap;
369  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
370  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
371  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
372  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
373  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
374  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
375  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
376 
377 
378  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
379  // indices of B, to local column indices of Ac. (B and Ac have the
380  // same number of columns.) The kernel uses this, instead of
381  // copying the entire input matrix B and converting its column
382  // indices to those of C.
383  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
384 
385  if (Pview.importMatrix.is_null()) {
386  // mfh 27 Sep 2016: B has no "remotes," so P and C have the same column Map.
387  Cimport = Pimport;
388  Ccolmap = Pview.colMap;
389  const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
390  // Pcol2Ccol is trivial
391  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
392  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
393  KOKKOS_LAMBDA(const LO i) {
394  Pcol2Ccol(i) = i;
395  });
396  }
397  else {
398  // mfh 27 Sep 2016: P has "remotes," so we need to build the
399  // column Map of C, as well as C's Import object (from its domain
400  // Map to its column Map). C's column Map is the union of the
401  // column Maps of (the local part of) P, and the "remote" part of
402  // P. Ditto for the Import. We have optimized this "setUnion"
403  // operation on Import objects and Maps.
404 
405  // Choose the right variant of setUnion
406  if (!Pimport.is_null() && !Iimport.is_null()) {
407  Cimport = Pimport->setUnion(*Iimport);
408  }
409  else if (!Pimport.is_null() && Iimport.is_null()) {
410  Cimport = Pimport->setUnion();
411  }
412  else if (Pimport.is_null() && !Iimport.is_null()) {
413  Cimport = Iimport->setUnion();
414  }
415  else {
416  throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
417  }
418  Ccolmap = Cimport->getTargetMap();
419 
420  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
421  // in general. We should get rid of it in order to reduce
422  // communication costs of sparse matrix-matrix multiply.
423  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
424  std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
425 
426  // NOTE: This is not efficient and should be folded into setUnion
427  //
428  // mfh 27 Sep 2016: What the above comment means, is that the
429  // setUnion operation on Import objects could also compute these
430  // local index - to - local index look-up tables.
431  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
432  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
433  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
434  Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
435  });
436  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
437  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
438  });
439 
440  }
441 
442  // Replace the column map
443  //
444  // mfh 27 Sep 2016: We do this because C was originally created
445  // without a column Map. Now we have its column Map.
446  Ac.replaceColMap(Ccolmap);
447 
448  // mfh 27 Sep 2016: Construct tables that map from local column
449  // indices of A, to local row indices of either B_local (the locally
450  // owned part of B), or B_remote (the "imported" remote part of B).
451  //
452  // For column index Aik in row i of A, if the corresponding row of B
453  // exists in the local part of B ("orig") (which I'll call B_local),
454  // then targetMapToOrigRow[Aik] is the local index of that row of B.
455  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
456  //
457  // For column index Aik in row i of A, if the corresponding row of B
458  // exists in the remote part of B ("Import") (which I'll call
459  // B_remote), then targetMapToImportRow[Aik] is the local index of
460  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
461  // (a flag value).
462 
463  // Run through all the hash table lookups once and for all
464  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
465  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
466  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
467  GO aidx = Acolmap_local.getGlobalElement(i);
468  LO P_LID = Prowmap_local.getLocalElement(aidx);
469  if (P_LID != LO_INVALID) {
470  targetMapToOrigRow(i) = P_LID;
471  targetMapToImportRow(i) = LO_INVALID;
472  } else {
473  LO I_LID = Irowmap_local.getLocalElement(aidx);
474  targetMapToOrigRow(i) = LO_INVALID;
475  targetMapToImportRow(i) = I_LID;
476  }
477  });
478 
479  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
480  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
481  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
482  mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
483  targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
484  Ac, Cimport, label, params);
485  }
486 
487 
488  // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
489  template<class Scalar,
490  class LocalOrdinal,
491  class GlobalOrdinal,
492  class Node>
493  void mult_R_A_P_reuse(
494  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
495  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
496  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
497  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
498  const std::string& label,
499  const Teuchos::RCP<Teuchos::ParameterList>& params)
500  {
501  using Teuchos::Array;
502  using Teuchos::ArrayRCP;
503  using Teuchos::ArrayView;
504  using Teuchos::RCP;
505  using Teuchos::rcp;
506 
507  //typedef Scalar SC; // unused
508  typedef LocalOrdinal LO;
509  typedef GlobalOrdinal GO;
510  typedef Node NO;
511 
512  typedef Import<LO,GO,NO> import_type;
513  typedef Map<LO,GO,NO> map_type;
514 
515  // Kokkos typedefs
516  typedef typename map_type::local_map_type local_map_type;
518  typedef typename KCRS::StaticCrsGraphType graph_t;
519  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
520  typedef typename NO::execution_space execution_space;
521  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
522  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
523 
524 #ifdef HAVE_TPETRA_MMM_TIMINGS
525  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
526  using Teuchos::TimeMonitor;
527  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
528 #endif
529  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
530 
531  // Build the final importer / column map, hash table lookups for Ac
532  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
533  RCP<const map_type> Ccolmap = Ac.getColMap();
534  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
535  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
536  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
537  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
538  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
539  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
540  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
541  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
542 
543  // Build the final importer / column map, hash table lookups for C
544  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
545  {
546  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
547  // So, column map of C may be a strict subset of the column map of B
548  Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
549  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
550  });
551 
552  if (!Pview.importMatrix.is_null()) {
553  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
554  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
555 
556  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
557  Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
558  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
559  });
560  }
561  }
562 
563  // Run through all the hash table lookups once and for all
564  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
565  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
566  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
567  GO aidx = Acolmap_local.getGlobalElement(i);
568  LO B_LID = Prowmap_local.getLocalElement(aidx);
569  if (B_LID != LO_INVALID) {
570  targetMapToOrigRow(i) = B_LID;
571  targetMapToImportRow(i) = LO_INVALID;
572  } else {
573  LO I_LID = Irowmap_local.getLocalElement(aidx);
574  targetMapToOrigRow(i) = LO_INVALID;
575  targetMapToImportRow(i) = I_LID;
576 
577  }
578  });
579 
580  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
581  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
582  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
583  mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
584  targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
585  Ac, Cimport, label, params);
586  }
587 
588 
589  // Kernel method for computing the local portion of Ac = R*A*P
590  template<class Scalar,
591  class LocalOrdinal,
592  class GlobalOrdinal,
593  class Node>
594  void mult_PT_A_P_newmatrix(
595  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
596  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
597  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
598  const std::string& label,
599  const Teuchos::RCP<Teuchos::ParameterList>& params)
600  {
601  using Teuchos::Array;
602  using Teuchos::ArrayRCP;
603  using Teuchos::ArrayView;
604  using Teuchos::RCP;
605  using Teuchos::rcp;
606 
607  //typedef Scalar SC; // unused
608  typedef LocalOrdinal LO;
609  typedef GlobalOrdinal GO;
610  typedef Node NO;
611 
612  typedef Import<LO,GO,NO> import_type;
613  typedef Map<LO,GO,NO> map_type;
614 
615  // Kokkos typedefs
616  typedef typename map_type::local_map_type local_map_type;
618  typedef typename KCRS::StaticCrsGraphType graph_t;
619  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
620  typedef typename NO::execution_space execution_space;
621  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
622  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
623 
624 #ifdef HAVE_TPETRA_MMM_TIMINGS
625  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
626  using Teuchos::TimeMonitor;
627  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP M5 Cmap")))));
628 #endif
629  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
630 
631  // Build the final importer / column map, hash table lookups for Ac
632  RCP<const import_type> Cimport;
633  RCP<const map_type> Ccolmap;
634  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
635  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
636  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
637  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
638  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
639  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
640  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
641 
642 
643  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
644  // indices of B, to local column indices of Ac. (B and Ac have the
645  // same number of columns.) The kernel uses this, instead of
646  // copying the entire input matrix B and converting its column
647  // indices to those of C.
648  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
649 
650  if (Pview.importMatrix.is_null()) {
651  // mfh 27 Sep 2016: B has no "remotes," so P and C have the same column Map.
652  Cimport = Pimport;
653  Ccolmap = Pview.colMap;
654  const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
655  // Pcol2Ccol is trivial
656  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
657  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
658  KOKKOS_LAMBDA(const LO i) {
659  Pcol2Ccol(i) = i;
660  });
661  }
662  else {
663  // mfh 27 Sep 2016: P has "remotes," so we need to build the
664  // column Map of C, as well as C's Import object (from its domain
665  // Map to its column Map). C's column Map is the union of the
666  // column Maps of (the local part of) P, and the "remote" part of
667  // P. Ditto for the Import. We have optimized this "setUnion"
668  // operation on Import objects and Maps.
669 
670  // Choose the right variant of setUnion
671  if (!Pimport.is_null() && !Iimport.is_null()) {
672  Cimport = Pimport->setUnion(*Iimport);
673  }
674  else if (!Pimport.is_null() && Iimport.is_null()) {
675  Cimport = Pimport->setUnion();
676  }
677  else if (Pimport.is_null() && !Iimport.is_null()) {
678  Cimport = Iimport->setUnion();
679  }
680  else {
681  throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
682  }
683  Ccolmap = Cimport->getTargetMap();
684 
685  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
686  // in general. We should get rid of it in order to reduce
687  // communication costs of sparse matrix-matrix multiply.
688  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
689  std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
690 
691  // NOTE: This is not efficient and should be folded into setUnion
692  //
693  // mfh 27 Sep 2016: What the above comment means, is that the
694  // setUnion operation on Import objects could also compute these
695  // local index - to - local index look-up tables.
696  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
697  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
698  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
699  Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
700  });
701  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
702  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
703  });
704 
705  }
706 
707  // Replace the column map
708  //
709  // mfh 27 Sep 2016: We do this because C was originally created
710  // without a column Map. Now we have its column Map.
711  Ac.replaceColMap(Ccolmap);
712 
713  // mfh 27 Sep 2016: Construct tables that map from local column
714  // indices of A, to local row indices of either B_local (the locally
715  // owned part of B), or B_remote (the "imported" remote part of B).
716  //
717  // For column index Aik in row i of A, if the corresponding row of B
718  // exists in the local part of B ("orig") (which I'll call B_local),
719  // then targetMapToOrigRow[Aik] is the local index of that row of B.
720  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
721  //
722  // For column index Aik in row i of A, if the corresponding row of B
723  // exists in the remote part of B ("Import") (which I'll call
724  // B_remote), then targetMapToImportRow[Aik] is the local index of
725  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
726  // (a flag value).
727 
728  // Run through all the hash table lookups once and for all
729  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
730  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
731 
732  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
733  GO aidx = Acolmap_local.getGlobalElement(i);
734  LO P_LID = Prowmap_local.getLocalElement(aidx);
735  if (P_LID != LO_INVALID) {
736  targetMapToOrigRow(i) = P_LID;
737  targetMapToImportRow(i) = LO_INVALID;
738  } else {
739  LO I_LID = Irowmap_local.getLocalElement(aidx);
740  targetMapToOrigRow(i) = LO_INVALID;
741  targetMapToImportRow(i) = I_LID;
742  }
743  });
744 
745  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
746  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
747  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
748  mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
749  targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
750  Ac, Cimport, label, params);
751  }
752 
753  // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
754  template<class Scalar,
755  class LocalOrdinal,
756  class GlobalOrdinal,
757  class Node>
758  void mult_PT_A_P_reuse(
759  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
760  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
761  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
762  const std::string& label,
763  const Teuchos::RCP<Teuchos::ParameterList>& params)
764  {
765  using Teuchos::Array;
766  using Teuchos::ArrayRCP;
767  using Teuchos::ArrayView;
768  using Teuchos::RCP;
769  using Teuchos::rcp;
770 
771  //typedef Scalar SC; // unused
772  typedef LocalOrdinal LO;
773  typedef GlobalOrdinal GO;
774  typedef Node NO;
775 
776  typedef Import<LO,GO,NO> import_type;
777  typedef Map<LO,GO,NO> map_type;
778 
779  // Kokkos typedefs
780  typedef typename map_type::local_map_type local_map_type;
782  typedef typename KCRS::StaticCrsGraphType graph_t;
783  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
784  typedef typename NO::execution_space execution_space;
785  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
786  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
787 
788 #ifdef HAVE_TPETRA_MMM_TIMINGS
789  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
790  using Teuchos::TimeMonitor;
791  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
792 #endif
793  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
794 
795  // Build the final importer / column map, hash table lookups for Ac
796  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
797  RCP<const map_type> Ccolmap = Ac.getColMap();
798  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
799  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
800  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
801  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
802  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
803  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
804  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
805  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
806 
807  // Build the final importer / column map, hash table lookups for C
808  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
809  {
810  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
811  // So, column map of C may be a strict subset of the column map of B
812  Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
813  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
814  });
815 
816  if (!Pview.importMatrix.is_null()) {
817  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
818  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
819 
820  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
821  Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
822  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
823  });
824  }
825  }
826 
827  // Run through all the hash table lookups once and for all
828  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
829  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
830  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
831  GO aidx = Acolmap_local.getGlobalElement(i);
832  LO B_LID = Prowmap_local.getLocalElement(aidx);
833  if (B_LID != LO_INVALID) {
834  targetMapToOrigRow(i) = B_LID;
835  targetMapToImportRow(i) = LO_INVALID;
836  } else {
837  LO I_LID = Irowmap_local.getLocalElement(aidx);
838  targetMapToOrigRow(i) = LO_INVALID;
839  targetMapToImportRow(i) = I_LID;
840 
841  }
842  });
843 
844  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
845  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
846  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
847  mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
848  targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
849  Ac, Cimport, label, params);
850  }
851 
852 
853  /*********************************************************************************************************/
854  // RAP NewMatrix Kernel wrappers (Default non-threaded version)
855  // Computes R * A * P -> Ac using classic Gustavson approach
856  template<class Scalar,
857  class LocalOrdinal,
858  class GlobalOrdinal,
859  class Node,
860  class LocalOrdinalViewType>
861  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
862  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
863  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
864  const LocalOrdinalViewType & Acol2Prow_dev,
865  const LocalOrdinalViewType & Acol2PIrow_dev,
866  const LocalOrdinalViewType & Pcol2Accol_dev,
867  const LocalOrdinalViewType & PIcol2Accol_dev,
868  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
869  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
870  const std::string& label,
871  const Teuchos::RCP<Teuchos::ParameterList>& params) {
872 #ifdef HAVE_TPETRA_MMM_TIMINGS
873  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
874  using Teuchos::TimeMonitor;
875  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
876 #endif
877 
878  using Teuchos::Array;
879  using Teuchos::ArrayRCP;
880  using Teuchos::ArrayView;
881  using Teuchos::RCP;
882  using Teuchos::rcp;
883 
884  // Lots and lots of typedefs
886  typedef typename KCRS::StaticCrsGraphType graph_t;
887  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
888  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
889  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
890  typedef typename KCRS::values_type::non_const_type scalar_view_t;
891 
892  typedef Scalar SC;
893  typedef LocalOrdinal LO;
894  typedef GlobalOrdinal GO;
895  typedef Node NO;
896  typedef Map<LO,GO,NO> map_type;
897  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
898  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
899  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
900 
901  // Sizes
902  RCP<const map_type> Accolmap = Ac.getColMap();
903  size_t m = Rview.origMatrix->getLocalNumRows();
904  size_t n = Accolmap->getLocalNumElements();
905  size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
906 
907  // Routine runs on host; have to put arguments on host, too
908  auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
909  Acol2Prow_dev);
910  auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
911  Acol2PIrow_dev);
912  auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
913  Pcol2Accol_dev);
914  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
915  PIcol2Accol_dev);
916 
917  // Grab the Kokkos::SparseCrsMatrices & inner stuff
918  const auto Amat = Aview.origMatrix->getLocalMatrixHost();
919  const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
920  const auto Rmat = Rview.origMatrix->getLocalMatrixHost();
921 
922  auto Arowptr = Amat.graph.row_map;
923  auto Prowptr = Pmat.graph.row_map;
924  auto Rrowptr = Rmat.graph.row_map;
925  const auto Acolind = Amat.graph.entries;
926  const auto Pcolind = Pmat.graph.entries;
927  const auto Rcolind = Rmat.graph.entries;
928  const auto Avals = Amat.values;
929  const auto Pvals = Pmat.values;
930  const auto Rvals = Rmat.values;
931 
932  typename c_lno_view_t::HostMirror::const_type Irowptr;
933  typename lno_nnz_view_t::HostMirror Icolind;
934  typename scalar_view_t::HostMirror Ivals;
935  if(!Pview.importMatrix.is_null()) {
936  auto lclP = Pview.importMatrix->getLocalMatrixHost();
937  Irowptr = lclP.graph.row_map;
938  Icolind = lclP.graph.entries;
939  Ivals = lclP.values;
940  p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getLocalMaxNumRowEntries());
941  }
942 
943 #ifdef HAVE_TPETRA_MMM_TIMINGS
944  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore - Compare"))));
945 #endif
946 
947  // Classic csr assembly (low memory edition)
948  //
949  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
950  // The method loops over rows of R, and may resize after processing
951  // each row. Chris Siefert says that this reflects experience in
952  // ML; for the non-threaded case, ML found it faster to spend less
953  // effort on estimation and risk an occasional reallocation.
954  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
955  typename lno_view_t::HostMirror Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
956  typename lno_nnz_view_t::HostMirror Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
957  typename scalar_view_t::HostMirror Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
958 
959  // mfh 27 Sep 2016: The ac_status array is an implementation detail
960  // of the local sparse matrix-matrix multiply routine.
961 
962  // The status array will contain the index into colind where this entry was last deposited.
963  // ac_status[i] < nnz - not in the row yet
964  // ac_status[i] >= nnz - this is the entry where you can find the data
965  // We start with this filled with INVALID's indicating that there are no entries yet.
966  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
967  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
968  Array<size_t> ac_status(n, ST_INVALID);
969 
970  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
971  // routine. The routine computes Ac := R * A * (P_local + P_remote).
972  //
973  // For column index Aik in row i of A, Acol2Prow[Aik] tells
974  // you whether the corresponding row of P belongs to P_local
975  // ("orig") or P_remote ("Import").
976 
977  // For each row of R
978  size_t nnz = 0, nnz_old = 0;
979  for (size_t i = 0; i < m; i++) {
980  // mfh 27 Sep 2016: m is the number of rows in the input matrix R
981  // on the calling process.
982  Crowptr[i] = nnz;
983 
984  // mfh 27 Sep 2016: For each entry of R in the current row of R
985  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
986  LO k = Rcolind[kk]; // local column index of current entry of R
987  const SC Rik = Rvals[kk]; // value of current entry of R
988  if (Rik == SC_ZERO)
989  continue; // skip explicitly stored zero values in R
990  // For each entry of A in the current row of A
991  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
992  LO l = Acolind[ll]; // local column index of current entry of A
993  const SC Akl = Avals[ll]; // value of current entry of A
994  if (Akl == SC_ZERO)
995  continue; // skip explicitly stored zero values in A
996 
997 
998  if (Acol2Prow[l] != LO_INVALID) {
999  // mfh 27 Sep 2016: If the entry of Acol2Prow
1000  // corresponding to the current entry of A is populated, then
1001  // the corresponding row of P is in P_local (i.e., it lives on
1002  // the calling process).
1003 
1004  // Local matrix
1005  size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1006 
1007  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1008  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1009  LO j = Pcolind[jj];
1010  LO Acj = Pcol2Accol[j];
1011  SC Plj = Pvals[jj];
1012 
1013  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1014 #ifdef HAVE_TPETRA_DEBUG
1015  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1016  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1017  std::runtime_error,
1018  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1019 #endif
1020  // New entry
1021  ac_status[Acj] = nnz;
1022  Ccolind[nnz] = Acj;
1023  Cvals[nnz] = Rik*Akl*Plj;
1024  nnz++;
1025  } else {
1026  Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1027  }
1028  }
1029  } else {
1030  // mfh 27 Sep 2016: If the entry of Acol2PRow
1031  // corresponding to the current entry of A is NOT populated (has
1032  // a flag "invalid" value), then the corresponding row of P is
1033  // in P_remote (i.e., it does not live on the calling process).
1034 
1035  // Remote matrix
1036  size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1037  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1038  LO j = Icolind[jj];
1039  LO Acj = PIcol2Accol[j];
1040  SC Plj = Ivals[jj];
1041 
1042  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1043 #ifdef HAVE_TPETRA_DEBUG
1044  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1045  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1046  std::runtime_error,
1047  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1048 #endif
1049  // New entry
1050  ac_status[Acj] = nnz;
1051  Ccolind[nnz] = Acj;
1052  Cvals[nnz] = Rik*Akl*Plj;
1053  nnz++;
1054  } else {
1055  Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1056  }
1057  }
1058  }
1059  }
1060  }
1061  // Resize for next pass if needed
1062  if (nnz + n > CSR_alloc) {
1063  CSR_alloc *= 2;
1064  Kokkos::resize(Ccolind,CSR_alloc);
1065  Kokkos::resize(Cvals,CSR_alloc);
1066  }
1067  nnz_old = nnz;
1068  }
1069 
1070  Crowptr[m] = nnz;
1071 
1072  // Downward resize
1073  Kokkos::resize(Ccolind,nnz);
1074  Kokkos::resize(Cvals,nnz);
1075 
1076 #ifdef HAVE_TPETRA_MMM_TIMINGS
1077  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1078 #endif
1079  auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
1080  typename KCRS::device_type(), Crowptr);
1081  auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
1082  typename KCRS::device_type(), Ccolind);
1083  auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
1084  typename KCRS::device_type(), Cvals);
1085 
1086  // Final sort & set of CRS arrays
1087  if (params.is_null() || params->get("sort entries",true))
1088  Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
1089  Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);
1090 
1091 #ifdef HAVE_TPETRA_MMM_TIMINGS
1092  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1093 #endif
1094 
1095  // Final FillComplete
1096  //
1097  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1098  // Import (from domain Map to column Map) construction (which costs
1099  // lots of communication) by taking the previously constructed
1100  // Import object. We should be able to do this without interfering
1101  // with the implementation of the local part of sparse matrix-matrix
1102  // multply above.
1103  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1104  labelList->set("Timer Label",label);
1105  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1106  RCP<const Export<LO,GO,NO> > dummyExport;
1107  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1108  Rview.origMatrix->getRangeMap(),
1109  Acimport,
1110  dummyExport,
1111  labelList);
1112 
1113  }
1114 
1115  /*********************************************************************************************************/
1116  // RAP Reuse Kernel wrappers (Default non-threaded version)
1117  // Computes R * A * P -> Ac using reuse Gustavson
1118  template<class Scalar,
1119  class LocalOrdinal,
1120  class GlobalOrdinal,
1121  class Node,
1122  class LocalOrdinalViewType>
1123  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
1124  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1125  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1126  const LocalOrdinalViewType & Acol2Prow_dev,
1127  const LocalOrdinalViewType & Acol2PIrow_dev,
1128  const LocalOrdinalViewType & Pcol2Accol_dev,
1129  const LocalOrdinalViewType & PIcol2Accol_dev,
1130  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1131  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1132  const std::string& label,
1133  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1134 #ifdef HAVE_TPETRA_MMM_TIMINGS
1135  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1136  using Teuchos::TimeMonitor;
1137  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore"))));
1138 #endif
1139 
1140  using Teuchos::Array;
1141  using Teuchos::ArrayRCP;
1142  using Teuchos::ArrayView;
1143  using Teuchos::RCP;
1144  using Teuchos::rcp;
1145 
1146  // Lots and lots of typedefs
1147  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
1148  typedef typename KCRS::StaticCrsGraphType graph_t;
1149  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1150  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1151  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1152 
1153  typedef Scalar SC;
1154  typedef LocalOrdinal LO;
1155  typedef GlobalOrdinal GO;
1156  typedef Node NO;
1157  typedef Map<LO,GO,NO> map_type;
1158  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1159  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1160  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1161 
1162  // Sizes
1163  RCP<const map_type> Accolmap = Ac.getColMap();
1164  size_t m = Rview.origMatrix->getLocalNumRows();
1165  size_t n = Accolmap->getLocalNumElements();
1166  size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
1167 
1168  // Routine runs on host; have to put arguments on host, too
1169  auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1170  Acol2Prow_dev);
1171  auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1172  Acol2PIrow_dev);
1173  auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1174  Pcol2Accol_dev);
1175  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1176  PIcol2Accol_dev);
1177 
1178  // Grab the Kokkos::SparseCrsMatrices & inner stuff
1179  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
1180  const KCRS & Pmat = Pview.origMatrix->getLocalMatrixHost();
1181  const KCRS & Rmat = Rview.origMatrix->getLocalMatrixHost();
1182  const KCRS & Cmat = Ac.getLocalMatrixHost();
1183 
1184  c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
1185  const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
1186  const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
1187  scalar_view_t Cvals = Cmat.values;
1188 
1189  c_lno_view_t Irowptr;
1190  lno_nnz_view_t Icolind;
1191  scalar_view_t Ivals;
1192  if(!Pview.importMatrix.is_null()) {
1193  auto lclP = Pview.importMatrix->getLocalMatrixHost();
1194  Irowptr = lclP.graph.row_map;
1195  Icolind = lclP.graph.entries;
1196  Ivals = lclP.values;
1197  p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getLocalMaxNumRowEntries());
1198  }
1199 
1200 #ifdef HAVE_TPETRA_MMM_TIMINGS
1201  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore - Compare"))));
1202 #endif
1203 
1204  // mfh 27 Sep 2016: The ac_status array is an implementation detail
1205  // of the local sparse matrix-matrix multiply routine.
1206 
1207  // The status array will contain the index into colind where this entry was last deposited.
1208  // ac_status[i] < nnz - not in the row yet
1209  // ac_status[i] >= nnz - this is the entry where you can find the data
1210  // We start with this filled with INVALID's indicating that there are no entries yet.
1211  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1212  Array<size_t> ac_status(n, ST_INVALID);
1213 
1214  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1215  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1216  //
1217  // For column index Aik in row i of A, Acol2Prow[Aik] tells
1218  // you whether the corresponding row of P belongs to P_local
1219  // ("orig") or P_remote ("Import").
1220 
1221  // Necessary until following UVM host accesses are changed - for example Crowptr
1222  // Also probably needed in mult_R_A_P_newmatrix_kernel_wrapper - did not demonstrate this in test failure yet
1223  Kokkos::fence();
1224 
1225  // For each row of R
1226  size_t OLD_ip = 0, CSR_ip = 0;
1227  for (size_t i = 0; i < m; i++) {
1228  // First fill the c_status array w/ locations where we're allowed to
1229  // generate nonzeros for this row
1230  OLD_ip = Crowptr[i];
1231  CSR_ip = Crowptr[i+1];
1232  for (size_t k = OLD_ip; k < CSR_ip; k++) {
1233  ac_status[Ccolind[k]] = k;
1234 
1235  // Reset values in the row of C
1236  Cvals[k] = SC_ZERO;
1237  }
1238 
1239  // mfh 27 Sep 2016: For each entry of R in the current row of R
1240  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1241  LO k = Rcolind[kk]; // local column index of current entry of R
1242  const SC Rik = Rvals[kk]; // value of current entry of R
1243  if (Rik == SC_ZERO)
1244  continue; // skip explicitly stored zero values in R
1245  // For each entry of A in the current row of A
1246  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1247  LO l = Acolind[ll]; // local column index of current entry of A
1248  const SC Akl = Avals[ll]; // value of current entry of A
1249  if (Akl == SC_ZERO)
1250  continue; // skip explicitly stored zero values in A
1251 
1252 
1253  if (Acol2Prow[l] != LO_INVALID) {
1254  // mfh 27 Sep 2016: If the entry of Acol2Prow
1255  // corresponding to the current entry of A is populated, then
1256  // the corresponding row of P is in P_local (i.e., it lives on
1257  // the calling process).
1258 
1259  // Local matrix
1260  size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1261 
1262  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1263  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1264  LO j = Pcolind[jj];
1265  LO Cij = Pcol2Accol[j];
1266  SC Plj = Pvals[jj];
1267 
1268  TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1269  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1270  "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1271 
1272  Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1273  }
1274  } else {
1275  // mfh 27 Sep 2016: If the entry of Acol2PRow
1276  // corresponding to the current entry of A is NOT populated (has
1277  // a flag "invalid" value), then the corresponding row of P is
1278  // in P_remote (i.e., it does not live on the calling process).
1279 
1280  // Remote matrix
1281  size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1282  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1283  LO j = Icolind[jj];
1284  LO Cij = PIcol2Accol[j];
1285  SC Plj = Ivals[jj];
1286 
1287  TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1288  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1289  "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1290 
1291  Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1292  }
1293  }
1294  }
1295  }
1296  }
1297 
1298 #ifdef HAVE_TPETRA_MMM_TIMINGS
1299  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse ESFC"))));
1300 #endif
1301 
1302  Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());
1303 
1304 }
1305 
1306 
1307  /*********************************************************************************************************/
1308  // PT_A_P NewMatrix Kernel wrappers (Default, general, non-threaded version)
1309  // Computes P.T * A * P -> Ac
1310  template<class Scalar,
1311  class LocalOrdinal,
1312  class GlobalOrdinal,
1313  class Node,
1314  class LocalOrdinalViewType>
1315  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1316  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1317  const LocalOrdinalViewType & Acol2Prow,
1318  const LocalOrdinalViewType & Acol2PIrow,
1319  const LocalOrdinalViewType & Pcol2Accol,
1320  const LocalOrdinalViewType & PIcol2Accol,
1321  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1322  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1323  const std::string& label,
1324  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1325 #ifdef HAVE_TPETRA_MMM_TIMINGS
1326  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1327  using Teuchos::TimeMonitor;
1328  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1329 #endif
1330 
1331  // We don't need a kernel-level PTAP, we just transpose here
1332  typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1333  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1334 
1335  using Teuchos::ParameterList;
1336  using Teuchos::RCP;
1337  RCP<ParameterList> transposeParams (new ParameterList);
1338  transposeParams->set ("sort", false);
1339 
1340  if (! params.is_null ()) {
1341  transposeParams->set ("compute global constants",
1342  params->get ("compute global constants: temporaries",
1343  false));
1344  }
1345  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1346  transposer.createTransposeLocal (transposeParams);
1347  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1348  Rview.origMatrix = Ptrans;
1349 
1350  mult_R_A_P_newmatrix_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1351  }
1352 
1353  /*********************************************************************************************************/
1354  // PT_A_P Reuse Kernel wrappers (Default, general, non-threaded version)
1355  // Computes P.T * A * P -> Ac
1356  template<class Scalar,
1357  class LocalOrdinal,
1358  class GlobalOrdinal,
1359  class Node,
1360  class LocalOrdinalViewType>
1361  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1362  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1363  const LocalOrdinalViewType & Acol2Prow,
1364  const LocalOrdinalViewType & Acol2PIrow,
1365  const LocalOrdinalViewType & Pcol2Accol,
1366  const LocalOrdinalViewType & PIcol2Accol,
1367  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1368  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1369  const std::string& label,
1370  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1371 #ifdef HAVE_TPETRA_MMM_TIMINGS
1372  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1373  using Teuchos::TimeMonitor;
1374  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1375 #endif
1376 
1377  // We don't need a kernel-level PTAP, we just transpose here
1378  typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1379  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1380 
1381  using Teuchos::ParameterList;
1382  using Teuchos::RCP;
1383  RCP<ParameterList> transposeParams (new ParameterList);
1384  transposeParams->set ("sort", false);
1385 
1386  if (! params.is_null ()) {
1387  transposeParams->set ("compute global constants",
1388  params->get ("compute global constants: temporaries",
1389  false));
1390  }
1391  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1392  transposer.createTransposeLocal (transposeParams);
1393  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1394  Rview.origMatrix = Ptrans;
1395 
1396  mult_R_A_P_reuse_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1397  }
1398 
1399  /*********************************************************************************************************/
1400  // PT_A_P NewMatrix Kernel wrappers (Default non-threaded version)
1401  // Computes P.T * A * P -> Ac using a 2-pass algorithm.
1402  // This turned out to be slower on SerialNode, but it might still be helpful when going to Kokkos, so I left it in.
1403  // Currently, this implementation never gets called.
1404  template<class Scalar,
1405  class LocalOrdinal,
1406  class GlobalOrdinal,
1407  class Node>
1408  void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1409  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1410  const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1411  const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1412  const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1413  const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1414  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1415  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1416  const std::string& label,
1417  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1418 #ifdef HAVE_TPETRA_MMM_TIMINGS
1419  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1420  using Teuchos::TimeMonitor;
1421  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
1422 #endif
1423 
1424  using Teuchos::Array;
1425  using Teuchos::ArrayRCP;
1426  using Teuchos::ArrayView;
1427  using Teuchos::RCP;
1428  using Teuchos::rcp;
1429 
1430  typedef Scalar SC;
1431  typedef LocalOrdinal LO;
1432  typedef GlobalOrdinal GO;
1433  typedef Node NO;
1434  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1435  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1436  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1437 
1438  // number of rows on the process of the fine matrix
1439  // size_t m = Pview.origMatrix->getLocalNumRows();
1440  // number of rows on the process of the coarse matrix
1441  size_t n = Ac.getRowMap()->getLocalNumElements();
1442  LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1443 
1444  // Get Data Pointers
1445  ArrayRCP<size_t> Acrowptr_RCP;
1446  ArrayRCP<LO> Accolind_RCP;
1447  ArrayRCP<SC> Acvals_RCP;
1448 
1449  // mfh 27 Sep 2016: get the three CSR arrays
1450  // out of the CrsMatrix. This code computes R * A * (P_local +
1451  // P_remote), where P_local contains the locally owned rows of P,
1452  // and P_remote the (previously Import'ed) remote rows of P.
1453 
1454  auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1455  auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1456  auto Avals = Aview.origMatrix->getLocalValuesHost(
1457  Tpetra::Access::ReadOnly);
1458  auto Prowptr = Pview.origMatrix->getLocalRowPtrsHost();
1459  auto Pcolind = Pview.origMatrix->getLocalIndicesHost();
1460  auto Pvals = Pview.origMatrix->getLocalValuesHost(
1461  Tpetra::Access::ReadOnly);
1462  decltype(Prowptr) Irowptr;
1463  decltype(Pcolind) Icolind;
1464  decltype(Pvals) Ivals;
1465 
1466  if (!Pview.importMatrix.is_null()) {
1467  Irowptr = Pview.importMatrix->getLocalRowPtrsHost();
1468  Icolind = Pview.importMatrix->getLocalIndicesHost();
1469  Ivals = Pview.importMatrix->getLocalValuesHost(
1470  Tpetra::Access::ReadOnly);
1471  }
1472 
1473  // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1474  // where Teuchos::ArrayRCP::operator[] may be slower than
1475  // Teuchos::ArrayView::operator[].
1476 
1477  // For efficiency
1478  ArrayView<size_t> Acrowptr;
1479  ArrayView<LO> Accolind;
1480  ArrayView<SC> Acvals;
1481 
1483  // In a first pass, determine the graph of Ac.
1485 
1487  // Get the graph of Ac. This gets the local transpose of P,
1488  // then loops over R, A, P to get the graph of Ac.
1490 
1491 #ifdef HAVE_TPETRA_MMM_TIMINGS
1492  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
1493 #endif
1494 
1496  // Get the local transpose of the graph of P by locally transposing
1497  // all of P
1498 
1499  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1500 
1501  using Teuchos::ParameterList;
1502  RCP<ParameterList> transposeParams (new ParameterList);
1503  transposeParams->set ("sort", false);
1504  if (! params.is_null ()) {
1505  transposeParams->set ("compute global constants",
1506  params->get ("compute global constants: temporaries",
1507  false));
1508  }
1509  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1510  transposer.createTransposeLocal (transposeParams);
1511 
1512  auto Rrowptr = Ptrans->getLocalRowPtrsHost();
1513  auto Rcolind = Ptrans->getLocalIndicesHost();
1514  auto Rvals = Ptrans->getLocalValuesHost(Tpetra::Access::ReadOnly);
1515 
1517  // Construct graph
1518 
1519  #ifdef HAVE_TPETRA_MMM_TIMINGS
1520  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP graph"))));
1521  #endif
1522 
1523  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1524  Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1525 
1526  size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1527  size_t nnzPerRowA = 100;
1528  if (Aview.origMatrix->getLocalNumEntries() > 0)
1529  nnzPerRowA = Aview.origMatrix->getLocalNumEntries()/Aview.origMatrix->getLocalNumRows();
1530  Acrowptr_RCP.resize(n+1);
1531  Acrowptr = Acrowptr_RCP();
1532  Accolind_RCP.resize(nnz_alloc);
1533  Accolind = Accolind_RCP();
1534 
1535  size_t nnz = 0, nnz_old = 0;
1536  for (size_t i = 0; i < n; i++) {
1537  // mfh 27 Sep 2016: m is the number of rows in the input matrix R
1538  // on the calling process.
1539  Acrowptr[i] = nnz;
1540 
1541  // mfh 27 Sep 2016: For each entry of R in the current row of R
1542  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1543  LO k = Rcolind[kk]; // local column index of current entry of R
1544  // For each entry of A in the current row of A
1545  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1546  LO l = Acolind[ll]; // local column index of current entry of A
1547 
1548  if (Acol2PRow[l] != LO_INVALID) {
1549  // mfh 27 Sep 2016: If the entry of Acol2PRow
1550  // corresponding to the current entry of A is populated, then
1551  // the corresponding row of P is in P_local (i.e., it lives on
1552  // the calling process).
1553 
1554  // Local matrix
1555  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1556 
1557  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1558  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1559  LO j = Pcolind[jj];
1560  LO Acj = Pcol2Accol[j];
1561 
1562  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1563  // New entry
1564  ac_status[Acj] = nnz;
1565  Accolind[nnz] = Acj;
1566  nnz++;
1567  }
1568  }
1569  } else {
1570  // mfh 27 Sep 2016: If the entry of Acol2PRow
1571  // corresponding to the current entry of A is NOT populated (has
1572  // a flag "invalid" value), then the corresponding row of P is
1573  // in P_remote (i.e., it does not live on the calling process).
1574 
1575  // Remote matrix
1576  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1577  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1578  LO j = Icolind[jj];
1579  LO Acj = PIcol2Accol[j];
1580 
1581  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1582  // New entry
1583  ac_status[Acj] = nnz;
1584  Accolind[nnz] = Acj;
1585  nnz++;
1586  }
1587  }
1588  }
1589  }
1590  }
1591  // Resize for next pass if needed
1592  // cag: Maybe we can do something more subtle here, and not double
1593  // the size right away.
1594  if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1595  nnz_alloc *= 2;
1596  nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1597  Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1598  Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1599  }
1600  nnz_old = nnz;
1601  }
1602  Acrowptr[n] = nnz;
1603 
1604  // Downward resize
1605  Accolind_RCP.resize(nnz);
1606  Accolind = Accolind_RCP();
1607 
1608  // Allocate Acvals
1609  Acvals_RCP.resize(nnz, SC_ZERO);
1610  Acvals = Acvals_RCP();
1611 
1612 
1614  // In a second pass, enter the values into Acvals.
1616 
1617  #ifdef HAVE_TPETRA_MMM_TIMINGS
1618  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
1619  #endif
1620 
1621 
1622  for (size_t k = 0; k < n; k++) {
1623  for (size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
1624  LO i = Pcolind[ii];
1625  const SC Pki = Pvals[ii];
1626  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1627  LO l = Acolind[ll];
1628  const SC Akl = Avals[ll];
1629  if (Akl == 0.)
1630  continue;
1631  if (Acol2PRow[l] != LO_INVALID) {
1632  // mfh 27 Sep 2016: If the entry of Acol2PRow
1633  // corresponding to the current entry of A is populated, then
1634  // the corresponding row of P is in P_local (i.e., it lives on
1635  // the calling process).
1636 
1637  // Local matrix
1638  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1639  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1640  LO j = Pcolind[jj];
1641  LO Acj = Pcol2Accol[j];
1642  size_t pp;
1643  for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1644  if (Accolind[pp] == Acj)
1645  break;
1646  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1647  // std::runtime_error, "problem with Ac column indices");
1648  Acvals[pp] += Pki*Akl*Pvals[jj];
1649  }
1650  } else {
1651  // mfh 27 Sep 2016: If the entry of Acol2PRow
1652  // corresponding to the current entry of A NOT populated (has
1653  // a flag "invalid" value), then the corresponding row of P is
1654  // in P_remote (i.e., it does not live on the calling process).
1655 
1656  // Remote matrix
1657  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1658  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1659  LO j = Icolind[jj];
1660  LO Acj = PIcol2Accol[j];
1661  size_t pp;
1662  for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1663  if (Accolind[pp] == Acj)
1664  break;
1665  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1666  // std::runtime_error, "problem with Ac column indices");
1667  Acvals[pp] += Pki*Akl*Ivals[jj];
1668  }
1669  }
1670  }
1671  }
1672  }
1673 
1674 
1675 #ifdef HAVE_TPETRA_MMM_TIMINGS
1676  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
1677 #endif
1678 
1679  // Final sort & set of CRS arrays
1680  //
1681  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1682  // matrix-matrix multiply routine sort the entries for us?
1683  Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1684 
1685  // mfh 27 Sep 2016: This just sets pointers.
1686  Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1687 
1688 #ifdef HAVE_TPETRA_MMM_TIMINGS
1689  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
1690 #endif
1691 
1692  // Final FillComplete
1693  //
1694  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1695  // Import (from domain Map to column Map) construction (which costs
1696  // lots of communication) by taking the previously constructed
1697  // Import object. We should be able to do this without interfering
1698  // with the implementation of the local part of sparse matrix-matrix
1699  // multply above.
1700  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1701  labelList->set("Timer Label",label);
1702  // labelList->set("Sort column Map ghost GIDs")
1703  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1704  RCP<const Export<LO,GO,NO> > dummyExport;
1705  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1706  Pview.origMatrix->getDomainMap(),
1707  Acimport,
1708  dummyExport, labelList);
1709  }
1710 
1711 
1712 
1713  } //End namepsace MMdetails
1714 
1715 } //End namespace Tpetra
1716 //
1717 // Explicit instantiation macro
1718 //
1719 // Must be expanded from within the Tpetra namespace!
1720 //
1721 
1722 #define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
1723  \
1724  template \
1725  void TripleMatrixMultiply::MultiplyRAP( \
1726  const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
1727  bool transposeR, \
1728  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
1729  bool transposeA, \
1730  const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
1731  bool transposeP, \
1732  CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
1733  bool call_FillComplete_on_result, \
1734  const std::string & label, \
1735  const Teuchos::RCP<Teuchos::ParameterList>& params); \
1736  \
1737 
1738 
1739 #endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isFillActive() const
Whether the matrix is not fill complete.
size_t global_size_t
Global size_t object.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
Utility functions for packing and unpacking sparse matrix entries.
void MultiplyRAP(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &R, bool transposeR, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &P, bool transposeP, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Ac, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.