Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_TripleMatrixMultiply_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 #ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
42 #define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
43 
45 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" //for UnmanagedView
46 #include "Teuchos_VerboseObject.hpp"
47 #include "Teuchos_Array.hpp"
48 #include "Tpetra_Util.hpp"
49 #include "Tpetra_ConfigDefs.hpp"
50 #include "Tpetra_CrsMatrix.hpp"
52 #include "Tpetra_RowMatrixTransposer.hpp"
53 #include "Tpetra_ConfigDefs.hpp"
54 #include "Tpetra_Map.hpp"
55 #include "Tpetra_Export.hpp"
56 #include "Tpetra_Import_Util.hpp"
57 #include "Tpetra_Import_Util2.hpp"
58 #include <algorithm>
59 #include <cmath>
60 #include "Teuchos_FancyOStream.hpp"
61 // #include "KokkosSparse_spgemm.hpp"
62 
63 
71 /*********************************************************************************************************/
72 // Include the architecture-specific kernel partial specializations here
73 // NOTE: This needs to be outside all namespaces
74 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
75 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
76 
77 namespace Tpetra {
78 
79  namespace TripleMatrixMultiply{
80 
81  //
82  // This method forms the matrix-matrix product Ac = op(R) * op(A) * op(P), where
83  // op(A) == A if transposeA is false,
84  // op(A) == A^T if transposeA is true,
85  // and similarly for op(R) and op(P).
86  //
87  template <class Scalar,
88  class LocalOrdinal,
89  class GlobalOrdinal,
90  class Node>
93  bool transposeR,
95  bool transposeA,
97  bool transposeP,
99  bool call_FillComplete_on_result,
100  const std::string& label,
101  const Teuchos::RCP<Teuchos::ParameterList>& params)
102  {
103  using Teuchos::null;
104  using Teuchos::RCP;
105  typedef Scalar SC;
106  typedef LocalOrdinal LO;
107  typedef GlobalOrdinal GO;
108  typedef Node NO;
109  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
110  typedef Import<LO,GO,NO> import_type;
111  typedef Export<LO,GO,NO> export_type;
112  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
113  typedef Map<LO,GO,NO> map_type;
114  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
115 
116 #ifdef HAVE_TPETRA_MMM_TIMINGS
117  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
118  using Teuchos::TimeMonitor;
119  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Setup"))));
120 #endif
121 
122  const std::string prefix = "TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
123 
124  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
125 
126  // The input matrices R, A and P must both be fillComplete.
127  TEUCHOS_TEST_FOR_EXCEPTION(!R.isFillComplete(), std::runtime_error, prefix << "Matrix R is not fill complete.");
128  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
129  TEUCHOS_TEST_FOR_EXCEPTION(!P.isFillComplete(), std::runtime_error, prefix << "Matrix P is not fill complete.");
130 
131  // If transposeA is true, then Rprime will be the transpose of R
132  // (computed explicitly via RowMatrixTransposer). Otherwise, Rprime
133  // will just be a pointer to R.
134  RCP<const crs_matrix_type> Rprime = null;
135  // If transposeA is true, then Aprime will be the transpose of A
136  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
137  // will just be a pointer to A.
138  RCP<const crs_matrix_type> Aprime = null;
139  // If transposeB is true, then Pprime will be the transpose of P
140  // (computed explicitly via RowMatrixTransposer). Otherwise, Pprime
141  // will just be a pointer to P.
142  RCP<const crs_matrix_type> Pprime = null;
143 
144  // Is this a "clean" matrix?
145  //
146  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
147  // locally nor globally indexed, then it was empty. I don't like
148  // this, because the most straightforward implementation presumes
149  // lazy allocation of indices. However, historical precedent
150  // demands that we keep around this predicate as a way to test
151  // whether the matrix is empty.
152  const bool newFlag = !Ac.getGraph()->isLocallyIndexed() && !Ac.getGraph()->isGloballyIndexed();
153 
154  using Teuchos::ParameterList;
155  RCP<ParameterList> transposeParams (new ParameterList);
156  transposeParams->set ("sort", false);
157 
158  if (transposeR && &R != &P) {
159  transposer_type transposer(rcpFromRef (R));
160  Rprime = transposer.createTranspose (transposeParams);
161  } else {
162  Rprime = rcpFromRef(R);
163  }
164 
165  if (transposeA) {
166  transposer_type transposer(rcpFromRef (A));
167  Aprime = transposer.createTranspose (transposeParams);
168  } else {
169  Aprime = rcpFromRef(A);
170  }
171 
172  if (transposeP) {
173  transposer_type transposer(rcpFromRef (P));
174  Pprime = transposer.createTranspose (transposeParams);
175  } else {
176  Pprime = rcpFromRef(P);
177  }
178 
179  // Check size compatibility
180  global_size_t numRCols = R.getDomainMap()->getGlobalNumElements();
181  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
182  global_size_t numPCols = P.getDomainMap()->getGlobalNumElements();
183  global_size_t Rleft = transposeR ? numRCols : R.getGlobalNumRows();
184  global_size_t Rright = transposeR ? R.getGlobalNumRows() : numRCols;
185  global_size_t Aleft = transposeA ? numACols : A.getGlobalNumRows();
186  global_size_t Aright = transposeA ? A.getGlobalNumRows() : numACols;
187  global_size_t Pleft = transposeP ? numPCols : P.getGlobalNumRows();
188  global_size_t Pright = transposeP ? P.getGlobalNumRows() : numPCols;
189  TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
190  prefix << "ERROR, inner dimensions of op(R) and op(A) "
191  "must match for matrix-matrix product. op(R) is "
192  << Rleft << "x" << Rright << ", op(A) is "<< Aleft << "x" << Aright);
193 
194  TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
195  prefix << "ERROR, inner dimensions of op(A) and op(P) "
196  "must match for matrix-matrix product. op(A) is "
197  << Aleft << "x" << Aright << ", op(P) is "<< Pleft << "x" << Pright);
198 
199  // The result matrix Ac must at least have a row-map that reflects the correct
200  // row-size. Don't check the number of columns because rectangular matrices
201  // which were constructed with only one map can still end up having the
202  // correct capacity and dimensions when filled.
203  TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.getGlobalNumRows(), std::runtime_error,
204  prefix << "ERROR, dimensions of result Ac must "
205  "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.getGlobalNumRows()
206  << " rows, should have at least " << Rleft << std::endl);
207 
208  // It doesn't matter whether Ac is already Filled or not. If it is already
209  // Filled, it must have space allocated for the positions that will be
210  // referenced in forming Ac = op(R)*op(A)*op(P). If it doesn't have enough space,
211  // we'll error out later when trying to store result values.
212 
213  // CGB: However, matrix must be in active-fill
214  TEUCHOS_TEST_FOR_EXCEPT( Ac.isFillActive() == false );
215 
216  // We're going to need to import remotely-owned sections of P if
217  // more than one processor is performing this run, depending on the scenario.
218  int numProcs = P.getComm()->getSize();
219 
220  // Declare a couple of structs that will be used to hold views of the data
221  // of R, A and P, to be used for fast access during the matrix-multiplication.
222  crs_matrix_struct_type Rview;
223  crs_matrix_struct_type Aview;
224  crs_matrix_struct_type Pview;
225 
226  RCP<const map_type> targetMap_R = Rprime->getRowMap();
227  RCP<const map_type> targetMap_A = Aprime->getRowMap();
228  RCP<const map_type> targetMap_P = Pprime->getRowMap();
229 
230 #ifdef HAVE_TPETRA_MMM_TIMINGS
231  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All I&X"))));
232 #endif
233 
234  // Now import any needed remote rows and populate the Aview struct
235  // NOTE: We assert that an import isn't needed --- since we do the transpose
236  // above to handle that.
237  RCP<const import_type> dummyImporter;
238 
239  if (!(transposeR && &R == &P))
240  MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter, true, label, params);
241 
242  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
243 
244  // We will also need local access to all rows of P that correspond to the
245  // column-map of op(A).
246  if (numProcs > 1)
247  targetMap_P = Aprime->getColMap();
248 
249  // Import any needed remote rows and populate the Pview struct.
250  MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(), false, label, params);
251 
252 
253  RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
254 
255  bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
256  if (needs_final_export)
257  Actemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Pprime->getColMap(),0));
258  else
259  Actemp = rcp(&Ac,false);// don't allow deallocation
260 
261 #ifdef HAVE_TPETRA_MMM_TIMINGS
262  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Multiply"))));
263 #endif
264 
265  // Call the appropriate method to perform the actual multiplication.
266  if (call_FillComplete_on_result && newFlag) {
267  if (transposeR && &R == &P)
268  MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
269  else
270  MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
271  } else if (call_FillComplete_on_result) {
272  if (transposeR && &R == &P)
273  MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
274  else
275  MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
276  } else {
277  // mfh 27 Sep 2016: Is this the "slow" case? This
278  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
279  // thread-parallel inserts, but that may take some effort.
280  // CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(Ac);
281 
282  // MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
283 
284  // #ifdef HAVE_TPETRA_MMM_TIMINGS
285  // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All FillComplete"))));
286  // #endif
287  // if (call_FillComplete_on_result) {
288  // // We'll call FillComplete on the C matrix before we exit, and give it a
289  // // domain-map and a range-map.
290  // // The domain-map will be the domain-map of B, unless
291  // // op(B)==transpose(B), in which case the range-map of B will be used.
292  // // The range-map will be the range-map of A, unless op(A)==transpose(A),
293  // // in which case the domain-map of A will be used.
294  // if (!C.isFillComplete())
295  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
296  // }
297  // Not implemented
298  if (transposeR && &R == &P)
299  MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
300  else
301  MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
302  }
303 
304  if (needs_final_export) {
305 #ifdef HAVE_TPETRA_MMM_TIMINGS
306  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP exportAndFillComplete"))));
307 #endif
308  Teuchos::ParameterList labelList;
309  labelList.set("Timer Label", label);
310  Teuchos::ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
311 
312  RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
313  bool isMM = true;
314  bool overrideAllreduce = false;
315  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
316  if(!params.is_null()) {
317  Teuchos::ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
318  mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
319  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
320  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
321  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
322  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
323  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
324 
325  labelList.set("compute global constants",params->get("compute global constants",true));
326  }
327  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
328 
329  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM,
330  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
331  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
332 
333  export_type exporter = export_type(*Pprime->getGraph()->getImporter());
334  Actemp->exportAndFillComplete(Acprime,
335  exporter,
336  Acprime->getDomainMap(),
337  Acprime->getRangeMap(),
338  rcp(&labelList,false));
339 
340  }
341 #ifdef HAVE_TPETRA_MMM_STATISTICS
342  printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label+std::string(" RAP MMM"));
343 #endif
344 
345  }
346 
347 
348  } //End namespace TripleMatrixMultiply
349 
350  namespace MMdetails{
351 
352  // Kernel method for computing the local portion of Ac = R*A*P
353  template<class Scalar,
354  class LocalOrdinal,
355  class GlobalOrdinal,
356  class Node>
357  void mult_R_A_P_newmatrix(
358  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
359  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
360  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
361  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
362  const std::string& label,
363  const Teuchos::RCP<Teuchos::ParameterList>& params)
364  {
365  using Teuchos::Array;
366  using Teuchos::ArrayRCP;
367  using Teuchos::ArrayView;
368  using Teuchos::RCP;
369  using Teuchos::rcp;
370 
371  //typedef Scalar SC; // unused
372  typedef LocalOrdinal LO;
373  typedef GlobalOrdinal GO;
374  typedef Node NO;
375 
376  typedef Import<LO,GO,NO> import_type;
377  typedef Map<LO,GO,NO> map_type;
378 
379  // Kokkos typedefs
380  typedef typename map_type::local_map_type local_map_type;
382  typedef typename KCRS::StaticCrsGraphType graph_t;
383  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
384  typedef typename NO::execution_space execution_space;
385  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
386  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
387 
388 #ifdef HAVE_TPETRA_MMM_TIMINGS
389  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
390  using Teuchos::TimeMonitor;
391  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
392 #endif
393  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
394 
395  // Build the final importer / column map, hash table lookups for Ac
396  RCP<const import_type> Cimport;
397  RCP<const map_type> Ccolmap;
398  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
399  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
400  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
401  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
402  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
403  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
404  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
405 
406 
407  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
408  // indices of B, to local column indices of Ac. (B and Ac have the
409  // same number of columns.) The kernel uses this, instead of
410  // copying the entire input matrix B and converting its column
411  // indices to those of C.
412  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
413 
414  if (Pview.importMatrix.is_null()) {
415  // mfh 27 Sep 2016: B has no "remotes," so P and C have the same column Map.
416  Cimport = Pimport;
417  Ccolmap = Pview.colMap;
418  const LO colMapSize = static_cast<LO>(Pview.colMap->getNodeNumElements());
419  // Pcol2Ccol is trivial
420  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
421  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
422  KOKKOS_LAMBDA(const LO i) {
423  Pcol2Ccol(i) = i;
424  });
425  }
426  else {
427  // mfh 27 Sep 2016: P has "remotes," so we need to build the
428  // column Map of C, as well as C's Import object (from its domain
429  // Map to its column Map). C's column Map is the union of the
430  // column Maps of (the local part of) P, and the "remote" part of
431  // P. Ditto for the Import. We have optimized this "setUnion"
432  // operation on Import objects and Maps.
433 
434  // Choose the right variant of setUnion
435  if (!Pimport.is_null() && !Iimport.is_null()) {
436  Cimport = Pimport->setUnion(*Iimport);
437  }
438  else if (!Pimport.is_null() && Iimport.is_null()) {
439  Cimport = Pimport->setUnion();
440  }
441  else if (Pimport.is_null() && !Iimport.is_null()) {
442  Cimport = Iimport->setUnion();
443  }
444  else {
445  throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
446  }
447  Ccolmap = Cimport->getTargetMap();
448 
449  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
450  // in general. We should get rid of it in order to reduce
451  // communication costs of sparse matrix-matrix multiply.
452  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
453  std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
454 
455  // NOTE: This is not efficient and should be folded into setUnion
456  //
457  // mfh 27 Sep 2016: What the above comment means, is that the
458  // setUnion operation on Import objects could also compute these
459  // local index - to - local index look-up tables.
460  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
461  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
462  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
463  Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
464  });
465  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
466  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
467  });
468 
469  }
470 
471  // Replace the column map
472  //
473  // mfh 27 Sep 2016: We do this because C was originally created
474  // without a column Map. Now we have its column Map.
475  Ac.replaceColMap(Ccolmap);
476 
477  // mfh 27 Sep 2016: Construct tables that map from local column
478  // indices of A, to local row indices of either B_local (the locally
479  // owned part of B), or B_remote (the "imported" remote part of B).
480  //
481  // For column index Aik in row i of A, if the corresponding row of B
482  // exists in the local part of B ("orig") (which I'll call B_local),
483  // then targetMapToOrigRow[Aik] is the local index of that row of B.
484  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
485  //
486  // For column index Aik in row i of A, if the corresponding row of B
487  // exists in the remote part of B ("Import") (which I'll call
488  // B_remote), then targetMapToImportRow[Aik] is the local index of
489  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
490  // (a flag value).
491 
492  // Run through all the hash table lookups once and for all
493  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
494  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
495  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
496  GO aidx = Acolmap_local.getGlobalElement(i);
497  LO P_LID = Prowmap_local.getLocalElement(aidx);
498  if (P_LID != LO_INVALID) {
499  targetMapToOrigRow(i) = P_LID;
500  targetMapToImportRow(i) = LO_INVALID;
501  } else {
502  LO I_LID = Irowmap_local.getLocalElement(aidx);
503  targetMapToOrigRow(i) = LO_INVALID;
504  targetMapToImportRow(i) = I_LID;
505  }
506  });
507 
508  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
509  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
510  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
511  mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
512  targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
513  Ac, Cimport, label, params);
514  }
515 
516 
517  // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
518  template<class Scalar,
519  class LocalOrdinal,
520  class GlobalOrdinal,
521  class Node>
522  void mult_R_A_P_reuse(
523  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
524  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
525  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
526  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
527  const std::string& label,
528  const Teuchos::RCP<Teuchos::ParameterList>& params)
529  {
530  using Teuchos::Array;
531  using Teuchos::ArrayRCP;
532  using Teuchos::ArrayView;
533  using Teuchos::RCP;
534  using Teuchos::rcp;
535 
536  //typedef Scalar SC; // unused
537  typedef LocalOrdinal LO;
538  typedef GlobalOrdinal GO;
539  typedef Node NO;
540 
541  typedef Import<LO,GO,NO> import_type;
542  typedef Map<LO,GO,NO> map_type;
543 
544  // Kokkos typedefs
545  typedef typename map_type::local_map_type local_map_type;
547  typedef typename KCRS::StaticCrsGraphType graph_t;
548  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
549  typedef typename NO::execution_space execution_space;
550  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
551  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
552 
553 #ifdef HAVE_TPETRA_MMM_TIMINGS
554  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
555  using Teuchos::TimeMonitor;
556  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
557 #endif
558  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
559 
560  // Build the final importer / column map, hash table lookups for Ac
561  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
562  RCP<const map_type> Ccolmap = Ac.getColMap();
563  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
564  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
565  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
566  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
567  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
568  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
569  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
570  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
571 
572  // Build the final importer / column map, hash table lookups for C
573  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
574  {
575  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
576  // So, column map of C may be a strict subset of the column map of B
577  Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
578  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
579  });
580 
581  if (!Pview.importMatrix.is_null()) {
582  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
583  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
584 
585  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
586  Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
587  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
588  });
589  }
590  }
591 
592  // Run through all the hash table lookups once and for all
593  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
594  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
595  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
596  GO aidx = Acolmap_local.getGlobalElement(i);
597  LO B_LID = Prowmap_local.getLocalElement(aidx);
598  if (B_LID != LO_INVALID) {
599  targetMapToOrigRow(i) = B_LID;
600  targetMapToImportRow(i) = LO_INVALID;
601  } else {
602  LO I_LID = Irowmap_local.getLocalElement(aidx);
603  targetMapToOrigRow(i) = LO_INVALID;
604  targetMapToImportRow(i) = I_LID;
605 
606  }
607  });
608 
609  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
610  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
611  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
612  mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
613  targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
614  Ac, Cimport, label, params);
615  }
616 
617 
618  // Kernel method for computing the local portion of Ac = R*A*P
619  template<class Scalar,
620  class LocalOrdinal,
621  class GlobalOrdinal,
622  class Node>
623  void mult_PT_A_P_newmatrix(
624  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
625  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
626  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
627  const std::string& label,
628  const Teuchos::RCP<Teuchos::ParameterList>& params)
629  {
630  using Teuchos::Array;
631  using Teuchos::ArrayRCP;
632  using Teuchos::ArrayView;
633  using Teuchos::RCP;
634  using Teuchos::rcp;
635 
636  //typedef Scalar SC; // unused
637  typedef LocalOrdinal LO;
638  typedef GlobalOrdinal GO;
639  typedef Node NO;
640 
641  typedef Import<LO,GO,NO> import_type;
642  typedef Map<LO,GO,NO> map_type;
643 
644  // Kokkos typedefs
645  typedef typename map_type::local_map_type local_map_type;
647  typedef typename KCRS::StaticCrsGraphType graph_t;
648  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
649  typedef typename NO::execution_space execution_space;
650  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
651  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
652 
653 #ifdef HAVE_TPETRA_MMM_TIMINGS
654  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
655  using Teuchos::TimeMonitor;
656  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP M5 Cmap")))));
657 #endif
658  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
659 
660  // Build the final importer / column map, hash table lookups for Ac
661  RCP<const import_type> Cimport;
662  RCP<const map_type> Ccolmap;
663  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
664  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
665  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
666  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
667  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
668  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
669  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
670 
671 
672  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
673  // indices of B, to local column indices of Ac. (B and Ac have the
674  // same number of columns.) The kernel uses this, instead of
675  // copying the entire input matrix B and converting its column
676  // indices to those of C.
677  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
678 
679  if (Pview.importMatrix.is_null()) {
680  // mfh 27 Sep 2016: B has no "remotes," so P and C have the same column Map.
681  Cimport = Pimport;
682  Ccolmap = Pview.colMap;
683  const LO colMapSize = static_cast<LO>(Pview.colMap->getNodeNumElements());
684  // Pcol2Ccol is trivial
685  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
686  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
687  KOKKOS_LAMBDA(const LO i) {
688  Pcol2Ccol(i) = i;
689  });
690  }
691  else {
692  // mfh 27 Sep 2016: P has "remotes," so we need to build the
693  // column Map of C, as well as C's Import object (from its domain
694  // Map to its column Map). C's column Map is the union of the
695  // column Maps of (the local part of) P, and the "remote" part of
696  // P. Ditto for the Import. We have optimized this "setUnion"
697  // operation on Import objects and Maps.
698 
699  // Choose the right variant of setUnion
700  if (!Pimport.is_null() && !Iimport.is_null()) {
701  Cimport = Pimport->setUnion(*Iimport);
702  }
703  else if (!Pimport.is_null() && Iimport.is_null()) {
704  Cimport = Pimport->setUnion();
705  }
706  else if (Pimport.is_null() && !Iimport.is_null()) {
707  Cimport = Iimport->setUnion();
708  }
709  else {
710  throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
711  }
712  Ccolmap = Cimport->getTargetMap();
713 
714  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
715  // in general. We should get rid of it in order to reduce
716  // communication costs of sparse matrix-matrix multiply.
717  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
718  std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
719 
720  // NOTE: This is not efficient and should be folded into setUnion
721  //
722  // mfh 27 Sep 2016: What the above comment means, is that the
723  // setUnion operation on Import objects could also compute these
724  // local index - to - local index look-up tables.
725  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
726  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
727  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
728  Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
729  });
730  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
731  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
732  });
733 
734  }
735 
736  // Replace the column map
737  //
738  // mfh 27 Sep 2016: We do this because C was originally created
739  // without a column Map. Now we have its column Map.
740  Ac.replaceColMap(Ccolmap);
741 
742  // mfh 27 Sep 2016: Construct tables that map from local column
743  // indices of A, to local row indices of either B_local (the locally
744  // owned part of B), or B_remote (the "imported" remote part of B).
745  //
746  // For column index Aik in row i of A, if the corresponding row of B
747  // exists in the local part of B ("orig") (which I'll call B_local),
748  // then targetMapToOrigRow[Aik] is the local index of that row of B.
749  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
750  //
751  // For column index Aik in row i of A, if the corresponding row of B
752  // exists in the remote part of B ("Import") (which I'll call
753  // B_remote), then targetMapToImportRow[Aik] is the local index of
754  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
755  // (a flag value).
756 
757  // Run through all the hash table lookups once and for all
758  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
759  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
760 
761  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
762  GO aidx = Acolmap_local.getGlobalElement(i);
763  LO P_LID = Prowmap_local.getLocalElement(aidx);
764  if (P_LID != LO_INVALID) {
765  targetMapToOrigRow(i) = P_LID;
766  targetMapToImportRow(i) = LO_INVALID;
767  } else {
768  LO I_LID = Irowmap_local.getLocalElement(aidx);
769  targetMapToOrigRow(i) = LO_INVALID;
770  targetMapToImportRow(i) = I_LID;
771  }
772  });
773 
774  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
775  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
776  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
777  mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
778  targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
779  Ac, Cimport, label, params);
780  }
781 
782  // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
783  template<class Scalar,
784  class LocalOrdinal,
785  class GlobalOrdinal,
786  class Node>
787  void mult_PT_A_P_reuse(
788  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
789  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
790  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
791  const std::string& label,
792  const Teuchos::RCP<Teuchos::ParameterList>& params)
793  {
794  using Teuchos::Array;
795  using Teuchos::ArrayRCP;
796  using Teuchos::ArrayView;
797  using Teuchos::RCP;
798  using Teuchos::rcp;
799 
800  //typedef Scalar SC; // unused
801  typedef LocalOrdinal LO;
802  typedef GlobalOrdinal GO;
803  typedef Node NO;
804 
805  typedef Import<LO,GO,NO> import_type;
806  typedef Map<LO,GO,NO> map_type;
807 
808  // Kokkos typedefs
809  typedef typename map_type::local_map_type local_map_type;
811  typedef typename KCRS::StaticCrsGraphType graph_t;
812  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
813  typedef typename NO::execution_space execution_space;
814  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
815  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
816 
817 #ifdef HAVE_TPETRA_MMM_TIMINGS
818  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
819  using Teuchos::TimeMonitor;
820  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
821 #endif
822  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
823 
824  // Build the final importer / column map, hash table lookups for Ac
825  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
826  RCP<const map_type> Ccolmap = Ac.getColMap();
827  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
828  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
829  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
830  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
831  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
832  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
833  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
834  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
835 
836  // Build the final importer / column map, hash table lookups for C
837  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
838  {
839  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
840  // So, column map of C may be a strict subset of the column map of B
841  Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
842  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
843  });
844 
845  if (!Pview.importMatrix.is_null()) {
846  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
847  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
848 
849  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
850  Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
851  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
852  });
853  }
854  }
855 
856  // Run through all the hash table lookups once and for all
857  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
858  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
859  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
860  GO aidx = Acolmap_local.getGlobalElement(i);
861  LO B_LID = Prowmap_local.getLocalElement(aidx);
862  if (B_LID != LO_INVALID) {
863  targetMapToOrigRow(i) = B_LID;
864  targetMapToImportRow(i) = LO_INVALID;
865  } else {
866  LO I_LID = Irowmap_local.getLocalElement(aidx);
867  targetMapToOrigRow(i) = LO_INVALID;
868  targetMapToImportRow(i) = I_LID;
869 
870  }
871  });
872 
873  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
874  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
875  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
876  mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
877  targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
878  Ac, Cimport, label, params);
879  }
880 
881 
882  /*********************************************************************************************************/
883  // RAP NewMatrix Kernel wrappers (Default non-threaded version)
884  // Computes R * A * P -> Ac using classic Gustavson approach
885  template<class Scalar,
886  class LocalOrdinal,
887  class GlobalOrdinal,
888  class Node,
889  class LocalOrdinalViewType>
890  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
891  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
892  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
893  const LocalOrdinalViewType & Acol2Prow,
894  const LocalOrdinalViewType & Acol2PIrow,
895  const LocalOrdinalViewType & Pcol2Accol,
896  const LocalOrdinalViewType & PIcol2Accol,
897  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
898  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
899  const std::string& label,
900  const Teuchos::RCP<Teuchos::ParameterList>& params) {
901 #ifdef HAVE_TPETRA_MMM_TIMINGS
902  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
903  using Teuchos::TimeMonitor;
904  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
905 #endif
906 
907  using Teuchos::Array;
908  using Teuchos::ArrayRCP;
909  using Teuchos::ArrayView;
910  using Teuchos::RCP;
911  using Teuchos::rcp;
912 
913  // Lots and lots of typedefs
915  typedef typename KCRS::StaticCrsGraphType graph_t;
916  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
917  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
918  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
919  typedef typename KCRS::values_type::non_const_type scalar_view_t;
920 
921  typedef Scalar SC;
922  typedef LocalOrdinal LO;
923  typedef GlobalOrdinal GO;
924  typedef Node NO;
925  typedef Map<LO,GO,NO> map_type;
926  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
927  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
928  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
929 
930  // Sizes
931  RCP<const map_type> Accolmap = Ac.getColMap();
932  size_t m = Rview.origMatrix->getNodeNumRows();
933  size_t n = Accolmap->getNodeNumElements();
934  size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();
935 
936  // Grab the Kokkos::SparseCrsMatrices & inner stuff
937  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
938  const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
939  const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
940 
941  c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map;
942  const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries;
943  const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
944 
945  c_lno_view_t Irowptr;
946  lno_nnz_view_t Icolind;
947  scalar_view_t Ivals;
948  if(!Pview.importMatrix.is_null()) {
949  Irowptr = Pview.importMatrix->getLocalMatrix().graph.row_map;
950  Icolind = Pview.importMatrix->getLocalMatrix().graph.entries;
951  Ivals = Pview.importMatrix->getLocalMatrix().values;
952  p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getNodeMaxNumRowEntries());
953  }
954 
955 #ifdef HAVE_TPETRA_MMM_TIMINGS
956  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore - Compare"))));
957 #endif
958 
959  // Classic csr assembly (low memory edition)
960  //
961  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
962  // The method loops over rows of R, and may resize after processing
963  // each row. Chris Siefert says that this reflects experience in
964  // ML; for the non-threaded case, ML found it faster to spend less
965  // effort on estimation and risk an occasional reallocation.
966  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
967  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
968  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
969  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
970 
971  // mfh 27 Sep 2016: The ac_status array is an implementation detail
972  // of the local sparse matrix-matrix multiply routine.
973 
974  // The status array will contain the index into colind where this entry was last deposited.
975  // ac_status[i] < nnz - not in the row yet
976  // ac_status[i] >= nnz - this is the entry where you can find the data
977  // We start with this filled with INVALID's indicating that there are no entries yet.
978  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
979  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
980  Array<size_t> ac_status(n, ST_INVALID);
981 
982  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
983  // routine. The routine computes Ac := R * A * (P_local + P_remote).
984  //
985  // For column index Aik in row i of A, Acol2Prow[Aik] tells
986  // you whether the corresponding row of P belongs to P_local
987  // ("orig") or P_remote ("Import").
988 
989  // For each row of R
990  size_t nnz = 0, nnz_old = 0;
991  for (size_t i = 0; i < m; i++) {
992  // mfh 27 Sep 2016: m is the number of rows in the input matrix R
993  // on the calling process.
994  Crowptr[i] = nnz;
995 
996  // mfh 27 Sep 2016: For each entry of R in the current row of R
997  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
998  LO k = Rcolind[kk]; // local column index of current entry of R
999  const SC Rik = Rvals[kk]; // value of current entry of R
1000  if (Rik == SC_ZERO)
1001  continue; // skip explicitly stored zero values in R
1002  // For each entry of A in the current row of A
1003  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1004  LO l = Acolind[ll]; // local column index of current entry of A
1005  const SC Akl = Avals[ll]; // value of current entry of A
1006  if (Akl == SC_ZERO)
1007  continue; // skip explicitly stored zero values in A
1008 
1009 
1010  if (Acol2Prow[l] != LO_INVALID) {
1011  // mfh 27 Sep 2016: If the entry of Acol2Prow
1012  // corresponding to the current entry of A is populated, then
1013  // the corresponding row of P is in P_local (i.e., it lives on
1014  // the calling process).
1015 
1016  // Local matrix
1017  size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1018 
1019  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1020  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1021  LO j = Pcolind[jj];
1022  LO Acj = Pcol2Accol[j];
1023  SC Plj = Pvals[jj];
1024 
1025  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1026 #ifdef HAVE_TPETRA_DEBUG
1027  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1028  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1029  std::runtime_error,
1030  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1031 #endif
1032  // New entry
1033  ac_status[Acj] = nnz;
1034  Ccolind[nnz] = Acj;
1035  Cvals[nnz] = Rik*Akl*Plj;
1036  nnz++;
1037  } else {
1038  Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1039  }
1040  }
1041  } else {
1042  // mfh 27 Sep 2016: If the entry of Acol2PRow
1043  // corresponding to the current entry of A is NOT populated (has
1044  // a flag "invalid" value), then the corresponding row of P is
1045  // in P_remote (i.e., it does not live on the calling process).
1046 
1047  // Remote matrix
1048  size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1049  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1050  LO j = Icolind[jj];
1051  LO Acj = PIcol2Accol[j];
1052  SC Plj = Ivals[jj];
1053 
1054  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1055 #ifdef HAVE_TPETRA_DEBUG
1056  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1057  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1058  std::runtime_error,
1059  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1060 #endif
1061  // New entry
1062  ac_status[Acj] = nnz;
1063  Ccolind[nnz] = Acj;
1064  Cvals[nnz] = Rik*Akl*Plj;
1065  nnz++;
1066  } else {
1067  Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1068  }
1069  }
1070  }
1071  }
1072  }
1073  // Resize for next pass if needed
1074  if (nnz + n > CSR_alloc) {
1075  CSR_alloc *= 2;
1076  Kokkos::resize(Ccolind,CSR_alloc);
1077  Kokkos::resize(Cvals,CSR_alloc);
1078  }
1079  nnz_old = nnz;
1080  }
1081 
1082  Crowptr[m] = nnz;
1083 
1084  // Downward resize
1085  Kokkos::resize(Ccolind,nnz);
1086  Kokkos::resize(Cvals,nnz);
1087 
1088 #ifdef HAVE_TPETRA_MMM_TIMINGS
1089  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1090 #endif
1091 
1092  // Final sort & set of CRS arrays
1093  if (params.is_null() || params->get("sort entries",true))
1094  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
1095  Ac.setAllValues(Crowptr, Ccolind, Cvals);
1096 
1097 #ifdef HAVE_TPETRA_MMM_TIMINGS
1098  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1099 #endif
1100 
1101  // Final FillComplete
1102  //
1103  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1104  // Import (from domain Map to column Map) construction (which costs
1105  // lots of communication) by taking the previously constructed
1106  // Import object. We should be able to do this without interfering
1107  // with the implementation of the local part of sparse matrix-matrix
1108  // multply above.
1109  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1110  labelList->set("Timer Label",label);
1111  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1112  RCP<const Export<LO,GO,NO> > dummyExport;
1113  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1114  Rview.origMatrix->getRangeMap(),
1115  Acimport,
1116  dummyExport,
1117  labelList);
1118 
1119  }
1120 
1121  /*********************************************************************************************************/
1122  // RAP Reuse Kernel wrappers (Default non-threaded version)
1123  // Computes R * A * P -> Ac using reuse Gustavson
1124  template<class Scalar,
1125  class LocalOrdinal,
1126  class GlobalOrdinal,
1127  class Node,
1128  class LocalOrdinalViewType>
1129  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
1130  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1131  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1132  const LocalOrdinalViewType & Acol2Prow,
1133  const LocalOrdinalViewType & Acol2PIrow,
1134  const LocalOrdinalViewType & Pcol2Accol,
1135  const LocalOrdinalViewType & PIcol2Accol,
1136  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1137  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1138  const std::string& label,
1139  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1140 #ifdef HAVE_TPETRA_MMM_TIMINGS
1141  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1142  using Teuchos::TimeMonitor;
1143  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore"))));
1144 #endif
1145 
1146  using Teuchos::Array;
1147  using Teuchos::ArrayRCP;
1148  using Teuchos::ArrayView;
1149  using Teuchos::RCP;
1150  using Teuchos::rcp;
1151 
1152  // Lots and lots of typedefs
1154  typedef typename KCRS::StaticCrsGraphType graph_t;
1155  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1156  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1157  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1158 
1159  typedef Scalar SC;
1160  typedef LocalOrdinal LO;
1161  typedef GlobalOrdinal GO;
1162  typedef Node NO;
1163  typedef Map<LO,GO,NO> map_type;
1164  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1165  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1166  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1167 
1168  // Sizes
1169  RCP<const map_type> Accolmap = Ac.getColMap();
1170  size_t m = Rview.origMatrix->getNodeNumRows();
1171  size_t n = Accolmap->getNodeNumElements();
1172  size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();
1173 
1174  // Grab the Kokkos::SparseCrsMatrices & inner stuff
1175  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1176  const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
1177  const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
1178  const KCRS & Cmat = Ac.getLocalMatrix();
1179 
1180  c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
1181  const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
1182  const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
1183  scalar_view_t Cvals = Cmat.values;
1184 
1185  c_lno_view_t Irowptr;
1186  lno_nnz_view_t Icolind;
1187  scalar_view_t Ivals;
1188  if(!Pview.importMatrix.is_null()) {
1189  Irowptr = Pview.importMatrix->getLocalMatrix().graph.row_map;
1190  Icolind = Pview.importMatrix->getLocalMatrix().graph.entries;
1191  Ivals = Pview.importMatrix->getLocalMatrix().values;
1192  p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getNodeMaxNumRowEntries());
1193  }
1194 
1195 #ifdef HAVE_TPETRA_MMM_TIMINGS
1196  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore - Compare"))));
1197 #endif
1198 
1199  // mfh 27 Sep 2016: The ac_status array is an implementation detail
1200  // of the local sparse matrix-matrix multiply routine.
1201 
1202  // The status array will contain the index into colind where this entry was last deposited.
1203  // ac_status[i] < nnz - not in the row yet
1204  // ac_status[i] >= nnz - this is the entry where you can find the data
1205  // We start with this filled with INVALID's indicating that there are no entries yet.
1206  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1207  Array<size_t> ac_status(n, ST_INVALID);
1208 
1209  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1210  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1211  //
1212  // For column index Aik in row i of A, Acol2Prow[Aik] tells
1213  // you whether the corresponding row of P belongs to P_local
1214  // ("orig") or P_remote ("Import").
1215 
1216  // Necessary until following UVM host accesses are changed - for example Crowptr
1217  // Also probably needed in mult_R_A_P_newmatrix_kernel_wrapper - did not demonstrate this in test failure yet
1218  Kokkos::fence();
1219 
1220  // For each row of R
1221  size_t OLD_ip = 0, CSR_ip = 0;
1222  for (size_t i = 0; i < m; i++) {
1223  // First fill the c_status array w/ locations where we're allowed to
1224  // generate nonzeros for this row
1225  OLD_ip = Crowptr[i];
1226  CSR_ip = Crowptr[i+1];
1227  for (size_t k = OLD_ip; k < CSR_ip; k++) {
1228  ac_status[Ccolind[k]] = k;
1229 
1230  // Reset values in the row of C
1231  Cvals[k] = SC_ZERO;
1232  }
1233 
1234  // mfh 27 Sep 2016: For each entry of R in the current row of R
1235  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1236  LO k = Rcolind[kk]; // local column index of current entry of R
1237  const SC Rik = Rvals[kk]; // value of current entry of R
1238  if (Rik == SC_ZERO)
1239  continue; // skip explicitly stored zero values in R
1240  // For each entry of A in the current row of A
1241  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1242  LO l = Acolind[ll]; // local column index of current entry of A
1243  const SC Akl = Avals[ll]; // value of current entry of A
1244  if (Akl == SC_ZERO)
1245  continue; // skip explicitly stored zero values in A
1246 
1247 
1248  if (Acol2Prow[l] != LO_INVALID) {
1249  // mfh 27 Sep 2016: If the entry of Acol2Prow
1250  // corresponding to the current entry of A is populated, then
1251  // the corresponding row of P is in P_local (i.e., it lives on
1252  // the calling process).
1253 
1254  // Local matrix
1255  size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1256 
1257  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1258  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1259  LO j = Pcolind[jj];
1260  LO Cij = Pcol2Accol[j];
1261  SC Plj = Pvals[jj];
1262 
1263  TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1264  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1265  "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1266 
1267  Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1268  }
1269  } else {
1270  // mfh 27 Sep 2016: If the entry of Acol2PRow
1271  // corresponding to the current entry of A is NOT populated (has
1272  // a flag "invalid" value), then the corresponding row of P is
1273  // in P_remote (i.e., it does not live on the calling process).
1274 
1275  // Remote matrix
1276  size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1277  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1278  LO j = Icolind[jj];
1279  LO Cij = PIcol2Accol[j];
1280  SC Plj = Ivals[jj];
1281 
1282  TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1283  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1284  "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1285 
1286  Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1287  }
1288  }
1289  }
1290  }
1291  }
1292 
1293 #ifdef HAVE_TPETRA_MMM_TIMINGS
1294  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse ESFC"))));
1295 #endif
1296 
1297  Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());
1298 
1299 }
1300 
1301 
1302  /*********************************************************************************************************/
1303  // PT_A_P NewMatrix Kernel wrappers (Default, general, non-threaded version)
1304  // Computes P.T * A * P -> Ac
1305  template<class Scalar,
1306  class LocalOrdinal,
1307  class GlobalOrdinal,
1308  class Node,
1309  class LocalOrdinalViewType>
1310  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1311  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1312  const LocalOrdinalViewType & Acol2Prow,
1313  const LocalOrdinalViewType & Acol2PIrow,
1314  const LocalOrdinalViewType & Pcol2Accol,
1315  const LocalOrdinalViewType & PIcol2Accol,
1316  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1317  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1318  const std::string& label,
1319  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1320 #ifdef HAVE_TPETRA_MMM_TIMINGS
1321  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1322  using Teuchos::TimeMonitor;
1323  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1324 #endif
1325 
1326  // We don't need a kernel-level PTAP, we just transpose here
1327  typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1328  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1329 
1330  using Teuchos::ParameterList;
1331  using Teuchos::RCP;
1332  RCP<ParameterList> transposeParams (new ParameterList);
1333  transposeParams->set ("sort", false);
1334 
1335  if (! params.is_null ()) {
1336  transposeParams->set ("compute global constants",
1337  params->get ("compute global constants: temporaries",
1338  false));
1339  }
1340  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1341  transposer.createTransposeLocal (transposeParams);
1342  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1343  Rview.origMatrix = Ptrans;
1344 
1345  mult_R_A_P_newmatrix_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1346  }
1347 
1348  /*********************************************************************************************************/
1349  // PT_A_P Reuse Kernel wrappers (Default, general, non-threaded version)
1350  // Computes P.T * A * P -> Ac
1351  template<class Scalar,
1352  class LocalOrdinal,
1353  class GlobalOrdinal,
1354  class Node,
1355  class LocalOrdinalViewType>
1356  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1357  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1358  const LocalOrdinalViewType & Acol2Prow,
1359  const LocalOrdinalViewType & Acol2PIrow,
1360  const LocalOrdinalViewType & Pcol2Accol,
1361  const LocalOrdinalViewType & PIcol2Accol,
1362  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1363  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1364  const std::string& label,
1365  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1366 #ifdef HAVE_TPETRA_MMM_TIMINGS
1367  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1368  using Teuchos::TimeMonitor;
1369  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1370 #endif
1371 
1372  // We don't need a kernel-level PTAP, we just transpose here
1373  typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1374  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1375 
1376  using Teuchos::ParameterList;
1377  using Teuchos::RCP;
1378  RCP<ParameterList> transposeParams (new ParameterList);
1379  transposeParams->set ("sort", false);
1380 
1381  if (! params.is_null ()) {
1382  transposeParams->set ("compute global constants",
1383  params->get ("compute global constants: temporaries",
1384  false));
1385  }
1386  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1387  transposer.createTransposeLocal (transposeParams);
1388  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1389  Rview.origMatrix = Ptrans;
1390 
1391  mult_R_A_P_reuse_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1392  }
1393 
1394  /*********************************************************************************************************/
1395  // PT_A_P NewMatrix Kernel wrappers (Default non-threaded version)
1396  // Computes P.T * A * P -> Ac using a 2-pass algorithm.
1397  // This turned out to be slower on SerialNode, but it might still be helpful when going to Kokkos, so I left it in.
1398  // Currently, this implementation never gets called.
1399  template<class Scalar,
1400  class LocalOrdinal,
1401  class GlobalOrdinal,
1402  class Node>
1403  void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1404  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1405  const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1406  const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1407  const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1408  const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1409  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1410  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1411  const std::string& label,
1412  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1413 #ifdef HAVE_TPETRA_MMM_TIMINGS
1414  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1415  using Teuchos::TimeMonitor;
1416  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
1417 #endif
1418 
1419  using Teuchos::Array;
1420  using Teuchos::ArrayRCP;
1421  using Teuchos::ArrayView;
1422  using Teuchos::RCP;
1423  using Teuchos::rcp;
1424 
1425  typedef Scalar SC;
1426  typedef LocalOrdinal LO;
1427  typedef GlobalOrdinal GO;
1428  typedef Node NO;
1429  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1430  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1431  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1432 
1433  // number of rows on the process of the fine matrix
1434  // size_t m = Pview.origMatrix->getNodeNumRows();
1435  // number of rows on the process of the coarse matrix
1436  size_t n = Ac.getRowMap()->getNodeNumElements();
1437  LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1438 
1439  // Get Data Pointers
1440  ArrayRCP<const size_t> Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
1441  ArrayRCP<size_t> Acrowptr_RCP;
1442  ArrayRCP<const LO> Acolind_RCP, Pcolind_RCP, Icolind_RCP;
1443  ArrayRCP<LO> Accolind_RCP;
1444  ArrayRCP<const Scalar> Avals_RCP, Pvals_RCP, Ivals_RCP;
1445  ArrayRCP<SC> Acvals_RCP;
1446 
1447  // mfh 27 Sep 2016: "getAllValues" just gets the three CSR arrays
1448  // out of the CrsMatrix. This code computes R * A * (P_local +
1449  // P_remote), where P_local contains the locally owned rows of P,
1450  // and P_remote the (previously Import'ed) remote rows of P.
1451 
1452  Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
1453  Pview.origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);
1454 
1455  if (!Pview.importMatrix.is_null())
1456  Pview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);
1457 
1458  // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1459  // where Teuchos::ArrayRCP::operator[] may be slower than
1460  // Teuchos::ArrayView::operator[].
1461 
1462  // For efficiency
1463  ArrayView<const size_t> Arowptr, Prowptr, Irowptr;
1464  ArrayView<const LO> Acolind, Pcolind, Icolind;
1465  ArrayView<const SC> Avals, Pvals, Ivals;
1466  ArrayView<size_t> Acrowptr;
1467  ArrayView<LO> Accolind;
1468  ArrayView<SC> Acvals;
1469  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1470  Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
1471  if (!Pview.importMatrix.is_null()) {
1472  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1473  }
1474 
1476  // In a first pass, determine the graph of Ac.
1478 
1480  // Get the graph of Ac. This gets the local transpose of P,
1481  // then loops over R, A, P to get the graph of Ac.
1483 
1484 #ifdef HAVE_TPETRA_MMM_TIMINGS
1485  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
1486 #endif
1487 
1489  // Get the local transpose of the graph of P by locally transposing
1490  // all of P
1491 
1492  ArrayRCP<const size_t> Rrowptr_RCP;
1493  ArrayRCP<const LO> Rcolind_RCP;
1494  ArrayRCP<const Scalar> Rvals_RCP;
1495  ArrayView<const size_t> Rrowptr;
1496  ArrayView<const LO> Rcolind;
1497  ArrayView<const SC> Rvals;
1498 
1499  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1500 
1501  using Teuchos::ParameterList;
1502  RCP<ParameterList> transposeParams (new ParameterList);
1503  transposeParams->set ("sort", false);
1504  if (! params.is_null ()) {
1505  transposeParams->set ("compute global constants",
1506  params->get ("compute global constants: temporaries",
1507  false));
1508  }
1509  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1510  transposer.createTransposeLocal (transposeParams);
1511 
1512  Ptrans->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
1513  Rrowptr = Rrowptr_RCP();
1514  Rcolind = Rcolind_RCP();
1515  Rvals = Rvals_RCP();
1516 
1518  // Construct graph
1519 
1520  #ifdef HAVE_TPETRA_MMM_TIMINGS
1521  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP graph"))));
1522  #endif
1523 
1524  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1525  Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1526 
1527  size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1528  size_t nnzPerRowA = 100;
1529  if (Aview.origMatrix->getNodeNumEntries() > 0)
1530  nnzPerRowA = Aview.origMatrix->getNodeNumEntries()/Aview.origMatrix->getNodeNumRows();
1531  Acrowptr_RCP.resize(n+1);
1532  Acrowptr = Acrowptr_RCP();
1533  Accolind_RCP.resize(nnz_alloc);
1534  Accolind = Accolind_RCP();
1535 
1536  size_t nnz = 0, nnz_old = 0;
1537  for (size_t i = 0; i < n; i++) {
1538  // mfh 27 Sep 2016: m is the number of rows in the input matrix R
1539  // on the calling process.
1540  Acrowptr[i] = nnz;
1541 
1542  // mfh 27 Sep 2016: For each entry of R in the current row of R
1543  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1544  LO k = Rcolind[kk]; // local column index of current entry of R
1545  // For each entry of A in the current row of A
1546  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1547  LO l = Acolind[ll]; // local column index of current entry of A
1548 
1549  if (Acol2PRow[l] != LO_INVALID) {
1550  // mfh 27 Sep 2016: If the entry of Acol2PRow
1551  // corresponding to the current entry of A is populated, then
1552  // the corresponding row of P is in P_local (i.e., it lives on
1553  // the calling process).
1554 
1555  // Local matrix
1556  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1557 
1558  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1559  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1560  LO j = Pcolind[jj];
1561  LO Acj = Pcol2Accol[j];
1562 
1563  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1564  // New entry
1565  ac_status[Acj] = nnz;
1566  Accolind[nnz] = Acj;
1567  nnz++;
1568  }
1569  }
1570  } else {
1571  // mfh 27 Sep 2016: If the entry of Acol2PRow
1572  // corresponding to the current entry of A is NOT populated (has
1573  // a flag "invalid" value), then the corresponding row of P is
1574  // in P_remote (i.e., it does not live on the calling process).
1575 
1576  // Remote matrix
1577  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1578  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1579  LO j = Icolind[jj];
1580  LO Acj = PIcol2Accol[j];
1581 
1582  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1583  // New entry
1584  ac_status[Acj] = nnz;
1585  Accolind[nnz] = Acj;
1586  nnz++;
1587  }
1588  }
1589  }
1590  }
1591  }
1592  // Resize for next pass if needed
1593  // cag: Maybe we can do something more subtle here, and not double
1594  // the size right away.
1595  if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1596  nnz_alloc *= 2;
1597  nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1598  Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1599  Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1600  }
1601  nnz_old = nnz;
1602  }
1603  Acrowptr[n] = nnz;
1604 
1605  // Downward resize
1606  Accolind_RCP.resize(nnz);
1607  Accolind = Accolind_RCP();
1608 
1609  // Allocate Acvals
1610  Acvals_RCP.resize(nnz, SC_ZERO);
1611  Acvals = Acvals_RCP();
1612 
1613 
1615  // In a second pass, enter the values into Acvals.
1617 
1618  #ifdef HAVE_TPETRA_MMM_TIMINGS
1619  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
1620  #endif
1621 
1622 
1623  for (size_t k = 0; k < n; k++) {
1624  for (size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
1625  LO i = Pcolind[ii];
1626  const SC Pki = Pvals[ii];
1627  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1628  LO l = Acolind[ll];
1629  const SC Akl = Avals[ll];
1630  if (Akl == 0.)
1631  continue;
1632  if (Acol2PRow[l] != LO_INVALID) {
1633  // mfh 27 Sep 2016: If the entry of Acol2PRow
1634  // corresponding to the current entry of A is populated, then
1635  // the corresponding row of P is in P_local (i.e., it lives on
1636  // the calling process).
1637 
1638  // Local matrix
1639  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1640  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1641  LO j = Pcolind[jj];
1642  LO Acj = Pcol2Accol[j];
1643  size_t pp;
1644  for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1645  if (Accolind[pp] == Acj)
1646  break;
1647  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1648  // std::runtime_error, "problem with Ac column indices");
1649  Acvals[pp] += Pki*Akl*Pvals[jj];
1650  }
1651  } else {
1652  // mfh 27 Sep 2016: If the entry of Acol2PRow
1653  // corresponding to the current entry of A NOT populated (has
1654  // a flag "invalid" value), then the corresponding row of P is
1655  // in P_remote (i.e., it does not live on the calling process).
1656 
1657  // Remote matrix
1658  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1659  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1660  LO j = Icolind[jj];
1661  LO Acj = PIcol2Accol[j];
1662  size_t pp;
1663  for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1664  if (Accolind[pp] == Acj)
1665  break;
1666  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1667  // std::runtime_error, "problem with Ac column indices");
1668  Acvals[pp] += Pki*Akl*Ivals[jj];
1669  }
1670  }
1671  }
1672  }
1673  }
1674 
1675 
1676 #ifdef HAVE_TPETRA_MMM_TIMINGS
1677  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
1678 #endif
1679 
1680  // Final sort & set of CRS arrays
1681  //
1682  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1683  // matrix-matrix multiply routine sort the entries for us?
1684  Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1685 
1686  // mfh 27 Sep 2016: This just sets pointers.
1687  Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
1688 
1689 #ifdef HAVE_TPETRA_MMM_TIMINGS
1690  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
1691 #endif
1692 
1693  // Final FillComplete
1694  //
1695  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1696  // Import (from domain Map to column Map) construction (which costs
1697  // lots of communication) by taking the previously constructed
1698  // Import object. We should be able to do this without interfering
1699  // with the implementation of the local part of sparse matrix-matrix
1700  // multply above.
1701  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1702  labelList->set("Timer Label",label);
1703  // labelList->set("Sort column Map ghost GIDs")
1704  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1705  RCP<const Export<LO,GO,NO> > dummyExport;
1706  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1707  Pview.origMatrix->getDomainMap(),
1708  Acimport,
1709  dummyExport, labelList);
1710  }
1711 
1712 
1713 
1714  } //End namepsace MMdetails
1715 
1716 } //End namespace Tpetra
1717 //
1718 // Explicit instantiation macro
1719 //
1720 // Must be expanded from within the Tpetra namespace!
1721 //
1722 
1723 #define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
1724  \
1725  template \
1726  void TripleMatrixMultiply::MultiplyRAP( \
1727  const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
1728  bool transposeR, \
1729  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
1730  bool transposeA, \
1731  const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
1732  bool transposeP, \
1733  CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
1734  bool call_FillComplete_on_result, \
1735  const std::string & label, \
1736  const Teuchos::RCP<Teuchos::ParameterList>& params); \
1737  \
1738 
1739 
1740 #endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
bool isFillActive() const
Whether the matrix is not fill complete.
size_t global_size_t
Global size_t object.
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
Utility functions for packing and unpacking sparse matrix entries.
void MultiplyRAP(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &R, bool transposeR, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &P, bool transposeP, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Ac, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.