TpetraExt_TripleMatrixMultiply_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 #ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
42 #define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
43 
45 #include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" //for UnmanagedView
46 #include "Teuchos_VerboseObject.hpp"
47 #include "Teuchos_Array.hpp"
48 #include "Tpetra_Util.hpp"
49 #include "Tpetra_ConfigDefs.hpp"
50 #include "Tpetra_CrsMatrix.hpp"
52 #include "Tpetra_RowMatrixTransposer.hpp"
54 #include "Tpetra_Map.hpp"
55 #include "Tpetra_Export.hpp"
56 #include "Tpetra_Import_Util.hpp"
57 #include "Tpetra_Import_Util2.hpp"
58 #include <algorithm>
59 #include <cmath>
60 #include "Teuchos_FancyOStream.hpp"
61 // #include "KokkosSparse_spgemm.hpp"
62 
63 
71 /*********************************************************************************************************/
72 // Include the architecture-specific kernel partial specializations here
73 // NOTE: This needs to be outside all namespaces
74 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
75 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
76 #include "TpetraExt_MatrixMatrix_HIP.hpp"
77 #include "TpetraExt_MatrixMatrix_SYCL.hpp"
78 
79 namespace Tpetra {
80 
81  namespace TripleMatrixMultiply{
82 
83  //
84  // This method forms the matrix-matrix product Ac = op(R) * op(A) * op(P), where
85  // op(A) == A if transposeA is false,
86  // op(A) == A^T if transposeA is true,
87  // and similarly for op(R) and op(P).
88  //
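// Example (editor's sketch, not part of the original source): forming the
// Galerkin coarse operator Ac = P^T * A * P by passing R = P with
// transposeR = true. A and P are assumed to be fillComplete CrsMatrix
// objects with compatible Maps; Scalar, LO, GO, Node stand in for the
// caller's template parameters. MultiplyRAP fillCompletes Ac on exit.
//
//   typedef Tpetra::CrsMatrix<Scalar,LO,GO,Node> crs_matrix_type;
//   Teuchos::RCP<crs_matrix_type> Ac =
//     Teuchos::rcp(new crs_matrix_type(P->getDomainMap(), 0));
//   Tpetra::TripleMatrixMultiply::MultiplyRAP(*P, true,   // op(R) = P^T
//                                             *A, false,  // op(A) = A
//                                             *P, false,  // op(P) = P
//                                             *Ac);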
89  template <class Scalar,
90  class LocalOrdinal,
91  class GlobalOrdinal,
92  class Node>
93  void MultiplyRAP(
94   const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& R,
95   bool transposeR,
96   const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
97   bool transposeA,
98   const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& P,
99   bool transposeP,
100   CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
101   bool call_FillComplete_on_result,
102  const std::string& label,
103  const Teuchos::RCP<Teuchos::ParameterList>& params)
104  {
105  using Teuchos::null;
106  using Teuchos::RCP;
107  typedef Scalar SC;
108  typedef LocalOrdinal LO;
109  typedef GlobalOrdinal GO;
110  typedef Node NO;
111  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
112  typedef Import<LO,GO,NO> import_type;
113  typedef Export<LO,GO,NO> export_type;
114  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
115  typedef Map<LO,GO,NO> map_type;
116  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
117 
118 #ifdef HAVE_TPETRA_MMM_TIMINGS
119  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
120  using Teuchos::TimeMonitor;
121  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Setup"))));
122 #endif
123 
124  const std::string prefix = "TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";
125 
126  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
127 
128  // The input matrices R, A and P must all be fillComplete.
129  TEUCHOS_TEST_FOR_EXCEPTION(!R.isFillComplete(), std::runtime_error, prefix << "Matrix R is not fill complete.");
130  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
131  TEUCHOS_TEST_FOR_EXCEPTION(!P.isFillComplete(), std::runtime_error, prefix << "Matrix P is not fill complete.");
132 
133  // If transposeR is true, then Rprime will be the transpose of R
134  // (computed explicitly via RowMatrixTransposer). Otherwise, Rprime
135  // will just be a pointer to R.
136  RCP<const crs_matrix_type> Rprime = null;
137  // If transposeA is true, then Aprime will be the transpose of A
138  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
139  // will just be a pointer to A.
140  RCP<const crs_matrix_type> Aprime = null;
141  // If transposeP is true, then Pprime will be the transpose of P
142  // (computed explicitly via RowMatrixTransposer). Otherwise, Pprime
143  // will just be a pointer to P.
144  RCP<const crs_matrix_type> Pprime = null;
145 
146  // Is this a "clean" matrix?
147  //
148  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
149  // locally nor globally indexed, then it was empty. I don't like
150  // this, because the most straightforward implementation presumes
151  // lazy allocation of indices. However, historical precedent
152  // demands that we keep around this predicate as a way to test
153  // whether the matrix is empty.
154  const bool newFlag = !Ac.getGraph()->isLocallyIndexed() && !Ac.getGraph()->isGloballyIndexed();
155 
156  using Teuchos::ParameterList;
157  RCP<ParameterList> transposeParams (new ParameterList);
158  transposeParams->set ("sort", false);
159 
160  if (transposeR && &R != &P) {
161  transposer_type transposer(rcpFromRef (R));
162  Rprime = transposer.createTranspose (transposeParams);
163  } else {
164  Rprime = rcpFromRef(R);
165  }
166 
167  if (transposeA) {
168  transposer_type transposer(rcpFromRef (A));
169  Aprime = transposer.createTranspose (transposeParams);
170  } else {
171  Aprime = rcpFromRef(A);
172  }
173 
174  if (transposeP) {
175  transposer_type transposer(rcpFromRef (P));
176  Pprime = transposer.createTranspose (transposeParams);
177  } else {
178  Pprime = rcpFromRef(P);
179  }
180 
181  // Check size compatibility
182  global_size_t numRCols = R.getDomainMap()->getGlobalNumElements();
183  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
184  global_size_t numPCols = P.getDomainMap()->getGlobalNumElements();
185  global_size_t Rleft = transposeR ? numRCols : R.getGlobalNumRows();
186  global_size_t Rright = transposeR ? R.getGlobalNumRows() : numRCols;
187  global_size_t Aleft = transposeA ? numACols : A.getGlobalNumRows();
188  global_size_t Aright = transposeA ? A.getGlobalNumRows() : numACols;
189  global_size_t Pleft = transposeP ? numPCols : P.getGlobalNumRows();
190  global_size_t Pright = transposeP ? P.getGlobalNumRows() : numPCols;
191  TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
192  prefix << "ERROR, inner dimensions of op(R) and op(A) "
193  "must match for matrix-matrix product. op(R) is "
194  << Rleft << "x" << Rright << ", op(A) is "<< Aleft << "x" << Aright);
195 
196  TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
197  prefix << "ERROR, inner dimensions of op(A) and op(P) "
198  "must match for matrix-matrix product. op(A) is "
199  << Aleft << "x" << Aright << ", op(P) is "<< Pleft << "x" << Pright);
200 
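// A concrete instance of these checks (editor's note): if R is stored as
// 1000 x 40 and transposeR is true, op(R) is 40 x 1000, so op(A) must have
// 1000 rows; with A stored 1000 x 1000 and P stored 1000 x 40 (transposeP
// false), op(A)'s 1000 columns match op(P)'s 1000 rows, and the product
// op(R)*op(A)*op(P) is 40 x 40.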
201  // The result matrix Ac must at least have a row-map that reflects the correct
202  // row-size. Don't check the number of columns because rectangular matrices
203  // which were constructed with only one map can still end up having the
204  // correct capacity and dimensions when filled.
205  TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.getGlobalNumRows(), std::runtime_error,
206  prefix << "ERROR, dimensions of result Ac must "
207  "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.getGlobalNumRows()
208  << " rows, should have at least " << Rleft << std::endl);
209 
210  // It doesn't matter whether Ac is already Filled or not. If it is already
211  // Filled, it must have space allocated for the positions that will be
212  // referenced in forming Ac = op(R)*op(A)*op(P). If it doesn't have enough space,
213  // we'll error out later when trying to store result values.
214 
215  // CGB: However, matrix must be in active-fill
216  TEUCHOS_TEST_FOR_EXCEPT( Ac.isFillActive() == false );
217 
218  // We may need to import remotely-owned sections of P if more than
219  // one process participates in this computation, depending on the scenario.
220  int numProcs = P.getComm()->getSize();
221 
222  // Declare a couple of structs that will be used to hold views of the data
223  // of R, A and P, to be used for fast access during the matrix-multiplication.
224  crs_matrix_struct_type Rview;
225  crs_matrix_struct_type Aview;
226  crs_matrix_struct_type Pview;
227 
228  RCP<const map_type> targetMap_R = Rprime->getRowMap();
229  RCP<const map_type> targetMap_A = Aprime->getRowMap();
230  RCP<const map_type> targetMap_P = Pprime->getRowMap();
231 
232 #ifdef HAVE_TPETRA_MMM_TIMINGS
233  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All I&X"))));
234 #endif
235 
236  // Now import any needed remote rows and populate the Aview struct
237  // NOTE: We assert that an import isn't needed --- since we do the transpose
238  // above to handle that.
239  RCP<const import_type> dummyImporter;
240 
241  if (!(transposeR && &R == &P))
242  MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter, true, label, params);
243 
244  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
245 
246  // We will also need local access to all rows of P that correspond to the
247  // column-map of op(A).
248  if (numProcs > 1)
249  targetMap_P = Aprime->getColMap();
250 
251  // Import any needed remote rows and populate the Pview struct.
252  MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(), false, label, params);
253 
254 
255  RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;
256 
257  bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
258  if (needs_final_export)
259  Actemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Pprime->getColMap(),0));
260  else
261  Actemp = rcp(&Ac,false);// don't allow deallocation
262 
263 #ifdef HAVE_TPETRA_MMM_TIMINGS
264  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Multiply"))));
265 #endif
266 
267  // Call the appropriate method to perform the actual multiplication.
268  if (call_FillComplete_on_result && newFlag) {
269  if (transposeR && &R == &P)
270  MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
271  else
272  MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
273  } else if (call_FillComplete_on_result) {
274  if (transposeR && &R == &P)
275  MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
276  else
277  MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
278  } else {
279  // mfh 27 Sep 2016: Is this the "slow" case? This
280  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
281  // thread-parallel inserts, but that may take some effort.
282  // CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(Ac);
283 
284  // MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
285 
286  // #ifdef HAVE_TPETRA_MMM_TIMINGS
287  // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All FillComplete"))));
288  // #endif
289  // if (call_FillComplete_on_result) {
290  // // We'll call FillComplete on the C matrix before we exit, and give it a
291  // // domain-map and a range-map.
292  // // The domain-map will be the domain-map of B, unless
293  // // op(B)==transpose(B), in which case the range-map of B will be used.
294  // // The range-map will be the range-map of A, unless op(A)==transpose(A),
295  // // in which case the domain-map of A will be used.
296  // if (!C.isFillComplete())
297  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
298  // }
299  // The no-fillComplete path is not implemented; fall back to the newmatrix kernels.
300  if (transposeR && &R == &P)
301  MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
302  else
303  MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
304  }
305 
306  if (needs_final_export) {
307 #ifdef HAVE_TPETRA_MMM_TIMINGS
308  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP exportAndFillComplete"))));
309 #endif
310  Teuchos::ParameterList labelList;
311  labelList.set("Timer Label", label);
312  Teuchos::ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
313 
314  RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
315  bool isMM = true;
316  bool overrideAllreduce = false;
317  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
318  if(!params.is_null()) {
319  Teuchos::ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
320  mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
321  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
322  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
323  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
324  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
325  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
326 
327  labelList.set("compute global constants",params->get("compute global constants",true));
328  }
329  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
330 
331  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM,
332  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
333  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
334 
335  export_type exporter = export_type(*Pprime->getGraph()->getImporter());
336  Actemp->exportAndFillComplete(Acprime,
337  exporter,
338  Acprime->getDomainMap(),
339  Acprime->getRangeMap(),
340  rcp(&labelList,false));
341 
342  }
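// Editor's note on the block above: when P has a nontrivial importer,
// Actemp was built with P's column Map as its row Map, so some of its
// rows are locally computed contributions to rows owned by other
// processes. Constructing an Export from that Import reverses the
// communication pattern, and exportAndFillComplete ships those
// contributions to their owners, summing them into Ac. When P has no
// importer, Actemp aliases Ac and no final communication is needed.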
343 #ifdef HAVE_TPETRA_MMM_STATISTICS
344  printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label+std::string(" RAP MMM"));
345 #endif
346 
347  }
348 
349 
350  } //End namespace TripleMatrixMultiply
351 
352  namespace MMdetails{
353 
354  // Kernel method for computing the local portion of Ac = R*A*P
355  template<class Scalar,
356  class LocalOrdinal,
357  class GlobalOrdinal,
358  class Node>
359  void mult_R_A_P_newmatrix(
360  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
361  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
362  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
363  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
364  const std::string& label,
365  const Teuchos::RCP<Teuchos::ParameterList>& params)
366  {
367  using Teuchos::Array;
368  using Teuchos::ArrayRCP;
369  using Teuchos::ArrayView;
370  using Teuchos::RCP;
371  using Teuchos::rcp;
372 
373  //typedef Scalar SC; // unused
374  typedef LocalOrdinal LO;
375  typedef GlobalOrdinal GO;
376  typedef Node NO;
377 
378  typedef Import<LO,GO,NO> import_type;
379  typedef Map<LO,GO,NO> map_type;
380 
381  // Kokkos typedefs
382  typedef typename map_type::local_map_type local_map_type;
383  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
384  typedef typename KCRS::StaticCrsGraphType graph_t;
385  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
386  typedef typename NO::execution_space execution_space;
387  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
388  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
389 
390 #ifdef HAVE_TPETRA_MMM_TIMINGS
391  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
392  using Teuchos::TimeMonitor;
393  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
394 #endif
395  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
396 
397  // Build the final importer / column map, hash table lookups for Ac
398  RCP<const import_type> Cimport;
399  RCP<const map_type> Ccolmap;
400  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
401  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
402  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
403  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
404  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
405  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
406  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
407 
408 
409  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
410  // indices of P, to local column indices of Ac. (P and Ac have the
411  // same number of columns.) The kernel uses this, instead of
412  // copying the entire input matrix P and converting its column
413  // indices to those of Ac.
414  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
415 
416  if (Pview.importMatrix.is_null()) {
417  // mfh 27 Sep 2016: P has no "remotes," so P and C have the same column Map.
418  Cimport = Pimport;
419  Ccolmap = Pview.colMap;
420  const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
421  // Pcol2Ccol is trivial
422  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
423  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
424  KOKKOS_LAMBDA(const LO i) {
425  Pcol2Ccol(i) = i;
426  });
427  }
428  else {
429  // mfh 27 Sep 2016: P has "remotes," so we need to build the
430  // column Map of C, as well as C's Import object (from its domain
431  // Map to its column Map). C's column Map is the union of the
432  // column Maps of (the local part of) P, and the "remote" part of
433  // P. Ditto for the Import. We have optimized this "setUnion"
434  // operation on Import objects and Maps.
435 
436  // Choose the right variant of setUnion
437  if (!Pimport.is_null() && !Iimport.is_null()) {
438  Cimport = Pimport->setUnion(*Iimport);
439  }
440  else if (!Pimport.is_null() && Iimport.is_null()) {
441  Cimport = Pimport->setUnion();
442  }
443  else if (Pimport.is_null() && !Iimport.is_null()) {
444  Cimport = Iimport->setUnion();
445  }
446  else {
447  throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
448  }
449  Ccolmap = Cimport->getTargetMap();
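// Editor's note (a sketch of the setUnion semantics assumed here):
// Import::setUnion(other) builds an Import whose target Map is the union
// of the two target Maps -- here, the owned and remote column Maps of P --
// while keeping the same source (domain) Map; the zero-argument form
// covers the case where only one of the two Imports exists. The check
// below verifies that the source Map really was left unchanged.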
450 
451  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
452  // in general. We should get rid of it in order to reduce
453  // communication costs of sparse matrix-matrix multiply.
454  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
455  std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
456 
457  // NOTE: This is not efficient and should be folded into setUnion
458  //
459  // mfh 27 Sep 2016: What the above comment means, is that the
460  // setUnion operation on Import objects could also compute these
461  // local index - to - local index look-up tables.
462  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
463  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
464  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
465  Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
466  });
467  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
468  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
469  });
470 
471  }
472 
473  // Replace the column map
474  //
475  // mfh 27 Sep 2016: We do this because C was originally created
476  // without a column Map. Now we have its column Map.
477  Ac.replaceColMap(Ccolmap);
478 
479  // mfh 27 Sep 2016: Construct tables that map from local column
480  // indices of A, to local row indices of either P_local (the locally
481  // owned part of P), or P_remote (the "imported" remote part of P).
482  //
483  // For column index Aik in row i of A, if the corresponding row of P
484  // exists in the local part of P ("orig") (which I'll call P_local),
485  // then targetMapToOrigRow[Aik] is the local index of that row of P.
486  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
487  //
488  // For column index Aik in row i of A, if the corresponding row of P
489  // exists in the remote part of P ("Import") (which I'll call
490  // P_remote), then targetMapToImportRow[Aik] is the local index of
491  // that row of P. Otherwise, targetMapToImportRow[Aik] is "invalid"
492  // (a flag value).
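// A small illustration (editor's sketch): suppose local column 7 of A has
// global index g. If g is a locally owned row of P, say local row 3, the
// loop below sets
//   targetMapToOrigRow(7)   = 3,          targetMapToImportRow(7) = LO_INVALID;
// if instead g arrived as local row 5 of the imported part of P, it sets
//   targetMapToOrigRow(7)   = LO_INVALID, targetMapToImportRow(7) = 5.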
493 
494  // Run through all the hash table lookups once and for all
495  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
496  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
497  Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
498  GO aidx = Acolmap_local.getGlobalElement(i);
499  LO P_LID = Prowmap_local.getLocalElement(aidx);
500  if (P_LID != LO_INVALID) {
501  targetMapToOrigRow(i) = P_LID;
502  targetMapToImportRow(i) = LO_INVALID;
503  } else {
504  LO I_LID = Irowmap_local.getLocalElement(aidx);
505  targetMapToOrigRow(i) = LO_INVALID;
506  targetMapToImportRow(i) = I_LID;
507  }
508  });
509 
510  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
511  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
512  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
513  mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
514  targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
515  Ac, Cimport, label, params);
516  }
517 
518 
519  // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
520  template<class Scalar,
521  class LocalOrdinal,
522  class GlobalOrdinal,
523  class Node>
524  void mult_R_A_P_reuse(
525  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
526  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
527  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
528  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
529  const std::string& label,
530  const Teuchos::RCP<Teuchos::ParameterList>& params)
531  {
532  using Teuchos::Array;
533  using Teuchos::ArrayRCP;
534  using Teuchos::ArrayView;
535  using Teuchos::RCP;
536  using Teuchos::rcp;
537 
538  //typedef Scalar SC; // unused
539  typedef LocalOrdinal LO;
540  typedef GlobalOrdinal GO;
541  typedef Node NO;
542 
543  typedef Import<LO,GO,NO> import_type;
544  typedef Map<LO,GO,NO> map_type;
545 
546  // Kokkos typedefs
547  typedef typename map_type::local_map_type local_map_type;
548  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
549  typedef typename KCRS::StaticCrsGraphType graph_t;
550  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
551  typedef typename NO::execution_space execution_space;
552  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
553  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
554 
555 #ifdef HAVE_TPETRA_MMM_TIMINGS
556  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
557  using Teuchos::TimeMonitor;
558  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
559 #endif
560  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
561 
562  // Build the final importer / column map, hash table lookups for Ac
563  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
564  RCP<const map_type> Ccolmap = Ac.getColMap();
565  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
566  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
567  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
568  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
569  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
570  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
571  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
572  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
573 
574  // Build the final importer / column map, hash table lookups for C
575  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
576  {
577  // Bcol2Ccol may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix mode.
578  // So, the column map of C may be a strict subset of the column map of P.
579  Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
580  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
581  });
582 
583  if (!Pview.importMatrix.is_null()) {
584  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
585  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
586 
587  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
588  Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
589  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
590  });
591  }
592  }
593 
594  // Run through all the hash table lookups once and for all
595  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
596  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
597  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
598  GO aidx = Acolmap_local.getGlobalElement(i);
599  LO B_LID = Prowmap_local.getLocalElement(aidx);
600  if (B_LID != LO_INVALID) {
601  targetMapToOrigRow(i) = B_LID;
602  targetMapToImportRow(i) = LO_INVALID;
603  } else {
604  LO I_LID = Irowmap_local.getLocalElement(aidx);
605  targetMapToOrigRow(i) = LO_INVALID;
606  targetMapToImportRow(i) = I_LID;
607 
608  }
609  });
610 
611  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
612  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
613  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
614  mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
615  targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
616  Ac, Cimport, label, params);
617  }
618 
619 
620  // Kernel method for computing the local portion of Ac = P^T*A*P
621  template<class Scalar,
622  class LocalOrdinal,
623  class GlobalOrdinal,
624  class Node>
625  void mult_PT_A_P_newmatrix(
626  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
627  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
628  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
629  const std::string& label,
630  const Teuchos::RCP<Teuchos::ParameterList>& params)
631  {
632  using Teuchos::Array;
633  using Teuchos::ArrayRCP;
634  using Teuchos::ArrayView;
635  using Teuchos::RCP;
636  using Teuchos::rcp;
637 
638  //typedef Scalar SC; // unused
639  typedef LocalOrdinal LO;
640  typedef GlobalOrdinal GO;
641  typedef Node NO;
642 
643  typedef Import<LO,GO,NO> import_type;
644  typedef Map<LO,GO,NO> map_type;
645 
646  // Kokkos typedefs
647  typedef typename map_type::local_map_type local_map_type;
648  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
649  typedef typename KCRS::StaticCrsGraphType graph_t;
650  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
651  typedef typename NO::execution_space execution_space;
652  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
653  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
654 
655 #ifdef HAVE_TPETRA_MMM_TIMINGS
656  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
657  using Teuchos::TimeMonitor;
658  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP M5 Cmap")))));
659 #endif
660  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
661 
662  // Build the final importer / column map, hash table lookups for Ac
663  RCP<const import_type> Cimport;
664  RCP<const map_type> Ccolmap;
665  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
666  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
667  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
668  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
669  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
670  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
671  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
672 
673 
674  // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
675  // indices of P, to local column indices of Ac. (P and Ac have the
676  // same number of columns.) The kernel uses this, instead of
677  // copying the entire input matrix P and converting its column
678  // indices to those of Ac.
679  lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
680 
681  if (Pview.importMatrix.is_null()) {
682  // mfh 27 Sep 2016: P has no "remotes," so P and C have the same column Map.
683  Cimport = Pimport;
684  Ccolmap = Pview.colMap;
685  const LO colMapSize = static_cast<LO>(Pview.colMap->getLocalNumElements());
686  // Pcol2Ccol is trivial
687  Kokkos::parallel_for("Tpetra::mult_PT_A_P_newmatrix::Pcol2Ccol_fill",
688  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
689  KOKKOS_LAMBDA(const LO i) {
690  Pcol2Ccol(i) = i;
691  });
692  }
693  else {
694  // mfh 27 Sep 2016: P has "remotes," so we need to build the
695  // column Map of C, as well as C's Import object (from its domain
696  // Map to its column Map). C's column Map is the union of the
697  // column Maps of (the local part of) P, and the "remote" part of
698  // P. Ditto for the Import. We have optimized this "setUnion"
699  // operation on Import objects and Maps.
700 
701  // Choose the right variant of setUnion
702  if (!Pimport.is_null() && !Iimport.is_null()) {
703  Cimport = Pimport->setUnion(*Iimport);
704  }
705  else if (!Pimport.is_null() && Iimport.is_null()) {
706  Cimport = Pimport->setUnion();
707  }
708  else if (Pimport.is_null() && !Iimport.is_null()) {
709  Cimport = Iimport->setUnion();
710  }
711  else {
712  throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
713  }
714  Ccolmap = Cimport->getTargetMap();
715 
716  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
717  // in general. We should get rid of it in order to reduce
718  // communication costs of sparse matrix-matrix multiply.
719  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
720  std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");
721 
722  // NOTE: This is not efficient and should be folded into setUnion
723  //
724  // mfh 27 Sep 2016: What the above comment means, is that the
725  // setUnion operation on Import objects could also compute these
726  // local index - to - local index look-up tables.
727  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
728  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
729  Kokkos::parallel_for("Tpetra::mult_PT_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
730  Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
731  });
732  Kokkos::parallel_for("Tpetra::mult_PT_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
733  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
734  });
735 
736  }
737 
738  // Replace the column map
739  //
740  // mfh 27 Sep 2016: We do this because C was originally created
741  // without a column Map. Now we have its column Map.
742  Ac.replaceColMap(Ccolmap);
743 
744  // mfh 27 Sep 2016: Construct tables that map from local column
745  // indices of A, to local row indices of either P_local (the locally
746  // owned part of P), or P_remote (the "imported" remote part of P).
747  //
748  // For column index Aik in row i of A, if the corresponding row of P
749  // exists in the local part of P ("orig") (which I'll call P_local),
750  // then targetMapToOrigRow[Aik] is the local index of that row of P.
751  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
752  //
753  // For column index Aik in row i of A, if the corresponding row of P
754  // exists in the remote part of P ("Import") (which I'll call
755  // P_remote), then targetMapToImportRow[Aik] is the local index of
756  // that row of P. Otherwise, targetMapToImportRow[Aik] is "invalid"
757  // (a flag value).
758 
759  // Run through all the hash table lookups once and for all
760  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
761  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
762 
763  Kokkos::parallel_for("Tpetra::mult_PT_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
764  GO aidx = Acolmap_local.getGlobalElement(i);
765  LO P_LID = Prowmap_local.getLocalElement(aidx);
766  if (P_LID != LO_INVALID) {
767  targetMapToOrigRow(i) = P_LID;
768  targetMapToImportRow(i) = LO_INVALID;
769  } else {
770  LO I_LID = Irowmap_local.getLocalElement(aidx);
771  targetMapToOrigRow(i) = LO_INVALID;
772  targetMapToImportRow(i) = I_LID;
773  }
774  });
775 
776  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
777  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
778  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
779  mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
780  targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
781  Ac, Cimport, label, params);
782  }
783 
784  // Kernel method for computing the local portion of Ac = P^T*A*P (reuse mode)
785  template<class Scalar,
786  class LocalOrdinal,
787  class GlobalOrdinal,
788  class Node>
789  void mult_PT_A_P_reuse(
790  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
791  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
792  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
793  const std::string& label,
794  const Teuchos::RCP<Teuchos::ParameterList>& params)
795  {
796  using Teuchos::Array;
797  using Teuchos::ArrayRCP;
798  using Teuchos::ArrayView;
799  using Teuchos::RCP;
800  using Teuchos::rcp;
801 
802  //typedef Scalar SC; // unused
803  typedef LocalOrdinal LO;
804  typedef GlobalOrdinal GO;
805  typedef Node NO;
806 
807  typedef Import<LO,GO,NO> import_type;
808  typedef Map<LO,GO,NO> map_type;
809 
810  // Kokkos typedefs
811  typedef typename map_type::local_map_type local_map_type;
812  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
813  typedef typename KCRS::StaticCrsGraphType graph_t;
814  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
815  typedef typename NO::execution_space execution_space;
816  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
817  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
818 
819 #ifdef HAVE_TPETRA_MMM_TIMINGS
820  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
821  using Teuchos::TimeMonitor;
822  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
823 #endif
824  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
825 
826  // Build the final importer / column map, hash table lookups for Ac
827  RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
828  RCP<const map_type> Ccolmap = Ac.getColMap();
829  RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
830  RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
831  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
832  local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
833  local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
834  local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
835  local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
836  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
837 
838  // Build the final importer / column map, hash table lookups for C
839  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getLocalNumElements()), Icol2Ccol;
840  {
841  // Bcol2Ccol may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix mode.
842  // So, the column map of C may be a strict subset of the column map of P.
843  Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
844  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
845  });
846 
847  if (!Pview.importMatrix.is_null()) {
848  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
849  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
850 
851  Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getLocalNumElements());
852  Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
853  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
854  });
855  }
856  }
857 
858  // Run through all the hash table lookups once and for all
859  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
860  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
861  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
862  GO aidx = Acolmap_local.getGlobalElement(i);
863  LO B_LID = Prowmap_local.getLocalElement(aidx);
864  if (B_LID != LO_INVALID) {
865  targetMapToOrigRow(i) = B_LID;
866  targetMapToImportRow(i) = LO_INVALID;
867  } else {
868  LO I_LID = Irowmap_local.getLocalElement(aidx);
869  targetMapToOrigRow(i) = LO_INVALID;
870  targetMapToImportRow(i) = I_LID;
871 
872  }
873  });
874 
875  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
876  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
877  KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
878  mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
879  targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
880  Ac, Cimport, label, params);
881  }
882 
883 
884  /*********************************************************************************************************/
885  // RAP NewMatrix Kernel wrappers (Default non-threaded version)
886  // Computes R * A * P -> Ac using classic Gustavson approach
887  template<class Scalar,
888  class LocalOrdinal,
889  class GlobalOrdinal,
890  class Node,
891  class LocalOrdinalViewType>
892  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
893  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
894  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
895  const LocalOrdinalViewType & Acol2Prow_dev,
896  const LocalOrdinalViewType & Acol2PIrow_dev,
897  const LocalOrdinalViewType & Pcol2Accol_dev,
898  const LocalOrdinalViewType & PIcol2Accol_dev,
899  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
900  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
901  const std::string& label,
902  const Teuchos::RCP<Teuchos::ParameterList>& params) {
903 #ifdef HAVE_TPETRA_MMM_TIMINGS
904  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
905  using Teuchos::TimeMonitor;
906  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
907 #endif
908 
909  using Teuchos::Array;
910  using Teuchos::ArrayRCP;
911  using Teuchos::ArrayView;
912  using Teuchos::RCP;
913  using Teuchos::rcp;
914 
915  // Lots and lots of typedefs
916  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
917  typedef typename KCRS::StaticCrsGraphType graph_t;
918  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
919  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
920  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
921  typedef typename KCRS::values_type::non_const_type scalar_view_t;
922 
923  typedef Scalar SC;
924  typedef LocalOrdinal LO;
925  typedef GlobalOrdinal GO;
926  typedef Node NO;
927  typedef Map<LO,GO,NO> map_type;
928  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
929  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
930  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
931 
932  // Sizes
933  RCP<const map_type> Accolmap = Ac.getColMap();
934  size_t m = Rview.origMatrix->getLocalNumRows();
935  size_t n = Accolmap->getLocalNumElements();
936  size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
937 
938  // Routine runs on host; have to put arguments on host, too
939  auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
940  Acol2Prow_dev);
941  auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
942  Acol2PIrow_dev);
943  auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
944  Pcol2Accol_dev);
945  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
946  PIcol2Accol_dev);
947 
948  // Grab the Kokkos::SparseCrsMatrices & inner stuff
949  const auto Amat = Aview.origMatrix->getLocalMatrixHost();
950  const auto Pmat = Pview.origMatrix->getLocalMatrixHost();
951  const auto Rmat = Rview.origMatrix->getLocalMatrixHost();
952 
953  auto Arowptr = Amat.graph.row_map;
954  auto Prowptr = Pmat.graph.row_map;
955  auto Rrowptr = Rmat.graph.row_map;
956  const auto Acolind = Amat.graph.entries;
957  const auto Pcolind = Pmat.graph.entries;
958  const auto Rcolind = Rmat.graph.entries;
959  const auto Avals = Amat.values;
960  const auto Pvals = Pmat.values;
961  const auto Rvals = Rmat.values;
962 
963  typename c_lno_view_t::HostMirror::const_type Irowptr;
964  typename lno_nnz_view_t::HostMirror Icolind;
965  typename scalar_view_t::HostMirror Ivals;
966  if(!Pview.importMatrix.is_null()) {
967  auto lclP = Pview.importMatrix->getLocalMatrixHost();
968  Irowptr = lclP.graph.row_map;
969  Icolind = lclP.graph.entries;
970  Ivals = lclP.values;
971  p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getLocalMaxNumRowEntries());
972  }
973 
974 #ifdef HAVE_TPETRA_MMM_TIMINGS
975  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore - Compare"))));
976 #endif
977 
978  // Classic csr assembly (low memory edition)
979  //
980  // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
981  // The method loops over rows of R, and may resize after processing
982  // each row. Chris Siefert says that this reflects experience in
983  // ML; for the non-threaded case, ML found it faster to spend less
984  // effort on estimation and risk an occasional reallocation.
985  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
986  typename lno_view_t::HostMirror Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
987  typename lno_nnz_view_t::HostMirror Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
988  typename scalar_view_t::HostMirror Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
989 
990  // mfh 27 Sep 2016: The ac_status array is an implementation detail
991  // of the local sparse matrix-matrix multiply routine.
992 
993  // The status array will contain the index into colind where this entry was last deposited.
994  // ac_status[i] < nnz - not in the row yet
995  // ac_status[i] >= nnz - this is the entry where you can find the data
996  // We start with this filled with INVALID's indicating that there are no entries yet.
997  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
998  const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
999  Array<size_t> ac_status(n, ST_INVALID);
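// How ac_status drives the scatter below (editor's sketch): while row i of
// Ac is being assembled, its entries occupy Ccolind[nnz_old .. nnz). For a
// contribution to column Acj of Ac:
//   ac_status[Acj] == INVALID or < nnz_old  -> first hit in this row:
//     set ac_status[Acj] = nnz, append the entry, and advance nnz;
//   ac_status[Acj] >= nnz_old               -> repeat hit in this row:
//     accumulate into Cvals[ac_status[Acj]].
// Markers left over from earlier rows are never cleared; the comparison
// against nnz_old is what makes them harmless.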
1000 
1001  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1002  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1003  //
1004  // For column index Aik in row i of A, Acol2Prow[Aik] tells
1005  // you whether the corresponding row of P belongs to P_local
1006  // ("orig") or P_remote ("Import").
1007 
1008  // For each row of R
1009  size_t nnz = 0, nnz_old = 0;
1010  for (size_t i = 0; i < m; i++) {
1011  // mfh 27 Sep 2016: m is the number of rows in the input matrix R
1012  // on the calling process.
1013  Crowptr[i] = nnz;
1014 
1015  // mfh 27 Sep 2016: For each entry of R in the current row of R
1016  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1017  LO k = Rcolind[kk]; // local column index of current entry of R
1018  const SC Rik = Rvals[kk]; // value of current entry of R
1019  if (Rik == SC_ZERO)
1020  continue; // skip explicitly stored zero values in R
1021  // For each entry of A in the current row of A
1022  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1023  LO l = Acolind[ll]; // local column index of current entry of A
1024  const SC Akl = Avals[ll]; // value of current entry of A
1025  if (Akl == SC_ZERO)
1026  continue; // skip explicitly stored zero values in A
1027 
1028 
1029  if (Acol2Prow[l] != LO_INVALID) {
1030  // mfh 27 Sep 2016: If the entry of Acol2Prow
1031  // corresponding to the current entry of A is populated, then
1032  // the corresponding row of P is in P_local (i.e., it lives on
1033  // the calling process).
1034 
1035  // Local matrix
1036  size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1037 
1038  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1039  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1040  LO j = Pcolind[jj];
1041  LO Acj = Pcol2Accol[j];
1042  SC Plj = Pvals[jj];
1043 
1044  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1045 #ifdef HAVE_TPETRA_DEBUG
1046  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1047  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1048  std::runtime_error,
1049  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1050 #endif
1051  // New entry
1052  ac_status[Acj] = nnz;
1053  Ccolind[nnz] = Acj;
1054  Cvals[nnz] = Rik*Akl*Plj;
1055  nnz++;
1056  } else {
1057  Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1058  }
1059  }
1060  } else {
1061  // mfh 27 Sep 2016: If the entry of Acol2Prow
1062  // corresponding to the current entry of A is NOT populated (has
1063  // a flag "invalid" value), then the corresponding row of P is
1064  // in P_remote (i.e., it does not live on the calling process).
1065 
1066  // Remote matrix
1067  size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1068  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1069  LO j = Icolind[jj];
1070  LO Acj = PIcol2Accol[j];
1071  SC Plj = Ivals[jj];
1072 
1073  if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
1074 #ifdef HAVE_TPETRA_DEBUG
1075  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
1076  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
1077  std::runtime_error,
1078  label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
1079 #endif
1080  // New entry
1081  ac_status[Acj] = nnz;
1082  Ccolind[nnz] = Acj;
1083  Cvals[nnz] = Rik*Akl*Plj;
1084  nnz++;
1085  } else {
1086  Cvals[ac_status[Acj]] += Rik*Akl*Plj;
1087  }
1088  }
1089  }
1090  }
1091  }
1092  // Resize for next pass if needed
1093  if (nnz + n > CSR_alloc) {
1094  CSR_alloc *= 2;
1095  Kokkos::resize(Ccolind,CSR_alloc);
1096  Kokkos::resize(Cvals,CSR_alloc);
1097  }
1098  nnz_old = nnz;
1099  }
1100 
1101  Crowptr[m] = nnz;
1102 
1103  // Downward resize
1104  Kokkos::resize(Ccolind,nnz);
1105  Kokkos::resize(Cvals,nnz);
1106 
1107 #ifdef HAVE_TPETRA_MMM_TIMINGS
1108  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
1109 #endif
1110  auto Crowptr_dev = Kokkos::create_mirror_view_and_copy(
1111  typename KCRS::device_type(), Crowptr);
1112  auto Ccolind_dev = Kokkos::create_mirror_view_and_copy(
1113  typename KCRS::device_type(), Ccolind);
1114  auto Cvals_dev = Kokkos::create_mirror_view_and_copy(
1115  typename KCRS::device_type(), Cvals);
1116 
1117  // Final sort & set of CRS arrays
1118  if (params.is_null() || params->get("sort entries",true))
1119  Import_Util::sortCrsEntries(Crowptr_dev, Ccolind_dev, Cvals_dev);
1120  Ac.setAllValues(Crowptr_dev, Ccolind_dev, Cvals_dev);
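// Editor's note: sortCrsEntries sorts the column indices (and the matching
// values) within each row of the assembled CRS arrays. Tpetra generally
// assumes locally sorted rows in a fillComplete matrix, which is why the
// "sort entries" option defaults to true above.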
1121 
1122 #ifdef HAVE_TPETRA_MMM_TIMINGS
1123  MM = Teuchos::null; MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
1124 #endif
1125 
1126  // Final FillComplete
1127  //
1128  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1129  // Import (from domain Map to column Map) construction (which costs
1130  // lots of communication) by taking the previously constructed
1131  // Import object. We should be able to do this without interfering
1132  // with the implementation of the local part of sparse matrix-matrix
1133  // multiply above.
1134  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1135  labelList->set("Timer Label",label);
1136  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1137  RCP<const Export<LO,GO,NO> > dummyExport;
1138  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1139  Rview.origMatrix->getRangeMap(),
1140  Acimport,
1141  dummyExport,
1142  labelList);
1143 
1144  }
1145 
1146  /*********************************************************************************************************/
1147  // RAP Reuse Kernel wrappers (Default non-threaded version)
1148  // Computes R * A * P -> Ac with Gustavson's algorithm, reusing the existing sparsity pattern of Ac
1149  template<class Scalar,
1150  class LocalOrdinal,
1151  class GlobalOrdinal,
1152  class Node,
1153  class LocalOrdinalViewType>
1154  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
1155  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1156  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1157  const LocalOrdinalViewType & Acol2Prow_dev,
1158  const LocalOrdinalViewType & Acol2PIrow_dev,
1159  const LocalOrdinalViewType & Pcol2Accol_dev,
1160  const LocalOrdinalViewType & PIcol2Accol_dev,
1161  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1162  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1163  const std::string& label,
1164  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1165 #ifdef HAVE_TPETRA_MMM_TIMINGS
1166  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1167  using Teuchos::TimeMonitor;
1168  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore"))));
1169 #endif
1170 
1171  using Teuchos::Array;
1172  using Teuchos::ArrayRCP;
1173  using Teuchos::ArrayView;
1174  using Teuchos::RCP;
1175  using Teuchos::rcp;
1176 
1177  // Lots and lots of typedefs
1178  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
1179  typedef typename KCRS::StaticCrsGraphType graph_t;
1180  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1181  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1182  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1183 
1184  typedef Scalar SC;
1185  typedef LocalOrdinal LO;
1186  typedef GlobalOrdinal GO;
1187  typedef Node NO;
1188  typedef Map<LO,GO,NO> map_type;
1189  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1190  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1191  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1192 
1193  // Sizes
1194  RCP<const map_type> Accolmap = Ac.getColMap();
1195  size_t m = Rview.origMatrix->getLocalNumRows();
1196  size_t n = Accolmap->getLocalNumElements();
1197  size_t p_max_nnz_per_row = Pview.origMatrix->getLocalMaxNumRowEntries();
1198 
1199  // This routine runs on host, so its arguments have to be copied to host, too
1200  auto Acol2Prow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1201  Acol2Prow_dev);
1202  auto Acol2PIrow = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1203  Acol2PIrow_dev);
1204  auto Pcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1205  Pcol2Accol_dev);
1206  auto PIcol2Accol = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(),
1207  PIcol2Accol_dev);
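// Illustration (not part of this file): the host-mirroring idiom above, shown
// standalone.  create_mirror_view_and_copy allocates a new host View only if
// the source's memory space is not host-accessible, and deep-copies the data
// either way, so the result is always safe to read in serial host code:
//
//   Kokkos::View<int*> dev ("dev", 10);  // default (possibly device) space
//   Kokkos::deep_copy (dev, 1);
//   auto host = Kokkos::create_mirror_view_and_copy (Kokkos::HostSpace (), dev);
//   // host(i) may now be read on the host.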
1208 
1209  // Grab the KokkosSparse::CrsMatrix objects and their underlying CRS arrays
1210  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
1211  const KCRS & Pmat = Pview.origMatrix->getLocalMatrixHost();
1212  const KCRS & Rmat = Rview.origMatrix->getLocalMatrixHost();
1213  const KCRS & Cmat = Ac.getLocalMatrixHost();
1214 
1215  c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
1216  const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries , Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
1217  const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
1218  scalar_view_t Cvals = Cmat.values;
1219 
1220  c_lno_view_t Irowptr;
1221  lno_nnz_view_t Icolind;
1222  scalar_view_t Ivals;
1223  if(!Pview.importMatrix.is_null()) {
1224  auto lclP = Pview.importMatrix->getLocalMatrixHost();
1225  Irowptr = lclP.graph.row_map;
1226  Icolind = lclP.graph.entries;
1227  Ivals = lclP.values;
1228  p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getLocalMaxNumRowEntries());
1229  }
1230 
1231 #ifdef HAVE_TPETRA_MMM_TIMINGS
1232  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore - Compare"))));
1233 #endif
1234 
1235  // mfh 27 Sep 2016: The ac_status array is an implementation detail
1236  // of the local sparse matrix-matrix multiply routine.
1237 
1238  // The status array stores, for each column index, the position in Ccolind where
1239  // that column's entry was last deposited.  A value outside the current row's range
1240  // [Crowptr[i], Crowptr[i+1]) means the column is not (yet) present in row i;
1241  // a value inside that range is the position where the entry's data lives.
1242  // We start filled with INVALID; since size_t is unsigned, INVALID is a huge value.
1243  Array<size_t> ac_status(n, ST_INVALID);
1244 
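// Illustration (not part of this file): the range test the INVALID sentinel
// enables.  Since ST_INVALID is the largest size_t, one pair of unsigned
// comparisons distinguishes "in the current row" from both "never deposited"
// and "deposited in an earlier row":
//
//   const bool inCurrentRow =
//     ! (ac_status[j] < OLD_ip || ac_status[j] >= CSR_ip);
//
// A stale status from an earlier row is < OLD_ip, and INVALID is >= CSR_ip,
// so both land in the "not in this row" branch with no extra flag array.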
1245  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1246  // routine. The routine computes Ac := R * A * (P_local + P_remote).
1247  //
1248  // For column index Aik in row i of A, Acol2Prow[Aik] tells
1249  // you whether the corresponding row of P belongs to P_local
1250  // ("orig") or P_remote ("Import").
1251 
1252  // This fence is necessary until the UVM host accesses below (e.g., Crowptr) are changed.
1253  // It is probably also needed in mult_R_A_P_newmatrix_kernel_wrapper, but no test failure has demonstrated that yet.
1254  Kokkos::fence();
1255 
1256  // For each row of R
1257  size_t OLD_ip = 0, CSR_ip = 0;
1258  for (size_t i = 0; i < m; i++) {
1259  // First fill the ac_status array with the locations where we're allowed to
1260  // deposit nonzeros for this row
1261  OLD_ip = Crowptr[i];
1262  CSR_ip = Crowptr[i+1];
1263  for (size_t k = OLD_ip; k < CSR_ip; k++) {
1264  ac_status[Ccolind[k]] = k;
1265 
1266  // Reset values in the row of C
1267  Cvals[k] = SC_ZERO;
1268  }
1269 
1270  // mfh 27 Sep 2016: For each entry of R in the current row of R
1271  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1272  LO k = Rcolind[kk]; // local column index of current entry of R
1273  const SC Rik = Rvals[kk]; // value of current entry of R
1274  if (Rik == SC_ZERO)
1275  continue; // skip explicitly stored zero values in R
1276  // For each entry of A in the current row of A
1277  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1278  LO l = Acolind[ll]; // local column index of current entry of A
1279  const SC Akl = Avals[ll]; // value of current entry of A
1280  if (Akl == SC_ZERO)
1281  continue; // skip explicitly stored zero values in A
1282 
1283 
1284  if (Acol2Prow[l] != LO_INVALID) {
1285  // mfh 27 Sep 2016: If the entry of Acol2Prow
1286  // corresponding to the current entry of A is populated, then
1287  // the corresponding row of P is in P_local (i.e., it lives on
1288  // the calling process).
1289 
1290  // Local matrix
1291  size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);
1292 
1293  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1294  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1295  LO j = Pcolind[jj];
1296  LO Cij = Pcol2Accol[j];
1297  SC Plj = Pvals[jj];
1298 
1299  TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1300  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1301  "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1302 
1303  Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1304  }
1305  } else {
1306  // mfh 27 Sep 2016: If the entry of Acol2Prow
1307  // corresponding to the current entry of A is NOT populated (has
1308  // a flag "invalid" value), then the corresponding row of P is
1309  // in P_remote (i.e., it does not live on the calling process).
1310 
1311  // Remote matrix
1312  size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
1313  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1314  LO j = Icolind[jj];
1315  LO Cij = PIcol2Accol[j];
1316  SC Plj = Ivals[jj];
1317 
1318  TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
1319  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
1320  "(c_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
1321 
1322  Cvals[ac_status[Cij]] += Rik*Akl*Plj;
1323  }
1324  }
1325  }
1326  }
1327  }
1328 
1329 #ifdef HAVE_TPETRA_MMM_TIMINGS
1330  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse ESFC"))));
1331 #endif
1332 
1333  Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());
1334 
1335 }
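// ----------------------------------------------------------------------------
// Illustration (not part of this file): the dense analogue of the sparse
// Gustavson loop nest above, to make the index pattern explicit.  The sparse
// kernels visit only the stored triples (R(i,k), A(k,l), P(l,j)); this dense
// reference visits all of them.  Assumes <vector>; denseRAPReference is a
// hypothetical name.
inline void denseRAPReference (const std::vector<std::vector<double> >& R,
                               const std::vector<std::vector<double> >& A,
                               const std::vector<std::vector<double> >& P,
                               std::vector<std::vector<double> >& C)
{
  for (std::size_t i = 0; i < R.size (); ++i)           // each row i of R
    for (std::size_t k = 0; k < A.size (); ++k)         // R(i,k)
      for (std::size_t l = 0; l < P.size (); ++l)       // A(k,l)
        for (std::size_t j = 0; j < C[i].size (); ++j)  // P(l,j)
          C[i][j] += R[i][k] * A[k][l] * P[l][j];
}
// ----------------------------------------------------------------------------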
1336 
1337 
1338  /*********************************************************************************************************/
1339  // PT_A_P NewMatrix Kernel wrappers (Default, general, non-threaded version)
1340  // Computes P.T * A * P -> Ac
1341  template<class Scalar,
1342  class LocalOrdinal,
1343  class GlobalOrdinal,
1344  class Node,
1345  class LocalOrdinalViewType>
1346  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1347  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1348  const LocalOrdinalViewType & Acol2Prow,
1349  const LocalOrdinalViewType & Acol2PIrow,
1350  const LocalOrdinalViewType & Pcol2Accol,
1351  const LocalOrdinalViewType & PIcol2Accol,
1352  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1353  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1354  const std::string& label,
1355  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1356 #ifdef HAVE_TPETRA_MMM_TIMINGS
1357  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1358  using Teuchos::TimeMonitor;
1359  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1360 #endif
1361 
1362  // We don't need a kernel-level PTAP; we just transpose P here and hand off to the RAP kernel
1363  typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1364  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1365 
1366  using Teuchos::ParameterList;
1367  using Teuchos::RCP;
1368  RCP<ParameterList> transposeParams (new ParameterList);
1369  transposeParams->set ("sort", false);
1370 
1371  if (! params.is_null ()) {
1372  transposeParams->set ("compute global constants",
1373  params->get ("compute global constants: temporaries",
1374  false));
1375  }
1376  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1377  transposer.createTransposeLocal (transposeParams);
1378  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1379  Rview.origMatrix = Ptrans;
1380 
1381  mult_R_A_P_newmatrix_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1382  }
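// ----------------------------------------------------------------------------
// Illustration (not part of this file): the same transpose-then-RAP strategy
// via the public interface.  MultiplyRAP's transposeR flag requests
// op(R) = P^T, so the caller never forms the transpose.  A hedged sketch:
// the helper name explicitPtAP and the initial allocation of Ac are
// assumptions, not the library's prescription.
template<class SC, class LO, class GO, class NO>
Teuchos::RCP<CrsMatrix<SC,LO,GO,NO> >
explicitPtAP (const CrsMatrix<SC,LO,GO,NO>& A,
              const CrsMatrix<SC,LO,GO,NO>& P)
{
  typedef CrsMatrix<SC,LO,GO,NO> crs_type;
  // Rows of Ac correspond to columns of P, i.e., to P's domain Map.
  Teuchos::RCP<crs_type> Ac (new crs_type (P.getDomainMap (), 0));
  TripleMatrixMultiply::MultiplyRAP (P, true, A, false, P, false, *Ac);
  return Ac;
}
// ----------------------------------------------------------------------------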
1383 
1384  /*********************************************************************************************************/
1385  // PT_A_P Reuse Kernel wrappers (Default, general, non-threaded version)
1386  // Computes P.T * A * P -> Ac
1387  template<class Scalar,
1388  class LocalOrdinal,
1389  class GlobalOrdinal,
1390  class Node,
1391  class LocalOrdinalViewType>
1392  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1393  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1394  const LocalOrdinalViewType & Acol2Prow,
1395  const LocalOrdinalViewType & Acol2PIrow,
1396  const LocalOrdinalViewType & Pcol2Accol,
1397  const LocalOrdinalViewType & PIcol2Accol,
1398  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1399  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1400  const std::string& label,
1401  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1402 #ifdef HAVE_TPETRA_MMM_TIMINGS
1403  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1404  using Teuchos::TimeMonitor;
1405  Teuchos::TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose")));
1406 #endif
1407 
1408  // We don't need a kernel-level PTAP; we just transpose P here and hand off to the RAP kernel
1409  typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal, Node> transposer_type;
1410  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1411 
1412  using Teuchos::ParameterList;
1413  using Teuchos::RCP;
1414  RCP<ParameterList> transposeParams (new ParameterList);
1415  transposeParams->set ("sort", false);
1416 
1417  if (! params.is_null ()) {
1418  transposeParams->set ("compute global constants",
1419  params->get ("compute global constants: temporaries",
1420  false));
1421  }
1422  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1423  transposer.createTransposeLocal (transposeParams);
1424  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
1425  Rview.origMatrix = Ptrans;
1426 
1427  mult_R_A_P_reuse_kernel_wrapper(Rview,Aview,Pview,Acol2Prow,Acol2PIrow,Pcol2Accol,PIcol2Accol,Ac,Acimport,label,params);
1428  }
1429 
1430  /*********************************************************************************************************/
1431  // PT_A_P NewMatrix Kernel wrappers (Default non-threaded version)
1432  // Computes P.T * A * P -> Ac using a 2-pass algorithm.
1433  // This turned out to be slower on SerialNode, but it might still be helpful when going to Kokkos, so I left it in.
1434  // Currently, this implementation never gets called.
1435  template<class Scalar,
1436  class LocalOrdinal,
1437  class GlobalOrdinal,
1438  class Node>
1439  void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1440  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
1441  const Teuchos::Array<LocalOrdinal> & Acol2PRow,
1442  const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
1443  const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
1444  const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
1445  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
1446  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
1447  const std::string& label,
1448  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1449 #ifdef HAVE_TPETRA_MMM_TIMINGS
1450  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1451  using Teuchos::TimeMonitor;
1452  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
1453 #endif
1454 
1455  using Teuchos::Array;
1456  using Teuchos::ArrayRCP;
1457  using Teuchos::ArrayView;
1458  using Teuchos::RCP;
1459  using Teuchos::rcp;
1460 
1461  typedef Scalar SC;
1462  typedef LocalOrdinal LO;
1463  typedef GlobalOrdinal GO;
1464  typedef Node NO;
1465  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1466  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1467  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1468 
1469  // number of rows of the fine-level matrix owned by this process
1470  // size_t m = Pview.origMatrix->getLocalNumRows();
1471  // number of rows of the coarse-level matrix owned by this process
1472  size_t n = Ac.getRowMap()->getLocalNumElements();
1473  LO maxAccol = Ac.getColMap()->getMaxLocalIndex();
1474 
1475  // Get Data Pointers
1476  ArrayRCP<size_t> Acrowptr_RCP;
1477  ArrayRCP<LO> Accolind_RCP;
1478  ArrayRCP<SC> Acvals_RCP;
1479 
1480  // mfh 27 Sep 2016: get the three CSR arrays
1481  // out of the CrsMatrix. This code computes R * A * (P_local +
1482  // P_remote), where P_local contains the locally owned rows of P,
1483  // and P_remote the (previously Import'ed) remote rows of P.
1484 
1485  auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1486  auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1487  auto Avals = Aview.origMatrix->getLocalValuesHost(
1488  Tpetra::Access::ReadOnly);
1489  auto Prowptr = Pview.origMatrix->getLocalRowPtrsHost();
1490  auto Pcolind = Pview.origMatrix->getLocalIndicesHost();
1491  auto Pvals = Pview.origMatrix->getLocalValuesHost(
1492  Tpetra::Access::ReadOnly);
1493  decltype(Prowptr) Irowptr;
1494  decltype(Pcolind) Icolind;
1495  decltype(Pvals) Ivals;
1496 
1497  if (!Pview.importMatrix.is_null()) {
1498  Irowptr = Pview.importMatrix->getLocalRowPtrsHost();
1499  Icolind = Pview.importMatrix->getLocalIndicesHost();
1500  Ivals = Pview.importMatrix->getLocalValuesHost(
1501  Tpetra::Access::ReadOnly);
1502  }
1503 
1504  // mfh 27 Sep 2016: Remark below "For efficiency" refers to an issue
1505  // where Teuchos::ArrayRCP::operator[] may be slower than
1506  // Teuchos::ArrayView::operator[].
1507 
1508  // For efficiency
1509  ArrayView<size_t> Acrowptr;
1510  ArrayView<LO> Accolind;
1511  ArrayView<SC> Acvals;
1512 
1514  // In a first pass, determine the graph of Ac.
1516 
1518  // Get the graph of Ac. This gets the local transpose of P,
1519  // then loops over R, A, P to get the graph of Ac.
1521 
1522 #ifdef HAVE_TPETRA_MMM_TIMINGS
1523  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
1524 #endif
1525 
1527  // Get the local transpose of the graph of P by locally transposing
1528  // all of P
1529 
1530  transposer_type transposer (Pview.origMatrix,label+std::string("XP: "));
1531 
1532  using Teuchos::ParameterList;
1533  RCP<ParameterList> transposeParams (new ParameterList);
1534  transposeParams->set ("sort", false);
1535  if (! params.is_null ()) {
1536  transposeParams->set ("compute global constants",
1537  params->get ("compute global constants: temporaries",
1538  false));
1539  }
1540  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans =
1541  transposer.createTransposeLocal (transposeParams);
1542 
1543  auto Rrowptr = Ptrans->getLocalRowPtrsHost();
1544  auto Rcolind = Ptrans->getLocalIndicesHost();
1545  auto Rvals = Ptrans->getLocalValuesHost(Tpetra::Access::ReadOnly);
1546 
1548  // Construct graph
1549 
1550  #ifdef HAVE_TPETRA_MMM_TIMINGS
1551  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP graph"))));
1552  #endif
1553 
1554  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1555  Array<size_t> ac_status(maxAccol + 1, ST_INVALID);
1556 
1557  size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
1558  size_t nnzPerRowA = 100; // fallback estimate, used only if A has no stored entries
1559  if (Aview.origMatrix->getLocalNumEntries() > 0)
1560  nnzPerRowA = Aview.origMatrix->getLocalNumEntries()/Aview.origMatrix->getLocalNumRows();
1561  Acrowptr_RCP.resize(n+1);
1562  Acrowptr = Acrowptr_RCP();
1563  Accolind_RCP.resize(nnz_alloc);
1564  Accolind = Accolind_RCP();
1565 
1566  size_t nnz = 0, nnz_old = 0;
1567  for (size_t i = 0; i < n; i++) {
1568  // n is the number of locally owned rows of Ac (equivalently, of the
1569  // local transpose R = P^T) on the calling process.
1570  Acrowptr[i] = nnz;
1571 
1572  // mfh 27 Sep 2016: For each entry of R in the current row of R
1573  for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
1574  LO k = Rcolind[kk]; // local column index of current entry of R
1575  // For each entry of A in the current row of A
1576  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1577  LO l = Acolind[ll]; // local column index of current entry of A
1578 
1579  if (Acol2PRow[l] != LO_INVALID) {
1580  // mfh 27 Sep 2016: If the entry of Acol2PRow
1581  // corresponding to the current entry of A is populated, then
1582  // the corresponding row of P is in P_local (i.e., it lives on
1583  // the calling process).
1584 
1585  // Local matrix
1586  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1587 
1588  // mfh 27 Sep 2016: Go through all entries in that row of P_local.
1589  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1590  LO j = Pcolind[jj];
1591  LO Acj = Pcol2Accol[j];
1592 
1593  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
1594  // New entry
1595  ac_status[Acj] = nnz;
1596  Accolind[nnz] = Acj;
1597  nnz++;
1598  }
1599  }
1600  } else {
1601  // mfh 27 Sep 2016: If the entry of Acol2PRow
1602  // corresponding to the current entry of A is NOT populated (has
1603  // a flag "invalid" value), then the corresponding row of P is
1604  // in P_remote (i.e., it does not live on the calling process).
1605 
1606  // Remote matrix
1607  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1608  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1609  LO j = Icolind[jj];
1610  LO Acj = PIcol2Accol[j];
1611 
1612  if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old){
1613  // New entry
1614  ac_status[Acj] = nnz;
1615  Accolind[nnz] = Acj;
1616  nnz++;
1617  }
1618  }
1619  }
1620  }
1621  }
1622  // Resize for next pass if needed
1623  // cag: Maybe we can do something more subtle here, and not double
1624  // the size right away.
1625  if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
1626  nnz_alloc *= 2;
1627  nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
1628  Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
1629  Acvals_RCP.resize(nnz_alloc); Acvals = Acvals_RCP();
1630  }
1631  nnz_old = nnz;
1632  }
1633  Acrowptr[n] = nnz;
1634 
1635  // Downward resize
1636  Accolind_RCP.resize(nnz);
1637  Accolind = Accolind_RCP();
1638 
1639  // Allocate Acvals
1640  Acvals_RCP.resize(nnz, SC_ZERO);
1641  Acvals = Acvals_RCP();
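// Illustration (not part of this file): the guess-and-double growth policy in
// the loop above, reduced to its core.  Capacity is grown before any row can
// overflow; "safety" is the most entries a single row could add:
//
//   if (nnz + safety > nnz_alloc) {
//     nnz_alloc = std::max (2 * nnz_alloc, nnz + safety);
//     Accolind_RCP.resize (nnz_alloc);
//   }
//
// Doubling keeps the total cost of all resizes linear in the final nnz, at
// the price of transient over-allocation, hence the downward resize above
// once the true nnz is known.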
1642 
1643 
1645  // In a second pass, enter the values into Acvals.
1647 
1648  #ifdef HAVE_TPETRA_MMM_TIMINGS
1649  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
1650  #endif
1651 
1652 
1653  for (size_t k = 0; k < n; k++) {
1654  for (size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
1655  LO i = Pcolind[ii];
1656  const SC Pki = Pvals[ii];
1657  for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
1658  LO l = Acolind[ll];
1659  const SC Akl = Avals[ll];
1660  if (Akl == SC_ZERO)
1661  continue; // skip explicitly stored zero values in A
1662  if (Acol2PRow[l] != LO_INVALID) {
1663  // mfh 27 Sep 2016: If the entry of Acol2PRow
1664  // corresponding to the current entry of A is populated, then
1665  // the corresponding row of P is in P_local (i.e., it lives on
1666  // the calling process).
1667 
1668  // Local matrix
1669  size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
1670  for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
1671  LO j = Pcolind[jj];
1672  LO Acj = Pcol2Accol[j];
1673  size_t pp;
1674  for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1675  if (Accolind[pp] == Acj)
1676  break;
1677  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1678  // std::runtime_error, "problem with Ac column indices");
1679  Acvals[pp] += Pki*Akl*Pvals[jj];
1680  }
1681  } else {
1682  // mfh 27 Sep 2016: If the entry of Acol2PRow
1683  // corresponding to the current entry of A is NOT populated (has
1684  // a flag "invalid" value), then the corresponding row of P is
1685  // in P_remote (i.e., it does not live on the calling process).
1686 
1687  // Remote matrix
1688  size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
1689  for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
1690  LO j = Icolind[jj];
1691  LO Acj = PIcol2Accol[j];
1692  size_t pp;
1693  for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
1694  if (Accolind[pp] == Acj)
1695  break;
1696  // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
1697  // std::runtime_error, "problem with Ac column indices");
1698  Acvals[pp] += Pki*Akl*Ivals[jj];
1699  }
1700  }
1701  }
1702  }
1703  }
1704 
1705 
1706 #ifdef HAVE_TPETRA_MMM_TIMINGS
1707  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
1708 #endif
1709 
1710  // Final sort & set of CRS arrays
1711  //
1712  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
1713  // matrix-matrix multiply routine sort the entries for us?
1714  Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());
1715 
1716  // mfh 27 Sep 2016: This just sets pointers.
1717  Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);
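// Illustration (not part of this file): per row, sortCrsEntries is equivalent
// to sorting the (column index, value) pairs by column index.  A hedged
// sketch for one row [rowBeg, rowEnd); all names here are hypothetical:
//
//   std::vector<std::size_t> perm (rowEnd - rowBeg);
//   std::iota (perm.begin (), perm.end (), rowBeg);
//   std::sort (perm.begin (), perm.end (),
//              [&](std::size_t a, std::size_t b) { return colind[a] < colind[b]; });
//   // ... then gather colind[perm[t]] and vals[perm[t]] back into positions
//   // rowBeg + t.
//
// Sorted, merged rows are the canonical storage that a normal fillComplete
// would establish, so sorting here keeps the matrix consistent with that
// convention before expertStaticFillComplete.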
1718 
1719 #ifdef HAVE_TPETRA_MMM_TIMINGS
1720  MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
1721 #endif
1722 
1723  // Final FillComplete
1724  //
1725  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1726  // Import (from domain Map to column Map) construction (which costs
1727  // lots of communication) by taking the previously constructed
1728  // Import object. We should be able to do this without interfering
1729  // with the implementation of the local part of sparse matrix-matrix
1730  // multiply above.
1731  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1732  labelList->set("Timer Label",label);
1733  // labelList->set("Sort column Map ghost GIDs")
1734  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1735  RCP<const Export<LO,GO,NO> > dummyExport;
1736  Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
1737  Pview.origMatrix->getDomainMap(), // the range Map of P^T*A*P equals P's domain Map
1738  Acimport,
1739  dummyExport, labelList);
1740  }
1741 
1742 
1743 
1744  } // End namespace MMdetails
1745 
1746 } //End namespace Tpetra
1747 //
1748 // Explicit instantiation macro
1749 //
1750 // Must be expanded from within the Tpetra namespace!
1751 //
1752 
1753 #define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
1754  \
1755  template \
1756  void TripleMatrixMultiply::MultiplyRAP( \
1757  const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
1758  bool transposeR, \
1759  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
1760  bool transposeA, \
1761  const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
1762  bool transposeP, \
1763  CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
1764  bool call_FillComplete_on_result, \
1765  const std::string & label, \
1766  const Teuchos::RCP<Teuchos::ParameterList>& params); \
1767  \
1768 
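// Illustration (not part of this file): a hedged example of how an explicit
// template instantiation (ETI) source file might expand the macro above; the
// alias eti_node_t is hypothetical, and real builds generate one expansion
// per enabled (Scalar, LO, GO, Node) combination.
//
//   #include "TpetraExt_TripleMatrixMultiply_def.hpp"
//   namespace Tpetra {
//     typedef ::Tpetra::Map<>::node_type eti_node_t;
//     TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(double, int, long long, eti_node_t)
//   }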
1769 
1770 #endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP