TpetraExt_TripleMatrixMultiply_def.hpp
// @HEADER
// ***********************************************************************
//
//          Tpetra: Templated Linear Algebra Services Package
//                 Copyright (2008) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact Michael A. Heroux (maherou@sandia.gov)
//
// ************************************************************************
// @HEADER
#ifndef TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP
#define TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP

#include "TpetraExt_MatrixMatrix_ExtraKernels_decl.hpp" // for UnmanagedView
#include "Teuchos_VerboseObject.hpp"
#include "Teuchos_Array.hpp"
#include "Tpetra_Util.hpp"
#include "Tpetra_ConfigDefs.hpp"
#include "Tpetra_CrsMatrix.hpp"
#include "Tpetra_RowMatrixTransposer.hpp"
#include "Tpetra_ConfigDefs.hpp"
#include "Tpetra_Map.hpp"
#include "Tpetra_Export.hpp"
#include "Tpetra_Import_Util.hpp"
#include "Tpetra_Import_Util2.hpp"
#include <algorithm>
#include <cmath>
#include "Teuchos_FancyOStream.hpp"
// #include "KokkosSparse_spgemm.hpp"


/*********************************************************************************************************/
// Include the architecture-specific kernel partial specializations here
// NOTE: This needs to be outside all namespaces
#include "TpetraExt_MatrixMatrix_OpenMP.hpp"
#include "TpetraExt_MatrixMatrix_Cuda.hpp"

namespace Tpetra {

  namespace TripleMatrixMultiply{

    //
    // This method forms the matrix-matrix product Ac = op(R) * op(A) * op(P), where
    // op(A) == A   if transposeA is false,
    // op(A) == A^T if transposeA is true,
    // and similarly for op(R) and op(P).
    //
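    // A minimal usage sketch (hypothetical names): forming the Galerkin
    // coarse-grid operator Ac = P^T * A * P by passing P as both the R and
    // P arguments, with transposeR = true. Assumes A and P are
    // fillComplete CrsMatrix objects; Ac is created empty with P's domain
    // map as its row map:
    //
    //   typedef Tpetra::CrsMatrix<> crs_matrix_type;
    //   Teuchos::RCP<crs_matrix_type> Ac =
    //     Teuchos::rcp(new crs_matrix_type(P->getDomainMap(), 0));
    //   Tpetra::TripleMatrixMultiply::MultiplyRAP(*P, true, *A, false,
    //                                             *P, false, *Ac);
    //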
    template <class Scalar,
              class LocalOrdinal,
              class GlobalOrdinal,
              class Node>
    void MultiplyRAP(const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& R,
                     bool transposeR,
                     const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
                     bool transposeA,
                     const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& P,
                     bool transposeP,
                     CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                     bool call_FillComplete_on_result,
                     const std::string& label,
                     const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::null;
      using Teuchos::RCP;
      typedef Scalar SC;
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;
      typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
      typedef Import<LO,GO,NO> import_type;
      typedef Export<LO,GO,NO> export_type;
      typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
      typedef Map<LO,GO,NO> map_type;
      typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Setup"))));
#endif

      const std::string prefix = "TpetraExt::TripleMatrixMultiply::MultiplyRAP(): ";

      // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);

      // The input matrices R, A and P must all be fillComplete.
      TEUCHOS_TEST_FOR_EXCEPTION(!R.isFillComplete(), std::runtime_error, prefix << "Matrix R is not fill complete.");
      TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
      TEUCHOS_TEST_FOR_EXCEPTION(!P.isFillComplete(), std::runtime_error, prefix << "Matrix P is not fill complete.");

      // If transposeR is true, then Rprime will be the transpose of R
      // (computed explicitly via RowMatrixTransposer). Otherwise, Rprime
      // will just be a pointer to R.
      RCP<const crs_matrix_type> Rprime = null;
      // If transposeA is true, then Aprime will be the transpose of A
      // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
      // will just be a pointer to A.
      RCP<const crs_matrix_type> Aprime = null;
      // If transposeP is true, then Pprime will be the transpose of P
      // (computed explicitly via RowMatrixTransposer). Otherwise, Pprime
      // will just be a pointer to P.
      RCP<const crs_matrix_type> Pprime = null;

      // Is this a "clean" matrix?
      //
      // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
      // locally nor globally indexed, then it was empty. I don't like
      // this, because the most straightforward implementation presumes
      // lazy allocation of indices. However, historical precedent
      // demands that we keep around this predicate as a way to test
      // whether the matrix is empty.
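      //
      // For example (hypothetical): an Ac constructed as
      //   crs_matrix_type Ac(rowMap, 0);
      // and never filled has a graph that is neither locally nor globally
      // indexed, so newFlag is true and the "newmatrix" kernels are used.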
      const bool newFlag = !Ac.getGraph()->isLocallyIndexed() && !Ac.getGraph()->isGloballyIndexed();

      if (transposeR && &R != &P) {
        transposer_type transposer(rcpFromRef (R));
        Rprime = transposer.createTranspose();
      } else {
        Rprime = rcpFromRef(R);
      }

      if (transposeA) {
        transposer_type transposer(rcpFromRef (A));
        Aprime = transposer.createTranspose();
      } else {
        Aprime = rcpFromRef(A);
      }

      if (transposeP) {
        transposer_type transposer(rcpFromRef (P));
        Pprime = transposer.createTranspose();
      } else {
        Pprime = rcpFromRef(P);
      }

      // Check size compatibility
      global_size_t numRCols = R.getDomainMap()->getGlobalNumElements();
      global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
      global_size_t numPCols = P.getDomainMap()->getGlobalNumElements();
      global_size_t Rleft  = transposeR ? numRCols             : R.getGlobalNumRows();
      global_size_t Rright = transposeR ? R.getGlobalNumRows() : numRCols;
      global_size_t Aleft  = transposeA ? numACols             : A.getGlobalNumRows();
      global_size_t Aright = transposeA ? A.getGlobalNumRows() : numACols;
      global_size_t Pleft  = transposeP ? numPCols             : P.getGlobalNumRows();
      global_size_t Pright = transposeP ? P.getGlobalNumRows() : numPCols;
      TEUCHOS_TEST_FOR_EXCEPTION(Rright != Aleft, std::runtime_error,
                                 prefix << "ERROR, inner dimensions of op(R) and op(A) "
                                 "must match for matrix-matrix product. op(R) is "
                                 << Rleft << "x" << Rright << ", op(A) is "<< Aleft << "x" << Aright);

      TEUCHOS_TEST_FOR_EXCEPTION(Aright != Pleft, std::runtime_error,
                                 prefix << "ERROR, inner dimensions of op(A) and op(P) "
                                 "must match for matrix-matrix product. op(A) is "
                                 << Aleft << "x" << Aright << ", op(P) is "<< Pleft << "x" << Pright);

      // The result matrix Ac must at least have a row-map that reflects the correct
      // row-size. Don't check the number of columns because rectangular matrices
      // which were constructed with only one map can still end up having the
      // correct capacity and dimensions when filled.
      TEUCHOS_TEST_FOR_EXCEPTION(Rleft > Ac.getGlobalNumRows(), std::runtime_error,
                                 prefix << "ERROR, dimensions of result Ac must "
                                 "match dimensions of op(R) * op(A) * op(P). Ac has " << Ac.getGlobalNumRows()
                                 << " rows, should have at least " << Rleft << std::endl);

      // It doesn't matter whether Ac is already Filled or not. If it is already
      // Filled, it must have space allocated for the positions that will be
      // referenced in forming Ac = op(R)*op(A)*op(P). If it doesn't have enough space,
      // we'll error out later when trying to store result values.

      // CGB: However, matrix must be in active-fill
      TEUCHOS_TEST_FOR_EXCEPT( Ac.isFillActive() == false );

      // We're going to need to import remotely-owned sections of P if
      // this run uses more than one process, depending on the scenario.
      int numProcs = P.getComm()->getSize();

      // Declare a couple of structs that will be used to hold views of the data
      // of R, A and P, to be used for fast access during the matrix-multiplication.
      crs_matrix_struct_type Rview;
      crs_matrix_struct_type Aview;
      crs_matrix_struct_type Pview;

      RCP<const map_type> targetMap_R = Rprime->getRowMap();
      RCP<const map_type> targetMap_A = Aprime->getRowMap();
      RCP<const map_type> targetMap_P = Pprime->getRowMap();

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All I&X"))));
#endif

      // Now import any needed remote rows and populate the Aview struct
      // NOTE: We assert that an import isn't needed --- since we do the transpose
      // above to handle that.
      RCP<const import_type> dummyImporter;

      if (!(transposeR && &R == &P))
        MMdetails::import_and_extract_views(*Rprime, targetMap_R, Rview, dummyImporter, true, label, params);

      MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);

      // We will also need local access to all rows of P that correspond to the
      // column-map of op(A).
      if (numProcs > 1)
        targetMap_P = Aprime->getColMap();

      // Import any needed remote rows and populate the Pview struct.
      MMdetails::import_and_extract_views(*Pprime, targetMap_P, Pview, Aprime->getGraph()->getImporter(), false, label, params);

      RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Actemp;

      bool needs_final_export = !Pprime->getGraph()->getImporter().is_null();
      if (needs_final_export)
        Actemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Pprime->getColMap(),0));
      else
        Actemp = rcp(&Ac,false); // don't allow deallocation

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All Multiply"))));
#endif

      // Call the appropriate method to perform the actual multiplication.
      if (call_FillComplete_on_result && newFlag) {
        if (transposeR && &R == &P)
          MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
        else
          MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
      } else if (call_FillComplete_on_result) {
        if (transposeR && &R == &P)
          MMdetails::mult_PT_A_P_reuse(Aview, Pview, *Actemp, label, params);
        else
          MMdetails::mult_R_A_P_reuse(Rview, Aview, Pview, *Actemp, label, params);
      } else {
        // mfh 27 Sep 2016: Is this the "slow" case? This
        // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
        // thread-parallel inserts, but that may take some effort.
        // CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(Ac);

        // MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);

        // #ifdef HAVE_TPETRA_MMM_TIMINGS
        //   MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP All FillComplete"))));
        // #endif
        // if (call_FillComplete_on_result) {
        //   // We'll call FillComplete on the C matrix before we exit, and give it a
        //   // domain-map and a range-map.
        //   // The domain-map will be the domain-map of B, unless
        //   // op(B)==transpose(B), in which case the range-map of B will be used.
        //   // The range-map will be the range-map of A, unless op(A)==transpose(A),
        //   // in which case the domain-map of A will be used.
        //   if (!C.isFillComplete())
        //     C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
        // }
        // Not implemented
        if (transposeR && &R == &P)
          MMdetails::mult_PT_A_P_newmatrix(Aview, Pview, *Actemp, label, params);
        else
          MMdetails::mult_R_A_P_newmatrix(Rview, Aview, Pview, *Actemp, label, params);
      }

      if (needs_final_export) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
        MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP exportAndFillComplete"))));
#endif
        Teuchos::ParameterList labelList;
        labelList.set("Timer Label", label);
        Teuchos::ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);

        RCP<crs_matrix_type> Acprime = rcpFromRef(Ac);
        bool isMM = true;
        bool overrideAllreduce = false;
        int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
        if(!params.is_null()) {
          Teuchos::ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
          mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
          mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
          int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
          if(mm_optimization_core_count2 < mm_optimization_core_count) mm_optimization_core_count = mm_optimization_core_count2;
          isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
          overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);

          labelList.set("compute global constants",params->get("compute global constants",true));
        }
        labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");

        labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM,
                              "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
        labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);

        export_type exporter = export_type(*Pprime->getGraph()->getImporter());
        Actemp->exportAndFillComplete(Acprime,
                                      exporter,
                                      Acprime->getDomainMap(),
                                      Acprime->getRangeMap(),
                                      rcp(&labelList,false));

      }
#ifdef HAVE_TPETRA_MMM_STATISTICS
      printMultiplicationStatistics(Actemp->getGraph()->getExporter(), label + std::string(" RAP MMM"));
#endif

    }


  } //End namespace TripleMatrixMultiply

  namespace MMdetails{

    // Kernel method for computing the local portion of Ac = R*A*P
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node>
    void mult_R_A_P_newmatrix(
      CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
      CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
      CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
      CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
      const std::string& label,
      const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      //typedef Scalar SC; // unused
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;

      typedef Import<LO,GO,NO> import_type;
      typedef Map<LO,GO,NO> map_type;

      // Kokkos typedefs
      typedef typename map_type::local_map_type local_map_type;
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::non_const_type lno_view_t;
      typedef typename NO::execution_space execution_space;
      typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
      typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
#endif
      LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

      // Build the final importer / column map, hash table lookups for Ac
      RCP<const import_type> Cimport;
      RCP<const map_type> Ccolmap;
      RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
      RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
      local_map_type Acolmap_local = Aview.colMap->getLocalMap();
      local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
      local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
      local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
      local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();


      // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
      // indices of P, to local column indices of Ac. (P and Ac have the
      // same number of columns.) The kernel uses this, instead of
      // copying the entire input matrix P and converting its column
      // indices to those of Ac.
      lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;

      if (Pview.importMatrix.is_null()) {
        // mfh 27 Sep 2016: P has no "remotes," so P and C have the same column Map.
        Cimport = Pimport;
        Ccolmap = Pview.colMap;
        const LO colMapSize = static_cast<LO>(Pview.colMap->getNodeNumElements());
        // Pcol2Ccol is trivial
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
                             Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
                             KOKKOS_LAMBDA(const LO i) {
                               Pcol2Ccol(i) = i;
                             });
      }
      else {
        // mfh 27 Sep 2016: P has "remotes," so we need to build the
        // column Map of C, as well as C's Import object (from its domain
        // Map to its column Map). C's column Map is the union of the
        // column Maps of (the local part of) P, and the "remote" part of
        // P. Ditto for the Import. We have optimized this "setUnion"
        // operation on Import objects and Maps.

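        // As a small illustration (hypothetical GIDs): if the local part
        // of P has column-map GIDs {0, 1, 2} and the remote part
        // contributes GIDs {5, 7}, then C's column Map is the union
        // {0, 1, 2, 5, 7}, and C's Import is the corresponding union of
        // the two Imports.
        //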
        // Choose the right variant of setUnion
        if (!Pimport.is_null() && !Iimport.is_null()) {
          Cimport = Pimport->setUnion(*Iimport);
        }
        else if (!Pimport.is_null() && Iimport.is_null()) {
          Cimport = Pimport->setUnion();
        }
        else if (Pimport.is_null() && !Iimport.is_null()) {
          Cimport = Iimport->setUnion();
        }
        else {
          throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
        }
        Ccolmap = Cimport->getTargetMap();

        // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
        // in general. We should get rid of it in order to reduce
        // communication costs of sparse matrix-matrix multiply.
        TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                                   std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");

        // NOTE: This is not efficient and should be folded into setUnion
        //
        // mfh 27 Sep 2016: What the above comment means, is that the
        // setUnion operation on Import objects could also compute these
        // local index - to - local index look-up tables.
        Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
        local_map_type Ccolmap_local = Ccolmap->getLocalMap();
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
            Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
          });
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
            Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
          });

      }

      // Replace the column map
      //
      // mfh 27 Sep 2016: We do this because C was originally created
      // without a column Map. Now we have its column Map.
      Ac.replaceColMap(Ccolmap);

      // mfh 27 Sep 2016: Construct tables that map from local column
      // indices of A, to local row indices of either P_local (the locally
      // owned part of P), or P_remote (the "imported" remote part of P).
      //
      // For column index Aik in row i of A, if the corresponding row of P
      // exists in the local part of P ("orig") (which I'll call P_local),
      // then targetMapToOrigRow[Aik] is the local index of that row of P.
      // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
      //
      // For column index Aik in row i of A, if the corresponding row of P
      // exists in the remote part of P ("Import") (which I'll call
      // P_remote), then targetMapToImportRow[Aik] is the local index of
      // that row of P. Otherwise, targetMapToImportRow[Aik] is "invalid"
      // (a flag value).

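      // For example (hypothetical local indices): if column 3 of A
      // corresponds to a row of P owned by this process with local row
      // index 7, then targetMapToOrigRow(3) == 7 and
      // targetMapToImportRow(3) == LO_INVALID; if instead that row was
      // imported, the roles are reversed.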
      // Run through all the hash table lookups once and for all
      lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
      lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
      Kokkos::fence();
      Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
          GO aidx = Acolmap_local.getGlobalElement(i);
          LO P_LID = Prowmap_local.getLocalElement(aidx);
          if (P_LID != LO_INVALID) {
            targetMapToOrigRow(i)   = P_LID;
            targetMapToImportRow(i) = LO_INVALID;
          } else {
            LO I_LID = Irowmap_local.getLocalElement(aidx);
            targetMapToOrigRow(i)   = LO_INVALID;
            targetMapToImportRow(i) = I_LID;
          }
        });

      // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
      // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
      KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
        mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview,
                                            targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
                                            Ac, Cimport, label, params);
    }


    // Kernel method for computing the local portion of Ac = R*A*P (reuse mode)
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node>
    void mult_R_A_P_reuse(
      CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
      CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
      CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
      CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
      const std::string& label,
      const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      //typedef Scalar SC; // unused
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;

      typedef Import<LO,GO,NO> import_type;
      typedef Map<LO,GO,NO> map_type;

      // Kokkos typedefs
      typedef typename map_type::local_map_type local_map_type;
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::non_const_type lno_view_t;
      typedef typename NO::execution_space execution_space;
      typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
      typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
#endif
      LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

      // Build the final importer / column map, hash table lookups for Ac
      RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
      RCP<const map_type> Ccolmap = Ac.getColMap();
      RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
      RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
      local_map_type Acolmap_local = Aview.colMap->getLocalMap();
      local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
      local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
      local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
      local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
      local_map_type Ccolmap_local = Ccolmap->getLocalMap();

      // Build the hash table lookups for C
      lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
      {
        // Bcol2Ccol may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
        // So, the column map of C may be a strict subset of the column map of P
        Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
            Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
          });

        if (!Pview.importMatrix.is_null()) {
          TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                                     std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");

          Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
          Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
              Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
            });
        }
      }

      // Run through all the hash table lookups once and for all
      lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
      lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
      Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
          GO aidx = Acolmap_local.getGlobalElement(i);
          LO B_LID = Prowmap_local.getLocalElement(aidx);
          if (B_LID != LO_INVALID) {
            targetMapToOrigRow(i)   = B_LID;
            targetMapToImportRow(i) = LO_INVALID;
          } else {
            LO I_LID = Irowmap_local.getLocalElement(aidx);
            targetMapToOrigRow(i)   = LO_INVALID;
            targetMapToImportRow(i) = I_LID;
          }
        });

      // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
      // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
      KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
        mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview,
                                        targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
                                        Ac, Cimport, label, params);
    }


    // Kernel method for computing the local portion of Ac = P^T*A*P
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node>
    void mult_PT_A_P_newmatrix(
      CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
      CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
      CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
      const std::string& label,
      const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      //typedef Scalar SC; // unused
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;

      typedef Import<LO,GO,NO> import_type;
      typedef Map<LO,GO,NO> map_type;

      // Kokkos typedefs
      typedef typename map_type::local_map_type local_map_type;
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::non_const_type lno_view_t;
      typedef typename NO::execution_space execution_space;
      typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
      typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP M5 Cmap")))));
#endif
      LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

      // Build the final importer / column map, hash table lookups for Ac
      RCP<const import_type> Cimport;
      RCP<const map_type> Ccolmap;
      RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
      RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
      local_map_type Acolmap_local = Aview.colMap->getLocalMap();
      local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
      local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
      local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
      local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();


      // mfh 27 Sep 2016: Pcol2Ccol is a table that maps from local column
      // indices of P, to local column indices of Ac. (P and Ac have the
      // same number of columns.) The kernel uses this, instead of
      // copying the entire input matrix P and converting its column
      // indices to those of Ac.
      lo_view_t Pcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Pcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;

      if (Pview.importMatrix.is_null()) {
        // mfh 27 Sep 2016: P has no "remotes," so P and C have the same column Map.
        Cimport = Pimport;
        Ccolmap = Pview.colMap;
        const LO colMapSize = static_cast<LO>(Pview.colMap->getNodeNumElements());
        // Pcol2Ccol is trivial
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_fill",
                             Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
                             KOKKOS_LAMBDA(const LO i) {
                               Pcol2Ccol(i) = i;
                             });
      }
      else {
        // mfh 27 Sep 2016: P has "remotes," so we need to build the
        // column Map of C, as well as C's Import object (from its domain
        // Map to its column Map). C's column Map is the union of the
        // column Maps of (the local part of) P, and the "remote" part of
        // P. Ditto for the Import. We have optimized this "setUnion"
        // operation on Import objects and Maps.

        // Choose the right variant of setUnion
        if (!Pimport.is_null() && !Iimport.is_null()) {
          Cimport = Pimport->setUnion(*Iimport);
        }
        else if (!Pimport.is_null() && Iimport.is_null()) {
          Cimport = Pimport->setUnion();
        }
        else if (Pimport.is_null() && !Iimport.is_null()) {
          Cimport = Iimport->setUnion();
        }
        else {
          throw std::runtime_error("TpetraExt::RAP status of matrix importers is nonsensical");
        }
        Ccolmap = Cimport->getTargetMap();

        // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
        // in general. We should get rid of it in order to reduce
        // communication costs of sparse matrix-matrix multiply.
        TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                                   std::runtime_error, "Tpetra::RAP: Import setUnion messed with the DomainMap in an unfortunate way");

        // NOTE: This is not efficient and should be folded into setUnion
        //
        // mfh 27 Sep 2016: What the above comment means, is that the
        // setUnion operation on Import objects could also compute these
        // local index - to - local index look-up tables.
        Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
        local_map_type Ccolmap_local = Ccolmap->getLocalMap();
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Pcol2Ccol_getGlobalElement",range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
            Pcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
          });
        Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
            Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
          });

      }

      // Replace the column map
      //
      // mfh 27 Sep 2016: We do this because C was originally created
      // without a column Map. Now we have its column Map.
      Ac.replaceColMap(Ccolmap);

      // mfh 27 Sep 2016: Construct tables that map from local column
      // indices of A, to local row indices of either P_local (the locally
      // owned part of P), or P_remote (the "imported" remote part of P).
      //
      // For column index Aik in row i of A, if the corresponding row of P
      // exists in the local part of P ("orig") (which I'll call P_local),
      // then targetMapToOrigRow[Aik] is the local index of that row of P.
      // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
      //
      // For column index Aik in row i of A, if the corresponding row of P
      // exists in the remote part of P ("Import") (which I'll call
      // P_remote), then targetMapToImportRow[Aik] is the local index of
      // that row of P. Otherwise, targetMapToImportRow[Aik] is "invalid"
      // (a flag value).

      // Run through all the hash table lookups once and for all
      lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
      lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
      Kokkos::fence();
      Kokkos::parallel_for("Tpetra::mult_R_A_P_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
          GO aidx = Acolmap_local.getGlobalElement(i);
          LO P_LID = Prowmap_local.getLocalElement(aidx);
          if (P_LID != LO_INVALID) {
            targetMapToOrigRow(i)   = P_LID;
            targetMapToImportRow(i) = LO_INVALID;
          } else {
            LO I_LID = Irowmap_local.getLocalElement(aidx);
            targetMapToOrigRow(i)   = LO_INVALID;
            targetMapToImportRow(i) = I_LID;
          }
        });

      // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
      // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
      KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
        mult_PT_A_P_newmatrix_kernel_wrapper(Aview, Pview,
                                             targetMapToOrigRow,targetMapToImportRow, Pcol2Ccol, Icol2Ccol,
                                             Ac, Cimport, label, params);
    }

    // Kernel method for computing the local portion of Ac = P^T*A*P (reuse mode)
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node>
    void mult_PT_A_P_reuse(
      CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
      CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
      CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
      const std::string& label,
      const Teuchos::RCP<Teuchos::ParameterList>& params)
    {
      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      //typedef Scalar SC; // unused
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;

      typedef Import<LO,GO,NO> import_type;
      typedef Map<LO,GO,NO> map_type;

      // Kokkos typedefs
      typedef typename map_type::local_map_type local_map_type;
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::non_const_type lno_view_t;
      typedef typename NO::execution_space execution_space;
      typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
      typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;

#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP M5 Cmap")))));
#endif
      LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();

      // Build the final importer / column map, hash table lookups for Ac
      RCP<const import_type> Cimport = Ac.getGraph()->getImporter();
      RCP<const map_type> Ccolmap = Ac.getColMap();
      RCP<const import_type> Pimport = Pview.origMatrix->getGraph()->getImporter();
      RCP<const import_type> Iimport = Pview.importMatrix.is_null() ? Teuchos::null : Pview.importMatrix->getGraph()->getImporter();
      local_map_type Acolmap_local = Aview.colMap->getLocalMap();
      local_map_type Prowmap_local = Pview.origMatrix->getRowMap()->getLocalMap();
      local_map_type Irowmap_local; if(!Pview.importMatrix.is_null()) Irowmap_local = Pview.importMatrix->getRowMap()->getLocalMap();
      local_map_type Pcolmap_local = Pview.origMatrix->getColMap()->getLocalMap();
      local_map_type Icolmap_local; if(!Pview.importMatrix.is_null()) Icolmap_local = Pview.importMatrix->getColMap()->getLocalMap();
      local_map_type Ccolmap_local = Ccolmap->getLocalMap();

      // Build the hash table lookups for C
      lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Pview.colMap->getNodeNumElements()), Icol2Ccol;
      {
        // Bcol2Ccol may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
        // So, the column map of C may be a strict subset of the column map of P
        Kokkos::parallel_for(range_type(0,Pview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
            Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Pcolmap_local.getGlobalElement(i));
          });

        if (!Pview.importMatrix.is_null()) {
          TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Pview.origMatrix->getDomainMap()),
                                     std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");

          Kokkos::resize(Icol2Ccol,Pview.importMatrix->getColMap()->getNodeNumElements());
          Kokkos::parallel_for(range_type(0,Pview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
              Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
            });
        }
      }

      // Run through all the hash table lookups once and for all
      lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
      lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
      Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
          GO aidx = Acolmap_local.getGlobalElement(i);
          LO B_LID = Prowmap_local.getLocalElement(aidx);
          if (B_LID != LO_INVALID) {
            targetMapToOrigRow(i)   = B_LID;
            targetMapToImportRow(i) = LO_INVALID;
          } else {
            LO I_LID = Irowmap_local.getLocalElement(aidx);
            targetMapToOrigRow(i)   = LO_INVALID;
            targetMapToImportRow(i) = I_LID;
          }
        });

      // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
      // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
      KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::
        mult_PT_A_P_reuse_kernel_wrapper(Aview, Pview,
                                         targetMapToOrigRow,targetMapToImportRow, Bcol2Ccol, Icol2Ccol,
                                         Ac, Cimport, label, params);
    }


    /*********************************************************************************************************/
    // RAP NewMatrix Kernel wrappers (Default non-threaded version)
    // Computes R * A * P -> Ac using the classic Gustavson approach
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node,
             class LocalOrdinalViewType>
    void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_newmatrix_kernel_wrapper(
        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
        const LocalOrdinalViewType & Acol2Prow,
        const LocalOrdinalViewType & Acol2PIrow,
        const LocalOrdinalViewType & Pcol2Accol,
        const LocalOrdinalViewType & PIcol2Accol,
        CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
        Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
        const std::string& label,
        const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore"))));
#endif

      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      // Lots and lots of typedefs
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::const_type c_lno_view_t;
      typedef typename graph_t::row_map_type::non_const_type lno_view_t;
      typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
      typedef typename KCRS::values_type::non_const_type scalar_view_t;

      typedef Scalar SC;
      typedef LocalOrdinal LO;
      typedef GlobalOrdinal GO;
      typedef Node NO;
      typedef Map<LO,GO,NO> map_type;
      const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
      const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
      const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

      // Sizes
      RCP<const map_type> Accolmap = Ac.getColMap();
      size_t m = Rview.origMatrix->getNodeNumRows();
      size_t n = Accolmap->getNodeNumElements();
      size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();

      // Grab the Kokkos::SparseCrsMatrices & inner stuff
      const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
      const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
      const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();

      c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map;
      const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Rcolind = Rmat.graph.entries;
      const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;

      c_lno_view_t Irowptr;
      lno_nnz_view_t Icolind;
      scalar_view_t Ivals;
      if(!Pview.importMatrix.is_null()) {
        Irowptr = Pview.importMatrix->getLocalMatrix().graph.row_map;
        Icolind = Pview.importMatrix->getLocalMatrix().graph.entries;
        Ivals = Pview.importMatrix->getLocalMatrix().values;
        p_max_nnz_per_row = std::max(p_max_nnz_per_row,Pview.importMatrix->getNodeMaxNumRowEntries());
      }

#ifdef HAVE_TPETRA_MMM_TIMINGS
      RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix SerialCore - Compare"))));
#endif

      // Classic csr assembly (low memory edition)
      //
      // mfh 27 Sep 2016: Ac_estimate_nnz does not promise an upper bound.
      // The method loops over rows of R, and may resize after processing
      // each row. Chris Siefert says that this reflects experience in
      // ML; for the non-threaded case, ML found it faster to spend less
      // effort on estimation and risk an occasional reallocation.
      size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
      lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
      lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
      scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);

      // mfh 27 Sep 2016: The ac_status array is an implementation detail
      // of the local sparse matrix-matrix multiply routine.

      // The status array will contain the index into colind where this entry was last deposited.
      //   ac_status[i] <  nnz - not in the row yet
      //   ac_status[i] >= nnz - this is the entry where you can find the data
      // We start with this filled with INVALID's indicating that there are no entries yet.
      // Sadly, this complicates the code due to the fact that size_t's are unsigned.
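      //
      // A small worked example of the trick (hypothetical numbers): while
      // assembling row i, suppose nnz_old == 4 and column j was last
      // deposited at ac_status[j] == 2. Since 2 < 4, that deposit belongs
      // to an earlier row, so column j is treated as a new entry in row i;
      // if instead ac_status[j] == 5 >= nnz_old, the new contribution is
      // accumulated into Cvals[5].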
      const size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
      Array<size_t> ac_status(n, ST_INVALID);

      // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
      // routine. The routine computes Ac := R * A * (P_local + P_remote).
      //
      // For column index Aik in row i of A, Acol2Prow[Aik] tells
      // you whether the corresponding row of P belongs to P_local
      // ("orig") or P_remote ("Import").

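      // Entry-wise, the loops below implement
      //   Ac(i,j) = sum_k sum_l R(i,k) * A(k,l) * P(l,j),
      // with the k loop over the entries of row i of R, the l loop over
      // the entries of row k of A, and the j loop over the entries of
      // row l of P (local or remote).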
      // For each row of R
      size_t nnz = 0, nnz_old = 0;
      for (size_t i = 0; i < m; i++) {
        // mfh 27 Sep 2016: m is the number of rows in the input matrix R
        // on the calling process.
        Crowptr[i] = nnz;

        // mfh 27 Sep 2016: For each entry of R in the current row of R
        for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
          LO k = Rcolind[kk]; // local column index of current entry of R
          const SC Rik = Rvals[kk]; // value of current entry of R
          if (Rik == SC_ZERO)
            continue; // skip explicitly stored zero values in R
          // For each entry of A in the current row of A
          for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
            LO l = Acolind[ll]; // local column index of current entry of A
            const SC Akl = Avals[ll]; // value of current entry of A
            if (Akl == SC_ZERO)
              continue; // skip explicitly stored zero values in A


            if (Acol2Prow[l] != LO_INVALID) {
              // mfh 27 Sep 2016: If the entry of Acol2Prow
              // corresponding to the current entry of A is populated, then
              // the corresponding row of P is in P_local (i.e., it lives on
              // the calling process).

              // Local matrix
              size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);

              // mfh 27 Sep 2016: Go through all entries in that row of P_local.
              for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
                LO j = Pcolind[jj];
                LO Acj = Pcol2Accol[j];
                SC Plj = Pvals[jj];

                if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
                  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
                  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
                                             std::runtime_error,
                                             label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
#endif
                  // New entry
                  ac_status[Acj] = nnz;
                  Ccolind[nnz] = Acj;
                  Cvals[nnz] = Rik*Akl*Plj;
                  nnz++;
                } else {
                  Cvals[ac_status[Acj]] += Rik*Akl*Plj;
                }
              }
            } else {
              // mfh 27 Sep 2016: If the entry of Acol2Prow
              // corresponding to the current entry of A is NOT populated (has
              // a flag "invalid" value), then the corresponding row of P is
              // in P_remote (i.e., it does not live on the calling process).

              // Remote matrix
              size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
              for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
                LO j = Icolind[jj];
                LO Acj = PIcol2Accol[j];
                SC Plj = Ivals[jj];

                if (ac_status[Acj] == INVALID || ac_status[Acj] < nnz_old) {
#ifdef HAVE_TPETRA_DEBUG
                  // Ac_estimate_nnz() is probably not perfect yet. If this happens, we need to allocate more memory..
                  TEUCHOS_TEST_FOR_EXCEPTION(nnz >= Teuchos::as<size_t>(Ccolind.size()),
                                             std::runtime_error,
                                             label << " ERROR, not enough memory allocated for matrix product. Allocated: " << Ccolind.extent(0) << std::endl);
#endif
                  // New entry
                  ac_status[Acj] = nnz;
                  Ccolind[nnz] = Acj;
                  Cvals[nnz] = Rik*Akl*Plj;
                  nnz++;
                } else {
                  Cvals[ac_status[Acj]] += Rik*Akl*Plj;
                }
              }
            }
          }
        }
        // Resize for next pass if needed
        if (nnz + n > CSR_alloc) {
          CSR_alloc *= 2;
          Kokkos::resize(Ccolind,CSR_alloc);
          Kokkos::resize(Cvals,CSR_alloc);
        }
        nnz_old = nnz;
      }

      Crowptr[m] = nnz;

      // Downward resize
      Kokkos::resize(Ccolind,nnz);
      Kokkos::resize(Cvals,nnz);

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix Final Sort"))));
#endif

      // Final sort & set of CRS arrays
      if (params.is_null() || params->get("sort entries",true))
        Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
      Ac.setAllValues(Crowptr, Ccolind, Cvals);

#ifdef HAVE_TPETRA_MMM_TIMINGS
      MM = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Newmatrix ESFC"))));
#endif

      // Final FillComplete
      //
      // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
      // Import (from domain Map to column Map) construction (which costs
      // lots of communication) by taking the previously constructed
      // Import object. We should be able to do this without interfering
      // with the implementation of the local part of sparse matrix-matrix
      // multiply above.
      RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
      labelList->set("Timer Label",label);
      if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
      RCP<const Export<LO,GO,NO> > dummyExport;
      Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
                                  Rview.origMatrix->getRangeMap(),
                                  Acimport,
                                  dummyExport,
                                  labelList);

    }

    /*********************************************************************************************************/
    // RAP Reuse Kernel wrappers (Default non-threaded version)
    // Computes R * A * P -> Ac using reuse Gustavson
    template<class Scalar,
             class LocalOrdinal,
             class GlobalOrdinal,
             class Node,
             class LocalOrdinalViewType>
    void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_R_A_P_reuse_kernel_wrapper(
        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Rview,
        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
        CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
        const LocalOrdinalViewType & Acol2Prow,
        const LocalOrdinalViewType & Acol2PIrow,
        const LocalOrdinalViewType & Pcol2Accol,
        const LocalOrdinalViewType & PIcol2Accol,
        CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
        Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
        const std::string& label,
        const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
      std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
      using Teuchos::TimeMonitor;
      Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore"))));
#endif

      using Teuchos::Array;
      using Teuchos::ArrayRCP;
      using Teuchos::ArrayView;
      using Teuchos::RCP;
      using Teuchos::rcp;

      // Lots and lots of typedefs
      typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_type KCRS;
      typedef typename KCRS::StaticCrsGraphType graph_t;
      typedef typename graph_t::row_map_type::const_type c_lno_view_t;
      typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
      typedef typename KCRS::values_type::non_const_type scalar_view_t;

1156  typedef Scalar SC;
1157  typedef LocalOrdinal LO;
1158  typedef GlobalOrdinal GO;
1159  typedef Node NO;
1160  typedef Map<LO,GO,NO> map_type;
1161  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1162  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1163  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1164 
    // Sizes
    RCP<const map_type> Accolmap = Ac.getColMap();
    size_t m = Rview.origMatrix->getNodeNumRows();
    size_t n = Accolmap->getNodeNumElements();
    size_t p_max_nnz_per_row = Pview.origMatrix->getNodeMaxNumRowEntries();

    // Grab the KokkosSparse::CrsMatrices & inner stuff
    const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
    const KCRS & Pmat = Pview.origMatrix->getLocalMatrix();
    const KCRS & Rmat = Rview.origMatrix->getLocalMatrix();
    const KCRS & Cmat = Ac.getLocalMatrix();

    c_lno_view_t Arowptr = Amat.graph.row_map, Prowptr = Pmat.graph.row_map, Rrowptr = Rmat.graph.row_map, Crowptr = Cmat.graph.row_map;
    const lno_nnz_view_t Acolind = Amat.graph.entries, Pcolind = Pmat.graph.entries, Rcolind = Rmat.graph.entries, Ccolind = Cmat.graph.entries;
    const scalar_view_t Avals = Amat.values, Pvals = Pmat.values, Rvals = Rmat.values;
    scalar_view_t Cvals = Cmat.values;

    c_lno_view_t Irowptr;
    lno_nnz_view_t Icolind;
    scalar_view_t Ivals;
    if (!Pview.importMatrix.is_null()) {
      Irowptr = Pview.importMatrix->getLocalMatrix().graph.row_map;
      Icolind = Pview.importMatrix->getLocalMatrix().graph.entries;
      Ivals   = Pview.importMatrix->getLocalMatrix().values;
      p_max_nnz_per_row = std::max(p_max_nnz_per_row, Pview.importMatrix->getNodeMaxNumRowEntries());
    }

#ifdef HAVE_TPETRA_MMM_TIMINGS
    RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse SerialCore - Compare"))));
#endif

    // mfh 27 Sep 2016: The ac_status array is an implementation detail
    // of the local sparse matrix-matrix multiply routine.

    // For each Ac column index j, ac_status[j] holds the position in
    // Ccolind/Cvals where an entry in column j of the current row was last
    // deposited.  A value outside the current row's range [OLD_ip, CSR_ip)
    // means column j is not present in this row of the static graph.
    // We start with ac_status filled with INVALID's, indicating that there
    // are no entries yet.  Sadly, this complicates the code a bit, because
    // size_t is unsigned.
    Array<size_t> ac_status(n, ST_INVALID);
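
    // Worked micro-example (hypothetical row, for illustration only): suppose
    // row i of Ac occupies positions [4,7) of Ccolind with columns {2,5,9}.
    // The fill loop below then sets ac_status[2]=4, ac_status[5]=5,
    // ac_status[9]=6, so a later contribution R(i,k)*A(k,l)*P(l,j) with Ac
    // column j=5 accumulates into Cvals[ac_status[5]] == Cvals[5], while any
    // column outside {2,5,9} trips the static-graph exception.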

    // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
    // routine.  The routine computes Ac := R * A * (P_local + P_remote).
    //
    // For column index Aik in row i of A, Acol2Prow[Aik] tells
    // you whether the corresponding row of P belongs to P_local
    // ("orig") or P_remote ("Import").

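    // Concretely: for a local column index l of A, Acol2Prow[l] != LO_INVALID
    // means row l of P is locally owned and Acol2Prow[l] is its local row
    // index in P_local; otherwise Acol2PIrow[l] gives the corresponding row
    // index in the imported matrix P_remote.
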
    // For each row of R
    size_t OLD_ip = 0, CSR_ip = 0;
    for (size_t i = 0; i < m; i++) {
      // First fill the ac_status array with the locations where we're allowed
      // to generate nonzeros for this row
      OLD_ip = Crowptr[i];
      CSR_ip = Crowptr[i+1];
      for (size_t k = OLD_ip; k < CSR_ip; k++) {
        ac_status[Ccolind[k]] = k;

        // Reset values in the row of C
        Cvals[k] = SC_ZERO;
      }

      // mfh 27 Sep 2016: For each entry of R in the current row of R
      for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
        LO k = Rcolind[kk];       // local column index of current entry of R
        const SC Rik = Rvals[kk]; // value of current entry of R
        if (Rik == SC_ZERO)
          continue; // skip explicitly stored zero values in R
        // For each entry of A in the current row of A
        for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
          LO l = Acolind[ll];       // local column index of current entry of A
          const SC Akl = Avals[ll]; // value of current entry of A
          if (Akl == SC_ZERO)
            continue; // skip explicitly stored zero values in A

          if (Acol2Prow[l] != LO_INVALID) {
            // mfh 27 Sep 2016: If the entry of Acol2Prow
            // corresponding to the current entry of A is populated, then
            // the corresponding row of P is in P_local (i.e., it lives on
            // the calling process).

            // Local matrix
            size_t Pl = Teuchos::as<size_t>(Acol2Prow[l]);

            // mfh 27 Sep 2016: Go through all entries in that row of P_local.
            for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
              LO j = Pcolind[jj];
              LO Cij = Pcol2Accol[j];
              SC Plj = Pvals[jj];

              TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
                                         std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
                                         "(ac_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

              Cvals[ac_status[Cij]] += Rik*Akl*Plj;
            }
          } else {
            // mfh 27 Sep 2016: If the entry of Acol2Prow
            // corresponding to the current entry of A is NOT populated (has
            // a flag "invalid" value), then the corresponding row of P is
            // in P_remote (i.e., it does not live on the calling process).

            // Remote matrix
            size_t Il = Teuchos::as<size_t>(Acol2PIrow[l]);
            for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
              LO j = Icolind[jj];
              LO Cij = PIcol2Accol[j];
              SC Plj = Ivals[jj];

              TEUCHOS_TEST_FOR_EXCEPTION(ac_status[Cij] < OLD_ip || ac_status[Cij] >= CSR_ip,
                                         std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
                                         "(ac_status = " << ac_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");

              Cvals[ac_status[Cij]] += Rik*Akl*Plj;
            }
          }
        }
      }
    }
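
    // Cost note: this is Gustavson's row-by-row algorithm, so the work is
    // proportional to the expanded triple product, roughly
    //   sum_i sum_{k in R(i,:)} sum_{l in A(k,:)} nnz(P(l,:)),
    // with O(1) placement per contribution thanks to the ac_status lookup.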

#ifdef HAVE_TPETRA_MMM_TIMINGS
    auto MM3 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("RAP Reuse ESFC"))));
#endif

    Ac.fillComplete(Ac.getDomainMap(), Ac.getRangeMap());

  }
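
  // Illustrative sketch (hypothetical caller; R, A, P, Ac are placeholders,
  // not part of this file): the reuse kernel above serves repeated products
  // with an unchanged sparsity pattern, e.g. via the public interface:
  //
  //   Tpetra::TripleMatrixMultiply::MultiplyRAP(*R, false, *A, false, *P, false, *Ac);
  //   // ... later: A's values change, but no structure changes anywhere ...
  //   Tpetra::TripleMatrixMultiply::MultiplyRAP(*R, false, *A, false, *P, false, *Ac);
  //
  // so the second call can recompute Ac's values against its static graph.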


  /*********************************************************************************************************/
  // PT_A_P NewMatrix Kernel wrappers (Default, general, non-threaded version)
  // Computes P.T * A * P -> Ac
  template<class Scalar,
           class LocalOrdinal,
           class GlobalOrdinal,
           class Node,
           class LocalOrdinalViewType>
  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                       CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                                                       const LocalOrdinalViewType & Acol2Prow,
                                                                       const LocalOrdinalViewType & Acol2PIrow,
                                                                       const LocalOrdinalViewType & Pcol2Accol,
                                                                       const LocalOrdinalViewType & PIcol2Accol,
                                                                       CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                                                       Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                                                       const std::string& label,
                                                                       const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
    using Teuchos::TimeMonitor;
    Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
#endif

    // We don't need a kernel-level PTAP; we just transpose P here and call
    // the RAP kernel with R = P^T.
    typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal,Node> transposer_type;
    transposer_type transposer (Pview.origMatrix, label + std::string("XP: "));
    Teuchos::RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(new Teuchos::ParameterList);
    if (!params.is_null())
      transposeParams->set("compute global constants",
                           params->get("compute global constants: temporaries",
                                       false));
    Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
    Rview.origMatrix = Ptrans;

    mult_R_A_P_newmatrix_kernel_wrapper(Rview, Aview, Pview, Acol2Prow, Acol2PIrow, Pcol2Accol, PIcol2Accol, Ac, Acimport, label, params);
  }
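
  // Design note: implementing PTAP as an explicit local transpose of P
  // followed by the generic RAP kernel keeps a single set of triple-product
  // kernels; the price is forming and storing P^T once per call.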

  /*********************************************************************************************************/
  // PT_A_P Reuse Kernel wrappers (Default, general, non-threaded version)
  // Computes P.T * A * P -> Ac
  template<class Scalar,
           class LocalOrdinal,
           class GlobalOrdinal,
           class Node,
           class LocalOrdinalViewType>
  void KernelWrappers3<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_PT_A_P_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                       CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                                                       const LocalOrdinalViewType & Acol2Prow,
                                                                       const LocalOrdinalViewType & Acol2PIrow,
                                                                       const LocalOrdinalViewType & Pcol2Accol,
                                                                       const LocalOrdinalViewType & PIcol2Accol,
                                                                       CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                                                       Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                                                       const std::string& label,
                                                                       const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
    using Teuchos::TimeMonitor;
    Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
#endif

    // We don't need a kernel-level PTAP; we just transpose P here and call
    // the RAP reuse kernel with R = P^T.
    typedef RowMatrixTransposer<Scalar,LocalOrdinal,GlobalOrdinal,Node> transposer_type;
    transposer_type transposer (Pview.origMatrix, label + std::string("XP: "));
    Teuchos::RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(new Teuchos::ParameterList);
    if (!params.is_null())
      transposeParams->set("compute global constants",
                           params->get("compute global constants: temporaries",
                                       false));
    Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);
    CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node> Rview;
    Rview.origMatrix = Ptrans;

    mult_R_A_P_reuse_kernel_wrapper(Rview, Aview, Pview, Acol2Prow, Acol2PIrow, Pcol2Accol, PIcol2Accol, Ac, Acimport, label, params);
  }

  /*********************************************************************************************************/
  // PT_A_P NewMatrix Kernel wrappers (Default non-threaded version)
  // Computes P.T * A * P -> Ac using a 2-pass algorithm.
  // This turned out to be slower on SerialNode, but it might still be helpful when going to Kokkos, so I left it in.
  // Currently, this implementation never gets called.
  template<class Scalar,
           class LocalOrdinal,
           class GlobalOrdinal,
           class Node>
  void KernelWrappers3MMM<Scalar,LocalOrdinal,GlobalOrdinal,Node>::mult_PT_A_P_newmatrix_kernel_wrapper_2pass(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
                                                                       CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Pview,
                                                                       const Teuchos::Array<LocalOrdinal> & Acol2PRow,
                                                                       const Teuchos::Array<LocalOrdinal> & Acol2PRowImport,
                                                                       const Teuchos::Array<LocalOrdinal> & Pcol2Accol,
                                                                       const Teuchos::Array<LocalOrdinal> & PIcol2Accol,
                                                                       CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Ac,
                                                                       Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Acimport,
                                                                       const std::string& label,
                                                                       const Teuchos::RCP<Teuchos::ParameterList>& params) {
#ifdef HAVE_TPETRA_MMM_TIMINGS
    std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
    using Teuchos::TimeMonitor;
    Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix SerialCore"))));
#endif

    using Teuchos::Array;
    using Teuchos::ArrayRCP;
    using Teuchos::ArrayView;
    using Teuchos::RCP;
    using Teuchos::rcp;

    typedef Scalar SC;
    typedef LocalOrdinal LO;
    typedef GlobalOrdinal GO;
    typedef Node NO;
    typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
    const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
    const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();

    // Number of rows on this process of the fine-level matrices (P and A)
    size_t m = Pview.origMatrix->getNodeNumRows();
    // Number of rows on this process of the coarse-level matrix (Ac)
    size_t n = Ac.getRowMap()->getNodeNumElements();
    LO maxAccol = Ac.getColMap()->getMaxLocalIndex();

    // Get Data Pointers
    ArrayRCP<const size_t> Arowptr_RCP, Prowptr_RCP, Irowptr_RCP;
    ArrayRCP<size_t> Acrowptr_RCP;
    ArrayRCP<const LO> Acolind_RCP, Pcolind_RCP, Icolind_RCP;
    ArrayRCP<LO> Accolind_RCP;
    ArrayRCP<const Scalar> Avals_RCP, Pvals_RCP, Ivals_RCP;
    ArrayRCP<SC> Acvals_RCP;

    // mfh 27 Sep 2016: "getAllValues" just gets the three CSR arrays
    // out of the CrsMatrix.  This code computes R * A * (P_local +
    // P_remote), where P_local contains the locally owned rows of P,
    // and P_remote the (previously Import'ed) remote rows of P.

    Aview.origMatrix->getAllValues(Arowptr_RCP, Acolind_RCP, Avals_RCP);
    Pview.origMatrix->getAllValues(Prowptr_RCP, Pcolind_RCP, Pvals_RCP);

    if (!Pview.importMatrix.is_null())
      Pview.importMatrix->getAllValues(Irowptr_RCP, Icolind_RCP, Ivals_RCP);

    // mfh 27 Sep 2016: The remark "For efficiency" below refers to the issue
    // that Teuchos::ArrayRCP::operator[] may be slower than
    // Teuchos::ArrayView::operator[].

    // For efficiency
    ArrayView<const size_t> Arowptr, Prowptr, Irowptr;
    ArrayView<const LO> Acolind, Pcolind, Icolind;
    ArrayView<const SC> Avals, Pvals, Ivals;
    ArrayView<size_t> Acrowptr;
    ArrayView<LO> Accolind;
    ArrayView<SC> Acvals;
    Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
    Prowptr = Prowptr_RCP(); Pcolind = Pcolind_RCP(); Pvals = Pvals_RCP();
    if (!Pview.importMatrix.is_null()) {
      Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
    }
    // In a first pass, determine the graph of Ac.

    // Get the graph of Ac.  This gets the local transpose of P,
    // then loops over R, A, P to get the graph of Ac.

#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP local transpose"))));
#endif

    // Get the local transpose of the graph of P by locally transposing
    // all of P

    ArrayRCP<const size_t> Rrowptr_RCP;
    ArrayRCP<const LO> Rcolind_RCP;
    ArrayRCP<const Scalar> Rvals_RCP;
    ArrayView<const size_t> Rrowptr;
    ArrayView<const LO> Rcolind;
    ArrayView<const SC> Rvals;

    transposer_type transposer (Pview.origMatrix, label + std::string("XP: "));
    RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(new Teuchos::ParameterList);
    if (!params.is_null())
      transposeParams->set("compute global constants",
                           params->get("compute global constants: temporaries",
                                       false));
    RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Ptrans = transposer.createTransposeLocal(transposeParams);

    Ptrans->getAllValues(Rrowptr_RCP, Rcolind_RCP, Rvals_RCP);
    Rrowptr = Rrowptr_RCP();
    Rcolind = Rcolind_RCP();
    Rvals = Rvals_RCP();

    // Construct graph

#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP graph"))));
#endif

    const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
    Array<size_t> ac_status(maxAccol + 1, ST_INVALID);

    size_t nnz_alloc = std::max(Ac_estimate_nnz(*Aview.origMatrix, *Pview.origMatrix), n);
    size_t nnzPerRowA = 100;
    if (Aview.origMatrix->getNodeNumEntries() > 0)
      nnzPerRowA = Aview.origMatrix->getNodeNumEntries() / Aview.origMatrix->getNodeNumRows();
    Acrowptr_RCP.resize(n+1);
    Acrowptr = Acrowptr_RCP();
    Accolind_RCP.resize(nnz_alloc);
    Accolind = Accolind_RCP();

    size_t nnz = 0, nnz_old = 0;
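
    // Deduplication invariant for the graph pass below: entries deposited
    // while processing the *current* row satisfy ac_status[Acj] >= nnz_old,
    // so "ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old" means
    // column Acj has not yet been seen in this row.  E.g., if row 0 deposited
    // columns {3,7} at positions 0 and 1, then at the start of row 1 we have
    // nnz_old == 2, and a later hit on column 3 (ac_status[3] == 0 < 2) is
    // correctly treated as new for row 1.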
    for (size_t i = 0; i < n; i++) {
      // mfh 27 Sep 2016: n is the number of rows of the (locally transposed)
      // matrix R = P^T on the calling process.
      Acrowptr[i] = nnz;

      // mfh 27 Sep 2016: For each entry of R in the current row of R
      for (size_t kk = Rrowptr[i]; kk < Rrowptr[i+1]; kk++) {
        LO k = Rcolind[kk]; // local column index of current entry of R
        // For each entry of A in the current row of A
        for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
          LO l = Acolind[ll]; // local column index of current entry of A

          if (Acol2PRow[l] != LO_INVALID) {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is populated, then
            // the corresponding row of P is in P_local (i.e., it lives on
            // the calling process).

            // Local matrix
            size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);

            // mfh 27 Sep 2016: Go through all entries in that row of P_local.
            for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
              LO j = Pcolind[jj];
              LO Acj = Pcol2Accol[j];

              if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
                // New entry
                ac_status[Acj] = nnz;
                Accolind[nnz] = Acj;
                nnz++;
              }
            }
          } else {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is NOT populated (has
            // a flag "invalid" value), then the corresponding row of P is
            // in P_remote (i.e., it does not live on the calling process).

            // Remote matrix
            size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
            for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
              LO j = Icolind[jj];
              LO Acj = PIcol2Accol[j];

              if (ac_status[Acj] == ST_INVALID || ac_status[Acj] < nnz_old) {
                // New entry
                ac_status[Acj] = nnz;
                Accolind[nnz] = Acj;
                nnz++;
              }
            }
          }
        }
      }
      // Resize for the next pass if needed.
      // cag: Maybe we can do something more subtle here, and not double
      //      the size right away.
      if (nnz + std::max(5*nnzPerRowA, n) > nnz_alloc) {
        nnz_alloc *= 2;
        nnz_alloc = std::max(nnz_alloc, nnz + std::max(5*nnzPerRowA, n));
        Accolind_RCP.resize(nnz_alloc); Accolind = Accolind_RCP();
        Acvals_RCP.resize(nnz_alloc);   Acvals   = Acvals_RCP();
      }
      nnz_old = nnz;
    }
    Acrowptr[n] = nnz;

    // Downward resize
    Accolind_RCP.resize(nnz);
    Accolind = Accolind_RCP();

    // Allocate Acvals
    Acvals_RCP.resize(nnz, SC_ZERO);
    Acvals = Acvals_RCP();
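
    // Note: Acvals was just zero-initialized (SC_ZERO), so the second pass
    // below can accumulate contributions directly with +=.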


    // In a second pass, enter the values into Acvals.

#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix Fill Matrix"))));
#endif

    // Loop over the fine-level rows k of P and A, scattering
    // Ac(i,j) += P(k,i) * A(k,l) * P(l,j).
    for (size_t k = 0; k < m; k++) {
      for (size_t ii = Prowptr[k]; ii < Prowptr[k+1]; ii++) {
        LO i = Pcolind[ii];
        const SC Pki = Pvals[ii];
        for (size_t ll = Arowptr[k]; ll < Arowptr[k+1]; ll++) {
          LO l = Acolind[ll];
          const SC Akl = Avals[ll];
          if (Akl == SC_ZERO)
            continue; // skip explicitly stored zero values in A
          if (Acol2PRow[l] != LO_INVALID) {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is populated, then
            // the corresponding row of P is in P_local (i.e., it lives on
            // the calling process).

            // Local matrix
            size_t Pl = Teuchos::as<size_t>(Acol2PRow[l]);
            for (size_t jj = Prowptr[Pl]; jj < Prowptr[Pl+1]; jj++) {
              LO j = Pcolind[jj];
              LO Acj = Pcol2Accol[j];
              size_t pp;
              for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
                if (Accolind[pp] == Acj)
                  break;
              // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
              //   std::runtime_error, "problem with Ac column indices");
              Acvals[pp] += Pki*Akl*Pvals[jj];
            }
          } else {
            // mfh 27 Sep 2016: If the entry of Acol2PRow
            // corresponding to the current entry of A is NOT populated (has
            // a flag "invalid" value), then the corresponding row of P is
            // in P_remote (i.e., it does not live on the calling process).

            // Remote matrix
            size_t Il = Teuchos::as<size_t>(Acol2PRowImport[l]);
            for (size_t jj = Irowptr[Il]; jj < Irowptr[Il+1]; jj++) {
              LO j = Icolind[jj];
              LO Acj = PIcol2Accol[j];
              size_t pp;
              for (pp = Acrowptr[i]; pp < Acrowptr[i+1]; pp++)
                if (Accolind[pp] == Acj)
                  break;
              // TEUCHOS_TEST_FOR_EXCEPTION(Accolind[pp] != Acj,
              //   std::runtime_error, "problem with Ac column indices");
              Acvals[pp] += Pki*Akl*Ivals[jj];
            }
          }
        }
      }
    }
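
    // Performance note: the linear search over row i of Ac (the "pp" loops
    // above) costs O(row length) per contribution; the one-pass kernels avoid
    // this with the ac_status lookup table, which is presumably part of why
    // this 2-pass variant measured slower on SerialNode.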


#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP sort"))));
#endif

    // Final sort & set of CRS arrays
    //
    // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
    // matrix-matrix multiply routine sort the entries for us?
    Import_Util::sortCrsEntries(Acrowptr_RCP(), Accolind_RCP(), Acvals_RCP());

    // mfh 27 Sep 2016: This just sets pointers.
    Ac.setAllValues(Acrowptr_RCP, Accolind_RCP, Acvals_RCP);

#ifdef HAVE_TPETRA_MMM_TIMINGS
    MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("PTAP Newmatrix ESFC"))));
#endif

    // Final FillComplete
    //
    // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
    // Import (from domain Map to column Map) construction (which costs
    // lots of communication) by taking the previously constructed
    // Import object.  We should be able to do this without interfering
    // with the implementation of the local part of sparse matrix-matrix
    // multiply above.
    RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
    labelList->set("Timer Label", label);
    // labelList->set("Sort column Map ghost GIDs")
    if (!params.is_null())
      labelList->set("compute global constants", params->get("compute global constants", true));
    RCP<const Export<LO,GO,NO> > dummyExport;
    Ac.expertStaticFillComplete(Pview.origMatrix->getDomainMap(),
                                Pview.origMatrix->getDomainMap(),
                                Acimport,
                                dummyExport, labelList);
  }
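
  // Design note: for PTAP the coarse map (the domain Map of P) serves as both
  // the domain and the range Map of Ac, which is why
  // Pview.origMatrix->getDomainMap() is passed twice to
  // expertStaticFillComplete above.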



  } // End namespace MMdetails

} // End namespace Tpetra

//
// Explicit instantiation macro
//
// Must be expanded from within the Tpetra namespace!
//

#define TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(SCALAR,LO,GO,NODE) \
  \
  template \
  void TripleMatrixMultiply::MultiplyRAP( \
    const CrsMatrix< SCALAR , LO , GO , NODE >& R, \
    bool transposeR, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
    bool transposeA, \
    const CrsMatrix< SCALAR , LO , GO , NODE >& P, \
    bool transposeP, \
    CrsMatrix< SCALAR , LO , GO , NODE >& Ac, \
    bool call_FillComplete_on_result, \
    const std::string & label, \
    const Teuchos::RCP<Teuchos::ParameterList>& params);

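// Example (hypothetical instantiation): in an explicit-template-instantiation
// translation unit, inside namespace Tpetra, the macro might be expanded for
// a common set of types; the exact Node type here is an assumption:
//
//   TPETRA_TRIPLEMATRIXMULTIPLY_INSTANT(double, int, long long, Tpetra::Map<>::node_type)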

#endif // TPETRA_TRIPLEMATRIXMULTIPLY_DEF_HPP