Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_DEF_HPP
13 #include "KokkosSparse_Utils.hpp"
14 #include "Tpetra_ConfigDefs.hpp"
16 #include "Teuchos_VerboseObject.hpp"
17 #include "Teuchos_Array.hpp"
18 #include "Tpetra_Util.hpp"
19 #include "Tpetra_CrsMatrix.hpp"
20 #include "Tpetra_BlockCrsMatrix.hpp"
22 #include "Tpetra_RowMatrixTransposer.hpp"
25 #include "Tpetra_Details_makeColMap.hpp"
26 #include "Tpetra_ConfigDefs.hpp"
27 #include "Tpetra_Map.hpp"
28 #include "Tpetra_Export.hpp"
29 #include "Tpetra_Import_Util.hpp"
30 #include "Tpetra_Import_Util2.hpp"
31 #include <algorithm>
32 #include <type_traits>
33 #include "Teuchos_FancyOStream.hpp"
34 
35 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
37 
38 #include "KokkosSparse_spgemm.hpp"
39 #include "KokkosSparse_spadd.hpp"
40 #include "Kokkos_Bitset.hpp"
41 
42 #include <MatrixMarket_Tpetra.hpp>
43 
51 /*********************************************************************************************************/
52 // Include the architecture-specific kernel partial specializations here
53 // NOTE: This needs to be outside all namespaces
54 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
55 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
56 #include "TpetraExt_MatrixMatrix_HIP.hpp"
57 #include "TpetraExt_MatrixMatrix_SYCL.hpp"
58 
59 namespace Tpetra {
60 
61 namespace MatrixMatrix{
62 
63 //
64 // This method forms the matrix-matrix product C = op(A) * op(B), where
65 // op(A) == A if transposeA is false,
66 // op(A) == A^T if transposeA is true,
67 // and similarly for op(B).
68 //
69 template <class Scalar,
70  class LocalOrdinal,
71  class GlobalOrdinal,
72  class Node>
73 void Multiply(
75  bool transposeA,
77  bool transposeB,
79  bool call_FillComplete_on_result,
80  const std::string& label,
81  const Teuchos::RCP<Teuchos::ParameterList>& params)
82 {
83  using Teuchos::null;
84  using Teuchos::RCP;
85  using Teuchos::rcp;
86  typedef Scalar SC;
87  typedef LocalOrdinal LO;
88  typedef GlobalOrdinal GO;
89  typedef Node NO;
90  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
91  typedef Import<LO,GO,NO> import_type;
92  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
93  typedef Map<LO,GO,NO> map_type;
94  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
95 
96 #ifdef HAVE_TPETRA_MMM_TIMINGS
97  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
98  using Teuchos::TimeMonitor;
99  //MM is used to time setup, and then multiply.
100 
101  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
102 #endif
103 
104  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
105 
106  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
107 
108  // The input matrices A and B must both be fillComplete.
109  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
110  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
111 
112  // If transposeA is true, then Aprime will be the transpose of A
113  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
114  // will just be a pointer to A.
115  RCP<const crs_matrix_type> Aprime = null;
116  // If transposeB is true, then Bprime will be the transpose of B
117  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
118  // will just be a pointer to B.
119  RCP<const crs_matrix_type> Bprime = null;
120 
121  // Is this a "clean" matrix?
122  //
123  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
124  // locally nor globally indexed, then it was empty. I don't like
125  // this, because the most straightforward implementation presumes
126  // lazy allocation of indices. However, historical precedent
127  // demands that we keep around this predicate as a way to test
128  // whether the matrix is empty.
129  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
130 
131  bool use_optimized_ATB = false;
132  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
133  use_optimized_ATB = true;
134 
135 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
136  use_optimized_ATB = false;
137 #endif
138 
139  using Teuchos::ParameterList;
140  RCP<ParameterList> transposeParams (new ParameterList);
141  transposeParams->set ("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
142 
143  if (!use_optimized_ATB && transposeA) {
144  transposer_type transposer (rcpFromRef (A));
145  Aprime = transposer.createTranspose (transposeParams);
146  }
147  else {
148  Aprime = rcpFromRef(A);
149  }
150 
151  if (transposeB) {
152  transposer_type transposer (rcpFromRef (B));
153  Bprime = transposer.createTranspose (transposeParams);
154  }
155  else {
156  Bprime = rcpFromRef(B);
157  }
158 
159  // Check size compatibility
160  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
161  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
162  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
163  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
164  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
165  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
166  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
167  prefix << "ERROR, inner dimensions of op(A) and op(B) "
168  "must match for matrix-matrix product. op(A) is "
169  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
170 
171  // The result matrix C must at least have a row-map that reflects the correct
172  // row-size. Don't check the number of columns because rectangular matrices
173  // which were constructed with only one map can still end up having the
174  // correct capacity and dimensions when filled.
175  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
176  prefix << "ERROR, dimensions of result C must "
177  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
178  << " rows, should have at least " << Aouter << std::endl);
179 
180  // It doesn't matter whether C is already Filled or not. If it is already
181  // Filled, it must have space allocated for the positions that will be
182  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
183  // we'll error out later when trying to store result values.
184 
185  // CGB: However, matrix must be in active-fill
186  if (!C.isFillActive()) C.resumeFill();
187 
188  // We're going to need to import remotely-owned sections of A and/or B if
189  // more than one processor is performing this run, depending on the scenario.
190  int numProcs = A.getComm()->getSize();
191 
192  // Declare a couple of structs that will be used to hold views of the data
193  // of A and B, to be used for fast access during the matrix-multiplication.
194  crs_matrix_struct_type Aview;
195  crs_matrix_struct_type Bview;
196 
197  RCP<const map_type> targetMap_A = Aprime->getRowMap();
198  RCP<const map_type> targetMap_B = Bprime->getRowMap();
199 
200 #ifdef HAVE_TPETRA_MMM_TIMINGS
201  {
202  TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
203 #endif
204 
205  // Now import any needed remote rows and populate the Aview struct
206  // NOTE: We assert that an import isn't needed --- since we do the transpose
207  // above to handle that.
208  if (!use_optimized_ATB) {
209  RCP<const import_type> dummyImporter;
210  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
211  }
212 
213  // We will also need local access to all rows of B that correspond to the
214  // column-map of op(A).
215  if (numProcs > 1)
216  targetMap_B = Aprime->getColMap();
217 
218  // Import any needed remote rows and populate the Bview struct.
219  if (!use_optimized_ATB)
220  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
221 
222 #ifdef HAVE_TPETRA_MMM_TIMINGS
223  } //stop MM_importExtract here
224  //stop the setup timer, and start the multiply timer
225  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
226 #endif
227 
228  // Call the appropriate method to perform the actual multiplication.
229  if (use_optimized_ATB) {
230  MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
231 
232  } else if (call_FillComplete_on_result && newFlag) {
233  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
234 
235  } else if (call_FillComplete_on_result) {
236  MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
237 
238  } else {
239  // mfh 27 Sep 2016: Is this the "slow" case? This
240  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
241  // thread-parallel inserts, but that may take some effort.
242  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
243 
244  MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
245  }
246 
247 #ifdef HAVE_TPETRA_MMM_TIMINGS
248  TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
249 #endif
250  if (call_FillComplete_on_result && !C.isFillComplete()) {
251  // We'll call FillComplete on the C matrix before we exit, and give it a
252  // domain-map and a range-map.
253  // The domain-map will be the domain-map of B, unless
254  // op(B)==transpose(B), in which case the range-map of B will be used.
255  // The range-map will be the range-map of A, unless op(A)==transpose(A),
256  // in which case the domain-map of A will be used.
257  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
258  }
259 }
260 
261 //
262 // This method forms the matrix-matrix product C = op(A) * op(B), where
263 // op(A) == A and similarly for op(B). op(A) = A^T is not yet implemented.
264 //
265 template <class Scalar,
266  class LocalOrdinal,
267  class GlobalOrdinal,
268  class Node>
269 void Multiply(
270  const Teuchos::RCP<const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& A,
271  bool transposeA,
272  const Teuchos::RCP<const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& B,
273  bool transposeB,
275  const std::string& label)
276 {
277  using Teuchos::null;
278  using Teuchos::RCP;
279  using Teuchos::rcp;
280  typedef Scalar SC;
281  typedef LocalOrdinal LO;
282  typedef GlobalOrdinal GO;
283  typedef Node NO;
284  typedef BlockCrsMatrixStruct<SC,LO,GO,NO> blockcrs_matrix_struct_type;
285  typedef Map<LO,GO,NO> map_type;
286  typedef Import<LO,GO,NO> import_type;
287 
288  std::string prefix = std::string("TpetraExt ") + label + std::string(": ");
289 
290  TEUCHOS_TEST_FOR_EXCEPTION(transposeA==true, std::runtime_error, prefix << "Matrix A cannot be transposed.");
291  TEUCHOS_TEST_FOR_EXCEPTION(transposeB==true, std::runtime_error, prefix << "Matrix B cannot be transposed.");
292 
293  // Check size compatibility
294  global_size_t numACols = A->getGlobalNumCols();
295  global_size_t numBCols = B->getGlobalNumCols();
296  global_size_t numARows = A->getGlobalNumRows();
297  global_size_t numBRows = B->getGlobalNumRows();
298 
299  global_size_t Aouter = numARows;
300  global_size_t Bouter = numBCols;
301  global_size_t Ainner = numACols;
302  global_size_t Binner = numBRows;
303  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
304  prefix << "ERROR, inner dimensions of op(A) and op(B) "
305  "must match for matrix-matrix product. op(A) is "
306  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
307 
308  // We're going to need to import remotely-owned sections of A and/or B if
309  // more than one processor is performing this run, depending on the scenario.
310  int numProcs = A->getComm()->getSize();
311 
312  const LO blocksize = A->getBlockSize();
313  TEUCHOS_TEST_FOR_EXCEPTION(blocksize != B->getBlockSize(), std::runtime_error,
314  prefix << "ERROR, Blocksizes do not match. A.blocksize = " <<
315  blocksize << ", B.blocksize = " << B->getBlockSize() );
316 
317  // Declare a couple of structs that will be used to hold views of the data
318  // of A and B, to be used for fast access during the matrix-multiplication.
319  blockcrs_matrix_struct_type Aview(blocksize);
320  blockcrs_matrix_struct_type Bview(blocksize);
321 
322  RCP<const map_type> targetMap_A = A->getRowMap();
323  RCP<const map_type> targetMap_B = B->getRowMap();
324 
325  // Populate the Aview struct. No remotes are needed.
326  RCP<const import_type> dummyImporter;
327  MMdetails::import_and_extract_views(*A, targetMap_A, Aview, dummyImporter, true);
328 
329  // We will also need local access to all rows of B that correspond to the
330  // column-map of op(A).
331  if (numProcs > 1)
332  targetMap_B = A->getColMap();
333 
334  // Import any needed remote rows and populate the Bview struct.
335  MMdetails::import_and_extract_views(*B, targetMap_B, Bview, A->getGraph()->getImporter(),
336  A->getGraph()->getImporter().is_null());
337 
338  // Call the appropriate method to perform the actual multiplication.
339  MMdetails::mult_A_B_newmatrix(Aview, Bview, C);
340 }
341 
342 template <class Scalar,
343  class LocalOrdinal,
344  class GlobalOrdinal,
345  class Node>
346 void Jacobi(Scalar omega,
351  bool call_FillComplete_on_result,
352  const std::string& label,
353  const Teuchos::RCP<Teuchos::ParameterList>& params)
354 {
355  using Teuchos::RCP;
356  typedef Scalar SC;
357  typedef LocalOrdinal LO;
358  typedef GlobalOrdinal GO;
359  typedef Node NO;
360  typedef Import<LO,GO,NO> import_type;
361  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
362  typedef Map<LO,GO,NO> map_type;
363  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
364 
365 #ifdef HAVE_TPETRA_MMM_TIMINGS
366  std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
367  using Teuchos::TimeMonitor;
368  TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
369 #endif
370 
371  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
372 
373  // A and B should already be Filled.
374  // Should we go ahead and call FillComplete() on them if necessary or error
375  // out? For now, we choose to error out.
376  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
377  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
378 
379  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
380  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
381 
382  // Now check size compatibility
383  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
384  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
385  global_size_t Aouter = A.getGlobalNumRows();
386  global_size_t Bouter = numBCols;
387  global_size_t Ainner = numACols;
388  global_size_t Binner = B.getGlobalNumRows();
389  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
390  prefix << "ERROR, inner dimensions of op(A) and op(B) "
391  "must match for matrix-matrix product. op(A) is "
392  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
393 
394  // The result matrix C must at least have a row-map that reflects the correct
395  // row-size. Don't check the number of columns because rectangular matrices
396  // which were constructed with only one map can still end up having the
397  // correct capacity and dimensions when filled.
398  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
399  prefix << "ERROR, dimensions of result C must "
400  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
401  << " rows, should have at least "<< Aouter << std::endl);
402 
403  // It doesn't matter whether C is already Filled or not. If it is already
404  // Filled, it must have space allocated for the positions that will be
405  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
406  // we'll error out later when trying to store result values.
407 
408  // CGB: However, matrix must be in active-fill
409  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
410 
411  // We're going to need to import remotely-owned sections of A and/or B if
412  // more than one processor is performing this run, depending on the scenario.
413  int numProcs = A.getComm()->getSize();
414 
415  // Declare a couple of structs that will be used to hold views of the data of
416  // A and B, to be used for fast access during the matrix-multiplication.
417  crs_matrix_struct_type Aview;
418  crs_matrix_struct_type Bview;
419 
420  RCP<const map_type> targetMap_A = Aprime->getRowMap();
421  RCP<const map_type> targetMap_B = Bprime->getRowMap();
422 
423 #ifdef HAVE_TPETRA_MMM_TIMINGS
424  TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
425  {
426 #endif
427 
428  // Enable globalConstants by default
429  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
430  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
431  if(!params.is_null()) {
432  importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
433  int mm_optimization_core_count=0;
434  auto slist = params->sublist("matrixmatrix: kernel params",false);
435  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ());
436  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
437  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
438  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
439  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
440  auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
441  ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
442  ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
443  ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
444  }
445 
446  //Now import any needed remote rows and populate the Aview struct.
447  RCP<const import_type> dummyImporter;
448  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
449 
450  // We will also need local access to all rows of B that correspond to the
451  // column-map of op(A).
452  if (numProcs > 1)
453  targetMap_B = Aprime->getColMap();
454 
455  // Now import any needed remote rows and populate the Bview struct.
456  // Enable globalConstants by default
457  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
458  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
459  if(!params.is_null()) {
460  importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
461 
462  auto slist = params->sublist("matrixmatrix: kernel params",false);
463  int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount () );
464  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
465  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
466  auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
467  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
468  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
469  ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
470  ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
471  ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
472  }
473 
474  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
475 
476 #ifdef HAVE_TPETRA_MMM_TIMINGS
477  }
478  TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
479 #endif
480 
481  // Now call the appropriate method to perform the actual multiplication.
482  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
483 
484  // Is this a "clean" matrix
485  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
486 
487  if (call_FillComplete_on_result && newFlag) {
488  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
489 
490  } else if (call_FillComplete_on_result) {
491  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
492 
493  } else {
494  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
495  }
496 
497  if(!params.is_null()) {
498  bool removeZeroEntries = params->get("remove zeros", false);
499  if (removeZeroEntries) {
500  typedef Teuchos::ScalarTraits<Scalar> STS;
501  typename STS::magnitudeType threshold = params->get("remove zeros threshold", STS::magnitude(STS::zero()));
502  removeCrsMatrixZeros(C, threshold);
503  }
504  }
505 }
506 
507 
508 template <class Scalar,
509  class LocalOrdinal,
510  class GlobalOrdinal,
511  class Node>
512 void Add(
514  bool transposeA,
515  Scalar scalarA,
517  Scalar scalarB )
518 {
519  using Teuchos::Array;
520  using Teuchos::RCP;
521  using Teuchos::null;
522  typedef Scalar SC;
523  typedef LocalOrdinal LO;
524  typedef GlobalOrdinal GO;
525  typedef Node NO;
526  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
527  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
528 
529  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
530 
531  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
532  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
533  "(Result matrix B is not required to be isFillComplete()).");
534  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
535  prefix << "ERROR, input matrix B must not be fill complete!");
536  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
537  prefix << "ERROR, input matrix B must not have static graph!");
538  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
539  prefix << "ERROR, input matrix B must not be locally indexed!");
540 
541  using Teuchos::ParameterList;
542  RCP<ParameterList> transposeParams (new ParameterList);
543  transposeParams->set ("sort", false);
544 
545  RCP<const crs_matrix_type> Aprime = null;
546  if (transposeA) {
547  transposer_type transposer (rcpFromRef (A));
548  Aprime = transposer.createTranspose (transposeParams);
549  }
550  else {
551  Aprime = rcpFromRef(A);
552  }
553 
554  size_t a_numEntries;
555  typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds",A.getLocalMaxNumRowEntries());
556  typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals",A.getLocalMaxNumRowEntries());
557  GO row;
558 
559  if (scalarB != Teuchos::ScalarTraits<SC>::one())
560  B.scale(scalarB);
561 
562  size_t numMyRows = B.getLocalNumRows();
563  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
564  for (LO i = 0; (size_t)i < numMyRows; ++i) {
565  row = B.getRowMap()->getGlobalElement(i);
566  Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
567 
568  if (scalarA != Teuchos::ScalarTraits<SC>::one()) {
569  for (size_t j = 0; j < a_numEntries; ++j)
570  a_vals[j] *= scalarA;
571  }
572  B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
573  }
574  }
575 }
576 
577 template <class Scalar,
578  class LocalOrdinal,
579  class GlobalOrdinal,
580  class Node>
581 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
582 add (const Scalar& alpha,
583  const bool transposeA,
585  const Scalar& beta,
586  const bool transposeB,
588  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
589  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
590  const Teuchos::RCP<Teuchos::ParameterList>& params)
591 {
592  using Teuchos::RCP;
593  using Teuchos::rcpFromRef;
594  using Teuchos::rcp;
595  using Teuchos::ParameterList;
597  if(!params.is_null())
598  {
599  TEUCHOS_TEST_FOR_EXCEPTION(
600  params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
601  std::invalid_argument,
602  "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
603  "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
604  params->set("Call fillComplete", true);
605  }
606  //If transposeB, must compute B's explicit transpose to
607  //get the correct row map for C.
608  RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
609  if(transposeB)
610  {
612  Brcp = transposer.createTranspose();
613  }
614  //Check that A,B are fillComplete before getting B's column map
615  TEUCHOS_TEST_FOR_EXCEPTION
616  (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
617  "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
618  RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
619  //this version of add() always fill completes the result, no matter what is in params on input
620  add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
621  return C;
622 }
623 
624 //This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
625 //but since the spadd() output is always packed there is no need for a separate
626 //numRowEntries here.
627 //
628 template<class LO, class GO, class LOView, class GOView, class LocalMap>
629 struct ConvertGlobalToLocalFunctor
630 {
631  ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
632  : lids(lids_), gids(gids_), localColMap(localColMap_)
633  {}
634 
635  KOKKOS_FUNCTION void operator() (const GO i) const
636  {
637  lids(i) = localColMap.getLocalElement(gids(i));
638  }
639 
640  LOView lids;
641  const GOView gids;
642  const LocalMap localColMap;
643 };
644 
645 template <class Scalar,
646  class LocalOrdinal,
647  class GlobalOrdinal,
648  class Node>
649 void
650 add (const Scalar& alpha,
651  const bool transposeA,
653  const Scalar& beta,
654  const bool transposeB,
657  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
658  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
659  const Teuchos::RCP<Teuchos::ParameterList>& params)
660 {
661  using Teuchos::RCP;
662  using Teuchos::rcp;
663  using Teuchos::rcpFromRef;
664  using Teuchos::rcp_implicit_cast;
665  using Teuchos::rcp_dynamic_cast;
666  using Teuchos::TimeMonitor;
667  using SC = Scalar;
668  using LO = LocalOrdinal;
669  using GO = GlobalOrdinal;
670  using NO = Node;
671  using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
672  using crs_graph_type = CrsGraph<LO,GO,NO>;
673  using map_type = Map<LO,GO,NO>;
674  using transposer_type = RowMatrixTransposer<SC,LO,GO,NO>;
675  using import_type = Import<LO,GO,NO>;
676  using export_type = Export<LO,GO,NO>;
677  using exec_space = typename crs_graph_type::execution_space;
678  using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
679  const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
680  constexpr bool debug = false;
681 
682 #ifdef HAVE_TPETRA_MMM_TIMINGS
683  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
684 #endif
685 
686  if (debug) {
687  std::ostringstream os;
688  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
689  << "TpetraExt::MatrixMatrix::add" << std::endl;
690  std::cerr << os.str ();
691  }
692 
693  TEUCHOS_TEST_FOR_EXCEPTION
694  (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
695  prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
696  TEUCHOS_TEST_FOR_EXCEPTION
697  (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
698  prefix_mmm << "A and B must both be fill complete.");
699 #ifdef HAVE_TPETRA_DEBUG
700  // The matrices don't have domain or range Maps unless they are fill complete.
701  if (A.isFillComplete () && B.isFillComplete ()) {
702  const bool domainMapsSame =
703  (! transposeA && ! transposeB &&
704  ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
705  (! transposeA && transposeB &&
706  ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
707  ( transposeA && ! transposeB &&
708  ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
709  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
710  prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
711 
712  const bool rangeMapsSame =
713  (! transposeA && ! transposeB &&
714  ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
715  (! transposeA && transposeB &&
716  ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
717  ( transposeA && ! transposeB &&
718  ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
719  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
720  prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
721  }
722 #endif // HAVE_TPETRA_DEBUG
723 
724  using Teuchos::ParameterList;
725  // Form the explicit transpose of A if necessary.
726  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
727  if (transposeA) {
728  transposer_type transposer (Aprime);
729  Aprime = transposer.createTranspose ();
730  }
731 
732 #ifdef HAVE_TPETRA_DEBUG
733  TEUCHOS_TEST_FOR_EXCEPTION
734  (Aprime.is_null (), std::logic_error,
735  prefix_mmm << "Failed to compute Op(A). "
736  "Please report this bug to the Tpetra developers.");
737 #endif // HAVE_TPETRA_DEBUG
738 
739  // Form the explicit transpose of B if necessary.
740  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
741  if (transposeB) {
742  if (debug) {
743  std::ostringstream os;
744  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
745  << "Form explicit xpose of B" << std::endl;
746  std::cerr << os.str ();
747  }
748  transposer_type transposer (Bprime);
749  Bprime = transposer.createTranspose ();
750  }
751 #ifdef HAVE_TPETRA_DEBUG
752  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
753  prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
754  TEUCHOS_TEST_FOR_EXCEPTION(
755  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
756  prefix_mmm << "Aprime and Bprime must both be fill complete. "
757  "Please report this bug to the Tpetra developers.");
758 #endif // HAVE_TPETRA_DEBUG
759  RCP<const map_type> CDomainMap = domainMap;
760  RCP<const map_type> CRangeMap = rangeMap;
761  if(CDomainMap.is_null())
762  {
763  CDomainMap = Bprime->getDomainMap();
764  }
765  if(CRangeMap.is_null())
766  {
767  CRangeMap = Bprime->getRangeMap();
768  }
769  assert(!(CDomainMap.is_null()));
770  assert(!(CRangeMap.is_null()));
771  typedef typename AddKern::values_array values_array;
772  typedef typename AddKern::row_ptrs_array row_ptrs_array;
773  typedef typename AddKern::col_inds_array col_inds_array;
774  bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
775  bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
776  values_array vals;
777  row_ptrs_array rowptrs;
778  col_inds_array colinds;
779 #ifdef HAVE_TPETRA_MMM_TIMINGS
780  MM = Teuchos::null;
781  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
782 #endif
783  if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
784  {
785  //import Aprime into Bprime's row map so the local matrices have same # of rows
786  auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
787  // cbl do not set
788  // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
789  // this import _may_ take the form of a transfer. In practice it would be unlikely,
790  // but the general case is not so forgiving.
791  Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
792  }
793  bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
794  bool sorted = AGraphSorted && BGraphSorted;
795  RCP<const import_type> Cimport = Teuchos::null;
796  RCP<export_type> Cexport = Teuchos::null;
797  bool doFillComplete = true;
798  if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
799  {
800  doFillComplete = params->get<bool>("Call fillComplete");
801  }
802  auto Alocal = Aprime->getLocalMatrixDevice();
803  auto Blocal = Bprime->getLocalMatrixDevice();
804  LO numLocalRows = Alocal.numRows();
805  if(numLocalRows == 0)
806  {
807  //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
808  //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
809  //Handle this case now
810  //(without interfering with collective operations, since it's possible for
811  //some ranks to have 0 local rows and others not).
812  rowptrs = row_ptrs_array("C rowptrs", 0);
813  }
814  auto Acolmap = Aprime->getColMap();
815  auto Bcolmap = Bprime->getColMap();
816  if(!matchingColMaps)
817  {
818  using global_col_inds_array = typename AddKern::global_col_inds_array;
819 #ifdef HAVE_TPETRA_MMM_TIMINGS
820  MM = Teuchos::null;
821  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
822 #endif
823  //use kernel that converts col indices in both A and B to common domain map before adding
824  auto AlocalColmap = Acolmap->getLocalMap();
825  auto BlocalColmap = Bcolmap->getLocalMap();
826  global_col_inds_array globalColinds;
827  if (debug) {
828  std::ostringstream os;
829  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
830  << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
831  std::cerr << os.str ();
832  }
833  AddKern::convertToGlobalAndAdd(
834  Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
835  vals, rowptrs, globalColinds);
836  if (debug) {
837  std::ostringstream os;
838  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
839  << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
840  std::cerr << os.str ();
841  }
842 #ifdef HAVE_TPETRA_MMM_TIMINGS
843  MM = Teuchos::null;
844  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Constructing graph"))));
845 #endif
846  RCP<const map_type> CcolMap;
847  Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
848  (CcolMap, CDomainMap, globalColinds);
849  C.replaceColMap(CcolMap);
850  col_inds_array localColinds("C colinds", globalColinds.extent(0));
851  Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
852  ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
853  col_inds_array, global_col_inds_array,
854  typename map_type::local_map_type>
855  (localColinds, globalColinds, CcolMap->getLocalMap()));
856  KokkosSparse::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
857  C.setAllValues(rowptrs, localColinds, vals);
858  C.fillComplete(CDomainMap, CRangeMap, params);
859  if(!doFillComplete)
860  C.resumeFill();
861  }
862  else
863  {
864  //Aprime, Bprime and C all have the same column maps
865  auto Avals = Alocal.values;
866  auto Bvals = Blocal.values;
867  auto Arowptrs = Alocal.graph.row_map;
868  auto Browptrs = Blocal.graph.row_map;
869  auto Acolinds = Alocal.graph.entries;
870  auto Bcolinds = Blocal.graph.entries;
871  if(sorted)
872  {
873  //use sorted kernel
874 #ifdef HAVE_TPETRA_MMM_TIMINGS
875  MM = Teuchos::null;
876  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
877 #endif
878  if (debug) {
879  std::ostringstream os;
880  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
881  << "Call AddKern::addSorted(...)" << std::endl;
882  std::cerr << os.str ();
883  }
884 #if KOKKOSKERNELS_VERSION >= 40299
885  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
886 #else
887  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
888 #endif
889  }
890  else
891  {
892  //use unsorted kernel
893 #ifdef HAVE_TPETRA_MMM_TIMINGS
894  MM = Teuchos::null;
895  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
896 #endif
897  if (debug) {
898  std::ostringstream os;
899  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
900  << "Call AddKern::addUnsorted(...)" << std::endl;
901  std::cerr << os.str ();
902  }
903  AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
904  }
905  //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
906  RCP<const map_type> Ccolmap = Bcolmap;
907  C.replaceColMap(Ccolmap);
908  C.setAllValues(rowptrs, colinds, vals);
909  if(doFillComplete)
910  {
911  #ifdef HAVE_TPETRA_MMM_TIMINGS
912  MM = Teuchos::null;
913  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
914  #endif
915  if(!CDomainMap->isSameAs(*Ccolmap))
916  {
917  if (debug) {
918  std::ostringstream os;
919  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
920  << "Create Cimport" << std::endl;
921  std::cerr << os.str ();
922  }
923  Cimport = rcp(new import_type(CDomainMap, Ccolmap));
924  }
925  if(!C.getRowMap()->isSameAs(*CRangeMap))
926  {
927  if (debug) {
928  std::ostringstream os;
929  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
930  << "Create Cexport" << std::endl;
931  std::cerr << os.str ();
932  }
933  Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
934  }
935 
936  if (debug) {
937  std::ostringstream os;
938  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
939  << "Call C->expertStaticFillComplete(...)" << std::endl;
940  std::cerr << os.str ();
941  }
942  C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
943  }
944  }
945 }
946 
947 // This version of Add takes C as RCP&, so C may be null on input (in this case,
948 // it is allocated and constructed in this function).
949 template <class Scalar,
950  class LocalOrdinal,
951  class GlobalOrdinal,
952  class Node>
953 void Add(
955  bool transposeA,
956  Scalar scalarA,
958  bool transposeB,
959  Scalar scalarB,
961 {
962  using Teuchos::Array;
963  using Teuchos::ArrayRCP;
964  using Teuchos::ArrayView;
965  using Teuchos::RCP;
966  using Teuchos::rcp;
967  using Teuchos::rcp_dynamic_cast;
968  using Teuchos::rcpFromRef;
969  using Teuchos::tuple;
970  using std::endl;
971  // typedef typename ArrayView<const Scalar>::size_type size_type;
972  typedef Teuchos::ScalarTraits<Scalar> STS;
974  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
975  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
976  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
979 
980  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
981 
982  TEUCHOS_TEST_FOR_EXCEPTION(
983  ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
984  prefix << "A and B must both be fill complete before calling this function.");
985 
986  if(C.is_null()) {
987  TEUCHOS_TEST_FOR_EXCEPTION(!A.haveGlobalConstants(), std::logic_error,
988  prefix << "C is null (must be allocated), but A.haveGlobalConstants() is false. "
989  "Please report this bug to the Tpetra developers.");
990  TEUCHOS_TEST_FOR_EXCEPTION(!B.haveGlobalConstants(), std::logic_error,
991  prefix << "C is null (must be allocated), but B.haveGlobalConstants() is false. "
992  "Please report this bug to the Tpetra developers.");
993  }
994 
995 #ifdef HAVE_TPETRA_DEBUG
996  {
997  const bool domainMapsSame =
998  (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
999  (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
1000  (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
1001  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
1002  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
1003 
1004  const bool rangeMapsSame =
1005  (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
1006  (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
1007  (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
1008  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
1009  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
1010  }
1011 #endif // HAVE_TPETRA_DEBUG
1012 
1013  using Teuchos::ParameterList;
1014  RCP<ParameterList> transposeParams (new ParameterList);
1015  transposeParams->set ("sort", false);
1016 
1017  // Form the explicit transpose of A if necessary.
1018  RCP<const crs_matrix_type> Aprime;
1019  if (transposeA) {
1020  transposer_type theTransposer (rcpFromRef (A));
1021  Aprime = theTransposer.createTranspose (transposeParams);
1022  }
1023  else {
1024  Aprime = rcpFromRef (A);
1025  }
1026 
1027 #ifdef HAVE_TPETRA_DEBUG
1028  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1029  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
1030 #endif // HAVE_TPETRA_DEBUG
1031 
1032  // Form the explicit transpose of B if necessary.
1033  RCP<const crs_matrix_type> Bprime;
1034  if (transposeB) {
1035  transposer_type theTransposer (rcpFromRef (B));
1036  Bprime = theTransposer.createTranspose (transposeParams);
1037  }
1038  else {
1039  Bprime = rcpFromRef (B);
1040  }
1041 
1042 #ifdef HAVE_TPETRA_DEBUG
1043  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1044  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
1045 #endif // HAVE_TPETRA_DEBUG
1046 
1047  bool CwasFillComplete = false;
1048 
1049  // Allocate or zero the entries of the result matrix.
1050  if (! C.is_null ()) {
1051  CwasFillComplete = C->isFillComplete();
1052  if(CwasFillComplete)
1053  C->resumeFill();
1054  C->setAllToScalar (STS::zero ());
1055  } else {
1056  // FIXME (mfh 08 May 2013) When I first looked at this method, I
1057  // noticed that C was being given the row Map of Aprime (the
1058  // possibly transposed version of A). Is this what we want?
1059 
1060  // It is a precondition that Aprime and Bprime have the same domain and range maps.
1061  // However, they may have different row maps. In this case, it's difficult to
1062  // get a precise upper bound on the number of entries in each local row of C, so
1063  // just use the looser upper bound based on the max number of entries in any row of Aprime and Bprime.
1064  if(Aprime->getRowMap()->isSameAs(*Bprime->getRowMap())) {
1065  LocalOrdinal numLocalRows = Aprime->getLocalNumRows();
1066  Array<size_t> CmaxEntriesPerRow(numLocalRows);
1067  for(LocalOrdinal i = 0; i < numLocalRows; i++) {
1068  CmaxEntriesPerRow[i] = Aprime->getNumEntriesInLocalRow(i) + Bprime->getNumEntriesInLocalRow(i);
1069  }
1070  C = rcp (new crs_matrix_type (Aprime->getRowMap (), CmaxEntriesPerRow()));
1071  }
1072  else {
1073  // Note: above we checked that Aprime and Bprime have global constants, so it's safe to ask for max entries per row.
1074  C = rcp (new crs_matrix_type (Aprime->getRowMap (), Aprime->getGlobalMaxNumRowEntries() + Bprime->getGlobalMaxNumRowEntries()));
1075  }
1076  }
1077 
1078 #ifdef HAVE_TPETRA_DEBUG
1079  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1080  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1081  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1082  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1083  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1084  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1085 #endif // HAVE_TPETRA_DEBUG
1086 
1087  Array<RCP<const crs_matrix_type> > Mat =
1088  tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1089  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1090 
1091  // do a loop over each matrix to add: A reordering might be more efficient
1092  for (int k = 0; k < 2; ++k) {
1093  typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1094  typename crs_matrix_type::nonconst_values_host_view_type Values;
1095 
1096  // Loop over each locally owned row of the current matrix (either
1097  // Aprime or Bprime), and sum its entries into the corresponding
1098  // row of C. This works regardless of whether Aprime or Bprime
1099  // has the same row Map as C, because both sumIntoGlobalValues and
1100  // insertGlobalValues allow summing resp. inserting into nonowned
1101  // rows of C.
1102 #ifdef HAVE_TPETRA_DEBUG
1103  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1104  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1105 #endif // HAVE_TPETRA_DEBUG
1106  RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1107 #ifdef HAVE_TPETRA_DEBUG
1108  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1109  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1110 #endif // HAVE_TPETRA_DEBUG
1111 
1112  const size_t localNumRows = Mat[k]->getLocalNumRows ();
1113  for (size_t i = 0; i < localNumRows; ++i) {
1114  const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1115  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1116  if (numEntries > 0) {
1117  if(numEntries > Indices.extent(0)) {
1118  Kokkos::resize(Indices, numEntries);
1119  Kokkos::resize(Values, numEntries);
1120  }
1121  Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1122 
1123  if (scalar[k] != STS::one ()) {
1124  for (size_t j = 0; j < numEntries; ++j) {
1125  Values[j] *= scalar[k];
1126  }
1127  }
1128 
1129  if (CwasFillComplete) {
1130  size_t result = C->sumIntoGlobalValues (globalRow, numEntries,
1131  reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1132  TEUCHOS_TEST_FOR_EXCEPTION(result != numEntries, std::logic_error,
1133  prefix << "sumIntoGlobalValues failed to add entries from A or B into C.");
1134  } else {
1135  C->insertGlobalValues (globalRow, numEntries,
1136  reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1137  }
1138  }
1139  }
1140  }
1141  if(CwasFillComplete) {
1142  C->fillComplete(C->getDomainMap (),
1143  C->getRangeMap ());
1144  }
1145 }
1146 
1147 // This version of Add takes C as const RCP&, so C must not be null on input. Otherwise, its behavior is identical
1148 // to the above version where C is RCP&.
1149 template <class Scalar,
1150  class LocalOrdinal,
1151  class GlobalOrdinal,
1152  class Node>
1153 void Add(
1155  bool transposeA,
1156  Scalar scalarA,
1158  bool transposeB,
1159  Scalar scalarB,
1161 {
1162  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
1163 
1164  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::invalid_argument,
1165  prefix << "C must not be null");
1166 
1167  Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > C_ = C;
1168  Add(A, transposeA, scalarA, B, transposeB, scalarB, C_);
1169 }
1170 
1171 } //End namespace MatrixMatrix
1172 
1173 namespace MMdetails{
1174 
1175 /*********************************************************************************************************/
1177 //template <class TransferType>
1178 //void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1179 // if (Transfer.is_null())
1180 // return;
1181 //
1182 // const Distributor & Distor = Transfer->getDistributor();
1183 // Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1184 //
1185 // size_t rows_send = Transfer->getNumExportIDs();
1186 // size_t rows_recv = Transfer->getNumRemoteIDs();
1187 //
1188 // size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1189 // size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1190 // size_t num_send_neighbors = Distor.getNumSends();
1191 // size_t num_recv_neighbors = Distor.getNumReceives();
1192 // size_t round2_send, round2_recv;
1193 // Distor.getLastDoStatistics(round2_send,round2_recv);
1194 //
1195 // int myPID = Comm->getRank();
1196 // int NumProcs = Comm->getSize();
1197 //
1198 // // Processor by processor statistics
1199 // // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1200 // // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1201 //
1202 // // Global statistics
1203 // size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1204 // size_t gstats_min[8], gstats_max[8];
1205 //
1206 // double lstats_avg[8], gstats_avg[8];
1207 // for(int i=0; i<8; i++)
1208 // lstats_avg[i] = ((double)lstats[i])/NumProcs;
1209 //
1210 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1211 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1212 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1213 //
1214 // if(!myPID) {
1215 // printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1216 // (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1217 // (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1218 // printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1219 // (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1220 // (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1221 // }
1222 //}
1223 
1224 // Kernel method for computing the local portion of C = A*B
1225 template<class Scalar,
1226  class LocalOrdinal,
1227  class GlobalOrdinal,
1228  class Node>
1229 void mult_AT_B_newmatrix(
1230  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1231  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1232  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1233  const std::string & label,
1234  const Teuchos::RCP<Teuchos::ParameterList>& params)
1235 {
1236  using Teuchos::RCP;
1237  using Teuchos::rcp;
1238  typedef Scalar SC;
1239  typedef LocalOrdinal LO;
1240  typedef GlobalOrdinal GO;
1241  typedef Node NO;
1242  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1243  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1244 
1245 #ifdef HAVE_TPETRA_MMM_TIMINGS
1246  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1247  using Teuchos::TimeMonitor;
1248  RCP<TimeMonitor> MM = rcp (new TimeMonitor
1249  (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1250 #endif
1251 
1252  /*************************************************************/
1253  /* 1) Local Transpose of A */
1254  /*************************************************************/
1255  transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1256 
1257  using Teuchos::ParameterList;
1258  RCP<ParameterList> transposeParams (new ParameterList);
1259  transposeParams->set ("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
1260  if(! params.is_null ()) {
1261  transposeParams->set ("compute global constants",
1262  params->get ("compute global constants: temporaries",
1263  false));
1264  }
1265  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1266  transposer.createTransposeLocal (transposeParams);
1267 
1268  /*************************************************************/
1269  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1270  /*************************************************************/
1271 #ifdef HAVE_TPETRA_MMM_TIMINGS
1272  MM = Teuchos::null;
1273  MM = rcp (new TimeMonitor
1274  (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1275 #endif
1276 
1277  // Get views, asserting that no import is required to speed up computation
1278  crs_matrix_struct_type Aview;
1279  crs_matrix_struct_type Bview;
1280  RCP<const Import<LO, GO, NO> > dummyImporter;
1281 
1282  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1283  RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1284  if (! params.is_null ()) {
1285  importParams1->set ("compute global constants",
1286  params->get ("compute global constants: temporaries",
1287  false));
1288  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1289  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1290  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1291  int mm_optimization_core_count =
1293  mm_optimization_core_count =
1294  slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1295  int mm_optimization_core_count2 =
1296  params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1297  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1298  mm_optimization_core_count = mm_optimization_core_count2;
1299  }
1300  auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1301  sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1302  sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1303  sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1304  }
1305 
1306  MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1307  Aview, dummyImporter, true,
1308  label, importParams1);
1309 
1310  RCP<ParameterList> importParams2 (new ParameterList);
1311  if (! params.is_null ()) {
1312  importParams2->set ("compute global constants",
1313  params->get ("compute global constants: temporaries",
1314  false));
1315  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1316  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1317  bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1318  int mm_optimization_core_count =
1320  mm_optimization_core_count =
1321  slist.get ("MM_TAFC_OptimizationCoreCount",
1322  mm_optimization_core_count);
1323  int mm_optimization_core_count2 =
1324  params->get ("MM_TAFC_OptimizationCoreCount",
1325  mm_optimization_core_count);
1326  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1327  mm_optimization_core_count = mm_optimization_core_count2;
1328  }
1329  auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1330  sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1331  sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1332  sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1333  }
1334 
1335  if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1336  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1337  }
1338  else {
1339  MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1340  }
1341 
1342 #ifdef HAVE_TPETRA_MMM_TIMINGS
1343  MM = Teuchos::null;
1344  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1345 #endif
1346 
1347  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1348 
1349  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1350  bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1351  if (needs_final_export) {
1352  Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1353  }
1354  else {
1355  Ctemp = rcp (&C, false);
1356  }
1357 
1358  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1359 
1360  /*************************************************************/
1361  /* 4) exportAndFillComplete matrix */
1362  /*************************************************************/
1363 #ifdef HAVE_TPETRA_MMM_TIMINGS
1364  MM = Teuchos::null;
1365  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1366 #endif
1367 
1368  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1369 
1370  if (needs_final_export) {
1371  ParameterList labelList;
1372  labelList.set("Timer Label", label);
1373  if(!params.is_null()) {
1374  ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1375  ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1376  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1377  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1378  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1379  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1380  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1381  bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1382  bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1383 
1384  labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1385  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1386  labelList.set("compute global constants",params->get("compute global constants",true));
1387  labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1388  }
1389  Ctemp->exportAndFillComplete (Crcp,
1390  *Ctemp->getGraph ()->getExporter (),
1391  B.getDomainMap (),
1392  A.getDomainMap (),
1393  rcp (&labelList, false));
1394  }
1395 #ifdef HAVE_TPETRA_MMM_STATISTICS
1396  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1397 #endif
1398 }
1399 
1400 /*********************************************************************************************************/
1401 // Kernel method for computing the local portion of C = A*B
1402 template<class Scalar,
1403  class LocalOrdinal,
1404  class GlobalOrdinal,
1405  class Node>
1406 void mult_A_B(
1407  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1408  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1409  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1410  const std::string& /* label */,
1411  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1412 {
1413  using Teuchos::Array;
1414  using Teuchos::ArrayRCP;
1415  using Teuchos::ArrayView;
1416  using Teuchos::OrdinalTraits;
1417  using Teuchos::null;
1418 
1419  typedef Teuchos::ScalarTraits<Scalar> STS;
1420  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1421  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1422  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1423 
1424  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1425  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1426 
1427  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getLocalElementList();
1428  ArrayView<const GlobalOrdinal> bcols_import = null;
1429  if (Bview.importColMap != null) {
1430  C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1431  C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1432 
1433  bcols_import = Bview.importColMap->getLocalElementList();
1434  }
1435 
1436  size_t C_numCols = C_lastCol - C_firstCol +
1437  OrdinalTraits<LocalOrdinal>::one();
1438  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1439  OrdinalTraits<LocalOrdinal>::one();
1440 
1441  if (C_numCols_import > C_numCols)
1442  C_numCols = C_numCols_import;
1443 
1444  Array<Scalar> dwork = Array<Scalar>(C_numCols);
1445  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1446  Array<size_t> iwork2 = Array<size_t>(C_numCols);
1447 
1448  Array<Scalar> C_row_i = dwork;
1449  Array<GlobalOrdinal> C_cols = iwork;
1450  Array<size_t> c_index = iwork2;
1451  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1452  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1453 
1454  size_t C_row_i_length, j, k, last_index;
1455 
1456  // Run through all the hash table lookups once and for all
1457  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1458  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1459  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1460  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1461  // Maps are the same: Use local IDs as the hash
1462  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1463  Aview.colMap->getMaxLocalIndex(); i++)
1464  Acol2Brow[i]=i;
1465  }
1466  else {
1467  // Maps are not the same: Use the map's hash
1468  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1469  Aview.colMap->getMaxLocalIndex(); i++) {
1470  GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1471  LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1472  if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1473  else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1474  }
1475  }
1476 
1477  // To form C = A*B we're going to execute this expression:
1478  //
1479  // C(i,j) = sum_k( A(i,k)*B(k,j) )
1480  //
1481  // Our goal, of course, is to navigate the data in A and B once, without
1482  // performing searches for column-indices, etc.
1483  auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1484  auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1485  auto Avals = Aview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1486  auto Browptr = Bview.origMatrix->getLocalRowPtrsHost();
1487  auto Bcolind = Bview.origMatrix->getLocalIndicesHost();
1488  auto Bvals = Bview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1489  decltype(Browptr) Irowptr;
1490  decltype(Bcolind) Icolind;
1491  decltype(Bvals) Ivals;
1492  if(!Bview.importMatrix.is_null()) {
1493  Irowptr = Bview.importMatrix->getLocalRowPtrsHost();
1494  Icolind = Bview.importMatrix->getLocalIndicesHost();
1495  Ivals = Bview.importMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1496  }
1497 
1498  bool C_filled = C.isFillComplete();
1499 
1500  for (size_t i = 0; i < C_numCols; i++)
1501  c_index[i] = OrdinalTraits<size_t>::invalid();
1502 
1503  // Loop over the rows of A.
1504  size_t Arows = Aview.rowMap->getLocalNumElements();
1505  for(size_t i=0; i<Arows; ++i) {
1506 
1507  // Only navigate the local portion of Aview... which is, thankfully, all of
1508  // A since this routine doesn't do transpose modes
1509  GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1510 
1511  // Loop across the i-th row of A and for each corresponding row in B, loop
1512  // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1513  // quantities C_row_i. In other words, as we stride across B(k,:) we're
1514  // calculating updates for row i of the result matrix C.
1515  C_row_i_length = OrdinalTraits<size_t>::zero();
1516 
1517  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1518  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1519  const Scalar Aval = Avals[k];
1520  if (Aval == STS::zero())
1521  continue;
1522 
1523  if (Ak == LO_INVALID)
1524  continue;
1525 
1526  for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1527  LocalOrdinal col = Bcolind[j];
1528  //assert(col >= 0 && col < C_numCols);
1529 
1530  if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1531  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1532  // This has to be a += so insertGlobalValue goes out
1533  C_row_i[C_row_i_length] = Aval*Bvals[j];
1534  C_cols[C_row_i_length] = col;
1535  c_index[col] = C_row_i_length;
1536  C_row_i_length++;
1537 
1538  } else {
1539  // static cast from impl_scalar_type to Scalar needed for complex
1540  C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Bvals[j]);
1541  }
1542  }
1543  }
1544 
1545  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1546  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1547  C_cols[ii] = bcols[C_cols[ii]];
1548  combined_index[ii] = C_cols[ii];
1549  combined_values[ii] = C_row_i[ii];
1550  }
1551  last_index = C_row_i_length;
1552 
1553  //
1554  //Now put the C_row_i values into C.
1555  //
1556  // We might have to revamp this later.
1557  C_row_i_length = OrdinalTraits<size_t>::zero();
1558 
1559  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1560  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1561  const Scalar Aval = Avals[k];
1562  if (Aval == STS::zero())
1563  continue;
1564 
1565  if (Ak!=LO_INVALID) continue;
1566 
1567  Ak = Acol2Irow[Acolind[k]];
1568  for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1569  LocalOrdinal col = Icolind[j];
1570  //assert(col >= 0 && col < C_numCols);
1571 
1572  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1573  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1574  // This has to be a += so insertGlobalValue goes out
1575  C_row_i[C_row_i_length] = Aval*Ivals[j];
1576  C_cols[C_row_i_length] = col;
1577  c_index[col] = C_row_i_length;
1578  C_row_i_length++;
1579 
1580  } else {
1581  // This has to be a += so insertGlobalValue goes out
1582  // static cast from impl_scalar_type to Scalar needed for complex
1583  C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Ivals[j]);
1584  }
1585  }
1586  }
1587 
1588  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1589  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1590  C_cols[ii] = bcols_import[C_cols[ii]];
1591  combined_index[last_index] = C_cols[ii];
1592  combined_values[last_index] = C_row_i[ii];
1593  last_index++;
1594  }
1595 
1596  // Now put the C_row_i values into C.
1597  // We might have to revamp this later.
1598  C_filled ?
1599  C.sumIntoGlobalValues(
1600  global_row,
1601  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1602  combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1603  :
1604  C.insertGlobalValues(
1605  global_row,
1606  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1607  combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1608 
1609  }
1610 }
1611 
1612 /*********************************************************************************************************/
1613 template<class Scalar,
1614  class LocalOrdinal,
1615  class GlobalOrdinal,
1616  class Node>
1617 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1618  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1619  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1620 
1621  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1622  Mview.maxNumRowEntries = Mview.indices[0].size();
1623 
1624  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1625  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1626  Mview.maxNumRowEntries = Mview.indices[i].size();
1627  }
1628 }
1629 
1630 /*********************************************************************************************************/
1631 template<class CrsMatrixType>
1632 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1633  // Follows the NZ estimate in ML's ml_matmatmult.c
1634  size_t Aest = 100, Best=100;
1635  if (A.getLocalNumEntries() >= A.getLocalNumRows())
1636  Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
1637  if (B.getLocalNumEntries() >= B.getLocalNumRows())
1638  Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
1639 
1640  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1641  nnzperrow *= nnzperrow;
1642 
1643  return (size_t)(A.getLocalNumRows()*nnzperrow*0.75 + 100);
1644 }
1645 
1646 
1647 /*********************************************************************************************************/
1648 // Kernel method for computing the local portion of C = A*B for CrsMatrix
1649 //
1650 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1651 // function, so this is probably the function we want to
1652 // thread-parallelize.
1653 template<class Scalar,
1654  class LocalOrdinal,
1655  class GlobalOrdinal,
1656  class Node>
1657 void mult_A_B_newmatrix(
1658  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1659  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1660  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1661  const std::string& label,
1662  const Teuchos::RCP<Teuchos::ParameterList>& params)
1663 {
1664  using Teuchos::Array;
1665  using Teuchos::ArrayRCP;
1666  using Teuchos::ArrayView;
1667  using Teuchos::RCP;
1668  using Teuchos::rcp;
1669 
1670  // Tpetra typedefs
1671  typedef LocalOrdinal LO;
1672  typedef GlobalOrdinal GO;
1673  typedef Node NO;
1674  typedef Import<LO,GO,NO> import_type;
1675  typedef Map<LO,GO,NO> map_type;
1676 
1677  // Kokkos typedefs
1678  typedef typename map_type::local_map_type local_map_type;
1680  typedef typename KCRS::StaticCrsGraphType graph_t;
1681  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1682  typedef typename NO::execution_space execution_space;
1683  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1684  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1685 
1686 #ifdef HAVE_TPETRA_MMM_TIMINGS
1687  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1688  using Teuchos::TimeMonitor;
1689  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1690 #endif
1691  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1692 
1693  // Build the final importer / column map, hash table lookups for C
1694  RCP<const import_type> Cimport;
1695  RCP<const map_type> Ccolmap;
1696  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1697  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1698  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1699  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1700  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1701  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1702  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1703  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1704 
1705  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1706  // indices of B, to local column indices of C. (B and C have the
1707  // same number of columns.) The kernel uses this, instead of
1708  // copying the entire input matrix B and converting its column
1709  // indices to those of C.
1710  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1711 
1712  if (Bview.importMatrix.is_null()) {
1713  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1714  Cimport = Bimport;
1715  Ccolmap = Bview.colMap;
1716  const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1717  // Bcol2Ccol is trivial
1718  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1719  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1720  KOKKOS_LAMBDA(const LO i) {
1721  Bcol2Ccol(i) = i;
1722  });
1723  }
1724  else {
1725  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1726  // column Map of C, as well as C's Import object (from its domain
1727  // Map to its column Map). C's column Map is the union of the
1728  // column Maps of (the local part of) B, and the "remote" part of
1729  // B. Ditto for the Import. We have optimized this "setUnion"
1730  // operation on Import objects and Maps.
1731 
1732  // Choose the right variant of setUnion
1733  if (!Bimport.is_null() && !Iimport.is_null()) {
1734  Cimport = Bimport->setUnion(*Iimport);
1735  }
1736  else if (!Bimport.is_null() && Iimport.is_null()) {
1737  Cimport = Bimport->setUnion();
1738  }
1739  else if (Bimport.is_null() && !Iimport.is_null()) {
1740  Cimport = Iimport->setUnion();
1741  }
1742  else {
1743  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1744  }
1745  Ccolmap = Cimport->getTargetMap();
1746 
1747  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1748  // in general. We should get rid of it in order to reduce
1749  // communication costs of sparse matrix-matrix multiply.
1750  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1751  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1752 
1753  // NOTE: This is not efficient and should be folded into setUnion
1754  //
1755  // mfh 27 Sep 2016: What the above comment means, is that the
1756  // setUnion operation on Import objects could also compute these
1757  // local index - to - local index look-up tables.
1758  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1759  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1760  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1761  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1762  });
1763  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1764  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1765  });
1766 
1767  }
1768 
1769  // Replace the column map
1770  //
1771  // mfh 27 Sep 2016: We do this because C was originally created
1772  // without a column Map. Now we have its column Map.
1773  C.replaceColMap(Ccolmap);
1774 
1775  // mfh 27 Sep 2016: Construct tables that map from local column
1776  // indices of A, to local row indices of either B_local (the locally
1777  // owned part of B), or B_remote (the "imported" remote part of B).
1778  //
1779  // For column index Aik in row i of A, if the corresponding row of B
1780  // exists in the local part of B ("orig") (which I'll call B_local),
1781  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1782  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1783  //
1784  // For column index Aik in row i of A, if the corresponding row of B
1785  // exists in the remote part of B ("Import") (which I'll call
1786  // B_remote), then targetMapToImportRow[Aik] is the local index of
1787  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1788  // (a flag value).
1789 
1790  // Run through all the hash table lookups once and for all
1791  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
1792  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
1793 
1794  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1795  GO aidx = Acolmap_local.getGlobalElement(i);
1796  LO B_LID = Browmap_local.getLocalElement(aidx);
1797  if (B_LID != LO_INVALID) {
1798  targetMapToOrigRow(i) = B_LID;
1799  targetMapToImportRow(i) = LO_INVALID;
1800  } else {
1801  LO I_LID = Irowmap_local.getLocalElement(aidx);
1802  targetMapToOrigRow(i) = LO_INVALID;
1803  targetMapToImportRow(i) = I_LID;
1804 
1805  }
1806  });
1807 
1808  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1809  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1810  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1811 
1812 }
1813 
1814 /*********************************************************************************************************/
1815 // Kernel method for computing the local portion of C = A*B for BlockCrsMatrix
1816 template<class Scalar,
1817  class LocalOrdinal,
1818  class GlobalOrdinal,
1819  class Node>
1820 void mult_A_B_newmatrix(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1821  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1822  Teuchos::RCP<BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > &C)
1823 {
1824  using Teuchos::null;
1825  using Teuchos::Array;
1826  using Teuchos::ArrayRCP;
1827  using Teuchos::ArrayView;
1828  using Teuchos::RCP;
1829  using Teuchos::rcp;
1830 
1831  // Tpetra typedefs
1832  typedef LocalOrdinal LO;
1833  typedef GlobalOrdinal GO;
1834  typedef Node NO;
1835  typedef Import<LO,GO,NO> import_type;
1836  typedef Map<LO,GO,NO> map_type;
1837  typedef BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> block_crs_matrix_type;
1838  typedef typename block_crs_matrix_type::crs_graph_type graph_t;
1839 
1840  // Kokkos typedefs
1841  typedef typename map_type::local_map_type local_map_type;
1842  typedef typename block_crs_matrix_type::local_matrix_device_type KBSR;
1843  typedef typename KBSR::device_type device_t;
1844  typedef typename KBSR::StaticCrsGraphType static_graph_t;
1845  typedef typename static_graph_t::row_map_type::non_const_type lno_view_t;
1846  typedef typename static_graph_t::entries_type::non_const_type lno_nnz_view_t;
1847  typedef typename KBSR::values_type::non_const_type scalar_view_t;
1848  typedef typename NO::execution_space execution_space;
1849  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1850  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1851 
1852  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1853 
1854  // Build the final importer / column map, hash table lookups for C
1855  RCP<const import_type> Cimport;
1856  RCP<const map_type> Ccolmap;
1857  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1858  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1859  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1860  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1861  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1862  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1863  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1864  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1865 
1866  // Bcol2Ccol is a table that maps from local column
1867  // indices of B, to local column indices of C. (B and C have the
1868  // same number of columns.) The kernel uses this, instead of
1869  // copying the entire input matrix B and converting its column
1870  // indices to those of C.
1871  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1872 
1873  if (Bview.importMatrix.is_null()) {
1874  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1875  Cimport = Bimport;
1876  Ccolmap = Bview.colMap;
1877  const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1878  // Bcol2Ccol is trivial
1879  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1880  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1881  KOKKOS_LAMBDA(const LO i) {
1882  Bcol2Ccol(i) = i;
1883  });
1884  }
1885  else {
1886  // B has "remotes," so we need to build the
1887  // column Map of C, as well as C's Import object (from its domain
1888  // Map to its column Map). C's column Map is the union of the
1889  // column Maps of (the local part of) B, and the "remote" part of
1890  // B. Ditto for the Import. We have optimized this "setUnion"
1891  // operation on Import objects and Maps.
1892 
1893  // Choose the right variant of setUnion
1894  if (!Bimport.is_null() && !Iimport.is_null()) {
1895  Cimport = Bimport->setUnion(*Iimport);
1896  }
1897  else if (!Bimport.is_null() && Iimport.is_null()) {
1898  Cimport = Bimport->setUnion();
1899  }
1900  else if (Bimport.is_null() && !Iimport.is_null()) {
1901  Cimport = Iimport->setUnion();
1902  }
1903  else {
1904  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1905  }
1906  Ccolmap = Cimport->getTargetMap();
1907 
1908  // NOTE: This is not efficient and should be folded into setUnion
1909  //
1910  // What the above comment means, is that the
1911  // setUnion operation on Import objects could also compute these
1912  // local index - to - local index look-up tables.
1913  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1914  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1915  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",
1916  range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),
1917  KOKKOS_LAMBDA(const LO i) {
1918  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1919  });
1920  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",
1921  range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),
1922  KOKKOS_LAMBDA(const LO i) {
1923  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1924  });
1925  }
1926 
1927  // Construct tables that map from local column
1928  // indices of A, to local row indices of either B_local (the locally
1929  // owned part of B), or B_remote (the "imported" remote part of B).
1930  //
1931  // For column index Aik in row i of A, if the corresponding row of B
1932  // exists in the local part of B ("orig") (which I'll call B_local),
1933  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1934  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1935  //
1936  // For column index Aik in row i of A, if the corresponding row of B
1937  // exists in the remote part of B ("Import") (which I'll call
1938  // B_remote), then targetMapToImportRow[Aik] is the local index of
1939  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1940  // (a flag value).
1941 
1942  // Run through all the hash table lookups once and for all
1943  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),
1944  Aview.colMap->getLocalNumElements());
1945  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),
1946  Aview.colMap->getLocalNumElements());
1947 
1948  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",
1949  range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),
1950  KOKKOS_LAMBDA(const LO i) {
1951  GO aidx = Acolmap_local.getGlobalElement(i);
1952  LO B_LID = Browmap_local.getLocalElement(aidx);
1953  if (B_LID != LO_INVALID) {
1954  targetMapToOrigRow(i) = B_LID;
1955  targetMapToImportRow(i) = LO_INVALID;
1956  } else {
1957  LO I_LID = Irowmap_local.getLocalElement(aidx);
1958  targetMapToOrigRow(i) = LO_INVALID;
1959  targetMapToImportRow(i) = I_LID;
1960  }
1961  });
1962 
1963  // Create the KernelHandle
1964  using KernelHandle =
1965  KokkosKernels::Experimental::KokkosKernelsHandle<typename lno_view_t::const_value_type,
1966  typename lno_nnz_view_t::const_value_type,
1967  typename scalar_view_t::const_value_type,
1968  typename device_t::execution_space,
1969  typename device_t::memory_space,
1970  typename device_t::memory_space>;
1971  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
1972  std::string myalg("SPGEMM_KK_MEMORY");
1973  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
1974 
1975  KernelHandle kh;
1976  kh.create_spgemm_handle(alg_enum);
1977  kh.set_team_work_size(team_work_size);
1978 
1979  // Get KokkosSparse::BsrMatrix for A and Bmerged (B and BImport)
1980  const KBSR Amat = Aview.origMatrix->getLocalMatrixDevice();
1981  const KBSR Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,
1982  targetMapToOrigRow,targetMapToImportRow,
1983  Bcol2Ccol,Icol2Ccol,
1984  Ccolmap.getConst()->getLocalNumElements());
1985 
1986  RCP<graph_t> graphC;
1987  typename KBSR::values_type values;
1988  {
1989  // Call KokkosSparse routines to calculate Amat*Bmerged on device.
1990  // NOTE: Need to scope guard this since the BlockCrs constructor will need to copy the host graph
1991  KBSR Cmat;
1992  KokkosSparse::block_spgemm_symbolic(kh, Amat, false, Bmerged, false, Cmat);
1993  KokkosSparse::block_spgemm_numeric (kh, Amat, false, Bmerged, false, Cmat);
1994  kh.destroy_spgemm_handle();
1995 
1996  // Build Tpetra::BlockCrsMatrix from KokkosSparse::BsrMatrix
1997  graphC = rcp(new graph_t(Cmat.graph, Aview.origMatrix->getRowMap(), Ccolmap.getConst()));
1998  values = Cmat.values;
1999  }
2000  C = rcp (new block_crs_matrix_type (*graphC, values, Aview.blocksize));
2001 
2002 }
2003 
2004 /*********************************************************************************************************/
2005 // AB NewMatrix Kernel wrappers (Default non-threaded version for CrsMatrix)
2006 template<class Scalar,
2007  class LocalOrdinal,
2008  class GlobalOrdinal,
2009  class Node,
2010  class LocalOrdinalViewType>
2011 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2012  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2013  const LocalOrdinalViewType & targetMapToOrigRow,
2014  const LocalOrdinalViewType & targetMapToImportRow,
2015  const LocalOrdinalViewType & Bcol2Ccol,
2016  const LocalOrdinalViewType & Icol2Ccol,
2017  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2018  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2019  const std::string& label,
2020  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2021 #ifdef HAVE_TPETRA_MMM_TIMINGS
2022  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2023  using Teuchos::TimeMonitor;
2024  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
2025 #endif
2026 
2027  using Teuchos::Array;
2028  using Teuchos::ArrayRCP;
2029  using Teuchos::ArrayView;
2030  using Teuchos::RCP;
2031  using Teuchos::rcp;
2032 
2033  // Lots and lots of typedefs
2034  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2035  typedef typename KCRS::StaticCrsGraphType graph_t;
2036  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2037  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2038  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2039  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2040 
2041  typedef Scalar SC;
2042  typedef LocalOrdinal LO;
2043  typedef GlobalOrdinal GO;
2044  typedef Node NO;
2045  typedef Map<LO,GO,NO> map_type;
2046  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2047  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2048  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2049 
2050  // Sizes
2051  RCP<const map_type> Ccolmap = C.getColMap();
2052  size_t m = Aview.origMatrix->getLocalNumRows();
2053  size_t n = Ccolmap->getLocalNumElements();
2054  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2055 
2056  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2057  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2058  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2059 
2060  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2061  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2062  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2063 
2064  c_lno_view_t Irowptr;
2065  lno_nnz_view_t Icolind;
2066  scalar_view_t Ivals;
2067  if(!Bview.importMatrix.is_null()) {
2068  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2069  Irowptr = lclB.graph.row_map;
2070  Icolind = lclB.graph.entries;
2071  Ivals = lclB.values;
2072  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2073  }
2074 
2075 #ifdef HAVE_TPETRA_MMM_TIMINGS
2076  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2077 #endif
2078 
2079  // Classic csr assembly (low memory edition)
2080  //
2081  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2082  // The method loops over rows of A, and may resize after processing
2083  // each row. Chris Siefert says that this reflects experience in
2084  // ML; for the non-threaded case, ML found it faster to spend less
2085  // effort on estimation and risk an occasional reallocation.
2086  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2087  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2088  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2089  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2090 
2091  // mfh 27 Sep 2016: The c_status array is an implementation detail
2092  // of the local sparse matrix-matrix multiply routine.
2093 
2094  // The status array will contain the index into colind where this entry was last deposited.
2095  // c_status[i] < CSR_ip - not in the row yet
2096  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2097  // We start with this filled with INVALID's indicating that there are no entries yet.
2098  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2099  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2100  std::vector<size_t> c_status(n, ST_INVALID);
2101 
2102  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2103  // routine. The routine computes C := A * (B_local + B_remote).
2104  //
2105  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2106  // you whether the corresponding row of B belongs to B_local
2107  // ("orig") or B_remote ("Import").
2108 
2109  // For each row of A/C
2110  size_t CSR_ip = 0, OLD_ip = 0;
2111  for (size_t i = 0; i < m; i++) {
2112  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2113  // on the calling process.
2114  Crowptr[i] = CSR_ip;
2115 
2116  // mfh 27 Sep 2016: For each entry of A in the current row of A
2117  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2118  LO Aik = Acolind[k]; // local column index of current entry of A
2119  const SC Aval = Avals[k]; // value of current entry of A
2120  if (Aval == SC_ZERO)
2121  continue; // skip explicitly stored zero values in A
2122 
2123  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2124  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2125  // corresponding to the current entry of A is populated, then
2126  // the corresponding row of B is in B_local (i.e., it lives on
2127  // the calling process).
2128 
2129  // Local matrix
2130  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2131 
2132  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
2133  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2134  LO Bkj = Bcolind[j];
2135  LO Cij = Bcol2Ccol[Bkj];
2136 
2137  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2138  // New entry
2139  c_status[Cij] = CSR_ip;
2140  Ccolind[CSR_ip] = Cij;
2141  Cvals[CSR_ip] = Aval*Bvals[j];
2142  CSR_ip++;
2143 
2144  } else {
2145  Cvals[c_status[Cij]] += Aval*Bvals[j];
2146  }
2147  }
2148 
2149  } else {
2150  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2151  // corresponding to the current entry of A NOT populated (has
2152  // a flag "invalid" value), then the corresponding row of B is
2153  // in B_local (i.e., it lives on the calling process).
2154 
2155  // Remote matrix
2156  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2157  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2158  LO Ikj = Icolind[j];
2159  LO Cij = Icol2Ccol[Ikj];
2160 
2161  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
2162  // New entry
2163  c_status[Cij] = CSR_ip;
2164  Ccolind[CSR_ip] = Cij;
2165  Cvals[CSR_ip] = Aval*Ivals[j];
2166  CSR_ip++;
2167  } else {
2168  Cvals[c_status[Cij]] += Aval*Ivals[j];
2169  }
2170  }
2171  }
2172  }
2173 
2174  // Resize for next pass if needed
2175  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
2176  CSR_alloc *= 2;
2177  Kokkos::resize(Ccolind,CSR_alloc);
2178  Kokkos::resize(Cvals,CSR_alloc);
2179  }
2180  OLD_ip = CSR_ip;
2181  }
2182 
2183  Crowptr[m] = CSR_ip;
2184 
2185  // Downward resize
2186  Kokkos::resize(Ccolind,CSR_ip);
2187  Kokkos::resize(Cvals,CSR_ip);
2188 
2189 #ifdef HAVE_TPETRA_MMM_TIMINGS
2190  {
2191  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
2192 #endif
2193 
2194  // Final sort & set of CRS arrays
2195  if (params.is_null() || params->get("sort entries",true))
2196  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2197  C.setAllValues(Crowptr,Ccolind, Cvals);
2198 
2199 
2200 #ifdef HAVE_TPETRA_MMM_TIMINGS
2201  }
2202  auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
2203  {
2204 #endif
2205 
2206  // Final FillComplete
2207  //
2208  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2209  // Import (from domain Map to column Map) construction (which costs
2210  // lots of communication) by taking the previously constructed
2211  // Import object. We should be able to do this without interfering
2212  // with the implementation of the local part of sparse matrix-matrix
2213  // multply above.
2214  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2215  labelList->set("Timer Label",label);
2216  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2217  RCP<const Export<LO,GO,NO> > dummyExport;
2218  C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2219 #ifdef HAVE_TPETRA_MMM_TIMINGS
2220  }
2221  MM2 = Teuchos::null;
2222 #endif
2223 
2224 }
2225 /*********************************************************************************************************/
2226 // Kernel method for computing the local portion of C = A*B (reuse)
2227 template<class Scalar,
2228  class LocalOrdinal,
2229  class GlobalOrdinal,
2230  class Node>
2231 void mult_A_B_reuse(
2232  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2233  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2234  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2235  const std::string& label,
2236  const Teuchos::RCP<Teuchos::ParameterList>& params)
2237 {
2238  using Teuchos::Array;
2239  using Teuchos::ArrayRCP;
2240  using Teuchos::ArrayView;
2241  using Teuchos::RCP;
2242  using Teuchos::rcp;
2243 
2244  // Tpetra typedefs
2245  typedef LocalOrdinal LO;
2246  typedef GlobalOrdinal GO;
2247  typedef Node NO;
2248  typedef Import<LO,GO,NO> import_type;
2249  typedef Map<LO,GO,NO> map_type;
2250 
2251  // Kokkos typedefs
2252  typedef typename map_type::local_map_type local_map_type;
2254  typedef typename KCRS::StaticCrsGraphType graph_t;
2255  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2256  typedef typename NO::execution_space execution_space;
2257  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2258  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2259 
2260 #ifdef HAVE_TPETRA_MMM_TIMINGS
2261  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2262  using Teuchos::TimeMonitor;
2263  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
2264 #endif
2265  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2266 
2267  // Grab all the maps
2268  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2269  RCP<const map_type> Ccolmap = C.getColMap();
2270  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2271  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2272  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2273  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2274  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2275  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2276 
2277  // Build the final importer / column map, hash table lookups for C
2278  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2279  {
2280  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2281  // So, column map of C may be a strict subset of the column map of B
2282  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2283  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2284  });
2285 
2286  if (!Bview.importMatrix.is_null()) {
2287  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2288  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2289 
2290  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2291  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2292  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2293  });
2294  }
2295  }
2296 
2297  // Run through all the hash table lookups once and for all
2298  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2299  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2300  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2301  GO aidx = Acolmap_local.getGlobalElement(i);
2302  LO B_LID = Browmap_local.getLocalElement(aidx);
2303  if (B_LID != LO_INVALID) {
2304  targetMapToOrigRow(i) = B_LID;
2305  targetMapToImportRow(i) = LO_INVALID;
2306  } else {
2307  LO I_LID = Irowmap_local.getLocalElement(aidx);
2308  targetMapToOrigRow(i) = LO_INVALID;
2309  targetMapToImportRow(i) = I_LID;
2310 
2311  }
2312  });
2313 
2314  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2315  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2316  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2317 }
2318 
2319 /*********************************************************************************************************/
2320 template<class Scalar,
2321  class LocalOrdinal,
2322  class GlobalOrdinal,
2323  class Node,
2324  class LocalOrdinalViewType>
2325 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2326  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2327  const LocalOrdinalViewType & targetMapToOrigRow,
2328  const LocalOrdinalViewType & targetMapToImportRow,
2329  const LocalOrdinalViewType & Bcol2Ccol,
2330  const LocalOrdinalViewType & Icol2Ccol,
2331  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2332  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2333  const std::string& label,
2334  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2335 #ifdef HAVE_TPETRA_MMM_TIMINGS
2336  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2337  using Teuchos::TimeMonitor;
2338  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2339  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2340 #else
2341  (void)label;
2342 #endif
2343  using Teuchos::RCP;
2344  using Teuchos::rcp;
2345 
2346 
2347  // Lots and lots of typedefs
2348  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2349  typedef typename KCRS::StaticCrsGraphType graph_t;
2350  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2351  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2352  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2353 
2354  typedef Scalar SC;
2355  typedef LocalOrdinal LO;
2356  typedef GlobalOrdinal GO;
2357  typedef Node NO;
2358  typedef Map<LO,GO,NO> map_type;
2359  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2360  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2361  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2362 
2363  // Sizes
2364  RCP<const map_type> Ccolmap = C.getColMap();
2365  size_t m = Aview.origMatrix->getLocalNumRows();
2366  size_t n = Ccolmap->getLocalNumElements();
2367 
2368  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2369  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2370  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2371  const KCRS & Cmat = C.getLocalMatrixHost();
2372 
2373  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2374  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2375  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2376  scalar_view_t Cvals = Cmat.values;
2377 
2378  c_lno_view_t Irowptr;
2379  lno_nnz_view_t Icolind;
2380  scalar_view_t Ivals;
2381  if(!Bview.importMatrix.is_null()) {
2382  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2383  Irowptr = lclB.graph.row_map;
2384  Icolind = lclB.graph.entries;
2385  Ivals = lclB.values;
2386  }
2387 
2388 #ifdef HAVE_TPETRA_MMM_TIMINGS
2389  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2390 #endif
2391 
2392  // Classic csr assembly (low memory edition)
2393  // mfh 27 Sep 2016: The c_status array is an implementation detail
2394  // of the local sparse matrix-matrix multiply routine.
2395 
2396  // The status array will contain the index into colind where this entry was last deposited.
2397  // c_status[i] < CSR_ip - not in the row yet
2398  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2399  // We start with this filled with INVALID's indicating that there are no entries yet.
2400  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2401  std::vector<size_t> c_status(n, ST_INVALID);
2402 
2403  // For each row of A/C
2404  size_t CSR_ip = 0, OLD_ip = 0;
2405  for (size_t i = 0; i < m; i++) {
2406  // First fill the c_status array w/ locations where we're allowed to
2407  // generate nonzeros for this row
2408  OLD_ip = Crowptr[i];
2409  CSR_ip = Crowptr[i+1];
2410  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2411  c_status[Ccolind[k]] = k;
2412 
2413  // Reset values in the row of C
2414  Cvals[k] = SC_ZERO;
2415  }
2416 
2417  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2418  LO Aik = Acolind[k];
2419  const SC Aval = Avals[k];
2420  if (Aval == SC_ZERO)
2421  continue;
2422 
2423  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2424  // Local matrix
2425  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2426 
2427  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2428  LO Bkj = Bcolind[j];
2429  LO Cij = Bcol2Ccol[Bkj];
2430 
2431  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2432  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2433  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2434 
2435  Cvals[c_status[Cij]] += Aval * Bvals[j];
2436  }
2437 
2438  } else {
2439  // Remote matrix
2440  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2441  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2442  LO Ikj = Icolind[j];
2443  LO Cij = Icol2Ccol[Ikj];
2444 
2445  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2446  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2447  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2448 
2449  Cvals[c_status[Cij]] += Aval * Ivals[j];
2450  }
2451  }
2452  }
2453  }
2454 
2455 #ifdef HAVE_TPETRA_MMM_TIMINGS
2456  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2457 #endif
2458 
2459  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2460 }
2461 
2462 
2463 /*********************************************************************************************************/
2464 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2465 template<class Scalar,
2466  class LocalOrdinal,
2467  class GlobalOrdinal,
2468  class Node>
2469 void jacobi_A_B_newmatrix(
2470  Scalar omega,
2471  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2472  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2473  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2474  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2475  const std::string& label,
2476  const Teuchos::RCP<Teuchos::ParameterList>& params)
2477 {
2478  using Teuchos::Array;
2479  using Teuchos::ArrayRCP;
2480  using Teuchos::ArrayView;
2481  using Teuchos::RCP;
2482  using Teuchos::rcp;
2483  // typedef Scalar SC;
2484  typedef LocalOrdinal LO;
2485  typedef GlobalOrdinal GO;
2486  typedef Node NO;
2487 
2488  typedef Import<LO,GO,NO> import_type;
2489  typedef Map<LO,GO,NO> map_type;
2490  typedef typename map_type::local_map_type local_map_type;
2491 
2492  // All of the Kokkos typedefs
2494  typedef typename KCRS::StaticCrsGraphType graph_t;
2495  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2496  typedef typename NO::execution_space execution_space;
2497  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2498  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2499 
2500 
2501 #ifdef HAVE_TPETRA_MMM_TIMINGS
2502  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2503  using Teuchos::TimeMonitor;
2504  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2505 #endif
2506  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2507 
2508  // Build the final importer / column map, hash table lookups for C
2509  RCP<const import_type> Cimport;
2510  RCP<const map_type> Ccolmap;
2511  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2512  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2513  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2514  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2515  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2516  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2517  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2518  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2519 
2520  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2521  // indices of B, to local column indices of C. (B and C have the
2522  // same number of columns.) The kernel uses this, instead of
2523  // copying the entire input matrix B and converting its column
2524  // indices to those of C.
2525  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2526 
2527  if (Bview.importMatrix.is_null()) {
2528  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2529  Cimport = Bimport;
2530  Ccolmap = Bview.colMap;
2531  // Bcol2Ccol is trivial
2532  // Bcol2Ccol is trivial
2533 
2534  Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getLocalNumElements ()));
2535  Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2536  Bcol2Ccol(i) = static_cast<LO> (i);
2537  });
2538  } else {
2539  // mfh 27 Sep 2016: B has "remotes," so we need to build the
2540  // column Map of C, as well as C's Import object (from its domain
2541  // Map to its column Map). C's column Map is the union of the
2542  // column Maps of (the local part of) B, and the "remote" part of
2543  // B. Ditto for the Import. We have optimized this "setUnion"
2544  // operation on Import objects and Maps.
2545 
2546  // Choose the right variant of setUnion
2547  if (!Bimport.is_null() && !Iimport.is_null()){
2548  Cimport = Bimport->setUnion(*Iimport);
2549  Ccolmap = Cimport->getTargetMap();
2550 
2551  } else if (!Bimport.is_null() && Iimport.is_null()) {
2552  Cimport = Bimport->setUnion();
2553 
2554  } else if(Bimport.is_null() && !Iimport.is_null()) {
2555  Cimport = Iimport->setUnion();
2556 
2557  } else
2558  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2559 
2560  Ccolmap = Cimport->getTargetMap();
2561 
2562  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2563  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2564 
2565  // NOTE: This is not efficient and should be folded into setUnion
2566  //
2567  // mfh 27 Sep 2016: What the above comment means, is that the
2568  // setUnion operation on Import objects could also compute these
2569  // local index - to - local index look-up tables.
2570  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2571  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2572  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2573  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2574  });
2575  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2576  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2577  });
2578 
2579  }
2580 
2581  // Replace the column map
2582  //
2583  // mfh 27 Sep 2016: We do this because C was originally created
2584  // without a column Map. Now we have its column Map.
2585  C.replaceColMap(Ccolmap);
2586 
2587  // mfh 27 Sep 2016: Construct tables that map from local column
2588  // indices of A, to local row indices of either B_local (the locally
2589  // owned part of B), or B_remote (the "imported" remote part of B).
2590  //
2591  // For column index Aik in row i of A, if the corresponding row of B
2592  // exists in the local part of B ("orig") (which I'll call B_local),
2593  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2594  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2595  //
2596  // For column index Aik in row i of A, if the corresponding row of B
2597  // exists in the remote part of B ("Import") (which I'll call
2598  // B_remote), then targetMapToImportRow[Aik] is the local index of
2599  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2600  // (a flag value).
2601 
2602  // Run through all the hash table lookups once and for all
2603  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2604  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2605  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2606  GO aidx = Acolmap_local.getGlobalElement(i);
2607  LO B_LID = Browmap_local.getLocalElement(aidx);
2608  if (B_LID != LO_INVALID) {
2609  targetMapToOrigRow(i) = B_LID;
2610  targetMapToImportRow(i) = LO_INVALID;
2611  } else {
2612  LO I_LID = Irowmap_local.getLocalElement(aidx);
2613  targetMapToOrigRow(i) = LO_INVALID;
2614  targetMapToImportRow(i) = I_LID;
2615 
2616  }
2617  });
2618 
2619  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2620  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2621  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2622 
2623 }
2624 
2625 
2626 /*********************************************************************************************************/
2627 // Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2628 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2629 
2630 template<class Scalar,
2631  class LocalOrdinal,
2632  class GlobalOrdinal,
2633  class Node,
2634  class LocalOrdinalViewType>
2635 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2636  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2637  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2638  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2639  const LocalOrdinalViewType & targetMapToOrigRow,
2640  const LocalOrdinalViewType & targetMapToImportRow,
2641  const LocalOrdinalViewType & Bcol2Ccol,
2642  const LocalOrdinalViewType & Icol2Ccol,
2643  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2644  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2645  const std::string& label,
2646  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2647 
2648 #ifdef HAVE_TPETRA_MMM_TIMINGS
2649  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2650  using Teuchos::TimeMonitor;
2651  auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
2652 #endif
2653 
2654  using Teuchos::Array;
2655  using Teuchos::ArrayRCP;
2656  using Teuchos::ArrayView;
2657  using Teuchos::RCP;
2658  using Teuchos::rcp;
2659 
2660  // Lots and lots of typedefs
2661  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2662  typedef typename KCRS::StaticCrsGraphType graph_t;
2663  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2664  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2665  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2666  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2667 
2668  // Jacobi-specific
2669  typedef typename scalar_view_t::memory_space scalar_memory_space;
2670 
2671  typedef Scalar SC;
2672  typedef LocalOrdinal LO;
2673  typedef GlobalOrdinal GO;
2674  typedef Node NO;
2675 
2676  typedef Map<LO,GO,NO> map_type;
2677  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2678  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2679 
2680  // Sizes
2681  RCP<const map_type> Ccolmap = C.getColMap();
2682  size_t m = Aview.origMatrix->getLocalNumRows();
2683  size_t n = Ccolmap->getLocalNumElements();
2684  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2685 
2686  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2687  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2688  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2689 
2690  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2691  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2692  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2693 
2694  c_lno_view_t Irowptr;
2695  lno_nnz_view_t Icolind;
2696  scalar_view_t Ivals;
2697  if(!Bview.importMatrix.is_null()) {
2698  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2699  Irowptr = lclB.graph.row_map;
2700  Icolind = lclB.graph.entries;
2701  Ivals = lclB.values;
2702  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2703  }
2704 
2705  // Jacobi-specific inner stuff
2706  auto Dvals =
2707  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2708 
2709  // Teuchos::ArrayView::operator[].
2710  // The status array will contain the index into colind where this entry was last deposited.
2711  // c_status[i] < CSR_ip - not in the row yet.
2712  // c_status[i] >= CSR_ip, this is the entry where you can find the data
2713  // We start with this filled with INVALID's indicating that there are no entries yet.
2714  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2715  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2716  Array<size_t> c_status(n, ST_INVALID);
2717 
2718  // Classic csr assembly (low memory edition)
2719  //
2720  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2721  // The method loops over rows of A, and may resize after processing
2722  // each row. Chris Siefert says that this reflects experience in
2723  // ML; for the non-threaded case, ML found it faster to spend less
2724  // effort on estimation and risk an occasional reallocation.
2725  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2726  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2727  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2728  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2729  size_t CSR_ip = 0, OLD_ip = 0;
2730 
2731  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2732 
2733  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2734  // routine. The routine computes
2735  //
2736  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2737  //
2738  // This corresponds to one sweep of (weighted) Jacobi.
2739  //
2740  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2741  // you whether the corresponding row of B belongs to B_local
2742  // ("orig") or B_remote ("Import").
2743 
2744  // For each row of A/C
2745  for (size_t i = 0; i < m; i++) {
2746  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2747  // on the calling process.
2748  Crowptr[i] = CSR_ip;
2749  SC minusOmegaDval = -omega*Dvals(i,0);
2750 
2751  // Entries of B
2752  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2753  Scalar Bval = Bvals[j];
2754  if (Bval == SC_ZERO)
2755  continue;
2756  LO Bij = Bcolind[j];
2757  LO Cij = Bcol2Ccol[Bij];
2758 
2759  // Assume no repeated entries in B
2760  c_status[Cij] = CSR_ip;
2761  Ccolind[CSR_ip] = Cij;
2762  Cvals[CSR_ip] = Bvals[j];
2763  CSR_ip++;
2764  }
2765 
2766  // Entries of -omega * Dinv * A * B
2767  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2768  LO Aik = Acolind[k];
2769  const SC Aval = Avals[k];
2770  if (Aval == SC_ZERO)
2771  continue;
2772 
2773  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2774  // Local matrix
2775  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2776 
2777  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2778  LO Bkj = Bcolind[j];
2779  LO Cij = Bcol2Ccol[Bkj];
2780 
2781  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2782  // New entry
2783  c_status[Cij] = CSR_ip;
2784  Ccolind[CSR_ip] = Cij;
2785  Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2786  CSR_ip++;
2787 
2788  } else {
2789  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2790  }
2791  }
2792 
2793  } else {
2794  // Remote matrix
2795  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2796  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2797  LO Ikj = Icolind[j];
2798  LO Cij = Icol2Ccol[Ikj];
2799 
2800  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2801  // New entry
2802  c_status[Cij] = CSR_ip;
2803  Ccolind[CSR_ip] = Cij;
2804  Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2805  CSR_ip++;
2806  } else {
2807  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2808  }
2809  }
2810  }
2811  }
2812 
2813  // Resize for next pass if needed
2814  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2815  CSR_alloc *= 2;
2816  Kokkos::resize(Ccolind,CSR_alloc);
2817  Kokkos::resize(Cvals,CSR_alloc);
2818  }
2819  OLD_ip = CSR_ip;
2820  }
2821  Crowptr[m] = CSR_ip;
2822 
2823  // Downward resize
2824  Kokkos::resize(Ccolind,CSR_ip);
2825  Kokkos::resize(Cvals,CSR_ip);
2826 
2827  {
2828 #ifdef HAVE_TPETRA_MMM_TIMINGS
2829  auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2830 #endif
2831 
2832  // Replace the column map
2833  //
2834  // mfh 27 Sep 2016: We do this because C was originally created
2835  // without a column Map. Now we have its column Map.
2836  C.replaceColMap(Ccolmap);
2837 
2838  // Final sort & set of CRS arrays
2839  //
2840  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2841  // matrix-matrix multiply routine sort the entries for us?
2842  // Final sort & set of CRS arrays
2843  if (params.is_null() || params->get("sort entries",true))
2844  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2845  C.setAllValues(Crowptr,Ccolind, Cvals);
2846  }
2847  {
2848 #ifdef HAVE_TPETRA_MMM_TIMINGS
2849  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2850 #endif
2851 
2852  // Final FillComplete
2853  //
2854  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2855  // Import (from domain Map to column Map) construction (which costs
2856  // lots of communication) by taking the previously constructed
2857  // Import object. We should be able to do this without interfering
2858  // with the implementation of the local part of sparse matrix-matrix
2859  // multply above
2860  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2861  labelList->set("Timer Label",label);
2862  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2863  RCP<const Export<LO,GO,NO> > dummyExport;
2864  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2865 
2866  }
2867 }
2868 
2869 
2870 /*********************************************************************************************************/
2871 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2872 template<class Scalar,
2873  class LocalOrdinal,
2874  class GlobalOrdinal,
2875  class Node>
2876 void jacobi_A_B_reuse(
2877  Scalar omega,
2878  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2879  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2880  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2881  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2882  const std::string& label,
2883  const Teuchos::RCP<Teuchos::ParameterList>& params)
2884 {
2885  using Teuchos::Array;
2886  using Teuchos::ArrayRCP;
2887  using Teuchos::ArrayView;
2888  using Teuchos::RCP;
2889  using Teuchos::rcp;
2890 
2891  typedef LocalOrdinal LO;
2892  typedef GlobalOrdinal GO;
2893  typedef Node NO;
2894 
2895  typedef Import<LO,GO,NO> import_type;
2896  typedef Map<LO,GO,NO> map_type;
2897 
2898  // Kokkos typedefs
2899  typedef typename map_type::local_map_type local_map_type;
2901  typedef typename KCRS::StaticCrsGraphType graph_t;
2902  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2903  typedef typename NO::execution_space execution_space;
2904  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2905  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2906 
2907 #ifdef HAVE_TPETRA_MMM_TIMINGS
2908  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2909  using Teuchos::TimeMonitor;
2910  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2911 #endif
2912  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2913 
2914  // Grab all the maps
2915  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2916  RCP<const map_type> Ccolmap = C.getColMap();
2917  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2918  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2919  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2920  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2921  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2922  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2923 
2924  // Build the final importer / column map, hash table lookups for C
2925  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2926  {
2927  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2928  // So, column map of C may be a strict subset of the column map of B
2929  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2930  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2931  });
2932 
2933  if (!Bview.importMatrix.is_null()) {
2934  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2935  std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2936 
2937  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2938  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2939  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2940  });
2941  }
2942  }
2943 
2944  // Run through all the hash table lookups once and for all
2945  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2946  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2947  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2948  GO aidx = Acolmap_local.getGlobalElement(i);
2949  LO B_LID = Browmap_local.getLocalElement(aidx);
2950  if (B_LID != LO_INVALID) {
2951  targetMapToOrigRow(i) = B_LID;
2952  targetMapToImportRow(i) = LO_INVALID;
2953  } else {
2954  LO I_LID = Irowmap_local.getLocalElement(aidx);
2955  targetMapToOrigRow(i) = LO_INVALID;
2956  targetMapToImportRow(i) = I_LID;
2957 
2958  }
2959  });
2960 
2961 #ifdef HAVE_TPETRA_MMM_TIMINGS
2962  MM = Teuchos::null;
2963 #endif
2964 
2965  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2966  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2967  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2968 }
2969 
2970 
2971 
2972 /*********************************************************************************************************/
2973 template<class Scalar,
2974  class LocalOrdinal,
2975  class GlobalOrdinal,
2976  class Node,
2977  class LocalOrdinalViewType>
2978 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2979  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2980  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2981  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2982  const LocalOrdinalViewType & targetMapToOrigRow,
2983  const LocalOrdinalViewType & targetMapToImportRow,
2984  const LocalOrdinalViewType & Bcol2Ccol,
2985  const LocalOrdinalViewType & Icol2Ccol,
2986  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2987  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2988  const std::string& label,
2989  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2990 #ifdef HAVE_TPETRA_MMM_TIMINGS
2991  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2992  using Teuchos::TimeMonitor;
2993  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
2994  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2995 #else
2996  (void)label;
2997 #endif
2998  using Teuchos::RCP;
2999  using Teuchos::rcp;
3000 
3001  // Lots and lots of typedefs
3002  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
3003  typedef typename KCRS::StaticCrsGraphType graph_t;
3004  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
3005  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3006  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3007  typedef typename scalar_view_t::memory_space scalar_memory_space;
3008 
3009  typedef Scalar SC;
3010  typedef LocalOrdinal LO;
3011  typedef GlobalOrdinal GO;
3012  typedef Node NO;
3013  typedef Map<LO,GO,NO> map_type;
3014  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3015  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3016  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
3017 
3018  // Sizes
3019  RCP<const map_type> Ccolmap = C.getColMap();
3020  size_t m = Aview.origMatrix->getLocalNumRows();
3021  size_t n = Ccolmap->getLocalNumElements();
3022 
3023  // Grab the Kokkos::SparseCrsMatrices & inner stuff
3024  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
3025  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
3026  const KCRS & Cmat = C.getLocalMatrixHost();
3027 
3028  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
3029  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
3030  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
3031  scalar_view_t Cvals = Cmat.values;
3032 
3033  c_lno_view_t Irowptr;
3034  lno_nnz_view_t Icolind;
3035  scalar_view_t Ivals;
3036  if(!Bview.importMatrix.is_null()) {
3037  auto lclB = Bview.importMatrix->getLocalMatrixHost();
3038  Irowptr = lclB.graph.row_map;
3039  Icolind = lclB.graph.entries;
3040  Ivals = lclB.values;
3041  }
3042 
3043  // Jacobi-specific inner stuff
3044  auto Dvals =
3045  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
3046 
3047 #ifdef HAVE_TPETRA_MMM_TIMINGS
3048  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
3049 #endif
3050 
3051  // The status array will contain the index into colind where this entry was last deposited.
3052  // c_status[i] < CSR_ip - not in the row yet
3053  // c_status[i] >= CSR_ip - this is the entry where you can find the data
3054  // We start with this filled with INVALID's indicating that there are no entries yet.
3055  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
3056  std::vector<size_t> c_status(n, ST_INVALID);
3057 
3058  // For each row of A/C
3059  size_t CSR_ip = 0, OLD_ip = 0;
3060  for (size_t i = 0; i < m; i++) {
3061 
3062  // First fill the c_status array w/ locations where we're allowed to
3063  // generate nonzeros for this row
3064  OLD_ip = Crowptr[i];
3065  CSR_ip = Crowptr[i+1];
3066  for (size_t k = OLD_ip; k < CSR_ip; k++) {
3067  c_status[Ccolind[k]] = k;
3068 
3069  // Reset values in the row of C
3070  Cvals[k] = SC_ZERO;
3071  }
3072 
3073  SC minusOmegaDval = -omega*Dvals(i,0);
3074 
3075  // Entries of B
3076  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
3077  Scalar Bval = Bvals[j];
3078  if (Bval == SC_ZERO)
3079  continue;
3080  LO Bij = Bcolind[j];
3081  LO Cij = Bcol2Ccol[Bij];
3082 
3083  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3084  std::runtime_error, "Trying to insert a new entry into a static graph");
3085 
3086  Cvals[c_status[Cij]] = Bvals[j];
3087  }
3088 
3089  // Entries of -omega * Dinv * A * B
3090  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
3091  LO Aik = Acolind[k];
3092  const SC Aval = Avals[k];
3093  if (Aval == SC_ZERO)
3094  continue;
3095 
3096  if (targetMapToOrigRow[Aik] != LO_INVALID) {
3097  // Local matrix
3098  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
3099 
3100  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
3101  LO Bkj = Bcolind[j];
3102  LO Cij = Bcol2Ccol[Bkj];
3103 
3104  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3105  std::runtime_error, "Trying to insert a new entry into a static graph");
3106 
3107  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
3108  }
3109 
3110  } else {
3111  // Remote matrix
3112  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
3113  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
3114  LO Ikj = Icolind[j];
3115  LO Cij = Icol2Ccol[Ikj];
3116 
3117  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3118  std::runtime_error, "Trying to insert a new entry into a static graph");
3119 
3120  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
3121  }
3122  }
3123  }
3124  }
3125 
3126 #ifdef HAVE_TPETRA_MMM_TIMINGS
3127  MM2 = Teuchos::null;
3128  MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
3129 #endif
3130 
3131  C.fillComplete(C.getDomainMap(), C.getRangeMap());
3132 #ifdef HAVE_TPETRA_MMM_TIMINGS
3133  MM2 = Teuchos::null;
3134  MM = Teuchos::null;
3135 #endif
3136 
3137 }
3138 
3139 
3140 
3141 /*********************************************************************************************************/
3142 template<class Scalar,
3143  class LocalOrdinal,
3144  class GlobalOrdinal,
3145  class Node>
3146 void import_and_extract_views(
3147  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3148  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3149  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3150  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3151  bool userAssertsThereAreNoRemotes,
3152  const std::string& label,
3153  const Teuchos::RCP<Teuchos::ParameterList>& params)
3154 {
3155  using Teuchos::Array;
3156  using Teuchos::ArrayView;
3157  using Teuchos::RCP;
3158  using Teuchos::rcp;
3159  using Teuchos::null;
3160 
3161  typedef Scalar SC;
3162  typedef LocalOrdinal LO;
3163  typedef GlobalOrdinal GO;
3164  typedef Node NO;
3165 
3166  typedef Map<LO,GO,NO> map_type;
3167  typedef Import<LO,GO,NO> import_type;
3168  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
3169 
3170 #ifdef HAVE_TPETRA_MMM_TIMINGS
3171  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
3172  using Teuchos::TimeMonitor;
3173  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
3174 #endif
3175  // The goal of this method is to populate the 'Aview' struct with views of the
3176  // rows of A, including all rows that correspond to elements in 'targetMap'.
3177  //
3178  // If targetMap includes local elements that correspond to remotely-owned rows
3179  // of A, then those remotely-owned rows will be imported into
3180  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
3181  Aview.deleteContents();
3182 
3183  Aview.origMatrix = rcp(&A, false);
3184  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3185  Aview.origMatrix->getApplyHelper();
3186  Aview.origRowMap = A.getRowMap();
3187  Aview.rowMap = targetMap;
3188  Aview.colMap = A.getColMap();
3189  Aview.domainMap = A.getDomainMap();
3190  Aview.importColMap = null;
3191  RCP<const map_type> rowMap = A.getRowMap();
3192  const int numProcs = rowMap->getComm()->getSize();
3193 
3194  // Short circuit if the user swears there are no remotes (or if we're in serial)
3195  if (userAssertsThereAreNoRemotes || numProcs < 2)
3196  return;
3197 
3198  RCP<const import_type> importer;
3199  if (params != null && params->isParameter("importer")) {
3200  importer = params->get<RCP<const import_type> >("importer");
3201 
3202  } else {
3203 #ifdef HAVE_TPETRA_MMM_TIMINGS
3204  MM = Teuchos::null;
3205  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
3206 #endif
3207 
3208  // Mark each row in targetMap as local or remote, and go ahead and get a view
3209  // for the local rows
3210  RCP<const map_type> remoteRowMap;
3211  size_t numRemote = 0;
3212  int mode = 0;
3213  if (!prototypeImporter.is_null() &&
3214  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3215  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3216  // We have a valid prototype importer --- ask it for the remotes
3217 #ifdef HAVE_TPETRA_MMM_TIMINGS
3218  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode1"));
3219 #endif
3220  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3221  numRemote = prototypeImporter->getNumRemoteIDs();
3222 
3223  Array<GO> remoteRows(numRemote);
3224  for (size_t i = 0; i < numRemote; i++)
3225  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3226 
3227  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3228  rowMap->getIndexBase(), rowMap->getComm()));
3229  mode = 1;
3230 
3231  } else if (prototypeImporter.is_null()) {
3232  // No prototype importer --- count the remotes the hard way
3233 #ifdef HAVE_TPETRA_MMM_TIMINGS
3234  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode2"));
3235 #endif
3236  ArrayView<const GO> rows = targetMap->getLocalElementList();
3237  size_t numRows = targetMap->getLocalNumElements();
3238 
3239  Array<GO> remoteRows(numRows);
3240  for(size_t i = 0; i < numRows; ++i) {
3241  const LO mlid = rowMap->getLocalElement(rows[i]);
3242 
3243  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3244  remoteRows[numRemote++] = rows[i];
3245  }
3246  remoteRows.resize(numRemote);
3247  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3248  rowMap->getIndexBase(), rowMap->getComm()));
3249  mode = 2;
3250 
3251  } else {
3252  // PrototypeImporter is bad. But if we're in serial that's OK.
3253  mode = 3;
3254  }
3255 
3256  if (numProcs < 2) {
3257  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3258  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3259  // If only one processor we don't need to import any remote rows, so return.
3260  return;
3261  }
3262 
3263  //
3264  // Now we will import the needed remote rows of A, if the global maximum
3265  // value of numRemote is greater than 0.
3266  //
3267 #ifdef HAVE_TPETRA_MMM_TIMINGS
3268  MM = Teuchos::null;
3269  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
3270 #endif
3271 
3272  global_size_t globalMaxNumRemote = 0;
3273  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3274 
3275  if (globalMaxNumRemote > 0) {
3276 #ifdef HAVE_TPETRA_MMM_TIMINGS
3277  MM = Teuchos::null;
3278  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
3279 #endif
3280  // Create an importer with target-map remoteRowMap and source-map rowMap.
3281  if (mode == 1)
3282  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3283  else if (mode == 2)
3284  importer = rcp(new import_type(rowMap, remoteRowMap));
3285  else
3286  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
3287  }
3288 
3289  if (params != null)
3290  params->set("importer", importer);
3291  }
3292 
3293  if (importer != null) {
3294 #ifdef HAVE_TPETRA_MMM_TIMINGS
3295  MM = Teuchos::null;
3296  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
3297 #endif
3298 
3299  // Now create a new matrix into which we can import the remote rows of A that we need.
3300  Teuchos::ParameterList labelList;
3301  labelList.set("Timer Label", label);
3302  auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3303 
3304  bool isMM = true;
3305  bool overrideAllreduce = false;
3306  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3307  // Minor speedup tweak - avoid computing the global constants
3308  Teuchos::ParameterList params_sublist;
3309  if(!params.is_null()) {
3310  labelList.set("compute global constants", params->get("compute global constants",false));
3311  params_sublist = params->sublist("matrixmatrix: kernel params",false);
3312  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3313  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3314  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3315  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3316  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3317  }
3318  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3319  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3320  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3321 
3322  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3323  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3324  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3325  Aview.importMatrix->getApplyHelper();
3326 
3327 #if 0
3328  // Disabled code for dumping input matrices
3329  static int count=0;
3330  char str[80];
3331  sprintf(str,"import_matrix.%d.dat",count);
3333  count++;
3334 #endif
3335 
3336 #ifdef HAVE_TPETRA_MMM_STATISTICS
3337  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3338 #endif
3339 
3340 
3341 #ifdef HAVE_TPETRA_MMM_TIMINGS
3342  MM = Teuchos::null;
3343  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3344 #endif
3345 
3346  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3347  Aview.importColMap = Aview.importMatrix->getColMap();
3348 #ifdef HAVE_TPETRA_MMM_TIMINGS
3349  MM = Teuchos::null;
3350 #endif
3351  }
3352 }
3353 
3354 /*********************************************************************************************************/
3355 template<class Scalar,
3356  class LocalOrdinal,
3357  class GlobalOrdinal,
3358  class Node>
3359 void import_and_extract_views(
3360  const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& M,
3361  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3362  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview,
3363  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3364  bool userAssertsThereAreNoRemotes)
3365 {
3366  using Teuchos::Array;
3367  using Teuchos::ArrayView;
3368  using Teuchos::RCP;
3369  using Teuchos::rcp;
3370  using Teuchos::null;
3371 
3372  typedef Scalar SC;
3373  typedef LocalOrdinal LO;
3374  typedef GlobalOrdinal GO;
3375  typedef Node NO;
3376 
3377  typedef Map<LO,GO,NO> map_type;
3378  typedef Import<LO,GO,NO> import_type;
3379  typedef BlockCrsMatrix<SC,LO,GO,NO> blockcrs_matrix_type;
3380 
3381  // The goal of this method is to populate the 'Mview' struct with views of the
3382  // rows of M, including all rows that correspond to elements in 'targetMap'.
3383  //
3384  // If targetMap includes local elements that correspond to remotely-owned rows
3385  // of M, then those remotely-owned rows will be imported into
3386  // 'Mview.importMatrix', and views of them will be included in 'Mview'.
3387  Mview.deleteContents();
3388 
3389  Mview.origMatrix = rcp(&M, false);
3390  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3391  Mview.origMatrix->getApplyHelper();
3392  Mview.origRowMap = M.getRowMap();
3393  Mview.rowMap = targetMap;
3394  Mview.colMap = M.getColMap();
3395  Mview.importColMap = null;
3396  RCP<const map_type> rowMap = M.getRowMap();
3397  const int numProcs = rowMap->getComm()->getSize();
3398 
3399  // Short circuit if the user swears there are no remotes (or if we're in serial)
3400  if (userAssertsThereAreNoRemotes || numProcs < 2) return;
3401 
3402  // Mark each row in targetMap as local or remote, and go ahead and get a view
3403  // for the local rows
3404  RCP<const map_type> remoteRowMap;
3405  size_t numRemote = 0;
3406  int mode = 0;
3407  if (!prototypeImporter.is_null() &&
3408  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3409  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3410 
3411  // We have a valid prototype importer --- ask it for the remotes
3412  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3413  numRemote = prototypeImporter->getNumRemoteIDs();
3414 
3415  Array<GO> remoteRows(numRemote);
3416  for (size_t i = 0; i < numRemote; i++)
3417  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3418 
3419  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3420  rowMap->getIndexBase(), rowMap->getComm()));
3421  mode = 1;
3422 
3423  } else if (prototypeImporter.is_null()) {
3424 
3425  // No prototype importer --- count the remotes the hard way
3426  ArrayView<const GO> rows = targetMap->getLocalElementList();
3427  size_t numRows = targetMap->getLocalNumElements();
3428 
3429  Array<GO> remoteRows(numRows);
3430  for(size_t i = 0; i < numRows; ++i) {
3431  const LO mlid = rowMap->getLocalElement(rows[i]);
3432 
3433  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3434  remoteRows[numRemote++] = rows[i];
3435  }
3436  remoteRows.resize(numRemote);
3437  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3438  rowMap->getIndexBase(), rowMap->getComm()));
3439  mode = 2;
3440 
3441  } else {
3442  // PrototypeImporter is bad. But if we're in serial that's OK.
3443  mode = 3;
3444  }
3445 
3446  if (numProcs < 2) {
3447  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3448  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3449  // If only one processor we don't need to import any remote rows, so return.
3450  return;
3451  }
3452 
3453  // Now we will import the needed remote rows of M, if the global maximum
3454  // value of numRemote is greater than 0.
3455  global_size_t globalMaxNumRemote = 0;
3456  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3457 
3458  RCP<const import_type> importer;
3459 
3460  if (globalMaxNumRemote > 0) {
3461  // Create an importer with target-map remoteRowMap and source-map rowMap.
3462  if (mode == 1)
3463  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3464  else if (mode == 2)
3465  importer = rcp(new import_type(rowMap, remoteRowMap));
3466  else
3467  throw std::runtime_error("prototypeImporter->SourceMap() does not match M.getRowMap()!");
3468  }
3469 
3470  if (importer != null) {
3471  // Get import matrix
3472  // TODO: create the int-typed row-pointer here
3473  Mview.importMatrix = Tpetra::importAndFillCompleteBlockCrsMatrix<blockcrs_matrix_type>(rcpFromRef(M), *importer);
3474  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3475  Mview.importMatrix->getApplyHelper();
3476  // Save the column map of the imported matrix, so that we can convert indices
3477  // back to global for arithmetic later
3478  Mview.importColMap = Mview.importMatrix->getColMap();
3479  }
3480 }
3481 
3482 /*********************************************************************************************************/
3483  // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3484 template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3486 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3487  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3488  const LocalOrdinalViewType & Acol2Brow,
3489  const LocalOrdinalViewType & Acol2Irow,
3490  const LocalOrdinalViewType & Bcol2Ccol,
3491  const LocalOrdinalViewType & Icol2Ccol,
3492  const size_t mergedNodeNumCols) {
3493 
3494  using Teuchos::RCP;
3496  typedef typename KCRS::StaticCrsGraphType graph_t;
3497  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3498  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3499  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3500  // Grab the Kokkos::SparseCrsMatrices
3501  const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3502  const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3503 
3504  // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3505  if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3506  // We do have a Bimport
3507  // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3508  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3509  RCP<const KCRS> Ik_;
3510  if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3511  const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3512  KCRS Iks;
3513  if(Ik!=0) Iks = *Ik;
3514  size_t merge_numrows = Ak.numCols();
3515  // The last entry of this at least, need to be initialized
3516  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3517 
3518  const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3519 
3520  // Use a Kokkos::parallel_scan to build the rowptr
3521  typedef typename Node::execution_space execution_space;
3522  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3523  Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3524  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3525  if(final) Mrowptr(i) = update;
3526  // Get the row count
3527  size_t ct=0;
3528  if(Acol2Brow(i)!=LO_INVALID)
3529  ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3530  else
3531  ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3532  update+=ct;
3533 
3534  if(final && i+1==merge_numrows)
3535  Mrowptr(i+1)=update;
3536  });
3537 
3538  // Allocate nnz
3539  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3540  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3541  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);
3542 
3543  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3544  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3545  Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3546  if(Acol2Brow(i)!=LO_INVALID) {
3547  size_t row = Acol2Brow(i);
3548  size_t start = Bk.graph.row_map(row);
3549  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3550  Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3551  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3552  }
3553  }
3554  else {
3555  size_t row = Acol2Irow(i);
3556  size_t start = Iks.graph.row_map(row);
3557  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3558  Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3559  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3560  }
3561  }
3562  });
3563 
3564  KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3565  return newmat;
3566  }
3567  else {
3568  // We don't have a Bimport (the easy case)
3569  return Bk;
3570  }
3571 }//end merge_matrices
3572 
3573 /*********************************************************************************************************/
3574  // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3575 template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3576 const typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type
3577 merge_matrices(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3578  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3579  const LocalOrdinalViewType & Acol2Brow,
3580  const LocalOrdinalViewType & Acol2Irow,
3581  const LocalOrdinalViewType & Bcol2Ccol,
3582  const LocalOrdinalViewType & Icol2Ccol,
3583  const size_t mergedNodeNumCols)
3584 {
3585  using Teuchos::RCP;
3586  typedef typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KBCRS;
3587  typedef typename KBCRS::StaticCrsGraphType graph_t;
3588  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3589  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3590  typedef typename KBCRS::values_type::non_const_type scalar_view_t;
3591 
3592  // Grab the KokkosSparse::BsrMatrix
3593  const KBCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3594  const KBCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3595 
3596  // We need to do this dance if either (a) We have Bimport or (b) A's colMap is not the same as B's rowMap
3597  if(!Bview.importMatrix.is_null() ||
3598  (Bview.importMatrix.is_null() &&
3599  (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3600 
3601  // We do have a Bimport
3602  // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3603  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3604  RCP<const KBCRS> Ik_;
3605  if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KBCRS>(Bview.importMatrix->getLocalMatrixDevice());
3606  const KBCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3607  KBCRS Iks;
3608  if(Ik!=0) Iks = *Ik;
3609  size_t merge_numrows = Ak.numCols();
3610 
3611  // The last entry of this at least, need to be initialized
3612  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3613 
3614  const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3615 
3616  // Use a Kokkos::parallel_scan to build the rowptr
3617  typedef typename Node::execution_space execution_space;
3618  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3619  Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3620  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3621  if(final) Mrowptr(i) = update;
3622  // Get the row count
3623  size_t ct=0;
3624  if(Acol2Brow(i)!=LO_INVALID)
3625  ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3626  else
3627  ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3628  update+=ct;
3629 
3630  if(final && i+1==merge_numrows)
3631  Mrowptr(i+1)=update;
3632  });
3633 
3634  // Allocate nnz
3635  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3636  const int blocksize = Ak.blockDim();
3637  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3638  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz*blocksize*blocksize);
3639 
3640  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3641  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3642  Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3643  if(Acol2Brow(i)!=LO_INVALID) {
3644  size_t row = Acol2Brow(i);
3645  size_t start = Bk.graph.row_map(row);
3646  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3647  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3648 
3649  for (int b=0; b<blocksize*blocksize; ++b) {
3650  const int val_indx = j*blocksize*blocksize + b;
3651  const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3652  Mvalues(val_indx) = Bk.values(b_val_indx);
3653  }
3654  }
3655  }
3656  else {
3657  size_t row = Acol2Irow(i);
3658  size_t start = Iks.graph.row_map(row);
3659  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3660  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3661 
3662  for (int b=0; b<blocksize*blocksize; ++b) {
3663  const int val_indx = j*blocksize*blocksize + b;
3664  const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3665  Mvalues(val_indx) = Iks.values(b_val_indx);
3666  }
3667  }
3668  }
3669  });
3670 
3671  // Build and return merged KokkosSparse matrix
3672  KBCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind, blocksize);
3673  return newmat;
3674  }
3675  else {
3676  // We don't have a Bimport (the easy case)
3677  return Bk;
3678  }
3679 }//end merge_matrices
3680 
3681 /*********************************************************************************************************/
3682 template<typename SC, typename LO, typename GO, typename NO>
3683 void AddKernels<SC, LO, GO, NO>::
3684 addSorted(
3685  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3686  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3687  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3688  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3689  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3690  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3691  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3692  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3693 #if KOKKOSKERNELS_VERSION >= 40299
3694  GO numGlobalCols,
3695 #endif
3696  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3697  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3698  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3699 {
3700  using Teuchos::TimeMonitor;
3701  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3702  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3703  auto nrows = Arowptrs.extent(0) - 1;
3704  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3705  typename AddKern::KKH handle;
3706  handle.create_spadd_handle(true);
3707  auto addHandle = handle.get_spadd_handle();
3708 #ifdef HAVE_TPETRA_MMM_TIMINGS
3709  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3710 #endif
3711  KokkosSparse::Experimental::spadd_symbolic
3712  (&handle,
3713 #if KOKKOSKERNELS_VERSION >= 40299
3714  nrows, numGlobalCols,
3715 #endif
3716  Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3717  //KokkosKernels requires values to be zeroed
3718  Cvals = values_array("C values", addHandle->get_c_nnz());
3719  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3720 #ifdef HAVE_TPETRA_MMM_TIMINGS
3721  MM = Teuchos::null;
3722  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
3723 #endif
3724  KokkosSparse::Experimental::spadd_numeric(&handle,
3725 #if KOKKOSKERNELS_VERSION >= 40299
3726  nrows, numGlobalCols,
3727 #endif
3728  Arowptrs, Acolinds, Avals, scalarA,
3729  Browptrs, Bcolinds, Bvals, scalarB,
3730  Crowptrs, Ccolinds, Cvals);
3731 }
3732 
3733 template<typename SC, typename LO, typename GO, typename NO>
3734 void AddKernels<SC, LO, GO, NO>::
3735 addUnsorted(
3736  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3737  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3738  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3739  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3740  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3741  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3742  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3743  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3744 #if KOKKOSKERNELS_VERSION >= 40299
3745  GO numGlobalCols,
3746 #else
3747  GO /* numGlobalCols */,
3748 #endif
3749  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3750  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3751  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3752 {
3753  using Teuchos::TimeMonitor;
3754  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3755  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3756  auto nrows = Arowptrs.extent(0) - 1;
3757  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3758  typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3759  typename AddKern::KKH handle;
3760  handle.create_spadd_handle(false);
3761  auto addHandle = handle.get_spadd_handle();
3762 #ifdef HAVE_TPETRA_MMM_TIMINGS
3763  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3764 #endif
3765  KokkosSparse::Experimental::spadd_symbolic
3766  (&handle,
3767 #if KOKKOSKERNELS_VERSION >= 40299
3768  nrows, numGlobalCols,
3769 #endif
3770  Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3771  //Cvals must be zeroed out
3772  Cvals = values_array("C values", addHandle->get_c_nnz());
3773  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3774 #ifdef HAVE_TPETRA_MMM_TIMINGS
3775  MM = Teuchos::null;
3776  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3777 #endif
3778  KokkosSparse::Experimental::spadd_numeric(&handle,
3779 #if KOKKOSKERNELS_VERSION >= 40299
3780  nrows, numGlobalCols,
3781 #endif
3782  Arowptrs, Acolinds, Avals, scalarA,
3783  Browptrs, Bcolinds, Bvals, scalarB,
3784  Crowptrs, Ccolinds, Cvals);
3785 }
3786 
3787 template<typename GO,
3788  typename LocalIndicesType,
3789  typename GlobalIndicesType,
3790  typename ColMapType>
3791 struct ConvertLocalToGlobalFunctor
3792 {
3793  ConvertLocalToGlobalFunctor(
3794  const LocalIndicesType& colindsOrig_,
3795  const GlobalIndicesType& colindsConverted_,
3796  const ColMapType& colmap_) :
3797  colindsOrig (colindsOrig_),
3798  colindsConverted (colindsConverted_),
3799  colmap (colmap_)
3800  {}
3801  KOKKOS_INLINE_FUNCTION void
3802  operator() (const GO i) const
3803  {
3804  colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3805  }
3806  LocalIndicesType colindsOrig;
3807  GlobalIndicesType colindsConverted;
3808  ColMapType colmap;
3809 };
3810 
3811 template<typename SC, typename LO, typename GO, typename NO>
3812 void MMdetails::AddKernels<SC, LO, GO, NO>::
3813 convertToGlobalAndAdd(
3814  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3815  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3816  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3817  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3818  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3819  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3820  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3821  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3822  typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3823 {
3824  using Teuchos::TimeMonitor;
3825  //Need to use a different KokkosKernelsHandle type than other versions,
3826  //since the ordinals are now GO
3827  using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3828  typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;
3829 
3830  const values_array Avals = A.values;
3831  const values_array Bvals = B.values;
3832  const col_inds_array Acolinds = A.graph.entries;
3833  const col_inds_array Bcolinds = B.graph.entries;
3834  auto Arowptrs = A.graph.row_map;
3835  auto Browptrs = B.graph.row_map;
3836  global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
3837  global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
3838 #ifdef HAVE_TPETRA_MMM_TIMINGS
3839  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
3840 #endif
3841  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3842  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3843  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3844  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3845  KKH_GO handle;
3846  handle.create_spadd_handle(false);
3847  auto addHandle = handle.get_spadd_handle();
3848 #ifdef HAVE_TPETRA_MMM_TIMINGS
3849  MM = Teuchos::null;
3850  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3851 #endif
3852  auto nrows = Arowptrs.extent(0) - 1;
3853  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3854  KokkosSparse::Experimental::spadd_symbolic
3855  (&handle,
3856 #if KOKKOSKERNELS_VERSION >= 40299
3857  nrows, A.numCols(),
3858 #endif
3859  Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3860  Cvals = values_array("C values", addHandle->get_c_nnz());
3861  Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3862 #ifdef HAVE_TPETRA_MMM_TIMINGS
3863  MM = Teuchos::null;
3864  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3865 #endif
3866  KokkosSparse::Experimental::spadd_numeric(&handle,
3867 #if KOKKOSKERNELS_VERSION >= 40299
3868  nrows, A.numCols(),
3869 #endif
3870  Arowptrs, AcolindsConverted, Avals, scalarA,
3871  Browptrs, BcolindsConverted, Bvals, scalarB,
3872  Crowptrs, Ccolinds, Cvals);
3873 }
3874 
3875 
3876 } //End namepsace MMdetails
3877 
3878 } //End namespace Tpetra
3879 
3880 /*********************************************************************************************************/
3881 //
3882 // Explicit instantiation macro
3883 //
3884 // Must be expanded from within the Tpetra namespace!
3885 //
3886 namespace Tpetra {
3887 
3888 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3889 template \
3890  void MatrixMatrix::Multiply( \
3891  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3892  bool transposeA, \
3893  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3894  bool transposeB, \
3895  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3896  bool call_FillComplete_on_result, \
3897  const std::string & label, \
3898  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3899 \
3900 template \
3901  void MatrixMatrix::Multiply( \
3902  const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& A, \
3903  bool transposeA, \
3904  const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& B, \
3905  bool transposeB, \
3906  Teuchos::RCP<BlockCrsMatrix< SCALAR , LO , GO , NODE > >& C, \
3907  const std::string & label); \
3908 \
3909 template \
3910  void MatrixMatrix::Jacobi( \
3911  SCALAR omega, \
3912  const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3913  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3914  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3915  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3916  bool call_FillComplete_on_result, \
3917  const std::string & label, \
3918  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3919 \
3920  template \
3921  void MatrixMatrix::Add( \
3922  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3923  bool transposeA, \
3924  SCALAR scalarA, \
3925  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3926  bool transposeB, \
3927  SCALAR scalarB, \
3928  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3929 \
3930  template \
3931  void MatrixMatrix::Add( \
3932  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3933  bool transposeA, \
3934  SCALAR scalarA, \
3935  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3936  bool transposeB, \
3937  SCALAR scalarB, \
3938  const Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3939 \
3940  template \
3941  void MatrixMatrix::Add( \
3942  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3943  bool transposeA, \
3944  SCALAR scalarA, \
3945  CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3946  SCALAR scalarB ); \
3947 \
3948  template \
3949  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3950  MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3951  (const SCALAR & alpha, \
3952  const bool transposeA, \
3953  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3954  const SCALAR & beta, \
3955  const bool transposeB, \
3956  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3957  const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3958  const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3959  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3960 \
3961  template \
3962  void \
3963  MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3964  (const SCALAR & alpha, \
3965  const bool transposeA, \
3966  const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3967  const SCALAR& beta, \
3968  const bool transposeB, \
3969  const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3970  CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3971  const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3972  const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3973  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3974 \
3975  template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>; \
3976 \
3977  template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const CrsMatrix<SCALAR, LO, GO, NODE>& M, \
3978  Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3979  CrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3980  Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3981  bool userAssertsThereAreNoRemotes, \
3982  const std::string& label, \
3983  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3984 \
3985  template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const BlockCrsMatrix<SCALAR, LO, GO, NODE>& M, \
3986  Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3987  BlockCrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3988  Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3989  bool userAssertsThereAreNoRemotes);
3990 } //End namespace Tpetra
3991 
3992 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix&#39;s column Map with the given Map.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void scale(const Scalar &alpha)
Scale the matrix&#39;s values: this := alpha*this.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isFillActive() const
Whether the matrix is not fill complete.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Struct that holds views of the contents of a BlockCrsMatrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void start()
Start the deep_copy counter.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Matrix Market file readers and writers for Tpetra objects.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void removeCrsMatrixZeros(CrsMatrixType &matrix, typename Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitudeType const &threshold=Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitude(Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::zero()))
Remove zero entries from a matrix.
Forward declaration of some Tpetra Matrix Matrix objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.