Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_DEF_HPP
13 #include "KokkosSparse_Utils.hpp"
14 #include "Tpetra_ConfigDefs.hpp"
16 #include "Teuchos_VerboseObject.hpp"
17 #include "Teuchos_Array.hpp"
18 #include "Tpetra_Util.hpp"
19 #include "Tpetra_CrsMatrix.hpp"
20 #include "Tpetra_BlockCrsMatrix.hpp"
22 #include "Tpetra_RowMatrixTransposer.hpp"
25 #include "Tpetra_Details_makeColMap.hpp"
26 #include "Tpetra_ConfigDefs.hpp"
27 #include "Tpetra_Map.hpp"
28 #include "Tpetra_Export.hpp"
29 #include "Tpetra_Import_Util.hpp"
30 #include "Tpetra_Import_Util2.hpp"
32 
33 #include <algorithm>
34 #include <type_traits>
35 #include "Teuchos_FancyOStream.hpp"
36 
37 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
39 
40 #include "KokkosSparse_spgemm.hpp"
41 #include "KokkosSparse_spadd.hpp"
42 #include "Kokkos_Bitset.hpp"
43 
44 #include <MatrixMarket_Tpetra.hpp>
45 
53 /*********************************************************************************************************/
54 // Include the architecture-specific kernel partial specializations here
55 // NOTE: This needs to be outside all namespaces
56 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
57 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
58 #include "TpetraExt_MatrixMatrix_HIP.hpp"
59 #include "TpetraExt_MatrixMatrix_SYCL.hpp"
60 
61 namespace Tpetra {
62 
63 namespace MatrixMatrix{
64 
65 //
66 // This method forms the matrix-matrix product C = op(A) * op(B), where
67 // op(A) == A if transposeA is false,
68 // op(A) == A^T if transposeA is true,
69 // and similarly for op(B).
70 //
71 template <class Scalar,
72  class LocalOrdinal,
73  class GlobalOrdinal,
74  class Node>
75 void Multiply(
77  bool transposeA,
79  bool transposeB,
81  bool call_FillComplete_on_result,
82  const std::string& label,
83  const Teuchos::RCP<Teuchos::ParameterList>& params)
84 {
85  using Teuchos::null;
86  using Teuchos::RCP;
87  using Teuchos::rcp;
88  typedef Scalar SC;
89  typedef LocalOrdinal LO;
90  typedef GlobalOrdinal GO;
91  typedef Node NO;
92  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
93  typedef Import<LO,GO,NO> import_type;
94  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
95  typedef Map<LO,GO,NO> map_type;
96  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
97 
98  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: All Setup"));
99 
100  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
101 
102  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
103 
104  // The input matrices A and B must both be fillComplete.
105  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
106  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
107 
108  // If transposeA is true, then Aprime will be the transpose of A
109  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
110  // will just be a pointer to A.
111  RCP<const crs_matrix_type> Aprime = null;
112  // If transposeB is true, then Bprime will be the transpose of B
113  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
114  // will just be a pointer to B.
115  RCP<const crs_matrix_type> Bprime = null;
116 
117  // Is this a "clean" matrix?
118  //
119  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
120  // locally nor globally indexed, then it was empty. I don't like
121  // this, because the most straightforward implementation presumes
122  // lazy allocation of indices. However, historical precedent
123  // demands that we keep around this predicate as a way to test
124  // whether the matrix is empty.
125  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
126 
127  bool use_optimized_ATB = false;
128  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
129  use_optimized_ATB = true;
130 
131 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
132  use_optimized_ATB = false;
133 #endif
134 
135  using Teuchos::ParameterList;
136  RCP<ParameterList> transposeParams (new ParameterList);
137  transposeParams->set ("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
138 
139  if (!use_optimized_ATB && transposeA) {
140  transposer_type transposer (rcpFromRef (A));
141  Aprime = transposer.createTranspose (transposeParams);
142  }
143  else {
144  Aprime = rcpFromRef(A);
145  }
146 
147  if (transposeB) {
148  transposer_type transposer (rcpFromRef (B));
149  Bprime = transposer.createTranspose (transposeParams);
150  }
151  else {
152  Bprime = rcpFromRef(B);
153  }
154 
155  // Check size compatibility
156  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
157  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
158  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
159  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
160  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
161  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
162  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
163  prefix << "ERROR, inner dimensions of op(A) and op(B) "
164  "must match for matrix-matrix product. op(A) is "
165  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
166 
167  // The result matrix C must at least have a row-map that reflects the correct
168  // row-size. Don't check the number of columns because rectangular matrices
169  // which were constructed with only one map can still end up having the
170  // correct capacity and dimensions when filled.
171  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
172  prefix << "ERROR, dimensions of result C must "
173  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
174  << " rows, should have at least " << Aouter << std::endl);
175 
176  // It doesn't matter whether C is already Filled or not. If it is already
177  // Filled, it must have space allocated for the positions that will be
178  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
179  // we'll error out later when trying to store result values.
180 
181  // CGB: However, matrix must be in active-fill
182  if (!C.isFillActive()) C.resumeFill();
183 
184  // We're going to need to import remotely-owned sections of A and/or B if
185  // more than one processor is performing this run, depending on the scenario.
186  int numProcs = A.getComm()->getSize();
187 
188  // Declare a couple of structs that will be used to hold views of the data
189  // of A and B, to be used for fast access during the matrix-multiplication.
190  crs_matrix_struct_type Aview;
191  crs_matrix_struct_type Bview;
192 
193  RCP<const map_type> targetMap_A = Aprime->getRowMap();
194  RCP<const map_type> targetMap_B = Bprime->getRowMap();
195 
196  {
197  Tpetra::Details::ProfilingRegion r("TpetraExt: MMM: All I&X");
198 
199  // Now import any needed remote rows and populate the Aview struct
200  // NOTE: We assert that an import isn't needed --- since we do the transpose
201  // above to handle that.
202  if (!use_optimized_ATB) {
203  RCP<const import_type> dummyImporter;
204  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
205  }
206 
207  // We will also need local access to all rows of B that correspond to the
208  // column-map of op(A).
209  if (numProcs > 1)
210  targetMap_B = Aprime->getColMap();
211 
212  // Import any needed remote rows and populate the Bview struct.
213  if (!use_optimized_ATB)
214  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
215 
216  } //stop MM_importExtract here
217 
218  //stop the setup timer, and start the multiply timer
219  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: All Multiply"));
220 
221  // Call the appropriate method to perform the actual multiplication.
222  if (use_optimized_ATB) {
223  MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
224 
225  } else if (call_FillComplete_on_result && newFlag) {
226  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
227 
228  } else if (call_FillComplete_on_result) {
229  MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
230 
231  } else {
232  // mfh 27 Sep 2016: Is this the "slow" case? This
233  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
234  // thread-parallel inserts, but that may take some effort.
235  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
236 
237  MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
238  }
239 
240  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: All FillComplete"));
241  if (call_FillComplete_on_result && !C.isFillComplete()) {
242  // We'll call FillComplete on the C matrix before we exit, and give it a
243  // domain-map and a range-map.
244  // The domain-map will be the domain-map of B, unless
245  // op(B)==transpose(B), in which case the range-map of B will be used.
246  // The range-map will be the range-map of A, unless op(A)==transpose(A),
247  // in which case the domain-map of A will be used.
248  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
249  }
250 }
251 
252 //
253 // This method forms the matrix-matrix product C = op(A) * op(B), where
254 // op(A) == A and similarly for op(B). op(A) = A^T is not yet implemented.
255 //
256 template <class Scalar,
257  class LocalOrdinal,
258  class GlobalOrdinal,
259  class Node>
260 void Multiply(
261  const Teuchos::RCP<const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& A,
262  bool transposeA,
263  const Teuchos::RCP<const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& B,
264  bool transposeB,
266  const std::string& label)
267 {
268  using Teuchos::null;
269  using Teuchos::RCP;
270  using Teuchos::rcp;
271  typedef Scalar SC;
272  typedef LocalOrdinal LO;
273  typedef GlobalOrdinal GO;
274  typedef Node NO;
275  typedef BlockCrsMatrixStruct<SC,LO,GO,NO> blockcrs_matrix_struct_type;
276  typedef Map<LO,GO,NO> map_type;
277  typedef Import<LO,GO,NO> import_type;
278 
279  std::string prefix = std::string("TpetraExt ") + label + std::string(": ");
280 
281  TEUCHOS_TEST_FOR_EXCEPTION(transposeA==true, std::runtime_error, prefix << "Matrix A cannot be transposed.");
282  TEUCHOS_TEST_FOR_EXCEPTION(transposeB==true, std::runtime_error, prefix << "Matrix B cannot be transposed.");
283 
284  // Check size compatibility
285  global_size_t numACols = A->getGlobalNumCols();
286  global_size_t numBCols = B->getGlobalNumCols();
287  global_size_t numARows = A->getGlobalNumRows();
288  global_size_t numBRows = B->getGlobalNumRows();
289 
290  global_size_t Aouter = numARows;
291  global_size_t Bouter = numBCols;
292  global_size_t Ainner = numACols;
293  global_size_t Binner = numBRows;
294  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
295  prefix << "ERROR, inner dimensions of op(A) and op(B) "
296  "must match for matrix-matrix product. op(A) is "
297  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
298 
299  // We're going to need to import remotely-owned sections of A and/or B if
300  // more than one processor is performing this run, depending on the scenario.
301  int numProcs = A->getComm()->getSize();
302 
303  const LO blocksize = A->getBlockSize();
304  TEUCHOS_TEST_FOR_EXCEPTION(blocksize != B->getBlockSize(), std::runtime_error,
305  prefix << "ERROR, Blocksizes do not match. A.blocksize = " <<
306  blocksize << ", B.blocksize = " << B->getBlockSize() );
307 
308  // Declare a couple of structs that will be used to hold views of the data
309  // of A and B, to be used for fast access during the matrix-multiplication.
310  blockcrs_matrix_struct_type Aview(blocksize);
311  blockcrs_matrix_struct_type Bview(blocksize);
312 
313  RCP<const map_type> targetMap_A = A->getRowMap();
314  RCP<const map_type> targetMap_B = B->getRowMap();
315 
316  // Populate the Aview struct. No remotes are needed.
317  RCP<const import_type> dummyImporter;
318  MMdetails::import_and_extract_views(*A, targetMap_A, Aview, dummyImporter, true);
319 
320  // We will also need local access to all rows of B that correspond to the
321  // column-map of op(A).
322  if (numProcs > 1)
323  targetMap_B = A->getColMap();
324 
325  // Import any needed remote rows and populate the Bview struct.
326  MMdetails::import_and_extract_views(*B, targetMap_B, Bview, A->getGraph()->getImporter(),
327  A->getGraph()->getImporter().is_null());
328 
329  // Call the appropriate method to perform the actual multiplication.
330  MMdetails::mult_A_B_newmatrix(Aview, Bview, C);
331 }
332 
333 template <class Scalar,
334  class LocalOrdinal,
335  class GlobalOrdinal,
336  class Node>
337 void Jacobi(Scalar omega,
342  bool call_FillComplete_on_result,
343  const std::string& label,
344  const Teuchos::RCP<Teuchos::ParameterList>& params)
345 {
346  using Teuchos::RCP;
347  using Teuchos::rcp;
348  typedef Scalar SC;
349  typedef LocalOrdinal LO;
350  typedef GlobalOrdinal GO;
351  typedef Node NO;
352  typedef Import<LO,GO,NO> import_type;
353  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
354  typedef Map<LO,GO,NO> map_type;
355  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
356 
357 
358  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: All Setup"));
359 
360  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
361 
362  // A and B should already be Filled.
363  // Should we go ahead and call FillComplete() on them if necessary or error
364  // out? For now, we choose to error out.
365  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
366  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
367 
368  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
369  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
370 
371  // Now check size compatibility
372  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
373  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
374  global_size_t Aouter = A.getGlobalNumRows();
375  global_size_t Bouter = numBCols;
376  global_size_t Ainner = numACols;
377  global_size_t Binner = B.getGlobalNumRows();
378  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
379  prefix << "ERROR, inner dimensions of op(A) and op(B) "
380  "must match for matrix-matrix product. op(A) is "
381  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
382 
383  // The result matrix C must at least have a row-map that reflects the correct
384  // row-size. Don't check the number of columns because rectangular matrices
385  // which were constructed with only one map can still end up having the
386  // correct capacity and dimensions when filled.
387  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
388  prefix << "ERROR, dimensions of result C must "
389  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
390  << " rows, should have at least "<< Aouter << std::endl);
391 
392  // It doesn't matter whether C is already Filled or not. If it is already
393  // Filled, it must have space allocated for the positions that will be
394  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
395  // we'll error out later when trying to store result values.
396 
397  // CGB: However, matrix must be in active-fill
398  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
399 
400  // We're going to need to import remotely-owned sections of A and/or B if
401  // more than one processor is performing this run, depending on the scenario.
402  int numProcs = A.getComm()->getSize();
403 
404  // Declare a couple of structs that will be used to hold views of the data of
405  // A and B, to be used for fast access during the matrix-multiplication.
406  crs_matrix_struct_type Aview;
407  crs_matrix_struct_type Bview;
408 
409  RCP<const map_type> targetMap_A = Aprime->getRowMap();
410  RCP<const map_type> targetMap_B = Bprime->getRowMap();
411 
412  {
413  Tpetra::Details::ProfilingRegion r("TpetraExt: Jacobi: All I&X");
414  // Enable globalConstants by default
415  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
416  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
417  if(!params.is_null()) {
418  importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
419  int mm_optimization_core_count=0;
420  auto slist = params->sublist("matrixmatrix: kernel params",false);
421  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ());
422  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
423  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
424  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
425  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
426  auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
427  ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
428  ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
429  ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
430  }
431 
432  //Now import any needed remote rows and populate the Aview struct.
433  RCP<const import_type> dummyImporter;
434  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
435 
436  // We will also need local access to all rows of B that correspond to the
437  // column-map of op(A).
438  if (numProcs > 1)
439  targetMap_B = Aprime->getColMap();
440 
441  // Now import any needed remote rows and populate the Bview struct.
442  // Enable globalConstants by default
443  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
444  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
445  if(!params.is_null()) {
446  importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
447 
448  auto slist = params->sublist("matrixmatrix: kernel params",false);
449  int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount () );
450  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
451  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
452  auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
453  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
454  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
455  ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
456  ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
457  ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
458  }
459 
460  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
461 
462  }
463  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi All Multiply"));
464 
465  // Now call the appropriate method to perform the actual multiplication.
466  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
467 
468  // Is this a "clean" matrix
469  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
470 
471  if (call_FillComplete_on_result && newFlag) {
472  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
473 
474  } else if (call_FillComplete_on_result) {
475  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
476 
477  } else {
478  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
479  }
480 
481  if(!params.is_null()) {
482  bool removeZeroEntries = params->get("remove zeros", false);
483  if (removeZeroEntries) {
484  typedef Teuchos::ScalarTraits<Scalar> STS;
485  typename STS::magnitudeType threshold = params->get("remove zeros threshold", STS::magnitude(STS::zero()));
486  removeCrsMatrixZeros(C, threshold);
487  }
488  }
489 }
490 
491 
492 template <class Scalar,
493  class LocalOrdinal,
494  class GlobalOrdinal,
495  class Node>
496 void Add(
498  bool transposeA,
499  Scalar scalarA,
501  Scalar scalarB )
502 {
503  using Teuchos::Array;
504  using Teuchos::RCP;
505  using Teuchos::null;
506  typedef Scalar SC;
507  typedef LocalOrdinal LO;
508  typedef GlobalOrdinal GO;
509  typedef Node NO;
510  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
511  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
512 
513  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
514 
515  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
516  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
517  "(Result matrix B is not required to be isFillComplete()).");
518  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
519  prefix << "ERROR, input matrix B must not be fill complete!");
520  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
521  prefix << "ERROR, input matrix B must not have static graph!");
522  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
523  prefix << "ERROR, input matrix B must not be locally indexed!");
524 
525  using Teuchos::ParameterList;
526  RCP<ParameterList> transposeParams (new ParameterList);
527  transposeParams->set ("sort", false);
528 
529  RCP<const crs_matrix_type> Aprime = null;
530  if (transposeA) {
531  transposer_type transposer (rcpFromRef (A));
532  Aprime = transposer.createTranspose (transposeParams);
533  }
534  else {
535  Aprime = rcpFromRef(A);
536  }
537 
538  size_t a_numEntries;
539  typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds",A.getLocalMaxNumRowEntries());
540  typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals",A.getLocalMaxNumRowEntries());
541  GO row;
542 
543  if (scalarB != Teuchos::ScalarTraits<SC>::one())
544  B.scale(scalarB);
545 
546  size_t numMyRows = B.getLocalNumRows();
547  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
548  for (LO i = 0; (size_t)i < numMyRows; ++i) {
549  row = B.getRowMap()->getGlobalElement(i);
550  Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
551 
552  if (scalarA != Teuchos::ScalarTraits<SC>::one()) {
553  for (size_t j = 0; j < a_numEntries; ++j)
554  a_vals[j] *= scalarA;
555  }
556  B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
557  }
558  }
559 }
560 
561 template <class Scalar,
562  class LocalOrdinal,
563  class GlobalOrdinal,
564  class Node>
565 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
566 add (const Scalar& alpha,
567  const bool transposeA,
569  const Scalar& beta,
570  const bool transposeB,
572  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
573  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
574  const Teuchos::RCP<Teuchos::ParameterList>& params)
575 {
576  using Teuchos::RCP;
577  using Teuchos::rcpFromRef;
578  using Teuchos::rcp;
579  using Teuchos::ParameterList;
581  if(!params.is_null())
582  {
583  TEUCHOS_TEST_FOR_EXCEPTION(
584  params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
585  std::invalid_argument,
586  "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
587  "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
588  params->set("Call fillComplete", true);
589  }
590  //If transposeB, must compute B's explicit transpose to
591  //get the correct row map for C.
592  RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
593  if(transposeB)
594  {
596  Brcp = transposer.createTranspose();
597  }
598  //Check that A,B are fillComplete before getting B's column map
599  TEUCHOS_TEST_FOR_EXCEPTION
600  (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
601  "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
602  RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
603  //this version of add() always fill completes the result, no matter what is in params on input
604  add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
605  return C;
606 }
607 
608 //This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
609 //but since the spadd() output is always packed there is no need for a separate
610 //numRowEntries here.
611 //
612 template<class LO, class GO, class LOView, class GOView, class LocalMap>
613 struct ConvertGlobalToLocalFunctor
614 {
615  ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
616  : lids(lids_), gids(gids_), localColMap(localColMap_)
617  {}
618 
619  KOKKOS_FUNCTION void operator() (const GO i) const
620  {
621  lids(i) = localColMap.getLocalElement(gids(i));
622  }
623 
624  LOView lids;
625  const GOView gids;
626  const LocalMap localColMap;
627 };
628 
629 template <class Scalar,
630  class LocalOrdinal,
631  class GlobalOrdinal,
632  class Node>
633 void
634 add (const Scalar& alpha,
635  const bool transposeA,
637  const Scalar& beta,
638  const bool transposeB,
641  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
642  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
643  const Teuchos::RCP<Teuchos::ParameterList>& params)
644 {
645  using Teuchos::RCP;
646  using Teuchos::rcp;
647  using Teuchos::rcpFromRef;
648  using Teuchos::rcp_implicit_cast;
649  using Teuchos::rcp_dynamic_cast;
650  using Teuchos::TimeMonitor;
651  using SC = Scalar;
652  using LO = LocalOrdinal;
653  using GO = GlobalOrdinal;
654  using NO = Node;
655  using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
656  using crs_graph_type = CrsGraph<LO,GO,NO>;
657  using map_type = Map<LO,GO,NO>;
658  using transposer_type = RowMatrixTransposer<SC,LO,GO,NO>;
659  using import_type = Import<LO,GO,NO>;
660  using export_type = Export<LO,GO,NO>;
661  using exec_space = typename crs_graph_type::execution_space;
662  using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
663  const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
664  constexpr bool debug = false;
665 
666  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: Transpose"));
667 
668  if (debug) {
669  std::ostringstream os;
670  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
671  << "TpetraExt::MatrixMatrix::add" << std::endl;
672  std::cerr << os.str ();
673  }
674 
675  TEUCHOS_TEST_FOR_EXCEPTION
676  (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
677  prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
678  TEUCHOS_TEST_FOR_EXCEPTION
679  (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
680  prefix_mmm << "A and B must both be fill complete.");
681 #ifdef HAVE_TPETRA_DEBUG
682  // The matrices don't have domain or range Maps unless they are fill complete.
683  if (A.isFillComplete () && B.isFillComplete ()) {
684  const bool domainMapsSame =
685  (! transposeA && ! transposeB &&
686  ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
687  (! transposeA && transposeB &&
688  ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
689  ( transposeA && ! transposeB &&
690  ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
691  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
692  prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
693 
694  const bool rangeMapsSame =
695  (! transposeA && ! transposeB &&
696  ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
697  (! transposeA && transposeB &&
698  ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
699  ( transposeA && ! transposeB &&
700  ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
701  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
702  prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
703  }
704 #endif // HAVE_TPETRA_DEBUG
705 
706  using Teuchos::ParameterList;
707  // Form the explicit transpose of A if necessary.
708  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
709  if (transposeA) {
710  transposer_type transposer (Aprime);
711  Aprime = transposer.createTranspose ();
712  }
713 
714 #ifdef HAVE_TPETRA_DEBUG
715  TEUCHOS_TEST_FOR_EXCEPTION
716  (Aprime.is_null (), std::logic_error,
717  prefix_mmm << "Failed to compute Op(A). "
718  "Please report this bug to the Tpetra developers.");
719 #endif // HAVE_TPETRA_DEBUG
720 
721  // Form the explicit transpose of B if necessary.
722  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
723  if (transposeB) {
724  if (debug) {
725  std::ostringstream os;
726  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
727  << "Form explicit xpose of B" << std::endl;
728  std::cerr << os.str ();
729  }
730  transposer_type transposer (Bprime);
731  Bprime = transposer.createTranspose ();
732  }
733 #ifdef HAVE_TPETRA_DEBUG
734  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
735  prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
736  TEUCHOS_TEST_FOR_EXCEPTION(
737  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
738  prefix_mmm << "Aprime and Bprime must both be fill complete. "
739  "Please report this bug to the Tpetra developers.");
740 #endif // HAVE_TPETRA_DEBUG
741  RCP<const map_type> CDomainMap = domainMap;
742  RCP<const map_type> CRangeMap = rangeMap;
743  if(CDomainMap.is_null())
744  {
745  CDomainMap = Bprime->getDomainMap();
746  }
747  if(CRangeMap.is_null())
748  {
749  CRangeMap = Bprime->getRangeMap();
750  }
751  assert(!(CDomainMap.is_null()));
752  assert(!(CRangeMap.is_null()));
753  typedef typename AddKern::values_array values_array;
754  typedef typename AddKern::row_ptrs_array row_ptrs_array;
755  typedef typename AddKern::col_inds_array col_inds_array;
756  bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
757  bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
758  values_array vals;
759  row_ptrs_array rowptrs;
760  col_inds_array colinds;
761 
762  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: rowmap check/import"));
763 
764  if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
765  {
766  //import Aprime into Bprime's row map so the local matrices have same # of rows
767  auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
768  // cbl do not set
769  // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
770  // this import _may_ take the form of a transfer. In practice it would be unlikely,
771  // but the general case is not so forgiving.
772  Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
773  }
774  bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
775  bool sorted = AGraphSorted && BGraphSorted;
776  RCP<const import_type> Cimport = Teuchos::null;
777  RCP<export_type> Cexport = Teuchos::null;
778  bool doFillComplete = true;
779  if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
780  {
781  doFillComplete = params->get<bool>("Call fillComplete");
782  }
783  auto Alocal = Aprime->getLocalMatrixDevice();
784  auto Blocal = Bprime->getLocalMatrixDevice();
785  LO numLocalRows = Alocal.numRows();
786  if(numLocalRows == 0)
787  {
788  //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
789  //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
790  //Handle this case now
791  //(without interfering with collective operations, since it's possible for
792  //some ranks to have 0 local rows and others not).
793  rowptrs = row_ptrs_array("C rowptrs", 0);
794  }
795  auto Acolmap = Aprime->getColMap();
796  auto Bcolmap = Bprime->getColMap();
797  if(!matchingColMaps)
798  {
799  using global_col_inds_array = typename AddKern::global_col_inds_array;
800  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: mismatched col map full kernel"));
801 
802  //use kernel that converts col indices in both A and B to common domain map before adding
803  auto AlocalColmap = Acolmap->getLocalMap();
804  auto BlocalColmap = Bcolmap->getLocalMap();
805  global_col_inds_array globalColinds;
806  if (debug) {
807  std::ostringstream os;
808  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
809  << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
810  std::cerr << os.str ();
811  }
812  AddKern::convertToGlobalAndAdd(
813  Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
814  vals, rowptrs, globalColinds);
815  if (debug) {
816  std::ostringstream os;
817  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
818  << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
819  std::cerr << os.str ();
820  }
821  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: Constructing graph"));
822 
823  RCP<const map_type> CcolMap;
824  Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
825  (CcolMap, CDomainMap, globalColinds);
826  C.replaceColMap(CcolMap);
827  col_inds_array localColinds("C colinds", globalColinds.extent(0));
828  Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
829  ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
830  col_inds_array, global_col_inds_array,
831  typename map_type::local_map_type>
832  (localColinds, globalColinds, CcolMap->getLocalMap()));
833  KokkosSparse::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
834  C.setAllValues(rowptrs, localColinds, vals);
835  C.fillComplete(CDomainMap, CRangeMap, params);
836  if(!doFillComplete)
837  C.resumeFill();
838  }
839  else
840  {
841  //Aprime, Bprime and C all have the same column maps
842  auto Avals = Alocal.values;
843  auto Bvals = Blocal.values;
844  auto Arowptrs = Alocal.graph.row_map;
845  auto Browptrs = Blocal.graph.row_map;
846  auto Acolinds = Alocal.graph.entries;
847  auto Bcolinds = Blocal.graph.entries;
848  if(sorted)
849  {
850  //use sorted kernel
851  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: sorted entries full kernel"));
852  if (debug) {
853  std::ostringstream os;
854  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
855  << "Call AddKern::addSorted(...)" << std::endl;
856  std::cerr << os.str ();
857  }
858  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
859  }
860  else
861  {
862  //use unsorted kernel
863  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: unsorted entries full kernel"));
864 
865  if (debug) {
866  std::ostringstream os;
867  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
868  << "Call AddKern::addUnsorted(...)" << std::endl;
869  std::cerr << os.str ();
870  }
871  AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
872  }
873  //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
874  RCP<const map_type> Ccolmap = Bcolmap;
875  C.replaceColMap(Ccolmap);
876  C.setAllValues(rowptrs, colinds, vals);
877  if(doFillComplete)
878  {
879  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: expertStaticFillComplete"));
880  if(!CDomainMap->isSameAs(*Ccolmap))
881  {
882  if (debug) {
883  std::ostringstream os;
884  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
885  << "Create Cimport" << std::endl;
886  std::cerr << os.str ();
887  }
888  Cimport = rcp(new import_type(CDomainMap, Ccolmap));
889  }
890  if(!C.getRowMap()->isSameAs(*CRangeMap))
891  {
892  if (debug) {
893  std::ostringstream os;
894  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
895  << "Create Cexport" << std::endl;
896  std::cerr << os.str ();
897  }
898  Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
899  }
900 
901  if (debug) {
902  std::ostringstream os;
903  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
904  << "Call C->expertStaticFillComplete(...)" << std::endl;
905  std::cerr << os.str ();
906  }
907  C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
908  }
909  }
910 }
911 
912 // This version of Add takes C as RCP&, so C may be null on input (in this case,
913 // it is allocated and constructed in this function).
914 template <class Scalar,
915  class LocalOrdinal,
916  class GlobalOrdinal,
917  class Node>
918 void Add(
920  bool transposeA,
921  Scalar scalarA,
923  bool transposeB,
924  Scalar scalarB,
926 {
927  using Teuchos::Array;
928  using Teuchos::ArrayRCP;
929  using Teuchos::ArrayView;
930  using Teuchos::RCP;
931  using Teuchos::rcp;
932  using Teuchos::rcp_dynamic_cast;
933  using Teuchos::rcpFromRef;
934  using Teuchos::tuple;
935  using std::endl;
936  // typedef typename ArrayView<const Scalar>::size_type size_type;
937  typedef Teuchos::ScalarTraits<Scalar> STS;
939  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
940  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
941  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
944 
945  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
946 
947  TEUCHOS_TEST_FOR_EXCEPTION(
948  ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
949  prefix << "A and B must both be fill complete before calling this function.");
950 
951  if(C.is_null()) {
952  TEUCHOS_TEST_FOR_EXCEPTION(!A.haveGlobalConstants(), std::logic_error,
953  prefix << "C is null (must be allocated), but A.haveGlobalConstants() is false. "
954  "Please report this bug to the Tpetra developers.");
955  TEUCHOS_TEST_FOR_EXCEPTION(!B.haveGlobalConstants(), std::logic_error,
956  prefix << "C is null (must be allocated), but B.haveGlobalConstants() is false. "
957  "Please report this bug to the Tpetra developers.");
958  }
959 
960 #ifdef HAVE_TPETRA_DEBUG
961  {
962  const bool domainMapsSame =
963  (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
964  (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
965  (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
966  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
967  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
968 
969  const bool rangeMapsSame =
970  (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
971  (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
972  (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
973  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
974  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
975  }
976 #endif // HAVE_TPETRA_DEBUG
977 
978  using Teuchos::ParameterList;
979  RCP<ParameterList> transposeParams (new ParameterList);
980  transposeParams->set ("sort", false);
981 
982  // Form the explicit transpose of A if necessary.
983  RCP<const crs_matrix_type> Aprime;
984  if (transposeA) {
985  transposer_type theTransposer (rcpFromRef (A));
986  Aprime = theTransposer.createTranspose (transposeParams);
987  }
988  else {
989  Aprime = rcpFromRef (A);
990  }
991 
992 #ifdef HAVE_TPETRA_DEBUG
993  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
994  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
995 #endif // HAVE_TPETRA_DEBUG
996 
997  // Form the explicit transpose of B if necessary.
998  RCP<const crs_matrix_type> Bprime;
999  if (transposeB) {
1000  transposer_type theTransposer (rcpFromRef (B));
1001  Bprime = theTransposer.createTranspose (transposeParams);
1002  }
1003  else {
1004  Bprime = rcpFromRef (B);
1005  }
1006 
1007 #ifdef HAVE_TPETRA_DEBUG
1008  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1009  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
1010 #endif // HAVE_TPETRA_DEBUG
1011 
1012  bool CwasFillComplete = false;
1013 
1014  // Allocate or zero the entries of the result matrix.
1015  if (! C.is_null ()) {
1016  CwasFillComplete = C->isFillComplete();
1017  if(CwasFillComplete)
1018  C->resumeFill();
1019  C->setAllToScalar (STS::zero ());
1020  } else {
1021  // FIXME (mfh 08 May 2013) When I first looked at this method, I
1022  // noticed that C was being given the row Map of Aprime (the
1023  // possibly transposed version of A). Is this what we want?
1024 
1025  // It is a precondition that Aprime and Bprime have the same domain and range maps.
1026  // However, they may have different row maps. In this case, it's difficult to
1027  // get a precise upper bound on the number of entries in each local row of C, so
1028  // just use the looser upper bound based on the max number of entries in any row of Aprime and Bprime.
1029  if(Aprime->getRowMap()->isSameAs(*Bprime->getRowMap())) {
1030  LocalOrdinal numLocalRows = Aprime->getLocalNumRows();
1031  Array<size_t> CmaxEntriesPerRow(numLocalRows);
1032  for(LocalOrdinal i = 0; i < numLocalRows; i++) {
1033  CmaxEntriesPerRow[i] = Aprime->getNumEntriesInLocalRow(i) + Bprime->getNumEntriesInLocalRow(i);
1034  }
1035  C = rcp (new crs_matrix_type (Aprime->getRowMap (), CmaxEntriesPerRow()));
1036  }
1037  else {
1038  // Note: above we checked that Aprime and Bprime have global constants, so it's safe to ask for max entries per row.
1039  C = rcp (new crs_matrix_type (Aprime->getRowMap (), Aprime->getGlobalMaxNumRowEntries() + Bprime->getGlobalMaxNumRowEntries()));
1040  }
1041  }
1042 
1043 #ifdef HAVE_TPETRA_DEBUG
1044  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1045  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1046  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1047  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1048  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1049  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1050 #endif // HAVE_TPETRA_DEBUG
1051 
1052  Array<RCP<const crs_matrix_type> > Mat =
1053  tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1054  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1055 
1056  // do a loop over each matrix to add: A reordering might be more efficient
1057  for (int k = 0; k < 2; ++k) {
1058  typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1059  typename crs_matrix_type::nonconst_values_host_view_type Values;
1060 
1061  // Loop over each locally owned row of the current matrix (either
1062  // Aprime or Bprime), and sum its entries into the corresponding
1063  // row of C. This works regardless of whether Aprime or Bprime
1064  // has the same row Map as C, because both sumIntoGlobalValues and
1065  // insertGlobalValues allow summing resp. inserting into nonowned
1066  // rows of C.
1067 #ifdef HAVE_TPETRA_DEBUG
1068  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1069  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1070 #endif // HAVE_TPETRA_DEBUG
1071  RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1072 #ifdef HAVE_TPETRA_DEBUG
1073  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1074  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1075 #endif // HAVE_TPETRA_DEBUG
1076 
1077  const size_t localNumRows = Mat[k]->getLocalNumRows ();
1078  for (size_t i = 0; i < localNumRows; ++i) {
1079  const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1080  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1081  if (numEntries > 0) {
1082  if(numEntries > Indices.extent(0)) {
1083  Kokkos::resize(Indices, numEntries);
1084  Kokkos::resize(Values, numEntries);
1085  }
1086  Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1087 
1088  if (scalar[k] != STS::one ()) {
1089  for (size_t j = 0; j < numEntries; ++j) {
1090  Values[j] *= scalar[k];
1091  }
1092  }
1093 
1094  if (CwasFillComplete) {
1095  size_t result = C->sumIntoGlobalValues (globalRow, numEntries,
1096  reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1097  TEUCHOS_TEST_FOR_EXCEPTION(result != numEntries, std::logic_error,
1098  prefix << "sumIntoGlobalValues failed to add entries from A or B into C.");
1099  } else {
1100  C->insertGlobalValues (globalRow, numEntries,
1101  reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1102  }
1103  }
1104  }
1105  }
1106  if(CwasFillComplete) {
1107  C->fillComplete(C->getDomainMap (),
1108  C->getRangeMap ());
1109  }
1110 }
1111 
1112 // This version of Add takes C as const RCP&, so C must not be null on input. Otherwise, its behavior is identical
1113 // to the above version where C is RCP&.
1114 template <class Scalar,
1115  class LocalOrdinal,
1116  class GlobalOrdinal,
1117  class Node>
1118 void Add(
1120  bool transposeA,
1121  Scalar scalarA,
1123  bool transposeB,
1124  Scalar scalarB,
1126 {
1127  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
1128 
1129  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::invalid_argument,
1130  prefix << "C must not be null");
1131 
1132  Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > C_ = C;
1133  Add(A, transposeA, scalarA, B, transposeB, scalarB, C_);
1134 }
1135 
1136 } //End namespace MatrixMatrix
1137 
1138 namespace MMdetails{
1139 
1140 /*********************************************************************************************************/
1142 //template <class TransferType>
1143 //void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1144 // if (Transfer.is_null())
1145 // return;
1146 //
1147 // const Distributor & Distor = Transfer->getDistributor();
1148 // Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1149 //
1150 // size_t rows_send = Transfer->getNumExportIDs();
1151 // size_t rows_recv = Transfer->getNumRemoteIDs();
1152 //
1153 // size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1154 // size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1155 // size_t num_send_neighbors = Distor.getNumSends();
1156 // size_t num_recv_neighbors = Distor.getNumReceives();
1157 // size_t round2_send, round2_recv;
1158 // Distor.getLastDoStatistics(round2_send,round2_recv);
1159 //
1160 // int myPID = Comm->getRank();
1161 // int NumProcs = Comm->getSize();
1162 //
1163 // // Processor by processor statistics
1164 // // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1165 // // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1166 //
1167 // // Global statistics
1168 // size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1169 // size_t gstats_min[8], gstats_max[8];
1170 //
1171 // double lstats_avg[8], gstats_avg[8];
1172 // for(int i=0; i<8; i++)
1173 // lstats_avg[i] = ((double)lstats[i])/NumProcs;
1174 //
1175 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1176 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1177 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1178 //
1179 // if(!myPID) {
1180 // printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1181 // (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1182 // (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1183 // printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1184 // (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1185 // (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1186 // }
1187 //}
1188 
1189 // Kernel method for computing the local portion of C = A*B
1190 template<class Scalar,
1191  class LocalOrdinal,
1192  class GlobalOrdinal,
1193  class Node>
1194 void mult_AT_B_newmatrix(
1195  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1196  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1197  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1198  const std::string & label,
1199  const Teuchos::RCP<Teuchos::ParameterList>& params)
1200 {
1201  using Teuchos::RCP;
1202  using Teuchos::rcp;
1203  typedef Scalar SC;
1204  typedef LocalOrdinal LO;
1205  typedef GlobalOrdinal GO;
1206  typedef Node NO;
1207  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1208  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1209 
1210 
1211  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: Transpose"));
1212 
1213  /*************************************************************/
1214  /* 1) Local Transpose of A */
1215  /*************************************************************/
1216  transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1217 
1218  using Teuchos::ParameterList;
1219  RCP<ParameterList> transposeParams (new ParameterList);
1220  transposeParams->set ("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
1221  if(! params.is_null ()) {
1222  transposeParams->set ("compute global constants",
1223  params->get ("compute global constants: temporaries",
1224  false));
1225  }
1226  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1227  transposer.createTransposeLocal (transposeParams);
1228 
1229  /*************************************************************/
1230  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1231  /*************************************************************/
1232  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: I&X"));
1233 
1234  // Get views, asserting that no import is required to speed up computation
1235  crs_matrix_struct_type Aview;
1236  crs_matrix_struct_type Bview;
1237  RCP<const Import<LO, GO, NO> > dummyImporter;
1238 
1239  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1240  RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1241  if (! params.is_null ()) {
1242  importParams1->set ("compute global constants",
1243  params->get ("compute global constants: temporaries",
1244  false));
1245  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1246  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1247  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1248  int mm_optimization_core_count =
1250  mm_optimization_core_count =
1251  slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1252  int mm_optimization_core_count2 =
1253  params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1254  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1255  mm_optimization_core_count = mm_optimization_core_count2;
1256  }
1257  auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1258  sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1259  sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1260  sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1261  }
1262 
1263  MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1264  Aview, dummyImporter, true,
1265  label, importParams1);
1266 
1267  RCP<ParameterList> importParams2 (new ParameterList);
1268  if (! params.is_null ()) {
1269  importParams2->set ("compute global constants",
1270  params->get ("compute global constants: temporaries",
1271  false));
1272  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1273  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1274  bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1275  int mm_optimization_core_count =
1277  mm_optimization_core_count =
1278  slist.get ("MM_TAFC_OptimizationCoreCount",
1279  mm_optimization_core_count);
1280  int mm_optimization_core_count2 =
1281  params->get ("MM_TAFC_OptimizationCoreCount",
1282  mm_optimization_core_count);
1283  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1284  mm_optimization_core_count = mm_optimization_core_count2;
1285  }
1286  auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1287  sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1288  sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1289  sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1290  }
1291 
1292  if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1293  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1294  }
1295  else {
1296  MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1297  }
1298 
1299  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: AB-core"));
1300 
1301  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1302 
1303  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1304  bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1305  if (needs_final_export) {
1306  Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1307  }
1308  else {
1309  Ctemp = rcp (&C, false);
1310  }
1311 
1312  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1313 
1314  /*************************************************************/
1315  /* 4) exportAndFillComplete matrix */
1316  /*************************************************************/
1317  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: exportAndFillComplete"));
1318 
1319  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1320 
1321  if (needs_final_export) {
1322  ParameterList labelList;
1323  labelList.set("Timer Label", label);
1324  if(!params.is_null()) {
1325  ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1326  ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1327  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1328  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1329  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1330  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1331  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1332  bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1333  bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1334 
1335  labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1336  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1337  labelList.set("compute global constants",params->get("compute global constants",true));
1338  labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1339  }
1340  Ctemp->exportAndFillComplete (Crcp,
1341  *Ctemp->getGraph ()->getExporter (),
1342  B.getDomainMap (),
1343  A.getDomainMap (),
1344  rcp (&labelList, false));
1345  }
1346 #ifdef HAVE_TPETRA_MMM_STATISTICS
1347  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1348 #endif
1349 }
1350 
1351 /*********************************************************************************************************/
1352 // Kernel method for computing the local portion of C = A*B
1353 template<class Scalar,
1354  class LocalOrdinal,
1355  class GlobalOrdinal,
1356  class Node>
1357 void mult_A_B(
1358  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1359  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1360  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1361  const std::string& /* label */,
1362  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1363 {
1364  using Teuchos::Array;
1365  using Teuchos::ArrayRCP;
1366  using Teuchos::ArrayView;
1367  using Teuchos::OrdinalTraits;
1368  using Teuchos::null;
1369 
1370  typedef Teuchos::ScalarTraits<Scalar> STS;
1371  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1372  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1373  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1374 
1375  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1376  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1377 
1378  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getLocalElementList();
1379  ArrayView<const GlobalOrdinal> bcols_import = null;
1380  if (Bview.importColMap != null) {
1381  C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1382  C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1383 
1384  bcols_import = Bview.importColMap->getLocalElementList();
1385  }
1386 
1387  size_t C_numCols = C_lastCol - C_firstCol +
1388  OrdinalTraits<LocalOrdinal>::one();
1389  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1390  OrdinalTraits<LocalOrdinal>::one();
1391 
1392  if (C_numCols_import > C_numCols)
1393  C_numCols = C_numCols_import;
1394 
1395  Array<Scalar> dwork = Array<Scalar>(C_numCols);
1396  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1397  Array<size_t> iwork2 = Array<size_t>(C_numCols);
1398 
1399  Array<Scalar> C_row_i = dwork;
1400  Array<GlobalOrdinal> C_cols = iwork;
1401  Array<size_t> c_index = iwork2;
1402  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1403  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1404 
1405  size_t C_row_i_length, j, k, last_index;
1406 
1407  // Run through all the hash table lookups once and for all
1408  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1409  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1410  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1411  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1412  // Maps are the same: Use local IDs as the hash
1413  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1414  Aview.colMap->getMaxLocalIndex(); i++)
1415  Acol2Brow[i]=i;
1416  }
1417  else {
1418  // Maps are not the same: Use the map's hash
1419  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1420  Aview.colMap->getMaxLocalIndex(); i++) {
1421  GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1422  LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1423  if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1424  else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1425  }
1426  }
1427 
1428  // To form C = A*B we're going to execute this expression:
1429  //
1430  // C(i,j) = sum_k( A(i,k)*B(k,j) )
1431  //
1432  // Our goal, of course, is to navigate the data in A and B once, without
1433  // performing searches for column-indices, etc.
1434  auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1435  auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1436  auto Avals = Aview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1437  auto Browptr = Bview.origMatrix->getLocalRowPtrsHost();
1438  auto Bcolind = Bview.origMatrix->getLocalIndicesHost();
1439  auto Bvals = Bview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1440  decltype(Browptr) Irowptr;
1441  decltype(Bcolind) Icolind;
1442  decltype(Bvals) Ivals;
1443  if(!Bview.importMatrix.is_null()) {
1444  Irowptr = Bview.importMatrix->getLocalRowPtrsHost();
1445  Icolind = Bview.importMatrix->getLocalIndicesHost();
1446  Ivals = Bview.importMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1447  }
1448 
1449  bool C_filled = C.isFillComplete();
1450 
1451  for (size_t i = 0; i < C_numCols; i++)
1452  c_index[i] = OrdinalTraits<size_t>::invalid();
1453 
1454  // Loop over the rows of A.
1455  size_t Arows = Aview.rowMap->getLocalNumElements();
1456  for(size_t i=0; i<Arows; ++i) {
1457 
1458  // Only navigate the local portion of Aview... which is, thankfully, all of
1459  // A since this routine doesn't do transpose modes
1460  GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1461 
1462  // Loop across the i-th row of A and for each corresponding row in B, loop
1463  // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1464  // quantities C_row_i. In other words, as we stride across B(k,:) we're
1465  // calculating updates for row i of the result matrix C.
1466  C_row_i_length = OrdinalTraits<size_t>::zero();
1467 
1468  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1469  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1470  const Scalar Aval = Avals[k];
1471  if (Aval == STS::zero())
1472  continue;
1473 
1474  if (Ak == LO_INVALID)
1475  continue;
1476 
1477  for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1478  LocalOrdinal col = Bcolind[j];
1479  //assert(col >= 0 && col < C_numCols);
1480 
1481  if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1482  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1483  // This has to be a += so insertGlobalValue goes out
1484  C_row_i[C_row_i_length] = Aval*Bvals[j];
1485  C_cols[C_row_i_length] = col;
1486  c_index[col] = C_row_i_length;
1487  C_row_i_length++;
1488 
1489  } else {
1490  // static cast from impl_scalar_type to Scalar needed for complex
1491  C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Bvals[j]);
1492  }
1493  }
1494  }
1495 
1496  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1497  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1498  C_cols[ii] = bcols[C_cols[ii]];
1499  combined_index[ii] = C_cols[ii];
1500  combined_values[ii] = C_row_i[ii];
1501  }
1502  last_index = C_row_i_length;
1503 
1504  //
1505  //Now put the C_row_i values into C.
1506  //
1507  // We might have to revamp this later.
1508  C_row_i_length = OrdinalTraits<size_t>::zero();
1509 
1510  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1511  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1512  const Scalar Aval = Avals[k];
1513  if (Aval == STS::zero())
1514  continue;
1515 
1516  if (Ak!=LO_INVALID) continue;
1517 
1518  Ak = Acol2Irow[Acolind[k]];
1519  for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1520  LocalOrdinal col = Icolind[j];
1521  //assert(col >= 0 && col < C_numCols);
1522 
1523  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1524  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1525  // This has to be a += so insertGlobalValue goes out
1526  C_row_i[C_row_i_length] = Aval*Ivals[j];
1527  C_cols[C_row_i_length] = col;
1528  c_index[col] = C_row_i_length;
1529  C_row_i_length++;
1530 
1531  } else {
1532  // This has to be a += so insertGlobalValue goes out
1533  // static cast from impl_scalar_type to Scalar needed for complex
1534  C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Ivals[j]);
1535  }
1536  }
1537  }
1538 
1539  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1540  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1541  C_cols[ii] = bcols_import[C_cols[ii]];
1542  combined_index[last_index] = C_cols[ii];
1543  combined_values[last_index] = C_row_i[ii];
1544  last_index++;
1545  }
1546 
1547  // Now put the C_row_i values into C.
1548  // We might have to revamp this later.
1549  C_filled ?
1550  C.sumIntoGlobalValues(
1551  global_row,
1552  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1553  combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1554  :
1555  C.insertGlobalValues(
1556  global_row,
1557  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1558  combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1559 
1560  }
1561 }
1562 
1563 /*********************************************************************************************************/
1564 template<class Scalar,
1565  class LocalOrdinal,
1566  class GlobalOrdinal,
1567  class Node>
1568 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1569  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1570  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1571 
1572  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1573  Mview.maxNumRowEntries = Mview.indices[0].size();
1574 
1575  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1576  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1577  Mview.maxNumRowEntries = Mview.indices[i].size();
1578  }
1579 }
1580 
1581 /*********************************************************************************************************/
1582 template<class CrsMatrixType>
1583 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1584  // Follows the NZ estimate in ML's ml_matmatmult.c
1585  size_t Aest = 100, Best=100;
1586  if (A.getLocalNumEntries() >= A.getLocalNumRows())
1587  Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
1588  if (B.getLocalNumEntries() >= B.getLocalNumRows())
1589  Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
1590 
1591  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1592  nnzperrow *= nnzperrow;
1593 
1594  return (size_t)(A.getLocalNumRows()*nnzperrow*0.75 + 100);
1595 }
1596 
1597 
1598 /*********************************************************************************************************/
1599 // Kernel method for computing the local portion of C = A*B for CrsMatrix
1600 //
1601 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1602 // function, so this is probably the function we want to
1603 // thread-parallelize.
1604 template<class Scalar,
1605  class LocalOrdinal,
1606  class GlobalOrdinal,
1607  class Node>
1608 void mult_A_B_newmatrix(
1609  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1610  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1611  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1612  const std::string& label,
1613  const Teuchos::RCP<Teuchos::ParameterList>& params)
1614 {
1615  using Teuchos::Array;
1616  using Teuchos::ArrayRCP;
1617  using Teuchos::ArrayView;
1618  using Teuchos::RCP;
1619  using Teuchos::rcp;
1620 
1621  // Tpetra typedefs
1622  typedef LocalOrdinal LO;
1623  typedef GlobalOrdinal GO;
1624  typedef Node NO;
1625  typedef Import<LO,GO,NO> import_type;
1626  typedef Map<LO,GO,NO> map_type;
1627 
1628  // Kokkos typedefs
1629  typedef typename map_type::local_map_type local_map_type;
1631  typedef typename KCRS::StaticCrsGraphType graph_t;
1632  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1633  typedef typename NO::execution_space execution_space;
1634  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1635  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1636 
1637  Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: M5 Cmap");
1638 
1639  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1640 
1641  // Build the final importer / column map, hash table lookups for C
1642  RCP<const import_type> Cimport;
1643  RCP<const map_type> Ccolmap;
1644  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1645  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1646  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1647  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1648  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1649  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1650  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1651  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1652 
1653  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1654  // indices of B, to local column indices of C. (B and C have the
1655  // same number of columns.) The kernel uses this, instead of
1656  // copying the entire input matrix B and converting its column
1657  // indices to those of C.
1658  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1659 
1660  if (Bview.importMatrix.is_null()) {
1661  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1662  Cimport = Bimport;
1663  Ccolmap = Bview.colMap;
1664  const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1665  // Bcol2Ccol is trivial
1666  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1667  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1668  KOKKOS_LAMBDA(const LO i) {
1669  Bcol2Ccol(i) = i;
1670  });
1671  }
1672  else {
1673  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1674  // column Map of C, as well as C's Import object (from its domain
1675  // Map to its column Map). C's column Map is the union of the
1676  // column Maps of (the local part of) B, and the "remote" part of
1677  // B. Ditto for the Import. We have optimized this "setUnion"
1678  // operation on Import objects and Maps.
1679 
1680  // Choose the right variant of setUnion
1681  if (!Bimport.is_null() && !Iimport.is_null()) {
1682  Cimport = Bimport->setUnion(*Iimport);
1683  }
1684  else if (!Bimport.is_null() && Iimport.is_null()) {
1685  Cimport = Bimport->setUnion();
1686  }
1687  else if (Bimport.is_null() && !Iimport.is_null()) {
1688  Cimport = Iimport->setUnion();
1689  }
1690  else {
1691  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1692  }
1693  Ccolmap = Cimport->getTargetMap();
1694 
1695  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1696  // in general. We should get rid of it in order to reduce
1697  // communication costs of sparse matrix-matrix multiply.
1698  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1699  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1700 
1701  // NOTE: This is not efficient and should be folded into setUnion
1702  //
1703  // mfh 27 Sep 2016: What the above comment means, is that the
1704  // setUnion operation on Import objects could also compute these
1705  // local index - to - local index look-up tables.
1706  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1707  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1708  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1709  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1710  });
1711  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1712  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1713  });
1714 
1715  }
1716 
1717  // Replace the column map
1718  //
1719  // mfh 27 Sep 2016: We do this because C was originally created
1720  // without a column Map. Now we have its column Map.
1721  C.replaceColMap(Ccolmap);
1722 
1723  // mfh 27 Sep 2016: Construct tables that map from local column
1724  // indices of A, to local row indices of either B_local (the locally
1725  // owned part of B), or B_remote (the "imported" remote part of B).
1726  //
1727  // For column index Aik in row i of A, if the corresponding row of B
1728  // exists in the local part of B ("orig") (which I'll call B_local),
1729  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1730  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1731  //
1732  // For column index Aik in row i of A, if the corresponding row of B
1733  // exists in the remote part of B ("Import") (which I'll call
1734  // B_remote), then targetMapToImportRow[Aik] is the local index of
1735  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1736  // (a flag value).
1737 
1738  // Run through all the hash table lookups once and for all
1739  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
1740  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
1741 
1742  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1743  GO aidx = Acolmap_local.getGlobalElement(i);
1744  LO B_LID = Browmap_local.getLocalElement(aidx);
1745  if (B_LID != LO_INVALID) {
1746  targetMapToOrigRow(i) = B_LID;
1747  targetMapToImportRow(i) = LO_INVALID;
1748  } else {
1749  LO I_LID = Irowmap_local.getLocalElement(aidx);
1750  targetMapToOrigRow(i) = LO_INVALID;
1751  targetMapToImportRow(i) = I_LID;
1752 
1753  }
1754  });
1755 
1756  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1757  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1758  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1759 
1760 }
1761 
1762 /*********************************************************************************************************/
1763 // Kernel method for computing the local portion of C = A*B for BlockCrsMatrix
1764 template<class Scalar,
1765  class LocalOrdinal,
1766  class GlobalOrdinal,
1767  class Node>
1768 void mult_A_B_newmatrix(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1769  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1770  Teuchos::RCP<BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > &C)
1771 {
1772  using Teuchos::null;
1773  using Teuchos::Array;
1774  using Teuchos::ArrayRCP;
1775  using Teuchos::ArrayView;
1776  using Teuchos::RCP;
1777  using Teuchos::rcp;
1778 
1779  // Tpetra typedefs
1780  typedef LocalOrdinal LO;
1781  typedef GlobalOrdinal GO;
1782  typedef Node NO;
1783  typedef Import<LO,GO,NO> import_type;
1784  typedef Map<LO,GO,NO> map_type;
1785  typedef BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> block_crs_matrix_type;
1786  typedef typename block_crs_matrix_type::crs_graph_type graph_t;
1787 
1788  // Kokkos typedefs
1789  typedef typename map_type::local_map_type local_map_type;
1790  typedef typename block_crs_matrix_type::local_matrix_device_type KBSR;
1791  typedef typename KBSR::device_type device_t;
1792  typedef typename KBSR::StaticCrsGraphType static_graph_t;
1793  typedef typename static_graph_t::row_map_type::non_const_type lno_view_t;
1794  typedef typename static_graph_t::entries_type::non_const_type lno_nnz_view_t;
1795  typedef typename KBSR::values_type::non_const_type scalar_view_t;
1796  typedef typename NO::execution_space execution_space;
1797  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1798  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1799 
1800  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1801 
1802  // Build the final importer / column map, hash table lookups for C
1803  RCP<const import_type> Cimport;
1804  RCP<const map_type> Ccolmap;
1805  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1806  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1807  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1808  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1809  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1810  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1811  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1812  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1813 
1814  // Bcol2Ccol is a table that maps from local column
1815  // indices of B, to local column indices of C. (B and C have the
1816  // same number of columns.) The kernel uses this, instead of
1817  // copying the entire input matrix B and converting its column
1818  // indices to those of C.
1819  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1820 
1821  if (Bview.importMatrix.is_null()) {
1822  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1823  Cimport = Bimport;
1824  Ccolmap = Bview.colMap;
1825  const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1826  // Bcol2Ccol is trivial
1827  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1828  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1829  KOKKOS_LAMBDA(const LO i) {
1830  Bcol2Ccol(i) = i;
1831  });
1832  }
1833  else {
1834  // B has "remotes," so we need to build the
1835  // column Map of C, as well as C's Import object (from its domain
1836  // Map to its column Map). C's column Map is the union of the
1837  // column Maps of (the local part of) B, and the "remote" part of
1838  // B. Ditto for the Import. We have optimized this "setUnion"
1839  // operation on Import objects and Maps.
1840 
1841  // Choose the right variant of setUnion
1842  if (!Bimport.is_null() && !Iimport.is_null()) {
1843  Cimport = Bimport->setUnion(*Iimport);
1844  }
1845  else if (!Bimport.is_null() && Iimport.is_null()) {
1846  Cimport = Bimport->setUnion();
1847  }
1848  else if (Bimport.is_null() && !Iimport.is_null()) {
1849  Cimport = Iimport->setUnion();
1850  }
1851  else {
1852  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1853  }
1854  Ccolmap = Cimport->getTargetMap();
1855 
1856  // NOTE: This is not efficient and should be folded into setUnion
1857  //
1858  // What the above comment means, is that the
1859  // setUnion operation on Import objects could also compute these
1860  // local index - to - local index look-up tables.
1861  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1862  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1863  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",
1864  range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),
1865  KOKKOS_LAMBDA(const LO i) {
1866  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1867  });
1868  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",
1869  range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),
1870  KOKKOS_LAMBDA(const LO i) {
1871  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1872  });
1873  }
1874 
1875  // Construct tables that map from local column
1876  // indices of A, to local row indices of either B_local (the locally
1877  // owned part of B), or B_remote (the "imported" remote part of B).
1878  //
1879  // For column index Aik in row i of A, if the corresponding row of B
1880  // exists in the local part of B ("orig") (which I'll call B_local),
1881  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1882  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1883  //
1884  // For column index Aik in row i of A, if the corresponding row of B
1885  // exists in the remote part of B ("Import") (which I'll call
1886  // B_remote), then targetMapToImportRow[Aik] is the local index of
1887  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1888  // (a flag value).
1889 
1890  // Run through all the hash table lookups once and for all
1891  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),
1892  Aview.colMap->getLocalNumElements());
1893  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),
1894  Aview.colMap->getLocalNumElements());
1895 
1896  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",
1897  range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),
1898  KOKKOS_LAMBDA(const LO i) {
1899  GO aidx = Acolmap_local.getGlobalElement(i);
1900  LO B_LID = Browmap_local.getLocalElement(aidx);
1901  if (B_LID != LO_INVALID) {
1902  targetMapToOrigRow(i) = B_LID;
1903  targetMapToImportRow(i) = LO_INVALID;
1904  } else {
1905  LO I_LID = Irowmap_local.getLocalElement(aidx);
1906  targetMapToOrigRow(i) = LO_INVALID;
1907  targetMapToImportRow(i) = I_LID;
1908  }
1909  });
1910 
1911  // Create the KernelHandle
1912  using KernelHandle =
1913  KokkosKernels::Experimental::KokkosKernelsHandle<typename lno_view_t::const_value_type,
1914  typename lno_nnz_view_t::const_value_type,
1915  typename scalar_view_t::const_value_type,
1916  typename device_t::execution_space,
1917  typename device_t::memory_space,
1918  typename device_t::memory_space>;
1919  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
1920  std::string myalg("SPGEMM_KK_MEMORY");
1921  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
1922 
1923  KernelHandle kh;
1924  kh.create_spgemm_handle(alg_enum);
1925  kh.set_team_work_size(team_work_size);
1926 
1927  // Get KokkosSparse::BsrMatrix for A and Bmerged (B and BImport)
1928  const KBSR Amat = Aview.origMatrix->getLocalMatrixDevice();
1929  const KBSR Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,
1930  targetMapToOrigRow,targetMapToImportRow,
1931  Bcol2Ccol,Icol2Ccol,
1932  Ccolmap.getConst()->getLocalNumElements());
1933 
1934  RCP<graph_t> graphC;
1935  typename KBSR::values_type values;
1936  {
1937  // Call KokkosSparse routines to calculate Amat*Bmerged on device.
1938  // NOTE: Need to scope guard this since the BlockCrs constructor will need to copy the host graph
1939  KBSR Cmat;
1940  KokkosSparse::block_spgemm_symbolic(kh, Amat, false, Bmerged, false, Cmat);
1941  KokkosSparse::block_spgemm_numeric (kh, Amat, false, Bmerged, false, Cmat);
1942  kh.destroy_spgemm_handle();
1943 
1944  // Build Tpetra::BlockCrsMatrix from KokkosSparse::BsrMatrix
1945  graphC = rcp(new graph_t(Cmat.graph, Aview.origMatrix->getRowMap(), Ccolmap.getConst()));
1946  values = Cmat.values;
1947  }
1948  C = rcp (new block_crs_matrix_type (*graphC, values, Aview.blocksize));
1949 
1950 }
1951 
1952 /*********************************************************************************************************/
1953 // AB NewMatrix Kernel wrappers (Default non-threaded version for CrsMatrix)
1954 template<class Scalar,
1955  class LocalOrdinal,
1956  class GlobalOrdinal,
1957  class Node,
1958  class LocalOrdinalViewType>
1959 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1960  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1961  const LocalOrdinalViewType & targetMapToOrigRow,
1962  const LocalOrdinalViewType & targetMapToImportRow,
1963  const LocalOrdinalViewType & Bcol2Ccol,
1964  const LocalOrdinalViewType & Icol2Ccol,
1965  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1966  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
1967  const std::string& label,
1968  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1969 
1970  using Teuchos::Array;
1971  using Teuchos::ArrayRCP;
1972  using Teuchos::ArrayView;
1973  using Teuchos::RCP;
1974  using Teuchos::rcp;
1975 
1976  Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: Newmatrix SerialCore");
1977 
1978  // Lots and lots of typedefs
1979  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
1980  typedef typename KCRS::StaticCrsGraphType graph_t;
1981  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1982  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1983  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1984  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1985 
1986  typedef Scalar SC;
1987  typedef LocalOrdinal LO;
1988  typedef GlobalOrdinal GO;
1989  typedef Node NO;
1990  typedef Map<LO,GO,NO> map_type;
1991  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1992  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1993  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1994 
1995  // Sizes
1996  RCP<const map_type> Ccolmap = C.getColMap();
1997  size_t m = Aview.origMatrix->getLocalNumRows();
1998  size_t n = Ccolmap->getLocalNumElements();
1999  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2000 
2001  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2002  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2003  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2004 
2005  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2006  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2007  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2008 
2009  c_lno_view_t Irowptr;
2010  lno_nnz_view_t Icolind;
2011  scalar_view_t Ivals;
2012  if(!Bview.importMatrix.is_null()) {
2013  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2014  Irowptr = lclB.graph.row_map;
2015  Icolind = lclB.graph.entries;
2016  Ivals = lclB.values;
2017  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2018  }
2019 
2020  // Classic csr assembly (low memory edition)
2021  //
2022  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2023  // The method loops over rows of A, and may resize after processing
2024  // each row. Chris Siefert says that this reflects experience in
2025  // ML; for the non-threaded case, ML found it faster to spend less
2026  // effort on estimation and risk an occasional reallocation.
2027  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2028  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2029  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2030  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2031 
2032  // mfh 27 Sep 2016: The c_status array is an implementation detail
2033  // of the local sparse matrix-matrix multiply routine.
2034 
2035  // The status array will contain the index into colind where this entry was last deposited.
2036  // c_status[i] < CSR_ip - not in the row yet
2037  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2038  // We start with this filled with INVALID's indicating that there are no entries yet.
2039  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2040  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2041  std::vector<size_t> c_status(n, ST_INVALID);
2042 
2043  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2044  // routine. The routine computes C := A * (B_local + B_remote).
2045  //
2046  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2047  // you whether the corresponding row of B belongs to B_local
2048  // ("orig") or B_remote ("Import").
2049 
2050  // For each row of A/C
2051  size_t CSR_ip = 0, OLD_ip = 0;
2052  for (size_t i = 0; i < m; i++) {
2053  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2054  // on the calling process.
2055  Crowptr[i] = CSR_ip;
2056 
2057  // mfh 27 Sep 2016: For each entry of A in the current row of A
2058  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2059  LO Aik = Acolind[k]; // local column index of current entry of A
2060  const SC Aval = Avals[k]; // value of current entry of A
2061  if (Aval == SC_ZERO)
2062  continue; // skip explicitly stored zero values in A
2063 
2064  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2065  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2066  // corresponding to the current entry of A is populated, then
2067  // the corresponding row of B is in B_local (i.e., it lives on
2068  // the calling process).
2069 
2070  // Local matrix
2071  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2072 
2073  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
2074  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2075  LO Bkj = Bcolind[j];
2076  LO Cij = Bcol2Ccol[Bkj];
2077 
2078  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2079  // New entry
2080  c_status[Cij] = CSR_ip;
2081  Ccolind[CSR_ip] = Cij;
2082  Cvals[CSR_ip] = Aval*Bvals[j];
2083  CSR_ip++;
2084 
2085  } else {
2086  Cvals[c_status[Cij]] += Aval*Bvals[j];
2087  }
2088  }
2089 
2090  } else {
2091  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2092  // corresponding to the current entry of A NOT populated (has
2093  // a flag "invalid" value), then the corresponding row of B is
2094  // in B_local (i.e., it lives on the calling process).
2095 
2096  // Remote matrix
2097  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2098  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2099  LO Ikj = Icolind[j];
2100  LO Cij = Icol2Ccol[Ikj];
2101 
2102  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
2103  // New entry
2104  c_status[Cij] = CSR_ip;
2105  Ccolind[CSR_ip] = Cij;
2106  Cvals[CSR_ip] = Aval*Ivals[j];
2107  CSR_ip++;
2108  } else {
2109  Cvals[c_status[Cij]] += Aval*Ivals[j];
2110  }
2111  }
2112  }
2113  }
2114 
2115  // Resize for next pass if needed
2116  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
2117  CSR_alloc *= 2;
2118  Kokkos::resize(Ccolind,CSR_alloc);
2119  Kokkos::resize(Cvals,CSR_alloc);
2120  }
2121  OLD_ip = CSR_ip;
2122  }
2123 
2124  Crowptr[m] = CSR_ip;
2125 
2126  // Downward resize
2127  Kokkos::resize(Ccolind,CSR_ip);
2128  Kokkos::resize(Cvals,CSR_ip);
2129 
2130  {
2131  Tpetra::Details::ProfilingRegion MM3("TpetraExt: MMM: Newmatrix Final Sort");
2132 
2133  // Final sort & set of CRS arrays
2134  if (params.is_null() || params->get("sort entries",true))
2135  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2136  C.setAllValues(Crowptr,Ccolind, Cvals);
2137 
2138 
2139  }
2140 
2141  Tpetra::Details::ProfilingRegion MM4("TpetraExt: MMM: Newmatrix ESCC");
2142  {
2143 
2144  // Final FillComplete
2145  //
2146  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2147  // Import (from domain Map to column Map) construction (which costs
2148  // lots of communication) by taking the previously constructed
2149  // Import object. We should be able to do this without interfering
2150  // with the implementation of the local part of sparse matrix-matrix
2151  // multply above.
2152  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2153  labelList->set("Timer Label",label);
2154  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2155  RCP<const Export<LO,GO,NO> > dummyExport;
2156  C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2157  }
2158 
2159 }
2160 /*********************************************************************************************************/
2161 // Kernel method for computing the local portion of C = A*B (reuse)
2162 template<class Scalar,
2163  class LocalOrdinal,
2164  class GlobalOrdinal,
2165  class Node>
2166 void mult_A_B_reuse(
2167  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2168  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2169  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2170  const std::string& label,
2171  const Teuchos::RCP<Teuchos::ParameterList>& params)
2172 {
2173  using Teuchos::Array;
2174  using Teuchos::ArrayRCP;
2175  using Teuchos::ArrayView;
2176  using Teuchos::RCP;
2177  using Teuchos::rcp;
2178 
2179  // Tpetra typedefs
2180  typedef LocalOrdinal LO;
2181  typedef GlobalOrdinal GO;
2182  typedef Node NO;
2183  typedef Import<LO,GO,NO> import_type;
2184  typedef Map<LO,GO,NO> map_type;
2185 
2186  // Kokkos typedefs
2187  typedef typename map_type::local_map_type local_map_type;
2189  typedef typename KCRS::StaticCrsGraphType graph_t;
2190  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2191  typedef typename NO::execution_space execution_space;
2192  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2193  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2194 
2195  Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: Reuse Cmap");
2196  (void) label;
2197 
2198  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2199 
2200  // Grab all the maps
2201  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2202  RCP<const map_type> Ccolmap = C.getColMap();
2203  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2204  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2205  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2206  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2207  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2208  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2209 
2210  // Build the final importer / column map, hash table lookups for C
2211  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2212  {
2213  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2214  // So, column map of C may be a strict subset of the column map of B
2215  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2216  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2217  });
2218 
2219  if (!Bview.importMatrix.is_null()) {
2220  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2221  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2222 
2223  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2224  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2225  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2226  });
2227  }
2228  }
2229 
2230  // Run through all the hash table lookups once and for all
2231  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2232  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2233  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2234  GO aidx = Acolmap_local.getGlobalElement(i);
2235  LO B_LID = Browmap_local.getLocalElement(aidx);
2236  if (B_LID != LO_INVALID) {
2237  targetMapToOrigRow(i) = B_LID;
2238  targetMapToImportRow(i) = LO_INVALID;
2239  } else {
2240  LO I_LID = Irowmap_local.getLocalElement(aidx);
2241  targetMapToOrigRow(i) = LO_INVALID;
2242  targetMapToImportRow(i) = I_LID;
2243 
2244  }
2245  });
2246 
2247  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2248  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2249  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2250 }
2251 
2252 /*********************************************************************************************************/
2253 template<class Scalar,
2254  class LocalOrdinal,
2255  class GlobalOrdinal,
2256  class Node,
2257  class LocalOrdinalViewType>
2258 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2259  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2260  const LocalOrdinalViewType & targetMapToOrigRow,
2261  const LocalOrdinalViewType & targetMapToImportRow,
2262  const LocalOrdinalViewType & Bcol2Ccol,
2263  const LocalOrdinalViewType & Icol2Ccol,
2264  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2265  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2266  const std::string& label,
2267  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2268 
2269  using Teuchos::RCP;
2270  using Teuchos::rcp;
2271 
2272  // Lots and lots of typedefs
2273  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2274  typedef typename KCRS::StaticCrsGraphType graph_t;
2275  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2276  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2277  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2278 
2279  typedef Scalar SC;
2280  typedef LocalOrdinal LO;
2281  typedef GlobalOrdinal GO;
2282  typedef Node NO;
2283  typedef Map<LO,GO,NO> map_type;
2284  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2285  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2286  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2287 
2288  Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: Reuse SerialCore");
2289  (void) label;
2290 
2291  // Sizes
2292  RCP<const map_type> Ccolmap = C.getColMap();
2293  size_t m = Aview.origMatrix->getLocalNumRows();
2294  size_t n = Ccolmap->getLocalNumElements();
2295 
2296  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2297  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2298  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2299  const KCRS & Cmat = C.getLocalMatrixHost();
2300 
2301  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2302  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2303  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2304  scalar_view_t Cvals = Cmat.values;
2305 
2306  c_lno_view_t Irowptr;
2307  lno_nnz_view_t Icolind;
2308  scalar_view_t Ivals;
2309  if(!Bview.importMatrix.is_null()) {
2310  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2311  Irowptr = lclB.graph.row_map;
2312  Icolind = lclB.graph.entries;
2313  Ivals = lclB.values;
2314  }
2315 
2316  // Classic csr assembly (low memory edition)
2317  // mfh 27 Sep 2016: The c_status array is an implementation detail
2318  // of the local sparse matrix-matrix multiply routine.
2319 
2320  // The status array will contain the index into colind where this entry was last deposited.
2321  // c_status[i] < CSR_ip - not in the row yet
2322  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2323  // We start with this filled with INVALID's indicating that there are no entries yet.
2324  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2325  std::vector<size_t> c_status(n, ST_INVALID);
2326 
2327  // For each row of A/C
2328  size_t CSR_ip = 0, OLD_ip = 0;
2329  for (size_t i = 0; i < m; i++) {
2330  // First fill the c_status array w/ locations where we're allowed to
2331  // generate nonzeros for this row
2332  OLD_ip = Crowptr[i];
2333  CSR_ip = Crowptr[i+1];
2334  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2335  c_status[Ccolind[k]] = k;
2336 
2337  // Reset values in the row of C
2338  Cvals[k] = SC_ZERO;
2339  }
2340 
2341  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2342  LO Aik = Acolind[k];
2343  const SC Aval = Avals[k];
2344  if (Aval == SC_ZERO)
2345  continue;
2346 
2347  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2348  // Local matrix
2349  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2350 
2351  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2352  LO Bkj = Bcolind[j];
2353  LO Cij = Bcol2Ccol[Bkj];
2354 
2355  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2356  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2357  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2358 
2359  Cvals[c_status[Cij]] += Aval * Bvals[j];
2360  }
2361 
2362  } else {
2363  // Remote matrix
2364  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2365  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2366  LO Ikj = Icolind[j];
2367  LO Cij = Icol2Ccol[Ikj];
2368 
2369  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2370  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2371  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2372 
2373  Cvals[c_status[Cij]] += Aval * Ivals[j];
2374  }
2375  }
2376  }
2377  }
2378 
2379  {
2380  Tpetra::Details::ProfilingRegion MM3("TpetraExt: MMM: Reuse ESFC");
2381  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2382  }
2383 }
2384 
2385 
2386 /*********************************************************************************************************/
2387 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2388 template<class Scalar,
2389  class LocalOrdinal,
2390  class GlobalOrdinal,
2391  class Node>
2392 void jacobi_A_B_newmatrix(
2393  Scalar omega,
2394  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2395  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2396  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2397  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2398  const std::string& label,
2399  const Teuchos::RCP<Teuchos::ParameterList>& params)
2400 {
2401  using Teuchos::Array;
2402  using Teuchos::ArrayRCP;
2403  using Teuchos::ArrayView;
2404  using Teuchos::RCP;
2405  using Teuchos::rcp;
2406  // typedef Scalar SC;
2407  typedef LocalOrdinal LO;
2408  typedef GlobalOrdinal GO;
2409  typedef Node NO;
2410 
2411  typedef Import<LO,GO,NO> import_type;
2412  typedef Map<LO,GO,NO> map_type;
2413  typedef typename map_type::local_map_type local_map_type;
2414 
2415  // All of the Kokkos typedefs
2417  typedef typename KCRS::StaticCrsGraphType graph_t;
2418  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2419  typedef typename NO::execution_space execution_space;
2420  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2421  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2422 
2423  Tpetra::Details::ProfilingRegion MM3("TpetraExt: Jacobi: M5 Cmap");
2424 
2425  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2426 
2427  // Build the final importer / column map, hash table lookups for C
2428  RCP<const import_type> Cimport;
2429  RCP<const map_type> Ccolmap;
2430  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2431  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2432  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2433  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2434  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2435  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2436  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2437  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2438 
2439  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2440  // indices of B, to local column indices of C. (B and C have the
2441  // same number of columns.) The kernel uses this, instead of
2442  // copying the entire input matrix B and converting its column
2443  // indices to those of C.
2444  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2445 
2446  if (Bview.importMatrix.is_null()) {
2447  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2448  Cimport = Bimport;
2449  Ccolmap = Bview.colMap;
2450  // Bcol2Ccol is trivial
2451  // Bcol2Ccol is trivial
2452 
2453  Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getLocalNumElements ()));
2454  Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2455  Bcol2Ccol(i) = static_cast<LO> (i);
2456  });
2457  } else {
2458  // mfh 27 Sep 2016: B has "remotes," so we need to build the
2459  // column Map of C, as well as C's Import object (from its domain
2460  // Map to its column Map). C's column Map is the union of the
2461  // column Maps of (the local part of) B, and the "remote" part of
2462  // B. Ditto for the Import. We have optimized this "setUnion"
2463  // operation on Import objects and Maps.
2464 
2465  // Choose the right variant of setUnion
2466  if (!Bimport.is_null() && !Iimport.is_null()){
2467  Cimport = Bimport->setUnion(*Iimport);
2468  Ccolmap = Cimport->getTargetMap();
2469 
2470  } else if (!Bimport.is_null() && Iimport.is_null()) {
2471  Cimport = Bimport->setUnion();
2472 
2473  } else if(Bimport.is_null() && !Iimport.is_null()) {
2474  Cimport = Iimport->setUnion();
2475 
2476  } else
2477  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2478 
2479  Ccolmap = Cimport->getTargetMap();
2480 
2481  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2482  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2483 
2484  // NOTE: This is not efficient and should be folded into setUnion
2485  //
2486  // mfh 27 Sep 2016: What the above comment means, is that the
2487  // setUnion operation on Import objects could also compute these
2488  // local index - to - local index look-up tables.
2489  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2490  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2491  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2492  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2493  });
2494  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2495  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2496  });
2497 
2498  }
2499 
2500  // Replace the column map
2501  //
2502  // mfh 27 Sep 2016: We do this because C was originally created
2503  // without a column Map. Now we have its column Map.
2504  C.replaceColMap(Ccolmap);
2505 
2506  // mfh 27 Sep 2016: Construct tables that map from local column
2507  // indices of A, to local row indices of either B_local (the locally
2508  // owned part of B), or B_remote (the "imported" remote part of B).
2509  //
2510  // For column index Aik in row i of A, if the corresponding row of B
2511  // exists in the local part of B ("orig") (which I'll call B_local),
2512  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2513  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2514  //
2515  // For column index Aik in row i of A, if the corresponding row of B
2516  // exists in the remote part of B ("Import") (which I'll call
2517  // B_remote), then targetMapToImportRow[Aik] is the local index of
2518  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2519  // (a flag value).
2520 
2521  // Run through all the hash table lookups once and for all
2522  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2523  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2524  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2525  GO aidx = Acolmap_local.getGlobalElement(i);
2526  LO B_LID = Browmap_local.getLocalElement(aidx);
2527  if (B_LID != LO_INVALID) {
2528  targetMapToOrigRow(i) = B_LID;
2529  targetMapToImportRow(i) = LO_INVALID;
2530  } else {
2531  LO I_LID = Irowmap_local.getLocalElement(aidx);
2532  targetMapToOrigRow(i) = LO_INVALID;
2533  targetMapToImportRow(i) = I_LID;
2534 
2535  }
2536  });
2537 
2538  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2539  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2540  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2541 
2542 }
2543 
2544 
2545 /*********************************************************************************************************/
2546 // Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2547 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2548 
2549 template<class Scalar,
2550  class LocalOrdinal,
2551  class GlobalOrdinal,
2552  class Node,
2553  class LocalOrdinalViewType>
2554 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2555  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2556  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2557  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2558  const LocalOrdinalViewType & targetMapToOrigRow,
2559  const LocalOrdinalViewType & targetMapToImportRow,
2560  const LocalOrdinalViewType & Bcol2Ccol,
2561  const LocalOrdinalViewType & Icol2Ccol,
2562  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2563  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2564  const std::string& label,
2565  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2566 
2567  Tpetra::Details::ProfilingRegion MM("TpetraExt: Jacobi: Newmatrix SerialCore");
2568 
2569  using Teuchos::Array;
2570  using Teuchos::ArrayRCP;
2571  using Teuchos::ArrayView;
2572  using Teuchos::RCP;
2573  using Teuchos::rcp;
2574 
2575  // Lots and lots of typedefs
2576  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2577  typedef typename KCRS::StaticCrsGraphType graph_t;
2578  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2579  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2580  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2581  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2582 
2583  // Jacobi-specific
2584  typedef typename scalar_view_t::memory_space scalar_memory_space;
2585 
2586  typedef Scalar SC;
2587  typedef LocalOrdinal LO;
2588  typedef GlobalOrdinal GO;
2589  typedef Node NO;
2590 
2591  typedef Map<LO,GO,NO> map_type;
2592  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2593  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2594 
2595  // Sizes
2596  RCP<const map_type> Ccolmap = C.getColMap();
2597  size_t m = Aview.origMatrix->getLocalNumRows();
2598  size_t n = Ccolmap->getLocalNumElements();
2599  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2600 
2601  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2602  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2603  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2604 
2605  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2606  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2607  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2608 
2609  c_lno_view_t Irowptr;
2610  lno_nnz_view_t Icolind;
2611  scalar_view_t Ivals;
2612  if(!Bview.importMatrix.is_null()) {
2613  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2614  Irowptr = lclB.graph.row_map;
2615  Icolind = lclB.graph.entries;
2616  Ivals = lclB.values;
2617  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2618  }
2619 
2620  // Jacobi-specific inner stuff
2621  auto Dvals =
2622  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2623 
2624  // Teuchos::ArrayView::operator[].
2625  // The status array will contain the index into colind where this entry was last deposited.
2626  // c_status[i] < CSR_ip - not in the row yet.
2627  // c_status[i] >= CSR_ip, this is the entry where you can find the data
2628  // We start with this filled with INVALID's indicating that there are no entries yet.
2629  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2630  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2631  Array<size_t> c_status(n, ST_INVALID);
2632 
2633  // Classic csr assembly (low memory edition)
2634  //
2635  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2636  // The method loops over rows of A, and may resize after processing
2637  // each row. Chris Siefert says that this reflects experience in
2638  // ML; for the non-threaded case, ML found it faster to spend less
2639  // effort on estimation and risk an occasional reallocation.
2640  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2641  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2642  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2643  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2644  size_t CSR_ip = 0, OLD_ip = 0;
2645 
2646  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2647 
2648  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2649  // routine. The routine computes
2650  //
2651  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2652  //
2653  // This corresponds to one sweep of (weighted) Jacobi.
2654  //
2655  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2656  // you whether the corresponding row of B belongs to B_local
2657  // ("orig") or B_remote ("Import").
2658 
2659  // For each row of A/C
2660  for (size_t i = 0; i < m; i++) {
2661  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2662  // on the calling process.
2663  Crowptr[i] = CSR_ip;
2664  SC minusOmegaDval = -omega*Dvals(i,0);
2665 
2666  // Entries of B
2667  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2668  Scalar Bval = Bvals[j];
2669  if (Bval == SC_ZERO)
2670  continue;
2671  LO Bij = Bcolind[j];
2672  LO Cij = Bcol2Ccol[Bij];
2673 
2674  // Assume no repeated entries in B
2675  c_status[Cij] = CSR_ip;
2676  Ccolind[CSR_ip] = Cij;
2677  Cvals[CSR_ip] = Bvals[j];
2678  CSR_ip++;
2679  }
2680 
2681  // Entries of -omega * Dinv * A * B
2682  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2683  LO Aik = Acolind[k];
2684  const SC Aval = Avals[k];
2685  if (Aval == SC_ZERO)
2686  continue;
2687 
2688  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2689  // Local matrix
2690  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2691 
2692  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2693  LO Bkj = Bcolind[j];
2694  LO Cij = Bcol2Ccol[Bkj];
2695 
2696  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2697  // New entry
2698  c_status[Cij] = CSR_ip;
2699  Ccolind[CSR_ip] = Cij;
2700  Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2701  CSR_ip++;
2702 
2703  } else {
2704  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2705  }
2706  }
2707 
2708  } else {
2709  // Remote matrix
2710  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2711  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2712  LO Ikj = Icolind[j];
2713  LO Cij = Icol2Ccol[Ikj];
2714 
2715  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2716  // New entry
2717  c_status[Cij] = CSR_ip;
2718  Ccolind[CSR_ip] = Cij;
2719  Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2720  CSR_ip++;
2721  } else {
2722  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2723  }
2724  }
2725  }
2726  }
2727 
2728  // Resize for next pass if needed
2729  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2730  CSR_alloc *= 2;
2731  Kokkos::resize(Ccolind,CSR_alloc);
2732  Kokkos::resize(Cvals,CSR_alloc);
2733  }
2734  OLD_ip = CSR_ip;
2735  }
2736  Crowptr[m] = CSR_ip;
2737 
2738  // Downward resize
2739  Kokkos::resize(Ccolind,CSR_ip);
2740  Kokkos::resize(Cvals,CSR_ip);
2741 
2742  {
2743  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Newmatrix Final Sort");
2744 
2745  // Replace the column map
2746  //
2747  // mfh 27 Sep 2016: We do this because C was originally created
2748  // without a column Map. Now we have its column Map.
2749  C.replaceColMap(Ccolmap);
2750 
2751  // Final sort & set of CRS arrays
2752  //
2753  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2754  // matrix-matrix multiply routine sort the entries for us?
2755  // Final sort & set of CRS arrays
2756  if (params.is_null() || params->get("sort entries",true))
2757  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2758  C.setAllValues(Crowptr,Ccolind, Cvals);
2759  }
2760  {
2761  Tpetra::Details::ProfilingRegion MM3("TpetraExt: Jacobi: Newmatrix ESFC");
2762 
2763  // Final FillComplete
2764  //
2765  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2766  // Import (from domain Map to column Map) construction (which costs
2767  // lots of communication) by taking the previously constructed
2768  // Import object. We should be able to do this without interfering
2769  // with the implementation of the local part of sparse matrix-matrix
2770  // multply above
2771  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2772  labelList->set("Timer Label",label);
2773  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2774  RCP<const Export<LO,GO,NO> > dummyExport;
2775  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2776 
2777  }
2778 }
2779 
2780 
2781 /*********************************************************************************************************/
2782 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2783 template<class Scalar,
2784  class LocalOrdinal,
2785  class GlobalOrdinal,
2786  class Node>
2787 void jacobi_A_B_reuse(
2788  Scalar omega,
2789  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2790  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2791  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2792  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2793  const std::string& label,
2794  const Teuchos::RCP<Teuchos::ParameterList>& params)
2795 {
2796  using Teuchos::Array;
2797  using Teuchos::ArrayRCP;
2798  using Teuchos::ArrayView;
2799  using Teuchos::RCP;
2800  using Teuchos::rcp;
2801 
2802  typedef LocalOrdinal LO;
2803  typedef GlobalOrdinal GO;
2804  typedef Node NO;
2805 
2806  typedef Import<LO,GO,NO> import_type;
2807  typedef Map<LO,GO,NO> map_type;
2808 
2809  // Kokkos typedefs
2810  typedef typename map_type::local_map_type local_map_type;
2812  typedef typename KCRS::StaticCrsGraphType graph_t;
2813  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2814  typedef typename NO::execution_space execution_space;
2815  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2816  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2817 
2818  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2819  lo_view_t Bcol2Ccol, Icol2Ccol;
2820  lo_view_t targetMapToOrigRow;
2821  lo_view_t targetMapToImportRow;
2822  {
2823  Tpetra::Details::ProfilingRegion MM("TpetraExt: Jacobi: Reuse Cmap");
2824 
2825  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2826 
2827  // Grab all the maps
2828  RCP<const map_type> Ccolmap = C.getColMap();
2829  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2830  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2831  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2832  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2833  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2834  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2835 
2836  // Build the final importer / column map, hash table lookups for C
2837  Bcol2Ccol = lo_view_t(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements());
2838  {
2839  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2840  // So, column map of C may be a strict subset of the column map of B
2841  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2842  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2843  });
2844 
2845  if (!Bview.importMatrix.is_null()) {
2846  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2847  std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2848 
2849  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2850  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2851  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2852  });
2853  }
2854  }
2855 
2856  // Run through all the hash table lookups once and for all
2857  targetMapToOrigRow = lo_view_t(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2858  targetMapToImportRow = lo_view_t(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2859  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2860  GO aidx = Acolmap_local.getGlobalElement(i);
2861  LO B_LID = Browmap_local.getLocalElement(aidx);
2862  if (B_LID != LO_INVALID) {
2863  targetMapToOrigRow(i) = B_LID;
2864  targetMapToImportRow(i) = LO_INVALID;
2865  } else {
2866  LO I_LID = Irowmap_local.getLocalElement(aidx);
2867  targetMapToOrigRow(i) = LO_INVALID;
2868  targetMapToImportRow(i) = I_LID;
2869 
2870  }
2871  });
2872 
2873  }
2874 
2875  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2876  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2877  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2878 }
2879 
2880 
2881 
2882 /*********************************************************************************************************/
2883 template<class Scalar,
2884  class LocalOrdinal,
2885  class GlobalOrdinal,
2886  class Node,
2887  class LocalOrdinalViewType>
2888 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2889  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2890  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2891  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2892  const LocalOrdinalViewType & targetMapToOrigRow,
2893  const LocalOrdinalViewType & targetMapToImportRow,
2894  const LocalOrdinalViewType & Bcol2Ccol,
2895  const LocalOrdinalViewType & Icol2Ccol,
2896  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2897  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2898  const std::string& label,
2899  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2900 
2901 
2902  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Reuse Serial Core");
2903  (void)label;
2904 
2905  using Teuchos::RCP;
2906  using Teuchos::rcp;
2907 
2908  // Lots and lots of typedefs
2909  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2910  typedef typename KCRS::StaticCrsGraphType graph_t;
2911  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2912  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2913  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2914  typedef typename scalar_view_t::memory_space scalar_memory_space;
2915 
2916  typedef Scalar SC;
2917  typedef LocalOrdinal LO;
2918  typedef GlobalOrdinal GO;
2919  typedef Node NO;
2920  typedef Map<LO,GO,NO> map_type;
2921  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2922  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2923  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2924 
2925  // Sizes
2926  RCP<const map_type> Ccolmap = C.getColMap();
2927  size_t m = Aview.origMatrix->getLocalNumRows();
2928  size_t n = Ccolmap->getLocalNumElements();
2929 
2930  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2931  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2932  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2933  const KCRS & Cmat = C.getLocalMatrixHost();
2934 
2935  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2936  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2937  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2938  scalar_view_t Cvals = Cmat.values;
2939 
2940  c_lno_view_t Irowptr;
2941  lno_nnz_view_t Icolind;
2942  scalar_view_t Ivals;
2943  if(!Bview.importMatrix.is_null()) {
2944  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2945  Irowptr = lclB.graph.row_map;
2946  Icolind = lclB.graph.entries;
2947  Ivals = lclB.values;
2948  }
2949 
2950  // Jacobi-specific inner stuff
2951  auto Dvals =
2952  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2953 
2954 
2955  // The status array will contain the index into colind where this entry was last deposited.
2956  // c_status[i] < CSR_ip - not in the row yet
2957  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2958  // We start with this filled with INVALID's indicating that there are no entries yet.
2959  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2960  std::vector<size_t> c_status(n, ST_INVALID);
2961 
2962  // For each row of A/C
2963  size_t CSR_ip = 0, OLD_ip = 0;
2964  for (size_t i = 0; i < m; i++) {
2965 
2966  // First fill the c_status array w/ locations where we're allowed to
2967  // generate nonzeros for this row
2968  OLD_ip = Crowptr[i];
2969  CSR_ip = Crowptr[i+1];
2970  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2971  c_status[Ccolind[k]] = k;
2972 
2973  // Reset values in the row of C
2974  Cvals[k] = SC_ZERO;
2975  }
2976 
2977  SC minusOmegaDval = -omega*Dvals(i,0);
2978 
2979  // Entries of B
2980  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2981  Scalar Bval = Bvals[j];
2982  if (Bval == SC_ZERO)
2983  continue;
2984  LO Bij = Bcolind[j];
2985  LO Cij = Bcol2Ccol[Bij];
2986 
2987  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2988  std::runtime_error, "Trying to insert a new entry into a static graph");
2989 
2990  Cvals[c_status[Cij]] = Bvals[j];
2991  }
2992 
2993  // Entries of -omega * Dinv * A * B
2994  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2995  LO Aik = Acolind[k];
2996  const SC Aval = Avals[k];
2997  if (Aval == SC_ZERO)
2998  continue;
2999 
3000  if (targetMapToOrigRow[Aik] != LO_INVALID) {
3001  // Local matrix
3002  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
3003 
3004  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
3005  LO Bkj = Bcolind[j];
3006  LO Cij = Bcol2Ccol[Bkj];
3007 
3008  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3009  std::runtime_error, "Trying to insert a new entry into a static graph");
3010 
3011  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
3012  }
3013 
3014  } else {
3015  // Remote matrix
3016  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
3017  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
3018  LO Ikj = Icolind[j];
3019  LO Cij = Icol2Ccol[Ikj];
3020 
3021  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3022  std::runtime_error, "Trying to insert a new entry into a static graph");
3023 
3024  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
3025  }
3026  }
3027  }
3028  }
3029 
3030 
3031  {
3032  Tpetra::Details::ProfilingRegion MM3("TpetraExt: Jacobi: Reuse ESFC");
3033  C.fillComplete(C.getDomainMap(), C.getRangeMap());
3034  }
3035 
3036 }
3037 
3038 
3039 
3040 /*********************************************************************************************************/
3041 template<class Scalar,
3042  class LocalOrdinal,
3043  class GlobalOrdinal,
3044  class Node>
3045 void import_and_extract_views(
3046  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3047  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3048  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3049  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3050  bool userAssertsThereAreNoRemotes,
3051  const std::string& label,
3052  const Teuchos::RCP<Teuchos::ParameterList>& params)
3053 {
3054  using Teuchos::Array;
3055  using Teuchos::ArrayView;
3056  using Teuchos::RCP;
3057  using Teuchos::rcp;
3058  using Teuchos::null;
3059 
3060  typedef Scalar SC;
3061  typedef LocalOrdinal LO;
3062  typedef GlobalOrdinal GO;
3063  typedef Node NO;
3064 
3065  typedef Map<LO,GO,NO> map_type;
3066  typedef Import<LO,GO,NO> import_type;
3067  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
3068 
3069 
3070  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Alloc"));
3071 
3072  // The goal of this method is to populate the 'Aview' struct with views of the
3073  // rows of A, including all rows that correspond to elements in 'targetMap'.
3074  //
3075  // If targetMap includes local elements that correspond to remotely-owned rows
3076  // of A, then those remotely-owned rows will be imported into
3077  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
3078  Aview.deleteContents();
3079 
3080  Aview.origMatrix = rcp(&A, false);
3081  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3082  Aview.origMatrix->getApplyHelper();
3083  Aview.origRowMap = A.getRowMap();
3084  Aview.rowMap = targetMap;
3085  Aview.colMap = A.getColMap();
3086  Aview.domainMap = A.getDomainMap();
3087  Aview.importColMap = null;
3088  RCP<const map_type> rowMap = A.getRowMap();
3089  const int numProcs = rowMap->getComm()->getSize();
3090 
3091  // Short circuit if the user swears there are no remotes (or if we're in serial)
3092  if (userAssertsThereAreNoRemotes || numProcs < 2)
3093  return;
3094 
3095  RCP<const import_type> importer;
3096  if (params != null && params->isParameter("importer")) {
3097  importer = params->get<RCP<const import_type> >("importer");
3098 
3099  } else {
3100  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X RemoteMap"));
3101 
3102  // Mark each row in targetMap as local or remote, and go ahead and get a view
3103  // for the local rows
3104  RCP<const map_type> remoteRowMap;
3105  size_t numRemote = 0;
3106  int mode = 0;
3107  if (!prototypeImporter.is_null() &&
3108  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3109  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3110  // We have a valid prototype importer --- ask it for the remotes
3111  Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: I&X RemoteMap: Mode1");
3112 
3113  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3114  numRemote = prototypeImporter->getNumRemoteIDs();
3115 
3116  Array<GO> remoteRows(numRemote);
3117  for (size_t i = 0; i < numRemote; i++)
3118  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3119 
3120  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3121  rowMap->getIndexBase(), rowMap->getComm()));
3122  mode = 1;
3123 
3124  } else if (prototypeImporter.is_null()) {
3125  // No prototype importer --- count the remotes the hard way
3126  Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: I&X RemoteMap: Mode2");
3127 
3128  ArrayView<const GO> rows = targetMap->getLocalElementList();
3129  size_t numRows = targetMap->getLocalNumElements();
3130 
3131  Array<GO> remoteRows(numRows);
3132  for(size_t i = 0; i < numRows; ++i) {
3133  const LO mlid = rowMap->getLocalElement(rows[i]);
3134 
3135  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3136  remoteRows[numRemote++] = rows[i];
3137  }
3138  remoteRows.resize(numRemote);
3139  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3140  rowMap->getIndexBase(), rowMap->getComm()));
3141  mode = 2;
3142 
3143  } else {
3144  // PrototypeImporter is bad. But if we're in serial that's OK.
3145  mode = 3;
3146  }
3147 
3148  if (numProcs < 2) {
3149  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3150  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3151  // If only one processor we don't need to import any remote rows, so return.
3152  return;
3153  }
3154 
3155  //
3156  // Now we will import the needed remote rows of A, if the global maximum
3157  // value of numRemote is greater than 0.
3158  //
3159  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Collective-0"));
3160 
3161  global_size_t globalMaxNumRemote = 0;
3162  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3163 
3164  if (globalMaxNumRemote > 0) {
3165  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Import-2"));
3166 
3167  // Create an importer with target-map remoteRowMap and source-map rowMap.
3168  if (mode == 1)
3169  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3170  else if (mode == 2)
3171  importer = rcp(new import_type(rowMap, remoteRowMap));
3172  else
3173  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
3174  }
3175 
3176  if (params != null)
3177  params->set("importer", importer);
3178  }
3179 
3180  if (importer != null) {
3181  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Import-3"));
3182 
3183  // Now create a new matrix into which we can import the remote rows of A that we need.
3184  Teuchos::ParameterList labelList;
3185  labelList.set("Timer Label", label);
3186  auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3187 
3188  bool isMM = true;
3189  bool overrideAllreduce = false;
3190  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3191  // Minor speedup tweak - avoid computing the global constants
3192  Teuchos::ParameterList params_sublist;
3193  if(!params.is_null()) {
3194  labelList.set("compute global constants", params->get("compute global constants",false));
3195  params_sublist = params->sublist("matrixmatrix: kernel params",false);
3196  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3197  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3198  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3199  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3200  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3201  }
3202  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3203  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3204  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3205 
3206  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3207  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3208  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3209  Aview.importMatrix->getApplyHelper();
3210 
3211 #if 0
3212  // Disabled code for dumping input matrices
3213  static int count=0;
3214  char str[80];
3215  sprintf(str,"import_matrix.%d.dat",count);
3217  count++;
3218 #endif
3219 
3220 #ifdef HAVE_TPETRA_MMM_STATISTICS
3221  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3222 #endif
3223 
3224 
3225  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Import-4"));
3226 
3227  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3228  Aview.importColMap = Aview.importMatrix->getColMap();
3229  MM = Teuchos::null;
3230 
3231  }
3232 }
3233 
3234 /*********************************************************************************************************/
3235 template<class Scalar,
3236  class LocalOrdinal,
3237  class GlobalOrdinal,
3238  class Node>
3239 void import_and_extract_views(
3240  const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& M,
3241  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3242  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview,
3243  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3244  bool userAssertsThereAreNoRemotes)
3245 {
3246  using Teuchos::Array;
3247  using Teuchos::ArrayView;
3248  using Teuchos::RCP;
3249  using Teuchos::rcp;
3250  using Teuchos::null;
3251 
3252  typedef Scalar SC;
3253  typedef LocalOrdinal LO;
3254  typedef GlobalOrdinal GO;
3255  typedef Node NO;
3256 
3257  typedef Map<LO,GO,NO> map_type;
3258  typedef Import<LO,GO,NO> import_type;
3259  typedef BlockCrsMatrix<SC,LO,GO,NO> blockcrs_matrix_type;
3260 
3261  // The goal of this method is to populate the 'Mview' struct with views of the
3262  // rows of M, including all rows that correspond to elements in 'targetMap'.
3263  //
3264  // If targetMap includes local elements that correspond to remotely-owned rows
3265  // of M, then those remotely-owned rows will be imported into
3266  // 'Mview.importMatrix', and views of them will be included in 'Mview'.
3267  Mview.deleteContents();
3268 
3269  Mview.origMatrix = rcp(&M, false);
3270  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3271  Mview.origMatrix->getApplyHelper();
3272  Mview.origRowMap = M.getRowMap();
3273  Mview.rowMap = targetMap;
3274  Mview.colMap = M.getColMap();
3275  Mview.importColMap = null;
3276  RCP<const map_type> rowMap = M.getRowMap();
3277  const int numProcs = rowMap->getComm()->getSize();
3278 
3279  // Short circuit if the user swears there are no remotes (or if we're in serial)
3280  if (userAssertsThereAreNoRemotes || numProcs < 2) return;
3281 
3282  // Mark each row in targetMap as local or remote, and go ahead and get a view
3283  // for the local rows
3284  RCP<const map_type> remoteRowMap;
3285  size_t numRemote = 0;
3286  int mode = 0;
3287  if (!prototypeImporter.is_null() &&
3288  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3289  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3290 
3291  // We have a valid prototype importer --- ask it for the remotes
3292  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3293  numRemote = prototypeImporter->getNumRemoteIDs();
3294 
3295  Array<GO> remoteRows(numRemote);
3296  for (size_t i = 0; i < numRemote; i++)
3297  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3298 
3299  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3300  rowMap->getIndexBase(), rowMap->getComm()));
3301  mode = 1;
3302 
3303  } else if (prototypeImporter.is_null()) {
3304 
3305  // No prototype importer --- count the remotes the hard way
3306  ArrayView<const GO> rows = targetMap->getLocalElementList();
3307  size_t numRows = targetMap->getLocalNumElements();
3308 
3309  Array<GO> remoteRows(numRows);
3310  for(size_t i = 0; i < numRows; ++i) {
3311  const LO mlid = rowMap->getLocalElement(rows[i]);
3312 
3313  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3314  remoteRows[numRemote++] = rows[i];
3315  }
3316  remoteRows.resize(numRemote);
3317  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3318  rowMap->getIndexBase(), rowMap->getComm()));
3319  mode = 2;
3320 
3321  } else {
3322  // PrototypeImporter is bad. But if we're in serial that's OK.
3323  mode = 3;
3324  }
3325 
3326  if (numProcs < 2) {
3327  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3328  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3329  // If only one processor we don't need to import any remote rows, so return.
3330  return;
3331  }
3332 
3333  // Now we will import the needed remote rows of M, if the global maximum
3334  // value of numRemote is greater than 0.
3335  global_size_t globalMaxNumRemote = 0;
3336  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3337 
3338  RCP<const import_type> importer;
3339 
3340  if (globalMaxNumRemote > 0) {
3341  // Create an importer with target-map remoteRowMap and source-map rowMap.
3342  if (mode == 1)
3343  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3344  else if (mode == 2)
3345  importer = rcp(new import_type(rowMap, remoteRowMap));
3346  else
3347  throw std::runtime_error("prototypeImporter->SourceMap() does not match M.getRowMap()!");
3348  }
3349 
3350  if (importer != null) {
3351  // Get import matrix
3352  // TODO: create the int-typed row-pointer here
3353  Mview.importMatrix = Tpetra::importAndFillCompleteBlockCrsMatrix<blockcrs_matrix_type>(rcpFromRef(M), *importer);
3354  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3355  Mview.importMatrix->getApplyHelper();
3356  // Save the column map of the imported matrix, so that we can convert indices
3357  // back to global for arithmetic later
3358  Mview.importColMap = Mview.importMatrix->getColMap();
3359  }
3360 }
3361 
3362 /*********************************************************************************************************/
3363  // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3364 template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3366 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3367  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3368  const LocalOrdinalViewType & Acol2Brow,
3369  const LocalOrdinalViewType & Acol2Irow,
3370  const LocalOrdinalViewType & Bcol2Ccol,
3371  const LocalOrdinalViewType & Icol2Ccol,
3372  const size_t mergedNodeNumCols) {
3373 
3374  using Teuchos::RCP;
3376  typedef typename KCRS::StaticCrsGraphType graph_t;
3377  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3378  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3379  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3380  // Grab the Kokkos::SparseCrsMatrices
3381  const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3382  const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3383 
3384  // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3385  if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3386  // We do have a Bimport
3387  // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3388  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3389  RCP<const KCRS> Ik_;
3390  if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3391  const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3392  KCRS Iks;
3393  if(Ik!=0) Iks = *Ik;
3394  size_t merge_numrows = Ak.numCols();
3395  // The last entry of this at least, need to be initialized
3396  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3397 
3398  const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3399 
3400  // Use a Kokkos::parallel_scan to build the rowptr
3401  typedef typename Node::execution_space execution_space;
3402  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3403  Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3404  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3405  if(final) Mrowptr(i) = update;
3406  // Get the row count
3407  size_t ct=0;
3408  if(Acol2Brow(i)!=LO_INVALID)
3409  ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3410  else
3411  ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3412  update+=ct;
3413 
3414  if(final && i+1==merge_numrows)
3415  Mrowptr(i+1)=update;
3416  });
3417 
3418  // Allocate nnz
3419  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3420  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3421  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);
3422 
3423  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3424  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3425  Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3426  if(Acol2Brow(i)!=LO_INVALID) {
3427  size_t row = Acol2Brow(i);
3428  size_t start = Bk.graph.row_map(row);
3429  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3430  Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3431  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3432  }
3433  }
3434  else {
3435  size_t row = Acol2Irow(i);
3436  size_t start = Iks.graph.row_map(row);
3437  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3438  Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3439  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3440  }
3441  }
3442  });
3443 
3444  KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3445  return newmat;
3446  }
3447  else {
3448  // We don't have a Bimport (the easy case)
3449  return Bk;
3450  }
3451 }//end merge_matrices
3452 
3453 /*********************************************************************************************************/
3454  // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3455 template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3456 const typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type
3457 merge_matrices(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3458  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3459  const LocalOrdinalViewType & Acol2Brow,
3460  const LocalOrdinalViewType & Acol2Irow,
3461  const LocalOrdinalViewType & Bcol2Ccol,
3462  const LocalOrdinalViewType & Icol2Ccol,
3463  const size_t mergedNodeNumCols)
3464 {
3465  using Teuchos::RCP;
3466  typedef typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KBCRS;
3467  typedef typename KBCRS::StaticCrsGraphType graph_t;
3468  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3469  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3470  typedef typename KBCRS::values_type::non_const_type scalar_view_t;
3471 
3472  // Grab the KokkosSparse::BsrMatrix
3473  const KBCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3474  const KBCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3475 
3476  // We need to do this dance if either (a) We have Bimport or (b) A's colMap is not the same as B's rowMap
3477  if(!Bview.importMatrix.is_null() ||
3478  (Bview.importMatrix.is_null() &&
3479  (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3480 
3481  // We do have a Bimport
3482  // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3483  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3484  RCP<const KBCRS> Ik_;
3485  if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KBCRS>(Bview.importMatrix->getLocalMatrixDevice());
3486  const KBCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3487  KBCRS Iks;
3488  if(Ik!=0) Iks = *Ik;
3489  size_t merge_numrows = Ak.numCols();
3490 
3491  // The last entry of this at least, need to be initialized
3492  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3493 
3494  const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3495 
3496  // Use a Kokkos::parallel_scan to build the rowptr
3497  typedef typename Node::execution_space execution_space;
3498  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3499  Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3500  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3501  if(final) Mrowptr(i) = update;
3502  // Get the row count
3503  size_t ct=0;
3504  if(Acol2Brow(i)!=LO_INVALID)
3505  ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3506  else
3507  ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3508  update+=ct;
3509 
3510  if(final && i+1==merge_numrows)
3511  Mrowptr(i+1)=update;
3512  });
3513 
3514  // Allocate nnz
3515  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3516  const int blocksize = Ak.blockDim();
3517  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3518  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz*blocksize*blocksize);
3519 
3520  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3521  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3522  Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3523  if(Acol2Brow(i)!=LO_INVALID) {
3524  size_t row = Acol2Brow(i);
3525  size_t start = Bk.graph.row_map(row);
3526  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3527  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3528 
3529  for (int b=0; b<blocksize*blocksize; ++b) {
3530  const int val_indx = j*blocksize*blocksize + b;
3531  const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3532  Mvalues(val_indx) = Bk.values(b_val_indx);
3533  }
3534  }
3535  }
3536  else {
3537  size_t row = Acol2Irow(i);
3538  size_t start = Iks.graph.row_map(row);
3539  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3540  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3541 
3542  for (int b=0; b<blocksize*blocksize; ++b) {
3543  const int val_indx = j*blocksize*blocksize + b;
3544  const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3545  Mvalues(val_indx) = Iks.values(b_val_indx);
3546  }
3547  }
3548  }
3549  });
3550 
3551  // Build and return merged KokkosSparse matrix
3552  KBCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind, blocksize);
3553  return newmat;
3554  }
3555  else {
3556  // We don't have a Bimport (the easy case)
3557  return Bk;
3558  }
3559 }//end merge_matrices
3560 
3561 /*********************************************************************************************************/
3562 template<typename SC, typename LO, typename GO, typename NO>
3563 void AddKernels<SC, LO, GO, NO>::
3564 addSorted(
3565  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3566  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3567  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3568  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3569  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3570  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3571  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3572  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3573  GO numGlobalCols,
3574  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3575  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3576  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3577 {
3578  using Teuchos::TimeMonitor;
3579  using Teuchos::rcp;
3580  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3581  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3582  auto nrows = Arowptrs.extent(0) - 1;
3583  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3584  typename AddKern::KKH handle;
3585  handle.create_spadd_handle(true);
3586  auto addHandle = handle.get_spadd_handle();
3587 
3588  auto MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: sorted symbolic"));
3589 
3590  KokkosSparse::spadd_symbolic
3591  (&handle,
3592  nrows, numGlobalCols,
3593  Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3594  //KokkosKernels requires values to be zeroed
3595  Cvals = values_array("C values", addHandle->get_c_nnz());
3596  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3597 
3598  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: sorted numeric"));
3599  KokkosSparse::spadd_numeric(&handle,
3600  nrows, numGlobalCols,
3601  Arowptrs, Acolinds, Avals, scalarA,
3602  Browptrs, Bcolinds, Bvals, scalarB,
3603  Crowptrs, Ccolinds, Cvals);
3604 }
3605 
3606 template<typename SC, typename LO, typename GO, typename NO>
3607 void AddKernels<SC, LO, GO, NO>::
3608 addUnsorted(
3609  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3610  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3611  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3612  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3613  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3614  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3615  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3616  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3617  GO numGlobalCols,
3618  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3619  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3620  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3621 {
3622  using Teuchos::TimeMonitor;
3623  using Teuchos::rcp;
3624  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3625  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3626  auto nrows = Arowptrs.extent(0) - 1;
3627  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3628  typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3629  typename AddKern::KKH handle;
3630  handle.create_spadd_handle(false);
3631  auto addHandle = handle.get_spadd_handle();
3632  auto MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: unsorted symbolic"));
3633 
3634  KokkosSparse::spadd_symbolic
3635  (&handle,
3636  nrows, numGlobalCols,
3637  Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3638  //Cvals must be zeroed out
3639  Cvals = values_array("C values", addHandle->get_c_nnz());
3640  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3641  MM = Teuchos::null; MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: unsorted numeric"));
3642  KokkosSparse::spadd_numeric(&handle,
3643  nrows, numGlobalCols,
3644  Arowptrs, Acolinds, Avals, scalarA,
3645  Browptrs, Bcolinds, Bvals, scalarB,
3646  Crowptrs, Ccolinds, Cvals);
3647 }
3648 
3649 template<typename GO,
3650  typename LocalIndicesType,
3651  typename GlobalIndicesType,
3652  typename ColMapType>
3653 struct ConvertLocalToGlobalFunctor
3654 {
3655  ConvertLocalToGlobalFunctor(
3656  const LocalIndicesType& colindsOrig_,
3657  const GlobalIndicesType& colindsConverted_,
3658  const ColMapType& colmap_) :
3659  colindsOrig (colindsOrig_),
3660  colindsConverted (colindsConverted_),
3661  colmap (colmap_)
3662  {}
3663  KOKKOS_INLINE_FUNCTION void
3664  operator() (const GO i) const
3665  {
3666  colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3667  }
3668  LocalIndicesType colindsOrig;
3669  GlobalIndicesType colindsConverted;
3670  ColMapType colmap;
3671 };
3672 
3673 template<typename SC, typename LO, typename GO, typename NO>
3674 void MMdetails::AddKernels<SC, LO, GO, NO>::
3675 convertToGlobalAndAdd(
3676  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3677  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3678  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3679  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3680  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3681  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3682  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3683  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3684  typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3685 {
3686  using Teuchos::TimeMonitor;
3687  using Teuchos::rcp;
3688  //Need to use a different KokkosKernelsHandle type than other versions,
3689  //since the ordinals are now GO
3690  using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3691  typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;
3692 
3693  const values_array Avals = A.values;
3694  const values_array Bvals = B.values;
3695  const col_inds_array Acolinds = A.graph.entries;
3696  const col_inds_array Bcolinds = B.graph.entries;
3697  auto Arowptrs = A.graph.row_map;
3698  auto Browptrs = B.graph.row_map;
3699  global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
3700  global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
3701 
3702  auto MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: diff col map kernel: column map conversion"));
3703 
3704  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3705  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3706  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3707  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3708  KKH_GO handle;
3709  handle.create_spadd_handle(false);
3710  auto addHandle = handle.get_spadd_handle();
3711  MM = Teuchos::null;
3712  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: diff col map kernel: unsorted symbolic"));
3713  auto nrows = Arowptrs.extent(0) - 1;
3714  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3715  KokkosSparse::spadd_symbolic
3716  (&handle,
3717  nrows, A.numCols(),
3718  Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3719  Cvals = values_array("C values", addHandle->get_c_nnz());
3720  Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3721 
3722  MM = Teuchos::null;
3723  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: diff col map kernel: unsorted numeric"));
3724  KokkosSparse::spadd_numeric(&handle,
3725  nrows, A.numCols(),
3726  Arowptrs, AcolindsConverted, Avals, scalarA,
3727  Browptrs, BcolindsConverted, Bvals, scalarB,
3728  Crowptrs, Ccolinds, Cvals);
3729 }
3730 
3731 
3732 } //End namepsace MMdetails
3733 
3734 } //End namespace Tpetra
3735 
3736 /*********************************************************************************************************/
3737 //
3738 // Explicit instantiation macro
3739 //
3740 // Must be expanded from within the Tpetra namespace!
3741 //
3742 namespace Tpetra {
3743 
3744 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3745 template \
3746  void MatrixMatrix::Multiply( \
3747  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3748  bool transposeA, \
3749  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3750  bool transposeB, \
3751  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3752  bool call_FillComplete_on_result, \
3753  const std::string & label, \
3754  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3755 \
3756 template \
3757  void MatrixMatrix::Multiply( \
3758  const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& A, \
3759  bool transposeA, \
3760  const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& B, \
3761  bool transposeB, \
3762  Teuchos::RCP<BlockCrsMatrix< SCALAR , LO , GO , NODE > >& C, \
3763  const std::string & label); \
3764 \
3765 template \
3766  void MatrixMatrix::Jacobi( \
3767  SCALAR omega, \
3768  const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3769  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3770  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3771  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3772  bool call_FillComplete_on_result, \
3773  const std::string & label, \
3774  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3775 \
3776  template \
3777  void MatrixMatrix::Add( \
3778  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3779  bool transposeA, \
3780  SCALAR scalarA, \
3781  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3782  bool transposeB, \
3783  SCALAR scalarB, \
3784  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3785 \
3786  template \
3787  void MatrixMatrix::Add( \
3788  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3789  bool transposeA, \
3790  SCALAR scalarA, \
3791  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3792  bool transposeB, \
3793  SCALAR scalarB, \
3794  const Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3795 \
3796  template \
3797  void MatrixMatrix::Add( \
3798  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3799  bool transposeA, \
3800  SCALAR scalarA, \
3801  CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3802  SCALAR scalarB ); \
3803 \
3804  template \
3805  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3806  MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3807  (const SCALAR & alpha, \
3808  const bool transposeA, \
3809  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3810  const SCALAR & beta, \
3811  const bool transposeB, \
3812  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3813  const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3814  const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3815  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3816 \
3817  template \
3818  void \
3819  MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3820  (const SCALAR & alpha, \
3821  const bool transposeA, \
3822  const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3823  const SCALAR& beta, \
3824  const bool transposeB, \
3825  const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3826  CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3827  const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3828  const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3829  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3830 \
3831  template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>; \
3832 \
3833  template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const CrsMatrix<SCALAR, LO, GO, NODE>& M, \
3834  Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3835  CrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3836  Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3837  bool userAssertsThereAreNoRemotes, \
3838  const std::string& label, \
3839  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3840 \
3841  template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const BlockCrsMatrix<SCALAR, LO, GO, NODE>& M, \
3842  Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
3843  BlockCrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3844  Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
3845  bool userAssertsThereAreNoRemotes);
3846 } //End namespace Tpetra
3847 
3848 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix&#39;s column Map with the given Map.
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void scale(const Scalar &alpha)
Scale the matrix&#39;s values: this := alpha*this.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isFillActive() const
Whether the matrix is not fill complete.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Struct that holds views of the contents of a BlockCrsMatrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void start()
Start the deep_copy counter.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Matrix Market file readers and writers for Tpetra objects.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void removeCrsMatrixZeros(CrsMatrixType &matrix, typename Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitudeType const &threshold=Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitude(Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::zero()))
Remove zero entries from a matrix.
Forward declaration of some Tpetra Matrix Matrix objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.