Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
11 #define TPETRA_MATRIXMATRIX_DEF_HPP
13 #include "KokkosSparse_Utils.hpp"
14 #include "Tpetra_ConfigDefs.hpp"
16 #include "Teuchos_VerboseObject.hpp"
17 #include "Teuchos_Array.hpp"
18 #include "Tpetra_Util.hpp"
19 #include "Tpetra_CrsMatrix.hpp"
20 #include "Tpetra_BlockCrsMatrix.hpp"
22 #include "Tpetra_RowMatrixTransposer.hpp"
25 #include "Tpetra_Details_makeColMap.hpp"
26 #include "Tpetra_ConfigDefs.hpp"
27 #include "Tpetra_Map.hpp"
28 #include "Tpetra_Export.hpp"
29 #include "Tpetra_Import_Util.hpp"
30 #include "Tpetra_Import_Util2.hpp"
32 
33 #include <algorithm>
34 #include <type_traits>
35 #include "Teuchos_FancyOStream.hpp"
36 
37 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
39 
40 #include "KokkosSparse_spgemm.hpp"
41 #include "KokkosSparse_spadd.hpp"
42 #include "Kokkos_Bitset.hpp"
43 
44 #include <MatrixMarket_Tpetra.hpp>
45 
51 /*********************************************************************************************************/
52 // Include the architecture-specific kernel partial specializations here
53 // NOTE: This needs to be outside all namespaces
54 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
55 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
56 #include "TpetraExt_MatrixMatrix_HIP.hpp"
57 #include "TpetraExt_MatrixMatrix_SYCL.hpp"
58 
59 namespace Tpetra {
60 
61 namespace MatrixMatrix {
62 
63 //
64 // This method forms the matrix-matrix product C = op(A) * op(B), where
65 // op(A) == A if transposeA is false,
66 // op(A) == A^T if transposeA is true,
67 // and similarly for op(B).
68 //
69 template <class Scalar,
70  class LocalOrdinal,
71  class GlobalOrdinal,
72  class Node>
73 void Multiply(
75  bool transposeA,
77  bool transposeB,
79  bool call_FillComplete_on_result,
80  const std::string& label,
81  const Teuchos::RCP<Teuchos::ParameterList>& params) {
82  using Teuchos::null;
83  using Teuchos::RCP;
84  using Teuchos::rcp;
85  typedef Scalar SC;
86  typedef LocalOrdinal LO;
87  typedef GlobalOrdinal GO;
88  typedef Node NO;
89  typedef CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
90  typedef Import<LO, GO, NO> import_type;
91  typedef CrsMatrixStruct<SC, LO, GO, NO> crs_matrix_struct_type;
92  typedef Map<LO, GO, NO> map_type;
93  typedef RowMatrixTransposer<SC, LO, GO, NO> transposer_type;
94 
95  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: All Setup"));
96 
97  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
98 
99  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
100 
101  // The input matrices A and B must both be fillComplete.
102  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
103  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
104 
105  // If transposeA is true, then Aprime will be the transpose of A
106  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
107  // will just be a pointer to A.
108  RCP<const crs_matrix_type> Aprime = null;
109  // If transposeB is true, then Bprime will be the transpose of B
110  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
111  // will just be a pointer to B.
112  RCP<const crs_matrix_type> Bprime = null;
113 
114  // Is this a "clean" matrix?
115  //
116  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
117  // locally nor globally indexed, then it was empty. I don't like
118  // this, because the most straightforward implementation presumes
119  // lazy allocation of indices. However, historical precedent
120  // demands that we keep around this predicate as a way to test
121  // whether the matrix is empty.
122  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
123 
124  bool use_optimized_ATB = false;
125  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
126  use_optimized_ATB = true;
127 
128 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
129  use_optimized_ATB = false;
130 #endif
131 
132  using Teuchos::ParameterList;
133  RCP<ParameterList> transposeParams(new ParameterList);
134  transposeParams->set("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
135 
136  if (!use_optimized_ATB && transposeA) {
137  transposer_type transposer(rcpFromRef(A));
138  Aprime = transposer.createTranspose(transposeParams);
139  } else {
140  Aprime = rcpFromRef(A);
141  }
142 
143  if (transposeB) {
144  transposer_type transposer(rcpFromRef(B));
145  Bprime = transposer.createTranspose(transposeParams);
146  } else {
147  Bprime = rcpFromRef(B);
148  }
149 
150  // Check size compatibility
151  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
152  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
153  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
154  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
155  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
156  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
157  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
158  prefix << "ERROR, inner dimensions of op(A) and op(B) "
159  "must match for matrix-matrix product. op(A) is "
160  << Aouter << "x" << Ainner << ", op(B) is " << Binner << "x" << Bouter);
161 
162  // The result matrix C must at least have a row-map that reflects the correct
163  // row-size. Don't check the number of columns because rectangular matrices
164  // which were constructed with only one map can still end up having the
165  // correct capacity and dimensions when filled.
166  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
167  prefix << "ERROR, dimensions of result C must "
168  "match dimensions of op(A) * op(B). C has "
169  << C.getGlobalNumRows()
170  << " rows, should have at least " << Aouter << std::endl);
171 
172  // It doesn't matter whether C is already Filled or not. If it is already
173  // Filled, it must have space allocated for the positions that will be
174  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
175  // we'll error out later when trying to store result values.
176 
177  // CGB: However, matrix must be in active-fill
178  if (!C.isFillActive()) C.resumeFill();
179 
180  // We're going to need to import remotely-owned sections of A and/or B if
181  // more than one processor is performing this run, depending on the scenario.
182  int numProcs = A.getComm()->getSize();
183 
184  // Declare a couple of structs that will be used to hold views of the data
185  // of A and B, to be used for fast access during the matrix-multiplication.
186  crs_matrix_struct_type Aview;
187  crs_matrix_struct_type Bview;
188 
189  RCP<const map_type> targetMap_A = Aprime->getRowMap();
190  RCP<const map_type> targetMap_B = Bprime->getRowMap();
191 
192  {
193  Tpetra::Details::ProfilingRegion r("TpetraExt: MMM: All I&X");
194 
195  // Now import any needed remote rows and populate the Aview struct
196  // NOTE: We assert that an import isn't needed --- since we do the transpose
197  // above to handle that.
198  if (!use_optimized_ATB) {
199  RCP<const import_type> dummyImporter;
200  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
201  }
202 
203  // We will also need local access to all rows of B that correspond to the
204  // column-map of op(A).
205  if (numProcs > 1)
206  targetMap_B = Aprime->getColMap();
207 
208  // Import any needed remote rows and populate the Bview struct.
209  if (!use_optimized_ATB)
210  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
211 
212  } // stop MM_importExtract here
213 
214  // stop the setup timer, and start the multiply timer
215  MM = Teuchos::null;
216  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: All Multiply"));
217 
218  // Call the appropriate method to perform the actual multiplication.
219  if (use_optimized_ATB) {
220  MMdetails::mult_AT_B_newmatrix(A, B, C, label, params);
221 
222  } else if (call_FillComplete_on_result && newFlag) {
223  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label, params);
224 
225  } else if (call_FillComplete_on_result) {
226  MMdetails::mult_A_B_reuse(Aview, Bview, C, label, params);
227 
228  } else {
229  // mfh 27 Sep 2016: Is this the "slow" case? This
230  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
231  // thread-parallel inserts, but that may take some effort.
232  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
233 
234  MMdetails::mult_A_B(Aview, Bview, crsmat, label, params);
235  }
236 
237  MM = Teuchos::null;
238  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: All FillComplete"));
239  if (call_FillComplete_on_result && !C.isFillComplete()) {
240  // We'll call FillComplete on the C matrix before we exit, and give it a
241  // domain-map and a range-map.
242  // The domain-map will be the domain-map of B, unless
243  // op(B)==transpose(B), in which case the range-map of B will be used.
244  // The range-map will be the range-map of A, unless op(A)==transpose(A),
245  // in which case the domain-map of A will be used.
246  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
247  }
248 }
249 
250 //
251 // This method forms the matrix-matrix product C = op(A) * op(B), where
252 // op(A) == A and similarly for op(B). op(A) = A^T is not yet implemented.
253 //
254 template <class Scalar,
255  class LocalOrdinal,
256  class GlobalOrdinal,
257  class Node>
258 void Multiply(
260  bool transposeA,
262  bool transposeB,
264  const std::string& label) {
265  using Teuchos::null;
266  using Teuchos::RCP;
267  using Teuchos::rcp;
268  typedef Scalar SC;
269  typedef LocalOrdinal LO;
270  typedef GlobalOrdinal GO;
271  typedef Node NO;
272  typedef BlockCrsMatrixStruct<SC, LO, GO, NO> blockcrs_matrix_struct_type;
273  typedef Map<LO, GO, NO> map_type;
274  typedef Import<LO, GO, NO> import_type;
275 
276  std::string prefix = std::string("TpetraExt ") + label + std::string(": ");
277 
278  TEUCHOS_TEST_FOR_EXCEPTION(transposeA == true, std::runtime_error, prefix << "Matrix A cannot be transposed.");
279  TEUCHOS_TEST_FOR_EXCEPTION(transposeB == true, std::runtime_error, prefix << "Matrix B cannot be transposed.");
280 
281  // Check size compatibility
282  global_size_t numACols = A->getGlobalNumCols();
283  global_size_t numBCols = B->getGlobalNumCols();
284  global_size_t numARows = A->getGlobalNumRows();
285  global_size_t numBRows = B->getGlobalNumRows();
286 
287  global_size_t Aouter = numARows;
288  global_size_t Bouter = numBCols;
289  global_size_t Ainner = numACols;
290  global_size_t Binner = numBRows;
291  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
292  prefix << "ERROR, inner dimensions of op(A) and op(B) "
293  "must match for matrix-matrix product. op(A) is "
294  << Aouter << "x" << Ainner << ", op(B) is " << Binner << "x" << Bouter);
295 
296  // We're going to need to import remotely-owned sections of A and/or B if
297  // more than one processor is performing this run, depending on the scenario.
298  int numProcs = A->getComm()->getSize();
299 
300  const LO blocksize = A->getBlockSize();
301  TEUCHOS_TEST_FOR_EXCEPTION(blocksize != B->getBlockSize(), std::runtime_error,
302  prefix << "ERROR, Blocksizes do not match. A.blocksize = " << blocksize << ", B.blocksize = " << B->getBlockSize());
303 
304  // Declare a couple of structs that will be used to hold views of the data
305  // of A and B, to be used for fast access during the matrix-multiplication.
306  blockcrs_matrix_struct_type Aview(blocksize);
307  blockcrs_matrix_struct_type Bview(blocksize);
308 
309  RCP<const map_type> targetMap_A = A->getRowMap();
310  RCP<const map_type> targetMap_B = B->getRowMap();
311 
312  // Populate the Aview struct. No remotes are needed.
313  RCP<const import_type> dummyImporter;
314  MMdetails::import_and_extract_views(*A, targetMap_A, Aview, dummyImporter, true);
315 
316  // We will also need local access to all rows of B that correspond to the
317  // column-map of op(A).
318  if (numProcs > 1)
319  targetMap_B = A->getColMap();
320 
321  // Import any needed remote rows and populate the Bview struct.
322  MMdetails::import_and_extract_views(*B, targetMap_B, Bview, A->getGraph()->getImporter(),
323  A->getGraph()->getImporter().is_null());
324 
325  // Call the appropriate method to perform the actual multiplication.
326  MMdetails::mult_A_B_newmatrix(Aview, Bview, C);
327 }
328 
329 template <class Scalar,
330  class LocalOrdinal,
331  class GlobalOrdinal,
332  class Node>
333 void Jacobi(Scalar omega,
338  bool call_FillComplete_on_result,
339  const std::string& label,
340  const Teuchos::RCP<Teuchos::ParameterList>& params) {
341  using Teuchos::RCP;
342  using Teuchos::rcp;
343  typedef Scalar SC;
344  typedef LocalOrdinal LO;
345  typedef GlobalOrdinal GO;
346  typedef Node NO;
347  typedef Import<LO, GO, NO> import_type;
348  typedef CrsMatrixStruct<SC, LO, GO, NO> crs_matrix_struct_type;
349  typedef Map<LO, GO, NO> map_type;
350  typedef CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
351 
352  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi: All Setup"));
353 
354  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
355 
356  // A and B should already be Filled.
357  // Should we go ahead and call FillComplete() on them if necessary or error
358  // out? For now, we choose to error out.
359  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
360  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
361 
362  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
363  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
364 
365  // Now check size compatibility
366  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
367  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
368  global_size_t Aouter = A.getGlobalNumRows();
369  global_size_t Bouter = numBCols;
370  global_size_t Ainner = numACols;
371  global_size_t Binner = B.getGlobalNumRows();
372  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
373  prefix << "ERROR, inner dimensions of op(A) and op(B) "
374  "must match for matrix-matrix product. op(A) is "
375  << Aouter << "x" << Ainner << ", op(B) is " << Binner << "x" << Bouter);
376 
377  // The result matrix C must at least have a row-map that reflects the correct
378  // row-size. Don't check the number of columns because rectangular matrices
379  // which were constructed with only one map can still end up having the
380  // correct capacity and dimensions when filled.
381  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
382  prefix << "ERROR, dimensions of result C must "
383  "match dimensions of op(A) * op(B). C has "
384  << C.getGlobalNumRows()
385  << " rows, should have at least " << Aouter << std::endl);
386 
387  // It doesn't matter whether C is already Filled or not. If it is already
388  // Filled, it must have space allocated for the positions that will be
389  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
390  // we'll error out later when trying to store result values.
391 
392  // CGB: However, matrix must be in active-fill
393  TEUCHOS_TEST_FOR_EXCEPT(C.isFillActive() == false);
394 
395  // We're going to need to import remotely-owned sections of A and/or B if
396  // more than one processor is performing this run, depending on the scenario.
397  int numProcs = A.getComm()->getSize();
398 
399  // Declare a couple of structs that will be used to hold views of the data of
400  // A and B, to be used for fast access during the matrix-multiplication.
401  crs_matrix_struct_type Aview;
402  crs_matrix_struct_type Bview;
403 
404  RCP<const map_type> targetMap_A = Aprime->getRowMap();
405  RCP<const map_type> targetMap_B = Bprime->getRowMap();
406 
407  {
408  Tpetra::Details::ProfilingRegion r("TpetraExt: Jacobi: All I&X");
409  // Enable globalConstants by default
410  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
411  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
412  if (!params.is_null()) {
413  importParams1->set("compute global constants", params->get("compute global constants: temporaries", false));
414  int mm_optimization_core_count = 0;
415  auto slist = params->sublist("matrixmatrix: kernel params", false);
416  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount", ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount());
417  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
418  if (mm_optimization_core_count2 < mm_optimization_core_count) mm_optimization_core_count = mm_optimization_core_count2;
419  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete", false);
420  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
421  auto& ip1slist = importParams1->sublist("matrixmatrix: kernel params", false);
422  ip1slist.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
423  ip1slist.set("isMatrixMatrix_TransferAndFillComplete", isMM);
424  ip1slist.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
425  }
426 
427  // Now import any needed remote rows and populate the Aview struct.
428  RCP<const import_type> dummyImporter;
429  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, importParams1);
430 
431  // We will also need local access to all rows of B that correspond to the
432  // column-map of op(A).
433  if (numProcs > 1)
434  targetMap_B = Aprime->getColMap();
435 
436  // Now import any needed remote rows and populate the Bview struct.
437  // Enable globalConstants by default
438  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
439  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
440  if (!params.is_null()) {
441  importParams2->set("compute global constants", params->get("compute global constants: temporaries", false));
442 
443  auto slist = params->sublist("matrixmatrix: kernel params", false);
444  int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount", ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount());
445  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete", false);
446  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
447  auto& ip2slist = importParams2->sublist("matrixmatrix: kernel params", false);
448  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
449  if (mm_optimization_core_count2 < mm_optimization_core_count) mm_optimization_core_count = mm_optimization_core_count2;
450  ip2slist.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
451  ip2slist.set("isMatrixMatrix_TransferAndFillComplete", isMM);
452  ip2slist.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
453  }
454 
455  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, importParams2);
456  }
457  MM = Teuchos::null;
458  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: Jacobi All Multiply"));
459 
460  // Now call the appropriate method to perform the actual multiplication.
461  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
462 
463  // Is this a "clean" matrix
464  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
465 
466  if (call_FillComplete_on_result && newFlag) {
467  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
468 
469  } else if (call_FillComplete_on_result) {
470  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
471 
472  } else {
473  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
474  }
475 
476  if (!params.is_null()) {
477  bool removeZeroEntries = params->get("remove zeros", false);
478  if (removeZeroEntries) {
479  typedef Teuchos::ScalarTraits<Scalar> STS;
480  typename STS::magnitudeType threshold = params->get("remove zeros threshold", STS::magnitude(STS::zero()));
481  removeCrsMatrixZeros(C, threshold);
482  }
483  }
484 }
485 
486 template <class Scalar,
487  class LocalOrdinal,
488  class GlobalOrdinal,
489  class Node>
490 void Add(
492  bool transposeA,
493  Scalar scalarA,
495  Scalar scalarB) {
496  using Teuchos::Array;
497  using Teuchos::null;
498  using Teuchos::RCP;
499  typedef Scalar SC;
500  typedef LocalOrdinal LO;
501  typedef GlobalOrdinal GO;
502  typedef Node NO;
503  typedef CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
504  typedef RowMatrixTransposer<SC, LO, GO, NO> transposer_type;
505 
506  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
507 
508  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
509  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
510  "(Result matrix B is not required to be isFillComplete()).");
511  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete(), std::runtime_error,
512  prefix << "ERROR, input matrix B must not be fill complete!");
513  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph(), std::runtime_error,
514  prefix << "ERROR, input matrix B must not have static graph!");
515  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed(), std::runtime_error,
516  prefix << "ERROR, input matrix B must not be locally indexed!");
517 
518  using Teuchos::ParameterList;
519  RCP<ParameterList> transposeParams(new ParameterList);
520  transposeParams->set("sort", false);
521 
522  RCP<const crs_matrix_type> Aprime = null;
523  if (transposeA) {
524  transposer_type transposer(rcpFromRef(A));
525  Aprime = transposer.createTranspose(transposeParams);
526  } else {
527  Aprime = rcpFromRef(A);
528  }
529 
530  size_t a_numEntries;
531  typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds", A.getLocalMaxNumRowEntries());
532  typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals", A.getLocalMaxNumRowEntries());
533  GO row;
534 
535  if (scalarB != Teuchos::ScalarTraits<SC>::one())
536  B.scale(scalarB);
537 
538  size_t numMyRows = B.getLocalNumRows();
539  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
540  for (LO i = 0; (size_t)i < numMyRows; ++i) {
541  row = B.getRowMap()->getGlobalElement(i);
542  Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
543 
544  if (scalarA != Teuchos::ScalarTraits<SC>::one()) {
545  for (size_t j = 0; j < a_numEntries; ++j)
546  a_vals[j] *= scalarA;
547  }
548  B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar*>(a_vals.data()), a_inds.data());
549  }
550  }
551 }
552 
553 template <class Scalar,
554  class LocalOrdinal,
555  class GlobalOrdinal,
556  class Node>
557 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>
558 add(const Scalar& alpha,
559  const bool transposeA,
561  const Scalar& beta,
562  const bool transposeB,
564  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>& domainMap,
565  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>& rangeMap,
566  const Teuchos::RCP<Teuchos::ParameterList>& params) {
567  using Teuchos::ParameterList;
568  using Teuchos::RCP;
569  using Teuchos::rcp;
570  using Teuchos::rcpFromRef;
572  if (!params.is_null()) {
573  TEUCHOS_TEST_FOR_EXCEPTION(
574  params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
575  std::invalid_argument,
576  "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
577  "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
578  params->set("Call fillComplete", true);
579  }
580  // If transposeB, must compute B's explicit transpose to
581  // get the correct row map for C.
582  RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
583  if (transposeB) {
584  RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer(Brcp);
585  Brcp = transposer.createTranspose();
586  }
587  // Check that A,B are fillComplete before getting B's column map
588  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete() || !Brcp->isFillComplete(), std::invalid_argument,
589  "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
590  RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
591  // this version of add() always fill completes the result, no matter what is in params on input
592  add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
593  return C;
594 }
595 
596 // This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
597 // but since the spadd() output is always packed there is no need for a separate
598 // numRowEntries here.
599 //
600 template <class LO, class GO, class LOView, class GOView, class LocalMap>
601 struct ConvertGlobalToLocalFunctor {
602  ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
603  : lids(lids_)
604  , gids(gids_)
605  , localColMap(localColMap_) {}
606 
607  KOKKOS_FUNCTION void operator()(const GO i) const {
608  lids(i) = localColMap.getLocalElement(gids(i));
609  }
610 
611  LOView lids;
612  const GOView gids;
613  const LocalMap localColMap;
614 };
615 
616 template <class Scalar,
617  class LocalOrdinal,
618  class GlobalOrdinal,
619  class Node>
620 void add(const Scalar& alpha,
621  const bool transposeA,
622  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
623  const Scalar& beta,
624  const bool transposeB,
625  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
626  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
627  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>& domainMap,
628  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>>& rangeMap,
629  const Teuchos::RCP<Teuchos::ParameterList>& params) {
630  using Teuchos::RCP;
631  using Teuchos::rcp;
632  using Teuchos::rcp_dynamic_cast;
633  using Teuchos::rcp_implicit_cast;
634  using Teuchos::rcpFromRef;
635  using Teuchos::TimeMonitor;
636  using SC = Scalar;
637  using LO = LocalOrdinal;
638  using GO = GlobalOrdinal;
639  using NO = Node;
640  using crs_matrix_type = CrsMatrix<SC, LO, GO, NO>;
641  using crs_graph_type = CrsGraph<LO, GO, NO>;
642  using map_type = Map<LO, GO, NO>;
643  using transposer_type = RowMatrixTransposer<SC, LO, GO, NO>;
644  using import_type = Import<LO, GO, NO>;
645  using export_type = Export<LO, GO, NO>;
646  using exec_space = typename crs_graph_type::execution_space;
647  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
648  const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
649  constexpr bool debug = false;
650 
651  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: Transpose"));
652 
653  if (debug) {
654  std::ostringstream os;
655  os << "Proc " << A.getMap()->getComm()->getRank() << ": "
656  << "TpetraExt::MatrixMatrix::add" << std::endl;
657  std::cerr << os.str();
658  }
659 
660  TEUCHOS_TEST_FOR_EXCEPTION(C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
661  prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
662  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete() || !B.isFillComplete(), std::invalid_argument,
663  prefix_mmm << "A and B must both be fill complete.");
664 #ifdef HAVE_TPETRA_DEBUG
665  // The matrices don't have domain or range Maps unless they are fill complete.
666  if (A.isFillComplete() && B.isFillComplete()) {
667  const bool domainMapsSame =
668  (!transposeA && !transposeB &&
669  !A.getDomainMap()->locallySameAs(*B.getDomainMap())) ||
670  (!transposeA && transposeB &&
671  !A.getDomainMap()->isSameAs(*B.getRangeMap())) ||
672  (transposeA && !transposeB &&
673  !A.getRangeMap()->isSameAs(*B.getDomainMap()));
674  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
675  prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
676 
677  const bool rangeMapsSame =
678  (!transposeA && !transposeB &&
679  !A.getRangeMap()->isSameAs(*B.getRangeMap())) ||
680  (!transposeA && transposeB &&
681  !A.getRangeMap()->isSameAs(*B.getDomainMap())) ||
682  (transposeA && !transposeB &&
683  !A.getDomainMap()->isSameAs(*B.getRangeMap()));
684  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
685  prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
686  }
687 #endif // HAVE_TPETRA_DEBUG
688 
689  using Teuchos::ParameterList;
690  // Form the explicit transpose of A if necessary.
691  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
692  if (transposeA) {
693  transposer_type transposer(Aprime);
694  Aprime = transposer.createTranspose();
695  }
696 
697 #ifdef HAVE_TPETRA_DEBUG
698  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null(), std::logic_error,
699  prefix_mmm << "Failed to compute Op(A). "
700  "Please report this bug to the Tpetra developers.");
701 #endif // HAVE_TPETRA_DEBUG
702 
703  // Form the explicit transpose of B if necessary.
704  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
705  if (transposeB) {
706  if (debug) {
707  std::ostringstream os;
708  os << "Proc " << A.getMap()->getComm()->getRank() << ": "
709  << "Form explicit xpose of B" << std::endl;
710  std::cerr << os.str();
711  }
712  transposer_type transposer(Bprime);
713  Bprime = transposer.createTranspose();
714  }
715 #ifdef HAVE_TPETRA_DEBUG
716  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null(), std::logic_error,
717  prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
718  TEUCHOS_TEST_FOR_EXCEPTION(
719  !Aprime->isFillComplete() || !Bprime->isFillComplete(), std::invalid_argument,
720  prefix_mmm << "Aprime and Bprime must both be fill complete. "
721  "Please report this bug to the Tpetra developers.");
722 #endif // HAVE_TPETRA_DEBUG
723  RCP<const map_type> CDomainMap = domainMap;
724  RCP<const map_type> CRangeMap = rangeMap;
725  if (CDomainMap.is_null()) {
726  CDomainMap = Bprime->getDomainMap();
727  }
728  if (CRangeMap.is_null()) {
729  CRangeMap = Bprime->getRangeMap();
730  }
731  assert(!(CDomainMap.is_null()));
732  assert(!(CRangeMap.is_null()));
733  typedef typename AddKern::values_array values_array;
734  typedef typename AddKern::row_ptrs_array row_ptrs_array;
735  typedef typename AddKern::col_inds_array col_inds_array;
736  bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
737  bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
738  values_array vals;
739  row_ptrs_array rowptrs;
740  col_inds_array colinds;
741 
742  MM = Teuchos::null;
743  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: rowmap check/import"));
744 
745  if (!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap())))) {
746  // import Aprime into Bprime's row map so the local matrices have same # of rows
747  auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
748  // cbl do not set
749  // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
750  // this import _may_ take the form of a transfer. In practice it would be unlikely,
751  // but the general case is not so forgiving.
752  Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
753  }
754  bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
755  bool sorted = AGraphSorted && BGraphSorted;
756  RCP<const import_type> Cimport = Teuchos::null;
757  RCP<export_type> Cexport = Teuchos::null;
758  bool doFillComplete = true;
759  if (Teuchos::nonnull(params) && params->isParameter("Call fillComplete")) {
760  doFillComplete = params->get<bool>("Call fillComplete");
761  }
762  auto Alocal = Aprime->getLocalMatrixDevice();
763  auto Blocal = Bprime->getLocalMatrixDevice();
764  LO numLocalRows = Alocal.numRows();
765  if (numLocalRows == 0) {
766  // KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
767  // but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
768  // Handle this case now
769  //(without interfering with collective operations, since it's possible for
770  // some ranks to have 0 local rows and others not).
771  rowptrs = row_ptrs_array("C rowptrs", 0);
772  }
773  auto Acolmap = Aprime->getColMap();
774  auto Bcolmap = Bprime->getColMap();
775  if (!matchingColMaps) {
776  using global_col_inds_array = typename AddKern::global_col_inds_array;
777  MM = Teuchos::null;
778  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: mismatched col map full kernel"));
779 
780  // use kernel that converts col indices in both A and B to common domain map before adding
781  auto AlocalColmap = Acolmap->getLocalMap();
782  auto BlocalColmap = Bcolmap->getLocalMap();
783  global_col_inds_array globalColinds;
784  if (debug) {
785  std::ostringstream os;
786  os << "Proc " << A.getMap()->getComm()->getRank() << ": "
787  << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
788  std::cerr << os.str();
789  }
790  AddKern::convertToGlobalAndAdd(
791  Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
792  vals, rowptrs, globalColinds);
793  if (debug) {
794  std::ostringstream os;
795  os << "Proc " << A.getMap()->getComm()->getRank() << ": "
796  << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
797  std::cerr << os.str();
798  }
799  MM = Teuchos::null;
800  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: Constructing graph"));
801 
802  RCP<const map_type> CcolMap;
803  Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>(CcolMap, CDomainMap, globalColinds);
804  C.replaceColMap(CcolMap);
805  col_inds_array localColinds("C colinds", globalColinds.extent(0));
806  Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
807  ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
808  col_inds_array, global_col_inds_array,
809  typename map_type::local_map_type>(localColinds, globalColinds, CcolMap->getLocalMap()));
810  KokkosSparse::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
811  C.setAllValues(rowptrs, localColinds, vals);
812  C.fillComplete(CDomainMap, CRangeMap, params);
813  if (!doFillComplete)
814  C.resumeFill();
815  } else {
816  // Aprime, Bprime and C all have the same column maps
817  auto Avals = Alocal.values;
818  auto Bvals = Blocal.values;
819  auto Arowptrs = Alocal.graph.row_map;
820  auto Browptrs = Blocal.graph.row_map;
821  auto Acolinds = Alocal.graph.entries;
822  auto Bcolinds = Blocal.graph.entries;
823  if (sorted) {
824  // use sorted kernel
825  MM = Teuchos::null;
826  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: sorted entries full kernel"));
827  if (debug) {
828  std::ostringstream os;
829  os << "Proc " << A.getMap()->getComm()->getRank() << ": "
830  << "Call AddKern::addSorted(...)" << std::endl;
831  std::cerr << os.str();
832  }
833  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
834  } else {
835  // use unsorted kernel
836  MM = Teuchos::null;
837  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: unsorted entries full kernel"));
838 
839  if (debug) {
840  std::ostringstream os;
841  os << "Proc " << A.getMap()->getComm()->getRank() << ": "
842  << "Call AddKern::addUnsorted(...)" << std::endl;
843  std::cerr << os.str();
844  }
845  AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
846  }
847  // Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
848  RCP<const map_type> Ccolmap = Bcolmap;
849  C.replaceColMap(Ccolmap);
850  C.setAllValues(rowptrs, colinds, vals);
851  if (doFillComplete) {
852  MM = Teuchos::null;
853  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: expertStaticFillComplete"));
854  if (!CDomainMap->isSameAs(*Ccolmap)) {
855  if (debug) {
856  std::ostringstream os;
857  os << "Proc " << A.getMap()->getComm()->getRank() << ": "
858  << "Create Cimport" << std::endl;
859  std::cerr << os.str();
860  }
861  Cimport = rcp(new import_type(CDomainMap, Ccolmap));
862  }
863  if (!C.getRowMap()->isSameAs(*CRangeMap)) {
864  if (debug) {
865  std::ostringstream os;
866  os << "Proc " << A.getMap()->getComm()->getRank() << ": "
867  << "Create Cexport" << std::endl;
868  std::cerr << os.str();
869  }
870  Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
871  }
872 
873  if (debug) {
874  std::ostringstream os;
875  os << "Proc " << A.getMap()->getComm()->getRank() << ": "
876  << "Call C->expertStaticFillComplete(...)" << std::endl;
877  std::cerr << os.str();
878  }
879  C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
880  }
881  }
882 }
883 
884 // This version of Add takes C as RCP&, so C may be null on input (in this case,
885 // it is allocated and constructed in this function).
886 template <class Scalar,
887  class LocalOrdinal,
888  class GlobalOrdinal,
889  class Node>
890 void Add(
891  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
892  bool transposeA,
893  Scalar scalarA,
894  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
895  bool transposeB,
896  Scalar scalarB,
897  Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>& C) {
898  using std::endl;
899  using Teuchos::Array;
900  using Teuchos::ArrayRCP;
901  using Teuchos::ArrayView;
902  using Teuchos::RCP;
903  using Teuchos::rcp;
904  using Teuchos::rcp_dynamic_cast;
905  using Teuchos::rcpFromRef;
906  using Teuchos::tuple;
907  // typedef typename ArrayView<const Scalar>::size_type size_type;
908  typedef Teuchos::ScalarTraits<Scalar> STS;
909  typedef Map<LocalOrdinal, GlobalOrdinal, Node> map_type;
910  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
911  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
912  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
913  typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crs_matrix_type;
914  typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
915 
916  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
917 
918  TEUCHOS_TEST_FOR_EXCEPTION(
919  !A.isFillComplete() || !B.isFillComplete(), std::invalid_argument,
920  prefix << "A and B must both be fill complete before calling this function.");
921 
922  if (C.is_null()) {
923  TEUCHOS_TEST_FOR_EXCEPTION(!A.haveGlobalConstants(), std::logic_error,
924  prefix << "C is null (must be allocated), but A.haveGlobalConstants() is false. "
925  "Please report this bug to the Tpetra developers.");
926  TEUCHOS_TEST_FOR_EXCEPTION(!B.haveGlobalConstants(), std::logic_error,
927  prefix << "C is null (must be allocated), but B.haveGlobalConstants() is false. "
928  "Please report this bug to the Tpetra developers.");
929  }
930 
931 #ifdef HAVE_TPETRA_DEBUG
932  {
933  const bool domainMapsSame =
934  (!transposeA && !transposeB && !A.getDomainMap()->isSameAs(*(B.getDomainMap()))) ||
935  (!transposeA && transposeB && !A.getDomainMap()->isSameAs(*(B.getRangeMap()))) ||
936  (transposeA && !transposeB && !A.getRangeMap()->isSameAs(*(B.getDomainMap())));
937  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
938  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
939 
940  const bool rangeMapsSame =
941  (!transposeA && !transposeB && !A.getRangeMap()->isSameAs(*(B.getRangeMap()))) ||
942  (!transposeA && transposeB && !A.getRangeMap()->isSameAs(*(B.getDomainMap()))) ||
943  (transposeA && !transposeB && !A.getDomainMap()->isSameAs(*(B.getRangeMap())));
944  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
945  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
946  }
947 #endif // HAVE_TPETRA_DEBUG
948 
949  using Teuchos::ParameterList;
950  RCP<ParameterList> transposeParams(new ParameterList);
951  transposeParams->set("sort", false);
952 
953  // Form the explicit transpose of A if necessary.
954  RCP<const crs_matrix_type> Aprime;
955  if (transposeA) {
956  transposer_type theTransposer(rcpFromRef(A));
957  Aprime = theTransposer.createTranspose(transposeParams);
958  } else {
959  Aprime = rcpFromRef(A);
960  }
961 
962 #ifdef HAVE_TPETRA_DEBUG
963  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null(), std::logic_error,
964  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
965 #endif // HAVE_TPETRA_DEBUG
966 
967  // Form the explicit transpose of B if necessary.
968  RCP<const crs_matrix_type> Bprime;
969  if (transposeB) {
970  transposer_type theTransposer(rcpFromRef(B));
971  Bprime = theTransposer.createTranspose(transposeParams);
972  } else {
973  Bprime = rcpFromRef(B);
974  }
975 
976 #ifdef HAVE_TPETRA_DEBUG
977  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null(), std::logic_error,
978  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
979 #endif // HAVE_TPETRA_DEBUG
980 
981  bool CwasFillComplete = false;
982 
983  // Allocate or zero the entries of the result matrix.
984  if (!C.is_null()) {
985  CwasFillComplete = C->isFillComplete();
986  if (CwasFillComplete)
987  C->resumeFill();
988  C->setAllToScalar(STS::zero());
989  } else {
990  // FIXME (mfh 08 May 2013) When I first looked at this method, I
991  // noticed that C was being given the row Map of Aprime (the
992  // possibly transposed version of A). Is this what we want?
993 
994  // It is a precondition that Aprime and Bprime have the same domain and range maps.
995  // However, they may have different row maps. In this case, it's difficult to
996  // get a precise upper bound on the number of entries in each local row of C, so
997  // just use the looser upper bound based on the max number of entries in any row of Aprime and Bprime.
998  if (Aprime->getRowMap()->isSameAs(*Bprime->getRowMap())) {
999  LocalOrdinal numLocalRows = Aprime->getLocalNumRows();
1000  Array<size_t> CmaxEntriesPerRow(numLocalRows);
1001  for (LocalOrdinal i = 0; i < numLocalRows; i++) {
1002  CmaxEntriesPerRow[i] = Aprime->getNumEntriesInLocalRow(i) + Bprime->getNumEntriesInLocalRow(i);
1003  }
1004  C = rcp(new crs_matrix_type(Aprime->getRowMap(), CmaxEntriesPerRow()));
1005  } else {
1006  // Note: above we checked that Aprime and Bprime have global constants, so it's safe to ask for max entries per row.
1007  C = rcp(new crs_matrix_type(Aprime->getRowMap(), Aprime->getGlobalMaxNumRowEntries() + Bprime->getGlobalMaxNumRowEntries()));
1008  }
1009  }
1010 
1011 #ifdef HAVE_TPETRA_DEBUG
1012  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null(), std::logic_error,
1013  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1014  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null(), std::logic_error,
1015  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1016  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null(), std::logic_error,
1017  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1018 #endif // HAVE_TPETRA_DEBUG
1019 
1020  Array<RCP<const crs_matrix_type>> Mat =
1021  tuple<RCP<const crs_matrix_type>>(Aprime, Bprime);
1022  Array<Scalar> scalar = tuple<Scalar>(scalarA, scalarB);
1023 
1024  // do a loop over each matrix to add: A reordering might be more efficient
1025  for (int k = 0; k < 2; ++k) {
1026  typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1027  typename crs_matrix_type::nonconst_values_host_view_type Values;
1028 
1029  // Loop over each locally owned row of the current matrix (either
1030  // Aprime or Bprime), and sum its entries into the corresponding
1031  // row of C. This works regardless of whether Aprime or Bprime
1032  // has the same row Map as C, because both sumIntoGlobalValues and
1033  // insertGlobalValues allow summing resp. inserting into nonowned
1034  // rows of C.
1035 #ifdef HAVE_TPETRA_DEBUG
1036  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null(), std::logic_error,
1037  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1038 #endif // HAVE_TPETRA_DEBUG
1039  RCP<const map_type> curRowMap = Mat[k]->getRowMap();
1040 #ifdef HAVE_TPETRA_DEBUG
1041  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null(), std::logic_error,
1042  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1043 #endif // HAVE_TPETRA_DEBUG
1044 
1045  const size_t localNumRows = Mat[k]->getLocalNumRows();
1046  for (size_t i = 0; i < localNumRows; ++i) {
1047  const GlobalOrdinal globalRow = curRowMap->getGlobalElement(i);
1048  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow(globalRow);
1049  if (numEntries > 0) {
1050  if (numEntries > Indices.extent(0)) {
1051  Kokkos::resize(Indices, numEntries);
1052  Kokkos::resize(Values, numEntries);
1053  }
1054  Mat[k]->getGlobalRowCopy(globalRow, Indices, Values, numEntries);
1055 
1056  if (scalar[k] != STS::one()) {
1057  for (size_t j = 0; j < numEntries; ++j) {
1058  Values[j] *= scalar[k];
1059  }
1060  }
1061 
1062  if (CwasFillComplete) {
1063  size_t result = C->sumIntoGlobalValues(globalRow, numEntries,
1064  reinterpret_cast<Scalar*>(Values.data()), Indices.data());
1065  TEUCHOS_TEST_FOR_EXCEPTION(result != numEntries, std::logic_error,
1066  prefix << "sumIntoGlobalValues failed to add entries from A or B into C.");
1067  } else {
1068  C->insertGlobalValues(globalRow, numEntries,
1069  reinterpret_cast<Scalar*>(Values.data()), Indices.data());
1070  }
1071  }
1072  }
1073  }
1074  if (CwasFillComplete) {
1075  C->fillComplete(C->getDomainMap(),
1076  C->getRangeMap());
1077  }
1078 }
1079 
1080 // This version of Add takes C as const RCP&, so C must not be null on input. Otherwise, its behavior is identical
1081 // to the above version where C is RCP&.
1082 template <class Scalar,
1083  class LocalOrdinal,
1084  class GlobalOrdinal,
1085  class Node>
1086 void Add(
1087  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1088  bool transposeA,
1089  Scalar scalarA,
1090  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1091  bool transposeB,
1092  Scalar scalarB,
1093  const Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>& C) {
1094  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
1095 
1096  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null(), std::invalid_argument,
1097  prefix << "C must not be null");
1098 
1099  Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>> C_ = C;
1100  Add(A, transposeA, scalarA, B, transposeB, scalarB, C_);
1101 }
1102 
1103 } // End namespace MatrixMatrix
1104 
1105 namespace MMdetails {
1106 
1107 /*********************************************************************************************************/
1109 // template <class TransferType>
1110 // void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1111 // if (Transfer.is_null())
1112 // return;
1113 //
1114 // const Distributor & Distor = Transfer->getDistributor();
1115 // Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1116 //
1117 // size_t rows_send = Transfer->getNumExportIDs();
1118 // size_t rows_recv = Transfer->getNumRemoteIDs();
1119 //
1120 // size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1121 // size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1122 // size_t num_send_neighbors = Distor.getNumSends();
1123 // size_t num_recv_neighbors = Distor.getNumReceives();
1124 // size_t round2_send, round2_recv;
1125 // Distor.getLastDoStatistics(round2_send,round2_recv);
1126 //
1127 // int myPID = Comm->getRank();
1128 // int NumProcs = Comm->getSize();
1129 //
1130 // // Processor by processor statistics
1131 // // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1132 // // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1133 //
1134 // // Global statistics
1135 // size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1136 // size_t gstats_min[8], gstats_max[8];
1137 //
1138 // double lstats_avg[8], gstats_avg[8];
1139 // for(int i=0; i<8; i++)
1140 // lstats_avg[i] = ((double)lstats[i])/NumProcs;
1141 //
1142 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1143 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1144 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1145 //
1146 // if(!myPID) {
1147 // printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1148 // (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1149 // (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1150 // printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1151 // (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1152 // (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1153 // }
1154 // }
1155 
1156 // Kernel method for computing the local portion of C = A*B
1157 template <class Scalar,
1158  class LocalOrdinal,
1159  class GlobalOrdinal,
1160  class Node>
1161 void mult_AT_B_newmatrix(
1162  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1163  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1164  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1165  const std::string& label,
1166  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1167  using Teuchos::RCP;
1168  using Teuchos::rcp;
1169  typedef Scalar SC;
1170  typedef LocalOrdinal LO;
1171  typedef GlobalOrdinal GO;
1172  typedef Node NO;
1173  typedef CrsMatrixStruct<SC, LO, GO, NO> crs_matrix_struct_type;
1174  typedef RowMatrixTransposer<SC, LO, GO, NO> transposer_type;
1175 
1176  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: Transpose"));
1177 
1178  /*************************************************************/
1179  /* 1) Local Transpose of A */
1180  /*************************************************************/
1181  transposer_type transposer(rcpFromRef(A), label + std::string("XP: "));
1182 
1183  using Teuchos::ParameterList;
1184  RCP<ParameterList> transposeParams(new ParameterList);
1185  transposeParams->set("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
1186  if (!params.is_null()) {
1187  transposeParams->set("compute global constants",
1188  params->get("compute global constants: temporaries",
1189  false));
1190  }
1191  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1192  transposer.createTransposeLocal(transposeParams);
1193 
1194  /*************************************************************/
1195  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1196  /*************************************************************/
1197  MM = Teuchos::null;
1198  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: I&X"));
1199 
1200  // Get views, asserting that no import is required to speed up computation
1201  crs_matrix_struct_type Aview;
1202  crs_matrix_struct_type Bview;
1203  RCP<const Import<LO, GO, NO>> dummyImporter;
1204 
1205  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1206  RCP<Teuchos::ParameterList> importParams1(new ParameterList);
1207  if (!params.is_null()) {
1208  importParams1->set("compute global constants",
1209  params->get("compute global constants: temporaries",
1210  false));
1211  auto slist = params->sublist("matrixmatrix: kernel params", false);
1212  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete", false);
1213  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1214  int mm_optimization_core_count =
1216  mm_optimization_core_count =
1217  slist.get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1218  int mm_optimization_core_count2 =
1219  params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1220  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1221  mm_optimization_core_count = mm_optimization_core_count2;
1222  }
1223  auto& sip1 = importParams1->sublist("matrixmatrix: kernel params", false);
1224  sip1.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1225  sip1.set("isMatrixMatrix_TransferAndFillComplete", isMM);
1226  sip1.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1227  }
1228 
1229  MMdetails::import_and_extract_views(*Atrans, Atrans->getRowMap(),
1230  Aview, dummyImporter, true,
1231  label, importParams1);
1232 
1233  RCP<ParameterList> importParams2(new ParameterList);
1234  if (!params.is_null()) {
1235  importParams2->set("compute global constants",
1236  params->get("compute global constants: temporaries",
1237  false));
1238  auto slist = params->sublist("matrixmatrix: kernel params", false);
1239  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete", false);
1240  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1241  int mm_optimization_core_count =
1243  mm_optimization_core_count =
1244  slist.get("MM_TAFC_OptimizationCoreCount",
1245  mm_optimization_core_count);
1246  int mm_optimization_core_count2 =
1247  params->get("MM_TAFC_OptimizationCoreCount",
1248  mm_optimization_core_count);
1249  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1250  mm_optimization_core_count = mm_optimization_core_count2;
1251  }
1252  auto& sip2 = importParams2->sublist("matrixmatrix: kernel params", false);
1253  sip2.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1254  sip2.set("isMatrixMatrix_TransferAndFillComplete", isMM);
1255  sip2.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1256  }
1257 
1258  if (B.getRowMap()->isSameAs(*Atrans->getColMap())) {
1259  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter, true, label, importParams2);
1260  } else {
1261  MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter, false, label, importParams2);
1262  }
1263 
1264  MM = Teuchos::null;
1265  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: AB-core"));
1266 
1267  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1268 
1269  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1270  bool needs_final_export = !Atrans->getGraph()->getExporter().is_null();
1271  if (needs_final_export) {
1272  Ctemp = rcp(new Tpetra::CrsMatrix<SC, LO, GO, NO>(Atrans->getRowMap(), 0));
1273  } else {
1274  Ctemp = rcp(&C, false);
1275  }
1276 
1277  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label, params);
1278 
1279  /*************************************************************/
1280  /* 4) exportAndFillComplete matrix */
1281  /*************************************************************/
1282  MM = Teuchos::null;
1283  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM-T: exportAndFillComplete"));
1284 
1285  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp(&C, false);
1286 
1287  if (needs_final_export) {
1288  ParameterList labelList;
1289  labelList.set("Timer Label", label);
1290  if (!params.is_null()) {
1291  ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params", false);
1292  ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params", false);
1293  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1294  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1295  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1296  if (mm_optimization_core_count2 < mm_optimization_core_count) mm_optimization_core_count = mm_optimization_core_count2;
1297  labelList_subList.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count, "Core Count above which the optimized neighbor discovery is used");
1298  bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete", false);
1299  bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck", false);
1300 
1301  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete", isMM,
1302  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1303  labelList.set("compute global constants", params->get("compute global constants", true));
1304  labelList.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1305  }
1306  Ctemp->exportAndFillComplete(Crcp,
1307  *Ctemp->getGraph()->getExporter(),
1308  B.getDomainMap(),
1309  A.getDomainMap(),
1310  rcp(&labelList, false));
1311  }
1312 #ifdef HAVE_TPETRA_MMM_STATISTICS
1313  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label + std::string(" AT_B MMM"));
1314 #endif
1315 }
1316 
1317 /*********************************************************************************************************/
1318 // Kernel method for computing the local portion of C = A*B
1319 template <class Scalar,
1320  class LocalOrdinal,
1321  class GlobalOrdinal,
1322  class Node>
1323 void mult_A_B(
1324  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1325  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1326  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1327  const std::string& /* label */,
1328  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
1329  using Teuchos::Array;
1330  using Teuchos::ArrayRCP;
1331  using Teuchos::ArrayView;
1332  using Teuchos::null;
1333  using Teuchos::OrdinalTraits;
1334 
1335  typedef Teuchos::ScalarTraits<Scalar> STS;
1336  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1337  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1338  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1339 
1340  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1341  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1342 
1343  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getLocalElementList();
1344  ArrayView<const GlobalOrdinal> bcols_import = null;
1345  if (Bview.importColMap != null) {
1346  C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1347  C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1348 
1349  bcols_import = Bview.importColMap->getLocalElementList();
1350  }
1351 
1352  size_t C_numCols = C_lastCol - C_firstCol +
1353  OrdinalTraits<LocalOrdinal>::one();
1354  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1355  OrdinalTraits<LocalOrdinal>::one();
1356 
1357  if (C_numCols_import > C_numCols)
1358  C_numCols = C_numCols_import;
1359 
1360  Array<Scalar> dwork = Array<Scalar>(C_numCols);
1361  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1362  Array<size_t> iwork2 = Array<size_t>(C_numCols);
1363 
1364  Array<Scalar> C_row_i = dwork;
1365  Array<GlobalOrdinal> C_cols = iwork;
1366  Array<size_t> c_index = iwork2;
1367  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2 * C_numCols);
1368  Array<Scalar> combined_values = Array<Scalar>(2 * C_numCols);
1369 
1370  size_t C_row_i_length, j, k, last_index;
1371 
1372  // Run through all the hash table lookups once and for all
1373  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1374  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getLocalNumElements(), LO_INVALID);
1375  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getLocalNumElements(), LO_INVALID);
1376  if (Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())) {
1377  // Maps are the same: Use local IDs as the hash
1378  for (LocalOrdinal i = Aview.colMap->getMinLocalIndex(); i <=
1379  Aview.colMap->getMaxLocalIndex();
1380  i++)
1381  Acol2Brow[i] = i;
1382  } else {
1383  // Maps are not the same: Use the map's hash
1384  for (LocalOrdinal i = Aview.colMap->getMinLocalIndex(); i <=
1385  Aview.colMap->getMaxLocalIndex();
1386  i++) {
1387  GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1388  LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1389  if (BLID != LO_INVALID)
1390  Acol2Brow[i] = BLID;
1391  else
1392  Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1393  }
1394  }
1395 
1396  // To form C = A*B we're going to execute this expression:
1397  //
1398  // C(i,j) = sum_k( A(i,k)*B(k,j) )
1399  //
1400  // Our goal, of course, is to navigate the data in A and B once, without
1401  // performing searches for column-indices, etc.
1402  auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1403  auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1404  auto Avals = Aview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1405  auto Browptr = Bview.origMatrix->getLocalRowPtrsHost();
1406  auto Bcolind = Bview.origMatrix->getLocalIndicesHost();
1407  auto Bvals = Bview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1408  decltype(Browptr) Irowptr;
1409  decltype(Bcolind) Icolind;
1410  decltype(Bvals) Ivals;
1411  if (!Bview.importMatrix.is_null()) {
1412  Irowptr = Bview.importMatrix->getLocalRowPtrsHost();
1413  Icolind = Bview.importMatrix->getLocalIndicesHost();
1414  Ivals = Bview.importMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1415  }
1416 
1417  bool C_filled = C.isFillComplete();
1418 
1419  for (size_t i = 0; i < C_numCols; i++)
1420  c_index[i] = OrdinalTraits<size_t>::invalid();
1421 
1422  // Loop over the rows of A.
1423  size_t Arows = Aview.rowMap->getLocalNumElements();
1424  for (size_t i = 0; i < Arows; ++i) {
1425  // Only navigate the local portion of Aview... which is, thankfully, all of
1426  // A since this routine doesn't do transpose modes
1427  GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1428 
1429  // Loop across the i-th row of A and for each corresponding row in B, loop
1430  // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1431  // quantities C_row_i. In other words, as we stride across B(k,:) we're
1432  // calculating updates for row i of the result matrix C.
1433  C_row_i_length = OrdinalTraits<size_t>::zero();
1434 
1435  for (k = Arowptr[i]; k < Arowptr[i + 1]; ++k) {
1436  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1437  const Scalar Aval = Avals[k];
1438  if (Aval == STS::zero())
1439  continue;
1440 
1441  if (Ak == LO_INVALID)
1442  continue;
1443 
1444  for (j = Browptr[Ak]; j < Browptr[Ak + 1]; ++j) {
1445  LocalOrdinal col = Bcolind[j];
1446  // assert(col >= 0 && col < C_numCols);
1447 
1448  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1449  // assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1450  // This has to be a += so insertGlobalValue goes out
1451  C_row_i[C_row_i_length] = Aval * Bvals[j];
1452  C_cols[C_row_i_length] = col;
1453  c_index[col] = C_row_i_length;
1454  C_row_i_length++;
1455 
1456  } else {
1457  // static cast from impl_scalar_type to Scalar needed for complex
1458  C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Bvals[j]);
1459  }
1460  }
1461  }
1462 
1463  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1464  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1465  C_cols[ii] = bcols[C_cols[ii]];
1466  combined_index[ii] = C_cols[ii];
1467  combined_values[ii] = C_row_i[ii];
1468  }
1469  last_index = C_row_i_length;
1470 
1471  //
1472  // Now put the C_row_i values into C.
1473  //
1474  // We might have to revamp this later.
1475  C_row_i_length = OrdinalTraits<size_t>::zero();
1476 
1477  for (k = Arowptr[i]; k < Arowptr[i + 1]; ++k) {
1478  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1479  const Scalar Aval = Avals[k];
1480  if (Aval == STS::zero())
1481  continue;
1482 
1483  if (Ak != LO_INVALID) continue;
1484 
1485  Ak = Acol2Irow[Acolind[k]];
1486  for (j = Irowptr[Ak]; j < Irowptr[Ak + 1]; ++j) {
1487  LocalOrdinal col = Icolind[j];
1488  // assert(col >= 0 && col < C_numCols);
1489 
1490  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1491  // assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1492  // This has to be a += so insertGlobalValue goes out
1493  C_row_i[C_row_i_length] = Aval * Ivals[j];
1494  C_cols[C_row_i_length] = col;
1495  c_index[col] = C_row_i_length;
1496  C_row_i_length++;
1497 
1498  } else {
1499  // This has to be a += so insertGlobalValue goes out
1500  // static cast from impl_scalar_type to Scalar needed for complex
1501  C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Ivals[j]);
1502  }
1503  }
1504  }
1505 
1506  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1507  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1508  C_cols[ii] = bcols_import[C_cols[ii]];
1509  combined_index[last_index] = C_cols[ii];
1510  combined_values[last_index] = C_row_i[ii];
1511  last_index++;
1512  }
1513 
1514  // Now put the C_row_i values into C.
1515  // We might have to revamp this later.
1516  C_filled ? C.sumIntoGlobalValues(
1517  global_row,
1518  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1519  combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1520  : C.insertGlobalValues(
1521  global_row,
1522  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1523  combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1524  }
1525 }
1526 
1527 /*********************************************************************************************************/
1528 template <class Scalar,
1529  class LocalOrdinal,
1530  class GlobalOrdinal,
1531  class Node>
1532 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1533  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal>>::size_type local_length_size;
1534  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1535 
1536  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1537  Mview.maxNumRowEntries = Mview.indices[0].size();
1538 
1539  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1540  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1541  Mview.maxNumRowEntries = Mview.indices[i].size();
1542  }
1543 }
1544 
1545 /*********************************************************************************************************/
1546 template <class CrsMatrixType>
1547 size_t C_estimate_nnz(CrsMatrixType& A, CrsMatrixType& B) {
1548  // Follows the NZ estimate in ML's ml_matmatmult.c
1549  size_t Aest = 100, Best = 100;
1550  if (A.getLocalNumEntries() >= A.getLocalNumRows())
1551  Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries() / A.getLocalNumRows() : 100;
1552  if (B.getLocalNumEntries() >= B.getLocalNumRows())
1553  Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries() / B.getLocalNumRows() : 100;
1554 
1555  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1556  nnzperrow *= nnzperrow;
1557 
1558  return (size_t)(A.getLocalNumRows() * nnzperrow * 0.75 + 100);
1559 }
1560 
1561 /*********************************************************************************************************/
1562 // Kernel method for computing the local portion of C = A*B for CrsMatrix
1563 //
1564 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1565 // function, so this is probably the function we want to
1566 // thread-parallelize.
1567 template <class Scalar,
1568  class LocalOrdinal,
1569  class GlobalOrdinal,
1570  class Node>
1571 void mult_A_B_newmatrix(
1572  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1573  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1574  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1575  const std::string& label,
1576  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1577  using Teuchos::Array;
1578  using Teuchos::ArrayRCP;
1579  using Teuchos::ArrayView;
1580  using Teuchos::RCP;
1581  using Teuchos::rcp;
1582 
1583  // Tpetra typedefs
1584  typedef LocalOrdinal LO;
1585  typedef GlobalOrdinal GO;
1586  typedef Node NO;
1587  typedef Import<LO, GO, NO> import_type;
1588  typedef Map<LO, GO, NO> map_type;
1589 
1590  // Kokkos typedefs
1591  typedef typename map_type::local_map_type local_map_type;
1593  typedef typename KCRS::StaticCrsGraphType graph_t;
1594  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1595  typedef typename NO::execution_space execution_space;
1596  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1597  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1598 
1599  Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: M5 Cmap");
1600 
1601  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1602 
1603  // Build the final importer / column map, hash table lookups for C
1604  RCP<const import_type> Cimport;
1605  RCP<const map_type> Ccolmap;
1606  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1607  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ? Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1608  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1609  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1610  local_map_type Irowmap_local;
1611  if (!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1612  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1613  local_map_type Icolmap_local;
1614  if (!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1615 
1616  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1617  // indices of B, to local column indices of C. (B and C have the
1618  // same number of columns.) The kernel uses this, instead of
1619  // copying the entire input matrix B and converting its column
1620  // indices to those of C.
1621  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Bview.colMap->getLocalNumElements()), Icol2Ccol;
1622 
1623  if (Bview.importMatrix.is_null()) {
1624  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1625  Cimport = Bimport;
1626  Ccolmap = Bview.colMap;
1627  const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1628  // Bcol2Ccol is trivial
1629  Kokkos::parallel_for(
1630  "Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1631  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1632  KOKKOS_LAMBDA(const LO i) {
1633  Bcol2Ccol(i) = i;
1634  });
1635  } else {
1636  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1637  // column Map of C, as well as C's Import object (from its domain
1638  // Map to its column Map). C's column Map is the union of the
1639  // column Maps of (the local part of) B, and the "remote" part of
1640  // B. Ditto for the Import. We have optimized this "setUnion"
1641  // operation on Import objects and Maps.
1642 
1643  // Choose the right variant of setUnion
1644  if (!Bimport.is_null() && !Iimport.is_null()) {
1645  Cimport = Bimport->setUnion(*Iimport);
1646  } else if (!Bimport.is_null() && Iimport.is_null()) {
1647  Cimport = Bimport->setUnion();
1648  } else if (Bimport.is_null() && !Iimport.is_null()) {
1649  Cimport = Iimport->setUnion();
1650  } else {
1651  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1652  }
1653  Ccolmap = Cimport->getTargetMap();
1654 
1655  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1656  // in general. We should get rid of it in order to reduce
1657  // communication costs of sparse matrix-matrix multiply.
1658  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1659  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1660 
1661  // NOTE: This is not efficient and should be folded into setUnion
1662  //
1663  // mfh 27 Sep 2016: What the above comment means, is that the
1664  // setUnion operation on Import objects could also compute these
1665  // local index - to - local index look-up tables.
1666  Kokkos::resize(Icol2Ccol, Bview.importMatrix->getColMap()->getLocalNumElements());
1667  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1668  Kokkos::parallel_for(
1669  "Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement", range_type(0, Bview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
1670  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1671  });
1672  Kokkos::parallel_for(
1673  "Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement", range_type(0, Bview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
1674  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1675  });
1676  }
1677 
1678  // Replace the column map
1679  //
1680  // mfh 27 Sep 2016: We do this because C was originally created
1681  // without a column Map. Now we have its column Map.
1682  C.replaceColMap(Ccolmap);
1683 
1684  // mfh 27 Sep 2016: Construct tables that map from local column
1685  // indices of A, to local row indices of either B_local (the locally
1686  // owned part of B), or B_remote (the "imported" remote part of B).
1687  //
1688  // For column index Aik in row i of A, if the corresponding row of B
1689  // exists in the local part of B ("orig") (which I'll call B_local),
1690  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1691  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1692  //
1693  // For column index Aik in row i of A, if the corresponding row of B
1694  // exists in the remote part of B ("Import") (which I'll call
1695  // B_remote), then targetMapToImportRow[Aik] is the local index of
1696  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1697  // (a flag value).
1698 
1699  // Run through all the hash table lookups once and for all
1700  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
1701  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
1702 
1703  Kokkos::parallel_for(
1704  "Tpetra::mult_A_B_newmatrix::construct_tables", range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
1705  GO aidx = Acolmap_local.getGlobalElement(i);
1706  LO B_LID = Browmap_local.getLocalElement(aidx);
1707  if (B_LID != LO_INVALID) {
1708  targetMapToOrigRow(i) = B_LID;
1709  targetMapToImportRow(i) = LO_INVALID;
1710  } else {
1711  LO I_LID = Irowmap_local.getLocalElement(aidx);
1712  targetMapToOrigRow(i) = LO_INVALID;
1713  targetMapToImportRow(i) = I_LID;
1714  }
1715  });
1716 
1717  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1718  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1719  KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview, Bview, targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
1720 }
1721 
1722 /*********************************************************************************************************/
1723 // Kernel method for computing the local portion of C = A*B for BlockCrsMatrix
1724 template <class Scalar,
1725  class LocalOrdinal,
1726  class GlobalOrdinal,
1727  class Node>
1728 void mult_A_B_newmatrix(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1729  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1730  Teuchos::RCP<BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>>& C) {
1731  using Teuchos::Array;
1732  using Teuchos::ArrayRCP;
1733  using Teuchos::ArrayView;
1734  using Teuchos::null;
1735  using Teuchos::RCP;
1736  using Teuchos::rcp;
1737 
1738  // Tpetra typedefs
1739  typedef LocalOrdinal LO;
1740  typedef GlobalOrdinal GO;
1741  typedef Node NO;
1742  typedef Import<LO, GO, NO> import_type;
1743  typedef Map<LO, GO, NO> map_type;
1744  typedef BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> block_crs_matrix_type;
1745  typedef typename block_crs_matrix_type::crs_graph_type graph_t;
1746 
1747  // Kokkos typedefs
1748  typedef typename map_type::local_map_type local_map_type;
1749  typedef typename block_crs_matrix_type::local_matrix_device_type KBSR;
1750  typedef typename KBSR::device_type device_t;
1751  typedef typename KBSR::StaticCrsGraphType static_graph_t;
1752  typedef typename static_graph_t::row_map_type::non_const_type lno_view_t;
1753  typedef typename static_graph_t::entries_type::non_const_type lno_nnz_view_t;
1754  typedef typename KBSR::values_type::non_const_type scalar_view_t;
1755  typedef typename NO::execution_space execution_space;
1756  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1757  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1758 
1759  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1760 
1761  // Build the final importer / column map, hash table lookups for C
1762  RCP<const import_type> Cimport;
1763  RCP<const map_type> Ccolmap;
1764  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1765  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ? Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1766  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1767  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1768  local_map_type Irowmap_local;
1769  if (!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1770  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1771  local_map_type Icolmap_local;
1772  if (!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1773 
1774  // Bcol2Ccol is a table that maps from local column
1775  // indices of B, to local column indices of C. (B and C have the
1776  // same number of columns.) The kernel uses this, instead of
1777  // copying the entire input matrix B and converting its column
1778  // indices to those of C.
1779  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Bview.colMap->getLocalNumElements()), Icol2Ccol;
1780 
1781  if (Bview.importMatrix.is_null()) {
1782  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1783  Cimport = Bimport;
1784  Ccolmap = Bview.colMap;
1785  const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1786  // Bcol2Ccol is trivial
1787  Kokkos::parallel_for(
1788  "Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1789  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1790  KOKKOS_LAMBDA(const LO i) {
1791  Bcol2Ccol(i) = i;
1792  });
1793  } else {
1794  // B has "remotes," so we need to build the
1795  // column Map of C, as well as C's Import object (from its domain
1796  // Map to its column Map). C's column Map is the union of the
1797  // column Maps of (the local part of) B, and the "remote" part of
1798  // B. Ditto for the Import. We have optimized this "setUnion"
1799  // operation on Import objects and Maps.
1800 
1801  // Choose the right variant of setUnion
1802  if (!Bimport.is_null() && !Iimport.is_null()) {
1803  Cimport = Bimport->setUnion(*Iimport);
1804  } else if (!Bimport.is_null() && Iimport.is_null()) {
1805  Cimport = Bimport->setUnion();
1806  } else if (Bimport.is_null() && !Iimport.is_null()) {
1807  Cimport = Iimport->setUnion();
1808  } else {
1809  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1810  }
1811  Ccolmap = Cimport->getTargetMap();
1812 
1813  // NOTE: This is not efficient and should be folded into setUnion
1814  //
1815  // What the above comment means, is that the
1816  // setUnion operation on Import objects could also compute these
1817  // local index - to - local index look-up tables.
1818  Kokkos::resize(Icol2Ccol, Bview.importMatrix->getColMap()->getLocalNumElements());
1819  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1820  Kokkos::parallel_for(
1821  "Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",
1822  range_type(0, Bview.origMatrix->getColMap()->getLocalNumElements()),
1823  KOKKOS_LAMBDA(const LO i) {
1824  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1825  });
1826  Kokkos::parallel_for(
1827  "Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",
1828  range_type(0, Bview.importMatrix->getColMap()->getLocalNumElements()),
1829  KOKKOS_LAMBDA(const LO i) {
1830  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1831  });
1832  }
1833 
1834  // Construct tables that map from local column
1835  // indices of A, to local row indices of either B_local (the locally
1836  // owned part of B), or B_remote (the "imported" remote part of B).
1837  //
1838  // For column index Aik in row i of A, if the corresponding row of B
1839  // exists in the local part of B ("orig") (which I'll call B_local),
1840  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1841  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1842  //
1843  // For column index Aik in row i of A, if the corresponding row of B
1844  // exists in the remote part of B ("Import") (which I'll call
1845  // B_remote), then targetMapToImportRow[Aik] is the local index of
1846  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1847  // (a flag value).
1848 
1849  // Run through all the hash table lookups once and for all
1850  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),
1851  Aview.colMap->getLocalNumElements());
1852  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),
1853  Aview.colMap->getLocalNumElements());
1854 
1855  Kokkos::parallel_for(
1856  "Tpetra::mult_A_B_newmatrix::construct_tables",
1857  range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1),
1858  KOKKOS_LAMBDA(const LO i) {
1859  GO aidx = Acolmap_local.getGlobalElement(i);
1860  LO B_LID = Browmap_local.getLocalElement(aidx);
1861  if (B_LID != LO_INVALID) {
1862  targetMapToOrigRow(i) = B_LID;
1863  targetMapToImportRow(i) = LO_INVALID;
1864  } else {
1865  LO I_LID = Irowmap_local.getLocalElement(aidx);
1866  targetMapToOrigRow(i) = LO_INVALID;
1867  targetMapToImportRow(i) = I_LID;
1868  }
1869  });
1870 
1871  // Create the KernelHandle
1872  using KernelHandle =
1873  KokkosKernels::Experimental::KokkosKernelsHandle<typename lno_view_t::const_value_type,
1874  typename lno_nnz_view_t::const_value_type,
1875  typename scalar_view_t::const_value_type,
1876  typename device_t::execution_space,
1877  typename device_t::memory_space,
1878  typename device_t::memory_space>;
1879  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
1880  std::string myalg("SPGEMM_KK_MEMORY");
1881  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
1882 
1883  KernelHandle kh;
1884  kh.create_spgemm_handle(alg_enum);
1885  kh.set_team_work_size(team_work_size);
1886 
1887  // Get KokkosSparse::BsrMatrix for A and Bmerged (B and BImport)
1888  const KBSR Amat = Aview.origMatrix->getLocalMatrixDevice();
1889  const KBSR Bmerged = Tpetra::MMdetails::merge_matrices(Aview, Bview,
1890  targetMapToOrigRow, targetMapToImportRow,
1891  Bcol2Ccol, Icol2Ccol,
1892  Ccolmap.getConst()->getLocalNumElements());
1893 
1894  RCP<graph_t> graphC;
1895  typename KBSR::values_type values;
1896  {
1897  // Call KokkosSparse routines to calculate Amat*Bmerged on device.
1898  // NOTE: Need to scope guard this since the BlockCrs constructor will need to copy the host graph
1899  KBSR Cmat;
1900  KokkosSparse::block_spgemm_symbolic(kh, Amat, false, Bmerged, false, Cmat);
1901  KokkosSparse::block_spgemm_numeric(kh, Amat, false, Bmerged, false, Cmat);
1902  kh.destroy_spgemm_handle();
1903 
1904  // Build Tpetra::BlockCrsMatrix from KokkosSparse::BsrMatrix
1905  graphC = rcp(new graph_t(Cmat.graph, Aview.origMatrix->getRowMap(), Ccolmap.getConst()));
1906  values = Cmat.values;
1907  }
1908  C = rcp(new block_crs_matrix_type(*graphC, values, Aview.blocksize));
1909 }
1910 
1911 /*********************************************************************************************************/
1912 // AB NewMatrix Kernel wrappers (Default non-threaded version for CrsMatrix)
1913 template <class Scalar,
1914  class LocalOrdinal,
1915  class GlobalOrdinal,
1916  class Node,
1917  class LocalOrdinalViewType>
1918 void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1919  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1920  const LocalOrdinalViewType& targetMapToOrigRow,
1921  const LocalOrdinalViewType& targetMapToImportRow,
1922  const LocalOrdinalViewType& Bcol2Ccol,
1923  const LocalOrdinalViewType& Icol2Ccol,
1924  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1925  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> Cimport,
1926  const std::string& label,
1927  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1928  using Teuchos::Array;
1929  using Teuchos::ArrayRCP;
1930  using Teuchos::ArrayView;
1931  using Teuchos::RCP;
1932  using Teuchos::rcp;
1933 
1934  Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: Newmatrix SerialCore");
1935 
1936  // Lots and lots of typedefs
1937  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
1938  typedef typename KCRS::StaticCrsGraphType graph_t;
1939  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1940  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1941  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1942  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1943 
1944  typedef Scalar SC;
1945  typedef LocalOrdinal LO;
1946  typedef GlobalOrdinal GO;
1947  typedef Node NO;
1948  typedef Map<LO, GO, NO> map_type;
1949  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1950  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1951  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1952 
1953  // Sizes
1954  RCP<const map_type> Ccolmap = C.getColMap();
1955  size_t m = Aview.origMatrix->getLocalNumRows();
1956  size_t n = Ccolmap->getLocalNumElements();
1957  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
1958 
1959  // Grab the Kokkos::SparseCrsMatrices & inner stuff
1960  const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
1961  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixHost();
1962 
1963  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
1964  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
1965  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
1966 
1967  c_lno_view_t Irowptr;
1968  lno_nnz_view_t Icolind;
1969  scalar_view_t Ivals;
1970  if (!Bview.importMatrix.is_null()) {
1971  auto lclB = Bview.importMatrix->getLocalMatrixHost();
1972  Irowptr = lclB.graph.row_map;
1973  Icolind = lclB.graph.entries;
1974  Ivals = lclB.values;
1975  b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
1976  }
1977 
1978  // Classic csr assembly (low memory edition)
1979  //
1980  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
1981  // The method loops over rows of A, and may resize after processing
1982  // each row. Chris Siefert says that this reflects experience in
1983  // ML; for the non-threaded case, ML found it faster to spend less
1984  // effort on estimation and risk an occasional reallocation.
1985  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1986  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"), m + 1);
1987  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"), CSR_alloc);
1988  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"), CSR_alloc);
1989 
1990  // mfh 27 Sep 2016: The c_status array is an implementation detail
1991  // of the local sparse matrix-matrix multiply routine.
1992 
1993  // The status array will contain the index into colind where this entry was last deposited.
1994  // c_status[i] < CSR_ip - not in the row yet
1995  // c_status[i] >= CSR_ip - this is the entry where you can find the data
1996  // We start with this filled with INVALID's indicating that there are no entries yet.
1997  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1998  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1999  std::vector<size_t> c_status(n, ST_INVALID);
2000 
2001  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2002  // routine. The routine computes C := A * (B_local + B_remote).
2003  //
2004  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2005  // you whether the corresponding row of B belongs to B_local
2006  // ("orig") or B_remote ("Import").
2007 
2008  // For each row of A/C
2009  size_t CSR_ip = 0, OLD_ip = 0;
2010  for (size_t i = 0; i < m; i++) {
2011  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2012  // on the calling process.
2013  Crowptr[i] = CSR_ip;
2014 
2015  // mfh 27 Sep 2016: For each entry of A in the current row of A
2016  for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
2017  LO Aik = Acolind[k]; // local column index of current entry of A
2018  const SC Aval = Avals[k]; // value of current entry of A
2019  if (Aval == SC_ZERO)
2020  continue; // skip explicitly stored zero values in A
2021 
2022  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2023  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2024  // corresponding to the current entry of A is populated, then
2025  // the corresponding row of B is in B_local (i.e., it lives on
2026  // the calling process).
2027 
2028  // Local matrix
2029  size_t Bk = static_cast<size_t>(targetMapToOrigRow[Aik]);
2030 
2031  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
2032  for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
2033  LO Bkj = Bcolind[j];
2034  LO Cij = Bcol2Ccol[Bkj];
2035 
2036  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2037  // New entry
2038  c_status[Cij] = CSR_ip;
2039  Ccolind[CSR_ip] = Cij;
2040  Cvals[CSR_ip] = Aval * Bvals[j];
2041  CSR_ip++;
2042 
2043  } else {
2044  Cvals[c_status[Cij]] += Aval * Bvals[j];
2045  }
2046  }
2047 
2048  } else {
2049  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2050  // corresponding to the current entry of A NOT populated (has
2051  // a flag "invalid" value), then the corresponding row of B is
2052  // in B_local (i.e., it lives on the calling process).
2053 
2054  // Remote matrix
2055  size_t Ik = static_cast<size_t>(targetMapToImportRow[Aik]);
2056  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
2057  LO Ikj = Icolind[j];
2058  LO Cij = Icol2Ccol[Ikj];
2059 
2060  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2061  // New entry
2062  c_status[Cij] = CSR_ip;
2063  Ccolind[CSR_ip] = Cij;
2064  Cvals[CSR_ip] = Aval * Ivals[j];
2065  CSR_ip++;
2066  } else {
2067  Cvals[c_status[Cij]] += Aval * Ivals[j];
2068  }
2069  }
2070  }
2071  }
2072 
2073  // Resize for next pass if needed
2074  if (i + 1 < m && CSR_ip + std::min(n, (Arowptr[i + 2] - Arowptr[i + 1]) * b_max_nnz_per_row) > CSR_alloc) {
2075  CSR_alloc *= 2;
2076  Kokkos::resize(Ccolind, CSR_alloc);
2077  Kokkos::resize(Cvals, CSR_alloc);
2078  }
2079  OLD_ip = CSR_ip;
2080  }
2081 
2082  Crowptr[m] = CSR_ip;
2083 
2084  // Downward resize
2085  Kokkos::resize(Ccolind, CSR_ip);
2086  Kokkos::resize(Cvals, CSR_ip);
2087 
2088  {
2089  Tpetra::Details::ProfilingRegion MM3("TpetraExt: MMM: Newmatrix Final Sort");
2090 
2091  // Final sort & set of CRS arrays
2092  if (params.is_null() || params->get("sort entries", true))
2093  Import_Util::sortCrsEntries(Crowptr, Ccolind, Cvals);
2094  C.setAllValues(Crowptr, Ccolind, Cvals);
2095  }
2096 
2097  Tpetra::Details::ProfilingRegion MM4("TpetraExt: MMM: Newmatrix ESCC");
2098  {
2099  // Final FillComplete
2100  //
2101  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2102  // Import (from domain Map to column Map) construction (which costs
2103  // lots of communication) by taking the previously constructed
2104  // Import object. We should be able to do this without interfering
2105  // with the implementation of the local part of sparse matrix-matrix
2106  // multply above.
2107  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2108  labelList->set("Timer Label", label);
2109  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
2110  RCP<const Export<LO, GO, NO>> dummyExport;
2111  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
2112  }
2113 }
2114 /*********************************************************************************************************/
2115 // Kernel method for computing the local portion of C = A*B (reuse)
2116 template <class Scalar,
2117  class LocalOrdinal,
2118  class GlobalOrdinal,
2119  class Node>
2120 void mult_A_B_reuse(
2121  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2122  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2123  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2124  const std::string& label,
2125  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2126  using Teuchos::Array;
2127  using Teuchos::ArrayRCP;
2128  using Teuchos::ArrayView;
2129  using Teuchos::RCP;
2130  using Teuchos::rcp;
2131 
2132  // Tpetra typedefs
2133  typedef LocalOrdinal LO;
2134  typedef GlobalOrdinal GO;
2135  typedef Node NO;
2136  typedef Import<LO, GO, NO> import_type;
2137  typedef Map<LO, GO, NO> map_type;
2138 
2139  // Kokkos typedefs
2140  typedef typename map_type::local_map_type local_map_type;
2142  typedef typename KCRS::StaticCrsGraphType graph_t;
2143  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2144  typedef typename NO::execution_space execution_space;
2145  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2146  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2147 
2148  Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: Reuse Cmap");
2149  (void)label;
2150 
2151  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2152 
2153  // Grab all the maps
2154  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2155  RCP<const map_type> Ccolmap = C.getColMap();
2156  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2157  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2158  local_map_type Irowmap_local;
2159  if (!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2160  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2161  local_map_type Icolmap_local;
2162  if (!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2163  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2164 
2165  // Build the final importer / column map, hash table lookups for C
2166  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Bview.colMap->getLocalNumElements()), Icol2Ccol;
2167  {
2168  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2169  // So, column map of C may be a strict subset of the column map of B
2170  Kokkos::parallel_for(
2171  range_type(0, Bview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2172  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2173  });
2174 
2175  if (!Bview.importMatrix.is_null()) {
2176  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2177  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2178 
2179  Kokkos::resize(Icol2Ccol, Bview.importMatrix->getColMap()->getLocalNumElements());
2180  Kokkos::parallel_for(
2181  range_type(0, Bview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2182  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2183  });
2184  }
2185  }
2186 
2187  // Run through all the hash table lookups once and for all
2188  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
2189  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
2190  Kokkos::parallel_for(
2191  range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
2192  GO aidx = Acolmap_local.getGlobalElement(i);
2193  LO B_LID = Browmap_local.getLocalElement(aidx);
2194  if (B_LID != LO_INVALID) {
2195  targetMapToOrigRow(i) = B_LID;
2196  targetMapToImportRow(i) = LO_INVALID;
2197  } else {
2198  LO I_LID = Irowmap_local.getLocalElement(aidx);
2199  targetMapToOrigRow(i) = LO_INVALID;
2200  targetMapToImportRow(i) = I_LID;
2201  }
2202  });
2203 
2204  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2205  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2206  KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview, Bview, targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
2207 }
2208 
2209 /*********************************************************************************************************/
2210 template <class Scalar,
2211  class LocalOrdinal,
2212  class GlobalOrdinal,
2213  class Node,
2214  class LocalOrdinalViewType>
2215 void KernelWrappers<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2216  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2217  const LocalOrdinalViewType& targetMapToOrigRow,
2218  const LocalOrdinalViewType& targetMapToImportRow,
2219  const LocalOrdinalViewType& Bcol2Ccol,
2220  const LocalOrdinalViewType& Icol2Ccol,
2221  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2222  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> /* Cimport */,
2223  const std::string& label,
2224  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2225  using Teuchos::RCP;
2226  using Teuchos::rcp;
2227 
2228  // Lots and lots of typedefs
2229  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
2230  typedef typename KCRS::StaticCrsGraphType graph_t;
2231  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2232  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2233  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2234 
2235  typedef Scalar SC;
2236  typedef LocalOrdinal LO;
2237  typedef GlobalOrdinal GO;
2238  typedef Node NO;
2239  typedef Map<LO, GO, NO> map_type;
2240  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2241  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2242  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2243 
2244  Tpetra::Details::ProfilingRegion MM("TpetraExt: MMM: Reuse SerialCore");
2245  (void)label;
2246 
2247  // Sizes
2248  RCP<const map_type> Ccolmap = C.getColMap();
2249  size_t m = Aview.origMatrix->getLocalNumRows();
2250  size_t n = Ccolmap->getLocalNumElements();
2251 
2252  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2253  const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
2254  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixHost();
2255  const KCRS& Cmat = C.getLocalMatrixHost();
2256 
2257  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2258  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2259  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2260  scalar_view_t Cvals = Cmat.values;
2261 
2262  c_lno_view_t Irowptr;
2263  lno_nnz_view_t Icolind;
2264  scalar_view_t Ivals;
2265  if (!Bview.importMatrix.is_null()) {
2266  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2267  Irowptr = lclB.graph.row_map;
2268  Icolind = lclB.graph.entries;
2269  Ivals = lclB.values;
2270  }
2271 
2272  // Classic csr assembly (low memory edition)
2273  // mfh 27 Sep 2016: The c_status array is an implementation detail
2274  // of the local sparse matrix-matrix multiply routine.
2275 
2276  // The status array will contain the index into colind where this entry was last deposited.
2277  // c_status[i] < CSR_ip - not in the row yet
2278  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2279  // We start with this filled with INVALID's indicating that there are no entries yet.
2280  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2281  std::vector<size_t> c_status(n, ST_INVALID);
2282 
2283  // For each row of A/C
2284  size_t CSR_ip = 0, OLD_ip = 0;
2285  for (size_t i = 0; i < m; i++) {
2286  // First fill the c_status array w/ locations where we're allowed to
2287  // generate nonzeros for this row
2288  OLD_ip = Crowptr[i];
2289  CSR_ip = Crowptr[i + 1];
2290  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2291  c_status[Ccolind[k]] = k;
2292 
2293  // Reset values in the row of C
2294  Cvals[k] = SC_ZERO;
2295  }
2296 
2297  for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
2298  LO Aik = Acolind[k];
2299  const SC Aval = Avals[k];
2300  if (Aval == SC_ZERO)
2301  continue;
2302 
2303  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2304  // Local matrix
2305  size_t Bk = static_cast<size_t>(targetMapToOrigRow[Aik]);
2306 
2307  for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
2308  LO Bkj = Bcolind[j];
2309  LO Cij = Bcol2Ccol[Bkj];
2310 
2311  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2312  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
2313  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2314 
2315  Cvals[c_status[Cij]] += Aval * Bvals[j];
2316  }
2317 
2318  } else {
2319  // Remote matrix
2320  size_t Ik = static_cast<size_t>(targetMapToImportRow[Aik]);
2321  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
2322  LO Ikj = Icolind[j];
2323  LO Cij = Icol2Ccol[Ikj];
2324 
2325  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2326  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph "
2327  << "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2328 
2329  Cvals[c_status[Cij]] += Aval * Ivals[j];
2330  }
2331  }
2332  }
2333  }
2334 
2335  {
2336  Tpetra::Details::ProfilingRegion MM3("TpetraExt: MMM: Reuse ESFC");
2337  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2338  }
2339 }
2340 
2341 /*********************************************************************************************************/
2342 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2343 template <class Scalar,
2344  class LocalOrdinal,
2345  class GlobalOrdinal,
2346  class Node>
2347 void jacobi_A_B_newmatrix(
2348  Scalar omega,
2349  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
2350  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2351  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2352  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2353  const std::string& label,
2354  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2355  using Teuchos::Array;
2356  using Teuchos::ArrayRCP;
2357  using Teuchos::ArrayView;
2358  using Teuchos::RCP;
2359  using Teuchos::rcp;
2360  // typedef Scalar SC;
2361  typedef LocalOrdinal LO;
2362  typedef GlobalOrdinal GO;
2363  typedef Node NO;
2364 
2365  typedef Import<LO, GO, NO> import_type;
2366  typedef Map<LO, GO, NO> map_type;
2367  typedef typename map_type::local_map_type local_map_type;
2368 
2369  // All of the Kokkos typedefs
2371  typedef typename KCRS::StaticCrsGraphType graph_t;
2372  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2373  typedef typename NO::execution_space execution_space;
2374  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2375  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2376 
2377  Tpetra::Details::ProfilingRegion MM3("TpetraExt: Jacobi: M5 Cmap");
2378 
2379  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2380 
2381  // Build the final importer / column map, hash table lookups for C
2382  RCP<const import_type> Cimport;
2383  RCP<const map_type> Ccolmap;
2384  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2385  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ? Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2386  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2387  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2388  local_map_type Irowmap_local;
2389  if (!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2390  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2391  local_map_type Icolmap_local;
2392  if (!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2393 
2394  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2395  // indices of B, to local column indices of C. (B and C have the
2396  // same number of columns.) The kernel uses this, instead of
2397  // copying the entire input matrix B and converting its column
2398  // indices to those of C.
2399  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Bview.colMap->getLocalNumElements()), Icol2Ccol;
2400 
2401  if (Bview.importMatrix.is_null()) {
2402  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2403  Cimport = Bimport;
2404  Ccolmap = Bview.colMap;
2405  // Bcol2Ccol is trivial
2406  // Bcol2Ccol is trivial
2407 
2408  Kokkos::RangePolicy<execution_space, LO> range(0, static_cast<LO>(Bview.colMap->getLocalNumElements()));
2409  Kokkos::parallel_for(
2410  range, KOKKOS_LAMBDA(const size_t i) {
2411  Bcol2Ccol(i) = static_cast<LO>(i);
2412  });
2413  } else {
2414  // mfh 27 Sep 2016: B has "remotes," so we need to build the
2415  // column Map of C, as well as C's Import object (from its domain
2416  // Map to its column Map). C's column Map is the union of the
2417  // column Maps of (the local part of) B, and the "remote" part of
2418  // B. Ditto for the Import. We have optimized this "setUnion"
2419  // operation on Import objects and Maps.
2420 
2421  // Choose the right variant of setUnion
2422  if (!Bimport.is_null() && !Iimport.is_null()) {
2423  Cimport = Bimport->setUnion(*Iimport);
2424  Ccolmap = Cimport->getTargetMap();
2425 
2426  } else if (!Bimport.is_null() && Iimport.is_null()) {
2427  Cimport = Bimport->setUnion();
2428 
2429  } else if (Bimport.is_null() && !Iimport.is_null()) {
2430  Cimport = Iimport->setUnion();
2431 
2432  } else
2433  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2434 
2435  Ccolmap = Cimport->getTargetMap();
2436 
2437  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2438  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2439 
2440  // NOTE: This is not efficient and should be folded into setUnion
2441  //
2442  // mfh 27 Sep 2016: What the above comment means, is that the
2443  // setUnion operation on Import objects could also compute these
2444  // local index - to - local index look-up tables.
2445  Kokkos::resize(Icol2Ccol, Bview.importMatrix->getColMap()->getLocalNumElements());
2446  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2447  Kokkos::parallel_for(
2448  range_type(0, Bview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2449  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2450  });
2451  Kokkos::parallel_for(
2452  range_type(0, Bview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2453  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2454  });
2455  }
2456 
2457  // Replace the column map
2458  //
2459  // mfh 27 Sep 2016: We do this because C was originally created
2460  // without a column Map. Now we have its column Map.
2461  C.replaceColMap(Ccolmap);
2462 
2463  // mfh 27 Sep 2016: Construct tables that map from local column
2464  // indices of A, to local row indices of either B_local (the locally
2465  // owned part of B), or B_remote (the "imported" remote part of B).
2466  //
2467  // For column index Aik in row i of A, if the corresponding row of B
2468  // exists in the local part of B ("orig") (which I'll call B_local),
2469  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2470  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2471  //
2472  // For column index Aik in row i of A, if the corresponding row of B
2473  // exists in the remote part of B ("Import") (which I'll call
2474  // B_remote), then targetMapToImportRow[Aik] is the local index of
2475  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2476  // (a flag value).
2477 
2478  // Run through all the hash table lookups once and for all
2479  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
2480  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
2481  Kokkos::parallel_for(
2482  range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
2483  GO aidx = Acolmap_local.getGlobalElement(i);
2484  LO B_LID = Browmap_local.getLocalElement(aidx);
2485  if (B_LID != LO_INVALID) {
2486  targetMapToOrigRow(i) = B_LID;
2487  targetMapToImportRow(i) = LO_INVALID;
2488  } else {
2489  LO I_LID = Irowmap_local.getLocalElement(aidx);
2490  targetMapToOrigRow(i) = LO_INVALID;
2491  targetMapToImportRow(i) = I_LID;
2492  }
2493  });
2494 
2495  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2496  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2497  KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega, Dinv, Aview, Bview, targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
2498 }
2499 
2500 /*********************************************************************************************************/
2501 // Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2502 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2503 
2504 template <class Scalar,
2505  class LocalOrdinal,
2506  class GlobalOrdinal,
2507  class Node,
2508  class LocalOrdinalViewType>
2509 void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2510  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
2511  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2512  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2513  const LocalOrdinalViewType& targetMapToOrigRow,
2514  const LocalOrdinalViewType& targetMapToImportRow,
2515  const LocalOrdinalViewType& Bcol2Ccol,
2516  const LocalOrdinalViewType& Icol2Ccol,
2517  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2518  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> Cimport,
2519  const std::string& label,
2520  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2521  Tpetra::Details::ProfilingRegion MM("TpetraExt: Jacobi: Newmatrix SerialCore");
2522 
2523  using Teuchos::Array;
2524  using Teuchos::ArrayRCP;
2525  using Teuchos::ArrayView;
2526  using Teuchos::RCP;
2527  using Teuchos::rcp;
2528 
2529  // Lots and lots of typedefs
2530  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
2531  typedef typename KCRS::StaticCrsGraphType graph_t;
2532  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2533  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2534  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2535  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2536 
2537  // Jacobi-specific
2538  typedef typename scalar_view_t::memory_space scalar_memory_space;
2539 
2540  typedef Scalar SC;
2541  typedef LocalOrdinal LO;
2542  typedef GlobalOrdinal GO;
2543  typedef Node NO;
2544 
2545  typedef Map<LO, GO, NO> map_type;
2546  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2547  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2548 
2549  // Sizes
2550  RCP<const map_type> Ccolmap = C.getColMap();
2551  size_t m = Aview.origMatrix->getLocalNumRows();
2552  size_t n = Ccolmap->getLocalNumElements();
2553  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2554 
2555  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2556  const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
2557  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixHost();
2558 
2559  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2560  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2561  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2562 
2563  c_lno_view_t Irowptr;
2564  lno_nnz_view_t Icolind;
2565  scalar_view_t Ivals;
2566  if (!Bview.importMatrix.is_null()) {
2567  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2568  Irowptr = lclB.graph.row_map;
2569  Icolind = lclB.graph.entries;
2570  Ivals = lclB.values;
2571  b_max_nnz_per_row = std::max(b_max_nnz_per_row, Bview.importMatrix->getLocalMaxNumRowEntries());
2572  }
2573 
2574  // Jacobi-specific inner stuff
2575  auto Dvals =
2576  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2577 
2578  // Teuchos::ArrayView::operator[].
2579  // The status array will contain the index into colind where this entry was last deposited.
2580  // c_status[i] < CSR_ip - not in the row yet.
2581  // c_status[i] >= CSR_ip, this is the entry where you can find the data
2582  // We start with this filled with INVALID's indicating that there are no entries yet.
2583  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2584  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2585  Array<size_t> c_status(n, ST_INVALID);
2586 
2587  // Classic csr assembly (low memory edition)
2588  //
2589  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2590  // The method loops over rows of A, and may resize after processing
2591  // each row. Chris Siefert says that this reflects experience in
2592  // ML; for the non-threaded case, ML found it faster to spend less
2593  // effort on estimation and risk an occasional reallocation.
2594  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2595  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"), m + 1);
2596  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"), CSR_alloc);
2597  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"), CSR_alloc);
2598  size_t CSR_ip = 0, OLD_ip = 0;
2599 
2600  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2601 
2602  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2603  // routine. The routine computes
2604  //
2605  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2606  //
2607  // This corresponds to one sweep of (weighted) Jacobi.
2608  //
2609  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2610  // you whether the corresponding row of B belongs to B_local
2611  // ("orig") or B_remote ("Import").
2612 
2613  // For each row of A/C
2614  for (size_t i = 0; i < m; i++) {
2615  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2616  // on the calling process.
2617  Crowptr[i] = CSR_ip;
2618  SC minusOmegaDval = -omega * Dvals(i, 0);
2619 
2620  // Entries of B
2621  for (size_t j = Browptr[i]; j < Browptr[i + 1]; j++) {
2622  Scalar Bval = Bvals[j];
2623  if (Bval == SC_ZERO)
2624  continue;
2625  LO Bij = Bcolind[j];
2626  LO Cij = Bcol2Ccol[Bij];
2627 
2628  // Assume no repeated entries in B
2629  c_status[Cij] = CSR_ip;
2630  Ccolind[CSR_ip] = Cij;
2631  Cvals[CSR_ip] = Bvals[j];
2632  CSR_ip++;
2633  }
2634 
2635  // Entries of -omega * Dinv * A * B
2636  for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
2637  LO Aik = Acolind[k];
2638  const SC Aval = Avals[k];
2639  if (Aval == SC_ZERO)
2640  continue;
2641 
2642  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2643  // Local matrix
2644  size_t Bk = static_cast<size_t>(targetMapToOrigRow[Aik]);
2645 
2646  for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
2647  LO Bkj = Bcolind[j];
2648  LO Cij = Bcol2Ccol[Bkj];
2649 
2650  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2651  // New entry
2652  c_status[Cij] = CSR_ip;
2653  Ccolind[CSR_ip] = Cij;
2654  Cvals[CSR_ip] = minusOmegaDval * Aval * Bvals[j];
2655  CSR_ip++;
2656 
2657  } else {
2658  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
2659  }
2660  }
2661 
2662  } else {
2663  // Remote matrix
2664  size_t Ik = static_cast<size_t>(targetMapToImportRow[Aik]);
2665  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
2666  LO Ikj = Icolind[j];
2667  LO Cij = Icol2Ccol[Ikj];
2668 
2669  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2670  // New entry
2671  c_status[Cij] = CSR_ip;
2672  Ccolind[CSR_ip] = Cij;
2673  Cvals[CSR_ip] = minusOmegaDval * Aval * Ivals[j];
2674  CSR_ip++;
2675  } else {
2676  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
2677  }
2678  }
2679  }
2680  }
2681 
2682  // Resize for next pass if needed
2683  if (i + 1 < m && CSR_ip + std::min(n, (Arowptr[i + 2] - Arowptr[i + 1] + 1) * b_max_nnz_per_row) > CSR_alloc) {
2684  CSR_alloc *= 2;
2685  Kokkos::resize(Ccolind, CSR_alloc);
2686  Kokkos::resize(Cvals, CSR_alloc);
2687  }
2688  OLD_ip = CSR_ip;
2689  }
2690  Crowptr[m] = CSR_ip;
2691 
2692  // Downward resize
2693  Kokkos::resize(Ccolind, CSR_ip);
2694  Kokkos::resize(Cvals, CSR_ip);
2695 
2696  {
2697  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Newmatrix Final Sort");
2698 
2699  // Replace the column map
2700  //
2701  // mfh 27 Sep 2016: We do this because C was originally created
2702  // without a column Map. Now we have its column Map.
2703  C.replaceColMap(Ccolmap);
2704 
2705  // Final sort & set of CRS arrays
2706  //
2707  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2708  // matrix-matrix multiply routine sort the entries for us?
2709  // Final sort & set of CRS arrays
2710  if (params.is_null() || params->get("sort entries", true))
2711  Import_Util::sortCrsEntries(Crowptr, Ccolind, Cvals);
2712  C.setAllValues(Crowptr, Ccolind, Cvals);
2713  }
2714  {
2715  Tpetra::Details::ProfilingRegion MM3("TpetraExt: Jacobi: Newmatrix ESFC");
2716 
2717  // Final FillComplete
2718  //
2719  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2720  // Import (from domain Map to column Map) construction (which costs
2721  // lots of communication) by taking the previously constructed
2722  // Import object. We should be able to do this without interfering
2723  // with the implementation of the local part of sparse matrix-matrix
2724  // multply above
2725  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2726  labelList->set("Timer Label", label);
2727  if (!params.is_null()) labelList->set("compute global constants", params->get("compute global constants", true));
2728  RCP<const Export<LO, GO, NO>> dummyExport;
2729  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
2730  }
2731 }
2732 
2733 /*********************************************************************************************************/
2734 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2735 template <class Scalar,
2736  class LocalOrdinal,
2737  class GlobalOrdinal,
2738  class Node>
2739 void jacobi_A_B_reuse(
2740  Scalar omega,
2741  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
2742  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2743  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2744  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2745  const std::string& label,
2746  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2747  using Teuchos::Array;
2748  using Teuchos::ArrayRCP;
2749  using Teuchos::ArrayView;
2750  using Teuchos::RCP;
2751  using Teuchos::rcp;
2752 
2753  typedef LocalOrdinal LO;
2754  typedef GlobalOrdinal GO;
2755  typedef Node NO;
2756 
2757  typedef Import<LO, GO, NO> import_type;
2758  typedef Map<LO, GO, NO> map_type;
2759 
2760  // Kokkos typedefs
2761  typedef typename map_type::local_map_type local_map_type;
2763  typedef typename KCRS::StaticCrsGraphType graph_t;
2764  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2765  typedef typename NO::execution_space execution_space;
2766  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2767  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2768 
2769  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2770  lo_view_t Bcol2Ccol, Icol2Ccol;
2771  lo_view_t targetMapToOrigRow;
2772  lo_view_t targetMapToImportRow;
2773  {
2774  Tpetra::Details::ProfilingRegion MM("TpetraExt: Jacobi: Reuse Cmap");
2775 
2776  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2777 
2778  // Grab all the maps
2779  RCP<const map_type> Ccolmap = C.getColMap();
2780  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2781  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2782  local_map_type Irowmap_local;
2783  if (!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2784  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2785  local_map_type Icolmap_local;
2786  if (!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2787  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2788 
2789  // Build the final importer / column map, hash table lookups for C
2790  Bcol2Ccol = lo_view_t(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"), Bview.colMap->getLocalNumElements());
2791  {
2792  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2793  // So, column map of C may be a strict subset of the column map of B
2794  Kokkos::parallel_for(
2795  range_type(0, Bview.origMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2796  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2797  });
2798 
2799  if (!Bview.importMatrix.is_null()) {
2800  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2801  std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2802 
2803  Kokkos::resize(Icol2Ccol, Bview.importMatrix->getColMap()->getLocalNumElements());
2804  Kokkos::parallel_for(
2805  range_type(0, Bview.importMatrix->getColMap()->getLocalNumElements()), KOKKOS_LAMBDA(const LO i) {
2806  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2807  });
2808  }
2809  }
2810 
2811  // Run through all the hash table lookups once and for all
2812  targetMapToOrigRow = lo_view_t(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"), Aview.colMap->getLocalNumElements());
2813  targetMapToImportRow = lo_view_t(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"), Aview.colMap->getLocalNumElements());
2814  Kokkos::parallel_for(
2815  range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex() + 1), KOKKOS_LAMBDA(const LO i) {
2816  GO aidx = Acolmap_local.getGlobalElement(i);
2817  LO B_LID = Browmap_local.getLocalElement(aidx);
2818  if (B_LID != LO_INVALID) {
2819  targetMapToOrigRow(i) = B_LID;
2820  targetMapToImportRow(i) = LO_INVALID;
2821  } else {
2822  LO I_LID = Irowmap_local.getLocalElement(aidx);
2823  targetMapToOrigRow(i) = LO_INVALID;
2824  targetMapToImportRow(i) = I_LID;
2825  }
2826  });
2827  }
2828 
2829  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2830  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2831  KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Node, lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega, Dinv, Aview, Bview, targetMapToOrigRow, targetMapToImportRow, Bcol2Ccol, Icol2Ccol, C, Cimport, label, params);
2832 }
2833 
2834 /*********************************************************************************************************/
2835 template <class Scalar,
2836  class LocalOrdinal,
2837  class GlobalOrdinal,
2838  class Node,
2839  class LocalOrdinalViewType>
2840 void KernelWrappers2<Scalar, LocalOrdinal, GlobalOrdinal, Node, LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2841  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
2842  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2843  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2844  const LocalOrdinalViewType& targetMapToOrigRow,
2845  const LocalOrdinalViewType& targetMapToImportRow,
2846  const LocalOrdinalViewType& Bcol2Ccol,
2847  const LocalOrdinalViewType& Icol2Ccol,
2848  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2849  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> /* Cimport */,
2850  const std::string& label,
2851  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2852  Tpetra::Details::ProfilingRegion MM2("TpetraExt: Jacobi: Reuse Serial Core");
2853  (void)label;
2854 
2855  using Teuchos::RCP;
2856  using Teuchos::rcp;
2857 
2858  // Lots and lots of typedefs
2859  typedef typename Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type KCRS;
2860  typedef typename KCRS::StaticCrsGraphType graph_t;
2861  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2862  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2863  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2864  typedef typename scalar_view_t::memory_space scalar_memory_space;
2865 
2866  typedef Scalar SC;
2867  typedef LocalOrdinal LO;
2868  typedef GlobalOrdinal GO;
2869  typedef Node NO;
2870  typedef Map<LO, GO, NO> map_type;
2871  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2872  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2873  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2874 
2875  // Sizes
2876  RCP<const map_type> Ccolmap = C.getColMap();
2877  size_t m = Aview.origMatrix->getLocalNumRows();
2878  size_t n = Ccolmap->getLocalNumElements();
2879 
2880  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2881  const KCRS& Amat = Aview.origMatrix->getLocalMatrixHost();
2882  const KCRS& Bmat = Bview.origMatrix->getLocalMatrixHost();
2883  const KCRS& Cmat = C.getLocalMatrixHost();
2884 
2885  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2886  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2887  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2888  scalar_view_t Cvals = Cmat.values;
2889 
2890  c_lno_view_t Irowptr;
2891  lno_nnz_view_t Icolind;
2892  scalar_view_t Ivals;
2893  if (!Bview.importMatrix.is_null()) {
2894  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2895  Irowptr = lclB.graph.row_map;
2896  Icolind = lclB.graph.entries;
2897  Ivals = lclB.values;
2898  }
2899 
2900  // Jacobi-specific inner stuff
2901  auto Dvals =
2902  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2903 
2904  // The status array will contain the index into colind where this entry was last deposited.
2905  // c_status[i] < CSR_ip - not in the row yet
2906  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2907  // We start with this filled with INVALID's indicating that there are no entries yet.
2908  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2909  std::vector<size_t> c_status(n, ST_INVALID);
2910 
2911  // For each row of A/C
2912  size_t CSR_ip = 0, OLD_ip = 0;
2913  for (size_t i = 0; i < m; i++) {
2914  // First fill the c_status array w/ locations where we're allowed to
2915  // generate nonzeros for this row
2916  OLD_ip = Crowptr[i];
2917  CSR_ip = Crowptr[i + 1];
2918  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2919  c_status[Ccolind[k]] = k;
2920 
2921  // Reset values in the row of C
2922  Cvals[k] = SC_ZERO;
2923  }
2924 
2925  SC minusOmegaDval = -omega * Dvals(i, 0);
2926 
2927  // Entries of B
2928  for (size_t j = Browptr[i]; j < Browptr[i + 1]; j++) {
2929  Scalar Bval = Bvals[j];
2930  if (Bval == SC_ZERO)
2931  continue;
2932  LO Bij = Bcolind[j];
2933  LO Cij = Bcol2Ccol[Bij];
2934 
2935  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2936  std::runtime_error, "Trying to insert a new entry into a static graph");
2937 
2938  Cvals[c_status[Cij]] = Bvals[j];
2939  }
2940 
2941  // Entries of -omega * Dinv * A * B
2942  for (size_t k = Arowptr[i]; k < Arowptr[i + 1]; k++) {
2943  LO Aik = Acolind[k];
2944  const SC Aval = Avals[k];
2945  if (Aval == SC_ZERO)
2946  continue;
2947 
2948  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2949  // Local matrix
2950  size_t Bk = static_cast<size_t>(targetMapToOrigRow[Aik]);
2951 
2952  for (size_t j = Browptr[Bk]; j < Browptr[Bk + 1]; ++j) {
2953  LO Bkj = Bcolind[j];
2954  LO Cij = Bcol2Ccol[Bkj];
2955 
2956  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2957  std::runtime_error, "Trying to insert a new entry into a static graph");
2958 
2959  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
2960  }
2961 
2962  } else {
2963  // Remote matrix
2964  size_t Ik = static_cast<size_t>(targetMapToImportRow[Aik]);
2965  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik + 1]; ++j) {
2966  LO Ikj = Icolind[j];
2967  LO Cij = Icol2Ccol[Ikj];
2968 
2969  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2970  std::runtime_error, "Trying to insert a new entry into a static graph");
2971 
2972  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
2973  }
2974  }
2975  }
2976  }
2977 
2978  {
2979  Tpetra::Details::ProfilingRegion MM3("TpetraExt: Jacobi: Reuse ESFC");
2980  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2981  }
2982 }
2983 
2984 /*********************************************************************************************************/
2985 template <class Scalar,
2986  class LocalOrdinal,
2987  class GlobalOrdinal,
2988  class Node>
2989 void import_and_extract_views(
2990  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
2991  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>> targetMap,
2992  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2993  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> prototypeImporter,
2994  bool userAssertsThereAreNoRemotes,
2995  const std::string& label,
2996  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2997  using Teuchos::Array;
2998  using Teuchos::ArrayView;
2999  using Teuchos::null;
3000  using Teuchos::RCP;
3001  using Teuchos::rcp;
3002 
3003  typedef Scalar SC;
3004  typedef LocalOrdinal LO;
3005  typedef GlobalOrdinal GO;
3006  typedef Node NO;
3007 
3008  typedef Map<LO, GO, NO> map_type;
3009  typedef Import<LO, GO, NO> import_type;
3010  typedef CrsMatrix<SC, LO, GO, NO> crs_matrix_type;
3011 
3012  RCP<Tpetra::Details::ProfilingRegion> MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Alloc"));
3013 
3014  // The goal of this method is to populate the 'Aview' struct with views of the
3015  // rows of A, including all rows that correspond to elements in 'targetMap'.
3016  //
3017  // If targetMap includes local elements that correspond to remotely-owned rows
3018  // of A, then those remotely-owned rows will be imported into
3019  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
3020  Aview.deleteContents();
3021 
3022  Aview.origMatrix = rcp(&A, false);
3023  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3024  Aview.origMatrix->getApplyHelper();
3025  Aview.origRowMap = A.getRowMap();
3026  Aview.rowMap = targetMap;
3027  Aview.colMap = A.getColMap();
3028  Aview.domainMap = A.getDomainMap();
3029  Aview.importColMap = null;
3030  RCP<const map_type> rowMap = A.getRowMap();
3031  const int numProcs = rowMap->getComm()->getSize();
3032 
3033  // Short circuit if the user swears there are no remotes (or if we're in serial)
3034  if (userAssertsThereAreNoRemotes || numProcs < 2)
3035  return;
3036 
3037  RCP<const import_type> importer;
3038  if (params != null && params->isParameter("importer")) {
3039  importer = params->get<RCP<const import_type>>("importer");
3040 
3041  } else {
3042  MM = Teuchos::null;
3043  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X RemoteMap"));
3044 
3045  // Mark each row in targetMap as local or remote, and go ahead and get a view
3046  // for the local rows
3047  RCP<const map_type> remoteRowMap;
3048  size_t numRemote = 0;
3049  int mode = 0;
3050  if (!prototypeImporter.is_null() &&
3051  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3052  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3053  // We have a valid prototype importer --- ask it for the remotes
3054  Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: I&X RemoteMap: Mode1");
3055 
3056  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3057  numRemote = prototypeImporter->getNumRemoteIDs();
3058 
3059  Array<GO> remoteRows(numRemote);
3060  for (size_t i = 0; i < numRemote; i++)
3061  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3062 
3063  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3064  rowMap->getIndexBase(), rowMap->getComm()));
3065  mode = 1;
3066 
3067  } else if (prototypeImporter.is_null()) {
3068  // No prototype importer --- count the remotes the hard way
3069  Tpetra::Details::ProfilingRegion MM2("TpetraExt: MMM: I&X RemoteMap: Mode2");
3070 
3071  ArrayView<const GO> rows = targetMap->getLocalElementList();
3072  size_t numRows = targetMap->getLocalNumElements();
3073 
3074  Array<GO> remoteRows(numRows);
3075  for (size_t i = 0; i < numRows; ++i) {
3076  const LO mlid = rowMap->getLocalElement(rows[i]);
3077 
3078  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3079  remoteRows[numRemote++] = rows[i];
3080  }
3081  remoteRows.resize(numRemote);
3082  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3083  rowMap->getIndexBase(), rowMap->getComm()));
3084  mode = 2;
3085 
3086  } else {
3087  // PrototypeImporter is bad. But if we're in serial that's OK.
3088  mode = 3;
3089  }
3090 
3091  if (numProcs < 2) {
3092  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3093  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3094  // If only one processor we don't need to import any remote rows, so return.
3095  return;
3096  }
3097 
3098  //
3099  // Now we will import the needed remote rows of A, if the global maximum
3100  // value of numRemote is greater than 0.
3101  //
3102  MM = Teuchos::null;
3103  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Collective-0"));
3104 
3105  global_size_t globalMaxNumRemote = 0;
3106  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote));
3107 
3108  if (globalMaxNumRemote > 0) {
3109  MM = Teuchos::null;
3110  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Import-2"));
3111 
3112  // Create an importer with target-map remoteRowMap and source-map rowMap.
3113  if (mode == 1)
3114  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3115  else if (mode == 2)
3116  importer = rcp(new import_type(rowMap, remoteRowMap));
3117  else
3118  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
3119  }
3120 
3121  if (params != null)
3122  params->set("importer", importer);
3123  }
3124 
3125  if (importer != null) {
3126  MM = Teuchos::null;
3127  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Import-3"));
3128 
3129  // Now create a new matrix into which we can import the remote rows of A that we need.
3130  Teuchos::ParameterList labelList;
3131  labelList.set("Timer Label", label);
3132  auto& labelList_subList = labelList.sublist("matrixmatrix: kernel params", false);
3133 
3134  bool isMM = true;
3135  bool overrideAllreduce = false;
3136  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3137  // Minor speedup tweak - avoid computing the global constants
3138  Teuchos::ParameterList params_sublist;
3139  if (!params.is_null()) {
3140  labelList.set("compute global constants", params->get("compute global constants", false));
3141  params_sublist = params->sublist("matrixmatrix: kernel params", false);
3142  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
3143  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
3144  if (mm_optimization_core_count2 < mm_optimization_core_count) mm_optimization_core_count = mm_optimization_core_count2;
3145  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete", false);
3146  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck", false);
3147  }
3148  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete", isMM);
3149  labelList_subList.set("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
3150  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
3151 
3152  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3153  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3154  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3155  Aview.importMatrix->getApplyHelper();
3156 
3157 #if 0
3158  // Disabled code for dumping input matrices
3159  static int count=0;
3160  char str[80];
3161  sprintf(str,"import_matrix.%d.dat",count);
3163  count++;
3164 #endif
3165 
3166 #ifdef HAVE_TPETRA_MMM_STATISTICS
3167  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3168 #endif
3169 
3170  MM = Teuchos::null;
3171  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: MMM: I&X Import-4"));
3172 
3173  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3174  Aview.importColMap = Aview.importMatrix->getColMap();
3175  MM = Teuchos::null;
3176  }
3177 }
3178 
3179 /*********************************************************************************************************/
3180 template <class Scalar,
3181  class LocalOrdinal,
3182  class GlobalOrdinal,
3183  class Node>
3184 void import_and_extract_views(
3185  const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& M,
3186  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node>> targetMap,
3187  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview,
3188  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node>> prototypeImporter,
3189  bool userAssertsThereAreNoRemotes) {
3190  using Teuchos::Array;
3191  using Teuchos::ArrayView;
3192  using Teuchos::null;
3193  using Teuchos::RCP;
3194  using Teuchos::rcp;
3195 
3196  typedef Scalar SC;
3197  typedef LocalOrdinal LO;
3198  typedef GlobalOrdinal GO;
3199  typedef Node NO;
3200 
3201  typedef Map<LO, GO, NO> map_type;
3202  typedef Import<LO, GO, NO> import_type;
3203  typedef BlockCrsMatrix<SC, LO, GO, NO> blockcrs_matrix_type;
3204 
3205  // The goal of this method is to populate the 'Mview' struct with views of the
3206  // rows of M, including all rows that correspond to elements in 'targetMap'.
3207  //
3208  // If targetMap includes local elements that correspond to remotely-owned rows
3209  // of M, then those remotely-owned rows will be imported into
3210  // 'Mview.importMatrix', and views of them will be included in 'Mview'.
3211  Mview.deleteContents();
3212 
3213  Mview.origMatrix = rcp(&M, false);
3214  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3215  Mview.origMatrix->getApplyHelper();
3216  Mview.origRowMap = M.getRowMap();
3217  Mview.rowMap = targetMap;
3218  Mview.colMap = M.getColMap();
3219  Mview.importColMap = null;
3220  RCP<const map_type> rowMap = M.getRowMap();
3221  const int numProcs = rowMap->getComm()->getSize();
3222 
3223  // Short circuit if the user swears there are no remotes (or if we're in serial)
3224  if (userAssertsThereAreNoRemotes || numProcs < 2) return;
3225 
3226  // Mark each row in targetMap as local or remote, and go ahead and get a view
3227  // for the local rows
3228  RCP<const map_type> remoteRowMap;
3229  size_t numRemote = 0;
3230  int mode = 0;
3231  if (!prototypeImporter.is_null() &&
3232  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3233  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3234  // We have a valid prototype importer --- ask it for the remotes
3235  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3236  numRemote = prototypeImporter->getNumRemoteIDs();
3237 
3238  Array<GO> remoteRows(numRemote);
3239  for (size_t i = 0; i < numRemote; i++)
3240  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3241 
3242  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3243  rowMap->getIndexBase(), rowMap->getComm()));
3244  mode = 1;
3245 
3246  } else if (prototypeImporter.is_null()) {
3247  // No prototype importer --- count the remotes the hard way
3248  ArrayView<const GO> rows = targetMap->getLocalElementList();
3249  size_t numRows = targetMap->getLocalNumElements();
3250 
3251  Array<GO> remoteRows(numRows);
3252  for (size_t i = 0; i < numRows; ++i) {
3253  const LO mlid = rowMap->getLocalElement(rows[i]);
3254 
3255  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3256  remoteRows[numRemote++] = rows[i];
3257  }
3258  remoteRows.resize(numRemote);
3259  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3260  rowMap->getIndexBase(), rowMap->getComm()));
3261  mode = 2;
3262 
3263  } else {
3264  // PrototypeImporter is bad. But if we're in serial that's OK.
3265  mode = 3;
3266  }
3267 
3268  if (numProcs < 2) {
3269  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3270  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3271  // If only one processor we don't need to import any remote rows, so return.
3272  return;
3273  }
3274 
3275  // Now we will import the needed remote rows of M, if the global maximum
3276  // value of numRemote is greater than 0.
3277  global_size_t globalMaxNumRemote = 0;
3278  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote));
3279 
3280  RCP<const import_type> importer;
3281 
3282  if (globalMaxNumRemote > 0) {
3283  // Create an importer with target-map remoteRowMap and source-map rowMap.
3284  if (mode == 1)
3285  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3286  else if (mode == 2)
3287  importer = rcp(new import_type(rowMap, remoteRowMap));
3288  else
3289  throw std::runtime_error("prototypeImporter->SourceMap() does not match M.getRowMap()!");
3290  }
3291 
3292  if (importer != null) {
3293  // Get import matrix
3294  // TODO: create the int-typed row-pointer here
3295  Mview.importMatrix = Tpetra::importAndFillCompleteBlockCrsMatrix<blockcrs_matrix_type>(rcpFromRef(M), *importer);
3296  // trigger creation of int-typed row pointer array for use in TPLs, but don't actually need it here
3297  Mview.importMatrix->getApplyHelper();
3298  // Save the column map of the imported matrix, so that we can convert indices
3299  // back to global for arithmetic later
3300  Mview.importColMap = Mview.importMatrix->getColMap();
3301  }
3302 }
3303 
3304 /*********************************************************************************************************/
3305 // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3306 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
3308 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3309  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3310  const LocalOrdinalViewType& Acol2Brow,
3311  const LocalOrdinalViewType& Acol2Irow,
3312  const LocalOrdinalViewType& Bcol2Ccol,
3313  const LocalOrdinalViewType& Icol2Ccol,
3314  const size_t mergedNodeNumCols) {
3315  using Teuchos::RCP;
3317  typedef typename KCRS::StaticCrsGraphType graph_t;
3318  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3319  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3320  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3321  // Grab the Kokkos::SparseCrsMatrices
3322  const KCRS& Ak = Aview.origMatrix->getLocalMatrixDevice();
3323  const KCRS& Bk = Bview.origMatrix->getLocalMatrixDevice();
3324 
3325  // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3326  if (!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3327  // We do have a Bimport
3328  // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3329  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3330  RCP<const KCRS> Ik_;
3331  if (!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3332  const KCRS* Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3333  KCRS Iks;
3334  if (Ik != 0) Iks = *Ik;
3335  size_t merge_numrows = Ak.numCols();
3336  // The last entry of this at least, need to be initialized
3337  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3338 
3339  const LocalOrdinal LO_INVALID = Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3340 
3341  // Use a Kokkos::parallel_scan to build the rowptr
3342  typedef typename Node::execution_space execution_space;
3343  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3344  Kokkos::parallel_scan(
3345  "Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type(0, merge_numrows),
3346  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3347  if (final) Mrowptr(i) = update;
3348  // Get the row count
3349  size_t ct = 0;
3350  if (Acol2Brow(i) != LO_INVALID)
3351  ct = Bk.graph.row_map(Acol2Brow(i) + 1) - Bk.graph.row_map(Acol2Brow(i));
3352  else
3353  ct = Iks.graph.row_map(Acol2Irow(i) + 1) - Iks.graph.row_map(Acol2Irow(i));
3354  update += ct;
3355 
3356  if (final && i + 1 == merge_numrows)
3357  Mrowptr(i + 1) = update;
3358  });
3359 
3360  // Allocate nnz
3361  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr, merge_numrows);
3362  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"), merge_nnz);
3363  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"), merge_nnz);
3364 
3365  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3366  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3367  Kokkos::parallel_for(
3368  "Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type(0, merge_numrows), KOKKOS_LAMBDA(const size_t i) {
3369  if (Acol2Brow(i) != LO_INVALID) {
3370  size_t row = Acol2Brow(i);
3371  size_t start = Bk.graph.row_map(row);
3372  for (size_t j = Mrowptr(i); j < Mrowptr(i + 1); j++) {
3373  Mvalues(j) = Bk.values(j - Mrowptr(i) + start);
3374  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j - Mrowptr(i) + start));
3375  }
3376  } else {
3377  size_t row = Acol2Irow(i);
3378  size_t start = Iks.graph.row_map(row);
3379  for (size_t j = Mrowptr(i); j < Mrowptr(i + 1); j++) {
3380  Mvalues(j) = Iks.values(j - Mrowptr(i) + start);
3381  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j - Mrowptr(i) + start));
3382  }
3383  }
3384  });
3385 
3386  KCRS newmat("CrsMatrix", merge_numrows, mergedNodeNumCols, merge_nnz, Mvalues, Mrowptr, Mcolind);
3387  return newmat;
3388  } else {
3389  // We don't have a Bimport (the easy case)
3390  return Bk;
3391  }
3392 } // end merge_matrices
3393 
3394 /*********************************************************************************************************/
3395 // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3396 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node, class LocalOrdinalViewType>
3397 const typename Tpetra::BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
3398 merge_matrices(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3399  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3400  const LocalOrdinalViewType& Acol2Brow,
3401  const LocalOrdinalViewType& Acol2Irow,
3402  const LocalOrdinalViewType& Bcol2Ccol,
3403  const LocalOrdinalViewType& Icol2Ccol,
3404  const size_t mergedNodeNumCols) {
3405  using Teuchos::RCP;
3406  typedef typename Tpetra::BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type KBCRS;
3407  typedef typename KBCRS::StaticCrsGraphType graph_t;
3408  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3409  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3410  typedef typename KBCRS::values_type::non_const_type scalar_view_t;
3411 
3412  // Grab the KokkosSparse::BsrMatrix
3413  const KBCRS& Ak = Aview.origMatrix->getLocalMatrixDevice();
3414  const KBCRS& Bk = Bview.origMatrix->getLocalMatrixDevice();
3415 
3416  // We need to do this dance if either (a) We have Bimport or (b) A's colMap is not the same as B's rowMap
3417  if (!Bview.importMatrix.is_null() ||
3418  (Bview.importMatrix.is_null() &&
3419  (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3420  // We do have a Bimport
3421  // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3422  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3423  RCP<const KBCRS> Ik_;
3424  if (!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KBCRS>(Bview.importMatrix->getLocalMatrixDevice());
3425  const KBCRS* Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3426  KBCRS Iks;
3427  if (Ik != 0) Iks = *Ik;
3428  size_t merge_numrows = Ak.numCols();
3429 
3430  // The last entry of this at least, need to be initialized
3431  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3432 
3433  const LocalOrdinal LO_INVALID = Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3434 
3435  // Use a Kokkos::parallel_scan to build the rowptr
3436  typedef typename Node::execution_space execution_space;
3437  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3438  Kokkos::parallel_scan(
3439  "Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type(0, merge_numrows),
3440  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3441  if (final) Mrowptr(i) = update;
3442  // Get the row count
3443  size_t ct = 0;
3444  if (Acol2Brow(i) != LO_INVALID)
3445  ct = Bk.graph.row_map(Acol2Brow(i) + 1) - Bk.graph.row_map(Acol2Brow(i));
3446  else
3447  ct = Iks.graph.row_map(Acol2Irow(i) + 1) - Iks.graph.row_map(Acol2Irow(i));
3448  update += ct;
3449 
3450  if (final && i + 1 == merge_numrows)
3451  Mrowptr(i + 1) = update;
3452  });
3453 
3454  // Allocate nnz
3455  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr, merge_numrows);
3456  const int blocksize = Ak.blockDim();
3457  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"), merge_nnz);
3458  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"), merge_nnz * blocksize * blocksize);
3459 
3460  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3461  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3462  Kokkos::parallel_for(
3463  "Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type(0, merge_numrows), KOKKOS_LAMBDA(const size_t i) {
3464  if (Acol2Brow(i) != LO_INVALID) {
3465  size_t row = Acol2Brow(i);
3466  size_t start = Bk.graph.row_map(row);
3467  for (size_t j = Mrowptr(i); j < Mrowptr(i + 1); j++) {
3468  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j - Mrowptr(i) + start));
3469 
3470  for (int b = 0; b < blocksize * blocksize; ++b) {
3471  const int val_indx = j * blocksize * blocksize + b;
3472  const int b_val_indx = (j - Mrowptr(i) + start) * blocksize * blocksize + b;
3473  Mvalues(val_indx) = Bk.values(b_val_indx);
3474  }
3475  }
3476  } else {
3477  size_t row = Acol2Irow(i);
3478  size_t start = Iks.graph.row_map(row);
3479  for (size_t j = Mrowptr(i); j < Mrowptr(i + 1); j++) {
3480  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j - Mrowptr(i) + start));
3481 
3482  for (int b = 0; b < blocksize * blocksize; ++b) {
3483  const int val_indx = j * blocksize * blocksize + b;
3484  const int b_val_indx = (j - Mrowptr(i) + start) * blocksize * blocksize + b;
3485  Mvalues(val_indx) = Iks.values(b_val_indx);
3486  }
3487  }
3488  }
3489  });
3490 
3491  // Build and return merged KokkosSparse matrix
3492  KBCRS newmat("CrsMatrix", merge_numrows, mergedNodeNumCols, merge_nnz, Mvalues, Mrowptr, Mcolind, blocksize);
3493  return newmat;
3494  } else {
3495  // We don't have a Bimport (the easy case)
3496  return Bk;
3497  }
3498 } // end merge_matrices
3499 
3500 /*********************************************************************************************************/
3501 template <typename SC, typename LO, typename GO, typename NO>
3502 void AddKernels<SC, LO, GO, NO>::
3503  addSorted(
3504  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3505  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3506  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3507  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3508  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3509  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3510  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3511  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3512  GO numGlobalCols,
3513  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3514  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3515  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds) {
3516  using Teuchos::rcp;
3517  using Teuchos::TimeMonitor;
3518  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3519  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3520  auto nrows = Arowptrs.extent(0) - 1;
3521  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3522  typename AddKern::KKH handle;
3523  handle.create_spadd_handle(true);
3524  auto addHandle = handle.get_spadd_handle();
3525 
3526  auto MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: sorted symbolic"));
3527 
3528  KokkosSparse::spadd_symbolic(&handle,
3529  nrows, numGlobalCols,
3530  Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3531  // KokkosKernels requires values to be zeroed
3532  Cvals = values_array("C values", addHandle->get_c_nnz());
3533  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3534 
3535  MM = Teuchos::null;
3536  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: sorted numeric"));
3537  KokkosSparse::spadd_numeric(&handle,
3538  nrows, numGlobalCols,
3539  Arowptrs, Acolinds, Avals, scalarA,
3540  Browptrs, Bcolinds, Bvals, scalarB,
3541  Crowptrs, Ccolinds, Cvals);
3542 }
3543 
3544 template <typename SC, typename LO, typename GO, typename NO>
3545 void AddKernels<SC, LO, GO, NO>::
3546  addUnsorted(
3547  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3548  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3549  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3550  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3551  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3552  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3553  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3554  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3555  GO numGlobalCols,
3556  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3557  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3558  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds) {
3559  using Teuchos::rcp;
3560  using Teuchos::TimeMonitor;
3561  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3562  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3563  auto nrows = Arowptrs.extent(0) - 1;
3564  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3565  typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3566  typename AddKern::KKH handle;
3567  handle.create_spadd_handle(false);
3568  auto addHandle = handle.get_spadd_handle();
3569  auto MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: unsorted symbolic"));
3570 
3571  KokkosSparse::spadd_symbolic(&handle,
3572  nrows, numGlobalCols,
3573  Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3574  // Cvals must be zeroed out
3575  Cvals = values_array("C values", addHandle->get_c_nnz());
3576  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3577  MM = Teuchos::null;
3578  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: unsorted numeric"));
3579  KokkosSparse::spadd_numeric(&handle,
3580  nrows, numGlobalCols,
3581  Arowptrs, Acolinds, Avals, scalarA,
3582  Browptrs, Bcolinds, Bvals, scalarB,
3583  Crowptrs, Ccolinds, Cvals);
3584 }
3585 
3586 template <typename GO,
3587  typename LocalIndicesType,
3588  typename GlobalIndicesType,
3589  typename ColMapType>
3590 struct ConvertLocalToGlobalFunctor {
3591  ConvertLocalToGlobalFunctor(
3592  const LocalIndicesType& colindsOrig_,
3593  const GlobalIndicesType& colindsConverted_,
3594  const ColMapType& colmap_)
3595  : colindsOrig(colindsOrig_)
3596  , colindsConverted(colindsConverted_)
3597  , colmap(colmap_) {}
3598  KOKKOS_INLINE_FUNCTION void
3599  operator()(const GO i) const {
3600  colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3601  }
3602  LocalIndicesType colindsOrig;
3603  GlobalIndicesType colindsConverted;
3604  ColMapType colmap;
3605 };
3606 
3607 template <typename SC, typename LO, typename GO, typename NO>
3608 void MMdetails::AddKernels<SC, LO, GO, NO>::
3609  convertToGlobalAndAdd(
3610  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3611  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3612  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3613  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3614  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3615  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3616  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3617  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3618  typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds) {
3619  using Teuchos::rcp;
3620  using Teuchos::TimeMonitor;
3621  // Need to use a different KokkosKernelsHandle type than other versions,
3622  // since the ordinals are now GO
3623  using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3624  typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;
3625 
3626  const values_array Avals = A.values;
3627  const values_array Bvals = B.values;
3628  const col_inds_array Acolinds = A.graph.entries;
3629  const col_inds_array Bcolinds = B.graph.entries;
3630  auto Arowptrs = A.graph.row_map;
3631  auto Browptrs = B.graph.row_map;
3632  global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
3633  global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
3634 
3635  auto MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: diff col map kernel: column map conversion"));
3636 
3637  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3638  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3639  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3640  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3641  KKH_GO handle;
3642  handle.create_spadd_handle(false);
3643  auto addHandle = handle.get_spadd_handle();
3644  MM = Teuchos::null;
3645  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: diff col map kernel: unsorted symbolic"));
3646  auto nrows = Arowptrs.extent(0) - 1;
3647  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3648  KokkosSparse::spadd_symbolic(&handle,
3649  nrows, A.numCols(),
3650  Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3651  Cvals = values_array("C values", addHandle->get_c_nnz());
3652  Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3653 
3654  MM = Teuchos::null;
3655  MM = rcp(new Tpetra::Details::ProfilingRegion("TpetraExt: add: diff col map kernel: unsorted numeric"));
3656  KokkosSparse::spadd_numeric(&handle,
3657  nrows, A.numCols(),
3658  Arowptrs, AcolindsConverted, Avals, scalarA,
3659  Browptrs, BcolindsConverted, Bvals, scalarB,
3660  Crowptrs, Ccolinds, Cvals);
3661 }
3662 
3663 } // namespace MMdetails
3664 
3665 } // End namespace Tpetra
3666 
3667 /*********************************************************************************************************/
3668 //
3669 // Explicit instantiation macro
3670 //
3671 // Must be expanded from within the Tpetra namespace!
3672 //
3673 namespace Tpetra {
3674 
3675 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR, LO, GO, NODE) \
3676  template void MatrixMatrix::Multiply( \
3677  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3678  bool transposeA, \
3679  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3680  bool transposeB, \
3681  CrsMatrix<SCALAR, LO, GO, NODE>& C, \
3682  bool call_FillComplete_on_result, \
3683  const std::string& label, \
3684  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3685  \
3686  template void MatrixMatrix::Multiply( \
3687  const Teuchos::RCP<const BlockCrsMatrix<SCALAR, LO, GO, NODE>>& A, \
3688  bool transposeA, \
3689  const Teuchos::RCP<const BlockCrsMatrix<SCALAR, LO, GO, NODE>>& B, \
3690  bool transposeB, \
3691  Teuchos::RCP<BlockCrsMatrix<SCALAR, LO, GO, NODE>>& C, \
3692  const std::string& label); \
3693  \
3694  template void MatrixMatrix::Jacobi( \
3695  SCALAR omega, \
3696  const Vector<SCALAR, LO, GO, NODE>& Dinv, \
3697  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3698  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3699  CrsMatrix<SCALAR, LO, GO, NODE>& C, \
3700  bool call_FillComplete_on_result, \
3701  const std::string& label, \
3702  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3703  \
3704  template void MatrixMatrix::Add( \
3705  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3706  bool transposeA, \
3707  SCALAR scalarA, \
3708  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3709  bool transposeB, \
3710  SCALAR scalarB, \
3711  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>& C); \
3712  \
3713  template void MatrixMatrix::Add( \
3714  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3715  bool transposeA, \
3716  SCALAR scalarA, \
3717  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3718  bool transposeB, \
3719  SCALAR scalarB, \
3720  const Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>>& C); \
3721  \
3722  template void MatrixMatrix::Add( \
3723  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3724  bool transposeA, \
3725  SCALAR scalarA, \
3726  CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3727  SCALAR scalarB); \
3728  \
3729  template Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE>> \
3730  MatrixMatrix::add<SCALAR, LO, GO, NODE>(const SCALAR& alpha, \
3731  const bool transposeA, \
3732  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3733  const SCALAR& beta, \
3734  const bool transposeB, \
3735  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3736  const Teuchos::RCP<const Map<LO, GO, NODE>>& domainMap, \
3737  const Teuchos::RCP<const Map<LO, GO, NODE>>& rangeMap, \
3738  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3739  \
3740  template void \
3741  MatrixMatrix::add<SCALAR, LO, GO, NODE>(const SCALAR& alpha, \
3742  const bool transposeA, \
3743  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3744  const SCALAR& beta, \
3745  const bool transposeB, \
3746  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3747  CrsMatrix<SCALAR, LO, GO, NODE>& C, \
3748  const Teuchos::RCP<const Map<LO, GO, NODE>>& domainMap, \
3749  const Teuchos::RCP<const Map<LO, GO, NODE>>& rangeMap, \
3750  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3751  \
3752  template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>; \
3753  \
3754  template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const CrsMatrix<SCALAR, LO, GO, NODE>& M, \
3755  Teuchos::RCP<const Map<LO, GO, NODE>> targetMap, \
3756  CrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3757  Teuchos::RCP<const Import<LO, GO, NODE>> prototypeImporter, \
3758  bool userAssertsThereAreNoRemotes, \
3759  const std::string& label, \
3760  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3761  \
3762  template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const BlockCrsMatrix<SCALAR, LO, GO, NODE>& M, \
3763  Teuchos::RCP<const Map<LO, GO, NODE>> targetMap, \
3764  BlockCrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
3765  Teuchos::RCP<const Import<LO, GO, NODE>> prototypeImporter, \
3766  bool userAssertsThereAreNoRemotes);
3767 } // End namespace Tpetra
3768 
3769 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
Sparse matrix whose entries are small dense square blocks, all of the same dimensions.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
void scale(const Scalar &alpha)
Scale the matrix&#39;s values: this := alpha*this.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
bool isFillActive() const
Whether the matrix is not fill complete.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Struct that holds views of the contents of a BlockCrsMatrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void start()
Start the deep_copy counter.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
Matrix Market file readers and writers for Tpetra objects.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void removeCrsMatrixZeros(CrsMatrixType &matrix, typename Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitudeType const &threshold=Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::magnitude(Teuchos::ScalarTraits< typename CrsMatrixType::scalar_type >::zero()))
Remove zero entries from a matrix.
Forward declaration of some Tpetra Matrix Matrix objects.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.