Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 #include "Tpetra_ConfigDefs.hpp"
45 #include "Teuchos_VerboseObject.hpp"
46 #include "Teuchos_Array.hpp"
47 #include "Tpetra_Util.hpp"
48 #include "Tpetra_CrsMatrix.hpp"
50 #include "Tpetra_RowMatrixTransposer.hpp"
52 #include "Tpetra_Details_radixSort.hpp"
54 #include "Tpetra_ConfigDefs.hpp"
55 #include "Tpetra_Map.hpp"
56 #include "Tpetra_Export.hpp"
57 #include "Tpetra_Import_Util.hpp"
58 #include "Tpetra_Import_Util2.hpp"
59 #include <algorithm>
60 #include <type_traits>
61 #include "Teuchos_FancyOStream.hpp"
62 
63 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
65 
66 #include "KokkosSparse_spgemm.hpp"
67 #include "KokkosSparse_spadd.hpp"
68 
69 #include <MatrixMarket_Tpetra.hpp>
70 
78 /*********************************************************************************************************/
79 // Include the architecture-specific kernel partial specializations here
80 // NOTE: This needs to be outside all namespaces
81 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
82 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
83 
84 
85 namespace Tpetra {
86 
87 namespace MatrixMatrix{
88 
89 //
90 // This method forms the matrix-matrix product C = op(A) * op(B), where
91 // op(A) == A if transposeA is false,
92 // op(A) == A^T if transposeA is true,
93 // and similarly for op(B).
94 //
95 template <class Scalar,
96  class LocalOrdinal,
97  class GlobalOrdinal,
98  class Node>
99 void Multiply(
101  bool transposeA,
103  bool transposeB,
105  bool call_FillComplete_on_result,
106  const std::string& label,
107  const Teuchos::RCP<Teuchos::ParameterList>& params)
108 {
109  using Teuchos::null;
110  using Teuchos::RCP;
111  typedef Scalar SC;
112  typedef LocalOrdinal LO;
113  typedef GlobalOrdinal GO;
114  typedef Node NO;
115  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
116  typedef Import<LO,GO,NO> import_type;
117  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
118  typedef Map<LO,GO,NO> map_type;
119  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
120 
121 #ifdef HAVE_TPETRA_MMM_TIMINGS
122  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
123  using Teuchos::TimeMonitor;
124  TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup")));
125 #endif
126 
127  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
128 
129  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
130 
131  // The input matrices A and B must both be fillComplete.
132  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
133  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
134 
135  // If transposeA is true, then Aprime will be the transpose of A
136  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
137  // will just be a pointer to A.
138  RCP<const crs_matrix_type> Aprime = null;
139  // If transposeB is true, then Bprime will be the transpose of B
140  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
141  // will just be a pointer to B.
142  RCP<const crs_matrix_type> Bprime = null;
143 
144  // Is this a "clean" matrix?
145  //
146  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
147  // locally nor globally indexed, then it was empty. I don't like
148  // this, because the most straightforward implementation presumes
149  // lazy allocation of indices. However, historical precedent
150  // demands that we keep around this predicate as a way to test
151  // whether the matrix is empty.
152  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
153 
154  bool use_optimized_ATB = false;
155  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
156  use_optimized_ATB = true;
157 
158 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
159  use_optimized_ATB = false;
160 #endif
161 
162  if (!use_optimized_ATB && transposeA) {
163  transposer_type transposer(rcpFromRef (A));
164  Aprime = transposer.createTranspose();
165 
166  } else {
167  Aprime = rcpFromRef(A);
168  }
169 
170  if (transposeB) {
171  transposer_type transposer(rcpFromRef (B));
172  Bprime = transposer.createTranspose();
173 
174  } else {
175  Bprime = rcpFromRef(B);
176  }
177 
178  // Check size compatibility
179  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
180  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
181  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
182  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
183  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
184  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
185  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
186  prefix << "ERROR, inner dimensions of op(A) and op(B) "
187  "must match for matrix-matrix product. op(A) is "
188  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
189 
190  // The result matrix C must at least have a row-map that reflects the correct
191  // row-size. Don't check the number of columns because rectangular matrices
192  // which were constructed with only one map can still end up having the
193  // correct capacity and dimensions when filled.
194  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
195  prefix << "ERROR, dimensions of result C must "
196  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
197  << " rows, should have at least " << Aouter << std::endl);
198 
199  // It doesn't matter whether C is already Filled or not. If it is already
200  // Filled, it must have space allocated for the positions that will be
201  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
202  // we'll error out later when trying to store result values.
203 
204  // CGB: However, matrix must be in active-fill
205  if (!C.isFillActive()) C.resumeFill();
206 
207  // We're going to need to import remotely-owned sections of A and/or B if
208  // more than one processor is performing this run, depending on the scenario.
209  int numProcs = A.getComm()->getSize();
210 
211  // Declare a couple of structs that will be used to hold views of the data
212  // of A and B, to be used for fast access during the matrix-multiplication.
213  crs_matrix_struct_type Aview;
214  crs_matrix_struct_type Bview;
215 
216  RCP<const map_type> targetMap_A = Aprime->getRowMap();
217  RCP<const map_type> targetMap_B = Bprime->getRowMap();
218 
219 #ifdef HAVE_TPETRA_MMM_TIMINGS
220  {
221  TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
222 #endif
223 
224  // Now import any needed remote rows and populate the Aview struct
225  // NOTE: We assert that an import isn't needed --- since we do the transpose
226  // above to handle that.
227  if (!use_optimized_ATB) {
228  RCP<const import_type> dummyImporter;
229  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
230  }
231 
232  // We will also need local access to all rows of B that correspond to the
233  // column-map of op(A).
234  if (numProcs > 1)
235  targetMap_B = Aprime->getColMap();
236 
237  // Import any needed remote rows and populate the Bview struct.
238  if (!use_optimized_ATB)
239  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label, params);
240 
241 #ifdef HAVE_TPETRA_MMM_TIMINGS
242  }
243  TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply")));
244  {
245 #endif
246 
247  // Call the appropriate method to perform the actual multiplication.
248  if (use_optimized_ATB) {
249  MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
250 
251  } else if (call_FillComplete_on_result && newFlag) {
252  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
253 
254  } else if (call_FillComplete_on_result) {
255  MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
256 
257  } else {
258  // mfh 27 Sep 2016: Is this the "slow" case? This
259  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
260  // thread-parallel inserts, but that may take some effort.
261  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
262 
263  MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
264 
265 #ifdef HAVE_TPETRA_MMM_TIMINGS
266  }
267  TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
268 #endif
269  if (call_FillComplete_on_result) {
270  // We'll call FillComplete on the C matrix before we exit, and give it a
271  // domain-map and a range-map.
272  // The domain-map will be the domain-map of B, unless
273  // op(B)==transpose(B), in which case the range-map of B will be used.
274  // The range-map will be the range-map of A, unless op(A)==transpose(A),
275  // in which case the domain-map of A will be used.
276  if (!C.isFillComplete())
277  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
278  }
279  }
280 }
281 
282 
283 template <class Scalar,
284  class LocalOrdinal,
285  class GlobalOrdinal,
286  class Node>
287 void Jacobi(Scalar omega,
292  bool call_FillComplete_on_result,
293  const std::string& label,
294  const Teuchos::RCP<Teuchos::ParameterList>& params)
295 {
296  using Teuchos::RCP;
297  typedef Scalar SC;
298  typedef LocalOrdinal LO;
299  typedef GlobalOrdinal GO;
300  typedef Node NO;
301  typedef Import<LO,GO,NO> import_type;
302  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
303  typedef Map<LO,GO,NO> map_type;
304  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
305 
306 #ifdef HAVE_TPETRA_MMM_TIMINGS
307  std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
308  using Teuchos::TimeMonitor;
309  TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
310 #endif
311 
312  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
313 
314  // A and B should already be Filled.
315  // Should we go ahead and call FillComplete() on them if necessary or error
316  // out? For now, we choose to error out.
317  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
318  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
319 
320  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
321  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
322 
323  // Now check size compatibility
324  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
325  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
326  global_size_t Aouter = A.getGlobalNumRows();
327  global_size_t Bouter = numBCols;
328  global_size_t Ainner = numACols;
329  global_size_t Binner = B.getGlobalNumRows();
330  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
331  prefix << "ERROR, inner dimensions of op(A) and op(B) "
332  "must match for matrix-matrix product. op(A) is "
333  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
334 
335  // The result matrix C must at least have a row-map that reflects the correct
336  // row-size. Don't check the number of columns because rectangular matrices
337  // which were constructed with only one map can still end up having the
338  // correct capacity and dimensions when filled.
339  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
340  prefix << "ERROR, dimensions of result C must "
341  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
342  << " rows, should have at least "<< Aouter << std::endl);
343 
344  // It doesn't matter whether C is already Filled or not. If it is already
345  // Filled, it must have space allocated for the positions that will be
346  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
347  // we'll error out later when trying to store result values.
348 
349  // CGB: However, matrix must be in active-fill
350  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
351 
352  // We're going to need to import remotely-owned sections of A and/or B if
353  // more than one processor is performing this run, depending on the scenario.
354  int numProcs = A.getComm()->getSize();
355 
356  // Declare a couple of structs that will be used to hold views of the data of
357  // A and B, to be used for fast access during the matrix-multiplication.
358  crs_matrix_struct_type Aview;
359  crs_matrix_struct_type Bview;
360 
361  RCP<const map_type> targetMap_A = Aprime->getRowMap();
362  RCP<const map_type> targetMap_B = Bprime->getRowMap();
363 
364 #ifdef HAVE_TPETRA_MMM_TIMINGS
365  TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
366  {
367 #endif
368 
369  // Enable globalConstants by default
370  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
371  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
372  if(!params.is_null()) {
373  importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
374  int mm_optimization_core_count=0;
375  auto slist = params->sublist("matrixmatrix: kernel params",false);
376  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ());
377  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
378  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
379  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
380  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
381  auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
382  ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
383  ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
384  ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
385  }
386 
387  //Now import any needed remote rows and populate the Aview struct.
388  RCP<const import_type> dummyImporter;
389  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, false, label,importParams1);
390 
391  // We will also need local access to all rows of B that correspond to the
392  // column-map of op(A).
393  if (numProcs > 1)
394  targetMap_B = Aprime->getColMap();
395 
396  // Now import any needed remote rows and populate the Bview struct.
397  // Enable globalConstants by default
398  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
399  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
400  if(!params.is_null()) {
401  importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
402 
403  auto slist = params->sublist("matrixmatrix: kernel params",false);
404  int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount () );
405  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
406  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
407  auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
408  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
409  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
410  ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
411  ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
412  ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
413  }
414 
415  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label,importParams2);
416 
417 #ifdef HAVE_TPETRA_MMM_TIMINGS
418  }
419  TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
420 #endif
421 
422  // Now call the appropriate method to perform the actual multiplication.
423  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
424 
425  // Is this a "clean" matrix
426  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
427 
428  if (call_FillComplete_on_result && newFlag) {
429  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
430 
431  } else if (call_FillComplete_on_result) {
432  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
433 
434  } else {
435  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
436  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
437  // commenting it out.
438 // #ifdef HAVE_TPETRA_MMM_TIMINGS
439 // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
440 // #endif
441  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
442  // commenting it out.
443  // if (call_FillComplete_on_result) {
444  // //We'll call FillComplete on the C matrix before we exit, and give
445  // //it a domain-map and a range-map.
446  // //The domain-map will be the domain-map of B, unless
447  // //op(B)==transpose(B), in which case the range-map of B will be used.
448  // //The range-map will be the range-map of A, unless
449  // //op(A)==transpose(A), in which case the domain-map of A will be used.
450  // if (!C.isFillComplete()) {
451  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
452  // }
453  // }
454  }
455 }
456 
457 
458 template <class Scalar,
459  class LocalOrdinal,
460  class GlobalOrdinal,
461  class Node>
462 void Add(
464  bool transposeA,
465  Scalar scalarA,
467  Scalar scalarB )
468 {
469  using Teuchos::Array;
470  using Teuchos::RCP;
471  using Teuchos::null;
472  typedef Scalar SC;
473  typedef LocalOrdinal LO;
474  typedef GlobalOrdinal GO;
475  typedef Node NO;
476  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
477  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
478 
479  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
480 
481  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
482  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
483  "(Result matrix B is not required to be isFillComplete()).");
484  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
485  prefix << "ERROR, input matrix B must not be fill complete!");
486  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
487  prefix << "ERROR, input matrix B must not have static graph!");
488  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
489  prefix << "ERROR, input matrix B must not be locally indexed!");
490 
491  RCP<const crs_matrix_type> Aprime = null;
492  if (transposeA) {
493  transposer_type transposer(rcpFromRef (A));
494  Aprime = transposer.createTranspose();
495  } else {
496  Aprime = rcpFromRef(A);
497  }
498 
499  size_t a_numEntries;
500  Array<GO> a_inds(A.getNodeMaxNumRowEntries());
501  Array<SC> a_vals(A.getNodeMaxNumRowEntries());
502  GO row;
503 
504  if (scalarB != Teuchos::ScalarTraits<SC>::one())
505  B.scale(scalarB);
506 
507  bool bFilled = B.isFillComplete();
508  size_t numMyRows = B.getNodeNumRows();
509  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
510  for (LO i = 0; (size_t)i < numMyRows; ++i) {
511  row = B.getRowMap()->getGlobalElement(i);
512  Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
513 
514  if (scalarA != Teuchos::ScalarTraits<SC>::one())
515  for (size_t j = 0; j < a_numEntries; ++j)
516  a_vals[j] *= scalarA;
517 
518  if (bFilled)
519  B.sumIntoGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
520  else
521  B.insertGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
522  }
523  }
524 }
525 
526 namespace ColMapFunctors
527 {
528  template<typename ByteView, typename GView>
529  struct UnionEntries
530  {
531  UnionEntries(ByteView entryUnion_, const GView gids_) : entryUnion(entryUnion_), gids(gids_) {}
532  KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const
533  {
534  entryUnion(gids(i)) = 1;
535  }
536  ByteView entryUnion;
537  const GView gids;
538  };
539 
540  template<typename LView, typename GView>
541  struct ConvertGlobalToLocal
542  {
543  ConvertGlobalToLocal(const LView gtol_, const GView gids_, LView lids_) : gtol(gtol_), gids(gids_), lids(lids_) {}
544  KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const
545  {
546  lids(i) = gtol(gids(i));
547  }
548  const LView gtol;
549  const GView gids;
550  LView lids;
551  };
552 }//end ColMapFunctors
553 
554 //Build the minimal (sorted) column map for the given set of global columns
555 //Then convert gids and store them in lids (gids is not modified)
556 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
557 Teuchos::RCP<Map<LocalOrdinal, GlobalOrdinal, Node> > AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
558 makeColMapAndConvertGids(GlobalOrdinal ncols,
559  const typename AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::global_col_inds_array& gids,
560  typename AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::col_inds_array& lids,
561  const Teuchos::RCP<const Teuchos::Comm<int>>& comm)
562 {
563  using namespace ColMapFunctors;
564  using Teuchos::RCP;
565  using Teuchos::rcp;
566  typedef Kokkos::View<char*, device_type> ByteView;
567  typedef global_col_inds_array GView;
568  typedef col_inds_array LView;
569  //Functors (explained in the procedural code below)
570  auto nentries = gids.extent(0);
571  //each entry of entryUnion is 0 unless there is a local entry in that column (then it is 1)
572  ByteView entryUnion("entry union", ncols);
573  UnionEntries<ByteView, GView> ue(entryUnion, gids);
574  Kokkos::parallel_for("Tpetra_MatrixMatrix_unionEntries", range_type(0, nentries), ue);
575  //turn entryUnion into prefix sum gtol (where gtol(i) gives the new local col for global col i)
576  LView gtol("global col -> local col", ncols + 1);
577  ::Tpetra::Details::computeOffsetsFromCounts<decltype(gtol), decltype(entryUnion)>(gtol, entryUnion);
578  //convert gids to local ids and put them in lids (implicitly sorted as long as input gids is sorted per row)
579  ConvertGlobalToLocal<LView, GView> cgtl(gtol, gids, lids);
580  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertGlobalToLocal", range_type(0, gids.extent(0)), cgtl);
581  //build local set of GIDs for constructing column map - the last entry in gtol is the total number of local cols
582  execution_space::fence();
583  GView colmap("column map", gtol(ncols));
584  size_t localIter = 0;
585  execution_space::fence();
586  for(size_t i = 0; i < entryUnion.extent(0); i++)
587  {
588  if(entryUnion(i) != 0)
589  {
590  colmap(localIter++) = i;
591  }
592  }
593  execution_space::fence();
594  //finally, construct Tpetra map
595  return rcp(new map_type(Teuchos::OrdinalTraits<GlobalOrdinal>::invalid(), colmap, 0, comm));
596 }
597 
598 template <class Scalar,
599  class LocalOrdinal,
600  class GlobalOrdinal,
601  class Node>
602 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
603 add (const Scalar& alpha,
604  const bool transposeA,
606  const Scalar& beta,
607  const bool transposeB,
609  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
610  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
611  const Teuchos::RCP<Teuchos::ParameterList>& params)
612 {
613  typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
615  Teuchos::RCP<const map_type> CrowMap = transposeB ? B.getDomainMap() : B.getRowMap();
616 
617  Teuchos::RCP<crs_matrix_type> C = rcp(new crs_matrix_type(CrowMap, 0));
618 
619  add(alpha,transposeA,A,beta,transposeB,B,*C,domainMap,rangeMap,params);
620  return C;
621 }
622 
623 template <class Scalar,
624  class LocalOrdinal,
625  class GlobalOrdinal,
626  class Node>
627 void
628 add (const Scalar& alpha,
629  const bool transposeA,
631  const Scalar& beta,
632  const bool transposeB,
635  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
636  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
637  const Teuchos::RCP<Teuchos::ParameterList>& params)
638 {
639  using Teuchos::RCP;
640  using Teuchos::rcp;
641  using Teuchos::rcpFromRef;
642  using Teuchos::rcp_implicit_cast;
643  using Teuchos::rcp_dynamic_cast;
644  using Teuchos::TimeMonitor;
645  typedef Scalar SC;
646  typedef LocalOrdinal LO;
647  typedef GlobalOrdinal GO;
648  typedef Node NO;
649  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
650  typedef Map<LO,GO,NO> map_type;
651  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
652  typedef Import<LO,GO,NO> import_type;
653  typedef Export<LO,GO,NO> export_type;
654  typedef AddDetails::AddKernels<SC,LO,GO,NO> AddKern;
655  const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
656  constexpr bool debug = false;
657 
658 #ifdef HAVE_TPETRA_MMM_TIMINGS
659  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
660 #endif
661 
662  if (debug) {
663  std::ostringstream os;
664  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
665  << "TpetraExt::MatrixMatrix::add" << std::endl;
666  std::cerr << os.str ();
667  }
668 
669  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete () || !B.isFillComplete (), std::invalid_argument,
670  prefix_mmm << "A and B must both be fill complete.");
671 #ifdef HAVE_TPETRA_DEBUG
672  // The matrices don't have domain or range Maps unless they are fill complete.
673  if (A.isFillComplete () && B.isFillComplete ()) {
674  const bool domainMapsSame =
675  (!transposeA && !transposeB && !A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
676  (!transposeA && transposeB && !A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
677  ( transposeA && !transposeB && !A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
678  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
679  prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
680 
681  const bool rangeMapsSame =
682  (!transposeA && !transposeB && !A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
683  (!transposeA && transposeB && !A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
684  ( transposeA && !transposeB && !A.getDomainMap()->isSameAs (*B.getRangeMap ()));
685  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
686  prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
687  }
688 #endif // HAVE_TPETRA_DEBUG
689  auto comm = A.getComm();
690  // Form the explicit transpose of A if necessary.
691  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
692  if (transposeA) {
693  transposer_type transposer(Aprime);
694  Aprime = transposer.createTranspose ();
695  }
696 #ifdef HAVE_TPETRA_DEBUG
697  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
698  prefix_mmm << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
699 #endif // HAVE_TPETRA_DEBUG
700  // Form the explicit transpose of B if necessary.
701  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
702  if (transposeB) {
703  if (debug) {
704  std::ostringstream os;
705  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
706  << "Form explicit xpose of B" << std::endl;
707  std::cerr << os.str ();
708  }
709  transposer_type transposer(Bprime);
710  Bprime = transposer.createTranspose ();
711  }
712 #ifdef HAVE_TPETRA_DEBUG
713  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
714  prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
715  TEUCHOS_TEST_FOR_EXCEPTION(
716  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
717  prefix_mmm << "Aprime and Bprime must both be fill complete. "
718  "Please report this bug to the Tpetra developers.");
719 #endif // HAVE_TPETRA_DEBUG
720  RCP<const map_type> CDomainMap = domainMap;
721  RCP<const map_type> CRangeMap = rangeMap;
722  if(CDomainMap.is_null())
723  {
724  CDomainMap = Bprime->getDomainMap();
725  }
726  if(CRangeMap.is_null())
727  {
728  CRangeMap = Bprime->getRangeMap();
729  }
730  assert(!(CDomainMap.is_null()));
731  assert(!(CRangeMap.is_null()));
732  typedef typename AddKern::values_array values_array;
733  typedef typename AddKern::row_ptrs_array row_ptrs_array;
734  typedef typename AddKern::col_inds_array col_inds_array;
735  bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
736  bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
737  values_array vals;
738  row_ptrs_array rowptrs;
739  col_inds_array colinds;
740  auto acolmap = Aprime->getColMap()->getMyGlobalIndices();
741  auto bcolmap = Bprime->getColMap()->getMyGlobalIndices();
742 #ifdef HAVE_TPETRA_MMM_TIMINGS
743  MM = Teuchos::null;
744  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
745 #endif
746  if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
747  {
748  //import Aprime into Bprime's row map so the local matrices have same # of rows
749  auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
750  // cbl do not set
751  // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
752  // this import _may_ take the form of a transfer. In practice it would be unlikely,
753  // but the general case is not so forgiving.
754 
755  Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
756  }
757  bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
758  bool sorted = AGraphSorted && BGraphSorted;
759  RCP<const map_type> CrowMap;
760  RCP<const map_type> CcolMap;
761  RCP<const import_type> Cimport = Teuchos::null;
762  RCP<export_type> Cexport = Teuchos::null;
763  //The unsorted KCRS addition kernel uses std::sort(), which can't run on CUDA
764  if(!matchingColMaps && !(CDomainMap->isContiguous()))
765  {
766  //can't do with current set of kernels, so fall back to original (slow) version
767 #ifdef HAVE_TPETRA_MMM_TIMINGS
768  MM = Teuchos::null;
769  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("fallback to CrsMatrix::add"))));
770 #endif
771  if (debug) {
772  std::ostringstream os;
773  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
774  << "Call Bprime->add(...)" << std::endl;
775  std::cerr << os.str ();
776  }
777  Teuchos::RCP<crs_matrix_type> C_ = Teuchos::rcp_static_cast<crs_matrix_type>(Bprime->add(alpha, *Aprime, beta, CDomainMap, CRangeMap, params));
778  C.replaceColMap(C_->getColMap());
779  C.setAllValues(C_->getLocalMatrix().graph.row_map,C_->getLocalMatrix().graph.entries,C_->getLocalMatrix().values);
780  C.expertStaticFillComplete(CDomainMap, CRangeMap, C_->getGraph()->getImporter(), C_->getGraph()->getExporter(), params);
781  return;
782  }
783  else if(!matchingColMaps)
784  {
785 #ifdef HAVE_TPETRA_MMM_TIMINGS
786  MM = Teuchos::null;
787  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
788 #endif
789  //use kernel that converts col indices in both A and B to common domain map before adding
790  auto Acolmap = Aprime->getColMap()->getMyGlobalIndices();
791  auto Bcolmap = Bprime->getColMap()->getMyGlobalIndices();
792  typename AddKern::global_col_inds_array globalColinds("", 0);
793  if (debug) {
794  std::ostringstream os;
795  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
796  << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
797  std::cerr << os.str ();
798  }
799  AddKern::convertToGlobalAndAdd(
800  Aprime->getLocalMatrix(), alpha, Bprime->getLocalMatrix(), beta, Acolmap, Bcolmap,
801  CRangeMap->getMinGlobalIndex(), Aprime->getGlobalNumCols(), vals, rowptrs, globalColinds);
802  colinds = col_inds_array("C colinds", globalColinds.extent(0));
803  if (debug) {
804  std::ostringstream os;
805  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
806  << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
807  std::cerr << os.str ();
808  }
809 #ifdef HAVE_TPETRA_MMM_TIMINGS
810  MM = Teuchos::null;
811  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("building optimized column map"))));
812 #endif
813  CrowMap = Bprime->getRowMap();
814  //OLD: CcolMap = AddKern::makeColMapAndConvertGids(Aprime->getGlobalNumCols(), globalColinds, colinds, comm);
815  //Get C's column map as the union of Aprime and Bprime col maps
816  CcolMap = AddKern::makeColMapAndConvertGids(Aprime->getGlobalNumCols(), globalColinds, colinds, comm);
817  }
818  else
819  {
820  //Aprime, Bprime and C all have the same column maps
821  auto localA = Aprime->getLocalMatrix();
822  auto localB = Bprime->getLocalMatrix();
823  auto Avals = localA.values;
824  auto Bvals = localB.values;
825  auto Arowptrs = localA.graph.row_map;
826  auto Browptrs = localB.graph.row_map;
827  auto Acolinds = localA.graph.entries;
828  auto Bcolinds = localB.graph.entries;
829  if(sorted)
830  {
831  //use sorted kernel
832 #ifdef HAVE_TPETRA_MMM_TIMINGS
833  MM = Teuchos::null;
834  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
835 #endif
836  if (debug) {
837  std::ostringstream os;
838  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
839  << "Call AddKern::addSorted(...)" << std::endl;
840  std::cerr << os.str ();
841  }
842  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
843  }
844  else
845  {
846  //use unsorted kernel
847 #ifdef HAVE_TPETRA_MMM_TIMINGS
848  MM = Teuchos::null;
849  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
850 #endif
851  if (debug) {
852  std::ostringstream os;
853  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
854  << "Call AddKern::addUnsorted(...)" << std::endl;
855  std::cerr << os.str ();
856  }
857  AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
858  }
859  CrowMap = Bprime->getRowMap();
860  CcolMap = Bprime->getColMap();
861  //note: Cexport created below (if it's needed)
862  }
863 #ifdef HAVE_TPETRA_MMM_TIMINGS
864  MM = Teuchos::null;
865  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs constructor"))));
866 #endif
867  // C = rcp(new crs_matrix_type(CrowMap, CcolMap, rowptrs, colinds, vals, params));
868  C.replaceColMap(CcolMap);
869  C.setAllValues(rowptrs,colinds,vals);
870 #ifdef HAVE_TPETRA_MMM_TIMINGS
871  MM = Teuchos::null;
872  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
873 #endif
874  if(!CDomainMap->isSameAs(*CcolMap))
875  {
876  if (debug) {
877  std::ostringstream os;
878  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
879  << "Create Cimport" << std::endl;
880  std::cerr << os.str ();
881  }
882  Cimport = rcp(new import_type(CDomainMap, CcolMap));
883  }
884  if(!CrowMap->isSameAs(*CRangeMap))
885  {
886  if (debug) {
887  std::ostringstream os;
888  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
889  << "Create Cexport" << std::endl;
890  std::cerr << os.str ();
891  }
892  Cexport = rcp(new export_type(CrowMap, CRangeMap));
893  }
894 
895  if (debug) {
896  std::ostringstream os;
897  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
898  << "Call C->expertStaticFillComplete(...)" << std::endl;
899  std::cerr << os.str ();
900  }
901  C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
902 
903 }
904 
905 template <class Scalar,
906  class LocalOrdinal,
907  class GlobalOrdinal,
908  class Node>
909 void Add(
911  bool transposeA,
912  Scalar scalarA,
914  bool transposeB,
915  Scalar scalarB,
917 {
918  using Teuchos::Array;
919  using Teuchos::ArrayRCP;
920  using Teuchos::ArrayView;
921  using Teuchos::RCP;
922  using Teuchos::rcp;
923  using Teuchos::rcp_dynamic_cast;
924  using Teuchos::rcpFromRef;
925  using Teuchos::tuple;
926  using std::endl;
927  // typedef typename ArrayView<const Scalar>::size_type size_type;
928  typedef Teuchos::ScalarTraits<Scalar> STS;
930  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
931  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
932  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
935 
936  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
937 
938  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
939  prefix << "The case C == null does not actually work. Fixing this will require an interface change.");
940 
941  TEUCHOS_TEST_FOR_EXCEPTION(
942  ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
943  prefix << "Both input matrices must be fill complete before calling this function.");
944 
945 #ifdef HAVE_TPETRA_DEBUG
946  {
947  const bool domainMapsSame =
948  (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
949  (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
950  (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
951  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
952  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
953 
954  const bool rangeMapsSame =
955  (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
956  (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
957  (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
958  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
959  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
960  }
961 #endif // HAVE_TPETRA_DEBUG
962 
963  // Form the explicit transpose of A if necessary.
964  RCP<const crs_matrix_type> Aprime;
965  if (transposeA) {
966  transposer_type theTransposer (rcpFromRef (A));
967  Aprime = theTransposer.createTranspose ();
968  } else {
969  Aprime = rcpFromRef (A);
970  }
971 
972 #ifdef HAVE_TPETRA_DEBUG
973  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
974  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
975 #endif // HAVE_TPETRA_DEBUG
976 
977  // Form the explicit transpose of B if necessary.
978  RCP<const crs_matrix_type> Bprime;
979  if (transposeB) {
980  transposer_type theTransposer (rcpFromRef (B));
981  Bprime = theTransposer.createTranspose ();
982  } else {
983  Bprime = rcpFromRef (B);
984  }
985 
986 #ifdef HAVE_TPETRA_DEBUG
987  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
988  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
989 #endif // HAVE_TPETRA_DEBUG
990 
991  // Allocate or zero the entries of the result matrix.
992  if (! C.is_null ()) {
993  C->setAllToScalar (STS::zero ());
994  } else {
995 #if 0
996  // If Aprime and Bprime have the same row Map, and if C is null,
997  // we can optimize construction and fillComplete of C. For now,
998  // we just check pointer equality, to avoid the all-reduce in
999  // isSameAs. It may be worth that all-reduce to check, however.
1000  //if (Aprime->getRowMap ().getRawPtr () == Bprime->getRowMap ().getRawPtr ())
1001  if (Aprime->getRowMap ()->isSameAs (* (Bprime->getRowMap ())) {
1002  RCP<const map_type> rowMap = Aprime->getRowMap ();
1003 
1004  RCP<const crs_graph_type> A_graph =
1005  rcp_dynamic_cast<const crs_graph_type> (Aprime->getGraph (), true);
1006 #ifdef HAVE_TPETRA_DEBUG
1007  TEUCHOS_TEST_FOR_EXCEPTION(A_graph.is_null (), std::logic_error,
1008  "Tpetra::MatrixMatrix::Add: Graph of Op(A) is null. "
1009  "Please report this bug to the Tpetra developers.");
1010 #endif // HAVE_TPETRA_DEBUG
1011  RCP<const crs_graph_type> B_graph =
1012  rcp_dynamic_cast<const crs_graph_type> (Bprime->getGraph (), true);
1013 #ifdef HAVE_TPETRA_DEBUG
1014  TEUCHOS_TEST_FOR_EXCEPTION(B_graph.is_null (), std::logic_error,
1015  "Tpetra::MatrixMatrix::Add: Graph of Op(B) is null. "
1016  "Please report this bug to the Tpetra developers.");
1017 #endif // HAVE_TPETRA_DEBUG
1018  RCP<const map_type> A_colMap = A_graph->getColMap ();
1019 #ifdef HAVE_TPETRA_DEBUG
1020  TEUCHOS_TEST_FOR_EXCEPTION(A_colMap.is_null (), std::logic_error,
1021  "Tpetra::MatrixMatrix::Add: Column Map of Op(A) is null. "
1022  "Please report this bug to the Tpetra developers.");
1023 #endif // HAVE_TPETRA_DEBUG
1024  RCP<const map_type> B_colMap = B_graph->getColMap ();
1025 #ifdef HAVE_TPETRA_DEBUG
1026  TEUCHOS_TEST_FOR_EXCEPTION(B_colMap.is_null (), std::logic_error,
1027  "Tpetra::MatrixMatrix::Add: Column Map of Op(B) is null. "
1028  "Please report this bug to the Tpetra developers.");
1029  TEUCHOS_TEST_FOR_EXCEPTION(A_graph->getImporter ().is_null (),
1030  std::logic_error,
1031  "Tpetra::MatrixMatrix::Add: Op(A)'s Import is null. "
1032  "Please report this bug to the Tpetra developers.");
1033  TEUCHOS_TEST_FOR_EXCEPTION(B_graph->getImporter ().is_null (),
1034  std::logic_error,
1035  "Tpetra::MatrixMatrix::Add: Op(B)'s Import is null. "
1036  "Please report this bug to the Tpetra developers.");
1037 #endif // HAVE_TPETRA_DEBUG
1038 
1039  // Compute the (column Map and) Import of the matrix sum.
1040  RCP<const import_type> sumImport =
1041  A_graph->getImporter ()->setUnion (* (B_graph->getImporter ()));
1042  RCP<const map_type> C_colMap = sumImport->getTargetMap ();
1043 
1044  // First, count the number of entries in each row. Then, go
1045  // back over the rows again, and compute the actual sum.
1046  // Remember that C may have a different column Map than Aprime
1047  // or Bprime, so its local indices may be different. That's why
1048  // we have to convert from local to global indices.
1049 
1050  ArrayView<const LocalOrdinal> A_local_ind;
1051  Array<GlobalOrdinal> A_global_ind;
1052  ArrayView<const LocalOrdinal> B_local_ind;
1053  Array<GlobalOrdinal> B_global_ind;
1054 
1055  const size_t localNumRows = rowMap->getNodeNumElements ();
1056  ArrayRCP<size_t> numEntriesPerRow (localNumRows);
1057  // Compute the max number of entries in any row of A+B on this
1058  // process, so that we won't have to resize temporary arrays.
1059  size_t maxNumEntriesPerRow = 0;
1060  for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
1061  // Get view of current row of A_graph, in its local indices.
1062  A_graph->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind);
1063  const size_type A_numEnt = A_local_ind.size ();
1064  if (A_numEnt > A_global_ind.size ()) {
1065  A_global_ind.resize (A_numEnt);
1066  }
1067  // Convert A's local indices to global indices.
1068  for (size_type k = 0; k < A_numEnt; ++k) {
1069  A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
1070  }
1071 
1072  // Get view of current row of B_graph, in its local indices.
1073  B_graph->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind);
1074  const size_type B_numEnt = B_local_ind.size ();
1075  if (B_numEnt > B_global_ind.size ()) {
1076  B_global_ind.resize (B_numEnt);
1077  }
1078  // Convert B's local indices to global indices.
1079  for (size_type k = 0; k < B_numEnt; ++k) {
1080  B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
1081  }
1082 
1083  // Count the number of entries in the merged row of A + B.
1084  const size_t curNumEntriesPerRow =
1085  keyMergeCount (A_global_ind.begin (), A_global_ind.end (),
1086  B_global_ind.begin (), B_global_ind.end ());
1087  numEntriesPerRow[localRow] = curNumEntriesPerRow;
1088  maxNumEntriesPerRow = std::max (maxNumEntriesPerRow, curNumEntriesPerRow);
1089  }
1090 
1091  // Create C, using the sum column Map and number of entries per
1092  // row that we computed above. Having the exact number of
1093  // entries per row lets us use static profile, making it valid
1094  // to call expertStaticFillComplete.
1095  C = rcp (new crs_matrix_type (rowMap, C_colMap, numEntriesPerRow, StaticProfile));
1096 
1097  // Go back through the rows and actually compute the sum. We
1098  // don't ever have to resize A_global_ind or B_global_ind below,
1099  // since we've already done it above.
1100  ArrayView<const Scalar> A_val;
1101  ArrayView<const Scalar> B_val;
1102 
1103  Array<LocalOrdinal> AplusB_local_ind (maxNumEntriesPerRow);
1104  Array<GlobalOrdinal> AplusB_global_ind (maxNumEntriesPerRow);
1105  Array<Scalar> AplusB_val (maxNumEntriesPerRow);
1106 
1107  for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
1108  // Get view of current row of A, in A's local indices.
1109  Aprime->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind, A_val);
1110  // Convert A's local indices to global indices.
1111  for (size_type k = 0; k < A_local_ind.size (); ++k) {
1112  A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
1113  }
1114 
1115  // Get view of current row of B, in B's local indices.
1116  Bprime->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind, B_val);
1117  // Convert B's local indices to global indices.
1118  for (size_type k = 0; k < B_local_ind.size (); ++k) {
1119  B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
1120  }
1121 
1122  const size_t curNumEntries = numEntriesPerRow[localRow];
1123  ArrayView<LocalOrdinal> C_local_ind = AplusB_local_ind (0, curNumEntries);
1124  ArrayView<GlobalOrdinal> C_global_ind = AplusB_global_ind (0, curNumEntries);
1125  ArrayView<Scalar> C_val = AplusB_val (0, curNumEntries);
1126 
1127  // Sum the entries in the current row of A plus B.
1128  keyValueMerge (A_global_ind.begin (), A_global_ind.end (),
1129  A_val.begin (), A_val.end (),
1130  B_global_ind.begin (), B_global_ind.end (),
1131  B_val.begin (), B_val.end (),
1132  C_global_ind.begin (), C_val.begin (),
1133  std::plus<Scalar> ());
1134  // Convert the sum's global indices into C's local indices.
1135  for (size_type k = 0; k < as<size_type> (numEntriesPerRow[localRow]); ++k) {
1136  C_local_ind[k] = C_colMap->getLocalElement (C_global_ind[k]);
1137  }
1138  // Give the current row sum to C.
1139  C->replaceLocalValues (localRow, C_local_ind, C_val);
1140  }
1141 
1142  // Use "expert static fill complete" to bypass construction of
1143  // the Import and Export (if applicable) object(s).
1144  RCP<const map_type> domainMap = A_graph->getDomainMap ();
1145  RCP<const map_type> rangeMap = A_graph->getRangeMap ();
1146  C->expertStaticFillComplete (domainMap, rangeMap, sumImport, A_graph->getExporter ());
1147 
1148  return; // Now we're done!
1149  }
1150  else {
1151  // FIXME (mfh 08 May 2013) When I first looked at this method, I
1152  // noticed that C was being given the row Map of Aprime (the
1153  // possibly transposed version of A). Is this what we want?
1154  C = rcp (new crs_matrix_type (Aprime->getRowMap (), null));
1155  }
1156 
1157 #else
1158  // FIXME (mfh 08 May 2013) When I first looked at this method, I
1159  // noticed that C was being given the row Map of Aprime (the
1160  // possibly transposed version of A). Is this what we want?
1161  C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
1162 #endif // 0
1163  }
1164 
1165 #ifdef HAVE_TPETRA_DEBUG
1166  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1167  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1168  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1169  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1170  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1171  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1172 #endif // HAVE_TPETRA_DEBUG
1173 
1174  Array<RCP<const crs_matrix_type> > Mat =
1175  tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1176  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1177 
1178  // do a loop over each matrix to add: A reordering might be more efficient
1179  for (int k = 0; k < 2; ++k) {
1180  Array<GlobalOrdinal> Indices;
1181  Array<Scalar> Values;
1182 
1183  // Loop over each locally owned row of the current matrix (either
1184  // Aprime or Bprime), and sum its entries into the corresponding
1185  // row of C. This works regardless of whether Aprime or Bprime
1186  // has the same row Map as C, because both sumIntoGlobalValues and
1187  // insertGlobalValues allow summing resp. inserting into nonowned
1188  // rows of C.
1189 #ifdef HAVE_TPETRA_DEBUG
1190  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1191  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1192 #endif // HAVE_TPETRA_DEBUG
1193  RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1194 #ifdef HAVE_TPETRA_DEBUG
1195  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1196  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1197 #endif // HAVE_TPETRA_DEBUG
1198 
1199  const size_t localNumRows = Mat[k]->getNodeNumRows ();
1200  for (size_t i = 0; i < localNumRows; ++i) {
1201  const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1202  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1203  if (numEntries > 0) {
1204  Indices.resize (numEntries);
1205  Values.resize (numEntries);
1206  Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);
1207 
1208  if (scalar[k] != STS::one ()) {
1209  for (size_t j = 0; j < numEntries; ++j) {
1210  Values[j] *= scalar[k];
1211  }
1212  }
1213 
1214  if (C->isFillComplete ()) {
1215  C->sumIntoGlobalValues (globalRow, Indices, Values);
1216  } else {
1217  C->insertGlobalValues (globalRow, Indices, Values);
1218  }
1219  }
1220  }
1221  }
1222 }
1223 
1224 template<typename SC, typename LO, typename GO, typename NO>
1225 void AddDetails::AddKernels<SC, LO, GO, NO>::
1226 addSorted(
1227  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
1228  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
1229  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
1230  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
1231  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
1232  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
1233  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
1234  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
1235  typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
1236  typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
1237  typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
1238 {
1239  using Teuchos::TimeMonitor;
1240  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
1241  auto nrows = Arowptrs.extent(0) - 1;
1242  Crowptrs = row_ptrs_array("C row ptrs", nrows + 1);
1243  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, LO, impl_scalar_type,
1244  execution_space, memory_space, memory_space> KKH;
1245  KKH handle;
1246  handle.create_spadd_handle(true);
1247  auto addHandle = handle.get_spadd_handle();
1248 #ifdef HAVE_TPETRA_MMM_TIMINGS
1249  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
1250 #endif
1251  KokkosSparse::Experimental::spadd_symbolic
1252  <KKH,
1253  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1254  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1255  row_ptrs_array, col_inds_array>
1256  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
1257  Cvals = values_array("C values", addHandle->get_max_result_nnz());
1258  Ccolinds = col_inds_array("C colinds", addHandle->get_max_result_nnz());
1259 #ifdef HAVE_TPETRA_MMM_TIMINGS
1260  MM = Teuchos::null;
1261  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
1262 #endif
1263  KokkosSparse::Experimental::spadd_numeric(&handle,
1264  Arowptrs, Acolinds, Avals, scalarA,
1265  Browptrs, Bcolinds, Bvals, scalarB,
1266  Crowptrs, Ccolinds, Cvals);
1267 }
1268 
1269 template<typename SC, typename LO, typename GO, typename NO>
1270 void AddDetails::AddKernels<SC, LO, GO, NO>::
1271 addUnsorted(
1272  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
1273  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
1274  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
1275  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
1276  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
1277  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
1278  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
1279  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
1280  GO /* numGlobalCols */,
1281  typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
1282  typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
1283  typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
1284 {
1285  using Teuchos::TimeMonitor;
1286  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
1287  auto nrows = Arowptrs.extent(0) - 1;
1288  Crowptrs = row_ptrs_array("C row ptrs", nrows + 1);
1289  typedef AddDetails::AddKernels<SC, LO, GO, NO> AddKern;
1290  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, LO, AddKern::impl_scalar_type,
1291  AddKern::execution_space, AddKern::memory_space, AddKern::memory_space> KKH;
1292  KKH handle;
1293  handle.create_spadd_handle(false);
1294  auto addHandle = handle.get_spadd_handle();
1295 #ifdef HAVE_TPETRA_MMM_TIMINGS
1296  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
1297 #endif
1298  KokkosSparse::Experimental::spadd_symbolic
1299  <KKH,
1300  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1301  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1302  row_ptrs_array, col_inds_array>
1303  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
1304  Cvals = values_array("C values", addHandle->get_max_result_nnz());
1305  Ccolinds = col_inds_array("C colinds", addHandle->get_max_result_nnz());
1306 #ifdef HAVE_TPETRA_MMM_TIMINGS
1307  MM = Teuchos::null;
1308  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted kernel: sorted numeric")));
1309 #endif
1310  KokkosSparse::Experimental::spadd_numeric(&handle,
1311  Arowptrs, Acolinds, Avals, scalarA,
1312  Browptrs, Bcolinds, Bvals, scalarB,
1313  Crowptrs, Ccolinds, Cvals);
1314 }
1315 
1316 template<typename GO,
1317  typename LocalIndicesType,
1318  typename GlobalIndicesType,
1319  typename ColMapType>
1320 struct ConvertColIndsFunctor
1321 {
1322  ConvertColIndsFunctor (const GO minGlobal_,
1323  const LocalIndicesType& colindsOrig_,
1324  const GlobalIndicesType& colindsConverted_,
1325  const ColMapType& colmap_) :
1326  minGlobal (minGlobal_),
1327  colindsOrig (colindsOrig_),
1328  colindsConverted (colindsConverted_),
1329  colmap (colmap_)
1330  {}
1331  KOKKOS_INLINE_FUNCTION void
1332  operator() (const size_t& i) const
1333  {
1334  colindsConverted[i] = colmap[colindsOrig[i]];
1335  }
1336  GO minGlobal;
1337  LocalIndicesType colindsOrig;
1338  GlobalIndicesType colindsConverted;
1339  ColMapType colmap;
1340 };
1341 
1342 template<typename SC, typename LO, typename GO, typename NO>
1343 void AddDetails::AddKernels<SC, LO, GO, NO>::
1344 convertToGlobalAndAdd(
1345  const typename AddDetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
1346  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
1347  const typename AddDetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
1348  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
1349  const typename AddDetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
1350  const typename AddDetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
1351  GO minGlobalCol,
1352  GO /* numGlobalCols */,
1353  typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
1354  typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
1355  typename AddDetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
1356 {
1357  using Teuchos::TimeMonitor;
1358 
1359  const values_array& Avals = A.values;
1360  const values_array& Bvals = B.values;
1361  const col_inds_array& Acolinds = A.graph.entries;
1362  const col_inds_array& Bcolinds = B.graph.entries;
1363  auto Arowptrs = A.graph.row_map;
1364  auto Browptrs = B.graph.row_map;
1365  global_col_inds_array AcolindsConverted("A colinds (converted)", Acolinds.extent(0));
1366  global_col_inds_array BcolindsConverted("B colinds (converted)", Bcolinds.extent(0));
1367 #ifdef HAVE_TPETRA_MMM_TIMINGS
1368  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
1369 #endif
1370  ConvertColIndsFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(minGlobalCol, Acolinds, AcolindsConverted, AcolMap);
1371  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
1372  ConvertColIndsFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(minGlobalCol, Bcolinds, BcolindsConverted, BcolMap);
1373  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
1374  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, GO, impl_scalar_type,
1375  execution_space, memory_space, memory_space> KKH;
1376  KKH handle;
1377  handle.create_spadd_handle(false);
1378  auto addHandle = handle.get_spadd_handle();
1379 #ifdef HAVE_TPETRA_MMM_TIMINGS
1380  MM = Teuchos::null;
1381  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
1382 #endif
1383  auto nrows = Arowptrs.extent(0) - 1;
1384  Crowptrs = row_ptrs_array("C row ptrs", nrows + 1);
1385  KokkosSparse::Experimental::spadd_symbolic
1386  <KKH, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
1387  (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
1388  Cvals = values_array("C values", addHandle->get_max_result_nnz());
1389  Ccolinds = global_col_inds_array("C colinds", addHandle->get_max_result_nnz());
1390 #ifdef HAVE_TPETRA_MMM_TIMINGS
1391  MM = Teuchos::null;
1392  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
1393 #endif
1394  KokkosSparse::Experimental::spadd_numeric(&handle,
1395  Arowptrs, AcolindsConverted, Avals, scalarA,
1396  Browptrs, BcolindsConverted, Bvals, scalarB,
1397  Crowptrs, Ccolinds, Cvals);
1398 }
1399 
1400 } //End namespace MatrixMatrix
1401 
1402 namespace MMdetails{
1403 
1404 /*********************************************************************************************************/
1405 // Prints MMM-style statistics on communication done with an Import or Export object
1406 template <class TransferType>
1407 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1408  if (Transfer.is_null())
1409  return;
1410 
1411  const Distributor & Distor = Transfer->getDistributor();
1412  Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1413 
1414  size_t rows_send = Transfer->getNumExportIDs();
1415  size_t rows_recv = Transfer->getNumRemoteIDs();
1416 
1417  size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1418  size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1419  size_t num_send_neighbors = Distor.getNumSends();
1420  size_t num_recv_neighbors = Distor.getNumReceives();
1421  size_t round2_send, round2_recv;
1422  Distor.getLastDoStatistics(round2_send,round2_recv);
1423 
1424  int myPID = Comm->getRank();
1425  int NumProcs = Comm->getSize();
1426 
1427  // Processor by processor statistics
1428  // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1429  // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1430 
1431  // Global statistics
1432  size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1433  size_t gstats_min[8], gstats_max[8];
1434 
1435  double lstats_avg[8], gstats_avg[8];
1436  for(int i=0; i<8; i++)
1437  lstats_avg[i] = ((double)lstats[i])/NumProcs;
1438 
1439  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1440  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1441  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1442 
1443  if(!myPID) {
1444  printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1445  (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1446  (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1447  printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1448  (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1449  (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1450  }
1451 }
1452 
1453 
1454 /*********************************************************************************************************/
1455 // Kernel method for computing the local portion of C = A*B
1456 template<class Scalar,
1457  class LocalOrdinal,
1458  class GlobalOrdinal,
1459  class Node>
1460 void mult_AT_B_newmatrix(
1461  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1462  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1463  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1464  const std::string & label,
1465  const Teuchos::RCP<Teuchos::ParameterList>& params)
1466 {
1467  // Using & Typedefs
1468  using Teuchos::RCP;
1469  using Teuchos::rcp;
1470  typedef Scalar SC;
1471  typedef LocalOrdinal LO;
1472  typedef GlobalOrdinal GO;
1473  typedef Node NO;
1474  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1475  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1476 
1477 #ifdef HAVE_TPETRA_MMM_TIMINGS
1478  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1479  using Teuchos::TimeMonitor;
1480  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T Transpose"))));
1481 #endif
1482 
1483  /*************************************************************/
1484  /* 1) Local Transpose of A */
1485  /*************************************************************/
1486  transposer_type transposer (rcpFromRef (A),label+std::string("XP: "));
1487  RCP<Teuchos::ParameterList> transposeParams = Teuchos::rcp(new Teuchos::ParameterList);
1488  if(!params.is_null()) transposeParams->set("compute global constants",params->get("compute global constants: temporaries",false));
1489  RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Atrans = transposer.createTransposeLocal(transposeParams);
1490 
1491  /*************************************************************/
1492  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1493  /*************************************************************/
1494 #ifdef HAVE_TPETRA_MMM_TIMINGS
1495  MM = Teuchos::null;
1496  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T I&X"))));
1497 #endif
1498 
1499  // Get views, asserting that no import is required to speed up computation
1500  crs_matrix_struct_type Aview;
1501  crs_matrix_struct_type Bview;
1502  RCP<const Import<LocalOrdinal,GlobalOrdinal, Node> > dummyImporter;
1503 
1504  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1505  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
1506  if(!params.is_null()) {
1507  importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
1508  auto slist = params->sublist("matrixmatrix: kernel params",false);
1509  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
1510  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
1511  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1512  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1513  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1514  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1515  auto & sip1 = importParams1->sublist("matrixmatrix: kernel params",false);
1516  sip1.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1517  sip1.set("isMatrixMatrix_TransferAndFillComplete",isMM);
1518  sip1.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1519  }
1520 
1521  MMdetails::import_and_extract_views(*Atrans, Atrans->getRowMap(), Aview, dummyImporter,true, label,importParams1);
1522 
1523  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
1524  if(!params.is_null()){
1525  importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
1526  auto slist = params->sublist("matrixmatrix: kernel params",false);
1527  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
1528  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
1529  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1530  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1531  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1532  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1533  auto & sip2 = importParams2->sublist("matrixmatrix: kernel params",false);
1534  sip2.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1535  sip2.set("isMatrixMatrix_TransferAndFillComplete",isMM);
1536  sip2.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1537  }
1538 
1539  if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1540  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1541  }
1542  else {
1543  MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1544  }
1545 
1546 #ifdef HAVE_TPETRA_MMM_TIMINGS
1547  MM = Teuchos::null;
1548  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1549 #endif
1550 
1551  RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >Ctemp;
1552 
1553  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1554  bool needs_final_export = !Atrans->getGraph()->getExporter().is_null();
1555  if (needs_final_export)
1556  Ctemp = rcp(new Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>(Atrans->getRowMap(),0));
1557  else
1558  Ctemp = rcp(&C,false);// don't allow deallocation
1559 
1560  // Multiply
1561  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1562 
1563  /*************************************************************/
1564  /* 4) exportAndFillComplete matrix */
1565  /*************************************************************/
1566 #ifdef HAVE_TPETRA_MMM_TIMINGS
1567  MM = Teuchos::null;
1568  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1569 #endif
1570 
1571  Teuchos::RCP<Tpetra::CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > Crcp(&C,false);
1572 
1573  if (needs_final_export) {
1574  Teuchos::ParameterList labelList;
1575  labelList.set("Timer Label", label);
1576  if(!params.is_null()) {
1577  Teuchos::ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1578  Teuchos::ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1579  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1580  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1581  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1582  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1583  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1584  bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1585  bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1586 
1587  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM,
1588  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1589  labelList.set("compute global constants",params->get("compute global constants",true));
1590  labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1591  }
1592  Ctemp->exportAndFillComplete(Crcp,*Ctemp->getGraph()->getExporter(),
1593  B.getDomainMap(),A.getDomainMap(),rcp(&labelList,false));
1594  }
1595 #ifdef HAVE_TPETRA_MMM_STATISTICS
1596  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1597 #endif
1598 }
1599 
1600 /*********************************************************************************************************/
1601 // Kernel method for computing the local portion of C = A*B
1602 template<class Scalar,
1603  class LocalOrdinal,
1604  class GlobalOrdinal,
1605  class Node>
1606 void mult_A_B(
1607  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1608  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1609  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1610  const std::string& /* label */,
1611  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1612 {
1613  using Teuchos::Array;
1614  using Teuchos::ArrayRCP;
1615  using Teuchos::ArrayView;
1616  using Teuchos::OrdinalTraits;
1617  using Teuchos::null;
1618 
1619  typedef Teuchos::ScalarTraits<Scalar> STS;
1620  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1621  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1622  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1623 
1624  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1625  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1626 
1627  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1628  ArrayView<const GlobalOrdinal> bcols_import = null;
1629  if (Bview.importColMap != null) {
1630  C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1631  C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1632 
1633  bcols_import = Bview.importColMap->getNodeElementList();
1634  }
1635 
1636  size_t C_numCols = C_lastCol - C_firstCol +
1637  OrdinalTraits<LocalOrdinal>::one();
1638  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1639  OrdinalTraits<LocalOrdinal>::one();
1640 
1641  if (C_numCols_import > C_numCols)
1642  C_numCols = C_numCols_import;
1643 
1644  Array<Scalar> dwork = Array<Scalar>(C_numCols);
1645  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1646  Array<size_t> iwork2 = Array<size_t>(C_numCols);
1647 
1648  Array<Scalar> C_row_i = dwork;
1649  Array<GlobalOrdinal> C_cols = iwork;
1650  Array<size_t> c_index = iwork2;
1651  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1652  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1653 
1654  size_t C_row_i_length, j, k, last_index;
1655 
1656  // Run through all the hash table lookups once and for all
1657  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1658  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1659  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1660  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1661  // Maps are the same: Use local IDs as the hash
1662  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1663  Aview.colMap->getMaxLocalIndex(); i++)
1664  Acol2Brow[i]=i;
1665  }
1666  else {
1667  // Maps are not the same: Use the map's hash
1668  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1669  Aview.colMap->getMaxLocalIndex(); i++) {
1670  GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1671  LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1672  if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1673  else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1674  }
1675  }
1676 
1677  // To form C = A*B we're going to execute this expression:
1678  //
1679  // C(i,j) = sum_k( A(i,k)*B(k,j) )
1680  //
1681  // Our goal, of course, is to navigate the data in A and B once, without
1682  // performing searches for column-indices, etc.
1683  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1684  ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1685  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1686  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1687  ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1688  ArrayView<const Scalar> Avals, Bvals, Ivals;
1689  Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1690  Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1691  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1692  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1693  if(!Bview.importMatrix.is_null()) {
1694  Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1695  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1696  }
1697 
1698  bool C_filled = C.isFillComplete();
1699 
1700  for (size_t i = 0; i < C_numCols; i++)
1701  c_index[i] = OrdinalTraits<size_t>::invalid();
1702 
1703  // Loop over the rows of A.
1704  size_t Arows = Aview.rowMap->getNodeNumElements();
1705  for(size_t i=0; i<Arows; ++i) {
1706 
1707  // Only navigate the local portion of Aview... which is, thankfully, all of
1708  // A since this routine doesn't do transpose modes
1709  GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1710 
1711  // Loop across the i-th row of A and for each corresponding row in B, loop
1712  // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1713  // quantities C_row_i. In other words, as we stride across B(k,:) we're
1714  // calculating updates for row i of the result matrix C.
1715  C_row_i_length = OrdinalTraits<size_t>::zero();
1716 
1717  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1718  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1719  const Scalar Aval = Avals[k];
1720  if (Aval == STS::zero())
1721  continue;
1722 
1723  if (Ak == LO_INVALID)
1724  continue;
1725 
1726  for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1727  LocalOrdinal col = Bcolind[j];
1728  //assert(col >= 0 && col < C_numCols);
1729 
1730  if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1731  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1732  // This has to be a += so insertGlobalValue goes out
1733  C_row_i[C_row_i_length] = Aval*Bvals[j];
1734  C_cols[C_row_i_length] = col;
1735  c_index[col] = C_row_i_length;
1736  C_row_i_length++;
1737 
1738  } else {
1739  C_row_i[c_index[col]] += Aval*Bvals[j];
1740  }
1741  }
1742  }
1743 
1744  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1745  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1746  C_cols[ii] = bcols[C_cols[ii]];
1747  combined_index[ii] = C_cols[ii];
1748  combined_values[ii] = C_row_i[ii];
1749  }
1750  last_index = C_row_i_length;
1751 
1752  //
1753  //Now put the C_row_i values into C.
1754  //
1755  // We might have to revamp this later.
1756  C_row_i_length = OrdinalTraits<size_t>::zero();
1757 
1758  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1759  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1760  const Scalar Aval = Avals[k];
1761  if (Aval == STS::zero())
1762  continue;
1763 
1764  if (Ak!=LO_INVALID) continue;
1765 
1766  Ak = Acol2Irow[Acolind[k]];
1767  for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1768  LocalOrdinal col = Icolind[j];
1769  //assert(col >= 0 && col < C_numCols);
1770 
1771  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1772  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1773  // This has to be a += so insertGlobalValue goes out
1774  C_row_i[C_row_i_length] = Aval*Ivals[j];
1775  C_cols[C_row_i_length] = col;
1776  c_index[col] = C_row_i_length;
1777  C_row_i_length++;
1778 
1779  } else {
1780  // This has to be a += so insertGlobalValue goes out
1781  C_row_i[c_index[col]] += Aval*Ivals[j];
1782  }
1783  }
1784  }
1785 
1786  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1787  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1788  C_cols[ii] = bcols_import[C_cols[ii]];
1789  combined_index[last_index] = C_cols[ii];
1790  combined_values[last_index] = C_row_i[ii];
1791  last_index++;
1792  }
1793 
1794  // Now put the C_row_i values into C.
1795  // We might have to revamp this later.
1796  C_filled ?
1797  C.sumIntoGlobalValues(
1798  global_row,
1799  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1800  combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1801  :
1802  C.insertGlobalValues(
1803  global_row,
1804  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1805  combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1806 
1807  }
1808 }
1809 
1810 /*********************************************************************************************************/
1811 template<class Scalar,
1812  class LocalOrdinal,
1813  class GlobalOrdinal,
1814  class Node>
1815 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1816  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1817  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1818 
1819  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1820  Mview.maxNumRowEntries = Mview.indices[0].size();
1821 
1822  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1823  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1824  Mview.maxNumRowEntries = Mview.indices[i].size();
1825  }
1826 }
1827 
1828 /*********************************************************************************************************/
1829 template<class CrsMatrixType>
1830 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1831  // Follows the NZ estimate in ML's ml_matmatmult.c
1832  size_t Aest = 100, Best=100;
1833  if (A.getNodeNumEntries() > 0)
1834  Aest = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
1835  if (B.getNodeNumEntries() > 0)
1836  Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
1837 
1838  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1839  nnzperrow *= nnzperrow;
1840 
1841  return (size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
1842 }
1843 
1844 
1845 /*********************************************************************************************************/
1846 // Kernel method for computing the local portion of C = A*B
1847 //
1848 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1849 // function, so this is probably the function we want to
1850 // thread-parallelize.
1851 template<class Scalar,
1852  class LocalOrdinal,
1853  class GlobalOrdinal,
1854  class Node>
1855 void mult_A_B_newmatrix(
1856  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1857  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1858  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1859  const std::string& label,
1860  const Teuchos::RCP<Teuchos::ParameterList>& params)
1861 {
1862  using Teuchos::Array;
1863  using Teuchos::ArrayRCP;
1864  using Teuchos::ArrayView;
1865  using Teuchos::RCP;
1866  using Teuchos::rcp;
1867 
1868  // Tpetra typedefs
1869  typedef LocalOrdinal LO;
1870  typedef GlobalOrdinal GO;
1871  typedef Node NO;
1872  typedef Import<LO,GO,NO> import_type;
1873  typedef Map<LO,GO,NO> map_type;
1874 
1875  // Kokkos typedefs
1876  typedef typename map_type::local_map_type local_map_type;
1878  typedef typename KCRS::StaticCrsGraphType graph_t;
1879  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1880  typedef typename NO::execution_space execution_space;
1881  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1882  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1883 
1884 #ifdef HAVE_TPETRA_MMM_TIMINGS
1885  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1886  using Teuchos::TimeMonitor;
1887  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1888 #endif
1889  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1890 
1891  // Build the final importer / column map, hash table lookups for C
1892  RCP<const import_type> Cimport;
1893  RCP<const map_type> Ccolmap;
1894  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1895  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1896  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1897  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1898  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1899  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1900  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1901  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1902 
1903  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1904  // indices of B, to local column indices of C. (B and C have the
1905  // same number of columns.) The kernel uses this, instead of
1906  // copying the entire input matrix B and converting its column
1907  // indices to those of C.
1908  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1909 
1910  if (Bview.importMatrix.is_null()) {
1911  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1912  Cimport = Bimport;
1913  Ccolmap = Bview.colMap;
1914  const LO colMapSize = static_cast<LO>(Bview.colMap->getNodeNumElements());
1915  // Bcol2Ccol is trivial
1916  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1917  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1918  KOKKOS_LAMBDA(const LO i) {
1919  Bcol2Ccol(i) = i;
1920  });
1921  }
1922  else {
1923  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1924  // column Map of C, as well as C's Import object (from its domain
1925  // Map to its column Map). C's column Map is the union of the
1926  // column Maps of (the local part of) B, and the "remote" part of
1927  // B. Ditto for the Import. We have optimized this "setUnion"
1928  // operation on Import objects and Maps.
1929 
1930  // Choose the right variant of setUnion
1931  if (!Bimport.is_null() && !Iimport.is_null()) {
1932  Cimport = Bimport->setUnion(*Iimport);
1933  }
1934  else if (!Bimport.is_null() && Iimport.is_null()) {
1935  Cimport = Bimport->setUnion();
1936  }
1937  else if (Bimport.is_null() && !Iimport.is_null()) {
1938  Cimport = Iimport->setUnion();
1939  }
1940  else {
1941  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1942  }
1943  Ccolmap = Cimport->getTargetMap();
1944 
1945  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1946  // in general. We should get rid of it in order to reduce
1947  // communication costs of sparse matrix-matrix multiply.
1948  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1949  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1950 
1951  // NOTE: This is not efficient and should be folded into setUnion
1952  //
1953  // mfh 27 Sep 2016: What the above comment means, is that the
1954  // setUnion operation on Import objects could also compute these
1955  // local index - to - local index look-up tables.
1956  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1957  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1958  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1959  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1960  });
1961  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1962  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1963  });
1964 
1965  }
1966 
1967  // Replace the column map
1968  //
1969  // mfh 27 Sep 2016: We do this because C was originally created
1970  // without a column Map. Now we have its column Map.
1971  C.replaceColMap(Ccolmap);
1972 
1973  // mfh 27 Sep 2016: Construct tables that map from local column
1974  // indices of A, to local row indices of either B_local (the locally
1975  // owned part of B), or B_remote (the "imported" remote part of B).
1976  //
1977  // For column index Aik in row i of A, if the corresponding row of B
1978  // exists in the local part of B ("orig") (which I'll call B_local),
1979  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1980  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1981  //
1982  // For column index Aik in row i of A, if the corresponding row of B
1983  // exists in the remote part of B ("Import") (which I'll call
1984  // B_remote), then targetMapToImportRow[Aik] is the local index of
1985  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1986  // (a flag value).
1987 
1988  // Run through all the hash table lookups once and for all
1989  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
1990  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
1991  Kokkos::fence();
1992  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1993  GO aidx = Acolmap_local.getGlobalElement(i);
1994  LO B_LID = Browmap_local.getLocalElement(aidx);
1995  if (B_LID != LO_INVALID) {
1996  targetMapToOrigRow(i) = B_LID;
1997  targetMapToImportRow(i) = LO_INVALID;
1998  } else {
1999  LO I_LID = Irowmap_local.getLocalElement(aidx);
2000  targetMapToOrigRow(i) = LO_INVALID;
2001  targetMapToImportRow(i) = I_LID;
2002 
2003  }
2004  });
2005 
2006  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2007  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2008  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2009 
2010 }
2011 
2012 
2013 /*********************************************************************************************************/
2014 // AB NewMatrix Kernel wrappers (Default non-threaded version)
2015 template<class Scalar,
2016  class LocalOrdinal,
2017  class GlobalOrdinal,
2018  class Node,
2019  class LocalOrdinalViewType>
2020 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2021  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2022  const LocalOrdinalViewType & targetMapToOrigRow,
2023  const LocalOrdinalViewType & targetMapToImportRow,
2024  const LocalOrdinalViewType & Bcol2Ccol,
2025  const LocalOrdinalViewType & Icol2Ccol,
2026  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2027  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2028  const std::string& label,
2029  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2030 #ifdef HAVE_TPETRA_MMM_TIMINGS
2031  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2032  using Teuchos::TimeMonitor;
2033  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
2034 #endif
2035 
2036  using Teuchos::Array;
2037  using Teuchos::ArrayRCP;
2038  using Teuchos::ArrayView;
2039  using Teuchos::RCP;
2040  using Teuchos::rcp;
2041 
2042  // Lots and lots of typedefs
2044  typedef typename KCRS::StaticCrsGraphType graph_t;
2045  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2046  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2047  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2048  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2049 
2050  typedef Scalar SC;
2051  typedef LocalOrdinal LO;
2052  typedef GlobalOrdinal GO;
2053  typedef Node NO;
2054  typedef Map<LO,GO,NO> map_type;
2055  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2056  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2057  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2058 
2059  // Sizes
2060  RCP<const map_type> Ccolmap = C.getColMap();
2061  size_t m = Aview.origMatrix->getNodeNumRows();
2062  size_t n = Ccolmap->getNodeNumElements();
2063  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2064 
2065  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2066  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2067  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2068 
2069  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2070  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2071  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2072 
2073  c_lno_view_t Irowptr;
2074  lno_nnz_view_t Icolind;
2075  scalar_view_t Ivals;
2076  if(!Bview.importMatrix.is_null()) {
2077  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2078  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2079  Ivals = Bview.importMatrix->getLocalMatrix().values;
2080  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2081  }
2082 
2083 #ifdef HAVE_TPETRA_MMM_TIMINGS
2084  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2085 #endif
2086 
2087  // Classic csr assembly (low memory edition)
2088  //
2089  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2090  // The method loops over rows of A, and may resize after processing
2091  // each row. Chris Siefert says that this reflects experience in
2092  // ML; for the non-threaded case, ML found it faster to spend less
2093  // effort on estimation and risk an occasional reallocation.
2094  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2095  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2096  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2097  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2098 
2099  // mfh 27 Sep 2016: The c_status array is an implementation detail
2100  // of the local sparse matrix-matrix multiply routine.
2101 
2102  // The status array will contain the index into colind where this entry was last deposited.
2103  // c_status[i] < CSR_ip - not in the row yet
2104  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2105  // We start with this filled with INVALID's indicating that there are no entries yet.
2106  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2107  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2108  std::vector<size_t> c_status(n, ST_INVALID);
2109 
2110  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2111  // routine. The routine computes C := A * (B_local + B_remote).
2112  //
2113  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2114  // you whether the corresponding row of B belongs to B_local
2115  // ("orig") or B_remote ("Import").
2116 
2117  // For each row of A/C
2118  size_t CSR_ip = 0, OLD_ip = 0;
2119  for (size_t i = 0; i < m; i++) {
2120  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2121  // on the calling process.
2122  Crowptr[i] = CSR_ip;
2123 
2124  // mfh 27 Sep 2016: For each entry of A in the current row of A
2125  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2126  LO Aik = Acolind[k]; // local column index of current entry of A
2127  const SC Aval = Avals[k]; // value of current entry of A
2128  if (Aval == SC_ZERO)
2129  continue; // skip explicitly stored zero values in A
2130 
2131  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2132  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2133  // corresponding to the current entry of A is populated, then
2134  // the corresponding row of B is in B_local (i.e., it lives on
2135  // the calling process).
2136 
2137  // Local matrix
2138  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2139 
2140  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
2141  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2142  LO Bkj = Bcolind[j];
2143  LO Cij = Bcol2Ccol[Bkj];
2144 
2145  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2146  // New entry
2147  c_status[Cij] = CSR_ip;
2148  Ccolind[CSR_ip] = Cij;
2149  Cvals[CSR_ip] = Aval*Bvals[j];
2150  CSR_ip++;
2151 
2152  } else {
2153  Cvals[c_status[Cij]] += Aval*Bvals[j];
2154  }
2155  }
2156 
2157  } else {
2158  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2159  // corresponding to the current entry of A NOT populated (has
2160  // a flag "invalid" value), then the corresponding row of B is
2161  // in B_local (i.e., it lives on the calling process).
2162 
2163  // Remote matrix
2164  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2165  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2166  LO Ikj = Icolind[j];
2167  LO Cij = Icol2Ccol[Ikj];
2168 
2169  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
2170  // New entry
2171  c_status[Cij] = CSR_ip;
2172  Ccolind[CSR_ip] = Cij;
2173  Cvals[CSR_ip] = Aval*Ivals[j];
2174  CSR_ip++;
2175  } else {
2176  Cvals[c_status[Cij]] += Aval*Ivals[j];
2177  }
2178  }
2179  }
2180  }
2181 
2182  // Resize for next pass if needed
2183  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
2184  CSR_alloc *= 2;
2185  Kokkos::resize(Ccolind,CSR_alloc);
2186  Kokkos::resize(Cvals,CSR_alloc);
2187  }
2188  OLD_ip = CSR_ip;
2189  }
2190 
2191  Crowptr[m] = CSR_ip;
2192 
2193  // Downward resize
2194  Kokkos::resize(Ccolind,CSR_ip);
2195  Kokkos::resize(Cvals,CSR_ip);
2196 
2197 #ifdef HAVE_TPETRA_MMM_TIMINGS
2198  {
2199  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
2200 #endif
2201 
2202  // Final sort & set of CRS arrays
2203  if (params.is_null() || params->get("sort entries",true))
2204  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2205  C.setAllValues(Crowptr,Ccolind, Cvals);
2206 
2207 
2208 #ifdef HAVE_TPETRA_MMM_TIMINGS
2209  }
2210  auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
2211  {
2212 #endif
2213 
2214  // Final FillComplete
2215  //
2216  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2217  // Import (from domain Map to column Map) construction (which costs
2218  // lots of communication) by taking the previously constructed
2219  // Import object. We should be able to do this without interfering
2220  // with the implementation of the local part of sparse matrix-matrix
2221  // multply above.
2222  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2223  labelList->set("Timer Label",label);
2224  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2225  RCP<const Export<LO,GO,NO> > dummyExport;
2226  C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2227 #ifdef HAVE_TPETRA_MMM_TIMINGS
2228  }
2229  MM2 = Teuchos::null;
2230 #endif
2231 
2232 }
2233 
2234 
2235 
2236 
2237 /*********************************************************************************************************/
2238 // Kernel method for computing the local portion of C = A*B (reuse)
2239 template<class Scalar,
2240  class LocalOrdinal,
2241  class GlobalOrdinal,
2242  class Node>
2243 void mult_A_B_reuse(
2244  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2245  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2246  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2247  const std::string& label,
2248  const Teuchos::RCP<Teuchos::ParameterList>& params)
2249 {
2250  using Teuchos::Array;
2251  using Teuchos::ArrayRCP;
2252  using Teuchos::ArrayView;
2253  using Teuchos::RCP;
2254  using Teuchos::rcp;
2255 
2256  // Tpetra typedefs
2257  typedef LocalOrdinal LO;
2258  typedef GlobalOrdinal GO;
2259  typedef Node NO;
2260  typedef Import<LO,GO,NO> import_type;
2261  typedef Map<LO,GO,NO> map_type;
2262 
2263  // Kokkos typedefs
2264  typedef typename map_type::local_map_type local_map_type;
2266  typedef typename KCRS::StaticCrsGraphType graph_t;
2267  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2268  typedef typename NO::execution_space execution_space;
2269  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2270  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2271 
2272 #ifdef HAVE_TPETRA_MMM_TIMINGS
2273  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2274  using Teuchos::TimeMonitor;
2275  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
2276 #endif
2277  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2278 
2279  // Grab all the maps
2280  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2281  RCP<const map_type> Ccolmap = C.getColMap();
2282  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2283  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2284  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2285  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2286  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2287  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2288 
2289  // Build the final importer / column map, hash table lookups for C
2290  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2291  {
2292  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2293  // So, column map of C may be a strict subset of the column map of B
2294  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2295  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2296  });
2297 
2298  if (!Bview.importMatrix.is_null()) {
2299  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2300  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2301 
2302  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2303  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2304  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2305  });
2306  }
2307  }
2308 
2309  // Run through all the hash table lookups once and for all
2310  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2311  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2312  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2313  GO aidx = Acolmap_local.getGlobalElement(i);
2314  LO B_LID = Browmap_local.getLocalElement(aidx);
2315  if (B_LID != LO_INVALID) {
2316  targetMapToOrigRow(i) = B_LID;
2317  targetMapToImportRow(i) = LO_INVALID;
2318  } else {
2319  LO I_LID = Irowmap_local.getLocalElement(aidx);
2320  targetMapToOrigRow(i) = LO_INVALID;
2321  targetMapToImportRow(i) = I_LID;
2322 
2323  }
2324  });
2325 
2326  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2327  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2328  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2329 }
2330 
2331 /*********************************************************************************************************/
2332 template<class Scalar,
2333  class LocalOrdinal,
2334  class GlobalOrdinal,
2335  class Node,
2336  class LocalOrdinalViewType>
2337 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2338  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2339  const LocalOrdinalViewType & targetMapToOrigRow,
2340  const LocalOrdinalViewType & targetMapToImportRow,
2341  const LocalOrdinalViewType & Bcol2Ccol,
2342  const LocalOrdinalViewType & Icol2Ccol,
2343  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2344  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2345  const std::string& label,
2346  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2347 #ifdef HAVE_TPETRA_MMM_TIMINGS
2348  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2349  using Teuchos::TimeMonitor;
2350  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2351  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2352 #else
2353  (void)label;
2354 #endif
2355  using Teuchos::RCP;
2356  using Teuchos::rcp;
2357 
2358 
2359  // Lots and lots of typedefs
2361  typedef typename KCRS::StaticCrsGraphType graph_t;
2362  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2363  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2364  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2365 
2366  typedef Scalar SC;
2367  typedef LocalOrdinal LO;
2368  typedef GlobalOrdinal GO;
2369  typedef Node NO;
2370  typedef Map<LO,GO,NO> map_type;
2371  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2372  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2373  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2374 
2375  // Sizes
2376  RCP<const map_type> Ccolmap = C.getColMap();
2377  size_t m = Aview.origMatrix->getNodeNumRows();
2378  size_t n = Ccolmap->getNodeNumElements();
2379 
2380  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2381  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2382  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2383  const KCRS & Cmat = C.getLocalMatrix();
2384 
2385  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2386  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2387  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2388  scalar_view_t Cvals = Cmat.values;
2389 
2390  c_lno_view_t Irowptr;
2391  lno_nnz_view_t Icolind;
2392  scalar_view_t Ivals;
2393  if(!Bview.importMatrix.is_null()) {
2394  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2395  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2396  Ivals = Bview.importMatrix->getLocalMatrix().values;
2397  }
2398 
2399 #ifdef HAVE_TPETRA_MMM_TIMINGS
2400  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2401 #endif
2402 
2403  // Classic csr assembly (low memory edition)
2404  // mfh 27 Sep 2016: The c_status array is an implementation detail
2405  // of the local sparse matrix-matrix multiply routine.
2406 
2407  // The status array will contain the index into colind where this entry was last deposited.
2408  // c_status[i] < CSR_ip - not in the row yet
2409  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2410  // We start with this filled with INVALID's indicating that there are no entries yet.
2411  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2412  std::vector<size_t> c_status(n, ST_INVALID);
2413 
2414  // For each row of A/C
2415  size_t CSR_ip = 0, OLD_ip = 0;
2416  for (size_t i = 0; i < m; i++) {
2417  // First fill the c_status array w/ locations where we're allowed to
2418  // generate nonzeros for this row
2419  OLD_ip = Crowptr[i];
2420  CSR_ip = Crowptr[i+1];
2421  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2422  c_status[Ccolind[k]] = k;
2423 
2424  // Reset values in the row of C
2425  Cvals[k] = SC_ZERO;
2426  }
2427 
2428  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2429  LO Aik = Acolind[k];
2430  const SC Aval = Avals[k];
2431  if (Aval == SC_ZERO)
2432  continue;
2433 
2434  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2435  // Local matrix
2436  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2437 
2438  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2439  LO Bkj = Bcolind[j];
2440  LO Cij = Bcol2Ccol[Bkj];
2441 
2442  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2443  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2444  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2445 
2446  Cvals[c_status[Cij]] += Aval * Bvals[j];
2447  }
2448 
2449  } else {
2450  // Remote matrix
2451  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2452  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2453  LO Ikj = Icolind[j];
2454  LO Cij = Icol2Ccol[Ikj];
2455 
2456  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2457  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2458  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2459 
2460  Cvals[c_status[Cij]] += Aval * Ivals[j];
2461  }
2462  }
2463  }
2464  }
2465 
2466 #ifdef HAVE_TPETRA_MMM_TIMINGS
2467  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2468 #endif
2469 
2470  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2471 }
2472 
2473 
2474 /*********************************************************************************************************/
2475 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2476 template<class Scalar,
2477  class LocalOrdinal,
2478  class GlobalOrdinal,
2479  class Node>
2480 void jacobi_A_B_newmatrix(
2481  Scalar omega,
2482  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2483  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2484  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2485  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2486  const std::string& label,
2487  const Teuchos::RCP<Teuchos::ParameterList>& params)
2488 {
2489  using Teuchos::Array;
2490  using Teuchos::ArrayRCP;
2491  using Teuchos::ArrayView;
2492  using Teuchos::RCP;
2493  using Teuchos::rcp;
2494  // typedef Scalar SC;
2495  typedef LocalOrdinal LO;
2496  typedef GlobalOrdinal GO;
2497  typedef Node NO;
2498 
2499  typedef Import<LO,GO,NO> import_type;
2500  typedef Map<LO,GO,NO> map_type;
2501  typedef typename map_type::local_map_type local_map_type;
2502 
2503  // All of the Kokkos typedefs
2505  typedef typename KCRS::StaticCrsGraphType graph_t;
2506  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2507  typedef typename NO::execution_space execution_space;
2508  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2509  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2510 
2511 
2512 #ifdef HAVE_TPETRA_MMM_TIMINGS
2513  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2514  using Teuchos::TimeMonitor;
2515  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2516 #endif
2517  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2518 
2519  // Build the final importer / column map, hash table lookups for C
2520  RCP<const import_type> Cimport;
2521  RCP<const map_type> Ccolmap;
2522  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2523  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2524  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2525  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2526  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2527  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2528  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2529  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2530 
2531  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2532  // indices of B, to local column indices of C. (B and C have the
2533  // same number of columns.) The kernel uses this, instead of
2534  // copying the entire input matrix B and converting its column
2535  // indices to those of C.
2536  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2537 
2538  if (Bview.importMatrix.is_null()) {
2539  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2540  Cimport = Bimport;
2541  Ccolmap = Bview.colMap;
2542  // Bcol2Ccol is trivial
2543  // Bcol2Ccol is trivial
2544 
2545  Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getNodeNumElements ()));
2546  Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2547  Bcol2Ccol(i) = static_cast<LO> (i);
2548  });
2549  } else {
2550  // mfh 27 Sep 2016: B has "remotes," so we need to build the
2551  // column Map of C, as well as C's Import object (from its domain
2552  // Map to its column Map). C's column Map is the union of the
2553  // column Maps of (the local part of) B, and the "remote" part of
2554  // B. Ditto for the Import. We have optimized this "setUnion"
2555  // operation on Import objects and Maps.
2556 
2557  // Choose the right variant of setUnion
2558  if (!Bimport.is_null() && !Iimport.is_null()){
2559  Cimport = Bimport->setUnion(*Iimport);
2560  Ccolmap = Cimport->getTargetMap();
2561 
2562  } else if (!Bimport.is_null() && Iimport.is_null()) {
2563  Cimport = Bimport->setUnion();
2564 
2565  } else if(Bimport.is_null() && !Iimport.is_null()) {
2566  Cimport = Iimport->setUnion();
2567 
2568  } else
2569  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2570 
2571  Ccolmap = Cimport->getTargetMap();
2572 
2573  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2574  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2575 
2576  // NOTE: This is not efficient and should be folded into setUnion
2577  //
2578  // mfh 27 Sep 2016: What the above comment means, is that the
2579  // setUnion operation on Import objects could also compute these
2580  // local index - to - local index look-up tables.
2581  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2582  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2583  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2584  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2585  });
2586  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2587  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2588  });
2589 
2590  }
2591 
2592  // Replace the column map
2593  //
2594  // mfh 27 Sep 2016: We do this because C was originally created
2595  // without a column Map. Now we have its column Map.
2596  C.replaceColMap(Ccolmap);
2597 
2598  // mfh 27 Sep 2016: Construct tables that map from local column
2599  // indices of A, to local row indices of either B_local (the locally
2600  // owned part of B), or B_remote (the "imported" remote part of B).
2601  //
2602  // For column index Aik in row i of A, if the corresponding row of B
2603  // exists in the local part of B ("orig") (which I'll call B_local),
2604  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2605  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2606  //
2607  // For column index Aik in row i of A, if the corresponding row of B
2608  // exists in the remote part of B ("Import") (which I'll call
2609  // B_remote), then targetMapToImportRow[Aik] is the local index of
2610  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2611  // (a flag value).
2612 
2613  // Run through all the hash table lookups once and for all
2614  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2615  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2616  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2617  GO aidx = Acolmap_local.getGlobalElement(i);
2618  LO B_LID = Browmap_local.getLocalElement(aidx);
2619  if (B_LID != LO_INVALID) {
2620  targetMapToOrigRow(i) = B_LID;
2621  targetMapToImportRow(i) = LO_INVALID;
2622  } else {
2623  LO I_LID = Irowmap_local.getLocalElement(aidx);
2624  targetMapToOrigRow(i) = LO_INVALID;
2625  targetMapToImportRow(i) = I_LID;
2626 
2627  }
2628  });
2629 
2630  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2631  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2632  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2633 
2634 }
2635 
2636 
2637 /*********************************************************************************************************/
2638 // Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2639 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2640 
2641 template<class Scalar,
2642  class LocalOrdinal,
2643  class GlobalOrdinal,
2644  class Node,
2645  class LocalOrdinalViewType>
2646 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2647  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2648  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2649  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2650  const LocalOrdinalViewType & targetMapToOrigRow,
2651  const LocalOrdinalViewType & targetMapToImportRow,
2652  const LocalOrdinalViewType & Bcol2Ccol,
2653  const LocalOrdinalViewType & Icol2Ccol,
2654  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2655  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2656  const std::string& label,
2657  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2658 
2659 #ifdef HAVE_TPETRA_MMM_TIMINGS
2660  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2661  using Teuchos::TimeMonitor;
2662  auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
2663 #endif
2664 
2665  using Teuchos::Array;
2666  using Teuchos::ArrayRCP;
2667  using Teuchos::ArrayView;
2668  using Teuchos::RCP;
2669  using Teuchos::rcp;
2670 
2671  // Lots and lots of typedefs
2673  typedef typename KCRS::StaticCrsGraphType graph_t;
2674  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2675  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2676  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2677  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2678 
2679  // Jacobi-specific
2680  typedef typename scalar_view_t::memory_space scalar_memory_space;
2681 
2682  typedef Scalar SC;
2683  typedef LocalOrdinal LO;
2684  typedef GlobalOrdinal GO;
2685  typedef Node NO;
2686 
2687  typedef Map<LO,GO,NO> map_type;
2688  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2689  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2690 
2691  // Sizes
2692  RCP<const map_type> Ccolmap = C.getColMap();
2693  size_t m = Aview.origMatrix->getNodeNumRows();
2694  size_t n = Ccolmap->getNodeNumElements();
2695  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2696 
2697  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2698  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2699  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2700 
2701  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2702  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2703  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2704 
2705  c_lno_view_t Irowptr;
2706  lno_nnz_view_t Icolind;
2707  scalar_view_t Ivals;
2708  if(!Bview.importMatrix.is_null()) {
2709  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2710  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2711  Ivals = Bview.importMatrix->getLocalMatrix().values;
2712  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2713  }
2714 
2715  // Jacobi-specific inner stuff
2716  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
2717 
2718  // Teuchos::ArrayView::operator[].
2719  // The status array will contain the index into colind where this entry was last deposited.
2720  // c_status[i] < CSR_ip - not in the row yet.
2721  // c_status[i] >= CSR_ip, this is the entry where you can find the data
2722  // We start with this filled with INVALID's indicating that there are no entries yet.
2723  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2724  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2725  Array<size_t> c_status(n, ST_INVALID);
2726 
2727  // Classic csr assembly (low memory edition)
2728  //
2729  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2730  // The method loops over rows of A, and may resize after processing
2731  // each row. Chris Siefert says that this reflects experience in
2732  // ML; for the non-threaded case, ML found it faster to spend less
2733  // effort on estimation and risk an occasional reallocation.
2734  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2735  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2736  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2737  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2738  size_t CSR_ip = 0, OLD_ip = 0;
2739 
2740  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2741 
2742  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2743  // routine. The routine computes
2744  //
2745  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2746  //
2747  // This corresponds to one sweep of (weighted) Jacobi.
2748  //
2749  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2750  // you whether the corresponding row of B belongs to B_local
2751  // ("orig") or B_remote ("Import").
2752 
2753  // For each row of A/C
2754  for (size_t i = 0; i < m; i++) {
2755  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2756  // on the calling process.
2757  Crowptr[i] = CSR_ip;
2758  SC minusOmegaDval = -omega*Dvals(i,0);
2759 
2760  // Entries of B
2761  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2762  Scalar Bval = Bvals[j];
2763  if (Bval == SC_ZERO)
2764  continue;
2765  LO Bij = Bcolind[j];
2766  LO Cij = Bcol2Ccol[Bij];
2767 
2768  // Assume no repeated entries in B
2769  c_status[Cij] = CSR_ip;
2770  Ccolind[CSR_ip] = Cij;
2771  Cvals[CSR_ip] = Bvals[j];
2772  CSR_ip++;
2773  }
2774 
2775  // Entries of -omega * Dinv * A * B
2776  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2777  LO Aik = Acolind[k];
2778  const SC Aval = Avals[k];
2779  if (Aval == SC_ZERO)
2780  continue;
2781 
2782  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2783  // Local matrix
2784  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2785 
2786  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2787  LO Bkj = Bcolind[j];
2788  LO Cij = Bcol2Ccol[Bkj];
2789 
2790  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2791  // New entry
2792  c_status[Cij] = CSR_ip;
2793  Ccolind[CSR_ip] = Cij;
2794  Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2795  CSR_ip++;
2796 
2797  } else {
2798  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2799  }
2800  }
2801 
2802  } else {
2803  // Remote matrix
2804  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2805  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2806  LO Ikj = Icolind[j];
2807  LO Cij = Icol2Ccol[Ikj];
2808 
2809  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2810  // New entry
2811  c_status[Cij] = CSR_ip;
2812  Ccolind[CSR_ip] = Cij;
2813  Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2814  CSR_ip++;
2815  } else {
2816  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2817  }
2818  }
2819  }
2820  }
2821 
2822  // Resize for next pass if needed
2823  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2824  CSR_alloc *= 2;
2825  Kokkos::resize(Ccolind,CSR_alloc);
2826  Kokkos::resize(Cvals,CSR_alloc);
2827  }
2828  OLD_ip = CSR_ip;
2829  }
2830  Crowptr[m] = CSR_ip;
2831 
2832  // Downward resize
2833  Kokkos::resize(Ccolind,CSR_ip);
2834  Kokkos::resize(Cvals,CSR_ip);
2835 
2836  {
2837 #ifdef HAVE_TPETRA_MMM_TIMINGS
2838  auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2839 #endif
2840 
2841  // Replace the column map
2842  //
2843  // mfh 27 Sep 2016: We do this because C was originally created
2844  // without a column Map. Now we have its column Map.
2845  C.replaceColMap(Ccolmap);
2846 
2847  // Final sort & set of CRS arrays
2848  //
2849  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2850  // matrix-matrix multiply routine sort the entries for us?
2851  // Final sort & set of CRS arrays
2852  if (params.is_null() || params->get("sort entries",true))
2853  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2854  C.setAllValues(Crowptr,Ccolind, Cvals);
2855  }
2856  {
2857 #ifdef HAVE_TPETRA_MMM_TIMINGS
2858  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2859 #endif
2860 
2861  // Final FillComplete
2862  //
2863  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2864  // Import (from domain Map to column Map) construction (which costs
2865  // lots of communication) by taking the previously constructed
2866  // Import object. We should be able to do this without interfering
2867  // with the implementation of the local part of sparse matrix-matrix
2868  // multply above
2869  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2870  labelList->set("Timer Label",label);
2871  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2872  RCP<const Export<LO,GO,NO> > dummyExport;
2873  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2874 
2875  }
2876 }
2877 
2878 
2879 /*********************************************************************************************************/
2880 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2881 template<class Scalar,
2882  class LocalOrdinal,
2883  class GlobalOrdinal,
2884  class Node>
2885 void jacobi_A_B_reuse(
2886  Scalar omega,
2887  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2888  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2889  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2890  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2891  const std::string& label,
2892  const Teuchos::RCP<Teuchos::ParameterList>& params)
2893 {
2894  using Teuchos::Array;
2895  using Teuchos::ArrayRCP;
2896  using Teuchos::ArrayView;
2897  using Teuchos::RCP;
2898  using Teuchos::rcp;
2899 
2900  typedef LocalOrdinal LO;
2901  typedef GlobalOrdinal GO;
2902  typedef Node NO;
2903 
2904  typedef Import<LO,GO,NO> import_type;
2905  typedef Map<LO,GO,NO> map_type;
2906 
2907  // Kokkos typedefs
2908  typedef typename map_type::local_map_type local_map_type;
2910  typedef typename KCRS::StaticCrsGraphType graph_t;
2911  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2912  typedef typename NO::execution_space execution_space;
2913  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2914  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2915 
2916 #ifdef HAVE_TPETRA_MMM_TIMINGS
2917  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2918  using Teuchos::TimeMonitor;
2919  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2920 #endif
2921  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2922 
2923  // Grab all the maps
2924  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2925  RCP<const map_type> Ccolmap = C.getColMap();
2926  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2927  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2928  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2929  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2930  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2931  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2932 
2933  // Build the final importer / column map, hash table lookups for C
2934  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2935  {
2936  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2937  // So, column map of C may be a strict subset of the column map of B
2938  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2939  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2940  });
2941 
2942  if (!Bview.importMatrix.is_null()) {
2943  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2944  std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2945 
2946  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2947  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2948  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2949  });
2950  }
2951  }
2952 
2953  // Run through all the hash table lookups once and for all
2954  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2955  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2956  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2957  GO aidx = Acolmap_local.getGlobalElement(i);
2958  LO B_LID = Browmap_local.getLocalElement(aidx);
2959  if (B_LID != LO_INVALID) {
2960  targetMapToOrigRow(i) = B_LID;
2961  targetMapToImportRow(i) = LO_INVALID;
2962  } else {
2963  LO I_LID = Irowmap_local.getLocalElement(aidx);
2964  targetMapToOrigRow(i) = LO_INVALID;
2965  targetMapToImportRow(i) = I_LID;
2966 
2967  }
2968  });
2969 
2970 #ifdef HAVE_TPETRA_MMM_TIMINGS
2971  MM = Teuchos::null;
2972 #endif
2973 
2974  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2975  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2976  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2977 }
2978 
2979 
2980 
2981 /*********************************************************************************************************/
2982 template<class Scalar,
2983  class LocalOrdinal,
2984  class GlobalOrdinal,
2985  class Node,
2986  class LocalOrdinalViewType>
2987 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2988  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2989  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2990  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2991  const LocalOrdinalViewType & targetMapToOrigRow,
2992  const LocalOrdinalViewType & targetMapToImportRow,
2993  const LocalOrdinalViewType & Bcol2Ccol,
2994  const LocalOrdinalViewType & Icol2Ccol,
2995  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2996  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2997  const std::string& label,
2998  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2999 #ifdef HAVE_TPETRA_MMM_TIMINGS
3000  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
3001  using Teuchos::TimeMonitor;
3002  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
3003  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
3004 #else
3005  (void)label;
3006 #endif
3007  using Teuchos::RCP;
3008  using Teuchos::rcp;
3009 
3010  // Lots and lots of typedefs
3012  typedef typename KCRS::StaticCrsGraphType graph_t;
3013  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
3014  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3015  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3016  typedef typename scalar_view_t::memory_space scalar_memory_space;
3017 
3018  typedef Scalar SC;
3019  typedef LocalOrdinal LO;
3020  typedef GlobalOrdinal GO;
3021  typedef Node NO;
3022  typedef Map<LO,GO,NO> map_type;
3023  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3024  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3025  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
3026 
3027  // Sizes
3028  RCP<const map_type> Ccolmap = C.getColMap();
3029  size_t m = Aview.origMatrix->getNodeNumRows();
3030  size_t n = Ccolmap->getNodeNumElements();
3031 
3032  // Grab the Kokkos::SparseCrsMatrices & inner stuff
3033  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
3034  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
3035  const KCRS & Cmat = C.getLocalMatrix();
3036 
3037  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
3038  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
3039  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
3040  scalar_view_t Cvals = Cmat.values;
3041 
3042  c_lno_view_t Irowptr;
3043  lno_nnz_view_t Icolind;
3044  scalar_view_t Ivals;
3045  if(!Bview.importMatrix.is_null()) {
3046  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
3047  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
3048  Ivals = Bview.importMatrix->getLocalMatrix().values;
3049  }
3050 
3051  // Jacobi-specific inner stuff
3052  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
3053 
3054 #ifdef HAVE_TPETRA_MMM_TIMINGS
3055  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
3056 #endif
3057 
3058  // The status array will contain the index into colind where this entry was last deposited.
3059  // c_status[i] < CSR_ip - not in the row yet
3060  // c_status[i] >= CSR_ip - this is the entry where you can find the data
3061  // We start with this filled with INVALID's indicating that there are no entries yet.
3062  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
3063  std::vector<size_t> c_status(n, ST_INVALID);
3064 
3065  // For each row of A/C
3066  size_t CSR_ip = 0, OLD_ip = 0;
3067  for (size_t i = 0; i < m; i++) {
3068 
3069  // First fill the c_status array w/ locations where we're allowed to
3070  // generate nonzeros for this row
3071  OLD_ip = Crowptr[i];
3072  CSR_ip = Crowptr[i+1];
3073  for (size_t k = OLD_ip; k < CSR_ip; k++) {
3074  c_status[Ccolind[k]] = k;
3075 
3076  // Reset values in the row of C
3077  Cvals[k] = SC_ZERO;
3078  }
3079 
3080  SC minusOmegaDval = -omega*Dvals(i,0);
3081 
3082  // Entries of B
3083  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
3084  Scalar Bval = Bvals[j];
3085  if (Bval == SC_ZERO)
3086  continue;
3087  LO Bij = Bcolind[j];
3088  LO Cij = Bcol2Ccol[Bij];
3089 
3090  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3091  std::runtime_error, "Trying to insert a new entry into a static graph");
3092 
3093  Cvals[c_status[Cij]] = Bvals[j];
3094  }
3095 
3096  // Entries of -omega * Dinv * A * B
3097  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
3098  LO Aik = Acolind[k];
3099  const SC Aval = Avals[k];
3100  if (Aval == SC_ZERO)
3101  continue;
3102 
3103  if (targetMapToOrigRow[Aik] != LO_INVALID) {
3104  // Local matrix
3105  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
3106 
3107  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
3108  LO Bkj = Bcolind[j];
3109  LO Cij = Bcol2Ccol[Bkj];
3110 
3111  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3112  std::runtime_error, "Trying to insert a new entry into a static graph");
3113 
3114  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
3115  }
3116 
3117  } else {
3118  // Remote matrix
3119  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
3120  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
3121  LO Ikj = Icolind[j];
3122  LO Cij = Icol2Ccol[Ikj];
3123 
3124  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3125  std::runtime_error, "Trying to insert a new entry into a static graph");
3126 
3127  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
3128  }
3129  }
3130  }
3131  }
3132 
3133 #ifdef HAVE_TPETRA_MMM_TIMINGS
3134  MM2 = Teuchos::null;
3135  MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
3136 #endif
3137 
3138  C.fillComplete(C.getDomainMap(), C.getRangeMap());
3139 #ifdef HAVE_TPETRA_MMM_TIMINGS
3140  MM2 = Teuchos::null;
3141  MM = Teuchos::null;
3142 #endif
3143 
3144 }
3145 
3146 
3147 
3148 /*********************************************************************************************************/
3149 template<class Scalar,
3150  class LocalOrdinal,
3151  class GlobalOrdinal,
3152  class Node>
3153 void import_and_extract_views(
3154  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3155  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3156  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3157  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3158  bool userAssertsThereAreNoRemotes,
3159  const std::string& label,
3160  const Teuchos::RCP<Teuchos::ParameterList>& params)
3161 {
3162  using Teuchos::Array;
3163  using Teuchos::ArrayView;
3164  using Teuchos::RCP;
3165  using Teuchos::rcp;
3166  using Teuchos::null;
3167 
3168  typedef Scalar SC;
3169  typedef LocalOrdinal LO;
3170  typedef GlobalOrdinal GO;
3171  typedef Node NO;
3172 
3173  typedef Map<LO,GO,NO> map_type;
3174  typedef Import<LO,GO,NO> import_type;
3175  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
3176 
3177 #ifdef HAVE_TPETRA_MMM_TIMINGS
3178  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
3179  using Teuchos::TimeMonitor;
3180  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
3181 #endif
3182  // The goal of this method is to populate the 'Aview' struct with views of the
3183  // rows of A, including all rows that correspond to elements in 'targetMap'.
3184  //
3185  // If targetMap includes local elements that correspond to remotely-owned rows
3186  // of A, then those remotely-owned rows will be imported into
3187  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
3188  Aview.deleteContents();
3189 
3190  Aview.origMatrix = rcp(&A, false);
3191  Aview.origRowMap = A.getRowMap();
3192  Aview.rowMap = targetMap;
3193  Aview.colMap = A.getColMap();
3194  Aview.domainMap = A.getDomainMap();
3195  Aview.importColMap = null;
3196 
3197  // Short circuit if the user swears there are no remotes
3198  if (userAssertsThereAreNoRemotes)
3199  return;
3200 
3201  RCP<const import_type> importer;
3202  if (params != null && params->isParameter("importer")) {
3203  importer = params->get<RCP<const import_type> >("importer");
3204 
3205  } else {
3206 #ifdef HAVE_TPETRA_MMM_TIMINGS
3207  MM = Teuchos::null;
3208  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
3209 #endif
3210 
3211  // Mark each row in targetMap as local or remote, and go ahead and get a view
3212  // for the local rows
3213  RCP<const map_type> rowMap = A.getRowMap(), remoteRowMap;
3214  size_t numRemote = 0;
3215  int mode = 0;
3216  if (!prototypeImporter.is_null() &&
3217  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3218  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3219  // We have a valid prototype importer --- ask it for the remotes
3220  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3221  numRemote = prototypeImporter->getNumRemoteIDs();
3222 
3223  Array<GO> remoteRows(numRemote);
3224  for (size_t i = 0; i < numRemote; i++)
3225  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3226 
3227  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3228  rowMap->getIndexBase(), rowMap->getComm()));
3229  mode = 1;
3230 
3231  } else if (prototypeImporter.is_null()) {
3232  // No prototype importer --- count the remotes the hard way
3233  ArrayView<const GO> rows = targetMap->getNodeElementList();
3234  size_t numRows = targetMap->getNodeNumElements();
3235 
3236  Array<GO> remoteRows(numRows);
3237  for(size_t i = 0; i < numRows; ++i) {
3238  const LO mlid = rowMap->getLocalElement(rows[i]);
3239 
3240  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3241  remoteRows[numRemote++] = rows[i];
3242  }
3243  remoteRows.resize(numRemote);
3244  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3245  rowMap->getIndexBase(), rowMap->getComm()));
3246  mode = 2;
3247 
3248  } else {
3249  // PrototypeImporter is bad. But if we're in serial that's OK.
3250  mode = 3;
3251  }
3252 
3253  const int numProcs = rowMap->getComm()->getSize();
3254 
3255  if (numProcs < 2) {
3256  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3257  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3258  // If only one processor we don't need to import any remote rows, so return.
3259  return;
3260  }
3261 
3262  //
3263  // Now we will import the needed remote rows of A, if the global maximum
3264  // value of numRemote is greater than 0.
3265  //
3266 #ifdef HAVE_TPETRA_MMM_TIMINGS
3267  MM = Teuchos::null;
3268  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
3269 #endif
3270 
3271  global_size_t globalMaxNumRemote = 0;
3272  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3273 
3274  if (globalMaxNumRemote > 0) {
3275 #ifdef HAVE_TPETRA_MMM_TIMINGS
3276  MM = Teuchos::null;
3277  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
3278 #endif
3279  // Create an importer with target-map remoteRowMap and source-map rowMap.
3280  if (mode == 1)
3281  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3282  else if (mode == 2)
3283  importer = rcp(new import_type(rowMap, remoteRowMap));
3284  else
3285  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
3286  }
3287 
3288  if (params != null)
3289  params->set("importer", importer);
3290  }
3291 
3292  if (importer != null) {
3293 #ifdef HAVE_TPETRA_MMM_TIMINGS
3294  MM = Teuchos::null;
3295  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
3296 #endif
3297 
3298  // Now create a new matrix into which we can import the remote rows of A that we need.
3299  Teuchos::ParameterList labelList;
3300  labelList.set("Timer Label", label);
3301  auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3302 
3303  bool isMM = true;
3304  bool overrideAllreduce = false;
3305  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3306  // Minor speedup tweak - avoid computing the global constants
3307  Teuchos::ParameterList params_sublist;
3308  if(!params.is_null()) {
3309  labelList.set("compute global constants", params->get("compute global constants",false));
3310  params_sublist = params->sublist("matrixmatrix: kernel params",false);
3311  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3312  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3313  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3314  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3315  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3316  }
3317  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3318  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3319  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3320 
3321  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3322  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3323 
3324 #if 0
3325  // Disabled code for dumping input matrices
3326  static int count=0;
3327  char str[80];
3328  sprintf(str,"import_matrix.%d.dat",count);
3330  count++;
3331 #endif
3332 
3333 #ifdef HAVE_TPETRA_MMM_STATISTICS
3334  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3335 #endif
3336 
3337 
3338 #ifdef HAVE_TPETRA_MMM_TIMINGS
3339  MM = Teuchos::null;
3340  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3341 #endif
3342 
3343  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3344  Aview.importColMap = Aview.importMatrix->getColMap();
3345 #ifdef HAVE_TPETRA_MMM_TIMINGS
3346  MM = Teuchos::null;
3347 #endif
3348  }
3349 }
3350 
3351 
3352 
3353 
3354 
3355 /*********************************************************************************************************/
3356  // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3357 template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3359 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3360  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3361  const LocalOrdinalViewType & Acol2Brow,
3362  const LocalOrdinalViewType & Acol2Irow,
3363  const LocalOrdinalViewType & Bcol2Ccol,
3364  const LocalOrdinalViewType & Icol2Ccol,
3365  const size_t mergedNodeNumCols) {
3366 
3367  using Teuchos::RCP;
3369  typedef typename KCRS::StaticCrsGraphType graph_t;
3370  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3371  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3372  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3373  // Grab the Kokkos::SparseCrsMatrices
3374  const KCRS & Ak = Aview.origMatrix->getLocalMatrix();
3375  const KCRS & Bk = Bview.origMatrix->getLocalMatrix();
3376 
3377  // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3378  if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3379  // We do have a Bimport
3380  // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3381  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3382  RCP<const KCRS> Ik_;
3383  if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrix());
3384  const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3385  KCRS Iks;
3386  if(Ik!=0) Iks = *Ik;
3387  size_t merge_numrows = Ak.numCols();
3388  // The last entry of this at least, need to be initialized
3389  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3390 
3391  const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3392 
3393  // Use a Kokkos::parallel_scan to build the rowptr
3394  typedef typename Node::execution_space execution_space;
3395  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3396  Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3397  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3398  if(final) Mrowptr(i) = update;
3399  // Get the row count
3400  size_t ct=0;
3401  if(Acol2Brow(i)!=LO_INVALID)
3402  ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3403  else
3404  ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3405  update+=ct;
3406 
3407  if(final && i+1==merge_numrows)
3408  Mrowptr(i+1)=update;
3409  });
3410 
3411  // Allocate nnz
3412  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3413  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3414  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);
3415 
3416  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3417  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3418  Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3419  if(Acol2Brow(i)!=LO_INVALID) {
3420  size_t row = Acol2Brow(i);
3421  size_t start = Bk.graph.row_map(row);
3422  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3423  Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3424  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3425  }
3426  }
3427  else {
3428  size_t row = Acol2Irow(i);
3429  size_t start = Iks.graph.row_map(row);
3430  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3431  Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3432  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3433  }
3434  }
3435  });
3436 
3437  KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3438  return newmat;
3439  }
3440  else {
3441  // We don't have a Bimport (the easy case)
3442  return Bk;
3443  }
3444 }//end merge_matrices
3445 
3446 
3447 
3448 } //End namepsace MMdetails
3449 
3450 } //End namespace Tpetra
3451 
3452 /*********************************************************************************************************/
3453 //
3454 // Explicit instantiation macro
3455 //
3456 // Must be expanded from within the Tpetra namespace!
3457 //
3458 namespace Tpetra {
3459 
3460 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3461  template \
3462  void MatrixMatrix::Multiply( \
3463  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3464  bool transposeA, \
3465  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3466  bool transposeB, \
3467  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3468  bool call_FillComplete_on_result, \
3469  const std::string & label, \
3470  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3471 \
3472 template \
3473  void MatrixMatrix::Jacobi( \
3474  SCALAR omega, \
3475  const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3476  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3477  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3478  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3479  bool call_FillComplete_on_result, \
3480  const std::string & label, \
3481  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3482 \
3483  template \
3484  void MatrixMatrix::Add( \
3485  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3486  bool transposeA, \
3487  SCALAR scalarA, \
3488  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3489  bool transposeB, \
3490  SCALAR scalarB, \
3491  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
3492 \
3493  template \
3494  void MatrixMatrix::Add( \
3495  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3496  bool transposeA, \
3497  SCALAR scalarA, \
3498  CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3499  SCALAR scalarB ); \
3500 \
3501  template \
3502  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3503  MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3504  (const SCALAR & alpha, \
3505  const bool transposeA, \
3506  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3507  const SCALAR & beta, \
3508  const bool transposeB, \
3509  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3510  const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3511  const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3512  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3513 \
3514  template struct MatrixMatrix::AddDetails::AddKernels<SCALAR, LO, GO, NODE>;
3515 
3516 } //End namespace Tpetra
3517 
3518 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
The core count above which Tpetra::CrsMatrix::transferAndFillComplere will attempt to do advanced nei...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix&#39;s column Map with the given Map.
static void writeSparseFile(const std::string &filename, const Teuchos::RCP< const sparse_matrix_type > &pMatrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void scale(const Scalar &alpha)
Scale the matrix&#39;s values: this := alpha*this.
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
bool isFillActive() const
Whether the matrix is not fill complete.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
void keyValueMerge(KeyInputIterType keyBeg1, KeyInputIterType keyEnd1, ValueInputIterType valBeg1, ValueInputIterType valEnd1, KeyInputIterType keyBeg2, KeyInputIterType keyEnd2, ValueInputIterType valBeg2, ValueInputIterType valEnd2, KeyOutputIterType keyOut, ValueOutputIterType valOut, BinaryFunction f)
Merge two sorted (by keys) sequences of unique (key,value) pairs by combining pairs with equal keys...
Declare and define the function Tpetra::Details::computeOffsetsFromCounts, an implementation detail o...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void setAllValues(const typename local_matrix_type::row_map_type &ptr, const typename local_graph_type::entries_type::non_const_type &ind, const typename local_matrix_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Matrix Market file readers and writers for Tpetra objects.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.