Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 #include "Tpetra_ConfigDefs.hpp"
45 #include "Teuchos_VerboseObject.hpp"
46 #include "Teuchos_Array.hpp"
47 #include "Tpetra_Util.hpp"
48 #include "Tpetra_CrsMatrix.hpp"
50 #include "Tpetra_RowMatrixTransposer.hpp"
53 #include "Tpetra_Details_makeColMap.hpp"
54 #include "Tpetra_ConfigDefs.hpp"
55 #include "Tpetra_Map.hpp"
56 #include "Tpetra_Export.hpp"
57 #include "Tpetra_Import_Util.hpp"
58 #include "Tpetra_Import_Util2.hpp"
59 #include <algorithm>
60 #include <type_traits>
61 #include "Teuchos_FancyOStream.hpp"
62 
63 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
65 
66 #include "KokkosSparse_spgemm.hpp"
67 #include "KokkosSparse_spadd.hpp"
68 #include "Kokkos_Bitset.hpp"
69 
70 #include <MatrixMarket_Tpetra.hpp>
71 
79 /*********************************************************************************************************/
80 // Include the architecture-specific kernel partial specializations here
81 // NOTE: This needs to be outside all namespaces
82 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
83 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
84 
85 
86 namespace Tpetra {
87 
88 namespace MatrixMatrix{
89 
90 //
91 // This method forms the matrix-matrix product C = op(A) * op(B), where
92 // op(A) == A if transposeA is false,
93 // op(A) == A^T if transposeA is true,
94 // and similarly for op(B).
95 //
96 template <class Scalar,
97  class LocalOrdinal,
98  class GlobalOrdinal,
99  class Node>
100 void Multiply(
102  bool transposeA,
104  bool transposeB,
106  bool call_FillComplete_on_result,
107  const std::string& label,
108  const Teuchos::RCP<Teuchos::ParameterList>& params)
109 {
110  using Teuchos::null;
111  using Teuchos::RCP;
112  using Teuchos::rcp;
113  typedef Scalar SC;
114  typedef LocalOrdinal LO;
115  typedef GlobalOrdinal GO;
116  typedef Node NO;
117  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
118  typedef Import<LO,GO,NO> import_type;
119  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
120  typedef Map<LO,GO,NO> map_type;
121  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
122 
123 #ifdef HAVE_TPETRA_MMM_TIMINGS
124  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
125  using Teuchos::TimeMonitor;
126  //MM is used to time setup, and then multiply.
127 
128  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
129 #endif
130 
131  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
132 
133  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
134 
135  // The input matrices A and B must both be fillComplete.
136  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
137  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
138 
139  // If transposeA is true, then Aprime will be the transpose of A
140  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
141  // will just be a pointer to A.
142  RCP<const crs_matrix_type> Aprime = null;
143  // If transposeB is true, then Bprime will be the transpose of B
144  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
145  // will just be a pointer to B.
146  RCP<const crs_matrix_type> Bprime = null;
147 
148  // Is this a "clean" matrix?
149  //
150  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
151  // locally nor globally indexed, then it was empty. I don't like
152  // this, because the most straightforward implementation presumes
153  // lazy allocation of indices. However, historical precedent
154  // demands that we keep around this predicate as a way to test
155  // whether the matrix is empty.
156  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
157 
158  bool use_optimized_ATB = false;
159  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
160  use_optimized_ATB = true;
161 
162 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
163  use_optimized_ATB = false;
164 #endif
165 
166  using Teuchos::ParameterList;
167  RCP<ParameterList> transposeParams (new ParameterList);
168  transposeParams->set ("sort", false);
169 
170  if (!use_optimized_ATB && transposeA) {
171  transposer_type transposer (rcpFromRef (A));
172  Aprime = transposer.createTranspose (transposeParams);
173  }
174  else {
175  Aprime = rcpFromRef(A);
176  }
177 
178  if (transposeB) {
179  transposer_type transposer (rcpFromRef (B));
180  Bprime = transposer.createTranspose (transposeParams);
181  }
182  else {
183  Bprime = rcpFromRef(B);
184  }
185 
186  // Check size compatibility
187  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
188  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
189  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
190  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
191  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
192  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
193  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
194  prefix << "ERROR, inner dimensions of op(A) and op(B) "
195  "must match for matrix-matrix product. op(A) is "
196  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
197 
198  // The result matrix C must at least have a row-map that reflects the correct
199  // row-size. Don't check the number of columns because rectangular matrices
200  // which were constructed with only one map can still end up having the
201  // correct capacity and dimensions when filled.
202  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
203  prefix << "ERROR, dimensions of result C must "
204  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
205  << " rows, should have at least " << Aouter << std::endl);
206 
207  // It doesn't matter whether C is already Filled or not. If it is already
208  // Filled, it must have space allocated for the positions that will be
209  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
210  // we'll error out later when trying to store result values.
211 
212  // CGB: However, matrix must be in active-fill
213  if (!C.isFillActive()) C.resumeFill();
214 
215  // We're going to need to import remotely-owned sections of A and/or B if
216  // more than one processor is performing this run, depending on the scenario.
217  int numProcs = A.getComm()->getSize();
218 
219  // Declare a couple of structs that will be used to hold views of the data
220  // of A and B, to be used for fast access during the matrix-multiplication.
221  crs_matrix_struct_type Aview;
222  crs_matrix_struct_type Bview;
223 
224  RCP<const map_type> targetMap_A = Aprime->getRowMap();
225  RCP<const map_type> targetMap_B = Bprime->getRowMap();
226 
227 #ifdef HAVE_TPETRA_MMM_TIMINGS
228  {
229  TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
230 #endif
231 
232  // Now import any needed remote rows and populate the Aview struct
233  // NOTE: We assert that an import isn't needed --- since we do the transpose
234  // above to handle that.
235  if (!use_optimized_ATB) {
236  RCP<const import_type> dummyImporter;
237  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
238  }
239 
240  // We will also need local access to all rows of B that correspond to the
241  // column-map of op(A).
242  if (numProcs > 1)
243  targetMap_B = Aprime->getColMap();
244 
245  // Import any needed remote rows and populate the Bview struct.
246  if (!use_optimized_ATB)
247  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label, params);
248 
249 #ifdef HAVE_TPETRA_MMM_TIMINGS
250  } //stop MM_importExtract here
251  //stop the setup timer, and start the multiply timer
252  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
253 #endif
254 
255  // Call the appropriate method to perform the actual multiplication.
256  if (use_optimized_ATB) {
257  MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
258 
259  } else if (call_FillComplete_on_result && newFlag) {
260  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
261 
262  } else if (call_FillComplete_on_result) {
263  MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
264 
265  } else {
266  // mfh 27 Sep 2016: Is this the "slow" case? This
267  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
268  // thread-parallel inserts, but that may take some effort.
269  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
270 
271  MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
272  }
273 
274 #ifdef HAVE_TPETRA_MMM_TIMINGS
275  TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
276 #endif
277  if (call_FillComplete_on_result && !C.isFillComplete()) {
278  // We'll call FillComplete on the C matrix before we exit, and give it a
279  // domain-map and a range-map.
280  // The domain-map will be the domain-map of B, unless
281  // op(B)==transpose(B), in which case the range-map of B will be used.
282  // The range-map will be the range-map of A, unless op(A)==transpose(A),
283  // in which case the domain-map of A will be used.
284  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
285  }
286 }
287 
288 
289 template <class Scalar,
290  class LocalOrdinal,
291  class GlobalOrdinal,
292  class Node>
293 void Jacobi(Scalar omega,
298  bool call_FillComplete_on_result,
299  const std::string& label,
300  const Teuchos::RCP<Teuchos::ParameterList>& params)
301 {
302  using Teuchos::RCP;
303  typedef Scalar SC;
304  typedef LocalOrdinal LO;
305  typedef GlobalOrdinal GO;
306  typedef Node NO;
307  typedef Import<LO,GO,NO> import_type;
308  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
309  typedef Map<LO,GO,NO> map_type;
310  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
311 
312 #ifdef HAVE_TPETRA_MMM_TIMINGS
313  std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
314  using Teuchos::TimeMonitor;
315  TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
316 #endif
317 
318  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
319 
320  // A and B should already be Filled.
321  // Should we go ahead and call FillComplete() on them if necessary or error
322  // out? For now, we choose to error out.
323  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
324  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
325 
326  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
327  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
328 
329  // Now check size compatibility
330  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
331  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
332  global_size_t Aouter = A.getGlobalNumRows();
333  global_size_t Bouter = numBCols;
334  global_size_t Ainner = numACols;
335  global_size_t Binner = B.getGlobalNumRows();
336  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
337  prefix << "ERROR, inner dimensions of op(A) and op(B) "
338  "must match for matrix-matrix product. op(A) is "
339  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
340 
341  // The result matrix C must at least have a row-map that reflects the correct
342  // row-size. Don't check the number of columns because rectangular matrices
343  // which were constructed with only one map can still end up having the
344  // correct capacity and dimensions when filled.
345  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
346  prefix << "ERROR, dimensions of result C must "
347  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
348  << " rows, should have at least "<< Aouter << std::endl);
349 
350  // It doesn't matter whether C is already Filled or not. If it is already
351  // Filled, it must have space allocated for the positions that will be
352  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
353  // we'll error out later when trying to store result values.
354 
355  // CGB: However, matrix must be in active-fill
356  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
357 
358  // We're going to need to import remotely-owned sections of A and/or B if
359  // more than one processor is performing this run, depending on the scenario.
360  int numProcs = A.getComm()->getSize();
361 
362  // Declare a couple of structs that will be used to hold views of the data of
363  // A and B, to be used for fast access during the matrix-multiplication.
364  crs_matrix_struct_type Aview;
365  crs_matrix_struct_type Bview;
366 
367  RCP<const map_type> targetMap_A = Aprime->getRowMap();
368  RCP<const map_type> targetMap_B = Bprime->getRowMap();
369 
370 #ifdef HAVE_TPETRA_MMM_TIMINGS
371  TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
372  {
373 #endif
374 
375  // Enable globalConstants by default
376  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
377  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
378  if(!params.is_null()) {
379  importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
380  int mm_optimization_core_count=0;
381  auto slist = params->sublist("matrixmatrix: kernel params",false);
382  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ());
383  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
384  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
385  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
386  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
387  auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
388  ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
389  ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
390  ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
391  }
392 
393  //Now import any needed remote rows and populate the Aview struct.
394  RCP<const import_type> dummyImporter;
395  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
396 
397  // We will also need local access to all rows of B that correspond to the
398  // column-map of op(A).
399  if (numProcs > 1)
400  targetMap_B = Aprime->getColMap();
401 
402  // Now import any needed remote rows and populate the Bview struct.
403  // Enable globalConstants by default
404  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
405  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
406  if(!params.is_null()) {
407  importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
408 
409  auto slist = params->sublist("matrixmatrix: kernel params",false);
410  int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount () );
411  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
412  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
413  auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
414  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
415  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
416  ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
417  ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
418  ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
419  }
420 
421  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label,importParams2);
422 
423 #ifdef HAVE_TPETRA_MMM_TIMINGS
424  }
425  TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
426 #endif
427 
428  // Now call the appropriate method to perform the actual multiplication.
429  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
430 
431  // Is this a "clean" matrix
432  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
433 
434  if (call_FillComplete_on_result && newFlag) {
435  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
436 
437  } else if (call_FillComplete_on_result) {
438  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
439 
440  } else {
441  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
442  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
443  // commenting it out.
444 // #ifdef HAVE_TPETRA_MMM_TIMINGS
445 // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
446 // #endif
447  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
448  // commenting it out.
449  // if (call_FillComplete_on_result) {
450  // //We'll call FillComplete on the C matrix before we exit, and give
451  // //it a domain-map and a range-map.
452  // //The domain-map will be the domain-map of B, unless
453  // //op(B)==transpose(B), in which case the range-map of B will be used.
454  // //The range-map will be the range-map of A, unless
455  // //op(A)==transpose(A), in which case the domain-map of A will be used.
456  // if (!C.isFillComplete()) {
457  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
458  // }
459  // }
460  }
461 }
462 
463 
464 template <class Scalar,
465  class LocalOrdinal,
466  class GlobalOrdinal,
467  class Node>
468 void Add(
470  bool transposeA,
471  Scalar scalarA,
473  Scalar scalarB )
474 {
475  using Teuchos::Array;
476  using Teuchos::RCP;
477  using Teuchos::null;
478  typedef Scalar SC;
479  typedef LocalOrdinal LO;
480  typedef GlobalOrdinal GO;
481  typedef Node NO;
482  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
483  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
484 
485  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
486 
487  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
488  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
489  "(Result matrix B is not required to be isFillComplete()).");
490  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
491  prefix << "ERROR, input matrix B must not be fill complete!");
492  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
493  prefix << "ERROR, input matrix B must not have static graph!");
494  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
495  prefix << "ERROR, input matrix B must not be locally indexed!");
496 
497  using Teuchos::ParameterList;
498  RCP<ParameterList> transposeParams (new ParameterList);
499  transposeParams->set ("sort", false);
500 
501  RCP<const crs_matrix_type> Aprime = null;
502  if (transposeA) {
503  transposer_type transposer (rcpFromRef (A));
504  Aprime = transposer.createTranspose (transposeParams);
505  }
506  else {
507  Aprime = rcpFromRef(A);
508  }
509 
510  size_t a_numEntries;
511  Array<GO> a_inds(A.getNodeMaxNumRowEntries());
512  Array<SC> a_vals(A.getNodeMaxNumRowEntries());
513  GO row;
514 
515  if (scalarB != Teuchos::ScalarTraits<SC>::one())
516  B.scale(scalarB);
517 
518  bool bFilled = B.isFillComplete();
519  size_t numMyRows = B.getNodeNumRows();
520  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
521  for (LO i = 0; (size_t)i < numMyRows; ++i) {
522  row = B.getRowMap()->getGlobalElement(i);
523  Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
524 
525  if (scalarA != Teuchos::ScalarTraits<SC>::one())
526  for (size_t j = 0; j < a_numEntries; ++j)
527  a_vals[j] *= scalarA;
528 
529  if (bFilled)
530  B.sumIntoGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
531  else
532  B.insertGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
533  }
534  }
535 }
536 
537 template <class Scalar,
538  class LocalOrdinal,
539  class GlobalOrdinal,
540  class Node>
541 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
542 add (const Scalar& alpha,
543  const bool transposeA,
545  const Scalar& beta,
546  const bool transposeB,
548  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
549  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
550  const Teuchos::RCP<Teuchos::ParameterList>& params)
551 {
552  using Teuchos::RCP;
553  using Teuchos::rcpFromRef;
554  using Teuchos::rcp;
555  using Teuchos::ParameterList;
557  if(!params.is_null())
558  {
559  TEUCHOS_TEST_FOR_EXCEPTION(
560  params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
561  std::invalid_argument,
562  "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
563  "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
564  params->set("Call fillComplete", true);
565  }
566  //If transposeB, must compute B's explicit transpose to
567  //get the correct row map for C.
568  RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
569  if(transposeB)
570  {
572  Brcp = transposer.createTranspose();
573  }
574  //Check that A,B are fillComplete before getting B's column map
575  TEUCHOS_TEST_FOR_EXCEPTION
576  (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
577  "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
578  RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
579  //this version of add() always fill completes the result, no matter what is in params on input
580  add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
581  return C;
582 }
583 
584 //This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
585 //but since the spadd() output is always packed there is no need for a separate
586 //numRowEntries here.
587 //
588 template<class LO, class GO, class LOView, class GOView, class LocalMap>
589 struct ConvertGlobalToLocalFunctor
590 {
591  ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
592  : lids(lids_), gids(gids_), localColMap(localColMap_)
593  {}
594 
595  KOKKOS_FUNCTION void operator() (const GO i) const
596  {
597  lids(i) = localColMap.getLocalElement(gids(i));
598  }
599 
600  LOView lids;
601  const GOView gids;
602  const LocalMap localColMap;
603 };
604 
605 template <class Scalar,
606  class LocalOrdinal,
607  class GlobalOrdinal,
608  class Node>
609 void
610 add (const Scalar& alpha,
611  const bool transposeA,
613  const Scalar& beta,
614  const bool transposeB,
617  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
618  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
619  const Teuchos::RCP<Teuchos::ParameterList>& params)
620 {
621  using Teuchos::RCP;
622  using Teuchos::rcp;
623  using Teuchos::rcpFromRef;
624  using Teuchos::rcp_implicit_cast;
625  using Teuchos::rcp_dynamic_cast;
626  using Teuchos::TimeMonitor;
627  using SC = Scalar;
628  using LO = LocalOrdinal;
629  using GO = GlobalOrdinal;
630  using NO = Node;
631  using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
632  using crs_graph_type = CrsGraph<LO,GO,NO>;
633  using map_type = Map<LO,GO,NO>;
634  using transposer_type = RowMatrixTransposer<SC,LO,GO,NO>;
635  using import_type = Import<LO,GO,NO>;
636  using export_type = Export<LO,GO,NO>;
637  using exec_space = typename crs_graph_type::execution_space;
638  using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
639  const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
640  constexpr bool debug = false;
641 
642 #ifdef HAVE_TPETRA_MMM_TIMINGS
643  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
644 #endif
645 
646  if (debug) {
647  std::ostringstream os;
648  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
649  << "TpetraExt::MatrixMatrix::add" << std::endl;
650  std::cerr << os.str ();
651  }
652 
653  TEUCHOS_TEST_FOR_EXCEPTION
654  (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
655  prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
656  TEUCHOS_TEST_FOR_EXCEPTION
657  (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
658  prefix_mmm << "A and B must both be fill complete.");
659 #ifdef HAVE_TPETRA_DEBUG
660  // The matrices don't have domain or range Maps unless they are fill complete.
661  if (A.isFillComplete () && B.isFillComplete ()) {
662  const bool domainMapsSame =
663  (! transposeA && ! transposeB &&
664  ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
665  (! transposeA && transposeB &&
666  ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
667  ( transposeA && ! transposeB &&
668  ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
669  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
670  prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
671 
672  const bool rangeMapsSame =
673  (! transposeA && ! transposeB &&
674  ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
675  (! transposeA && transposeB &&
676  ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
677  ( transposeA && ! transposeB &&
678  ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
679  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
680  prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
681  }
682 #endif // HAVE_TPETRA_DEBUG
683 
684  using Teuchos::ParameterList;
685  // Form the explicit transpose of A if necessary.
686  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
687  if (transposeA) {
688  transposer_type transposer (Aprime);
689  Aprime = transposer.createTranspose ();
690  }
691 
692 #ifdef HAVE_TPETRA_DEBUG
693  TEUCHOS_TEST_FOR_EXCEPTION
694  (Aprime.is_null (), std::logic_error,
695  prefix_mmm << "Failed to compute Op(A). "
696  "Please report this bug to the Tpetra developers.");
697 #endif // HAVE_TPETRA_DEBUG
698 
699  // Form the explicit transpose of B if necessary.
700  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
701  if (transposeB) {
702  if (debug) {
703  std::ostringstream os;
704  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
705  << "Form explicit xpose of B" << std::endl;
706  std::cerr << os.str ();
707  }
708  transposer_type transposer (Bprime);
709  Bprime = transposer.createTranspose ();
710  }
711 #ifdef HAVE_TPETRA_DEBUG
712  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
713  prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
714  TEUCHOS_TEST_FOR_EXCEPTION(
715  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
716  prefix_mmm << "Aprime and Bprime must both be fill complete. "
717  "Please report this bug to the Tpetra developers.");
718 #endif // HAVE_TPETRA_DEBUG
719  RCP<const map_type> CDomainMap = domainMap;
720  RCP<const map_type> CRangeMap = rangeMap;
721  if(CDomainMap.is_null())
722  {
723  CDomainMap = Bprime->getDomainMap();
724  }
725  if(CRangeMap.is_null())
726  {
727  CRangeMap = Bprime->getRangeMap();
728  }
729  assert(!(CDomainMap.is_null()));
730  assert(!(CRangeMap.is_null()));
731  typedef typename AddKern::values_array values_array;
732  typedef typename AddKern::row_ptrs_array row_ptrs_array;
733  typedef typename AddKern::col_inds_array col_inds_array;
734  bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
735  bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
736  values_array vals;
737  row_ptrs_array rowptrs;
738  col_inds_array colinds;
739 #ifdef HAVE_TPETRA_MMM_TIMINGS
740  MM = Teuchos::null;
741  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
742 #endif
743  if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
744  {
745  //import Aprime into Bprime's row map so the local matrices have same # of rows
746  auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
747  // cbl do not set
748  // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
749  // this import _may_ take the form of a transfer. In practice it would be unlikely,
750  // but the general case is not so forgiving.
751  Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
752  }
753  bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
754  bool sorted = AGraphSorted && BGraphSorted;
755  RCP<const import_type> Cimport = Teuchos::null;
756  RCP<export_type> Cexport = Teuchos::null;
757  bool doFillComplete = true;
758  if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
759  {
760  doFillComplete = params->get<bool>("Call fillComplete");
761  }
762  auto Alocal = Aprime->getLocalMatrix();
763  auto Blocal = Bprime->getLocalMatrix();
764  LO numLocalRows = Alocal.numRows();
765  if(numLocalRows == 0)
766  {
767  //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
768  //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
769  //Handle this case now
770  //(without interfering with collective operations, since it's possible for
771  //some ranks to have 0 local rows and others not).
772  rowptrs = row_ptrs_array("C rowptrs", 0);
773  }
774  auto Acolmap = Aprime->getColMap();
775  auto Bcolmap = Bprime->getColMap();
776  if(!matchingColMaps)
777  {
778  using global_col_inds_array = typename AddKern::global_col_inds_array;
779 #ifdef HAVE_TPETRA_MMM_TIMINGS
780  MM = Teuchos::null;
781  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
782 #endif
783  //use kernel that converts col indices in both A and B to common domain map before adding
784  auto AlocalColmap = Acolmap->getLocalMap();
785  auto BlocalColmap = Bcolmap->getLocalMap();
786  global_col_inds_array globalColinds;
787  if (debug) {
788  std::ostringstream os;
789  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
790  << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
791  std::cerr << os.str ();
792  }
793  AddKern::convertToGlobalAndAdd(
794  Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
795  vals, rowptrs, globalColinds);
796  if (debug) {
797  std::ostringstream os;
798  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
799  << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
800  std::cerr << os.str ();
801  }
802 #ifdef HAVE_TPETRA_MMM_TIMINGS
803  MM = Teuchos::null;
804  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Constructing graph"))));
805 #endif
806  RCP<const map_type> CcolMap;
807  Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
808  (CcolMap, CDomainMap, globalColinds);
809  C.replaceColMap(CcolMap);
810  col_inds_array localColinds("C colinds", globalColinds.extent(0));
811  Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
812  ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
813  col_inds_array, global_col_inds_array,
814  typename map_type::local_map_type>
815  (localColinds, globalColinds, CcolMap->getLocalMap()));
816  KokkosKernels::Impl::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
817  C.setAllValues(rowptrs, localColinds, vals);
818  C.fillComplete(CDomainMap, CRangeMap, params);
819  if(!doFillComplete)
820  C.resumeFill();
821  }
822  else
823  {
824  //Aprime, Bprime and C all have the same column maps
825  auto Avals = Alocal.values;
826  auto Bvals = Blocal.values;
827  auto Arowptrs = Alocal.graph.row_map;
828  auto Browptrs = Blocal.graph.row_map;
829  auto Acolinds = Alocal.graph.entries;
830  auto Bcolinds = Blocal.graph.entries;
831  if(sorted)
832  {
833  //use sorted kernel
834 #ifdef HAVE_TPETRA_MMM_TIMINGS
835  MM = Teuchos::null;
836  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
837 #endif
838  if (debug) {
839  std::ostringstream os;
840  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
841  << "Call AddKern::addSorted(...)" << std::endl;
842  std::cerr << os.str ();
843  }
844  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
845  }
846  else
847  {
848  //use unsorted kernel
849 #ifdef HAVE_TPETRA_MMM_TIMINGS
850  MM = Teuchos::null;
851  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
852 #endif
853  if (debug) {
854  std::ostringstream os;
855  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
856  << "Call AddKern::addUnsorted(...)" << std::endl;
857  std::cerr << os.str ();
858  }
859  AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
860  }
861  //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
862  RCP<const map_type> Ccolmap = Bcolmap;
863  C.replaceColMap(Ccolmap);
864  C.setAllValues(rowptrs, colinds, vals);
865  if(doFillComplete)
866  {
867  #ifdef HAVE_TPETRA_MMM_TIMINGS
868  MM = Teuchos::null;
869  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
870  #endif
871  if(!CDomainMap->isSameAs(*Ccolmap))
872  {
873  if (debug) {
874  std::ostringstream os;
875  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
876  << "Create Cimport" << std::endl;
877  std::cerr << os.str ();
878  }
879  Cimport = rcp(new import_type(CDomainMap, Ccolmap));
880  }
881  if(!C.getRowMap()->isSameAs(*CRangeMap))
882  {
883  if (debug) {
884  std::ostringstream os;
885  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
886  << "Create Cexport" << std::endl;
887  std::cerr << os.str ();
888  }
889  Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
890  }
891 
892  if (debug) {
893  std::ostringstream os;
894  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
895  << "Call C->expertStaticFillComplete(...)" << std::endl;
896  std::cerr << os.str ();
897  }
898  C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
899  }
900  }
901 }
902 
903 template <class Scalar,
904  class LocalOrdinal,
905  class GlobalOrdinal,
906  class Node>
907 void Add(
909  bool transposeA,
910  Scalar scalarA,
912  bool transposeB,
913  Scalar scalarB,
915 {
916  using Teuchos::Array;
917  using Teuchos::ArrayRCP;
918  using Teuchos::ArrayView;
919  using Teuchos::RCP;
920  using Teuchos::rcp;
921  using Teuchos::rcp_dynamic_cast;
922  using Teuchos::rcpFromRef;
923  using Teuchos::tuple;
924  using std::endl;
925  // typedef typename ArrayView<const Scalar>::size_type size_type;
926  typedef Teuchos::ScalarTraits<Scalar> STS;
928  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
929  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
930  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
933 
934  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
935 
936  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
937  prefix << "The case C == null does not actually work. Fixing this will require an interface change.");
938 
939  TEUCHOS_TEST_FOR_EXCEPTION(
940  ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
941  prefix << "Both input matrices must be fill complete before calling this function.");
942 
943 #ifdef HAVE_TPETRA_DEBUG
944  {
945  const bool domainMapsSame =
946  (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
947  (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
948  (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
949  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
950  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
951 
952  const bool rangeMapsSame =
953  (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
954  (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
955  (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
956  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
957  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
958  }
959 #endif // HAVE_TPETRA_DEBUG
960 
961  using Teuchos::ParameterList;
962  RCP<ParameterList> transposeParams (new ParameterList);
963  transposeParams->set ("sort", false);
964 
965  // Form the explicit transpose of A if necessary.
966  RCP<const crs_matrix_type> Aprime;
967  if (transposeA) {
968  transposer_type theTransposer (rcpFromRef (A));
969  Aprime = theTransposer.createTranspose (transposeParams);
970  }
971  else {
972  Aprime = rcpFromRef (A);
973  }
974 
975 #ifdef HAVE_TPETRA_DEBUG
976  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
977  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
978 #endif // HAVE_TPETRA_DEBUG
979 
980  // Form the explicit transpose of B if necessary.
981  RCP<const crs_matrix_type> Bprime;
982  if (transposeB) {
983  transposer_type theTransposer (rcpFromRef (B));
984  Bprime = theTransposer.createTranspose (transposeParams);
985  }
986  else {
987  Bprime = rcpFromRef (B);
988  }
989 
990 #ifdef HAVE_TPETRA_DEBUG
991  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
992  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
993 #endif // HAVE_TPETRA_DEBUG
994 
995  // Allocate or zero the entries of the result matrix.
996  if (! C.is_null ()) {
997  C->setAllToScalar (STS::zero ());
998  } else {
999  // FIXME (mfh 08 May 2013) When I first looked at this method, I
1000  // noticed that C was being given the row Map of Aprime (the
1001  // possibly transposed version of A). Is this what we want?
1002  C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
1003  }
1004 
1005 #ifdef HAVE_TPETRA_DEBUG
1006  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1007  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1008  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1009  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1010  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1011  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1012 #endif // HAVE_TPETRA_DEBUG
1013 
1014  Array<RCP<const crs_matrix_type> > Mat =
1015  tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1016  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1017 
1018  // do a loop over each matrix to add: A reordering might be more efficient
1019  for (int k = 0; k < 2; ++k) {
1020  Array<GlobalOrdinal> Indices;
1021  Array<Scalar> Values;
1022 
1023  // Loop over each locally owned row of the current matrix (either
1024  // Aprime or Bprime), and sum its entries into the corresponding
1025  // row of C. This works regardless of whether Aprime or Bprime
1026  // has the same row Map as C, because both sumIntoGlobalValues and
1027  // insertGlobalValues allow summing resp. inserting into nonowned
1028  // rows of C.
1029 #ifdef HAVE_TPETRA_DEBUG
1030  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1031  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1032 #endif // HAVE_TPETRA_DEBUG
1033  RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1034 #ifdef HAVE_TPETRA_DEBUG
1035  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1036  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1037 #endif // HAVE_TPETRA_DEBUG
1038 
1039  const size_t localNumRows = Mat[k]->getNodeNumRows ();
1040  for (size_t i = 0; i < localNumRows; ++i) {
1041  const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1042  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1043  if (numEntries > 0) {
1044  Indices.resize (numEntries);
1045  Values.resize (numEntries);
1046  Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);
1047 
1048  if (scalar[k] != STS::one ()) {
1049  for (size_t j = 0; j < numEntries; ++j) {
1050  Values[j] *= scalar[k];
1051  }
1052  }
1053 
1054  if (C->isFillComplete ()) {
1055  C->sumIntoGlobalValues (globalRow, Indices, Values);
1056  } else {
1057  C->insertGlobalValues (globalRow, Indices, Values);
1058  }
1059  }
1060  }
1061  }
1062 }
1063 
1064 } //End namespace MatrixMatrix
1065 
1066 namespace MMdetails{
1067 
1068 /*********************************************************************************************************/
1069 // Prints MMM-style statistics on communication done with an Import or Export object
1070 template <class TransferType>
1071 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1072  if (Transfer.is_null())
1073  return;
1074 
1075  const Distributor & Distor = Transfer->getDistributor();
1076  Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1077 
1078  size_t rows_send = Transfer->getNumExportIDs();
1079  size_t rows_recv = Transfer->getNumRemoteIDs();
1080 
1081  size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1082  size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1083  size_t num_send_neighbors = Distor.getNumSends();
1084  size_t num_recv_neighbors = Distor.getNumReceives();
1085  size_t round2_send, round2_recv;
1086  Distor.getLastDoStatistics(round2_send,round2_recv);
1087 
1088  int myPID = Comm->getRank();
1089  int NumProcs = Comm->getSize();
1090 
1091  // Processor by processor statistics
1092  // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1093  // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1094 
1095  // Global statistics
1096  size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1097  size_t gstats_min[8], gstats_max[8];
1098 
1099  double lstats_avg[8], gstats_avg[8];
1100  for(int i=0; i<8; i++)
1101  lstats_avg[i] = ((double)lstats[i])/NumProcs;
1102 
1103  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1104  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1105  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1106 
1107  if(!myPID) {
1108  printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1109  (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1110  (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1111  printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1112  (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1113  (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1114  }
1115 }
1116 
1117 // Kernel method for computing the local portion of C = A*B
1118 template<class Scalar,
1119  class LocalOrdinal,
1120  class GlobalOrdinal,
1121  class Node>
1122 void mult_AT_B_newmatrix(
1123  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1124  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1125  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1126  const std::string & label,
1127  const Teuchos::RCP<Teuchos::ParameterList>& params)
1128 {
1129  using Teuchos::RCP;
1130  using Teuchos::rcp;
1131  typedef Scalar SC;
1132  typedef LocalOrdinal LO;
1133  typedef GlobalOrdinal GO;
1134  typedef Node NO;
1135  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1136  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1137 
1138 #ifdef HAVE_TPETRA_MMM_TIMINGS
1139  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1140  using Teuchos::TimeMonitor;
1141  RCP<TimeMonitor> MM = rcp (new TimeMonitor
1142  (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1143 #endif
1144 
1145  /*************************************************************/
1146  /* 1) Local Transpose of A */
1147  /*************************************************************/
1148  transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1149 
1150  using Teuchos::ParameterList;
1151  RCP<ParameterList> transposeParams (new ParameterList);
1152  transposeParams->set ("sort", false);
1153  if(! params.is_null ()) {
1154  transposeParams->set ("compute global constants",
1155  params->get ("compute global constants: temporaries",
1156  false));
1157  }
1158  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1159  transposer.createTransposeLocal (transposeParams);
1160 
1161  /*************************************************************/
1162  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1163  /*************************************************************/
1164 #ifdef HAVE_TPETRA_MMM_TIMINGS
1165  MM = Teuchos::null;
1166  MM = rcp (new TimeMonitor
1167  (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1168 #endif
1169 
1170  // Get views, asserting that no import is required to speed up computation
1171  crs_matrix_struct_type Aview;
1172  crs_matrix_struct_type Bview;
1173  RCP<const Import<LO, GO, NO> > dummyImporter;
1174 
1175  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1176  RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1177  if (! params.is_null ()) {
1178  importParams1->set ("compute global constants",
1179  params->get ("compute global constants: temporaries",
1180  false));
1181  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1182  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1183  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1184  int mm_optimization_core_count =
1186  mm_optimization_core_count =
1187  slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1188  int mm_optimization_core_count2 =
1189  params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1190  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1191  mm_optimization_core_count = mm_optimization_core_count2;
1192  }
1193  auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1194  sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1195  sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1196  sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1197  }
1198 
1199  MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1200  Aview, dummyImporter, true,
1201  label, importParams1);
1202 
1203  RCP<ParameterList> importParams2 (new ParameterList);
1204  if (! params.is_null ()) {
1205  importParams2->set ("compute global constants",
1206  params->get ("compute global constants: temporaries",
1207  false));
1208  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1209  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1210  bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1211  int mm_optimization_core_count =
1213  mm_optimization_core_count =
1214  slist.get ("MM_TAFC_OptimizationCoreCount",
1215  mm_optimization_core_count);
1216  int mm_optimization_core_count2 =
1217  params->get ("MM_TAFC_OptimizationCoreCount",
1218  mm_optimization_core_count);
1219  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1220  mm_optimization_core_count = mm_optimization_core_count2;
1221  }
1222  auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1223  sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1224  sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1225  sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1226  }
1227 
1228  if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1229  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1230  }
1231  else {
1232  MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1233  }
1234 
1235 #ifdef HAVE_TPETRA_MMM_TIMINGS
1236  MM = Teuchos::null;
1237  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1238 #endif
1239 
1240  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1241 
1242  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1243  bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1244  if (needs_final_export) {
1245  Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1246  }
1247  else {
1248  Ctemp = rcp (&C, false);
1249  }
1250 
1251  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1252 
1253  /*************************************************************/
1254  /* 4) exportAndFillComplete matrix */
1255  /*************************************************************/
1256 #ifdef HAVE_TPETRA_MMM_TIMINGS
1257  MM = Teuchos::null;
1258  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1259 #endif
1260 
1261  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1262 
1263  if (needs_final_export) {
1264  ParameterList labelList;
1265  labelList.set("Timer Label", label);
1266  if(!params.is_null()) {
1267  ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1268  ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1269  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1270  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1271  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1272  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1273  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1274  bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1275  bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1276 
1277  labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1278  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1279  labelList.set("compute global constants",params->get("compute global constants",true));
1280  labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1281  }
1282  Ctemp->exportAndFillComplete (Crcp,
1283  *Ctemp->getGraph ()->getExporter (),
1284  B.getDomainMap (),
1285  A.getDomainMap (),
1286  rcp (&labelList, false));
1287  }
1288 #ifdef HAVE_TPETRA_MMM_STATISTICS
1289  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1290 #endif
1291 }
1292 
1293 /*********************************************************************************************************/
1294 // Kernel method for computing the local portion of C = A*B
1295 template<class Scalar,
1296  class LocalOrdinal,
1297  class GlobalOrdinal,
1298  class Node>
1299 void mult_A_B(
1300  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1301  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1302  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1303  const std::string& /* label */,
1304  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1305 {
1306  using Teuchos::Array;
1307  using Teuchos::ArrayRCP;
1308  using Teuchos::ArrayView;
1309  using Teuchos::OrdinalTraits;
1310  using Teuchos::null;
1311 
1312  typedef Teuchos::ScalarTraits<Scalar> STS;
1313  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1314  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1315  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1316 
1317  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1318  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1319 
1320  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1321  ArrayView<const GlobalOrdinal> bcols_import = null;
1322  if (Bview.importColMap != null) {
1323  C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1324  C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1325 
1326  bcols_import = Bview.importColMap->getNodeElementList();
1327  }
1328 
1329  size_t C_numCols = C_lastCol - C_firstCol +
1330  OrdinalTraits<LocalOrdinal>::one();
1331  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1332  OrdinalTraits<LocalOrdinal>::one();
1333 
1334  if (C_numCols_import > C_numCols)
1335  C_numCols = C_numCols_import;
1336 
1337  Array<Scalar> dwork = Array<Scalar>(C_numCols);
1338  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1339  Array<size_t> iwork2 = Array<size_t>(C_numCols);
1340 
1341  Array<Scalar> C_row_i = dwork;
1342  Array<GlobalOrdinal> C_cols = iwork;
1343  Array<size_t> c_index = iwork2;
1344  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1345  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1346 
1347  size_t C_row_i_length, j, k, last_index;
1348 
1349  // Run through all the hash table lookups once and for all
1350  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1351  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1352  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1353  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1354  // Maps are the same: Use local IDs as the hash
1355  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1356  Aview.colMap->getMaxLocalIndex(); i++)
1357  Acol2Brow[i]=i;
1358  }
1359  else {
1360  // Maps are not the same: Use the map's hash
1361  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1362  Aview.colMap->getMaxLocalIndex(); i++) {
1363  GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1364  LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1365  if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1366  else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1367  }
1368  }
1369 
1370  // To form C = A*B we're going to execute this expression:
1371  //
1372  // C(i,j) = sum_k( A(i,k)*B(k,j) )
1373  //
1374  // Our goal, of course, is to navigate the data in A and B once, without
1375  // performing searches for column-indices, etc.
1376  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1377  ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1378  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1379  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1380  ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1381  ArrayView<const Scalar> Avals, Bvals, Ivals;
1382  Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1383  Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1384  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1385  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1386  if(!Bview.importMatrix.is_null()) {
1387  Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1388  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1389  }
1390 
1391  bool C_filled = C.isFillComplete();
1392 
1393  for (size_t i = 0; i < C_numCols; i++)
1394  c_index[i] = OrdinalTraits<size_t>::invalid();
1395 
1396  // Loop over the rows of A.
1397  size_t Arows = Aview.rowMap->getNodeNumElements();
1398  for(size_t i=0; i<Arows; ++i) {
1399 
1400  // Only navigate the local portion of Aview... which is, thankfully, all of
1401  // A since this routine doesn't do transpose modes
1402  GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1403 
1404  // Loop across the i-th row of A and for each corresponding row in B, loop
1405  // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1406  // quantities C_row_i. In other words, as we stride across B(k,:) we're
1407  // calculating updates for row i of the result matrix C.
1408  C_row_i_length = OrdinalTraits<size_t>::zero();
1409 
1410  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1411  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1412  const Scalar Aval = Avals[k];
1413  if (Aval == STS::zero())
1414  continue;
1415 
1416  if (Ak == LO_INVALID)
1417  continue;
1418 
1419  for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1420  LocalOrdinal col = Bcolind[j];
1421  //assert(col >= 0 && col < C_numCols);
1422 
1423  if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1424  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1425  // This has to be a += so insertGlobalValue goes out
1426  C_row_i[C_row_i_length] = Aval*Bvals[j];
1427  C_cols[C_row_i_length] = col;
1428  c_index[col] = C_row_i_length;
1429  C_row_i_length++;
1430 
1431  } else {
1432  C_row_i[c_index[col]] += Aval*Bvals[j];
1433  }
1434  }
1435  }
1436 
1437  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1438  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1439  C_cols[ii] = bcols[C_cols[ii]];
1440  combined_index[ii] = C_cols[ii];
1441  combined_values[ii] = C_row_i[ii];
1442  }
1443  last_index = C_row_i_length;
1444 
1445  //
1446  //Now put the C_row_i values into C.
1447  //
1448  // We might have to revamp this later.
1449  C_row_i_length = OrdinalTraits<size_t>::zero();
1450 
1451  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1452  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1453  const Scalar Aval = Avals[k];
1454  if (Aval == STS::zero())
1455  continue;
1456 
1457  if (Ak!=LO_INVALID) continue;
1458 
1459  Ak = Acol2Irow[Acolind[k]];
1460  for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1461  LocalOrdinal col = Icolind[j];
1462  //assert(col >= 0 && col < C_numCols);
1463 
1464  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1465  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1466  // This has to be a += so insertGlobalValue goes out
1467  C_row_i[C_row_i_length] = Aval*Ivals[j];
1468  C_cols[C_row_i_length] = col;
1469  c_index[col] = C_row_i_length;
1470  C_row_i_length++;
1471 
1472  } else {
1473  // This has to be a += so insertGlobalValue goes out
1474  C_row_i[c_index[col]] += Aval*Ivals[j];
1475  }
1476  }
1477  }
1478 
1479  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1480  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1481  C_cols[ii] = bcols_import[C_cols[ii]];
1482  combined_index[last_index] = C_cols[ii];
1483  combined_values[last_index] = C_row_i[ii];
1484  last_index++;
1485  }
1486 
1487  // Now put the C_row_i values into C.
1488  // We might have to revamp this later.
1489  C_filled ?
1490  C.sumIntoGlobalValues(
1491  global_row,
1492  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1493  combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1494  :
1495  C.insertGlobalValues(
1496  global_row,
1497  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1498  combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1499 
1500  }
1501 }
1502 
1503 /*********************************************************************************************************/
1504 template<class Scalar,
1505  class LocalOrdinal,
1506  class GlobalOrdinal,
1507  class Node>
1508 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1509  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1510  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1511 
1512  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1513  Mview.maxNumRowEntries = Mview.indices[0].size();
1514 
1515  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1516  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1517  Mview.maxNumRowEntries = Mview.indices[i].size();
1518  }
1519 }
1520 
1521 /*********************************************************************************************************/
1522 template<class CrsMatrixType>
1523 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1524  // Follows the NZ estimate in ML's ml_matmatmult.c
1525  size_t Aest = 100, Best=100;
1526  if (A.getNodeNumEntries() >= A.getNodeNumRows())
1527  Aest = (A.getNodeNumRows() > 0) ? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
1528  if (B.getNodeNumEntries() >= B.getNodeNumRows())
1529  Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
1530 
1531  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1532  nnzperrow *= nnzperrow;
1533 
1534  return (size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
1535 }
1536 
1537 
1538 /*********************************************************************************************************/
1539 // Kernel method for computing the local portion of C = A*B
1540 //
1541 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1542 // function, so this is probably the function we want to
1543 // thread-parallelize.
1544 template<class Scalar,
1545  class LocalOrdinal,
1546  class GlobalOrdinal,
1547  class Node>
1548 void mult_A_B_newmatrix(
1549  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1550  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1551  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1552  const std::string& label,
1553  const Teuchos::RCP<Teuchos::ParameterList>& params)
1554 {
1555  using Teuchos::Array;
1556  using Teuchos::ArrayRCP;
1557  using Teuchos::ArrayView;
1558  using Teuchos::RCP;
1559  using Teuchos::rcp;
1560 
1561  // Tpetra typedefs
1562  typedef LocalOrdinal LO;
1563  typedef GlobalOrdinal GO;
1564  typedef Node NO;
1565  typedef Import<LO,GO,NO> import_type;
1566  typedef Map<LO,GO,NO> map_type;
1567 
1568  // Kokkos typedefs
1569  typedef typename map_type::local_map_type local_map_type;
1571  typedef typename KCRS::StaticCrsGraphType graph_t;
1572  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1573  typedef typename NO::execution_space execution_space;
1574  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1575  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1576 
1577 #ifdef HAVE_TPETRA_MMM_TIMINGS
1578  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1579  using Teuchos::TimeMonitor;
1580  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1581 #endif
1582  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1583 
1584  // Build the final importer / column map, hash table lookups for C
1585  RCP<const import_type> Cimport;
1586  RCP<const map_type> Ccolmap;
1587  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1588  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1589  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1590  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1591  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1592  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1593  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1594  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1595 
1596  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1597  // indices of B, to local column indices of C. (B and C have the
1598  // same number of columns.) The kernel uses this, instead of
1599  // copying the entire input matrix B and converting its column
1600  // indices to those of C.
1601  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1602 
1603  if (Bview.importMatrix.is_null()) {
1604  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1605  Cimport = Bimport;
1606  Ccolmap = Bview.colMap;
1607  const LO colMapSize = static_cast<LO>(Bview.colMap->getNodeNumElements());
1608  // Bcol2Ccol is trivial
1609  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1610  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1611  KOKKOS_LAMBDA(const LO i) {
1612  Bcol2Ccol(i) = i;
1613  });
1614  }
1615  else {
1616  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1617  // column Map of C, as well as C's Import object (from its domain
1618  // Map to its column Map). C's column Map is the union of the
1619  // column Maps of (the local part of) B, and the "remote" part of
1620  // B. Ditto for the Import. We have optimized this "setUnion"
1621  // operation on Import objects and Maps.
1622 
1623  // Choose the right variant of setUnion
1624  if (!Bimport.is_null() && !Iimport.is_null()) {
1625  Cimport = Bimport->setUnion(*Iimport);
1626  }
1627  else if (!Bimport.is_null() && Iimport.is_null()) {
1628  Cimport = Bimport->setUnion();
1629  }
1630  else if (Bimport.is_null() && !Iimport.is_null()) {
1631  Cimport = Iimport->setUnion();
1632  }
1633  else {
1634  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1635  }
1636  Ccolmap = Cimport->getTargetMap();
1637 
1638  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1639  // in general. We should get rid of it in order to reduce
1640  // communication costs of sparse matrix-matrix multiply.
1641  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1642  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1643 
1644  // NOTE: This is not efficient and should be folded into setUnion
1645  //
1646  // mfh 27 Sep 2016: What the above comment means, is that the
1647  // setUnion operation on Import objects could also compute these
1648  // local index - to - local index look-up tables.
1649  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1650  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1651  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1652  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1653  });
1654  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1655  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1656  });
1657 
1658  }
1659 
1660  // Replace the column map
1661  //
1662  // mfh 27 Sep 2016: We do this because C was originally created
1663  // without a column Map. Now we have its column Map.
1664  C.replaceColMap(Ccolmap);
1665 
1666  // mfh 27 Sep 2016: Construct tables that map from local column
1667  // indices of A, to local row indices of either B_local (the locally
1668  // owned part of B), or B_remote (the "imported" remote part of B).
1669  //
1670  // For column index Aik in row i of A, if the corresponding row of B
1671  // exists in the local part of B ("orig") (which I'll call B_local),
1672  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1673  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1674  //
1675  // For column index Aik in row i of A, if the corresponding row of B
1676  // exists in the remote part of B ("Import") (which I'll call
1677  // B_remote), then targetMapToImportRow[Aik] is the local index of
1678  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
1679  // (a flag value).
1680 
1681  // Run through all the hash table lookups once and for all
1682  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
1683  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
1684 
1685  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1686  GO aidx = Acolmap_local.getGlobalElement(i);
1687  LO B_LID = Browmap_local.getLocalElement(aidx);
1688  if (B_LID != LO_INVALID) {
1689  targetMapToOrigRow(i) = B_LID;
1690  targetMapToImportRow(i) = LO_INVALID;
1691  } else {
1692  LO I_LID = Irowmap_local.getLocalElement(aidx);
1693  targetMapToOrigRow(i) = LO_INVALID;
1694  targetMapToImportRow(i) = I_LID;
1695 
1696  }
1697  });
1698 
1699  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1700  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1701  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1702 
1703 }
1704 
1705 
1706 /*********************************************************************************************************/
1707 // AB NewMatrix Kernel wrappers (Default non-threaded version)
1708 template<class Scalar,
1709  class LocalOrdinal,
1710  class GlobalOrdinal,
1711  class Node,
1712  class LocalOrdinalViewType>
1713 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1714  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1715  const LocalOrdinalViewType & targetMapToOrigRow,
1716  const LocalOrdinalViewType & targetMapToImportRow,
1717  const LocalOrdinalViewType & Bcol2Ccol,
1718  const LocalOrdinalViewType & Icol2Ccol,
1719  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1720  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
1721  const std::string& label,
1722  const Teuchos::RCP<Teuchos::ParameterList>& params) {
1723 #ifdef HAVE_TPETRA_MMM_TIMINGS
1724  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1725  using Teuchos::TimeMonitor;
1726  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
1727 #endif
1728 
1729  using Teuchos::Array;
1730  using Teuchos::ArrayRCP;
1731  using Teuchos::ArrayView;
1732  using Teuchos::RCP;
1733  using Teuchos::rcp;
1734 
1735  // Lots and lots of typedefs
1737  typedef typename KCRS::StaticCrsGraphType graph_t;
1738  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
1739  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1740  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
1741  typedef typename KCRS::values_type::non_const_type scalar_view_t;
1742 
1743  typedef Scalar SC;
1744  typedef LocalOrdinal LO;
1745  typedef GlobalOrdinal GO;
1746  typedef Node NO;
1747  typedef Map<LO,GO,NO> map_type;
1748  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1749  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1750  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
1751 
1752  // Sizes
1753  RCP<const map_type> Ccolmap = C.getColMap();
1754  size_t m = Aview.origMatrix->getNodeNumRows();
1755  size_t n = Ccolmap->getNodeNumElements();
1756  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
1757 
1758  // Grab the Kokkos::SparseCrsMatrices & inner stuff
1759  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
1760  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
1761 
1762  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
1763  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
1764  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
1765 
1766  c_lno_view_t Irowptr;
1767  lno_nnz_view_t Icolind;
1768  scalar_view_t Ivals;
1769  if(!Bview.importMatrix.is_null()) {
1770  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
1771  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
1772  Ivals = Bview.importMatrix->getLocalMatrix().values;
1773  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
1774  }
1775 
1776 #ifdef HAVE_TPETRA_MMM_TIMINGS
1777  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
1778 #endif
1779 
1780  // Classic csr assembly (low memory edition)
1781  //
1782  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
1783  // The method loops over rows of A, and may resize after processing
1784  // each row. Chris Siefert says that this reflects experience in
1785  // ML; for the non-threaded case, ML found it faster to spend less
1786  // effort on estimation and risk an occasional reallocation.
1787  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
1788  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
1789  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
1790  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
1791 
1792  // mfh 27 Sep 2016: The c_status array is an implementation detail
1793  // of the local sparse matrix-matrix multiply routine.
1794 
1795  // The status array will contain the index into colind where this entry was last deposited.
1796  // c_status[i] < CSR_ip - not in the row yet
1797  // c_status[i] >= CSR_ip - this is the entry where you can find the data
1798  // We start with this filled with INVALID's indicating that there are no entries yet.
1799  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
1800  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
1801  std::vector<size_t> c_status(n, ST_INVALID);
1802 
1803  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
1804  // routine. The routine computes C := A * (B_local + B_remote).
1805  //
1806  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
1807  // you whether the corresponding row of B belongs to B_local
1808  // ("orig") or B_remote ("Import").
1809 
1810  // For each row of A/C
1811  size_t CSR_ip = 0, OLD_ip = 0;
1812  for (size_t i = 0; i < m; i++) {
1813  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
1814  // on the calling process.
1815  Crowptr[i] = CSR_ip;
1816 
1817  // mfh 27 Sep 2016: For each entry of A in the current row of A
1818  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
1819  LO Aik = Acolind[k]; // local column index of current entry of A
1820  const SC Aval = Avals[k]; // value of current entry of A
1821  if (Aval == SC_ZERO)
1822  continue; // skip explicitly stored zero values in A
1823 
1824  if (targetMapToOrigRow[Aik] != LO_INVALID) {
1825  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1826  // corresponding to the current entry of A is populated, then
1827  // the corresponding row of B is in B_local (i.e., it lives on
1828  // the calling process).
1829 
1830  // Local matrix
1831  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
1832 
1833  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
1834  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
1835  LO Bkj = Bcolind[j];
1836  LO Cij = Bcol2Ccol[Bkj];
1837 
1838  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
1839  // New entry
1840  c_status[Cij] = CSR_ip;
1841  Ccolind[CSR_ip] = Cij;
1842  Cvals[CSR_ip] = Aval*Bvals[j];
1843  CSR_ip++;
1844 
1845  } else {
1846  Cvals[c_status[Cij]] += Aval*Bvals[j];
1847  }
1848  }
1849 
1850  } else {
1851  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
1852  // corresponding to the current entry of A NOT populated (has
1853  // a flag "invalid" value), then the corresponding row of B is
1854  // in B_local (i.e., it lives on the calling process).
1855 
1856  // Remote matrix
1857  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
1858  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
1859  LO Ikj = Icolind[j];
1860  LO Cij = Icol2Ccol[Ikj];
1861 
1862  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
1863  // New entry
1864  c_status[Cij] = CSR_ip;
1865  Ccolind[CSR_ip] = Cij;
1866  Cvals[CSR_ip] = Aval*Ivals[j];
1867  CSR_ip++;
1868  } else {
1869  Cvals[c_status[Cij]] += Aval*Ivals[j];
1870  }
1871  }
1872  }
1873  }
1874 
1875  // Resize for next pass if needed
1876  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
1877  CSR_alloc *= 2;
1878  Kokkos::resize(Ccolind,CSR_alloc);
1879  Kokkos::resize(Cvals,CSR_alloc);
1880  }
1881  OLD_ip = CSR_ip;
1882  }
1883 
1884  Crowptr[m] = CSR_ip;
1885 
1886  // Downward resize
1887  Kokkos::resize(Ccolind,CSR_ip);
1888  Kokkos::resize(Cvals,CSR_ip);
1889 
1890 #ifdef HAVE_TPETRA_MMM_TIMINGS
1891  {
1892  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
1893 #endif
1894 
1895  // Final sort & set of CRS arrays
1896  if (params.is_null() || params->get("sort entries",true))
1897  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
1898  C.setAllValues(Crowptr,Ccolind, Cvals);
1899 
1900 
1901 #ifdef HAVE_TPETRA_MMM_TIMINGS
1902  }
1903  auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
1904  {
1905 #endif
1906 
1907  // Final FillComplete
1908  //
1909  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
1910  // Import (from domain Map to column Map) construction (which costs
1911  // lots of communication) by taking the previously constructed
1912  // Import object. We should be able to do this without interfering
1913  // with the implementation of the local part of sparse matrix-matrix
1914  // multply above.
1915  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
1916  labelList->set("Timer Label",label);
1917  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
1918  RCP<const Export<LO,GO,NO> > dummyExport;
1919  C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
1920 #ifdef HAVE_TPETRA_MMM_TIMINGS
1921  }
1922  MM2 = Teuchos::null;
1923 #endif
1924 
1925 }
1926 
1927 
1928 
1929 
1930 /*********************************************************************************************************/
1931 // Kernel method for computing the local portion of C = A*B (reuse)
1932 template<class Scalar,
1933  class LocalOrdinal,
1934  class GlobalOrdinal,
1935  class Node>
1936 void mult_A_B_reuse(
1937  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1938  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1939  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1940  const std::string& label,
1941  const Teuchos::RCP<Teuchos::ParameterList>& params)
1942 {
1943  using Teuchos::Array;
1944  using Teuchos::ArrayRCP;
1945  using Teuchos::ArrayView;
1946  using Teuchos::RCP;
1947  using Teuchos::rcp;
1948 
1949  // Tpetra typedefs
1950  typedef LocalOrdinal LO;
1951  typedef GlobalOrdinal GO;
1952  typedef Node NO;
1953  typedef Import<LO,GO,NO> import_type;
1954  typedef Map<LO,GO,NO> map_type;
1955 
1956  // Kokkos typedefs
1957  typedef typename map_type::local_map_type local_map_type;
1959  typedef typename KCRS::StaticCrsGraphType graph_t;
1960  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1961  typedef typename NO::execution_space execution_space;
1962  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1963  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1964 
1965 #ifdef HAVE_TPETRA_MMM_TIMINGS
1966  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1967  using Teuchos::TimeMonitor;
1968  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
1969 #endif
1970  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1971 
1972  // Grab all the maps
1973  RCP<const import_type> Cimport = C.getGraph()->getImporter();
1974  RCP<const map_type> Ccolmap = C.getColMap();
1975  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1976  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1977  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1978  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1979  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1980  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1981 
1982  // Build the final importer / column map, hash table lookups for C
1983  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1984  {
1985  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
1986  // So, column map of C may be a strict subset of the column map of B
1987  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1988  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1989  });
1990 
1991  if (!Bview.importMatrix.is_null()) {
1992  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1993  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1994 
1995  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
1996  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
1997  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1998  });
1999  }
2000  }
2001 
2002  // Run through all the hash table lookups once and for all
2003  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2004  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2005  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2006  GO aidx = Acolmap_local.getGlobalElement(i);
2007  LO B_LID = Browmap_local.getLocalElement(aidx);
2008  if (B_LID != LO_INVALID) {
2009  targetMapToOrigRow(i) = B_LID;
2010  targetMapToImportRow(i) = LO_INVALID;
2011  } else {
2012  LO I_LID = Irowmap_local.getLocalElement(aidx);
2013  targetMapToOrigRow(i) = LO_INVALID;
2014  targetMapToImportRow(i) = I_LID;
2015 
2016  }
2017  });
2018 
2019  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2020  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2021  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2022 }
2023 
2024 /*********************************************************************************************************/
2025 template<class Scalar,
2026  class LocalOrdinal,
2027  class GlobalOrdinal,
2028  class Node,
2029  class LocalOrdinalViewType>
2030 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2031  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2032  const LocalOrdinalViewType & targetMapToOrigRow,
2033  const LocalOrdinalViewType & targetMapToImportRow,
2034  const LocalOrdinalViewType & Bcol2Ccol,
2035  const LocalOrdinalViewType & Icol2Ccol,
2036  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2037  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2038  const std::string& label,
2039  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2040 #ifdef HAVE_TPETRA_MMM_TIMINGS
2041  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2042  using Teuchos::TimeMonitor;
2043  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2044  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2045 #else
2046  (void)label;
2047 #endif
2048  using Teuchos::RCP;
2049  using Teuchos::rcp;
2050 
2051 
2052  // Lots and lots of typedefs
2054  typedef typename KCRS::StaticCrsGraphType graph_t;
2055  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2056  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2057  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2058 
2059  typedef Scalar SC;
2060  typedef LocalOrdinal LO;
2061  typedef GlobalOrdinal GO;
2062  typedef Node NO;
2063  typedef Map<LO,GO,NO> map_type;
2064  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2065  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2066  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2067 
2068  // Sizes
2069  RCP<const map_type> Ccolmap = C.getColMap();
2070  size_t m = Aview.origMatrix->getNodeNumRows();
2071  size_t n = Ccolmap->getNodeNumElements();
2072 
2073  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2074  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2075  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2076  const KCRS & Cmat = C.getLocalMatrix();
2077 
2078  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2079  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2080  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2081  scalar_view_t Cvals = Cmat.values;
2082 
2083  c_lno_view_t Irowptr;
2084  lno_nnz_view_t Icolind;
2085  scalar_view_t Ivals;
2086  if(!Bview.importMatrix.is_null()) {
2087  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2088  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2089  Ivals = Bview.importMatrix->getLocalMatrix().values;
2090  }
2091 
2092 #ifdef HAVE_TPETRA_MMM_TIMINGS
2093  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2094 #endif
2095 
2096  // Classic csr assembly (low memory edition)
2097  // mfh 27 Sep 2016: The c_status array is an implementation detail
2098  // of the local sparse matrix-matrix multiply routine.
2099 
2100  // The status array will contain the index into colind where this entry was last deposited.
2101  // c_status[i] < CSR_ip - not in the row yet
2102  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2103  // We start with this filled with INVALID's indicating that there are no entries yet.
2104  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2105  std::vector<size_t> c_status(n, ST_INVALID);
2106 
2107  // For each row of A/C
2108  size_t CSR_ip = 0, OLD_ip = 0;
2109  for (size_t i = 0; i < m; i++) {
2110  // First fill the c_status array w/ locations where we're allowed to
2111  // generate nonzeros for this row
2112  OLD_ip = Crowptr[i];
2113  CSR_ip = Crowptr[i+1];
2114  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2115  c_status[Ccolind[k]] = k;
2116 
2117  // Reset values in the row of C
2118  Cvals[k] = SC_ZERO;
2119  }
2120 
2121  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2122  LO Aik = Acolind[k];
2123  const SC Aval = Avals[k];
2124  if (Aval == SC_ZERO)
2125  continue;
2126 
2127  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2128  // Local matrix
2129  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2130 
2131  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2132  LO Bkj = Bcolind[j];
2133  LO Cij = Bcol2Ccol[Bkj];
2134 
2135  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2136  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2137  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2138 
2139  Cvals[c_status[Cij]] += Aval * Bvals[j];
2140  }
2141 
2142  } else {
2143  // Remote matrix
2144  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2145  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2146  LO Ikj = Icolind[j];
2147  LO Cij = Icol2Ccol[Ikj];
2148 
2149  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2150  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2151  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2152 
2153  Cvals[c_status[Cij]] += Aval * Ivals[j];
2154  }
2155  }
2156  }
2157  }
2158 
2159 #ifdef HAVE_TPETRA_MMM_TIMINGS
2160  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2161 #endif
2162 
2163  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2164 }
2165 
2166 
2167 /*********************************************************************************************************/
2168 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2169 template<class Scalar,
2170  class LocalOrdinal,
2171  class GlobalOrdinal,
2172  class Node>
2173 void jacobi_A_B_newmatrix(
2174  Scalar omega,
2175  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2176  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2177  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2178  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2179  const std::string& label,
2180  const Teuchos::RCP<Teuchos::ParameterList>& params)
2181 {
2182  using Teuchos::Array;
2183  using Teuchos::ArrayRCP;
2184  using Teuchos::ArrayView;
2185  using Teuchos::RCP;
2186  using Teuchos::rcp;
2187  // typedef Scalar SC;
2188  typedef LocalOrdinal LO;
2189  typedef GlobalOrdinal GO;
2190  typedef Node NO;
2191 
2192  typedef Import<LO,GO,NO> import_type;
2193  typedef Map<LO,GO,NO> map_type;
2194  typedef typename map_type::local_map_type local_map_type;
2195 
2196  // All of the Kokkos typedefs
2198  typedef typename KCRS::StaticCrsGraphType graph_t;
2199  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2200  typedef typename NO::execution_space execution_space;
2201  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2202  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2203 
2204 
2205 #ifdef HAVE_TPETRA_MMM_TIMINGS
2206  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2207  using Teuchos::TimeMonitor;
2208  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2209 #endif
2210  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2211 
2212  // Build the final importer / column map, hash table lookups for C
2213  RCP<const import_type> Cimport;
2214  RCP<const map_type> Ccolmap;
2215  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2216  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2217  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2218  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2219  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2220  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2221  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2222  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2223 
2224  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2225  // indices of B, to local column indices of C. (B and C have the
2226  // same number of columns.) The kernel uses this, instead of
2227  // copying the entire input matrix B and converting its column
2228  // indices to those of C.
2229  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2230 
2231  if (Bview.importMatrix.is_null()) {
2232  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2233  Cimport = Bimport;
2234  Ccolmap = Bview.colMap;
2235  // Bcol2Ccol is trivial
2236  // Bcol2Ccol is trivial
2237 
2238  Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getNodeNumElements ()));
2239  Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2240  Bcol2Ccol(i) = static_cast<LO> (i);
2241  });
2242  } else {
2243  // mfh 27 Sep 2016: B has "remotes," so we need to build the
2244  // column Map of C, as well as C's Import object (from its domain
2245  // Map to its column Map). C's column Map is the union of the
2246  // column Maps of (the local part of) B, and the "remote" part of
2247  // B. Ditto for the Import. We have optimized this "setUnion"
2248  // operation on Import objects and Maps.
2249 
2250  // Choose the right variant of setUnion
2251  if (!Bimport.is_null() && !Iimport.is_null()){
2252  Cimport = Bimport->setUnion(*Iimport);
2253  Ccolmap = Cimport->getTargetMap();
2254 
2255  } else if (!Bimport.is_null() && Iimport.is_null()) {
2256  Cimport = Bimport->setUnion();
2257 
2258  } else if(Bimport.is_null() && !Iimport.is_null()) {
2259  Cimport = Iimport->setUnion();
2260 
2261  } else
2262  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2263 
2264  Ccolmap = Cimport->getTargetMap();
2265 
2266  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2267  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2268 
2269  // NOTE: This is not efficient and should be folded into setUnion
2270  //
2271  // mfh 27 Sep 2016: What the above comment means, is that the
2272  // setUnion operation on Import objects could also compute these
2273  // local index - to - local index look-up tables.
2274  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2275  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2276  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2277  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2278  });
2279  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2280  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2281  });
2282 
2283  }
2284 
2285  // Replace the column map
2286  //
2287  // mfh 27 Sep 2016: We do this because C was originally created
2288  // without a column Map. Now we have its column Map.
2289  C.replaceColMap(Ccolmap);
2290 
2291  // mfh 27 Sep 2016: Construct tables that map from local column
2292  // indices of A, to local row indices of either B_local (the locally
2293  // owned part of B), or B_remote (the "imported" remote part of B).
2294  //
2295  // For column index Aik in row i of A, if the corresponding row of B
2296  // exists in the local part of B ("orig") (which I'll call B_local),
2297  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2298  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2299  //
2300  // For column index Aik in row i of A, if the corresponding row of B
2301  // exists in the remote part of B ("Import") (which I'll call
2302  // B_remote), then targetMapToImportRow[Aik] is the local index of
2303  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2304  // (a flag value).
2305 
2306  // Run through all the hash table lookups once and for all
2307  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2308  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2309  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2310  GO aidx = Acolmap_local.getGlobalElement(i);
2311  LO B_LID = Browmap_local.getLocalElement(aidx);
2312  if (B_LID != LO_INVALID) {
2313  targetMapToOrigRow(i) = B_LID;
2314  targetMapToImportRow(i) = LO_INVALID;
2315  } else {
2316  LO I_LID = Irowmap_local.getLocalElement(aidx);
2317  targetMapToOrigRow(i) = LO_INVALID;
2318  targetMapToImportRow(i) = I_LID;
2319 
2320  }
2321  });
2322 
2323  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2324  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2325  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2326 
2327 }
2328 
2329 
2330 /*********************************************************************************************************/
2331 // Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2332 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2333 
2334 template<class Scalar,
2335  class LocalOrdinal,
2336  class GlobalOrdinal,
2337  class Node,
2338  class LocalOrdinalViewType>
2339 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2340  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2341  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2342  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2343  const LocalOrdinalViewType & targetMapToOrigRow,
2344  const LocalOrdinalViewType & targetMapToImportRow,
2345  const LocalOrdinalViewType & Bcol2Ccol,
2346  const LocalOrdinalViewType & Icol2Ccol,
2347  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2348  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2349  const std::string& label,
2350  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2351 
2352 #ifdef HAVE_TPETRA_MMM_TIMINGS
2353  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2354  using Teuchos::TimeMonitor;
2355  auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
2356 #endif
2357 
2358  using Teuchos::Array;
2359  using Teuchos::ArrayRCP;
2360  using Teuchos::ArrayView;
2361  using Teuchos::RCP;
2362  using Teuchos::rcp;
2363 
2364  // Lots and lots of typedefs
2366  typedef typename KCRS::StaticCrsGraphType graph_t;
2367  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2368  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2369  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2370  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2371 
2372  // Jacobi-specific
2373  typedef typename scalar_view_t::memory_space scalar_memory_space;
2374 
2375  typedef Scalar SC;
2376  typedef LocalOrdinal LO;
2377  typedef GlobalOrdinal GO;
2378  typedef Node NO;
2379 
2380  typedef Map<LO,GO,NO> map_type;
2381  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2382  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2383 
2384  // Sizes
2385  RCP<const map_type> Ccolmap = C.getColMap();
2386  size_t m = Aview.origMatrix->getNodeNumRows();
2387  size_t n = Ccolmap->getNodeNumElements();
2388  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2389 
2390  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2391  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2392  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2393 
2394  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2395  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2396  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2397 
2398  c_lno_view_t Irowptr;
2399  lno_nnz_view_t Icolind;
2400  scalar_view_t Ivals;
2401  if(!Bview.importMatrix.is_null()) {
2402  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2403  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2404  Ivals = Bview.importMatrix->getLocalMatrix().values;
2405  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2406  }
2407 
2408  // Jacobi-specific inner stuff
2409  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
2410 
2411  // Teuchos::ArrayView::operator[].
2412  // The status array will contain the index into colind where this entry was last deposited.
2413  // c_status[i] < CSR_ip - not in the row yet.
2414  // c_status[i] >= CSR_ip, this is the entry where you can find the data
2415  // We start with this filled with INVALID's indicating that there are no entries yet.
2416  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2417  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2418  Array<size_t> c_status(n, ST_INVALID);
2419 
2420  // Classic csr assembly (low memory edition)
2421  //
2422  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2423  // The method loops over rows of A, and may resize after processing
2424  // each row. Chris Siefert says that this reflects experience in
2425  // ML; for the non-threaded case, ML found it faster to spend less
2426  // effort on estimation and risk an occasional reallocation.
2427  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2428  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2429  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2430  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2431  size_t CSR_ip = 0, OLD_ip = 0;
2432 
2433  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2434 
2435  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2436  // routine. The routine computes
2437  //
2438  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2439  //
2440  // This corresponds to one sweep of (weighted) Jacobi.
2441  //
2442  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2443  // you whether the corresponding row of B belongs to B_local
2444  // ("orig") or B_remote ("Import").
2445 
2446  // For each row of A/C
2447  for (size_t i = 0; i < m; i++) {
2448  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2449  // on the calling process.
2450  Crowptr[i] = CSR_ip;
2451  SC minusOmegaDval = -omega*Dvals(i,0);
2452 
2453  // Entries of B
2454  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2455  Scalar Bval = Bvals[j];
2456  if (Bval == SC_ZERO)
2457  continue;
2458  LO Bij = Bcolind[j];
2459  LO Cij = Bcol2Ccol[Bij];
2460 
2461  // Assume no repeated entries in B
2462  c_status[Cij] = CSR_ip;
2463  Ccolind[CSR_ip] = Cij;
2464  Cvals[CSR_ip] = Bvals[j];
2465  CSR_ip++;
2466  }
2467 
2468  // Entries of -omega * Dinv * A * B
2469  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2470  LO Aik = Acolind[k];
2471  const SC Aval = Avals[k];
2472  if (Aval == SC_ZERO)
2473  continue;
2474 
2475  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2476  // Local matrix
2477  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2478 
2479  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2480  LO Bkj = Bcolind[j];
2481  LO Cij = Bcol2Ccol[Bkj];
2482 
2483  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2484  // New entry
2485  c_status[Cij] = CSR_ip;
2486  Ccolind[CSR_ip] = Cij;
2487  Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2488  CSR_ip++;
2489 
2490  } else {
2491  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2492  }
2493  }
2494 
2495  } else {
2496  // Remote matrix
2497  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2498  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2499  LO Ikj = Icolind[j];
2500  LO Cij = Icol2Ccol[Ikj];
2501 
2502  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2503  // New entry
2504  c_status[Cij] = CSR_ip;
2505  Ccolind[CSR_ip] = Cij;
2506  Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2507  CSR_ip++;
2508  } else {
2509  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2510  }
2511  }
2512  }
2513  }
2514 
2515  // Resize for next pass if needed
2516  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2517  CSR_alloc *= 2;
2518  Kokkos::resize(Ccolind,CSR_alloc);
2519  Kokkos::resize(Cvals,CSR_alloc);
2520  }
2521  OLD_ip = CSR_ip;
2522  }
2523  Crowptr[m] = CSR_ip;
2524 
2525  // Downward resize
2526  Kokkos::resize(Ccolind,CSR_ip);
2527  Kokkos::resize(Cvals,CSR_ip);
2528 
2529  {
2530 #ifdef HAVE_TPETRA_MMM_TIMINGS
2531  auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2532 #endif
2533 
2534  // Replace the column map
2535  //
2536  // mfh 27 Sep 2016: We do this because C was originally created
2537  // without a column Map. Now we have its column Map.
2538  C.replaceColMap(Ccolmap);
2539 
2540  // Final sort & set of CRS arrays
2541  //
2542  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2543  // matrix-matrix multiply routine sort the entries for us?
2544  // Final sort & set of CRS arrays
2545  if (params.is_null() || params->get("sort entries",true))
2546  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2547  C.setAllValues(Crowptr,Ccolind, Cvals);
2548  }
2549  {
2550 #ifdef HAVE_TPETRA_MMM_TIMINGS
2551  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2552 #endif
2553 
2554  // Final FillComplete
2555  //
2556  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2557  // Import (from domain Map to column Map) construction (which costs
2558  // lots of communication) by taking the previously constructed
2559  // Import object. We should be able to do this without interfering
2560  // with the implementation of the local part of sparse matrix-matrix
2561  // multply above
2562  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2563  labelList->set("Timer Label",label);
2564  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2565  RCP<const Export<LO,GO,NO> > dummyExport;
2566  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2567 
2568  }
2569 }
2570 
2571 
2572 /*********************************************************************************************************/
2573 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2574 template<class Scalar,
2575  class LocalOrdinal,
2576  class GlobalOrdinal,
2577  class Node>
2578 void jacobi_A_B_reuse(
2579  Scalar omega,
2580  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2581  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2582  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2583  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2584  const std::string& label,
2585  const Teuchos::RCP<Teuchos::ParameterList>& params)
2586 {
2587  using Teuchos::Array;
2588  using Teuchos::ArrayRCP;
2589  using Teuchos::ArrayView;
2590  using Teuchos::RCP;
2591  using Teuchos::rcp;
2592 
2593  typedef LocalOrdinal LO;
2594  typedef GlobalOrdinal GO;
2595  typedef Node NO;
2596 
2597  typedef Import<LO,GO,NO> import_type;
2598  typedef Map<LO,GO,NO> map_type;
2599 
2600  // Kokkos typedefs
2601  typedef typename map_type::local_map_type local_map_type;
2603  typedef typename KCRS::StaticCrsGraphType graph_t;
2604  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2605  typedef typename NO::execution_space execution_space;
2606  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2607  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2608 
2609 #ifdef HAVE_TPETRA_MMM_TIMINGS
2610  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2611  using Teuchos::TimeMonitor;
2612  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2613 #endif
2614  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2615 
2616  // Grab all the maps
2617  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2618  RCP<const map_type> Ccolmap = C.getColMap();
2619  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2620  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2621  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2622  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2623  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2624  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2625 
2626  // Build the final importer / column map, hash table lookups for C
2627  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2628  {
2629  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2630  // So, column map of C may be a strict subset of the column map of B
2631  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2632  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2633  });
2634 
2635  if (!Bview.importMatrix.is_null()) {
2636  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2637  std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2638 
2639  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2640  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2641  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2642  });
2643  }
2644  }
2645 
2646  // Run through all the hash table lookups once and for all
2647  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2648  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2649  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2650  GO aidx = Acolmap_local.getGlobalElement(i);
2651  LO B_LID = Browmap_local.getLocalElement(aidx);
2652  if (B_LID != LO_INVALID) {
2653  targetMapToOrigRow(i) = B_LID;
2654  targetMapToImportRow(i) = LO_INVALID;
2655  } else {
2656  LO I_LID = Irowmap_local.getLocalElement(aidx);
2657  targetMapToOrigRow(i) = LO_INVALID;
2658  targetMapToImportRow(i) = I_LID;
2659 
2660  }
2661  });
2662 
2663 #ifdef HAVE_TPETRA_MMM_TIMINGS
2664  MM = Teuchos::null;
2665 #endif
2666 
2667  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2668  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2669  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2670 }
2671 
2672 
2673 
2674 /*********************************************************************************************************/
2675 template<class Scalar,
2676  class LocalOrdinal,
2677  class GlobalOrdinal,
2678  class Node,
2679  class LocalOrdinalViewType>
2680 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
2681  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2682  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2683  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2684  const LocalOrdinalViewType & targetMapToOrigRow,
2685  const LocalOrdinalViewType & targetMapToImportRow,
2686  const LocalOrdinalViewType & Bcol2Ccol,
2687  const LocalOrdinalViewType & Icol2Ccol,
2688  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2689  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2690  const std::string& label,
2691  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2692 #ifdef HAVE_TPETRA_MMM_TIMINGS
2693  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2694  using Teuchos::TimeMonitor;
2695  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
2696  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2697 #else
2698  (void)label;
2699 #endif
2700  using Teuchos::RCP;
2701  using Teuchos::rcp;
2702 
2703  // Lots and lots of typedefs
2705  typedef typename KCRS::StaticCrsGraphType graph_t;
2706  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2707  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2708  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2709  typedef typename scalar_view_t::memory_space scalar_memory_space;
2710 
2711  typedef Scalar SC;
2712  typedef LocalOrdinal LO;
2713  typedef GlobalOrdinal GO;
2714  typedef Node NO;
2715  typedef Map<LO,GO,NO> map_type;
2716  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2717  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2718  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2719 
2720  // Sizes
2721  RCP<const map_type> Ccolmap = C.getColMap();
2722  size_t m = Aview.origMatrix->getNodeNumRows();
2723  size_t n = Ccolmap->getNodeNumElements();
2724 
2725  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2726  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2727  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2728  const KCRS & Cmat = C.getLocalMatrix();
2729 
2730  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2731  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2732  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2733  scalar_view_t Cvals = Cmat.values;
2734 
2735  c_lno_view_t Irowptr;
2736  lno_nnz_view_t Icolind;
2737  scalar_view_t Ivals;
2738  if(!Bview.importMatrix.is_null()) {
2739  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2740  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2741  Ivals = Bview.importMatrix->getLocalMatrix().values;
2742  }
2743 
2744  // Jacobi-specific inner stuff
2745  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
2746 
2747 #ifdef HAVE_TPETRA_MMM_TIMINGS
2748  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
2749 #endif
2750 
2751  // The status array will contain the index into colind where this entry was last deposited.
2752  // c_status[i] < CSR_ip - not in the row yet
2753  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2754  // We start with this filled with INVALID's indicating that there are no entries yet.
2755  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2756  std::vector<size_t> c_status(n, ST_INVALID);
2757 
2758  // For each row of A/C
2759  size_t CSR_ip = 0, OLD_ip = 0;
2760  for (size_t i = 0; i < m; i++) {
2761 
2762  // First fill the c_status array w/ locations where we're allowed to
2763  // generate nonzeros for this row
2764  OLD_ip = Crowptr[i];
2765  CSR_ip = Crowptr[i+1];
2766  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2767  c_status[Ccolind[k]] = k;
2768 
2769  // Reset values in the row of C
2770  Cvals[k] = SC_ZERO;
2771  }
2772 
2773  SC minusOmegaDval = -omega*Dvals(i,0);
2774 
2775  // Entries of B
2776  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2777  Scalar Bval = Bvals[j];
2778  if (Bval == SC_ZERO)
2779  continue;
2780  LO Bij = Bcolind[j];
2781  LO Cij = Bcol2Ccol[Bij];
2782 
2783  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2784  std::runtime_error, "Trying to insert a new entry into a static graph");
2785 
2786  Cvals[c_status[Cij]] = Bvals[j];
2787  }
2788 
2789  // Entries of -omega * Dinv * A * B
2790  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2791  LO Aik = Acolind[k];
2792  const SC Aval = Avals[k];
2793  if (Aval == SC_ZERO)
2794  continue;
2795 
2796  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2797  // Local matrix
2798  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2799 
2800  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2801  LO Bkj = Bcolind[j];
2802  LO Cij = Bcol2Ccol[Bkj];
2803 
2804  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2805  std::runtime_error, "Trying to insert a new entry into a static graph");
2806 
2807  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
2808  }
2809 
2810  } else {
2811  // Remote matrix
2812  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2813  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2814  LO Ikj = Icolind[j];
2815  LO Cij = Icol2Ccol[Ikj];
2816 
2817  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2818  std::runtime_error, "Trying to insert a new entry into a static graph");
2819 
2820  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
2821  }
2822  }
2823  }
2824  }
2825 
2826 #ifdef HAVE_TPETRA_MMM_TIMINGS
2827  MM2 = Teuchos::null;
2828  MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
2829 #endif
2830 
2831  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2832 #ifdef HAVE_TPETRA_MMM_TIMINGS
2833  MM2 = Teuchos::null;
2834  MM = Teuchos::null;
2835 #endif
2836 
2837 }
2838 
2839 
2840 
2841 /*********************************************************************************************************/
2842 template<class Scalar,
2843  class LocalOrdinal,
2844  class GlobalOrdinal,
2845  class Node>
2846 void import_and_extract_views(
2847  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
2848  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
2849  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2850  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
2851  bool userAssertsThereAreNoRemotes,
2852  const std::string& label,
2853  const Teuchos::RCP<Teuchos::ParameterList>& params)
2854 {
2855  using Teuchos::Array;
2856  using Teuchos::ArrayView;
2857  using Teuchos::RCP;
2858  using Teuchos::rcp;
2859  using Teuchos::null;
2860 
2861  typedef Scalar SC;
2862  typedef LocalOrdinal LO;
2863  typedef GlobalOrdinal GO;
2864  typedef Node NO;
2865 
2866  typedef Map<LO,GO,NO> map_type;
2867  typedef Import<LO,GO,NO> import_type;
2868  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
2869 
2870 #ifdef HAVE_TPETRA_MMM_TIMINGS
2871  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2872  using Teuchos::TimeMonitor;
2873  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
2874 #endif
2875  // The goal of this method is to populate the 'Aview' struct with views of the
2876  // rows of A, including all rows that correspond to elements in 'targetMap'.
2877  //
2878  // If targetMap includes local elements that correspond to remotely-owned rows
2879  // of A, then those remotely-owned rows will be imported into
2880  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
2881  Aview.deleteContents();
2882 
2883  Aview.origMatrix = rcp(&A, false);
2884  Aview.origRowMap = A.getRowMap();
2885  Aview.rowMap = targetMap;
2886  Aview.colMap = A.getColMap();
2887  Aview.domainMap = A.getDomainMap();
2888  Aview.importColMap = null;
2889  RCP<const map_type> rowMap = A.getRowMap();
2890  const int numProcs = rowMap->getComm()->getSize();
2891 
2892  // Short circuit if the user swears there are no remotes (or if we're in serial)
2893  if (userAssertsThereAreNoRemotes || numProcs < 2)
2894  return;
2895 
2896  RCP<const import_type> importer;
2897  if (params != null && params->isParameter("importer")) {
2898  importer = params->get<RCP<const import_type> >("importer");
2899 
2900  } else {
2901 #ifdef HAVE_TPETRA_MMM_TIMINGS
2902  MM = Teuchos::null;
2903  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
2904 #endif
2905 
2906  // Mark each row in targetMap as local or remote, and go ahead and get a view
2907  // for the local rows
2908  RCP<const map_type> remoteRowMap;
2909  size_t numRemote = 0;
2910  int mode = 0;
2911  if (!prototypeImporter.is_null() &&
2912  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
2913  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
2914  // We have a valid prototype importer --- ask it for the remotes
2915 #ifdef HAVE_TPETRA_MMM_TIMINGS
2916  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode1"));
2917 #endif
2918  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
2919  numRemote = prototypeImporter->getNumRemoteIDs();
2920 
2921  Array<GO> remoteRows(numRemote);
2922  for (size_t i = 0; i < numRemote; i++)
2923  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
2924 
2925  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2926  rowMap->getIndexBase(), rowMap->getComm()));
2927  mode = 1;
2928 
2929  } else if (prototypeImporter.is_null()) {
2930  // No prototype importer --- count the remotes the hard way
2931 #ifdef HAVE_TPETRA_MMM_TIMINGS
2932  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode2"));
2933 #endif
2934  ArrayView<const GO> rows = targetMap->getNodeElementList();
2935  size_t numRows = targetMap->getNodeNumElements();
2936 
2937  Array<GO> remoteRows(numRows);
2938  for(size_t i = 0; i < numRows; ++i) {
2939  const LO mlid = rowMap->getLocalElement(rows[i]);
2940 
2941  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
2942  remoteRows[numRemote++] = rows[i];
2943  }
2944  remoteRows.resize(numRemote);
2945  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
2946  rowMap->getIndexBase(), rowMap->getComm()));
2947  mode = 2;
2948 
2949  } else {
2950  // PrototypeImporter is bad. But if we're in serial that's OK.
2951  mode = 3;
2952  }
2953 
2954  if (numProcs < 2) {
2955  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
2956  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
2957  // If only one processor we don't need to import any remote rows, so return.
2958  return;
2959  }
2960 
2961  //
2962  // Now we will import the needed remote rows of A, if the global maximum
2963  // value of numRemote is greater than 0.
2964  //
2965 #ifdef HAVE_TPETRA_MMM_TIMINGS
2966  MM = Teuchos::null;
2967  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
2968 #endif
2969 
2970  global_size_t globalMaxNumRemote = 0;
2971  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
2972 
2973  if (globalMaxNumRemote > 0) {
2974 #ifdef HAVE_TPETRA_MMM_TIMINGS
2975  MM = Teuchos::null;
2976  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
2977 #endif
2978  // Create an importer with target-map remoteRowMap and source-map rowMap.
2979  if (mode == 1)
2980  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
2981  else if (mode == 2)
2982  importer = rcp(new import_type(rowMap, remoteRowMap));
2983  else
2984  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
2985  }
2986 
2987  if (params != null)
2988  params->set("importer", importer);
2989  }
2990 
2991  if (importer != null) {
2992 #ifdef HAVE_TPETRA_MMM_TIMINGS
2993  MM = Teuchos::null;
2994  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
2995 #endif
2996 
2997  // Now create a new matrix into which we can import the remote rows of A that we need.
2998  Teuchos::ParameterList labelList;
2999  labelList.set("Timer Label", label);
3000  auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3001 
3002  bool isMM = true;
3003  bool overrideAllreduce = false;
3004  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3005  // Minor speedup tweak - avoid computing the global constants
3006  Teuchos::ParameterList params_sublist;
3007  if(!params.is_null()) {
3008  labelList.set("compute global constants", params->get("compute global constants",false));
3009  params_sublist = params->sublist("matrixmatrix: kernel params",false);
3010  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3011  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3012  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3013  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3014  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3015  }
3016  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3017  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3018  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3019 
3020  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3021  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3022 
3023 #if 0
3024  // Disabled code for dumping input matrices
3025  static int count=0;
3026  char str[80];
3027  sprintf(str,"import_matrix.%d.dat",count);
3029  count++;
3030 #endif
3031 
3032 #ifdef HAVE_TPETRA_MMM_STATISTICS
3033  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3034 #endif
3035 
3036 
3037 #ifdef HAVE_TPETRA_MMM_TIMINGS
3038  MM = Teuchos::null;
3039  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3040 #endif
3041 
3042  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3043  Aview.importColMap = Aview.importMatrix->getColMap();
3044 #ifdef HAVE_TPETRA_MMM_TIMINGS
3045  MM = Teuchos::null;
3046 #endif
3047  }
3048 }
3049 
3050 
3051 
3052 
3053 
3054 /*********************************************************************************************************/
3055  // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3056 template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3058 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3059  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3060  const LocalOrdinalViewType & Acol2Brow,
3061  const LocalOrdinalViewType & Acol2Irow,
3062  const LocalOrdinalViewType & Bcol2Ccol,
3063  const LocalOrdinalViewType & Icol2Ccol,
3064  const size_t mergedNodeNumCols) {
3065 
3066  using Teuchos::RCP;
3068  typedef typename KCRS::StaticCrsGraphType graph_t;
3069  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3070  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3071  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3072  // Grab the Kokkos::SparseCrsMatrices
3073  const KCRS & Ak = Aview.origMatrix->getLocalMatrix();
3074  const KCRS & Bk = Bview.origMatrix->getLocalMatrix();
3075 
3076  // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3077  if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3078  // We do have a Bimport
3079  // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3080  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3081  RCP<const KCRS> Ik_;
3082  if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrix());
3083  const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3084  KCRS Iks;
3085  if(Ik!=0) Iks = *Ik;
3086  size_t merge_numrows = Ak.numCols();
3087  // The last entry of this at least, need to be initialized
3088  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3089 
3090  const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3091 
3092  // Use a Kokkos::parallel_scan to build the rowptr
3093  typedef typename Node::execution_space execution_space;
3094  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3095  Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3096  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3097  if(final) Mrowptr(i) = update;
3098  // Get the row count
3099  size_t ct=0;
3100  if(Acol2Brow(i)!=LO_INVALID)
3101  ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3102  else
3103  ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3104  update+=ct;
3105 
3106  if(final && i+1==merge_numrows)
3107  Mrowptr(i+1)=update;
3108  });
3109 
3110  // Allocate nnz
3111  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3112  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3113  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);
3114 
3115  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3116  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3117  Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3118  if(Acol2Brow(i)!=LO_INVALID) {
3119  size_t row = Acol2Brow(i);
3120  size_t start = Bk.graph.row_map(row);
3121  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3122  Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3123  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3124  }
3125  }
3126  else {
3127  size_t row = Acol2Irow(i);
3128  size_t start = Iks.graph.row_map(row);
3129  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3130  Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3131  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3132  }
3133  }
3134  });
3135 
3136  KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3137  return newmat;
3138  }
3139  else {
3140  // We don't have a Bimport (the easy case)
3141  return Bk;
3142  }
3143 }//end merge_matrices
3144 
3145 template<typename SC, typename LO, typename GO, typename NO>
3146 void AddKernels<SC, LO, GO, NO>::
3147 addSorted(
3148  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3149  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3150  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3151  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3152  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3153  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3154  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3155  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3156  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3157  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3158  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3159 {
3160  using Teuchos::TimeMonitor;
3161  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3162  auto nrows = Arowptrs.extent(0) - 1;
3163  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3164  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, LO, impl_scalar_type,
3165  execution_space, memory_space, memory_space> KKH;
3166  KKH handle;
3167  handle.create_spadd_handle(true);
3168  auto addHandle = handle.get_spadd_handle();
3169 #ifdef HAVE_TPETRA_MMM_TIMINGS
3170  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3171 #endif
3172  KokkosSparse::Experimental::spadd_symbolic
3173  <KKH,
3174  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3175  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3176  row_ptrs_array, col_inds_array>
3177  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3178  //KokkosKernels requires values to be zeroed
3179  Cvals = values_array("C values", addHandle->get_max_result_nnz());
3180  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_max_result_nnz());
3181 #ifdef HAVE_TPETRA_MMM_TIMINGS
3182  MM = Teuchos::null;
3183  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
3184 #endif
3185  KokkosSparse::Experimental::spadd_numeric(&handle,
3186  Arowptrs, Acolinds, Avals, scalarA,
3187  Browptrs, Bcolinds, Bvals, scalarB,
3188  Crowptrs, Ccolinds, Cvals);
3189 }
3190 
3191 template<typename SC, typename LO, typename GO, typename NO>
3192 void AddKernels<SC, LO, GO, NO>::
3193 addUnsorted(
3194  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3195  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3196  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3197  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3198  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3199  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3200  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3201  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3202  GO /* numGlobalCols */,
3203  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3204  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3205  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3206 {
3207  using Teuchos::TimeMonitor;
3208  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3209  auto nrows = Arowptrs.extent(0) - 1;
3210  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3211  typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3212  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, LO, AddKern::impl_scalar_type,
3213  AddKern::execution_space, AddKern::memory_space, AddKern::memory_space> KKH;
3214  KKH handle;
3215  handle.create_spadd_handle(false);
3216  auto addHandle = handle.get_spadd_handle();
3217 #ifdef HAVE_TPETRA_MMM_TIMINGS
3218  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3219 #endif
3220  KokkosSparse::Experimental::spadd_symbolic
3221  <KKH,
3222  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3223  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
3224  row_ptrs_array, col_inds_array>
3225  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3226  //Cvals must be zeroed out
3227  Cvals = values_array("C values", addHandle->get_max_result_nnz());
3228  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_max_result_nnz());
3229 #ifdef HAVE_TPETRA_MMM_TIMINGS
3230  MM = Teuchos::null;
3231  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted kernel: sorted numeric")));
3232 #endif
3233  KokkosSparse::Experimental::spadd_numeric(&handle,
3234  Arowptrs, Acolinds, Avals, scalarA,
3235  Browptrs, Bcolinds, Bvals, scalarB,
3236  Crowptrs, Ccolinds, Cvals);
3237 }
3238 
3239 template<typename GO,
3240  typename LocalIndicesType,
3241  typename GlobalIndicesType,
3242  typename ColMapType>
3243 struct ConvertLocalToGlobalFunctor
3244 {
3245  ConvertLocalToGlobalFunctor(
3246  const LocalIndicesType& colindsOrig_,
3247  const GlobalIndicesType& colindsConverted_,
3248  const ColMapType& colmap_) :
3249  colindsOrig (colindsOrig_),
3250  colindsConverted (colindsConverted_),
3251  colmap (colmap_)
3252  {}
3253  KOKKOS_INLINE_FUNCTION void
3254  operator() (const GO i) const
3255  {
3256  colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3257  }
3258  LocalIndicesType colindsOrig;
3259  GlobalIndicesType colindsConverted;
3260  ColMapType colmap;
3261 };
3262 
3263 template<typename SC, typename LO, typename GO, typename NO>
3264 void MMdetails::AddKernels<SC, LO, GO, NO>::
3265 convertToGlobalAndAdd(
3266  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3267  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3268  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3269  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3270  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3271  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3272  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3273  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3274  typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3275 {
3276  using Teuchos::TimeMonitor;
3277 
3278  const values_array Avals = A.values;
3279  const values_array Bvals = B.values;
3280  const col_inds_array Acolinds = A.graph.entries;
3281  const col_inds_array Bcolinds = B.graph.entries;
3282  auto Arowptrs = A.graph.row_map;
3283  auto Browptrs = B.graph.row_map;
3284  global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
3285  global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
3286 #ifdef HAVE_TPETRA_MMM_TIMINGS
3287  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
3288 #endif
3289  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3290  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3291  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3292  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3293  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, GO, impl_scalar_type,
3294  execution_space, memory_space, memory_space> KKH;
3295  KKH handle;
3296  handle.create_spadd_handle(false);
3297  auto addHandle = handle.get_spadd_handle();
3298 #ifdef HAVE_TPETRA_MMM_TIMINGS
3299  MM = Teuchos::null;
3300  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3301 #endif
3302  auto nrows = Arowptrs.extent(0) - 1;
3303  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3304  KokkosSparse::Experimental::spadd_symbolic
3305  <KKH, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
3306  (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3307  Cvals = values_array("C values", addHandle->get_max_result_nnz());
3308  Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_max_result_nnz());
3309 #ifdef HAVE_TPETRA_MMM_TIMINGS
3310  MM = Teuchos::null;
3311  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3312 #endif
3313  KokkosSparse::Experimental::spadd_numeric(&handle,
3314  Arowptrs, AcolindsConverted, Avals, scalarA,
3315  Browptrs, BcolindsConverted, Bvals, scalarB,
3316  Crowptrs, Ccolinds, Cvals);
3317 }
3318 
3319 
3320 } //End namepsace MMdetails
3321 
3322 } //End namespace Tpetra
3323 
3324 /*********************************************************************************************************/
3325 //
3326 // Explicit instantiation macro
3327 //
3328 // Must be expanded from within the Tpetra namespace!
3329 //
3330 namespace Tpetra {
3331 
3332 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3333  template \
3334  void MatrixMatrix::Multiply( \
3335  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3336  bool transposeA, \
3337  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3338  bool transposeB, \
3339  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3340  bool call_FillComplete_on_result, \
3341  const std::string & label, \
3342  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3343 \
3344 template \
3345  void MatrixMatrix::Jacobi( \
3346  SCALAR omega, \
3347  const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3348  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3349  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3350  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3351  bool call_FillComplete_on_result, \
3352  const std::string & label, \
3353  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3354 \
3355  template \
3356  void MatrixMatrix::Add( \
3357  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3358  bool transposeA, \
3359  SCALAR scalarA, \
3360  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3361  bool transposeB, \
3362  SCALAR scalarB, \
3363  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
3364 \
3365  template \
3366  void MatrixMatrix::Add( \
3367  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3368  bool transposeA, \
3369  SCALAR scalarA, \
3370  CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3371  SCALAR scalarB ); \
3372 \
3373  template \
3374  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3375  MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3376  (const SCALAR & alpha, \
3377  const bool transposeA, \
3378  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3379  const SCALAR & beta, \
3380  const bool transposeB, \
3381  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3382  const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3383  const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3384  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3385 \
3386  template \
3387  void \
3388  MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3389  (const SCALAR & alpha, \
3390  const bool transposeA, \
3391  const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3392  const SCALAR& beta, \
3393  const bool transposeB, \
3394  const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3395  CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3396  const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3397  const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3398  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3399 \
3400  template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>;
3401 
3402 } //End namespace Tpetra
3403 
3404 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
MPI process count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced ...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix&#39;s column Map with the given Map.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void scale(const Scalar &alpha)
Scale the matrix&#39;s values: this := alpha*this.
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
bool isFillActive() const
Whether the matrix is not fill complete.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
Teuchos::RCP< crs_matrix_type > createTranspose(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute and return the transpose of the matrix given to the constructor.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
void setAllValues(const typename local_matrix_type::row_map_type &ptr, const typename local_graph_type::entries_type::non_const_type &ind, const typename local_matrix_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Matrix Market file readers and writers for Tpetra objects.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.