Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Macros Pages
TpetraExt_MatrixMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 #include "Tpetra_ConfigDefs.hpp"
45 #include "Teuchos_VerboseObject.hpp"
46 #include "Teuchos_Array.hpp"
47 #include "Tpetra_Util.hpp"
48 #include "Tpetra_CrsMatrix.hpp"
50 #include "Tpetra_RowMatrixTransposer.hpp"
52 #include "Tpetra_Details_radixSort.hpp"
54 #include "Tpetra_ConfigDefs.hpp"
55 #include "Tpetra_Map.hpp"
56 #include "Tpetra_Export.hpp"
57 #include "Tpetra_Import_Util.hpp"
58 #include "Tpetra_Import_Util2.hpp"
59 #include <algorithm>
60 #include <type_traits>
61 #include "Teuchos_FancyOStream.hpp"
62 
63 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
65 
66 #include "KokkosSparse_spgemm.hpp"
67 #include "KokkosSparse_spadd.hpp"
68 
69 #include <MatrixMarket_Tpetra.hpp>
70 
78 /*********************************************************************************************************/
79 // Include the architecture-specific kernel partial specializations here
80 // NOTE: This needs to be outside all namespaces
81 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
82 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
83 
84 
85 namespace Tpetra {
86 
87 namespace MatrixMatrix{
88 
89 //
90 // This method forms the matrix-matrix product C = op(A) * op(B), where
91 // op(A) == A if transposeA is false,
92 // op(A) == A^T if transposeA is true,
93 // and similarly for op(B).
94 //
95 template <class Scalar,
96  class LocalOrdinal,
97  class GlobalOrdinal,
98  class Node>
99 void Multiply(
101  bool transposeA,
103  bool transposeB,
105  bool call_FillComplete_on_result,
106  const std::string& label,
107  const Teuchos::RCP<Teuchos::ParameterList>& params)
108 {
109  using Teuchos::null;
110  using Teuchos::RCP;
111  using Teuchos::rcp;
112  typedef Scalar SC;
113  typedef LocalOrdinal LO;
114  typedef GlobalOrdinal GO;
115  typedef Node NO;
116  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
117  typedef Import<LO,GO,NO> import_type;
118  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
119  typedef Map<LO,GO,NO> map_type;
120  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
121 
122 #ifdef HAVE_TPETRA_MMM_TIMINGS
123  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
124  using Teuchos::TimeMonitor;
125  //MM is used to time setup, and then multiply.
126 
127  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
128 #endif
129 
130  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
131 
132  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
133 
134  // The input matrices A and B must both be fillComplete.
135  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
136  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
137 
138  // If transposeA is true, then Aprime will be the transpose of A
139  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
140  // will just be a pointer to A.
141  RCP<const crs_matrix_type> Aprime = null;
142  // If transposeB is true, then Bprime will be the transpose of B
143  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
144  // will just be a pointer to B.
145  RCP<const crs_matrix_type> Bprime = null;
146 
147  // Is this a "clean" matrix?
148  //
149  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
150  // locally nor globally indexed, then it was empty. I don't like
151  // this, because the most straightforward implementation presumes
152  // lazy allocation of indices. However, historical precedent
153  // demands that we keep around this predicate as a way to test
154  // whether the matrix is empty.
155  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
156 
157  bool use_optimized_ATB = false;
158  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
159  use_optimized_ATB = true;
160 
161 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
162  use_optimized_ATB = false;
163 #endif
164 
165  using Teuchos::ParameterList;
166  RCP<ParameterList> transposeParams (new ParameterList);
167  transposeParams->set ("sort", false);
168 
169  if (!use_optimized_ATB && transposeA) {
170  transposer_type transposer (rcpFromRef (A));
171  Aprime = transposer.createTranspose (transposeParams);
172  }
173  else {
174  Aprime = rcpFromRef(A);
175  }
176 
177  if (transposeB) {
178  transposer_type transposer (rcpFromRef (B));
179  Bprime = transposer.createTranspose (transposeParams);
180  }
181  else {
182  Bprime = rcpFromRef(B);
183  }
184 
185  // Check size compatibility
186  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
187  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
188  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
189  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
190  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
191  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
192  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
193  prefix << "ERROR, inner dimensions of op(A) and op(B) "
194  "must match for matrix-matrix product. op(A) is "
195  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
196 
197  // The result matrix C must at least have a row-map that reflects the correct
198  // row-size. Don't check the number of columns because rectangular matrices
199  // which were constructed with only one map can still end up having the
200  // correct capacity and dimensions when filled.
201  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
202  prefix << "ERROR, dimensions of result C must "
203  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
204  << " rows, should have at least " << Aouter << std::endl);
205 
206  // It doesn't matter whether C is already Filled or not. If it is already
207  // Filled, it must have space allocated for the positions that will be
208  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
209  // we'll error out later when trying to store result values.
210 
211  // CGB: However, matrix must be in active-fill
212  if (!C.isFillActive()) C.resumeFill();
213 
214  // We're going to need to import remotely-owned sections of A and/or B if
215  // more than one processor is performing this run, depending on the scenario.
216  int numProcs = A.getComm()->getSize();
217 
218  // Declare a couple of structs that will be used to hold views of the data
219  // of A and B, to be used for fast access during the matrix-multiplication.
220  crs_matrix_struct_type Aview;
221  crs_matrix_struct_type Bview;
222 
223  RCP<const map_type> targetMap_A = Aprime->getRowMap();
224  RCP<const map_type> targetMap_B = Bprime->getRowMap();
225 
226 #ifdef HAVE_TPETRA_MMM_TIMINGS
227  {
228  TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
229 #endif
230 
231  // Now import any needed remote rows and populate the Aview struct
232  // NOTE: We assert that an import isn't needed --- since we do the transpose
233  // above to handle that.
234  if (!use_optimized_ATB) {
235  RCP<const import_type> dummyImporter;
236  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
237  }
238 
239  // We will also need local access to all rows of B that correspond to the
240  // column-map of op(A).
241  if (numProcs > 1)
242  targetMap_B = Aprime->getColMap();
243 
244  // Import any needed remote rows and populate the Bview struct.
245  if (!use_optimized_ATB)
246  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label, params);
247 
248 #ifdef HAVE_TPETRA_MMM_TIMINGS
249  } //stop MM_importExtract here
250  //stop the setup timer, and start the multiply timer
251  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
252 #endif
253 
254  // Call the appropriate method to perform the actual multiplication.
255  if (use_optimized_ATB) {
256  MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
257 
258  } else if (call_FillComplete_on_result && newFlag) {
259  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
260 
261  } else if (call_FillComplete_on_result) {
262  MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
263 
264  } else {
265  // mfh 27 Sep 2016: Is this the "slow" case? This
266  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
267  // thread-parallel inserts, but that may take some effort.
268  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
269 
270  MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
271  }
272 
273 #ifdef HAVE_TPETRA_MMM_TIMINGS
274  TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
275 #endif
276  if (call_FillComplete_on_result && !C.isFillComplete()) {
277  // We'll call FillComplete on the C matrix before we exit, and give it a
278  // domain-map and a range-map.
279  // The domain-map will be the domain-map of B, unless
280  // op(B)==transpose(B), in which case the range-map of B will be used.
281  // The range-map will be the range-map of A, unless op(A)==transpose(A),
282  // in which case the domain-map of A will be used.
283  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
284  }
285 }
286 
287 
288 template <class Scalar,
289  class LocalOrdinal,
290  class GlobalOrdinal,
291  class Node>
292 void Jacobi(Scalar omega,
297  bool call_FillComplete_on_result,
298  const std::string& label,
299  const Teuchos::RCP<Teuchos::ParameterList>& params)
300 {
301  using Teuchos::RCP;
302  typedef Scalar SC;
303  typedef LocalOrdinal LO;
304  typedef GlobalOrdinal GO;
305  typedef Node NO;
306  typedef Import<LO,GO,NO> import_type;
307  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
308  typedef Map<LO,GO,NO> map_type;
309  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
310 
311 #ifdef HAVE_TPETRA_MMM_TIMINGS
312  std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
313  using Teuchos::TimeMonitor;
314  TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
315 #endif
316 
317  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
318 
319  // A and B should already be Filled.
320  // Should we go ahead and call FillComplete() on them if necessary or error
321  // out? For now, we choose to error out.
322  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
323  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
324 
325  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
326  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
327 
328  // Now check size compatibility
329  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
330  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
331  global_size_t Aouter = A.getGlobalNumRows();
332  global_size_t Bouter = numBCols;
333  global_size_t Ainner = numACols;
334  global_size_t Binner = B.getGlobalNumRows();
335  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
336  prefix << "ERROR, inner dimensions of op(A) and op(B) "
337  "must match for matrix-matrix product. op(A) is "
338  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
339 
340  // The result matrix C must at least have a row-map that reflects the correct
341  // row-size. Don't check the number of columns because rectangular matrices
342  // which were constructed with only one map can still end up having the
343  // correct capacity and dimensions when filled.
344  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
345  prefix << "ERROR, dimensions of result C must "
346  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
347  << " rows, should have at least "<< Aouter << std::endl);
348 
349  // It doesn't matter whether C is already Filled or not. If it is already
350  // Filled, it must have space allocated for the positions that will be
351  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
352  // we'll error out later when trying to store result values.
353 
354  // CGB: However, matrix must be in active-fill
355  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
356 
357  // We're going to need to import remotely-owned sections of A and/or B if
358  // more than one processor is performing this run, depending on the scenario.
359  int numProcs = A.getComm()->getSize();
360 
361  // Declare a couple of structs that will be used to hold views of the data of
362  // A and B, to be used for fast access during the matrix-multiplication.
363  crs_matrix_struct_type Aview;
364  crs_matrix_struct_type Bview;
365 
366  RCP<const map_type> targetMap_A = Aprime->getRowMap();
367  RCP<const map_type> targetMap_B = Bprime->getRowMap();
368 
369 #ifdef HAVE_TPETRA_MMM_TIMINGS
370  TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
371  {
372 #endif
373 
374  // Enable globalConstants by default
375  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
376  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
377  if(!params.is_null()) {
378  importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
379  int mm_optimization_core_count=0;
380  auto slist = params->sublist("matrixmatrix: kernel params",false);
381  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ());
382  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
383  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
384  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
385  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
386  auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
387  ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
388  ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
389  ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
390  }
391 
392  //Now import any needed remote rows and populate the Aview struct.
393  RCP<const import_type> dummyImporter;
394  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, false, label,importParams1);
395 
396  // We will also need local access to all rows of B that correspond to the
397  // column-map of op(A).
398  if (numProcs > 1)
399  targetMap_B = Aprime->getColMap();
400 
401  // Now import any needed remote rows and populate the Bview struct.
402  // Enable globalConstants by default
403  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
404  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
405  if(!params.is_null()) {
406  importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
407 
408  auto slist = params->sublist("matrixmatrix: kernel params",false);
409  int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount () );
410  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
411  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
412  auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
413  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
414  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
415  ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
416  ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
417  ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
418  }
419 
420  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), false, label,importParams2);
421 
422 #ifdef HAVE_TPETRA_MMM_TIMINGS
423  }
424  TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
425 #endif
426 
427  // Now call the appropriate method to perform the actual multiplication.
428  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
429 
430  // Is this a "clean" matrix
431  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
432 
433  if (call_FillComplete_on_result && newFlag) {
434  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
435 
436  } else if (call_FillComplete_on_result) {
437  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
438 
439  } else {
440  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
441  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
442  // commenting it out.
443 // #ifdef HAVE_TPETRA_MMM_TIMINGS
444 // MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt: Jacobi FillComplete")));
445 // #endif
446  // FIXME (mfh 03 Apr 2014) This statement is unreachable, so I'm
447  // commenting it out.
448  // if (call_FillComplete_on_result) {
449  // //We'll call FillComplete on the C matrix before we exit, and give
450  // //it a domain-map and a range-map.
451  // //The domain-map will be the domain-map of B, unless
452  // //op(B)==transpose(B), in which case the range-map of B will be used.
453  // //The range-map will be the range-map of A, unless
454  // //op(A)==transpose(A), in which case the domain-map of A will be used.
455  // if (!C.isFillComplete()) {
456  // C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
457  // }
458  // }
459  }
460 }
461 
462 
463 template <class Scalar,
464  class LocalOrdinal,
465  class GlobalOrdinal,
466  class Node>
467 void Add(
469  bool transposeA,
470  Scalar scalarA,
472  Scalar scalarB )
473 {
474  using Teuchos::Array;
475  using Teuchos::RCP;
476  using Teuchos::null;
477  typedef Scalar SC;
478  typedef LocalOrdinal LO;
479  typedef GlobalOrdinal GO;
480  typedef Node NO;
481  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
482  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
483 
484  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
485 
486  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
487  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
488  "(Result matrix B is not required to be isFillComplete()).");
489  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
490  prefix << "ERROR, input matrix B must not be fill complete!");
491  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
492  prefix << "ERROR, input matrix B must not have static graph!");
493  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
494  prefix << "ERROR, input matrix B must not be locally indexed!");
495 
496  using Teuchos::ParameterList;
497  RCP<ParameterList> transposeParams (new ParameterList);
498  transposeParams->set ("sort", false);
499 
500  RCP<const crs_matrix_type> Aprime = null;
501  if (transposeA) {
502  transposer_type transposer (rcpFromRef (A));
503  Aprime = transposer.createTranspose (transposeParams);
504  }
505  else {
506  Aprime = rcpFromRef(A);
507  }
508 
509  size_t a_numEntries;
510  Array<GO> a_inds(A.getNodeMaxNumRowEntries());
511  Array<SC> a_vals(A.getNodeMaxNumRowEntries());
512  GO row;
513 
514  if (scalarB != Teuchos::ScalarTraits<SC>::one())
515  B.scale(scalarB);
516 
517  bool bFilled = B.isFillComplete();
518  size_t numMyRows = B.getNodeNumRows();
519  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
520  for (LO i = 0; (size_t)i < numMyRows; ++i) {
521  row = B.getRowMap()->getGlobalElement(i);
522  Aprime->getGlobalRowCopy(row, a_inds(), a_vals(), a_numEntries);
523 
524  if (scalarA != Teuchos::ScalarTraits<SC>::one())
525  for (size_t j = 0; j < a_numEntries; ++j)
526  a_vals[j] *= scalarA;
527 
528  if (bFilled)
529  B.sumIntoGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
530  else
531  B.insertGlobalValues(row, a_inds(0,a_numEntries), a_vals(0,a_numEntries));
532  }
533  }
534 }
535 
536 namespace ColMapFunctors
537 {
538  template<typename ByteView, typename GView>
539  struct UnionEntries
540  {
541  UnionEntries(ByteView entryUnion_, const GView gids_) : entryUnion(entryUnion_), gids(gids_) {}
542  KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const
543  {
544  entryUnion(gids(i)) = 1;
545  }
546  ByteView entryUnion;
547  const GView gids;
548  };
549 
550  template<typename LView, typename GView>
551  struct ConvertGlobalToLocal
552  {
553  ConvertGlobalToLocal(const LView gtol_, const GView gids_, LView lids_) : gtol(gtol_), gids(gids_), lids(lids_) {}
554  KOKKOS_INLINE_FUNCTION void operator()(const size_t i) const
555  {
556  lids(i) = gtol(gids(i));
557  }
558  const LView gtol;
559  const GView gids;
560  LView lids;
561  };
562 }//end ColMapFunctors
563 
564 //Build the minimal (sorted) column map for the given set of global columns
565 //Then convert gids and store them in lids (gids is not modified)
566 template<typename Scalar, typename LocalOrdinal, typename GlobalOrdinal, typename Node>
567 Teuchos::RCP<Map<LocalOrdinal, GlobalOrdinal, Node> > AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
568 makeColMapAndConvertGids(GlobalOrdinal ncols,
569  const typename AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::global_col_inds_array& gids,
570  typename AddDetails::AddKernels<Scalar, LocalOrdinal, GlobalOrdinal, Node>::col_inds_array& lids,
571  const Teuchos::RCP<const Teuchos::Comm<int>>& comm)
572 {
573  using namespace ColMapFunctors;
574  using Teuchos::RCP;
575  using Teuchos::rcp;
576  typedef Kokkos::View<char*, device_type> ByteView;
577  typedef global_col_inds_array GView;
578  typedef col_inds_array LView;
579  //Functors (explained in the procedural code below)
580  auto nentries = gids.extent(0);
581  //each entry of entryUnion is 0 unless there is a local entry in that column (then it is 1)
582  ByteView entryUnion("entry union", ncols);
583  UnionEntries<ByteView, GView> ue(entryUnion, gids);
584  Kokkos::parallel_for("Tpetra_MatrixMatrix_unionEntries", range_type(0, nentries), ue);
585  //turn entryUnion into prefix sum gtol (where gtol(i) gives the new local col for global col i)
586  LView gtol("global col -> local col", ncols + 1);
587  ::Tpetra::Details::computeOffsetsFromCounts<decltype(gtol), decltype(entryUnion)>(gtol, entryUnion);
588  //convert gids to local ids and put them in lids (implicitly sorted as long as input gids is sorted per row)
589  ConvertGlobalToLocal<LView, GView> cgtl(gtol, gids, lids);
590  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertGlobalToLocal", range_type(0, gids.extent(0)), cgtl);
591  //build local set of GIDs for constructing column map - the last entry in gtol is the total number of local cols
592  execution_space().fence();
593  GView colmap("column map", gtol(ncols));
594  size_t localIter = 0;
595  execution_space().fence();
596  for(size_t i = 0; i < entryUnion.extent(0); i++)
597  {
598  if(entryUnion(i) != 0)
599  {
600  colmap(localIter++) = i;
601  }
602  }
603  execution_space().fence();
604  //finally, construct Tpetra map
605  return rcp(new map_type(Teuchos::OrdinalTraits<GlobalOrdinal>::invalid(), colmap, 0, comm));
606 }
607 
608 template <class Scalar,
609  class LocalOrdinal,
610  class GlobalOrdinal,
611  class Node>
612 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
613 add (const Scalar& alpha,
614  const bool transposeA,
616  const Scalar& beta,
617  const bool transposeB,
619  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
620  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
621  const Teuchos::RCP<Teuchos::ParameterList>& params)
622 {
623  typedef Map<LocalOrdinal,GlobalOrdinal,Node> map_type;
625  Teuchos::RCP<const map_type> CrowMap = transposeB ? B.getDomainMap() : B.getRowMap();
626 
627  Teuchos::RCP<crs_matrix_type> C = rcp(new crs_matrix_type(CrowMap, 0));
628 
629  add(alpha,transposeA,A,beta,transposeB,B,*C,domainMap,rangeMap,params);
630  return C;
631 }
632 
633 template <class Scalar,
634  class LocalOrdinal,
635  class GlobalOrdinal,
636  class Node>
637 void
638 add (const Scalar& alpha,
639  const bool transposeA,
641  const Scalar& beta,
642  const bool transposeB,
645  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
646  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
647  const Teuchos::RCP<Teuchos::ParameterList>& params)
648 {
649  using Teuchos::RCP;
650  using Teuchos::rcp;
651  using Teuchos::rcpFromRef;
652  using Teuchos::rcp_implicit_cast;
653  using Teuchos::rcp_dynamic_cast;
654  using Teuchos::TimeMonitor;
655  typedef Scalar SC;
656  typedef LocalOrdinal LO;
657  typedef GlobalOrdinal GO;
658  typedef Node NO;
659  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
660  typedef Map<LO,GO,NO> map_type;
661  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
662  typedef Import<LO,GO,NO> import_type;
663  typedef Export<LO,GO,NO> export_type;
664  typedef AddDetails::AddKernels<SC,LO,GO,NO> AddKern;
665  const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
666  constexpr bool debug = false;
667 
668 #ifdef HAVE_TPETRA_MMM_TIMINGS
669  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
670 #endif
671 
672  if (debug) {
673  std::ostringstream os;
674  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
675  << "TpetraExt::MatrixMatrix::add" << std::endl;
676  std::cerr << os.str ();
677  }
678 
679  TEUCHOS_TEST_FOR_EXCEPTION
680  (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
681  prefix_mmm << "A and B must both be fill complete.");
682 #ifdef HAVE_TPETRA_DEBUG
683  // The matrices don't have domain or range Maps unless they are fill complete.
684  if (A.isFillComplete () && B.isFillComplete ()) {
685  const bool domainMapsSame =
686  (! transposeA && ! transposeB &&
687  ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
688  (! transposeA && transposeB &&
689  ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
690  ( transposeA && ! transposeB &&
691  ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
692  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
693  prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
694 
695  const bool rangeMapsSame =
696  (! transposeA && ! transposeB &&
697  ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
698  (! transposeA && transposeB &&
699  ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
700  ( transposeA && ! transposeB &&
701  ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
702  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
703  prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
704  }
705 #endif // HAVE_TPETRA_DEBUG
706 
707  using Teuchos::ParameterList;
708  RCP<ParameterList> transposeParams (new ParameterList);
709  transposeParams->set ("sort", false);
710 
711  auto comm = A.getComm();
712  // Form the explicit transpose of A if necessary.
713  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
714  if (transposeA) {
715  transposer_type transposer (Aprime);
716  Aprime = transposer.createTranspose (transposeParams);
717  }
718 
719 #ifdef HAVE_TPETRA_DEBUG
720  TEUCHOS_TEST_FOR_EXCEPTION
721  (Aprime.is_null (), std::logic_error,
722  prefix_mmm << "Failed to compute Op(A). "
723  "Please report this bug to the Tpetra developers.");
724 #endif // HAVE_TPETRA_DEBUG
725 
726  // Form the explicit transpose of B if necessary.
727  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
728  if (transposeB) {
729  if (debug) {
730  std::ostringstream os;
731  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
732  << "Form explicit xpose of B" << std::endl;
733  std::cerr << os.str ();
734  }
735  transposer_type transposer (Bprime);
736  Bprime = transposer.createTranspose (transposeParams);
737  }
738 #ifdef HAVE_TPETRA_DEBUG
739  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
740  prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
741  TEUCHOS_TEST_FOR_EXCEPTION(
742  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
743  prefix_mmm << "Aprime and Bprime must both be fill complete. "
744  "Please report this bug to the Tpetra developers.");
745 #endif // HAVE_TPETRA_DEBUG
746  RCP<const map_type> CDomainMap = domainMap;
747  RCP<const map_type> CRangeMap = rangeMap;
748  if(CDomainMap.is_null())
749  {
750  CDomainMap = Bprime->getDomainMap();
751  }
752  if(CRangeMap.is_null())
753  {
754  CRangeMap = Bprime->getRangeMap();
755  }
756  assert(!(CDomainMap.is_null()));
757  assert(!(CRangeMap.is_null()));
758  typedef typename AddKern::values_array values_array;
759  typedef typename AddKern::row_ptrs_array row_ptrs_array;
760  typedef typename AddKern::col_inds_array col_inds_array;
761  bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
762  bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
763  values_array vals;
764  row_ptrs_array rowptrs;
765  col_inds_array colinds;
766  auto acolmap = Aprime->getColMap()->getMyGlobalIndices();
767  auto bcolmap = Bprime->getColMap()->getMyGlobalIndices();
768 #ifdef HAVE_TPETRA_MMM_TIMINGS
769  MM = Teuchos::null;
770  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
771 #endif
772  if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
773  {
774  //import Aprime into Bprime's row map so the local matrices have same # of rows
775  auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
776  // cbl do not set
777  // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
778  // this import _may_ take the form of a transfer. In practice it would be unlikely,
779  // but the general case is not so forgiving.
780 
781  Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
782  }
783  bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
784  bool sorted = AGraphSorted && BGraphSorted;
785  RCP<const map_type> CrowMap;
786  RCP<const map_type> CcolMap;
787  RCP<const import_type> Cimport = Teuchos::null;
788  RCP<export_type> Cexport = Teuchos::null;
789  //The unsorted KCRS addition kernel uses std::sort(), which can't run on CUDA
790  if(!matchingColMaps && !(CDomainMap->isContiguous()))
791  {
792  //can't do with current set of kernels, so fall back to original (slow) version
793 #ifdef HAVE_TPETRA_MMM_TIMINGS
794  MM = Teuchos::null;
795  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("fallback to CrsMatrix::add"))));
796 #endif
797  if (debug) {
798  std::ostringstream os;
799  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
800  << "Call Bprime->add(...)" << std::endl;
801  std::cerr << os.str ();
802  }
803  Teuchos::RCP<crs_matrix_type> C_ = Teuchos::rcp_static_cast<crs_matrix_type>(Bprime->add(alpha, *Aprime, beta, CDomainMap, CRangeMap, params));
804  C.replaceColMap(C_->getColMap());
805  C.setAllValues(C_->getLocalMatrix().graph.row_map,C_->getLocalMatrix().graph.entries,C_->getLocalMatrix().values);
806  C.expertStaticFillComplete(CDomainMap, CRangeMap, C_->getGraph()->getImporter(), C_->getGraph()->getExporter(), params);
807  return;
808  }
809  else if(!matchingColMaps)
810  {
811 #ifdef HAVE_TPETRA_MMM_TIMINGS
812  MM = Teuchos::null;
813  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
814 #endif
815  //use kernel that converts col indices in both A and B to common domain map before adding
816  auto Acolmap = Aprime->getColMap()->getMyGlobalIndices();
817  auto Bcolmap = Bprime->getColMap()->getMyGlobalIndices();
818  typename AddKern::global_col_inds_array globalColinds("", 0);
819  if (debug) {
820  std::ostringstream os;
821  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
822  << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
823  std::cerr << os.str ();
824  }
825  AddKern::convertToGlobalAndAdd(
826  Aprime->getLocalMatrix(), alpha, Bprime->getLocalMatrix(), beta, Acolmap, Bcolmap,
827  CRangeMap->getMinGlobalIndex(), Aprime->getGlobalNumCols(), vals, rowptrs, globalColinds);
828  colinds = col_inds_array("C colinds", globalColinds.extent(0));
829  if (debug) {
830  std::ostringstream os;
831  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
832  << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
833  std::cerr << os.str ();
834  }
835 #ifdef HAVE_TPETRA_MMM_TIMINGS
836  MM = Teuchos::null;
837  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("building optimized column map"))));
838 #endif
839  CrowMap = Bprime->getRowMap();
840  //OLD: CcolMap = AddKern::makeColMapAndConvertGids(Aprime->getGlobalNumCols(), globalColinds, colinds, comm);
841  //Get C's column map as the union of Aprime and Bprime col maps
842  CcolMap = AddKern::makeColMapAndConvertGids(Aprime->getGlobalNumCols(), globalColinds, colinds, comm);
843  }
844  else
845  {
846  //Aprime, Bprime and C all have the same column maps
847  auto localA = Aprime->getLocalMatrix();
848  auto localB = Bprime->getLocalMatrix();
849  auto Avals = localA.values;
850  auto Bvals = localB.values;
851  auto Arowptrs = localA.graph.row_map;
852  auto Browptrs = localB.graph.row_map;
853  auto Acolinds = localA.graph.entries;
854  auto Bcolinds = localB.graph.entries;
855  if(sorted)
856  {
857  //use sorted kernel
858 #ifdef HAVE_TPETRA_MMM_TIMINGS
859  MM = Teuchos::null;
860  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
861 #endif
862  if (debug) {
863  std::ostringstream os;
864  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
865  << "Call AddKern::addSorted(...)" << std::endl;
866  std::cerr << os.str ();
867  }
868  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
869  }
870  else
871  {
872  //use unsorted kernel
873 #ifdef HAVE_TPETRA_MMM_TIMINGS
874  MM = Teuchos::null;
875  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
876 #endif
877  if (debug) {
878  std::ostringstream os;
879  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
880  << "Call AddKern::addUnsorted(...)" << std::endl;
881  std::cerr << os.str ();
882  }
883  AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
884  }
885  CrowMap = Bprime->getRowMap();
886  CcolMap = Bprime->getColMap();
887  //note: Cexport created below (if it's needed)
888  }
889 #ifdef HAVE_TPETRA_MMM_TIMINGS
890  MM = Teuchos::null;
891  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs constructor"))));
892 #endif
893  // C = rcp(new crs_matrix_type(CrowMap, CcolMap, rowptrs, colinds, vals, params));
894  C.replaceColMap(CcolMap);
895  C.setAllValues(rowptrs,colinds,vals);
896 #ifdef HAVE_TPETRA_MMM_TIMINGS
897  MM = Teuchos::null;
898  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
899 #endif
900  if(!CDomainMap->isSameAs(*CcolMap))
901  {
902  if (debug) {
903  std::ostringstream os;
904  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
905  << "Create Cimport" << std::endl;
906  std::cerr << os.str ();
907  }
908  Cimport = rcp(new import_type(CDomainMap, CcolMap));
909  }
910  if(!CrowMap->isSameAs(*CRangeMap))
911  {
912  if (debug) {
913  std::ostringstream os;
914  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
915  << "Create Cexport" << std::endl;
916  std::cerr << os.str ();
917  }
918  Cexport = rcp(new export_type(CrowMap, CRangeMap));
919  }
920 
921  if (debug) {
922  std::ostringstream os;
923  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
924  << "Call C->expertStaticFillComplete(...)" << std::endl;
925  std::cerr << os.str ();
926  }
927  C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
928 
929 }
930 
931 template <class Scalar,
932  class LocalOrdinal,
933  class GlobalOrdinal,
934  class Node>
935 void Add(
937  bool transposeA,
938  Scalar scalarA,
940  bool transposeB,
941  Scalar scalarB,
943 {
944  using Teuchos::Array;
945  using Teuchos::ArrayRCP;
946  using Teuchos::ArrayView;
947  using Teuchos::RCP;
948  using Teuchos::rcp;
949  using Teuchos::rcp_dynamic_cast;
950  using Teuchos::rcpFromRef;
951  using Teuchos::tuple;
952  using std::endl;
953  // typedef typename ArrayView<const Scalar>::size_type size_type;
954  typedef Teuchos::ScalarTraits<Scalar> STS;
956  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
957  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
958  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
961 
962  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
963 
964  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
965  prefix << "The case C == null does not actually work. Fixing this will require an interface change.");
966 
967  TEUCHOS_TEST_FOR_EXCEPTION(
968  ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
969  prefix << "Both input matrices must be fill complete before calling this function.");
970 
971 #ifdef HAVE_TPETRA_DEBUG
972  {
973  const bool domainMapsSame =
974  (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
975  (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
976  (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
977  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
978  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
979 
980  const bool rangeMapsSame =
981  (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
982  (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
983  (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
984  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
985  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
986  }
987 #endif // HAVE_TPETRA_DEBUG
988 
989  using Teuchos::ParameterList;
990  RCP<ParameterList> transposeParams (new ParameterList);
991  transposeParams->set ("sort", false);
992 
993  // Form the explicit transpose of A if necessary.
994  RCP<const crs_matrix_type> Aprime;
995  if (transposeA) {
996  transposer_type theTransposer (rcpFromRef (A));
997  Aprime = theTransposer.createTranspose (transposeParams);
998  }
999  else {
1000  Aprime = rcpFromRef (A);
1001  }
1002 
1003 #ifdef HAVE_TPETRA_DEBUG
1004  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1005  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
1006 #endif // HAVE_TPETRA_DEBUG
1007 
1008  // Form the explicit transpose of B if necessary.
1009  RCP<const crs_matrix_type> Bprime;
1010  if (transposeB) {
1011  transposer_type theTransposer (rcpFromRef (B));
1012  Bprime = theTransposer.createTranspose (transposeParams);
1013  }
1014  else {
1015  Bprime = rcpFromRef (B);
1016  }
1017 
1018 #ifdef HAVE_TPETRA_DEBUG
1019  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1020  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
1021 #endif // HAVE_TPETRA_DEBUG
1022 
1023  // Allocate or zero the entries of the result matrix.
1024  if (! C.is_null ()) {
1025  C->setAllToScalar (STS::zero ());
1026  } else {
1027 #if 0
1028  // If Aprime and Bprime have the same row Map, and if C is null,
1029  // we can optimize construction and fillComplete of C. For now,
1030  // we just check pointer equality, to avoid the all-reduce in
1031  // isSameAs. It may be worth that all-reduce to check, however.
1032  //if (Aprime->getRowMap ().getRawPtr () == Bprime->getRowMap ().getRawPtr ())
1033  if (Aprime->getRowMap ()->isSameAs (* (Bprime->getRowMap ())) {
1034  RCP<const map_type> rowMap = Aprime->getRowMap ();
1035 
1036  RCP<const crs_graph_type> A_graph =
1037  rcp_dynamic_cast<const crs_graph_type> (Aprime->getGraph (), true);
1038 #ifdef HAVE_TPETRA_DEBUG
1039  TEUCHOS_TEST_FOR_EXCEPTION(A_graph.is_null (), std::logic_error,
1040  "Tpetra::MatrixMatrix::Add: Graph of Op(A) is null. "
1041  "Please report this bug to the Tpetra developers.");
1042 #endif // HAVE_TPETRA_DEBUG
1043  RCP<const crs_graph_type> B_graph =
1044  rcp_dynamic_cast<const crs_graph_type> (Bprime->getGraph (), true);
1045 #ifdef HAVE_TPETRA_DEBUG
1046  TEUCHOS_TEST_FOR_EXCEPTION(B_graph.is_null (), std::logic_error,
1047  "Tpetra::MatrixMatrix::Add: Graph of Op(B) is null. "
1048  "Please report this bug to the Tpetra developers.");
1049 #endif // HAVE_TPETRA_DEBUG
1050  RCP<const map_type> A_colMap = A_graph->getColMap ();
1051 #ifdef HAVE_TPETRA_DEBUG
1052  TEUCHOS_TEST_FOR_EXCEPTION(A_colMap.is_null (), std::logic_error,
1053  "Tpetra::MatrixMatrix::Add: Column Map of Op(A) is null. "
1054  "Please report this bug to the Tpetra developers.");
1055 #endif // HAVE_TPETRA_DEBUG
1056  RCP<const map_type> B_colMap = B_graph->getColMap ();
1057 #ifdef HAVE_TPETRA_DEBUG
1058  TEUCHOS_TEST_FOR_EXCEPTION(B_colMap.is_null (), std::logic_error,
1059  "Tpetra::MatrixMatrix::Add: Column Map of Op(B) is null. "
1060  "Please report this bug to the Tpetra developers.");
1061  TEUCHOS_TEST_FOR_EXCEPTION(A_graph->getImporter ().is_null (),
1062  std::logic_error,
1063  "Tpetra::MatrixMatrix::Add: Op(A)'s Import is null. "
1064  "Please report this bug to the Tpetra developers.");
1065  TEUCHOS_TEST_FOR_EXCEPTION(B_graph->getImporter ().is_null (),
1066  std::logic_error,
1067  "Tpetra::MatrixMatrix::Add: Op(B)'s Import is null. "
1068  "Please report this bug to the Tpetra developers.");
1069 #endif // HAVE_TPETRA_DEBUG
1070 
1071  // Compute the (column Map and) Import of the matrix sum.
1072  RCP<const import_type> sumImport =
1073  A_graph->getImporter ()->setUnion (* (B_graph->getImporter ()));
1074  RCP<const map_type> C_colMap = sumImport->getTargetMap ();
1075 
1076  // First, count the number of entries in each row. Then, go
1077  // back over the rows again, and compute the actual sum.
1078  // Remember that C may have a different column Map than Aprime
1079  // or Bprime, so its local indices may be different. That's why
1080  // we have to convert from local to global indices.
1081 
1082  ArrayView<const LocalOrdinal> A_local_ind;
1083  Array<GlobalOrdinal> A_global_ind;
1084  ArrayView<const LocalOrdinal> B_local_ind;
1085  Array<GlobalOrdinal> B_global_ind;
1086 
1087  const size_t localNumRows = rowMap->getNodeNumElements ();
1088  ArrayRCP<size_t> numEntriesPerRow (localNumRows);
1089  // Compute the max number of entries in any row of A+B on this
1090  // process, so that we won't have to resize temporary arrays.
1091  size_t maxNumEntriesPerRow = 0;
1092  for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
1093  // Get view of current row of A_graph, in its local indices.
1094  A_graph->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind);
1095  const size_type A_numEnt = A_local_ind.size ();
1096  if (A_numEnt > A_global_ind.size ()) {
1097  A_global_ind.resize (A_numEnt);
1098  }
1099  // Convert A's local indices to global indices.
1100  for (size_type k = 0; k < A_numEnt; ++k) {
1101  A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
1102  }
1103 
1104  // Get view of current row of B_graph, in its local indices.
1105  B_graph->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind);
1106  const size_type B_numEnt = B_local_ind.size ();
1107  if (B_numEnt > B_global_ind.size ()) {
1108  B_global_ind.resize (B_numEnt);
1109  }
1110  // Convert B's local indices to global indices.
1111  for (size_type k = 0; k < B_numEnt; ++k) {
1112  B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
1113  }
1114 
1115  // Count the number of entries in the merged row of A + B.
1116  const size_t curNumEntriesPerRow =
1117  keyMergeCount (A_global_ind.begin (), A_global_ind.end (),
1118  B_global_ind.begin (), B_global_ind.end ());
1119  numEntriesPerRow[localRow] = curNumEntriesPerRow;
1120  maxNumEntriesPerRow = std::max (maxNumEntriesPerRow, curNumEntriesPerRow);
1121  }
1122 
1123  // Create C, using the sum column Map and number of entries per
1124  // row that we computed above. Having the exact number of
1125  // entries per row lets us use static profile, making it valid
1126  // to call expertStaticFillComplete.
1127  C = rcp (new crs_matrix_type (rowMap, C_colMap, numEntriesPerRow, StaticProfile));
1128 
1129  // Go back through the rows and actually compute the sum. We
1130  // don't ever have to resize A_global_ind or B_global_ind below,
1131  // since we've already done it above.
1132  ArrayView<const Scalar> A_val;
1133  ArrayView<const Scalar> B_val;
1134 
1135  Array<LocalOrdinal> AplusB_local_ind (maxNumEntriesPerRow);
1136  Array<GlobalOrdinal> AplusB_global_ind (maxNumEntriesPerRow);
1137  Array<Scalar> AplusB_val (maxNumEntriesPerRow);
1138 
1139  for (size_t localRow = 0; localRow < localNumRows; ++localRow) {
1140  // Get view of current row of A, in A's local indices.
1141  Aprime->getLocalRowView (as<LocalOrdinal> (localRow), A_local_ind, A_val);
1142  // Convert A's local indices to global indices.
1143  for (size_type k = 0; k < A_local_ind.size (); ++k) {
1144  A_global_ind[k] = A_colMap->getGlobalElement (A_local_ind[k]);
1145  }
1146 
1147  // Get view of current row of B, in B's local indices.
1148  Bprime->getLocalRowView (as<LocalOrdinal> (localRow), B_local_ind, B_val);
1149  // Convert B's local indices to global indices.
1150  for (size_type k = 0; k < B_local_ind.size (); ++k) {
1151  B_global_ind[k] = B_colMap->getGlobalElement (B_local_ind[k]);
1152  }
1153 
1154  const size_t curNumEntries = numEntriesPerRow[localRow];
1155  ArrayView<LocalOrdinal> C_local_ind = AplusB_local_ind (0, curNumEntries);
1156  ArrayView<GlobalOrdinal> C_global_ind = AplusB_global_ind (0, curNumEntries);
1157  ArrayView<Scalar> C_val = AplusB_val (0, curNumEntries);
1158 
1159  // Sum the entries in the current row of A plus B.
1160  keyValueMerge (A_global_ind.begin (), A_global_ind.end (),
1161  A_val.begin (), A_val.end (),
1162  B_global_ind.begin (), B_global_ind.end (),
1163  B_val.begin (), B_val.end (),
1164  C_global_ind.begin (), C_val.begin (),
1165  std::plus<Scalar> ());
1166  // Convert the sum's global indices into C's local indices.
1167  for (size_type k = 0; k < as<size_type> (numEntriesPerRow[localRow]); ++k) {
1168  C_local_ind[k] = C_colMap->getLocalElement (C_global_ind[k]);
1169  }
1170  // Give the current row sum to C.
1171  C->replaceLocalValues (localRow, C_local_ind, C_val);
1172  }
1173 
1174  // Use "expert static fill complete" to bypass construction of
1175  // the Import and Export (if applicable) object(s).
1176  RCP<const map_type> domainMap = A_graph->getDomainMap ();
1177  RCP<const map_type> rangeMap = A_graph->getRangeMap ();
1178  C->expertStaticFillComplete (domainMap, rangeMap, sumImport, A_graph->getExporter ());
1179 
1180  return; // Now we're done!
1181  }
1182  else {
1183  // FIXME (mfh 08 May 2013) When I first looked at this method, I
1184  // noticed that C was being given the row Map of Aprime (the
1185  // possibly transposed version of A). Is this what we want?
1186  C = rcp (new crs_matrix_type (Aprime->getRowMap (), null));
1187  }
1188 
1189 #else
1190  // FIXME (mfh 08 May 2013) When I first looked at this method, I
1191  // noticed that C was being given the row Map of Aprime (the
1192  // possibly transposed version of A). Is this what we want?
1193  C = rcp (new crs_matrix_type (Aprime->getRowMap (), 0));
1194 #endif // 0
1195  }
1196 
1197 #ifdef HAVE_TPETRA_DEBUG
1198  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1199  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1200  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1201  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1202  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1203  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1204 #endif // HAVE_TPETRA_DEBUG
1205 
1206  Array<RCP<const crs_matrix_type> > Mat =
1207  tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1208  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1209 
1210  // do a loop over each matrix to add: A reordering might be more efficient
1211  for (int k = 0; k < 2; ++k) {
1212  Array<GlobalOrdinal> Indices;
1213  Array<Scalar> Values;
1214 
1215  // Loop over each locally owned row of the current matrix (either
1216  // Aprime or Bprime), and sum its entries into the corresponding
1217  // row of C. This works regardless of whether Aprime or Bprime
1218  // has the same row Map as C, because both sumIntoGlobalValues and
1219  // insertGlobalValues allow summing resp. inserting into nonowned
1220  // rows of C.
1221 #ifdef HAVE_TPETRA_DEBUG
1222  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1223  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1224 #endif // HAVE_TPETRA_DEBUG
1225  RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1226 #ifdef HAVE_TPETRA_DEBUG
1227  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1228  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1229 #endif // HAVE_TPETRA_DEBUG
1230 
1231  const size_t localNumRows = Mat[k]->getNodeNumRows ();
1232  for (size_t i = 0; i < localNumRows; ++i) {
1233  const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1234  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1235  if (numEntries > 0) {
1236  Indices.resize (numEntries);
1237  Values.resize (numEntries);
1238  Mat[k]->getGlobalRowCopy (globalRow, Indices (), Values (), numEntries);
1239 
1240  if (scalar[k] != STS::one ()) {
1241  for (size_t j = 0; j < numEntries; ++j) {
1242  Values[j] *= scalar[k];
1243  }
1244  }
1245 
1246  if (C->isFillComplete ()) {
1247  C->sumIntoGlobalValues (globalRow, Indices, Values);
1248  } else {
1249  C->insertGlobalValues (globalRow, Indices, Values);
1250  }
1251  }
1252  }
1253  }
1254 }
1255 
1256 template<typename SC, typename LO, typename GO, typename NO>
1257 void AddDetails::AddKernels<SC, LO, GO, NO>::
1258 addSorted(
1259  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
1260  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
1261  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
1262  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
1263  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
1264  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
1265  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
1266  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
1267  typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
1268  typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
1269  typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
1270 {
1271  using Teuchos::TimeMonitor;
1272  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
1273  auto nrows = Arowptrs.extent(0) - 1;
1274  Crowptrs = row_ptrs_array("C row ptrs", nrows + 1);
1275  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, LO, impl_scalar_type,
1276  execution_space, memory_space, memory_space> KKH;
1277  KKH handle;
1278  handle.create_spadd_handle(true);
1279  auto addHandle = handle.get_spadd_handle();
1280 #ifdef HAVE_TPETRA_MMM_TIMINGS
1281  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
1282 #endif
1283  KokkosSparse::Experimental::spadd_symbolic
1284  <KKH,
1285  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1286  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1287  row_ptrs_array, col_inds_array>
1288  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
1289  Cvals = values_array("C values", addHandle->get_max_result_nnz());
1290  Ccolinds = col_inds_array("C colinds", addHandle->get_max_result_nnz());
1291 #ifdef HAVE_TPETRA_MMM_TIMINGS
1292  MM = Teuchos::null;
1293  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
1294 #endif
1295  KokkosSparse::Experimental::spadd_numeric(&handle,
1296  Arowptrs, Acolinds, Avals, scalarA,
1297  Browptrs, Bcolinds, Bvals, scalarB,
1298  Crowptrs, Ccolinds, Cvals);
1299 }
1300 
1301 template<typename SC, typename LO, typename GO, typename NO>
1302 void AddDetails::AddKernels<SC, LO, GO, NO>::
1303 addUnsorted(
1304  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
1305  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
1306  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
1307  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
1308  const typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
1309  const typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
1310  const typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
1311  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
1312  GO /* numGlobalCols */,
1313  typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
1314  typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
1315  typename AddDetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
1316 {
1317  using Teuchos::TimeMonitor;
1318  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
1319  auto nrows = Arowptrs.extent(0) - 1;
1320  Crowptrs = row_ptrs_array("C row ptrs", nrows + 1);
1321  typedef AddDetails::AddKernels<SC, LO, GO, NO> AddKern;
1322  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, LO, AddKern::impl_scalar_type,
1323  AddKern::execution_space, AddKern::memory_space, AddKern::memory_space> KKH;
1324  KKH handle;
1325  handle.create_spadd_handle(false);
1326  auto addHandle = handle.get_spadd_handle();
1327 #ifdef HAVE_TPETRA_MMM_TIMINGS
1328  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
1329 #endif
1330  KokkosSparse::Experimental::spadd_symbolic
1331  <KKH,
1332  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1333  typename row_ptrs_array::const_type, typename col_inds_array::const_type,
1334  row_ptrs_array, col_inds_array>
1335  (&handle, Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
1336  Cvals = values_array("C values", addHandle->get_max_result_nnz());
1337  Ccolinds = col_inds_array("C colinds", addHandle->get_max_result_nnz());
1338 #ifdef HAVE_TPETRA_MMM_TIMINGS
1339  MM = Teuchos::null;
1340  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted kernel: sorted numeric")));
1341 #endif
1342  KokkosSparse::Experimental::spadd_numeric(&handle,
1343  Arowptrs, Acolinds, Avals, scalarA,
1344  Browptrs, Bcolinds, Bvals, scalarB,
1345  Crowptrs, Ccolinds, Cvals);
1346 }
1347 
1348 template<typename GO,
1349  typename LocalIndicesType,
1350  typename GlobalIndicesType,
1351  typename ColMapType>
1352 struct ConvertColIndsFunctor
1353 {
1354  ConvertColIndsFunctor (const GO minGlobal_,
1355  const LocalIndicesType& colindsOrig_,
1356  const GlobalIndicesType& colindsConverted_,
1357  const ColMapType& colmap_) :
1358  minGlobal (minGlobal_),
1359  colindsOrig (colindsOrig_),
1360  colindsConverted (colindsConverted_),
1361  colmap (colmap_)
1362  {}
1363  KOKKOS_INLINE_FUNCTION void
1364  operator() (const size_t& i) const
1365  {
1366  colindsConverted[i] = colmap[colindsOrig[i]];
1367  }
1368  GO minGlobal;
1369  LocalIndicesType colindsOrig;
1370  GlobalIndicesType colindsConverted;
1371  ColMapType colmap;
1372 };
1373 
1374 template<typename SC, typename LO, typename GO, typename NO>
1375 void AddDetails::AddKernels<SC, LO, GO, NO>::
1376 convertToGlobalAndAdd(
1377  const typename AddDetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
1378  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
1379  const typename AddDetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
1380  const typename AddDetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
1381  const typename AddDetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
1382  const typename AddDetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
1383  GO minGlobalCol,
1384  GO /* numGlobalCols */,
1385  typename AddDetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
1386  typename AddDetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
1387  typename AddDetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
1388 {
1389  using Teuchos::TimeMonitor;
1390 
1391  const values_array& Avals = A.values;
1392  const values_array& Bvals = B.values;
1393  const col_inds_array& Acolinds = A.graph.entries;
1394  const col_inds_array& Bcolinds = B.graph.entries;
1395  auto Arowptrs = A.graph.row_map;
1396  auto Browptrs = B.graph.row_map;
1397  global_col_inds_array AcolindsConverted("A colinds (converted)", Acolinds.extent(0));
1398  global_col_inds_array BcolindsConverted("B colinds (converted)", Bcolinds.extent(0));
1399 #ifdef HAVE_TPETRA_MMM_TIMINGS
1400  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: " + std::string("column map conversion"))));
1401 #endif
1402  ConvertColIndsFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(minGlobalCol, Acolinds, AcolindsConverted, AcolMap);
1403  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
1404  ConvertColIndsFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(minGlobalCol, Bcolinds, BcolindsConverted, BcolMap);
1405  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
1406  typedef KokkosKernels::Experimental::KokkosKernelsHandle<typename col_inds_array::size_type, GO, impl_scalar_type,
1407  execution_space, memory_space, memory_space> KKH;
1408  KKH handle;
1409  handle.create_spadd_handle(false);
1410  auto addHandle = handle.get_spadd_handle();
1411 #ifdef HAVE_TPETRA_MMM_TIMINGS
1412  MM = Teuchos::null;
1413  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
1414 #endif
1415  auto nrows = Arowptrs.extent(0) - 1;
1416  Crowptrs = row_ptrs_array("C row ptrs", nrows + 1);
1417  KokkosSparse::Experimental::spadd_symbolic
1418  <KKH, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, typename row_ptrs_array::const_type, typename global_col_inds_array::const_type, row_ptrs_array, global_col_inds_array>
1419  (&handle, Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
1420  Cvals = values_array("C values", addHandle->get_max_result_nnz());
1421  Ccolinds = global_col_inds_array("C colinds", addHandle->get_max_result_nnz());
1422 #ifdef HAVE_TPETRA_MMM_TIMINGS
1423  MM = Teuchos::null;
1424  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
1425 #endif
1426  KokkosSparse::Experimental::spadd_numeric(&handle,
1427  Arowptrs, AcolindsConverted, Avals, scalarA,
1428  Browptrs, BcolindsConverted, Bvals, scalarB,
1429  Crowptrs, Ccolinds, Cvals);
1430 }
1431 
1432 } //End namespace MatrixMatrix
1433 
1434 namespace MMdetails{
1435 
1436 /*********************************************************************************************************/
1437 // Prints MMM-style statistics on communication done with an Import or Export object
1438 template <class TransferType>
1439 void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1440  if (Transfer.is_null())
1441  return;
1442 
1443  const Distributor & Distor = Transfer->getDistributor();
1444  Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1445 
1446  size_t rows_send = Transfer->getNumExportIDs();
1447  size_t rows_recv = Transfer->getNumRemoteIDs();
1448 
1449  size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1450  size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1451  size_t num_send_neighbors = Distor.getNumSends();
1452  size_t num_recv_neighbors = Distor.getNumReceives();
1453  size_t round2_send, round2_recv;
1454  Distor.getLastDoStatistics(round2_send,round2_recv);
1455 
1456  int myPID = Comm->getRank();
1457  int NumProcs = Comm->getSize();
1458 
1459  // Processor by processor statistics
1460  // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1461  // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1462 
1463  // Global statistics
1464  size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1465  size_t gstats_min[8], gstats_max[8];
1466 
1467  double lstats_avg[8], gstats_avg[8];
1468  for(int i=0; i<8; i++)
1469  lstats_avg[i] = ((double)lstats[i])/NumProcs;
1470 
1471  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1472  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1473  Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1474 
1475  if(!myPID) {
1476  printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1477  (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1478  (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1479  printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1480  (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1481  (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1482  }
1483 }
1484 
1485 // Kernel method for computing the local portion of C = A*B
1486 template<class Scalar,
1487  class LocalOrdinal,
1488  class GlobalOrdinal,
1489  class Node>
1490 void mult_AT_B_newmatrix(
1491  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1492  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1493  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1494  const std::string & label,
1495  const Teuchos::RCP<Teuchos::ParameterList>& params)
1496 {
1497  using Teuchos::RCP;
1498  using Teuchos::rcp;
1499  typedef Scalar SC;
1500  typedef LocalOrdinal LO;
1501  typedef GlobalOrdinal GO;
1502  typedef Node NO;
1503  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1504  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1505 
1506 #ifdef HAVE_TPETRA_MMM_TIMINGS
1507  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1508  using Teuchos::TimeMonitor;
1509  RCP<TimeMonitor> MM = rcp (new TimeMonitor
1510  (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1511 #endif
1512 
1513  /*************************************************************/
1514  /* 1) Local Transpose of A */
1515  /*************************************************************/
1516  transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1517 
1518  using Teuchos::ParameterList;
1519  RCP<ParameterList> transposeParams (new ParameterList);
1520  transposeParams->set ("sort", false);
1521  if(! params.is_null ()) {
1522  transposeParams->set ("compute global constants",
1523  params->get ("compute global constants: temporaries",
1524  false));
1525  }
1526  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1527  transposer.createTransposeLocal (transposeParams);
1528 
1529  /*************************************************************/
1530  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1531  /*************************************************************/
1532 #ifdef HAVE_TPETRA_MMM_TIMINGS
1533  MM = Teuchos::null;
1534  MM = rcp (new TimeMonitor
1535  (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1536 #endif
1537 
1538  // Get views, asserting that no import is required to speed up computation
1539  crs_matrix_struct_type Aview;
1540  crs_matrix_struct_type Bview;
1541  RCP<const Import<LO, GO, NO> > dummyImporter;
1542 
1543  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1544  RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1545  if (! params.is_null ()) {
1546  importParams1->set ("compute global constants",
1547  params->get ("compute global constants: temporaries",
1548  false));
1549  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1550  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1551  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1552  int mm_optimization_core_count =
1554  mm_optimization_core_count =
1555  slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1556  int mm_optimization_core_count2 =
1557  params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1558  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1559  mm_optimization_core_count = mm_optimization_core_count2;
1560  }
1561  auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1562  sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1563  sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1564  sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1565  }
1566 
1567  MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1568  Aview, dummyImporter, true,
1569  label, importParams1);
1570 
1571  RCP<ParameterList> importParams2 (new ParameterList);
1572  if (! params.is_null ()) {
1573  importParams2->set ("compute global constants",
1574  params->get ("compute global constants: temporaries",
1575  false));
1576  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1577  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1578  bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1579  int mm_optimization_core_count =
1581  mm_optimization_core_count =
1582  slist.get ("MM_TAFC_OptimizationCoreCount",
1583  mm_optimization_core_count);
1584  int mm_optimization_core_count2 =
1585  params->get ("MM_TAFC_OptimizationCoreCount",
1586  mm_optimization_core_count);
1587  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1588  mm_optimization_core_count = mm_optimization_core_count2;
1589  }
1590  auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1591  sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1592  sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1593  sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1594  }
1595 
1596  if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1597  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1598  }
1599  else {
1600  MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1601  }
1602 
1603 #ifdef HAVE_TPETRA_MMM_TIMINGS
1604  MM = Teuchos::null;
1605  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1606 #endif
1607 
1608  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1609 
1610  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1611  bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1612  if (needs_final_export) {
1613  Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1614  }
1615  else {
1616  Ctemp = rcp (&C, false);
1617  }
1618 
1619  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1620 
1621  /*************************************************************/
1622  /* 4) exportAndFillComplete matrix */
1623  /*************************************************************/
1624 #ifdef HAVE_TPETRA_MMM_TIMINGS
1625  MM = Teuchos::null;
1626  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1627 #endif
1628 
1629  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1630 
1631  if (needs_final_export) {
1632  ParameterList labelList;
1633  labelList.set("Timer Label", label);
1634  if(!params.is_null()) {
1635  ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1636  ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1637  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1638  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1639  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1640  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1641  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1642  bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1643  bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1644 
1645  labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1646  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1647  labelList.set("compute global constants",params->get("compute global constants",true));
1648  labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1649  }
1650  Ctemp->exportAndFillComplete (Crcp,
1651  *Ctemp->getGraph ()->getExporter (),
1652  B.getDomainMap (),
1653  A.getDomainMap (),
1654  rcp (&labelList, false));
1655  }
1656 #ifdef HAVE_TPETRA_MMM_STATISTICS
1657  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1658 #endif
1659 }
1660 
1661 /*********************************************************************************************************/
1662 // Kernel method for computing the local portion of C = A*B
1663 template<class Scalar,
1664  class LocalOrdinal,
1665  class GlobalOrdinal,
1666  class Node>
1667 void mult_A_B(
1668  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1669  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1670  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1671  const std::string& /* label */,
1672  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1673 {
1674  using Teuchos::Array;
1675  using Teuchos::ArrayRCP;
1676  using Teuchos::ArrayView;
1677  using Teuchos::OrdinalTraits;
1678  using Teuchos::null;
1679 
1680  typedef Teuchos::ScalarTraits<Scalar> STS;
1681  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1682  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1683  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1684 
1685  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1686  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1687 
1688  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getNodeElementList();
1689  ArrayView<const GlobalOrdinal> bcols_import = null;
1690  if (Bview.importColMap != null) {
1691  C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1692  C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1693 
1694  bcols_import = Bview.importColMap->getNodeElementList();
1695  }
1696 
1697  size_t C_numCols = C_lastCol - C_firstCol +
1698  OrdinalTraits<LocalOrdinal>::one();
1699  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1700  OrdinalTraits<LocalOrdinal>::one();
1701 
1702  if (C_numCols_import > C_numCols)
1703  C_numCols = C_numCols_import;
1704 
1705  Array<Scalar> dwork = Array<Scalar>(C_numCols);
1706  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1707  Array<size_t> iwork2 = Array<size_t>(C_numCols);
1708 
1709  Array<Scalar> C_row_i = dwork;
1710  Array<GlobalOrdinal> C_cols = iwork;
1711  Array<size_t> c_index = iwork2;
1712  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1713  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1714 
1715  size_t C_row_i_length, j, k, last_index;
1716 
1717  // Run through all the hash table lookups once and for all
1718  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1719  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1720  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getNodeNumElements(),LO_INVALID);
1721  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1722  // Maps are the same: Use local IDs as the hash
1723  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1724  Aview.colMap->getMaxLocalIndex(); i++)
1725  Acol2Brow[i]=i;
1726  }
1727  else {
1728  // Maps are not the same: Use the map's hash
1729  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1730  Aview.colMap->getMaxLocalIndex(); i++) {
1731  GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1732  LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1733  if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1734  else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1735  }
1736  }
1737 
1738  // To form C = A*B we're going to execute this expression:
1739  //
1740  // C(i,j) = sum_k( A(i,k)*B(k,j) )
1741  //
1742  // Our goal, of course, is to navigate the data in A and B once, without
1743  // performing searches for column-indices, etc.
1744  ArrayRCP<const size_t> Arowptr_RCP, Browptr_RCP, Irowptr_RCP;
1745  ArrayRCP<const LocalOrdinal> Acolind_RCP, Bcolind_RCP, Icolind_RCP;
1746  ArrayRCP<const Scalar> Avals_RCP, Bvals_RCP, Ivals_RCP;
1747  ArrayView<const size_t> Arowptr, Browptr, Irowptr;
1748  ArrayView<const LocalOrdinal> Acolind, Bcolind, Icolind;
1749  ArrayView<const Scalar> Avals, Bvals, Ivals;
1750  Aview.origMatrix->getAllValues(Arowptr_RCP,Acolind_RCP,Avals_RCP);
1751  Bview.origMatrix->getAllValues(Browptr_RCP,Bcolind_RCP,Bvals_RCP);
1752  Arowptr = Arowptr_RCP(); Acolind = Acolind_RCP(); Avals = Avals_RCP();
1753  Browptr = Browptr_RCP(); Bcolind = Bcolind_RCP(); Bvals = Bvals_RCP();
1754  if(!Bview.importMatrix.is_null()) {
1755  Bview.importMatrix->getAllValues(Irowptr_RCP,Icolind_RCP,Ivals_RCP);
1756  Irowptr = Irowptr_RCP(); Icolind = Icolind_RCP(); Ivals = Ivals_RCP();
1757  }
1758 
1759  bool C_filled = C.isFillComplete();
1760 
1761  for (size_t i = 0; i < C_numCols; i++)
1762  c_index[i] = OrdinalTraits<size_t>::invalid();
1763 
1764  // Loop over the rows of A.
1765  size_t Arows = Aview.rowMap->getNodeNumElements();
1766  for(size_t i=0; i<Arows; ++i) {
1767 
1768  // Only navigate the local portion of Aview... which is, thankfully, all of
1769  // A since this routine doesn't do transpose modes
1770  GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1771 
1772  // Loop across the i-th row of A and for each corresponding row in B, loop
1773  // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1774  // quantities C_row_i. In other words, as we stride across B(k,:) we're
1775  // calculating updates for row i of the result matrix C.
1776  C_row_i_length = OrdinalTraits<size_t>::zero();
1777 
1778  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1779  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1780  const Scalar Aval = Avals[k];
1781  if (Aval == STS::zero())
1782  continue;
1783 
1784  if (Ak == LO_INVALID)
1785  continue;
1786 
1787  for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1788  LocalOrdinal col = Bcolind[j];
1789  //assert(col >= 0 && col < C_numCols);
1790 
1791  if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1792  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1793  // This has to be a += so insertGlobalValue goes out
1794  C_row_i[C_row_i_length] = Aval*Bvals[j];
1795  C_cols[C_row_i_length] = col;
1796  c_index[col] = C_row_i_length;
1797  C_row_i_length++;
1798 
1799  } else {
1800  C_row_i[c_index[col]] += Aval*Bvals[j];
1801  }
1802  }
1803  }
1804 
1805  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1806  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1807  C_cols[ii] = bcols[C_cols[ii]];
1808  combined_index[ii] = C_cols[ii];
1809  combined_values[ii] = C_row_i[ii];
1810  }
1811  last_index = C_row_i_length;
1812 
1813  //
1814  //Now put the C_row_i values into C.
1815  //
1816  // We might have to revamp this later.
1817  C_row_i_length = OrdinalTraits<size_t>::zero();
1818 
1819  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1820  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1821  const Scalar Aval = Avals[k];
1822  if (Aval == STS::zero())
1823  continue;
1824 
1825  if (Ak!=LO_INVALID) continue;
1826 
1827  Ak = Acol2Irow[Acolind[k]];
1828  for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1829  LocalOrdinal col = Icolind[j];
1830  //assert(col >= 0 && col < C_numCols);
1831 
1832  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1833  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1834  // This has to be a += so insertGlobalValue goes out
1835  C_row_i[C_row_i_length] = Aval*Ivals[j];
1836  C_cols[C_row_i_length] = col;
1837  c_index[col] = C_row_i_length;
1838  C_row_i_length++;
1839 
1840  } else {
1841  // This has to be a += so insertGlobalValue goes out
1842  C_row_i[c_index[col]] += Aval*Ivals[j];
1843  }
1844  }
1845  }
1846 
1847  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1848  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1849  C_cols[ii] = bcols_import[C_cols[ii]];
1850  combined_index[last_index] = C_cols[ii];
1851  combined_values[last_index] = C_row_i[ii];
1852  last_index++;
1853  }
1854 
1855  // Now put the C_row_i values into C.
1856  // We might have to revamp this later.
1857  C_filled ?
1858  C.sumIntoGlobalValues(
1859  global_row,
1860  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1861  combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1862  :
1863  C.insertGlobalValues(
1864  global_row,
1865  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1866  combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1867 
1868  }
1869 }
1870 
1871 /*********************************************************************************************************/
1872 template<class Scalar,
1873  class LocalOrdinal,
1874  class GlobalOrdinal,
1875  class Node>
1876 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1877  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1878  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1879 
1880  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1881  Mview.maxNumRowEntries = Mview.indices[0].size();
1882 
1883  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1884  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1885  Mview.maxNumRowEntries = Mview.indices[i].size();
1886  }
1887 }
1888 
1889 /*********************************************************************************************************/
1890 template<class CrsMatrixType>
1891 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1892  // Follows the NZ estimate in ML's ml_matmatmult.c
1893  size_t Aest = 100, Best=100;
1894  if (A.getNodeNumEntries() > 0)
1895  Aest = (A.getNodeNumRows() > 0)? A.getNodeNumEntries()/A.getNodeNumRows() : 100;
1896  if (B.getNodeNumEntries() > 0)
1897  Best = (B.getNodeNumRows() > 0) ? B.getNodeNumEntries()/B.getNodeNumRows() : 100;
1898 
1899  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1900  nnzperrow *= nnzperrow;
1901 
1902  return (size_t)(A.getNodeNumRows()*nnzperrow*0.75 + 100);
1903 }
1904 
1905 
1906 /*********************************************************************************************************/
1907 // Kernel method for computing the local portion of C = A*B
1908 //
1909 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1910 // function, so this is probably the function we want to
1911 // thread-parallelize.
1912 template<class Scalar,
1913  class LocalOrdinal,
1914  class GlobalOrdinal,
1915  class Node>
1916 void mult_A_B_newmatrix(
1917  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1918  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1919  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1920  const std::string& label,
1921  const Teuchos::RCP<Teuchos::ParameterList>& params)
1922 {
1923  using Teuchos::Array;
1924  using Teuchos::ArrayRCP;
1925  using Teuchos::ArrayView;
1926  using Teuchos::RCP;
1927  using Teuchos::rcp;
1928 
1929  // Tpetra typedefs
1930  typedef LocalOrdinal LO;
1931  typedef GlobalOrdinal GO;
1932  typedef Node NO;
1933  typedef Import<LO,GO,NO> import_type;
1934  typedef Map<LO,GO,NO> map_type;
1935 
1936  // Kokkos typedefs
1937  typedef typename map_type::local_map_type local_map_type;
1939  typedef typename KCRS::StaticCrsGraphType graph_t;
1940  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1941  typedef typename NO::execution_space execution_space;
1942  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1943  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1944 
1945 #ifdef HAVE_TPETRA_MMM_TIMINGS
1946  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1947  using Teuchos::TimeMonitor;
1948  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1949 #endif
1950  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1951 
1952  // Build the final importer / column map, hash table lookups for C
1953  RCP<const import_type> Cimport;
1954  RCP<const map_type> Ccolmap;
1955  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1956  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1957  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1958  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1959  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1960  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1961  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1962  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1963 
1964  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1965  // indices of B, to local column indices of C. (B and C have the
1966  // same number of columns.) The kernel uses this, instead of
1967  // copying the entire input matrix B and converting its column
1968  // indices to those of C.
1969  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
1970 
1971  if (Bview.importMatrix.is_null()) {
1972  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1973  Cimport = Bimport;
1974  Ccolmap = Bview.colMap;
1975  const LO colMapSize = static_cast<LO>(Bview.colMap->getNodeNumElements());
1976  // Bcol2Ccol is trivial
1977  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1978  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1979  KOKKOS_LAMBDA(const LO i) {
1980  Bcol2Ccol(i) = i;
1981  });
1982  }
1983  else {
1984  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1985  // column Map of C, as well as C's Import object (from its domain
1986  // Map to its column Map). C's column Map is the union of the
1987  // column Maps of (the local part of) B, and the "remote" part of
1988  // B. Ditto for the Import. We have optimized this "setUnion"
1989  // operation on Import objects and Maps.
1990 
1991  // Choose the right variant of setUnion
1992  if (!Bimport.is_null() && !Iimport.is_null()) {
1993  Cimport = Bimport->setUnion(*Iimport);
1994  }
1995  else if (!Bimport.is_null() && Iimport.is_null()) {
1996  Cimport = Bimport->setUnion();
1997  }
1998  else if (Bimport.is_null() && !Iimport.is_null()) {
1999  Cimport = Iimport->setUnion();
2000  }
2001  else {
2002  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
2003  }
2004  Ccolmap = Cimport->getTargetMap();
2005 
2006  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
2007  // in general. We should get rid of it in order to reduce
2008  // communication costs of sparse matrix-matrix multiply.
2009  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2010  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2011 
2012  // NOTE: This is not efficient and should be folded into setUnion
2013  //
2014  // mfh 27 Sep 2016: What the above comment means, is that the
2015  // setUnion operation on Import objects could also compute these
2016  // local index - to - local index look-up tables.
2017  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2018  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2019  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2020  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2021  });
2022  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2023  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2024  });
2025 
2026  }
2027 
2028  // Replace the column map
2029  //
2030  // mfh 27 Sep 2016: We do this because C was originally created
2031  // without a column Map. Now we have its column Map.
2032  C.replaceColMap(Ccolmap);
2033 
2034  // mfh 27 Sep 2016: Construct tables that map from local column
2035  // indices of A, to local row indices of either B_local (the locally
2036  // owned part of B), or B_remote (the "imported" remote part of B).
2037  //
2038  // For column index Aik in row i of A, if the corresponding row of B
2039  // exists in the local part of B ("orig") (which I'll call B_local),
2040  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2041  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2042  //
2043  // For column index Aik in row i of A, if the corresponding row of B
2044  // exists in the remote part of B ("Import") (which I'll call
2045  // B_remote), then targetMapToImportRow[Aik] is the local index of
2046  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2047  // (a flag value).
2048 
2049  // Run through all the hash table lookups once and for all
2050  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2051  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2052  Kokkos::fence();
2053  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2054  GO aidx = Acolmap_local.getGlobalElement(i);
2055  LO B_LID = Browmap_local.getLocalElement(aidx);
2056  if (B_LID != LO_INVALID) {
2057  targetMapToOrigRow(i) = B_LID;
2058  targetMapToImportRow(i) = LO_INVALID;
2059  } else {
2060  LO I_LID = Irowmap_local.getLocalElement(aidx);
2061  targetMapToOrigRow(i) = LO_INVALID;
2062  targetMapToImportRow(i) = I_LID;
2063 
2064  }
2065  });
2066 
2067  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2068  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2069  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2070 
2071 }
2072 
2073 
2074 /*********************************************************************************************************/
2075 // AB NewMatrix Kernel wrappers (Default non-threaded version)
2076 template<class Scalar,
2077  class LocalOrdinal,
2078  class GlobalOrdinal,
2079  class Node,
2080  class LocalOrdinalViewType>
2081 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2082  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2083  const LocalOrdinalViewType & targetMapToOrigRow,
2084  const LocalOrdinalViewType & targetMapToImportRow,
2085  const LocalOrdinalViewType & Bcol2Ccol,
2086  const LocalOrdinalViewType & Icol2Ccol,
2087  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2088  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2089  const std::string& label,
2090  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2091 #ifdef HAVE_TPETRA_MMM_TIMINGS
2092  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2093  using Teuchos::TimeMonitor;
2094  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
2095 #endif
2096 
2097  using Teuchos::Array;
2098  using Teuchos::ArrayRCP;
2099  using Teuchos::ArrayView;
2100  using Teuchos::RCP;
2101  using Teuchos::rcp;
2102 
2103  // Lots and lots of typedefs
2105  typedef typename KCRS::StaticCrsGraphType graph_t;
2106  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2107  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2108  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2109  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2110 
2111  typedef Scalar SC;
2112  typedef LocalOrdinal LO;
2113  typedef GlobalOrdinal GO;
2114  typedef Node NO;
2115  typedef Map<LO,GO,NO> map_type;
2116  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2117  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2118  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2119 
2120  // Sizes
2121  RCP<const map_type> Ccolmap = C.getColMap();
2122  size_t m = Aview.origMatrix->getNodeNumRows();
2123  size_t n = Ccolmap->getNodeNumElements();
2124  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2125 
2126  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2127  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2128  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2129 
2130  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2131  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2132  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2133 
2134  c_lno_view_t Irowptr;
2135  lno_nnz_view_t Icolind;
2136  scalar_view_t Ivals;
2137  if(!Bview.importMatrix.is_null()) {
2138  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2139  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2140  Ivals = Bview.importMatrix->getLocalMatrix().values;
2141  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2142  }
2143 
2144 #ifdef HAVE_TPETRA_MMM_TIMINGS
2145  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2146 #endif
2147 
2148  // Classic csr assembly (low memory edition)
2149  //
2150  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2151  // The method loops over rows of A, and may resize after processing
2152  // each row. Chris Siefert says that this reflects experience in
2153  // ML; for the non-threaded case, ML found it faster to spend less
2154  // effort on estimation and risk an occasional reallocation.
2155  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2156  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2157  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2158  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2159 
2160  // mfh 27 Sep 2016: The c_status array is an implementation detail
2161  // of the local sparse matrix-matrix multiply routine.
2162 
2163  // The status array will contain the index into colind where this entry was last deposited.
2164  // c_status[i] < CSR_ip - not in the row yet
2165  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2166  // We start with this filled with INVALID's indicating that there are no entries yet.
2167  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2168  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2169  std::vector<size_t> c_status(n, ST_INVALID);
2170 
2171  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2172  // routine. The routine computes C := A * (B_local + B_remote).
2173  //
2174  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2175  // you whether the corresponding row of B belongs to B_local
2176  // ("orig") or B_remote ("Import").
2177 
2178  // For each row of A/C
2179  size_t CSR_ip = 0, OLD_ip = 0;
2180  for (size_t i = 0; i < m; i++) {
2181  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2182  // on the calling process.
2183  Crowptr[i] = CSR_ip;
2184 
2185  // mfh 27 Sep 2016: For each entry of A in the current row of A
2186  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2187  LO Aik = Acolind[k]; // local column index of current entry of A
2188  const SC Aval = Avals[k]; // value of current entry of A
2189  if (Aval == SC_ZERO)
2190  continue; // skip explicitly stored zero values in A
2191 
2192  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2193  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2194  // corresponding to the current entry of A is populated, then
2195  // the corresponding row of B is in B_local (i.e., it lives on
2196  // the calling process).
2197 
2198  // Local matrix
2199  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2200 
2201  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
2202  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2203  LO Bkj = Bcolind[j];
2204  LO Cij = Bcol2Ccol[Bkj];
2205 
2206  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2207  // New entry
2208  c_status[Cij] = CSR_ip;
2209  Ccolind[CSR_ip] = Cij;
2210  Cvals[CSR_ip] = Aval*Bvals[j];
2211  CSR_ip++;
2212 
2213  } else {
2214  Cvals[c_status[Cij]] += Aval*Bvals[j];
2215  }
2216  }
2217 
2218  } else {
2219  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2220  // corresponding to the current entry of A NOT populated (has
2221  // a flag "invalid" value), then the corresponding row of B is
2222  // in B_local (i.e., it lives on the calling process).
2223 
2224  // Remote matrix
2225  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2226  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2227  LO Ikj = Icolind[j];
2228  LO Cij = Icol2Ccol[Ikj];
2229 
2230  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
2231  // New entry
2232  c_status[Cij] = CSR_ip;
2233  Ccolind[CSR_ip] = Cij;
2234  Cvals[CSR_ip] = Aval*Ivals[j];
2235  CSR_ip++;
2236  } else {
2237  Cvals[c_status[Cij]] += Aval*Ivals[j];
2238  }
2239  }
2240  }
2241  }
2242 
2243  // Resize for next pass if needed
2244  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
2245  CSR_alloc *= 2;
2246  Kokkos::resize(Ccolind,CSR_alloc);
2247  Kokkos::resize(Cvals,CSR_alloc);
2248  }
2249  OLD_ip = CSR_ip;
2250  }
2251 
2252  Crowptr[m] = CSR_ip;
2253 
2254  // Downward resize
2255  Kokkos::resize(Ccolind,CSR_ip);
2256  Kokkos::resize(Cvals,CSR_ip);
2257 
2258 #ifdef HAVE_TPETRA_MMM_TIMINGS
2259  {
2260  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
2261 #endif
2262 
2263  // Final sort & set of CRS arrays
2264  if (params.is_null() || params->get("sort entries",true))
2265  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2266  C.setAllValues(Crowptr,Ccolind, Cvals);
2267 
2268 
2269 #ifdef HAVE_TPETRA_MMM_TIMINGS
2270  }
2271  auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
2272  {
2273 #endif
2274 
2275  // Final FillComplete
2276  //
2277  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2278  // Import (from domain Map to column Map) construction (which costs
2279  // lots of communication) by taking the previously constructed
2280  // Import object. We should be able to do this without interfering
2281  // with the implementation of the local part of sparse matrix-matrix
2282  // multply above.
2283  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2284  labelList->set("Timer Label",label);
2285  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2286  RCP<const Export<LO,GO,NO> > dummyExport;
2287  C.expertStaticFillComplete(Bview. origMatrix->getDomainMap(), Aview. origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2288 #ifdef HAVE_TPETRA_MMM_TIMINGS
2289  }
2290  MM2 = Teuchos::null;
2291 #endif
2292 
2293 }
2294 
2295 
2296 
2297 
2298 /*********************************************************************************************************/
2299 // Kernel method for computing the local portion of C = A*B (reuse)
2300 template<class Scalar,
2301  class LocalOrdinal,
2302  class GlobalOrdinal,
2303  class Node>
2304 void mult_A_B_reuse(
2305  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2306  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2307  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2308  const std::string& label,
2309  const Teuchos::RCP<Teuchos::ParameterList>& params)
2310 {
2311  using Teuchos::Array;
2312  using Teuchos::ArrayRCP;
2313  using Teuchos::ArrayView;
2314  using Teuchos::RCP;
2315  using Teuchos::rcp;
2316 
2317  // Tpetra typedefs
2318  typedef LocalOrdinal LO;
2319  typedef GlobalOrdinal GO;
2320  typedef Node NO;
2321  typedef Import<LO,GO,NO> import_type;
2322  typedef Map<LO,GO,NO> map_type;
2323 
2324  // Kokkos typedefs
2325  typedef typename map_type::local_map_type local_map_type;
2327  typedef typename KCRS::StaticCrsGraphType graph_t;
2328  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2329  typedef typename NO::execution_space execution_space;
2330  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2331  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2332 
2333 #ifdef HAVE_TPETRA_MMM_TIMINGS
2334  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2335  using Teuchos::TimeMonitor;
2336  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
2337 #endif
2338  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2339 
2340  // Grab all the maps
2341  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2342  RCP<const map_type> Ccolmap = C.getColMap();
2343  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2344  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2345  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2346  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2347  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2348  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2349 
2350  // Build the final importer / column map, hash table lookups for C
2351  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2352  {
2353  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2354  // So, column map of C may be a strict subset of the column map of B
2355  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2356  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2357  });
2358 
2359  if (!Bview.importMatrix.is_null()) {
2360  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2361  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2362 
2363  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2364  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2365  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2366  });
2367  }
2368  }
2369 
2370  // Run through all the hash table lookups once and for all
2371  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2372  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2373  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2374  GO aidx = Acolmap_local.getGlobalElement(i);
2375  LO B_LID = Browmap_local.getLocalElement(aidx);
2376  if (B_LID != LO_INVALID) {
2377  targetMapToOrigRow(i) = B_LID;
2378  targetMapToImportRow(i) = LO_INVALID;
2379  } else {
2380  LO I_LID = Irowmap_local.getLocalElement(aidx);
2381  targetMapToOrigRow(i) = LO_INVALID;
2382  targetMapToImportRow(i) = I_LID;
2383 
2384  }
2385  });
2386 
2387  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2388  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2389  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2390 }
2391 
2392 /*********************************************************************************************************/
2393 template<class Scalar,
2394  class LocalOrdinal,
2395  class GlobalOrdinal,
2396  class Node,
2397  class LocalOrdinalViewType>
2398 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2399  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2400  const LocalOrdinalViewType & targetMapToOrigRow,
2401  const LocalOrdinalViewType & targetMapToImportRow,
2402  const LocalOrdinalViewType & Bcol2Ccol,
2403  const LocalOrdinalViewType & Icol2Ccol,
2404  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2405  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2406  const std::string& label,
2407  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2408 #ifdef HAVE_TPETRA_MMM_TIMINGS
2409  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2410  using Teuchos::TimeMonitor;
2411  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2412  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2413 #else
2414  (void)label;
2415 #endif
2416  using Teuchos::RCP;
2417  using Teuchos::rcp;
2418 
2419 
2420  // Lots and lots of typedefs
2422  typedef typename KCRS::StaticCrsGraphType graph_t;
2423  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2424  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2425  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2426 
2427  typedef Scalar SC;
2428  typedef LocalOrdinal LO;
2429  typedef GlobalOrdinal GO;
2430  typedef Node NO;
2431  typedef Map<LO,GO,NO> map_type;
2432  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2433  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2434  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2435 
2436  // Sizes
2437  RCP<const map_type> Ccolmap = C.getColMap();
2438  size_t m = Aview.origMatrix->getNodeNumRows();
2439  size_t n = Ccolmap->getNodeNumElements();
2440 
2441  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2442  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2443  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2444  const KCRS & Cmat = C.getLocalMatrix();
2445 
2446  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2447  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2448  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2449  scalar_view_t Cvals = Cmat.values;
2450 
2451  c_lno_view_t Irowptr;
2452  lno_nnz_view_t Icolind;
2453  scalar_view_t Ivals;
2454  if(!Bview.importMatrix.is_null()) {
2455  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2456  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2457  Ivals = Bview.importMatrix->getLocalMatrix().values;
2458  }
2459 
2460 #ifdef HAVE_TPETRA_MMM_TIMINGS
2461  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2462 #endif
2463 
2464  // Classic csr assembly (low memory edition)
2465  // mfh 27 Sep 2016: The c_status array is an implementation detail
2466  // of the local sparse matrix-matrix multiply routine.
2467 
2468  // The status array will contain the index into colind where this entry was last deposited.
2469  // c_status[i] < CSR_ip - not in the row yet
2470  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2471  // We start with this filled with INVALID's indicating that there are no entries yet.
2472  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2473  std::vector<size_t> c_status(n, ST_INVALID);
2474 
2475  // For each row of A/C
2476  size_t CSR_ip = 0, OLD_ip = 0;
2477  for (size_t i = 0; i < m; i++) {
2478  // First fill the c_status array w/ locations where we're allowed to
2479  // generate nonzeros for this row
2480  OLD_ip = Crowptr[i];
2481  CSR_ip = Crowptr[i+1];
2482  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2483  c_status[Ccolind[k]] = k;
2484 
2485  // Reset values in the row of C
2486  Cvals[k] = SC_ZERO;
2487  }
2488 
2489  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2490  LO Aik = Acolind[k];
2491  const SC Aval = Avals[k];
2492  if (Aval == SC_ZERO)
2493  continue;
2494 
2495  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2496  // Local matrix
2497  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2498 
2499  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2500  LO Bkj = Bcolind[j];
2501  LO Cij = Bcol2Ccol[Bkj];
2502 
2503  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2504  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2505  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2506 
2507  Cvals[c_status[Cij]] += Aval * Bvals[j];
2508  }
2509 
2510  } else {
2511  // Remote matrix
2512  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2513  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2514  LO Ikj = Icolind[j];
2515  LO Cij = Icol2Ccol[Ikj];
2516 
2517  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2518  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2519  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2520 
2521  Cvals[c_status[Cij]] += Aval * Ivals[j];
2522  }
2523  }
2524  }
2525  }
2526 
2527 #ifdef HAVE_TPETRA_MMM_TIMINGS
2528  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2529 #endif
2530 
2531  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2532 }
2533 
2534 
2535 /*********************************************************************************************************/
2536 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2537 template<class Scalar,
2538  class LocalOrdinal,
2539  class GlobalOrdinal,
2540  class Node>
2541 void jacobi_A_B_newmatrix(
2542  Scalar omega,
2543  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2544  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2545  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2546  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2547  const std::string& label,
2548  const Teuchos::RCP<Teuchos::ParameterList>& params)
2549 {
2550  using Teuchos::Array;
2551  using Teuchos::ArrayRCP;
2552  using Teuchos::ArrayView;
2553  using Teuchos::RCP;
2554  using Teuchos::rcp;
2555  // typedef Scalar SC;
2556  typedef LocalOrdinal LO;
2557  typedef GlobalOrdinal GO;
2558  typedef Node NO;
2559 
2560  typedef Import<LO,GO,NO> import_type;
2561  typedef Map<LO,GO,NO> map_type;
2562  typedef typename map_type::local_map_type local_map_type;
2563 
2564  // All of the Kokkos typedefs
2566  typedef typename KCRS::StaticCrsGraphType graph_t;
2567  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2568  typedef typename NO::execution_space execution_space;
2569  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2570  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2571 
2572 
2573 #ifdef HAVE_TPETRA_MMM_TIMINGS
2574  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2575  using Teuchos::TimeMonitor;
2576  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2577 #endif
2578  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2579 
2580  // Build the final importer / column map, hash table lookups for C
2581  RCP<const import_type> Cimport;
2582  RCP<const map_type> Ccolmap;
2583  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2584  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2585  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2586  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2587  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2588  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2589  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2590  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2591 
2592  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2593  // indices of B, to local column indices of C. (B and C have the
2594  // same number of columns.) The kernel uses this, instead of
2595  // copying the entire input matrix B and converting its column
2596  // indices to those of C.
2597  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2598 
2599  if (Bview.importMatrix.is_null()) {
2600  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2601  Cimport = Bimport;
2602  Ccolmap = Bview.colMap;
2603  // Bcol2Ccol is trivial
2604  // Bcol2Ccol is trivial
2605 
2606  Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getNodeNumElements ()));
2607  Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2608  Bcol2Ccol(i) = static_cast<LO> (i);
2609  });
2610  } else {
2611  // mfh 27 Sep 2016: B has "remotes," so we need to build the
2612  // column Map of C, as well as C's Import object (from its domain
2613  // Map to its column Map). C's column Map is the union of the
2614  // column Maps of (the local part of) B, and the "remote" part of
2615  // B. Ditto for the Import. We have optimized this "setUnion"
2616  // operation on Import objects and Maps.
2617 
2618  // Choose the right variant of setUnion
2619  if (!Bimport.is_null() && !Iimport.is_null()){
2620  Cimport = Bimport->setUnion(*Iimport);
2621  Ccolmap = Cimport->getTargetMap();
2622 
2623  } else if (!Bimport.is_null() && Iimport.is_null()) {
2624  Cimport = Bimport->setUnion();
2625 
2626  } else if(Bimport.is_null() && !Iimport.is_null()) {
2627  Cimport = Iimport->setUnion();
2628 
2629  } else
2630  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2631 
2632  Ccolmap = Cimport->getTargetMap();
2633 
2634  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2635  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2636 
2637  // NOTE: This is not efficient and should be folded into setUnion
2638  //
2639  // mfh 27 Sep 2016: What the above comment means, is that the
2640  // setUnion operation on Import objects could also compute these
2641  // local index - to - local index look-up tables.
2642  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
2643  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2644  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2645  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2646  });
2647  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
2648  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2649  });
2650 
2651  }
2652 
2653  // Replace the column map
2654  //
2655  // mfh 27 Sep 2016: We do this because C was originally created
2656  // without a column Map. Now we have its column Map.
2657  C.replaceColMap(Ccolmap);
2658 
2659  // mfh 27 Sep 2016: Construct tables that map from local column
2660  // indices of A, to local row indices of either B_local (the locally
2661  // owned part of B), or B_remote (the "imported" remote part of B).
2662  //
2663  // For column index Aik in row i of A, if the corresponding row of B
2664  // exists in the local part of B ("orig") (which I'll call B_local),
2665  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2666  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2667  //
2668  // For column index Aik in row i of A, if the corresponding row of B
2669  // exists in the remote part of B ("Import") (which I'll call
2670  // B_remote), then targetMapToImportRow[Aik] is the local index of
2671  // that row of B. Otherwise, targetMapToOrigRow[Aik] is "invalid"
2672  // (a flag value).
2673 
2674  // Run through all the hash table lookups once and for all
2675  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
2676  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
2677  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2678  GO aidx = Acolmap_local.getGlobalElement(i);
2679  LO B_LID = Browmap_local.getLocalElement(aidx);
2680  if (B_LID != LO_INVALID) {
2681  targetMapToOrigRow(i) = B_LID;
2682  targetMapToImportRow(i) = LO_INVALID;
2683  } else {
2684  LO I_LID = Irowmap_local.getLocalElement(aidx);
2685  targetMapToOrigRow(i) = LO_INVALID;
2686  targetMapToImportRow(i) = I_LID;
2687 
2688  }
2689  });
2690 
2691  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2692  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2693  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2694 
2695 }
2696 
2697 
2698 /*********************************************************************************************************/
2699 // Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2700 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2701 
2702 template<class Scalar,
2703  class LocalOrdinal,
2704  class GlobalOrdinal,
2705  class Node,
2706  class LocalOrdinalViewType>
2707 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2708  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2709  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2710  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2711  const LocalOrdinalViewType & targetMapToOrigRow,
2712  const LocalOrdinalViewType & targetMapToImportRow,
2713  const LocalOrdinalViewType & Bcol2Ccol,
2714  const LocalOrdinalViewType & Icol2Ccol,
2715  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2716  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2717  const std::string& label,
2718  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2719 
2720 #ifdef HAVE_TPETRA_MMM_TIMINGS
2721  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2722  using Teuchos::TimeMonitor;
2723  auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Nemwmatrix SerialCore")));
2724 #endif
2725 
2726  using Teuchos::Array;
2727  using Teuchos::ArrayRCP;
2728  using Teuchos::ArrayView;
2729  using Teuchos::RCP;
2730  using Teuchos::rcp;
2731 
2732  // Lots and lots of typedefs
2734  typedef typename KCRS::StaticCrsGraphType graph_t;
2735  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2736  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2737  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2738  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2739 
2740  // Jacobi-specific
2741  typedef typename scalar_view_t::memory_space scalar_memory_space;
2742 
2743  typedef Scalar SC;
2744  typedef LocalOrdinal LO;
2745  typedef GlobalOrdinal GO;
2746  typedef Node NO;
2747 
2748  typedef Map<LO,GO,NO> map_type;
2749  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2750  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2751 
2752  // Sizes
2753  RCP<const map_type> Ccolmap = C.getColMap();
2754  size_t m = Aview.origMatrix->getNodeNumRows();
2755  size_t n = Ccolmap->getNodeNumElements();
2756  size_t b_max_nnz_per_row = Bview.origMatrix->getNodeMaxNumRowEntries();
2757 
2758  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2759  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
2760  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
2761 
2762  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2763  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2764  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2765 
2766  c_lno_view_t Irowptr;
2767  lno_nnz_view_t Icolind;
2768  scalar_view_t Ivals;
2769  if(!Bview.importMatrix.is_null()) {
2770  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
2771  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
2772  Ivals = Bview.importMatrix->getLocalMatrix().values;
2773  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getNodeMaxNumRowEntries());
2774  }
2775 
2776  // Jacobi-specific inner stuff
2777  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
2778 
2779  // Teuchos::ArrayView::operator[].
2780  // The status array will contain the index into colind where this entry was last deposited.
2781  // c_status[i] < CSR_ip - not in the row yet.
2782  // c_status[i] >= CSR_ip, this is the entry where you can find the data
2783  // We start with this filled with INVALID's indicating that there are no entries yet.
2784  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2785  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2786  Array<size_t> c_status(n, ST_INVALID);
2787 
2788  // Classic csr assembly (low memory edition)
2789  //
2790  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2791  // The method loops over rows of A, and may resize after processing
2792  // each row. Chris Siefert says that this reflects experience in
2793  // ML; for the non-threaded case, ML found it faster to spend less
2794  // effort on estimation and risk an occasional reallocation.
2795  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2796  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2797  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2798  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2799  size_t CSR_ip = 0, OLD_ip = 0;
2800 
2801  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2802 
2803  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2804  // routine. The routine computes
2805  //
2806  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2807  //
2808  // This corresponds to one sweep of (weighted) Jacobi.
2809  //
2810  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2811  // you whether the corresponding row of B belongs to B_local
2812  // ("orig") or B_remote ("Import").
2813 
2814  // For each row of A/C
2815  for (size_t i = 0; i < m; i++) {
2816  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2817  // on the calling process.
2818  Crowptr[i] = CSR_ip;
2819  SC minusOmegaDval = -omega*Dvals(i,0);
2820 
2821  // Entries of B
2822  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2823  Scalar Bval = Bvals[j];
2824  if (Bval == SC_ZERO)
2825  continue;
2826  LO Bij = Bcolind[j];
2827  LO Cij = Bcol2Ccol[Bij];
2828 
2829  // Assume no repeated entries in B
2830  c_status[Cij] = CSR_ip;
2831  Ccolind[CSR_ip] = Cij;
2832  Cvals[CSR_ip] = Bvals[j];
2833  CSR_ip++;
2834  }
2835 
2836  // Entries of -omega * Dinv * A * B
2837  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2838  LO Aik = Acolind[k];
2839  const SC Aval = Avals[k];
2840  if (Aval == SC_ZERO)
2841  continue;
2842 
2843  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2844  // Local matrix
2845  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2846 
2847  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2848  LO Bkj = Bcolind[j];
2849  LO Cij = Bcol2Ccol[Bkj];
2850 
2851  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2852  // New entry
2853  c_status[Cij] = CSR_ip;
2854  Ccolind[CSR_ip] = Cij;
2855  Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2856  CSR_ip++;
2857 
2858  } else {
2859  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2860  }
2861  }
2862 
2863  } else {
2864  // Remote matrix
2865  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2866  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2867  LO Ikj = Icolind[j];
2868  LO Cij = Icol2Ccol[Ikj];
2869 
2870  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2871  // New entry
2872  c_status[Cij] = CSR_ip;
2873  Ccolind[CSR_ip] = Cij;
2874  Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2875  CSR_ip++;
2876  } else {
2877  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2878  }
2879  }
2880  }
2881  }
2882 
2883  // Resize for next pass if needed
2884  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2885  CSR_alloc *= 2;
2886  Kokkos::resize(Ccolind,CSR_alloc);
2887  Kokkos::resize(Cvals,CSR_alloc);
2888  }
2889  OLD_ip = CSR_ip;
2890  }
2891  Crowptr[m] = CSR_ip;
2892 
2893  // Downward resize
2894  Kokkos::resize(Ccolind,CSR_ip);
2895  Kokkos::resize(Cvals,CSR_ip);
2896 
2897  {
2898 #ifdef HAVE_TPETRA_MMM_TIMINGS
2899  auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2900 #endif
2901 
2902  // Replace the column map
2903  //
2904  // mfh 27 Sep 2016: We do this because C was originally created
2905  // without a column Map. Now we have its column Map.
2906  C.replaceColMap(Ccolmap);
2907 
2908  // Final sort & set of CRS arrays
2909  //
2910  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2911  // matrix-matrix multiply routine sort the entries for us?
2912  // Final sort & set of CRS arrays
2913  if (params.is_null() || params->get("sort entries",true))
2914  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2915  C.setAllValues(Crowptr,Ccolind, Cvals);
2916  }
2917  {
2918 #ifdef HAVE_TPETRA_MMM_TIMINGS
2919  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2920 #endif
2921 
2922  // Final FillComplete
2923  //
2924  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2925  // Import (from domain Map to column Map) construction (which costs
2926  // lots of communication) by taking the previously constructed
2927  // Import object. We should be able to do this without interfering
2928  // with the implementation of the local part of sparse matrix-matrix
2929  // multply above
2930  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2931  labelList->set("Timer Label",label);
2932  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2933  RCP<const Export<LO,GO,NO> > dummyExport;
2934  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2935 
2936  }
2937 }
2938 
2939 
2940 /*********************************************************************************************************/
2941 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2942 template<class Scalar,
2943  class LocalOrdinal,
2944  class GlobalOrdinal,
2945  class Node>
2946 void jacobi_A_B_reuse(
2947  Scalar omega,
2948  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2949  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2950  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2951  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2952  const std::string& label,
2953  const Teuchos::RCP<Teuchos::ParameterList>& params)
2954 {
2955  using Teuchos::Array;
2956  using Teuchos::ArrayRCP;
2957  using Teuchos::ArrayView;
2958  using Teuchos::RCP;
2959  using Teuchos::rcp;
2960 
2961  typedef LocalOrdinal LO;
2962  typedef GlobalOrdinal GO;
2963  typedef Node NO;
2964 
2965  typedef Import<LO,GO,NO> import_type;
2966  typedef Map<LO,GO,NO> map_type;
2967 
2968  // Kokkos typedefs
2969  typedef typename map_type::local_map_type local_map_type;
2971  typedef typename KCRS::StaticCrsGraphType graph_t;
2972  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2973  typedef typename NO::execution_space execution_space;
2974  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2975  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2976 
2977 #ifdef HAVE_TPETRA_MMM_TIMINGS
2978  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2979  using Teuchos::TimeMonitor;
2980  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2981 #endif
2982  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2983 
2984  // Grab all the maps
2985  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2986  RCP<const map_type> Ccolmap = C.getColMap();
2987  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2988  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2989  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2990  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2991  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2992  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2993 
2994  // Build the final importer / column map, hash table lookups for C
2995  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getNodeNumElements()), Icol2Ccol;
2996  {
2997  // Bcol2Col may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2998  // So, column map of C may be a strict subset of the column map of B
2999  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
3000  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
3001  });
3002 
3003  if (!Bview.importMatrix.is_null()) {
3004  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
3005  std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
3006 
3007  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getNodeNumElements());
3008  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getNodeNumElements()),KOKKOS_LAMBDA(const LO i) {
3009  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
3010  });
3011  }
3012  }
3013 
3014  // Run through all the hash table lookups once and for all
3015  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getNodeNumElements());
3016  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getNodeNumElements());
3017  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
3018  GO aidx = Acolmap_local.getGlobalElement(i);
3019  LO B_LID = Browmap_local.getLocalElement(aidx);
3020  if (B_LID != LO_INVALID) {
3021  targetMapToOrigRow(i) = B_LID;
3022  targetMapToImportRow(i) = LO_INVALID;
3023  } else {
3024  LO I_LID = Irowmap_local.getLocalElement(aidx);
3025  targetMapToOrigRow(i) = LO_INVALID;
3026  targetMapToImportRow(i) = I_LID;
3027 
3028  }
3029  });
3030 
3031 #ifdef HAVE_TPETRA_MMM_TIMINGS
3032  MM = Teuchos::null;
3033 #endif
3034 
3035  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
3036  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
3037  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
3038 }
3039 
3040 
3041 
3042 /*********************************************************************************************************/
3043 template<class Scalar,
3044  class LocalOrdinal,
3045  class GlobalOrdinal,
3046  class Node,
3047  class LocalOrdinalViewType>
3048 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
3049  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
3050  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3051  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3052  const LocalOrdinalViewType & targetMapToOrigRow,
3053  const LocalOrdinalViewType & targetMapToImportRow,
3054  const LocalOrdinalViewType & Bcol2Ccol,
3055  const LocalOrdinalViewType & Icol2Ccol,
3056  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
3057  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
3058  const std::string& label,
3059  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
3060 #ifdef HAVE_TPETRA_MMM_TIMINGS
3061  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
3062  using Teuchos::TimeMonitor;
3063  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
3064  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
3065 #else
3066  (void)label;
3067 #endif
3068  using Teuchos::RCP;
3069  using Teuchos::rcp;
3070 
3071  // Lots and lots of typedefs
3073  typedef typename KCRS::StaticCrsGraphType graph_t;
3074  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
3075  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3076  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3077  typedef typename scalar_view_t::memory_space scalar_memory_space;
3078 
3079  typedef Scalar SC;
3080  typedef LocalOrdinal LO;
3081  typedef GlobalOrdinal GO;
3082  typedef Node NO;
3083  typedef Map<LO,GO,NO> map_type;
3084  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3085  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3086  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
3087 
3088  // Sizes
3089  RCP<const map_type> Ccolmap = C.getColMap();
3090  size_t m = Aview.origMatrix->getNodeNumRows();
3091  size_t n = Ccolmap->getNodeNumElements();
3092 
3093  // Grab the Kokkos::SparseCrsMatrices & inner stuff
3094  const KCRS & Amat = Aview.origMatrix->getLocalMatrix();
3095  const KCRS & Bmat = Bview.origMatrix->getLocalMatrix();
3096  const KCRS & Cmat = C.getLocalMatrix();
3097 
3098  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
3099  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
3100  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
3101  scalar_view_t Cvals = Cmat.values;
3102 
3103  c_lno_view_t Irowptr;
3104  lno_nnz_view_t Icolind;
3105  scalar_view_t Ivals;
3106  if(!Bview.importMatrix.is_null()) {
3107  Irowptr = Bview.importMatrix->getLocalMatrix().graph.row_map;
3108  Icolind = Bview.importMatrix->getLocalMatrix().graph.entries;
3109  Ivals = Bview.importMatrix->getLocalMatrix().values;
3110  }
3111 
3112  // Jacobi-specific inner stuff
3113  auto Dvals = Dinv.template getLocalView<scalar_memory_space>();
3114 
3115 #ifdef HAVE_TPETRA_MMM_TIMINGS
3116  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
3117 #endif
3118 
3119  // The status array will contain the index into colind where this entry was last deposited.
3120  // c_status[i] < CSR_ip - not in the row yet
3121  // c_status[i] >= CSR_ip - this is the entry where you can find the data
3122  // We start with this filled with INVALID's indicating that there are no entries yet.
3123  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
3124  std::vector<size_t> c_status(n, ST_INVALID);
3125 
3126  // For each row of A/C
3127  size_t CSR_ip = 0, OLD_ip = 0;
3128  for (size_t i = 0; i < m; i++) {
3129 
3130  // First fill the c_status array w/ locations where we're allowed to
3131  // generate nonzeros for this row
3132  OLD_ip = Crowptr[i];
3133  CSR_ip = Crowptr[i+1];
3134  for (size_t k = OLD_ip; k < CSR_ip; k++) {
3135  c_status[Ccolind[k]] = k;
3136 
3137  // Reset values in the row of C
3138  Cvals[k] = SC_ZERO;
3139  }
3140 
3141  SC minusOmegaDval = -omega*Dvals(i,0);
3142 
3143  // Entries of B
3144  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
3145  Scalar Bval = Bvals[j];
3146  if (Bval == SC_ZERO)
3147  continue;
3148  LO Bij = Bcolind[j];
3149  LO Cij = Bcol2Ccol[Bij];
3150 
3151  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3152  std::runtime_error, "Trying to insert a new entry into a static graph");
3153 
3154  Cvals[c_status[Cij]] = Bvals[j];
3155  }
3156 
3157  // Entries of -omega * Dinv * A * B
3158  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
3159  LO Aik = Acolind[k];
3160  const SC Aval = Avals[k];
3161  if (Aval == SC_ZERO)
3162  continue;
3163 
3164  if (targetMapToOrigRow[Aik] != LO_INVALID) {
3165  // Local matrix
3166  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
3167 
3168  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
3169  LO Bkj = Bcolind[j];
3170  LO Cij = Bcol2Ccol[Bkj];
3171 
3172  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3173  std::runtime_error, "Trying to insert a new entry into a static graph");
3174 
3175  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
3176  }
3177 
3178  } else {
3179  // Remote matrix
3180  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
3181  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
3182  LO Ikj = Icolind[j];
3183  LO Cij = Icol2Ccol[Ikj];
3184 
3185  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3186  std::runtime_error, "Trying to insert a new entry into a static graph");
3187 
3188  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
3189  }
3190  }
3191  }
3192  }
3193 
3194 #ifdef HAVE_TPETRA_MMM_TIMINGS
3195  MM2 = Teuchos::null;
3196  MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
3197 #endif
3198 
3199  C.fillComplete(C.getDomainMap(), C.getRangeMap());
3200 #ifdef HAVE_TPETRA_MMM_TIMINGS
3201  MM2 = Teuchos::null;
3202  MM = Teuchos::null;
3203 #endif
3204 
3205 }
3206 
3207 
3208 
3209 /*********************************************************************************************************/
3210 template<class Scalar,
3211  class LocalOrdinal,
3212  class GlobalOrdinal,
3213  class Node>
3214 void import_and_extract_views(
3215  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3216  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3217  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3218  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3219  bool userAssertsThereAreNoRemotes,
3220  const std::string& label,
3221  const Teuchos::RCP<Teuchos::ParameterList>& params)
3222 {
3223  using Teuchos::Array;
3224  using Teuchos::ArrayView;
3225  using Teuchos::RCP;
3226  using Teuchos::rcp;
3227  using Teuchos::null;
3228 
3229  typedef Scalar SC;
3230  typedef LocalOrdinal LO;
3231  typedef GlobalOrdinal GO;
3232  typedef Node NO;
3233 
3234  typedef Map<LO,GO,NO> map_type;
3235  typedef Import<LO,GO,NO> import_type;
3236  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
3237 
3238 #ifdef HAVE_TPETRA_MMM_TIMINGS
3239  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
3240  using Teuchos::TimeMonitor;
3241  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
3242 #endif
3243  // The goal of this method is to populate the 'Aview' struct with views of the
3244  // rows of A, including all rows that correspond to elements in 'targetMap'.
3245  //
3246  // If targetMap includes local elements that correspond to remotely-owned rows
3247  // of A, then those remotely-owned rows will be imported into
3248  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
3249  Aview.deleteContents();
3250 
3251  Aview.origMatrix = rcp(&A, false);
3252  Aview.origRowMap = A.getRowMap();
3253  Aview.rowMap = targetMap;
3254  Aview.colMap = A.getColMap();
3255  Aview.domainMap = A.getDomainMap();
3256  Aview.importColMap = null;
3257 
3258  // Short circuit if the user swears there are no remotes
3259  if (userAssertsThereAreNoRemotes)
3260  return;
3261 
3262  RCP<const import_type> importer;
3263  if (params != null && params->isParameter("importer")) {
3264  importer = params->get<RCP<const import_type> >("importer");
3265 
3266  } else {
3267 #ifdef HAVE_TPETRA_MMM_TIMINGS
3268  MM = Teuchos::null;
3269  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
3270 #endif
3271 
3272  // Mark each row in targetMap as local or remote, and go ahead and get a view
3273  // for the local rows
3274  RCP<const map_type> rowMap = A.getRowMap(), remoteRowMap;
3275  size_t numRemote = 0;
3276  int mode = 0;
3277  if (!prototypeImporter.is_null() &&
3278  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3279  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3280  // We have a valid prototype importer --- ask it for the remotes
3281  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3282  numRemote = prototypeImporter->getNumRemoteIDs();
3283 
3284  Array<GO> remoteRows(numRemote);
3285  for (size_t i = 0; i < numRemote; i++)
3286  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3287 
3288  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3289  rowMap->getIndexBase(), rowMap->getComm()));
3290  mode = 1;
3291 
3292  } else if (prototypeImporter.is_null()) {
3293  // No prototype importer --- count the remotes the hard way
3294  ArrayView<const GO> rows = targetMap->getNodeElementList();
3295  size_t numRows = targetMap->getNodeNumElements();
3296 
3297  Array<GO> remoteRows(numRows);
3298  for(size_t i = 0; i < numRows; ++i) {
3299  const LO mlid = rowMap->getLocalElement(rows[i]);
3300 
3301  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3302  remoteRows[numRemote++] = rows[i];
3303  }
3304  remoteRows.resize(numRemote);
3305  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3306  rowMap->getIndexBase(), rowMap->getComm()));
3307  mode = 2;
3308 
3309  } else {
3310  // PrototypeImporter is bad. But if we're in serial that's OK.
3311  mode = 3;
3312  }
3313 
3314  const int numProcs = rowMap->getComm()->getSize();
3315 
3316  if (numProcs < 2) {
3317  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3318  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3319  // If only one processor we don't need to import any remote rows, so return.
3320  return;
3321  }
3322 
3323  //
3324  // Now we will import the needed remote rows of A, if the global maximum
3325  // value of numRemote is greater than 0.
3326  //
3327 #ifdef HAVE_TPETRA_MMM_TIMINGS
3328  MM = Teuchos::null;
3329  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
3330 #endif
3331 
3332  global_size_t globalMaxNumRemote = 0;
3333  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3334 
3335  if (globalMaxNumRemote > 0) {
3336 #ifdef HAVE_TPETRA_MMM_TIMINGS
3337  MM = Teuchos::null;
3338  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
3339 #endif
3340  // Create an importer with target-map remoteRowMap and source-map rowMap.
3341  if (mode == 1)
3342  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3343  else if (mode == 2)
3344  importer = rcp(new import_type(rowMap, remoteRowMap));
3345  else
3346  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
3347  }
3348 
3349  if (params != null)
3350  params->set("importer", importer);
3351  }
3352 
3353  if (importer != null) {
3354 #ifdef HAVE_TPETRA_MMM_TIMINGS
3355  MM = Teuchos::null;
3356  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
3357 #endif
3358 
3359  // Now create a new matrix into which we can import the remote rows of A that we need.
3360  Teuchos::ParameterList labelList;
3361  labelList.set("Timer Label", label);
3362  auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3363 
3364  bool isMM = true;
3365  bool overrideAllreduce = false;
3366  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3367  // Minor speedup tweak - avoid computing the global constants
3368  Teuchos::ParameterList params_sublist;
3369  if(!params.is_null()) {
3370  labelList.set("compute global constants", params->get("compute global constants",false));
3371  params_sublist = params->sublist("matrixmatrix: kernel params",false);
3372  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3373  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3374  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3375  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3376  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3377  }
3378  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3379  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3380  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3381 
3382  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3383  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3384 
3385 #if 0
3386  // Disabled code for dumping input matrices
3387  static int count=0;
3388  char str[80];
3389  sprintf(str,"import_matrix.%d.dat",count);
3391  count++;
3392 #endif
3393 
3394 #ifdef HAVE_TPETRA_MMM_STATISTICS
3395  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3396 #endif
3397 
3398 
3399 #ifdef HAVE_TPETRA_MMM_TIMINGS
3400  MM = Teuchos::null;
3401  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3402 #endif
3403 
3404  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3405  Aview.importColMap = Aview.importMatrix->getColMap();
3406 #ifdef HAVE_TPETRA_MMM_TIMINGS
3407  MM = Teuchos::null;
3408 #endif
3409  }
3410 }
3411 
3412 
3413 
3414 
3415 
3416 /*********************************************************************************************************/
3417  // This only merges matrices that look like B & Bimport, aka, they have no overlapping rows
3418 template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3420 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3421  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3422  const LocalOrdinalViewType & Acol2Brow,
3423  const LocalOrdinalViewType & Acol2Irow,
3424  const LocalOrdinalViewType & Bcol2Ccol,
3425  const LocalOrdinalViewType & Icol2Ccol,
3426  const size_t mergedNodeNumCols) {
3427 
3428  using Teuchos::RCP;
3430  typedef typename KCRS::StaticCrsGraphType graph_t;
3431  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3432  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3433  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3434  // Grab the Kokkos::SparseCrsMatrices
3435  const KCRS & Ak = Aview.origMatrix->getLocalMatrix();
3436  const KCRS & Bk = Bview.origMatrix->getLocalMatrix();
3437 
3438  // We need to do this dance if either (a) We have Bimport or (b) We don't A's colMap is not the same as B's rowMap
3439  if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3440  // We do have a Bimport
3441  // NOTE: We're going merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3442  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3443  RCP<const KCRS> Ik_;
3444  if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrix());
3445  const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3446  KCRS Iks;
3447  if(Ik!=0) Iks = *Ik;
3448  size_t merge_numrows = Ak.numCols();
3449  // The last entry of this at least, need to be initialized
3450  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3451 
3452  const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3453 
3454  // Use a Kokkos::parallel_scan to build the rowptr
3455  typedef typename Node::execution_space execution_space;
3456  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3457  Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3458  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3459  if(final) Mrowptr(i) = update;
3460  // Get the row count
3461  size_t ct=0;
3462  if(Acol2Brow(i)!=LO_INVALID)
3463  ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3464  else
3465  ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3466  update+=ct;
3467 
3468  if(final && i+1==merge_numrows)
3469  Mrowptr(i+1)=update;
3470  });
3471 
3472  // Allocate nnz
3473  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3474  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3475  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);
3476 
3477  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3478  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3479  Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3480  if(Acol2Brow(i)!=LO_INVALID) {
3481  size_t row = Acol2Brow(i);
3482  size_t start = Bk.graph.row_map(row);
3483  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3484  Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3485  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3486  }
3487  }
3488  else {
3489  size_t row = Acol2Irow(i);
3490  size_t start = Iks.graph.row_map(row);
3491  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3492  Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3493  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3494  }
3495  }
3496  });
3497 
3498  KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3499  return newmat;
3500  }
3501  else {
3502  // We don't have a Bimport (the easy case)
3503  return Bk;
3504  }
3505 }//end merge_matrices
3506 
3507 
3508 
3509 } //End namepsace MMdetails
3510 
3511 } //End namespace Tpetra
3512 
3513 /*********************************************************************************************************/
3514 //
3515 // Explicit instantiation macro
3516 //
3517 // Must be expanded from within the Tpetra namespace!
3518 //
3519 namespace Tpetra {
3520 
3521 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3522  template \
3523  void MatrixMatrix::Multiply( \
3524  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3525  bool transposeA, \
3526  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3527  bool transposeB, \
3528  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3529  bool call_FillComplete_on_result, \
3530  const std::string & label, \
3531  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3532 \
3533 template \
3534  void MatrixMatrix::Jacobi( \
3535  SCALAR omega, \
3536  const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3537  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3538  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3539  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3540  bool call_FillComplete_on_result, \
3541  const std::string & label, \
3542  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3543 \
3544  template \
3545  void MatrixMatrix::Add( \
3546  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3547  bool transposeA, \
3548  SCALAR scalarA, \
3549  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3550  bool transposeB, \
3551  SCALAR scalarB, \
3552  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > C); \
3553 \
3554  template \
3555  void MatrixMatrix::Add( \
3556  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3557  bool transposeA, \
3558  SCALAR scalarA, \
3559  CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3560  SCALAR scalarB ); \
3561 \
3562  template \
3563  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3564  MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3565  (const SCALAR & alpha, \
3566  const bool transposeA, \
3567  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3568  const SCALAR & beta, \
3569  const bool transposeB, \
3570  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3571  const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3572  const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3573  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3574 \
3575  template struct MatrixMatrix::AddDetails::AddKernels<SCALAR, LO, GO, NODE>;
3576 
3577 } //End namespace Tpetra
3578 
3579 #endif // TPETRA_MATRIXMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
static int TAFC_OptimizationCoreCount()
The core count above which Tpetra::CrsMatrix::transferAndFillComplete will attempt to do advanced nei...
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix&#39;s column Map with the given Map.
void Jacobi(Scalar omega, const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Dinv, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
void scale(const Scalar &alpha)
Scale the matrix&#39;s values: this := alpha*this.
size_t getNodeNumRows() const override
The number of matrix rows owned by the calling process.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
bool isFillActive() const
Whether the matrix is not fill complete.
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
void Add(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, Scalar scalarA, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, Scalar scalarB)
bool isFillComplete() const override
Whether the matrix is fill complete.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Construct and (optionally) redistribute the explicitly stored transpose of a CrsMatrix.
void keyValueMerge(KeyInputIterType keyBeg1, KeyInputIterType keyEnd1, ValueInputIterType valBeg1, ValueInputIterType valEnd1, KeyInputIterType keyBeg2, KeyInputIterType keyEnd2, ValueInputIterType valBeg2, ValueInputIterType valEnd2, KeyOutputIterType keyOut, ValueOutputIterType valOut, BinaryFunction f)
Merge two sorted (by keys) sequences of unique (key,value) pairs by combining pairs with equal keys...
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
static void writeSparseFile(const std::string &filename, const sparse_matrix_type &matrix, const std::string &matrixName, const std::string &matrixDescription, const bool debug=false)
Print the sparse matrix in Matrix Market format, with comments.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
void Multiply(const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, bool transposeB, CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &C, bool call_FillComplete_on_result=true, const std::string &label=std::string(), const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Sparse matrix-matrix multiply.
A parallel distribution of indices over processes.
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, execution_space, void, typename local_graph_type::size_type > local_matrix_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
void setAllValues(const typename local_matrix_type::row_map_type &ptr, const typename local_graph_type::entries_type::non_const_type &ind, const typename local_matrix_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
size_t getNodeMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
A distributed dense vector.
Stand-alone utility functions and macros.
Struct that holds views of the contents of a CrsMatrix.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
Matrix Market file readers and writers for Tpetra objects.
Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const bool transposeA, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const bool transposeB, const CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &B, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap=Teuchos::null, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Compute the sparse matrix sum C = scalarA * Op(A) + scalarB * Op(B), where Op(X) is either X or its t...
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Declaration and definition of Tpetra::Details::getEntryOnHost.
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.