TpetraExt_MatrixMatrix_def.hpp
1 // @HEADER
2 // ***********************************************************************
3 //
4 // Tpetra: Templated Linear Algebra Services Package
5 // Copyright (2008) Sandia Corporation
6 //
7 // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
8 // the U.S. Government retains certain rights in this software.
9 //
10 // Redistribution and use in source and binary forms, with or without
11 // modification, are permitted provided that the following conditions are
12 // met:
13 //
14 // 1. Redistributions of source code must retain the above copyright
15 // notice, this list of conditions and the following disclaimer.
16 //
17 // 2. Redistributions in binary form must reproduce the above copyright
18 // notice, this list of conditions and the following disclaimer in the
19 // documentation and/or other materials provided with the distribution.
20 //
21 // 3. Neither the name of the Corporation nor the names of the
22 // contributors may be used to endorse or promote products derived from
23 // this software without specific prior written permission.
24 //
25 // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
26 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
27 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
28 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
29 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
30 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
31 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
32 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
33 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
34 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
35 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
36 //
37 // Questions? Contact Michael A. Heroux (maherou@sandia.gov)
38 //
39 // ************************************************************************
40 // @HEADER
41 #ifndef TPETRA_MATRIXMATRIX_DEF_HPP
42 #define TPETRA_MATRIXMATRIX_DEF_HPP
43 #include "KokkosSparse_Utils.hpp"
44 #include "Tpetra_ConfigDefs.hpp"
46 #include "Teuchos_VerboseObject.hpp"
47 #include "Teuchos_Array.hpp"
48 #include "Tpetra_Util.hpp"
49 #include "Tpetra_CrsMatrix.hpp"
50 #include "Tpetra_BlockCrsMatrix.hpp"
52 #include "Tpetra_RowMatrixTransposer.hpp"
55 #include "Tpetra_Details_makeColMap.hpp"
56 #include "Tpetra_ConfigDefs.hpp"
57 #include "Tpetra_Map.hpp"
58 #include "Tpetra_Export.hpp"
59 #include "Tpetra_Import_Util.hpp"
60 #include "Tpetra_Import_Util2.hpp"
61 #include <algorithm>
62 #include <type_traits>
63 #include "Teuchos_FancyOStream.hpp"
64 
65 #include "TpetraExt_MatrixMatrix_ExtraKernels_def.hpp"
67 
68 #include "KokkosSparse_spgemm.hpp"
69 #include "KokkosSparse_spadd.hpp"
70 #include "Kokkos_Bitset.hpp"
71 
72 #include <MatrixMarket_Tpetra.hpp>
73 
81 /*********************************************************************************************************/
82 // Include the architecture-specific kernel partial specializations here
83 // NOTE: This needs to be outside all namespaces
84 #include "TpetraExt_MatrixMatrix_OpenMP.hpp"
85 #include "TpetraExt_MatrixMatrix_Cuda.hpp"
86 #include "TpetraExt_MatrixMatrix_HIP.hpp"
87 #include "TpetraExt_MatrixMatrix_SYCL.hpp"
88 
89 namespace Tpetra {
90 
91 namespace MatrixMatrix{
92 
93 //
94 // This method forms the matrix-matrix product C = op(A) * op(B), where
95 // op(A) == A if transposeA is false,
96 // op(A) == A^T if transposeA is true,
97 // and similarly for op(B).
98 //
99 template <class Scalar,
100  class LocalOrdinal,
101  class GlobalOrdinal,
102  class Node>
103 void Multiply(
104  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
105  bool transposeA,
106  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
107  bool transposeB,
108  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
109  bool call_FillComplete_on_result,
110  const std::string& label,
111  const Teuchos::RCP<Teuchos::ParameterList>& params)
112 {
113  using Teuchos::null;
114  using Teuchos::RCP;
115  using Teuchos::rcp;
116  typedef Scalar SC;
117  typedef LocalOrdinal LO;
118  typedef GlobalOrdinal GO;
119  typedef Node NO;
120  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
121  typedef Import<LO,GO,NO> import_type;
122  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
123  typedef Map<LO,GO,NO> map_type;
124  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
125 
126 #ifdef HAVE_TPETRA_MMM_TIMINGS
127  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
128  using Teuchos::TimeMonitor;
129  //MM is used to time setup, and then multiply.
130 
131  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Setup"))));
132 #endif
133 
134  const std::string prefix = "TpetraExt::MatrixMatrix::Multiply(): ";
135 
136  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("My Matrix Mult", mmm_multiply);
137 
138  // The input matrices A and B must both be fillComplete.
139  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
140  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
141 
142  // If transposeA is true, then Aprime will be the transpose of A
143  // (computed explicitly via RowMatrixTransposer). Otherwise, Aprime
144  // will just be a pointer to A.
145  RCP<const crs_matrix_type> Aprime = null;
146  // If transposeB is true, then Bprime will be the transpose of B
147  // (computed explicitly via RowMatrixTransposer). Otherwise, Bprime
148  // will just be a pointer to B.
149  RCP<const crs_matrix_type> Bprime = null;
150 
151  // Is this a "clean" matrix?
152  //
153  // mfh 27 Sep 2016: Historically, if Epetra_CrsMatrix was neither
154  // locally nor globally indexed, then it was empty. I don't like
155  // this, because the most straightforward implementation presumes
156  // lazy allocation of indices. However, historical precedent
157  // demands that we keep around this predicate as a way to test
158  // whether the matrix is empty.
159  const bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
160 
161  bool use_optimized_ATB = false;
162  if (transposeA && !transposeB && call_FillComplete_on_result && newFlag)
163  use_optimized_ATB = true;
164 
165 #ifdef USE_OLD_TRANSPOSE // NOTE: For Grey Ballard's use. Remove this later.
166  use_optimized_ATB = false;
167 #endif
168 
169  using Teuchos::ParameterList;
170  RCP<ParameterList> transposeParams (new ParameterList);
171  transposeParams->set ("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
172 
173  if (!use_optimized_ATB && transposeA) {
174  transposer_type transposer (rcpFromRef (A));
175  Aprime = transposer.createTranspose (transposeParams);
176  }
177  else {
178  Aprime = rcpFromRef(A);
179  }
180 
181  if (transposeB) {
182  transposer_type transposer (rcpFromRef (B));
183  Bprime = transposer.createTranspose (transposeParams);
184  }
185  else {
186  Bprime = rcpFromRef(B);
187  }
188 
189  // Check size compatibility
190  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
191  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
192  global_size_t Aouter = transposeA ? numACols : A.getGlobalNumRows();
193  global_size_t Bouter = transposeB ? B.getGlobalNumRows() : numBCols;
194  global_size_t Ainner = transposeA ? A.getGlobalNumRows() : numACols;
195  global_size_t Binner = transposeB ? numBCols : B.getGlobalNumRows();
196  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
197  prefix << "ERROR, inner dimensions of op(A) and op(B) "
198  "must match for matrix-matrix product. op(A) is "
199  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
200 
201  // The result matrix C must at least have a row-map that reflects the correct
202  // row-size. Don't check the number of columns because rectangular matrices
203  // which were constructed with only one map can still end up having the
204  // correct capacity and dimensions when filled.
205  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
206  prefix << "ERROR, dimensions of result C must "
207  "match dimensions of op(A) * op(B). C has " << C.getGlobalNumRows()
208  << " rows, should have at least " << Aouter << std::endl);
209 
210  // It doesn't matter whether C is already Filled or not. If it is already
211  // Filled, it must have space allocated for the positions that will be
212  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
213  // we'll error out later when trying to store result values.
214 
215  // CGB: However, matrix must be in active-fill
216  if (!C.isFillActive()) C.resumeFill();
217 
218  // We're going to need to import remotely-owned sections of A and/or B if
219  // more than one processor is performing this run, depending on the scenario.
220  int numProcs = A.getComm()->getSize();
221 
222  // Declare a couple of structs that will be used to hold views of the data
223  // of A and B, to be used for fast access during the matrix-multiplication.
224  crs_matrix_struct_type Aview;
225  crs_matrix_struct_type Bview;
226 
227  RCP<const map_type> targetMap_A = Aprime->getRowMap();
228  RCP<const map_type> targetMap_B = Bprime->getRowMap();
229 
230 #ifdef HAVE_TPETRA_MMM_TIMINGS
231  {
232  TimeMonitor MM_importExtract(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All I&X")));
233 #endif
234 
235  // Now import any needed remote rows and populate the Aview struct
236  // NOTE: We assert that an import isn't needed --- since we do the transpose
237  // above to handle that.
238  if (!use_optimized_ATB) {
239  RCP<const import_type> dummyImporter;
240  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label, params);
241  }
242 
243  // We will also need local access to all rows of B that correspond to the
244  // column-map of op(A).
245  if (numProcs > 1)
246  targetMap_B = Aprime->getColMap();
247 
248  // Import any needed remote rows and populate the Bview struct.
249  if (!use_optimized_ATB)
250  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label, params);
251 
252 #ifdef HAVE_TPETRA_MMM_TIMINGS
253  } //stop MM_importExtract here
254  //stop the setup timer, and start the multiply timer
255  MM = Teuchos::null; MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All Multiply"))));
256 #endif
257 
258  // Call the appropriate method to perform the actual multiplication.
259  if (use_optimized_ATB) {
260  MMdetails::mult_AT_B_newmatrix(A, B, C, label,params);
261 
262  } else if (call_FillComplete_on_result && newFlag) {
263  MMdetails::mult_A_B_newmatrix(Aview, Bview, C, label,params);
264 
265  } else if (call_FillComplete_on_result) {
266  MMdetails::mult_A_B_reuse(Aview, Bview, C, label,params);
267 
268  } else {
269  // mfh 27 Sep 2016: Is this the "slow" case? This
270  // "CrsWrapper_CrsMatrix" thing could perhaps be made to support
271  // thread-parallel inserts, but that may take some effort.
272  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
273 
274  MMdetails::mult_A_B(Aview, Bview, crsmat, label,params);
275  }
276 
277 #ifdef HAVE_TPETRA_MMM_TIMINGS
278  TimeMonitor MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM All FillComplete")));
279 #endif
280  if (call_FillComplete_on_result && !C.isFillComplete()) {
281  // We'll call FillComplete on the C matrix before we exit, and give it a
282  // domain-map and a range-map.
283  // The domain-map will be the domain-map of B, unless
284  // op(B)==transpose(B), in which case the range-map of B will be used.
285  // The range-map will be the range-map of A, unless op(A)==transpose(A),
286  // in which case the domain-map of A will be used.
287  C.fillComplete(Bprime->getDomainMap(), Aprime->getRangeMap());
288  }
289 }
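// Illustrative usage sketch: compute C = A^T * B and fill-complete the result.
// Assumes A and B are fill-complete CrsMatrix objects with default template
// parameters, and that the trailing arguments keep their declared defaults.
//
//   Tpetra::CrsMatrix<double> C (A.getDomainMap (), 0);
//   Tpetra::MatrixMatrix::Multiply (A, true, B, false, C);
//   // C now holds A^T * B, with B's domain map and op(A)'s range map.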
290 
291 //
292 // This method forms the matrix-matrix product C = op(A) * op(B), where
293 // op(A) == A and similarly for op(B). op(A) = A^T is not yet implemented.
294 //
295 template <class Scalar,
296  class LocalOrdinal,
297  class GlobalOrdinal,
298  class Node>
299 void Multiply(
300  const Teuchos::RCP<const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& A,
301  bool transposeA,
302  const Teuchos::RCP<const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& B,
303  bool transposeB,
305  const std::string& label)
306 {
307  using Teuchos::null;
308  using Teuchos::RCP;
309  using Teuchos::rcp;
310  typedef Scalar SC;
311  typedef LocalOrdinal LO;
312  typedef GlobalOrdinal GO;
313  typedef Node NO;
314  typedef BlockCrsMatrixStruct<SC,LO,GO,NO> blockcrs_matrix_struct_type;
315  typedef Map<LO,GO,NO> map_type;
316  typedef Import<LO,GO,NO> import_type;
317 
318  std::string prefix = std::string("TpetraExt ") + label + std::string(": ");
319 
320  TEUCHOS_TEST_FOR_EXCEPTION(transposeA==true, std::runtime_error, prefix << "Matrix A cannot be transposed.");
321  TEUCHOS_TEST_FOR_EXCEPTION(transposeB==true, std::runtime_error, prefix << "Matrix B cannot be transposed.");
322 
323  // Check size compatibility
324  global_size_t numACols = A->getGlobalNumCols();
325  global_size_t numBCols = B->getGlobalNumCols();
326  global_size_t numARows = A->getGlobalNumRows();
327  global_size_t numBRows = B->getGlobalNumRows();
328 
329  global_size_t Aouter = numARows;
330  global_size_t Bouter = numBCols;
331  global_size_t Ainner = numACols;
332  global_size_t Binner = numBRows;
333  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
334  prefix << "ERROR, inner dimensions of op(A) and op(B) "
335  "must match for matrix-matrix product. op(A) is "
336  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
337 
338  // We're going to need to import remotely-owned sections of A and/or B if
339  // more than one processor is performing this run, depending on the scenario.
340  int numProcs = A->getComm()->getSize();
341 
342  const LO blocksize = A->getBlockSize();
343  TEUCHOS_TEST_FOR_EXCEPTION(blocksize != B->getBlockSize(), std::runtime_error,
344  prefix << "ERROR, Blocksizes do not match. A.blocksize = " <<
345  blocksize << ", B.blocksize = " << B->getBlockSize() );
346 
347  // Declare a couple of structs that will be used to hold views of the data
348  // of A and B, to be used for fast access during the matrix-multiplication.
349  blockcrs_matrix_struct_type Aview(blocksize);
350  blockcrs_matrix_struct_type Bview(blocksize);
351 
352  RCP<const map_type> targetMap_A = A->getRowMap();
353  RCP<const map_type> targetMap_B = B->getRowMap();
354 
355  // Populate the Aview struct. No remotes are needed.
356  RCP<const import_type> dummyImporter;
357  MMdetails::import_and_extract_views(*A, targetMap_A, Aview, dummyImporter, true);
358 
359  // We will also need local access to all rows of B that correspond to the
360  // column-map of op(A).
361  if (numProcs > 1)
362  targetMap_B = A->getColMap();
363 
364  // Import any needed remote rows and populate the Bview struct.
365  MMdetails::import_and_extract_views(*B, targetMap_B, Bview, A->getGraph()->getImporter(),
366  A->getGraph()->getImporter().is_null());
367 
368  // Call the appropriate method to perform the actual multiplication.
369  MMdetails::mult_A_B_newmatrix(Aview, Bview, C);
370 }
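// Illustrative usage sketch for the BlockCrsMatrix overload. Assumes A and B
// are RCP handles to fill-complete BlockCrsMatrix objects with equal block
// sizes; the result C is allocated by the multiply kernel.
//
//   Teuchos::RCP<Tpetra::BlockCrsMatrix<double> > C;
//   Tpetra::MatrixMatrix::Multiply (A, false, B, false, C);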
371 
372 template <class Scalar,
373  class LocalOrdinal,
374  class GlobalOrdinal,
375  class Node>
376 void Jacobi(Scalar omega,
377  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Dinv,
378  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
379  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
380  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
381  bool call_FillComplete_on_result,
382  const std::string& label,
383  const Teuchos::RCP<Teuchos::ParameterList>& params)
384 {
385  using Teuchos::RCP;
386  typedef Scalar SC;
387  typedef LocalOrdinal LO;
388  typedef GlobalOrdinal GO;
389  typedef Node NO;
390  typedef Import<LO,GO,NO> import_type;
391  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
392  typedef Map<LO,GO,NO> map_type;
393  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
394 
395 #ifdef HAVE_TPETRA_MMM_TIMINGS
396  std::string prefix_mmm = std::string("TpetraExt ")+ label + std::string(": ");
397  using Teuchos::TimeMonitor;
398  TimeMonitor MM(*TimeMonitor::getNewTimer(prefix_mmm+std::string("Jacobi All Setup")));
399 #endif
400 
401  const std::string prefix = "TpetraExt::MatrixMatrix::Jacobi(): ";
402 
403  // A and B should already be Filled.
404  // Should we go ahead and call FillComplete() on them if necessary or error
405  // out? For now, we choose to error out.
406  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error, prefix << "Matrix A is not fill complete.");
407  TEUCHOS_TEST_FOR_EXCEPTION(!B.isFillComplete(), std::runtime_error, prefix << "Matrix B is not fill complete.");
408 
409  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
410  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
411 
412  // Now check size compatibility
413  global_size_t numACols = A.getDomainMap()->getGlobalNumElements();
414  global_size_t numBCols = B.getDomainMap()->getGlobalNumElements();
415  global_size_t Aouter = A.getGlobalNumRows();
416  global_size_t Bouter = numBCols;
417  global_size_t Ainner = numACols;
418  global_size_t Binner = B.getGlobalNumRows();
419  TEUCHOS_TEST_FOR_EXCEPTION(Ainner != Binner, std::runtime_error,
420  prefix << "ERROR, inner dimensions of op(A) and op(B) "
421  "must match for matrix-matrix product. op(A) is "
422  << Aouter << "x" << Ainner << ", op(B) is "<< Binner << "x" << Bouter);
423 
424  // The result matrix C must at least have a row-map that reflects the correct
425  // row-size. Don't check the number of columns because rectangular matrices
426  // which were constructed with only one map can still end up having the
427  // correct capacity and dimensions when filled.
428  TEUCHOS_TEST_FOR_EXCEPTION(Aouter > C.getGlobalNumRows(), std::runtime_error,
429  prefix << "ERROR, dimensions of result C must "
430  "match dimensions of op(A) * op(B). C has "<< C.getGlobalNumRows()
431  << " rows, should have at least "<< Aouter << std::endl);
432 
433  // It doesn't matter whether C is already Filled or not. If it is already
434  // Filled, it must have space allocated for the positions that will be
435  // referenced in forming C = op(A)*op(B). If it doesn't have enough space,
436  // we'll error out later when trying to store result values.
437 
438  // CGB: However, matrix must be in active-fill
439  TEUCHOS_TEST_FOR_EXCEPT( C.isFillActive() == false );
440 
441  // We're going to need to import remotely-owned sections of A and/or B if
442  // more than one processor is performing this run, depending on the scenario.
443  int numProcs = A.getComm()->getSize();
444 
445  // Declare a couple of structs that will be used to hold views of the data of
446  // A and B, to be used for fast access during the matrix-multiplication.
447  crs_matrix_struct_type Aview;
448  crs_matrix_struct_type Bview;
449 
450  RCP<const map_type> targetMap_A = Aprime->getRowMap();
451  RCP<const map_type> targetMap_B = Bprime->getRowMap();
452 
453 #ifdef HAVE_TPETRA_MMM_TIMINGS
454  TimeMonitor MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All I&X")));
455  {
456 #endif
457 
458  // Enable globalConstants by default
459  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
460  RCP<Teuchos::ParameterList> importParams1 = Teuchos::rcp(new Teuchos::ParameterList);
461  if(!params.is_null()) {
462  importParams1->set("compute global constants",params->get("compute global constants: temporaries",false));
463  int mm_optimization_core_count=0;
464  auto slist = params->sublist("matrixmatrix: kernel params",false);
465  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ());
466  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
467  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
468  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
469  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
470  auto & ip1slist = importParams1->sublist("matrixmatrix: kernel params",false);
471  ip1slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
472  ip1slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
473  ip1slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
474  }
475 
476  //Now import any needed remote rows and populate the Aview struct.
477  RCP<const import_type> dummyImporter;
478  MMdetails::import_and_extract_views(*Aprime, targetMap_A, Aview, dummyImporter, true, label,importParams1);
479 
480  // We will also need local access to all rows of B that correspond to the
481  // column-map of op(A).
482  if (numProcs > 1)
483  targetMap_B = Aprime->getColMap();
484 
485  // Now import any needed remote rows and populate the Bview struct.
486  // Enable globalConstants by default
487  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
488  RCP<Teuchos::ParameterList> importParams2 = Teuchos::rcp(new Teuchos::ParameterList);
489  if(!params.is_null()) {
490  importParams2->set("compute global constants",params->get("compute global constants: temporaries",false));
491 
492  auto slist = params->sublist("matrixmatrix: kernel params",false);
493  int mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount () );
494  bool isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
495  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
496  auto & ip2slist = importParams2->sublist("matrixmatrix: kernel params",false);
497  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
498  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
499  ip2slist.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
500  ip2slist.set("isMatrixMatrix_TransferAndFillComplete",isMM);
501  ip2slist.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
502  }
503 
504  MMdetails::import_and_extract_views(*Bprime, targetMap_B, Bview, Aprime->getGraph()->getImporter(), Aprime->getGraph()->getImporter().is_null(), label,importParams2);
505 
506 #ifdef HAVE_TPETRA_MMM_TIMINGS
507  }
508  TimeMonitor MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi All Multiply")));
509 #endif
510 
511  // Now call the appropriate method to perform the actual multiplication.
512  CrsWrapper_CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crsmat(C);
513 
514  // Is this a "clean" matrix
515  bool newFlag = !C.getGraph()->isLocallyIndexed() && !C.getGraph()->isGloballyIndexed();
516 
517  if (call_FillComplete_on_result && newFlag) {
518  MMdetails::jacobi_A_B_newmatrix(omega, Dinv, Aview, Bview, C, label, params);
519 
520  } else if (call_FillComplete_on_result) {
521  MMdetails::jacobi_A_B_reuse(omega, Dinv, Aview, Bview, C, label, params);
522 
523  } else {
524  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, "jacobi_A_B_general not implemented");
525  }
526 
527  if(!params.is_null()) {
528  bool removeZeroEntries = params->get("remove zeros", false);
529  if (removeZeroEntries) {
530  typedef Teuchos::ScalarTraits<Scalar> STS;
531  typename STS::magnitudeType threshold = params->get("remove zeros threshold", STS::magnitude(STS::zero()));
532  removeCrsMatrixZeros(C, threshold);
533  }
534  }
535 }
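// Illustrative usage sketch: form C = (I - omega * Dinv * A) * B, e.g. for
// prolongator smoothing. Assumes A and B are fill-complete and that Dinv
// holds the inverted diagonal of A on A's row map.
//
//   Tpetra::Vector<double> Dinv (A.getRowMap ());
//   A.getLocalDiagCopy (Dinv);
//   Dinv.reciprocal (Dinv);
//   Tpetra::CrsMatrix<double> C (A.getRowMap (), 0);
//   Tpetra::MatrixMatrix::Jacobi (omega, Dinv, A, B, C);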
536 
537 
538 template <class Scalar,
539  class LocalOrdinal,
540  class GlobalOrdinal,
541  class Node>
542 void Add(
543  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
544  bool transposeA,
545  Scalar scalarA,
546  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
547  Scalar scalarB )
548 {
549  using Teuchos::Array;
550  using Teuchos::RCP;
551  using Teuchos::null;
552  typedef Scalar SC;
553  typedef LocalOrdinal LO;
554  typedef GlobalOrdinal GO;
555  typedef Node NO;
556  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
557  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
558 
559  const std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
560 
561  TEUCHOS_TEST_FOR_EXCEPTION(!A.isFillComplete(), std::runtime_error,
562  prefix << "ERROR, input matrix A.isFillComplete() is false; it is required to be true. "
563  "(Result matrix B is not required to be isFillComplete()).");
564  TEUCHOS_TEST_FOR_EXCEPTION(B.isFillComplete() , std::runtime_error,
565  prefix << "ERROR, input matrix B must not be fill complete!");
566  TEUCHOS_TEST_FOR_EXCEPTION(B.isStaticGraph() , std::runtime_error,
567  prefix << "ERROR, input matrix B must not have static graph!");
568  TEUCHOS_TEST_FOR_EXCEPTION(B.isLocallyIndexed() , std::runtime_error,
569  prefix << "ERROR, input matrix B must not be locally indexed!");
570 
571  using Teuchos::ParameterList;
572  RCP<ParameterList> transposeParams (new ParameterList);
573  transposeParams->set ("sort", false);
574 
575  RCP<const crs_matrix_type> Aprime = null;
576  if (transposeA) {
577  transposer_type transposer (rcpFromRef (A));
578  Aprime = transposer.createTranspose (transposeParams);
579  }
580  else {
581  Aprime = rcpFromRef(A);
582  }
583 
584  size_t a_numEntries;
585  typename crs_matrix_type::nonconst_global_inds_host_view_type a_inds("a_inds",A.getLocalMaxNumRowEntries());
586  typename crs_matrix_type::nonconst_values_host_view_type a_vals("a_vals",A.getLocalMaxNumRowEntries());
587  GO row;
588 
589  if (scalarB != Teuchos::ScalarTraits<SC>::one())
590  B.scale(scalarB);
591 
592  size_t numMyRows = B.getLocalNumRows();
593  if (scalarA != Teuchos::ScalarTraits<SC>::zero()) {
594  for (LO i = 0; (size_t)i < numMyRows; ++i) {
595  row = B.getRowMap()->getGlobalElement(i);
596  Aprime->getGlobalRowCopy(row, a_inds, a_vals, a_numEntries);
597 
598  if (scalarA != Teuchos::ScalarTraits<SC>::one()) {
599  for (size_t j = 0; j < a_numEntries; ++j)
600  a_vals[j] *= scalarA;
601  }
602  B.insertGlobalValues(row, a_numEntries, reinterpret_cast<Scalar *>(a_vals.data()), a_inds.data());
603  }
604  }
605 }
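// Illustrative usage sketch: accumulate B = scalarA * op(A) + scalarB * B in
// place. A must be fill-complete; B must not be fill-complete and must not
// have a static graph. The caller fill-completes B afterwards.
//
//   Tpetra::MatrixMatrix::Add (A, false, 2.0, B, 0.5);
//   B.fillComplete ();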
606 
607 template <class Scalar,
608  class LocalOrdinal,
609  class GlobalOrdinal,
610  class Node>
611 Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
612 add (const Scalar& alpha,
613  const bool transposeA,
614  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
615  const Scalar& beta,
616  const bool transposeB,
617  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
618  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
619  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
620  const Teuchos::RCP<Teuchos::ParameterList>& params)
621 {
622  using Teuchos::RCP;
623  using Teuchos::rcpFromRef;
624  using Teuchos::rcp;
625  using Teuchos::ParameterList;
626  using crs_matrix_type = CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>;
627  if(!params.is_null())
628  {
629  TEUCHOS_TEST_FOR_EXCEPTION(
630  params->isParameter("Call fillComplete") && !params->get<bool>("Call fillComplete"),
631  std::invalid_argument,
632  "Tpetra::MatrixMatrix::add(): this version of add() always calls fillComplete\n"
633  "on the result, but you explicitly set 'Call fillComplete' = false in the parameter list. Don't set this explicitly.");
634  params->set("Call fillComplete", true);
635  }
636  //If transposeB, must compute B's explicit transpose to
637  //get the correct row map for C.
638  RCP<const crs_matrix_type> Brcp = rcpFromRef(B);
639  if(transposeB)
640  {
641  RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer(Brcp);
642  Brcp = transposer.createTranspose();
643  }
644  //Check that A,B are fillComplete before getting B's column map
645  TEUCHOS_TEST_FOR_EXCEPTION
646  (! A.isFillComplete () || ! Brcp->isFillComplete (), std::invalid_argument,
647  "TpetraExt::MatrixMatrix::add(): A and B must both be fill complete.");
648  RCP<crs_matrix_type> C = rcp(new crs_matrix_type(Brcp->getRowMap(), 0));
649  //this version of add() always fill completes the result, no matter what is in params on input
650  add(alpha, transposeA, A, beta, false, *Brcp, *C, domainMap, rangeMap, params);
651  return C;
652 }
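// Illustrative usage sketch for this non-member add(), which allocates and
// returns the fill-completed sum. Assumes domainMap, rangeMap, and params
// keep their declared null defaults.
//
//   auto C = Tpetra::MatrixMatrix::add (1.0, false, A, -1.0, false, B);
//   // C = A - B, fill-complete, with B's row map.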
653 
654 //This functor does the same thing as CrsGraph::convertColumnIndicesFromGlobalToLocal,
655 //but since the spadd() output is always packed there is no need for a separate
656 //numRowEntries here.
657 //
658 template<class LO, class GO, class LOView, class GOView, class LocalMap>
659 struct ConvertGlobalToLocalFunctor
660 {
661  ConvertGlobalToLocalFunctor(LOView& lids_, const GOView& gids_, const LocalMap localColMap_)
662  : lids(lids_), gids(gids_), localColMap(localColMap_)
663  {}
664 
665  KOKKOS_FUNCTION void operator() (const GO i) const
666  {
667  lids(i) = localColMap.getLocalElement(gids(i));
668  }
669 
670  LOView lids;
671  const GOView gids;
672  const LocalMap localColMap;
673 };
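// Illustrative launch of this functor (a concrete launch appears in add()
// below); the view and map type names here are placeholders:
//
//   Kokkos::parallel_for (Kokkos::RangePolicy<exec_space> (0, gids.extent (0)),
//     ConvertGlobalToLocalFunctor<LO, GO, LOView, GOView, LocalMap>
//       (lids, gids, colMap->getLocalMap ()));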
674 
675 template <class Scalar,
676  class LocalOrdinal,
677  class GlobalOrdinal,
678  class Node>
679 void
680 add (const Scalar& alpha,
681  const bool transposeA,
682  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
683  const Scalar& beta,
684  const bool transposeB,
685  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
686  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
687  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& domainMap,
688  const Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >& rangeMap,
689  const Teuchos::RCP<Teuchos::ParameterList>& params)
690 {
691  using Teuchos::RCP;
692  using Teuchos::rcp;
693  using Teuchos::rcpFromRef;
694  using Teuchos::rcp_implicit_cast;
695  using Teuchos::rcp_dynamic_cast;
696  using Teuchos::TimeMonitor;
697  using SC = Scalar;
698  using LO = LocalOrdinal;
699  using GO = GlobalOrdinal;
700  using NO = Node;
701  using crs_matrix_type = CrsMatrix<SC,LO,GO,NO>;
702  using crs_graph_type = CrsGraph<LO,GO,NO>;
703  using map_type = Map<LO,GO,NO>;
704  using transposer_type = RowMatrixTransposer<SC,LO,GO,NO>;
705  using import_type = Import<LO,GO,NO>;
706  using export_type = Export<LO,GO,NO>;
707  using exec_space = typename crs_graph_type::execution_space;
708  using AddKern = MMdetails::AddKernels<SC,LO,GO,NO>;
709  const char* prefix_mmm = "TpetraExt::MatrixMatrix::add: ";
710  constexpr bool debug = false;
711 
712 #ifdef HAVE_TPETRA_MMM_TIMINGS
713  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("transpose"))));
714 #endif
715 
716  if (debug) {
717  std::ostringstream os;
718  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
719  << "TpetraExt::MatrixMatrix::add" << std::endl;
720  std::cerr << os.str ();
721  }
722 
723  TEUCHOS_TEST_FOR_EXCEPTION
724  (C.isLocallyIndexed() || C.isGloballyIndexed(), std::invalid_argument,
725  prefix_mmm << "C must be a 'new' matrix (neither locally nor globally indexed).");
726  TEUCHOS_TEST_FOR_EXCEPTION
727  (! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
728  prefix_mmm << "A and B must both be fill complete.");
729 #ifdef HAVE_TPETRA_DEBUG
730  // The matrices don't have domain or range Maps unless they are fill complete.
731  if (A.isFillComplete () && B.isFillComplete ()) {
732  const bool domainMapsSame =
733  (! transposeA && ! transposeB &&
734  ! A.getDomainMap()->locallySameAs (*B.getDomainMap ())) ||
735  (! transposeA && transposeB &&
736  ! A.getDomainMap()->isSameAs (*B.getRangeMap ())) ||
737  ( transposeA && ! transposeB &&
738  ! A.getRangeMap ()->isSameAs (*B.getDomainMap ()));
739  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
740  prefix_mmm << "The domain Maps of Op(A) and Op(B) are not the same.");
741 
742  const bool rangeMapsSame =
743  (! transposeA && ! transposeB &&
744  ! A.getRangeMap ()->isSameAs (*B.getRangeMap ())) ||
745  (! transposeA && transposeB &&
746  ! A.getRangeMap ()->isSameAs (*B.getDomainMap())) ||
747  ( transposeA && ! transposeB &&
748  ! A.getDomainMap()->isSameAs (*B.getRangeMap ()));
749  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
750  prefix_mmm << "The range Maps of Op(A) and Op(B) are not the same.");
751  }
752 #endif // HAVE_TPETRA_DEBUG
753 
754  using Teuchos::ParameterList;
755  // Form the explicit transpose of A if necessary.
756  RCP<const crs_matrix_type> Aprime = rcpFromRef(A);
757  if (transposeA) {
758  transposer_type transposer (Aprime);
759  Aprime = transposer.createTranspose ();
760  }
761 
762 #ifdef HAVE_TPETRA_DEBUG
763  TEUCHOS_TEST_FOR_EXCEPTION
764  (Aprime.is_null (), std::logic_error,
765  prefix_mmm << "Failed to compute Op(A). "
766  "Please report this bug to the Tpetra developers.");
767 #endif // HAVE_TPETRA_DEBUG
768 
769  // Form the explicit transpose of B if necessary.
770  RCP<const crs_matrix_type> Bprime = rcpFromRef(B);
771  if (transposeB) {
772  if (debug) {
773  std::ostringstream os;
774  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
775  << "Form explicit xpose of B" << std::endl;
776  std::cerr << os.str ();
777  }
778  transposer_type transposer (Bprime);
779  Bprime = transposer.createTranspose ();
780  }
781 #ifdef HAVE_TPETRA_DEBUG
782  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
783  prefix_mmm << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
784  TEUCHOS_TEST_FOR_EXCEPTION(
785  !Aprime->isFillComplete () || !Bprime->isFillComplete (), std::invalid_argument,
786  prefix_mmm << "Aprime and Bprime must both be fill complete. "
787  "Please report this bug to the Tpetra developers.");
788 #endif // HAVE_TPETRA_DEBUG
789  RCP<const map_type> CDomainMap = domainMap;
790  RCP<const map_type> CRangeMap = rangeMap;
791  if(CDomainMap.is_null())
792  {
793  CDomainMap = Bprime->getDomainMap();
794  }
795  if(CRangeMap.is_null())
796  {
797  CRangeMap = Bprime->getRangeMap();
798  }
799  assert(!(CDomainMap.is_null()));
800  assert(!(CRangeMap.is_null()));
801  typedef typename AddKern::values_array values_array;
802  typedef typename AddKern::row_ptrs_array row_ptrs_array;
803  typedef typename AddKern::col_inds_array col_inds_array;
804  bool AGraphSorted = Aprime->getCrsGraph()->isSorted();
805  bool BGraphSorted = Bprime->getCrsGraph()->isSorted();
806  values_array vals;
807  row_ptrs_array rowptrs;
808  col_inds_array colinds;
809 #ifdef HAVE_TPETRA_MMM_TIMINGS
810  MM = Teuchos::null;
811  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("rowmap check/import"))));
812 #endif
813  if(!(Aprime->getRowMap()->isSameAs(*(Bprime->getRowMap()))))
814  {
815  //import Aprime into Bprime's row map so the local matrices have same # of rows
816  auto import = rcp(new import_type(Aprime->getRowMap(), Bprime->getRowMap()));
817  // cbl do not set
818  // parameterlist "isMatrixMatrix_TransferAndFillComplete" true here as
819  // this import _may_ take the form of a transfer. In practice it would be unlikely,
820  // but the general case is not so forgiving.
821  Aprime = importAndFillCompleteCrsMatrix<crs_matrix_type>(Aprime, *import, Bprime->getDomainMap(), Bprime->getRangeMap());
822  }
823  bool matchingColMaps = Aprime->getColMap()->isSameAs(*(Bprime->getColMap()));
824  bool sorted = AGraphSorted && BGraphSorted;
825  RCP<const import_type> Cimport = Teuchos::null;
826  RCP<export_type> Cexport = Teuchos::null;
827  bool doFillComplete = true;
828  if(Teuchos::nonnull(params) && params->isParameter("Call fillComplete"))
829  {
830  doFillComplete = params->get<bool>("Call fillComplete");
831  }
832  auto Alocal = Aprime->getLocalMatrixDevice();
833  auto Blocal = Bprime->getLocalMatrixDevice();
834  LO numLocalRows = Alocal.numRows();
835  if(numLocalRows == 0)
836  {
837  //KokkosKernels spadd assumes rowptrs.extent(0) + 1 == nrows,
838  //but an empty Tpetra matrix is allowed to have rowptrs.extent(0) == 0.
839  //Handle this case now
840  //(without interfering with collective operations, since it's possible for
841  //some ranks to have 0 local rows and others not).
842  rowptrs = row_ptrs_array("C rowptrs", 0);
843  }
844  auto Acolmap = Aprime->getColMap();
845  auto Bcolmap = Bprime->getColMap();
846  if(!matchingColMaps)
847  {
848  using global_col_inds_array = typename AddKern::global_col_inds_array;
849 #ifdef HAVE_TPETRA_MMM_TIMINGS
850  MM = Teuchos::null;
851  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mismatched col map full kernel"))));
852 #endif
853  //use kernel that converts col indices in both A and B to common domain map before adding
854  auto AlocalColmap = Acolmap->getLocalMap();
855  auto BlocalColmap = Bcolmap->getLocalMap();
856  global_col_inds_array globalColinds;
857  if (debug) {
858  std::ostringstream os;
859  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
860  << "Call AddKern::convertToGlobalAndAdd(...)" << std::endl;
861  std::cerr << os.str ();
862  }
863  AddKern::convertToGlobalAndAdd(
864  Alocal, alpha, Blocal, beta, AlocalColmap, BlocalColmap,
865  vals, rowptrs, globalColinds);
866  if (debug) {
867  std::ostringstream os;
868  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
869  << "Finished AddKern::convertToGlobalAndAdd(...)" << std::endl;
870  std::cerr << os.str ();
871  }
872 #ifdef HAVE_TPETRA_MMM_TIMINGS
873  MM = Teuchos::null;
874  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Constructing graph"))));
875 #endif
876  RCP<const map_type> CcolMap;
877  Tpetra::Details::makeColMap<LocalOrdinal, GlobalOrdinal, Node>
878  (CcolMap, CDomainMap, globalColinds);
879  C.replaceColMap(CcolMap);
880  col_inds_array localColinds("C colinds", globalColinds.extent(0));
881  Kokkos::parallel_for(Kokkos::RangePolicy<exec_space>(0, globalColinds.extent(0)),
882  ConvertGlobalToLocalFunctor<LocalOrdinal, GlobalOrdinal,
883  col_inds_array, global_col_inds_array,
884  typename map_type::local_map_type>
885  (localColinds, globalColinds, CcolMap->getLocalMap()));
886  KokkosSparse::sort_crs_matrix<exec_space, row_ptrs_array, col_inds_array, values_array>(rowptrs, localColinds, vals);
887  C.setAllValues(rowptrs, localColinds, vals);
888  C.fillComplete(CDomainMap, CRangeMap, params);
889  if(!doFillComplete)
890  C.resumeFill();
891  }
892  else
893  {
894  //Aprime, Bprime and C all have the same column maps
895  auto Avals = Alocal.values;
896  auto Bvals = Blocal.values;
897  auto Arowptrs = Alocal.graph.row_map;
898  auto Browptrs = Blocal.graph.row_map;
899  auto Acolinds = Alocal.graph.entries;
900  auto Bcolinds = Blocal.graph.entries;
901  if(sorted)
902  {
903  //use sorted kernel
904 #ifdef HAVE_TPETRA_MMM_TIMINGS
905  MM = Teuchos::null;
906  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("sorted entries full kernel"))));
907 #endif
908  if (debug) {
909  std::ostringstream os;
910  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
911  << "Call AddKern::addSorted(...)" << std::endl;
912  std::cerr << os.str ();
913  }
914 #if KOKKOSKERNELS_VERSION >= 40299
915  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
916 #else
917  AddKern::addSorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, vals, rowptrs, colinds);
918 #endif
919  }
920  else
921  {
922  //use unsorted kernel
923 #ifdef HAVE_TPETRA_MMM_TIMINGS
924  MM = Teuchos::null;
925  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("mm add unsorted entries full kernel"))));
926 #endif
927  if (debug) {
928  std::ostringstream os;
929  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
930  << "Call AddKern::addUnsorted(...)" << std::endl;
931  std::cerr << os.str ();
932  }
933  AddKern::addUnsorted(Avals, Arowptrs, Acolinds, alpha, Bvals, Browptrs, Bcolinds, beta, Aprime->getGlobalNumCols(), vals, rowptrs, colinds);
934  }
935  //Bprime col map works as C's row map, since Aprime and Bprime have the same colmaps.
936  RCP<const map_type> Ccolmap = Bcolmap;
937  C.replaceColMap(Ccolmap);
938  C.setAllValues(rowptrs, colinds, vals);
939  if(doFillComplete)
940  {
941  #ifdef HAVE_TPETRA_MMM_TIMINGS
942  MM = Teuchos::null;
943  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Tpetra::Crs expertStaticFillComplete"))));
944  #endif
945  if(!CDomainMap->isSameAs(*Ccolmap))
946  {
947  if (debug) {
948  std::ostringstream os;
949  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
950  << "Create Cimport" << std::endl;
951  std::cerr << os.str ();
952  }
953  Cimport = rcp(new import_type(CDomainMap, Ccolmap));
954  }
955  if(!C.getRowMap()->isSameAs(*CRangeMap))
956  {
957  if (debug) {
958  std::ostringstream os;
959  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
960  << "Create Cexport" << std::endl;
961  std::cerr << os.str ();
962  }
963  Cexport = rcp(new export_type(C.getRowMap(), CRangeMap));
964  }
965 
966  if (debug) {
967  std::ostringstream os;
968  os << "Proc " << A.getMap ()->getComm ()->getRank () << ": "
969  << "Call C->expertStaticFillComplete(...)" << std::endl;
970  std::cerr << os.str ();
971  }
972  C.expertStaticFillComplete(CDomainMap, CRangeMap, Cimport, Cexport, params);
973  }
974  }
975 }
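// Illustrative usage sketch for this add() overload, which fills a
// caller-provided C. C must be a "new" matrix (neither locally nor globally
// indexed) and A and B must be fill-complete.
//
//   Tpetra::CrsMatrix<double> C (B.getRowMap (), 0);
//   Tpetra::MatrixMatrix::add (1.0, false, A, 1.0, false, B, C,
//                              Teuchos::null, Teuchos::null, Teuchos::null);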
976 
977 // This version of Add takes C as RCP&, so C may be null on input (in this case,
978 // it is allocated and constructed in this function).
979 template <class Scalar,
980  class LocalOrdinal,
981  class GlobalOrdinal,
982  class Node>
983 void Add(
984  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
985  bool transposeA,
986  Scalar scalarA,
987  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
988  bool transposeB,
989  Scalar scalarB,
990  Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& C)
991 {
992  using Teuchos::Array;
993  using Teuchos::ArrayRCP;
994  using Teuchos::ArrayView;
995  using Teuchos::RCP;
996  using Teuchos::rcp;
997  using Teuchos::rcp_dynamic_cast;
998  using Teuchos::rcpFromRef;
999  using Teuchos::tuple;
1000  using std::endl;
1001  // typedef typename ArrayView<const Scalar>::size_type size_type;
1002  typedef Teuchos::ScalarTraits<Scalar> STS;
1003  typedef Map<LocalOrdinal, GlobalOrdinal, Node> map_type;
1004  // typedef Import<LocalOrdinal, GlobalOrdinal, Node> import_type;
1005  // typedef RowGraph<LocalOrdinal, GlobalOrdinal, Node> row_graph_type;
1006  // typedef CrsGraph<LocalOrdinal, GlobalOrdinal, Node> crs_graph_type;
1007  typedef CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> crs_matrix_type;
1008  typedef RowMatrixTransposer<Scalar, LocalOrdinal, GlobalOrdinal, Node> transposer_type;
1009 
1010  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
1011 
1012  TEUCHOS_TEST_FOR_EXCEPTION(
1013  ! A.isFillComplete () || ! B.isFillComplete (), std::invalid_argument,
1014  prefix << "A and B must both be fill complete before calling this function.");
1015 
1016  if(C.is_null()) {
1017  TEUCHOS_TEST_FOR_EXCEPTION(!A.haveGlobalConstants(), std::logic_error,
1018  prefix << "C is null (must be allocated), but A.haveGlobalConstants() is false. "
1019  "Please report this bug to the Tpetra developers.");
1020  TEUCHOS_TEST_FOR_EXCEPTION(!B.haveGlobalConstants(), std::logic_error,
1021  prefix << "C is null (must be allocated), but B.haveGlobalConstants() is false. "
1022  "Please report this bug to the Tpetra developers.");
1023  }
1024 
1025 #ifdef HAVE_TPETRA_DEBUG
1026  {
1027  const bool domainMapsSame =
1028  (! transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getDomainMap ()))) ||
1029  (! transposeA && transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ()))) ||
1030  (transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ())));
1031  TEUCHOS_TEST_FOR_EXCEPTION(domainMapsSame, std::invalid_argument,
1032  prefix << "The domain Maps of Op(A) and Op(B) are not the same.");
1033 
1034  const bool rangeMapsSame =
1035  (! transposeA && ! transposeB && ! A.getRangeMap ()->isSameAs (* (B.getRangeMap ()))) ||
1036  (! transposeA && transposeB && ! A.getRangeMap ()->isSameAs (* (B.getDomainMap ()))) ||
1037  (transposeA && ! transposeB && ! A.getDomainMap ()->isSameAs (* (B.getRangeMap ())));
1038  TEUCHOS_TEST_FOR_EXCEPTION(rangeMapsSame, std::invalid_argument,
1039  prefix << "The range Maps of Op(A) and Op(B) are not the same.");
1040  }
1041 #endif // HAVE_TPETRA_DEBUG
1042 
1043  using Teuchos::ParameterList;
1044  RCP<ParameterList> transposeParams (new ParameterList);
1045  transposeParams->set ("sort", false);
1046 
1047  // Form the explicit transpose of A if necessary.
1048  RCP<const crs_matrix_type> Aprime;
1049  if (transposeA) {
1050  transposer_type theTransposer (rcpFromRef (A));
1051  Aprime = theTransposer.createTranspose (transposeParams);
1052  }
1053  else {
1054  Aprime = rcpFromRef (A);
1055  }
1056 
1057 #ifdef HAVE_TPETRA_DEBUG
1058  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1059  prefix << "Failed to compute Op(A). Please report this bug to the Tpetra developers.");
1060 #endif // HAVE_TPETRA_DEBUG
1061 
1062  // Form the explicit transpose of B if necessary.
1063  RCP<const crs_matrix_type> Bprime;
1064  if (transposeB) {
1065  transposer_type theTransposer (rcpFromRef (B));
1066  Bprime = theTransposer.createTranspose (transposeParams);
1067  }
1068  else {
1069  Bprime = rcpFromRef (B);
1070  }
1071 
1072 #ifdef HAVE_TPETRA_DEBUG
1073  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1074  prefix << "Failed to compute Op(B). Please report this bug to the Tpetra developers.");
1075 #endif // HAVE_TPETRA_DEBUG
1076 
1077  bool CwasFillComplete = false;
1078 
1079  // Allocate or zero the entries of the result matrix.
1080  if (! C.is_null ()) {
1081  CwasFillComplete = C->isFillComplete();
1082  if(CwasFillComplete)
1083  C->resumeFill();
1084  C->setAllToScalar (STS::zero ());
1085  } else {
1086  // FIXME (mfh 08 May 2013) When I first looked at this method, I
1087  // noticed that C was being given the row Map of Aprime (the
1088  // possibly transposed version of A). Is this what we want?
1089 
1090  // It is a precondition that Aprime and Bprime have the same domain and range maps.
1091  // However, they may have different row maps. In this case, it's difficult to
1092  // get a precise upper bound on the number of entries in each local row of C, so
1093  // just use the looser upper bound based on the max number of entries in any row of Aprime and Bprime.
1094  if(Aprime->getRowMap()->isSameAs(*Bprime->getRowMap())) {
1095  LocalOrdinal numLocalRows = Aprime->getLocalNumRows();
1096  Array<size_t> CmaxEntriesPerRow(numLocalRows);
1097  for(LocalOrdinal i = 0; i < numLocalRows; i++) {
1098  CmaxEntriesPerRow[i] = Aprime->getNumEntriesInLocalRow(i) + Bprime->getNumEntriesInLocalRow(i);
1099  }
1100  C = rcp (new crs_matrix_type (Aprime->getRowMap (), CmaxEntriesPerRow()));
1101  }
1102  else {
1103  // Note: above we checked that Aprime and Bprime have global constants, so it's safe to ask for max entries per row.
1104  C = rcp (new crs_matrix_type (Aprime->getRowMap (), Aprime->getGlobalMaxNumRowEntries() + Bprime->getGlobalMaxNumRowEntries()));
1105  }
1106  }
1107 
1108 #ifdef HAVE_TPETRA_DEBUG
1109  TEUCHOS_TEST_FOR_EXCEPTION(Aprime.is_null (), std::logic_error,
1110  prefix << "At this point, Aprime is null. Please report this bug to the Tpetra developers.");
1111  TEUCHOS_TEST_FOR_EXCEPTION(Bprime.is_null (), std::logic_error,
1112  prefix << "At this point, Bprime is null. Please report this bug to the Tpetra developers.");
1113  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::logic_error,
1114  prefix << "At this point, C is null. Please report this bug to the Tpetra developers.");
1115 #endif // HAVE_TPETRA_DEBUG
1116 
1117  Array<RCP<const crs_matrix_type> > Mat =
1118  tuple<RCP<const crs_matrix_type> > (Aprime, Bprime);
1119  Array<Scalar> scalar = tuple<Scalar> (scalarA, scalarB);
1120 
1121  // do a loop over each matrix to add: A reordering might be more efficient
1122  for (int k = 0; k < 2; ++k) {
1123  typename crs_matrix_type::nonconst_global_inds_host_view_type Indices;
1124  typename crs_matrix_type::nonconst_values_host_view_type Values;
1125 
1126  // Loop over each locally owned row of the current matrix (either
1127  // Aprime or Bprime), and sum its entries into the corresponding
1128  // row of C. This works regardless of whether Aprime or Bprime
1129  // has the same row Map as C, because both sumIntoGlobalValues and
1130  // insertGlobalValues allow summing resp. inserting into nonowned
1131  // rows of C.
1132 #ifdef HAVE_TPETRA_DEBUG
1133  TEUCHOS_TEST_FOR_EXCEPTION(Mat[k].is_null (), std::logic_error,
1134  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1135 #endif // HAVE_TPETRA_DEBUG
1136  RCP<const map_type> curRowMap = Mat[k]->getRowMap ();
1137 #ifdef HAVE_TPETRA_DEBUG
1138  TEUCHOS_TEST_FOR_EXCEPTION(curRowMap.is_null (), std::logic_error,
1139  prefix << "At this point, curRowMap is null. Please report this bug to the Tpetra developers.");
1140 #endif // HAVE_TPETRA_DEBUG
1141 
1142  const size_t localNumRows = Mat[k]->getLocalNumRows ();
1143  for (size_t i = 0; i < localNumRows; ++i) {
1144  const GlobalOrdinal globalRow = curRowMap->getGlobalElement (i);
1145  size_t numEntries = Mat[k]->getNumEntriesInGlobalRow (globalRow);
1146  if (numEntries > 0) {
1147  if(numEntries > Indices.extent(0)) {
1148  Kokkos::resize(Indices, numEntries);
1149  Kokkos::resize(Values, numEntries);
1150  }
1151  Mat[k]->getGlobalRowCopy (globalRow, Indices, Values, numEntries);
1152 
1153  if (scalar[k] != STS::one ()) {
1154  for (size_t j = 0; j < numEntries; ++j) {
1155  Values[j] *= scalar[k];
1156  }
1157  }
1158 
1159  if (CwasFillComplete) {
1160  size_t result = C->sumIntoGlobalValues (globalRow, numEntries,
1161  reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1162  TEUCHOS_TEST_FOR_EXCEPTION(result != numEntries, std::logic_error,
1163  prefix << "sumIntoGlobalValues failed to add entries from A or B into C.");
1164  } else {
1165  C->insertGlobalValues (globalRow, numEntries,
1166  reinterpret_cast<Scalar *>(Values.data()), Indices.data());
1167  }
1168  }
1169  }
1170  }
1171  if(CwasFillComplete) {
1172  C->fillComplete(C->getDomainMap (),
1173  C->getRangeMap ());
1174  }
1175 }
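// Illustrative usage sketch: C is passed as a nonconst RCP and may be null,
// in which case it is allocated here on op(A)'s row map. If C was not
// fill-complete on input, the caller finishes it afterwards.
//
//   Teuchos::RCP<Tpetra::CrsMatrix<double> > C;   // null on input
//   Tpetra::MatrixMatrix::Add (A, false, 2.0, B, false, 3.0, C);
//   C->fillComplete (A.getDomainMap (), A.getRangeMap ());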
1176 
1177 // This version of Add takes C as const RCP&, so C must not be null on input. Otherwise, its behavior is identical
1178 // to the above version where C is RCP&.
1179 template <class Scalar,
1180  class LocalOrdinal,
1181  class GlobalOrdinal,
1182  class Node>
1183 void Add(
1184  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1185  bool transposeA,
1186  Scalar scalarA,
1187  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1188  bool transposeB,
1189  Scalar scalarB,
1190  const Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >& C)
1191 {
1192  std::string prefix = "TpetraExt::MatrixMatrix::Add(): ";
1193 
1194  TEUCHOS_TEST_FOR_EXCEPTION(C.is_null (), std::invalid_argument,
1195  prefix << "C must not be null");
1196 
1197  Teuchos::RCP<CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > C_ = C;
1198  Add(A, transposeA, scalarA, B, transposeB, scalarB, C_);
1199 }
1200 
1201 } //End namespace MatrixMatrix
1202 
1203 namespace MMdetails{
1204 
1205 /*********************************************************************************************************/
1207 //template <class TransferType>
1208 //void printMultiplicationStatistics(Teuchos::RCP<TransferType > Transfer, const std::string &label) {
1209 // if (Transfer.is_null())
1210 // return;
1211 //
1212 // const Distributor & Distor = Transfer->getDistributor();
1213 // Teuchos::RCP<const Teuchos::Comm<int> > Comm = Transfer->getSourceMap()->getComm();
1214 //
1215 // size_t rows_send = Transfer->getNumExportIDs();
1216 // size_t rows_recv = Transfer->getNumRemoteIDs();
1217 //
1218 // size_t round1_send = Transfer->getNumExportIDs() * sizeof(size_t);
1219 // size_t round1_recv = Transfer->getNumRemoteIDs() * sizeof(size_t);
1220 // size_t num_send_neighbors = Distor.getNumSends();
1221 // size_t num_recv_neighbors = Distor.getNumReceives();
1222 // size_t round2_send, round2_recv;
1223 // Distor.getLastDoStatistics(round2_send,round2_recv);
1224 //
1225 // int myPID = Comm->getRank();
1226 // int NumProcs = Comm->getSize();
1227 //
1228 // // Processor by processor statistics
1229 // // printf("[%d] %s Statistics: neigh[s/r]=%d/%d rows[s/r]=%d/%d r1bytes[s/r]=%d/%d r2bytes[s/r]=%d/%d\n",
1230 // // myPID, label.c_str(),num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv);
1231 //
1232 // // Global statistics
1233 // size_t lstats[8] = {num_send_neighbors,num_recv_neighbors,rows_send,rows_recv,round1_send,round1_recv,round2_send,round2_recv};
1234 // size_t gstats_min[8], gstats_max[8];
1235 //
1236 // double lstats_avg[8], gstats_avg[8];
1237 // for(int i=0; i<8; i++)
1238 // lstats_avg[i] = ((double)lstats[i])/NumProcs;
1239 //
1240 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MIN,8,lstats,gstats_min);
1241 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_MAX,8,lstats,gstats_max);
1242 // Teuchos::reduceAll(*Comm(),Teuchos::REDUCE_SUM,8,lstats_avg,gstats_avg);
1243 //
1244 // if(!myPID) {
1245 // printf("%s Send Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1246 // (int)gstats_min[0],gstats_avg[0],(int)gstats_max[0], (int)gstats_min[2],gstats_avg[2],(int)gstats_max[2],
1247 // (int)gstats_min[4],gstats_avg[4],(int)gstats_max[4], (int)gstats_min[6],gstats_avg[6],(int)gstats_max[6]);
1248 // printf("%s Recv Statistics[min/avg/max]: neigh=%d/%4.1f/%d rows=%d/%4.1f/%d round1=%d/%4.1f/%d round2=%d/%4.1f/%d\n", label.c_str(),
1249 // (int)gstats_min[1],gstats_avg[1],(int)gstats_max[1], (int)gstats_min[3],gstats_avg[3],(int)gstats_max[3],
1250 // (int)gstats_min[5],gstats_avg[5],(int)gstats_max[5], (int)gstats_min[7],gstats_avg[7],(int)gstats_max[7]);
1251 // }
1252 //}
1253 
1254 // Kernel method for computing the local portion of C = A^T*B
1255 template<class Scalar,
1256  class LocalOrdinal,
1257  class GlobalOrdinal,
1258  class Node>
1259 void mult_AT_B_newmatrix(
1260  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
1261  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& B,
1262  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1263  const std::string & label,
1264  const Teuchos::RCP<Teuchos::ParameterList>& params)
1265 {
1266  using Teuchos::RCP;
1267  using Teuchos::rcp;
1268  typedef Scalar SC;
1269  typedef LocalOrdinal LO;
1270  typedef GlobalOrdinal GO;
1271  typedef Node NO;
1272  typedef CrsMatrixStruct<SC,LO,GO,NO> crs_matrix_struct_type;
1273  typedef RowMatrixTransposer<SC,LO,GO,NO> transposer_type;
1274 
1275 #ifdef HAVE_TPETRA_MMM_TIMINGS
1276  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1277  using Teuchos::TimeMonitor;
1278  RCP<TimeMonitor> MM = rcp (new TimeMonitor
1279  (*TimeMonitor::getNewTimer (prefix_mmm + "MMM-T Transpose")));
1280 #endif
1281 
1282  /*************************************************************/
1283  /* 1) Local Transpose of A */
1284  /*************************************************************/
1285  transposer_type transposer (rcpFromRef (A), label + std::string("XP: "));
1286 
1287  using Teuchos::ParameterList;
1288  RCP<ParameterList> transposeParams (new ParameterList);
1289  transposeParams->set ("sort", true); // Kokkos Kernels spgemm requires inputs to be sorted
1290  if(! params.is_null ()) {
1291  transposeParams->set ("compute global constants",
1292  params->get ("compute global constants: temporaries",
1293  false));
1294  }
1295  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Atrans =
1296  transposer.createTransposeLocal (transposeParams);
1297 
1298  /*************************************************************/
1299  /* 2/3) Call mult_A_B_newmatrix w/ fillComplete */
1300  /*************************************************************/
1301 #ifdef HAVE_TPETRA_MMM_TIMINGS
1302  MM = Teuchos::null;
1303  MM = rcp (new TimeMonitor
1304  (*TimeMonitor::getNewTimer (prefix_mmm + std::string ("MMM-T I&X"))));
1305 #endif
1306 
1307  // Get views, asserting that no import is required to speed up computation
1308  crs_matrix_struct_type Aview;
1309  crs_matrix_struct_type Bview;
1310  RCP<const Import<LO, GO, NO> > dummyImporter;
1311 
1312  // NOTE: the I&X routine sticks an importer on the paramlist as output, so we have to use a unique guy here
1313  RCP<Teuchos::ParameterList> importParams1 (new ParameterList);
1314  if (! params.is_null ()) {
1315  importParams1->set ("compute global constants",
1316  params->get ("compute global constants: temporaries",
1317  false));
1318  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1319  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1320  bool overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck", false);
1321  int mm_optimization_core_count =
1322  ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount ();
1323  mm_optimization_core_count =
1324  slist.get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1325  int mm_optimization_core_count2 =
1326  params->get ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1327  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1328  mm_optimization_core_count = mm_optimization_core_count2;
1329  }
1330  auto & sip1 = importParams1->sublist ("matrixmatrix: kernel params", false);
1331  sip1.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1332  sip1.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1333  sip1.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1334  }
1335 
1336  MMdetails::import_and_extract_views (*Atrans, Atrans->getRowMap (),
1337  Aview, dummyImporter, true,
1338  label, importParams1);
1339 
1340  RCP<ParameterList> importParams2 (new ParameterList);
1341  if (! params.is_null ()) {
1342  importParams2->set ("compute global constants",
1343  params->get ("compute global constants: temporaries",
1344  false));
1345  auto slist = params->sublist ("matrixmatrix: kernel params", false);
1346  bool isMM = slist.get ("isMatrixMatrix_TransferAndFillComplete", false);
1347  bool overrideAllreduce = slist.get ("MM_TAFC_OverrideAllreduceCheck", false);
1348  int mm_optimization_core_count =
1349  ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1350  mm_optimization_core_count =
1351  slist.get ("MM_TAFC_OptimizationCoreCount",
1352  mm_optimization_core_count);
1353  int mm_optimization_core_count2 =
1354  params->get ("MM_TAFC_OptimizationCoreCount",
1355  mm_optimization_core_count);
1356  if (mm_optimization_core_count2 < mm_optimization_core_count) {
1357  mm_optimization_core_count = mm_optimization_core_count2;
1358  }
1359  auto & sip2 = importParams2->sublist ("matrixmatrix: kernel params", false);
1360  sip2.set ("MM_TAFC_OptimizationCoreCount", mm_optimization_core_count);
1361  sip2.set ("isMatrixMatrix_TransferAndFillComplete", isMM);
1362  sip2.set ("MM_TAFC_OverrideAllreduceCheck", overrideAllreduce);
1363  }
1364 
1365  if(B.getRowMap()->isSameAs(*Atrans->getColMap())){
1366  MMdetails::import_and_extract_views(B, B.getRowMap(), Bview, dummyImporter,true, label,importParams2);
1367  }
1368  else {
1369  MMdetails::import_and_extract_views(B, Atrans->getColMap(), Bview, dummyImporter,false, label,importParams2);
1370  }
1371 
1372 #ifdef HAVE_TPETRA_MMM_TIMINGS
1373  MM = Teuchos::null;
1374  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T AB-core"))));
1375 #endif
1376 
1377  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Ctemp;
1378 
1379  // If Atrans has no Exporter, we can use C instead of having to create a temp matrix
1380  bool needs_final_export = ! Atrans->getGraph ()->getExporter ().is_null();
1381  if (needs_final_export) {
1382  Ctemp = rcp (new Tpetra::CrsMatrix<SC, LO, GO, NO> (Atrans->getRowMap (), 0));
1383  }
1384  else {
1385  Ctemp = rcp (&C, false);
1386  }
1387 
1388  mult_A_B_newmatrix(Aview, Bview, *Ctemp, label,params);
1389 
1390  /*************************************************************/
1391  /* 4) exportAndFillComplete matrix */
1392  /*************************************************************/
1393 #ifdef HAVE_TPETRA_MMM_TIMINGS
1394  MM = Teuchos::null;
1395  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM-T exportAndFillComplete"))));
1396 #endif
1397 
1398  RCP<Tpetra::CrsMatrix<SC, LO, GO, NO>> Crcp (&C, false);
1399 
1400  if (needs_final_export) {
1401  ParameterList labelList;
1402  labelList.set("Timer Label", label);
1403  if(!params.is_null()) {
1404  ParameterList& params_sublist = params->sublist("matrixmatrix: kernel params",false);
1405  ParameterList& labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
1406  int mm_optimization_core_count = ::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
1407  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1408  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
1409  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
1410  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count,"Core Count above which the optimized neighbor discovery is used");
1411  bool isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
1412  bool overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
1413 
1414  labelList_subList.set ("isMatrixMatrix_TransferAndFillComplete", isMM,
1415  "This parameter should be set to true only for MatrixMatrix operations: the optimization in Epetra that was ported to Tpetra does _not_ take into account the possibility that for any given source PID, a particular GID may not exist on the target PID: i.e. a transfer operation. A fix for this general case is in development.");
1416  labelList.set("compute global constants",params->get("compute global constants",true));
1417  labelList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
1418  }
1419  Ctemp->exportAndFillComplete (Crcp,
1420  *Ctemp->getGraph ()->getExporter (),
1421  B.getDomainMap (),
1422  A.getDomainMap (),
1423  rcp (&labelList, false));
1424  }
1425 #ifdef HAVE_TPETRA_MMM_STATISTICS
1426  printMultiplicationStatistics(Ctemp->getGraph()->getExporter(), label+std::string(" AT_B MMM"));
1427 #endif
1428 }
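
// Callers normally reach mult_AT_B_newmatrix() indirectly, through
// Tpetra::MatrixMatrix::Multiply() with transposeA = true. The sketch below is
// a minimal, hypothetical usage example (the function name and the choice of
// C's row Map are placeholders, not part of this file); it is guarded by
// "#if 0" so it is never compiled.
#if 0
#include "Tpetra_CrsMatrix.hpp"
#include "TpetraExt_MatrixMatrix.hpp"

void exampleATransposeTimesB (const Tpetra::CrsMatrix<double>& A,
                              const Tpetra::CrsMatrix<double>& B)
{
  using crs_matrix_type = Tpetra::CrsMatrix<double>;
  // For C = A^T * B, C's rows correspond to A's domain Map.
  crs_matrix_type C (A.getDomainMap (), 0);
  const bool transposeA = true;
  const bool transposeB = false;
  // Dispatches (via the transpose branch) to mult_AT_B_newmatrix() above.
  Tpetra::MatrixMatrix::Multiply (A, transposeA, B, transposeB, C);
  // C is now fillComplete'd and holds A^T * B.
}
#endif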
1429 
1430 /*********************************************************************************************************/
1431 // Kernel method for computing the local portion of C = A*B
1432 template<class Scalar,
1433  class LocalOrdinal,
1434  class GlobalOrdinal,
1435  class Node>
1436 void mult_A_B(
1437  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1438  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1439  CrsWrapper<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1440  const std::string& /* label */,
1441  const Teuchos::RCP<Teuchos::ParameterList>& /* params */)
1442 {
1443  using Teuchos::Array;
1444  using Teuchos::ArrayRCP;
1445  using Teuchos::ArrayView;
1446  using Teuchos::OrdinalTraits;
1447  using Teuchos::null;
1448 
1449  typedef Teuchos::ScalarTraits<Scalar> STS;
1450  // TEUCHOS_FUNC_TIME_MONITOR_DIFF("mult_A_B", mult_A_B);
1451  LocalOrdinal C_firstCol = Bview.colMap->getMinLocalIndex();
1452  LocalOrdinal C_lastCol = Bview.colMap->getMaxLocalIndex();
1453 
1454  LocalOrdinal C_firstCol_import = OrdinalTraits<LocalOrdinal>::zero();
1455  LocalOrdinal C_lastCol_import = OrdinalTraits<LocalOrdinal>::invalid();
1456 
1457  ArrayView<const GlobalOrdinal> bcols = Bview.colMap->getLocalElementList();
1458  ArrayView<const GlobalOrdinal> bcols_import = null;
1459  if (Bview.importColMap != null) {
1460  C_firstCol_import = Bview.importColMap->getMinLocalIndex();
1461  C_lastCol_import = Bview.importColMap->getMaxLocalIndex();
1462 
1463  bcols_import = Bview.importColMap->getLocalElementList();
1464  }
1465 
1466  size_t C_numCols = C_lastCol - C_firstCol +
1467  OrdinalTraits<LocalOrdinal>::one();
1468  size_t C_numCols_import = C_lastCol_import - C_firstCol_import +
1469  OrdinalTraits<LocalOrdinal>::one();
1470 
1471  if (C_numCols_import > C_numCols)
1472  C_numCols = C_numCols_import;
1473 
1474  Array<Scalar> dwork = Array<Scalar>(C_numCols);
1475  Array<GlobalOrdinal> iwork = Array<GlobalOrdinal>(C_numCols);
1476  Array<size_t> iwork2 = Array<size_t>(C_numCols);
1477 
1478  Array<Scalar> C_row_i = dwork;
1479  Array<GlobalOrdinal> C_cols = iwork;
1480  Array<size_t> c_index = iwork2;
1481  Array<GlobalOrdinal> combined_index = Array<GlobalOrdinal>(2*C_numCols);
1482  Array<Scalar> combined_values = Array<Scalar>(2*C_numCols);
1483 
1484  size_t C_row_i_length, j, k, last_index;
1485 
1486  // Run through all the hash table lookups once and for all
1487  LocalOrdinal LO_INVALID = OrdinalTraits<LocalOrdinal>::invalid();
1488  Array<LocalOrdinal> Acol2Brow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1489  Array<LocalOrdinal> Acol2Irow(Aview.colMap->getLocalNumElements(),LO_INVALID);
1490  if(Aview.colMap->isSameAs(*Bview.origMatrix->getRowMap())){
1491  // Maps are the same: Use local IDs as the hash
1492  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1493  Aview.colMap->getMaxLocalIndex(); i++)
1494  Acol2Brow[i]=i;
1495  }
1496  else {
1497  // Maps are not the same: Use the map's hash
1498  for(LocalOrdinal i=Aview.colMap->getMinLocalIndex(); i <=
1499  Aview.colMap->getMaxLocalIndex(); i++) {
1500  GlobalOrdinal GID = Aview.colMap->getGlobalElement(i);
1501  LocalOrdinal BLID = Bview.origMatrix->getRowMap()->getLocalElement(GID);
1502  if(BLID != LO_INVALID) Acol2Brow[i] = BLID;
1503  else Acol2Irow[i] = Bview.importMatrix->getRowMap()->getLocalElement(GID);
1504  }
1505  }
1506 
1507  // To form C = A*B we're going to execute this expression:
1508  //
1509  // C(i,j) = sum_k( A(i,k)*B(k,j) )
1510  //
1511  // Our goal, of course, is to navigate the data in A and B once, without
1512  // performing searches for column-indices, etc.
1513  auto Arowptr = Aview.origMatrix->getLocalRowPtrsHost();
1514  auto Acolind = Aview.origMatrix->getLocalIndicesHost();
1515  auto Avals = Aview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1516  auto Browptr = Bview.origMatrix->getLocalRowPtrsHost();
1517  auto Bcolind = Bview.origMatrix->getLocalIndicesHost();
1518  auto Bvals = Bview.origMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1519  decltype(Browptr) Irowptr;
1520  decltype(Bcolind) Icolind;
1521  decltype(Bvals) Ivals;
1522  if(!Bview.importMatrix.is_null()) {
1523  Irowptr = Bview.importMatrix->getLocalRowPtrsHost();
1524  Icolind = Bview.importMatrix->getLocalIndicesHost();
1525  Ivals = Bview.importMatrix->getLocalValuesHost(Tpetra::Access::ReadOnly);
1526  }
1527 
1528  bool C_filled = C.isFillComplete();
1529 
1530  for (size_t i = 0; i < C_numCols; i++)
1531  c_index[i] = OrdinalTraits<size_t>::invalid();
1532 
1533  // Loop over the rows of A.
1534  size_t Arows = Aview.rowMap->getLocalNumElements();
1535  for(size_t i=0; i<Arows; ++i) {
1536 
1537  // Only navigate the local portion of Aview... which is, thankfully, all of
1538  // A since this routine doesn't do transpose modes
1539  GlobalOrdinal global_row = Aview.rowMap->getGlobalElement(i);
1540 
1541  // Loop across the i-th row of A and for each corresponding row in B, loop
1542  // across columns and accumulate product A(i,k)*B(k,j) into our partial sum
1543  // quantities C_row_i. In other words, as we stride across B(k,:) we're
1544  // calculating updates for row i of the result matrix C.
1545  C_row_i_length = OrdinalTraits<size_t>::zero();
1546 
1547  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1548  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1549  const Scalar Aval = Avals[k];
1550  if (Aval == STS::zero())
1551  continue;
1552 
1553  if (Ak == LO_INVALID)
1554  continue;
1555 
1556  for (j = Browptr[Ak]; j < Browptr[Ak+1]; ++j) {
1557  LocalOrdinal col = Bcolind[j];
1558  //assert(col >= 0 && col < C_numCols);
1559 
1560  if (c_index[col] == OrdinalTraits<size_t>::invalid()){
1561  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1562  // This has to be a += so insertGlobalValue goes out
1563  C_row_i[C_row_i_length] = Aval*Bvals[j];
1564  C_cols[C_row_i_length] = col;
1565  c_index[col] = C_row_i_length;
1566  C_row_i_length++;
1567 
1568  } else {
1569  // static cast from impl_scalar_type to Scalar needed for complex
1570  C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Bvals[j]);
1571  }
1572  }
1573  }
1574 
1575  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1576  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1577  C_cols[ii] = bcols[C_cols[ii]];
1578  combined_index[ii] = C_cols[ii];
1579  combined_values[ii] = C_row_i[ii];
1580  }
1581  last_index = C_row_i_length;
1582 
1583  //
1584  //Now put the C_row_i values into C.
1585  //
1586  // We might have to revamp this later.
1587  C_row_i_length = OrdinalTraits<size_t>::zero();
1588 
1589  for (k = Arowptr[i]; k < Arowptr[i+1]; ++k) {
1590  LocalOrdinal Ak = Acol2Brow[Acolind[k]];
1591  const Scalar Aval = Avals[k];
1592  if (Aval == STS::zero())
1593  continue;
1594 
1595  if (Ak!=LO_INVALID) continue;
1596 
1597  Ak = Acol2Irow[Acolind[k]];
1598  for (j = Irowptr[Ak]; j < Irowptr[Ak+1]; ++j) {
1599  LocalOrdinal col = Icolind[j];
1600  //assert(col >= 0 && col < C_numCols);
1601 
1602  if (c_index[col] == OrdinalTraits<size_t>::invalid()) {
1603  //assert(C_row_i_length >= 0 && C_row_i_length < C_numCols);
1604  // This has to be a += so insertGlobalValue goes out
1605  C_row_i[C_row_i_length] = Aval*Ivals[j];
1606  C_cols[C_row_i_length] = col;
1607  c_index[col] = C_row_i_length;
1608  C_row_i_length++;
1609 
1610  } else {
1611  // This has to be a += so insertGlobalValue goes out
1612  // static cast from impl_scalar_type to Scalar needed for complex
1613  C_row_i[c_index[col]] += Aval * static_cast<Scalar>(Ivals[j]);
1614  }
1615  }
1616  }
1617 
1618  for (size_t ii = 0; ii < C_row_i_length; ii++) {
1619  c_index[C_cols[ii]] = OrdinalTraits<size_t>::invalid();
1620  C_cols[ii] = bcols_import[C_cols[ii]];
1621  combined_index[last_index] = C_cols[ii];
1622  combined_values[last_index] = C_row_i[ii];
1623  last_index++;
1624  }
1625 
1626  // Now put the C_row_i values into C.
1627  // We might have to revamp this later.
1628  C_filled ?
1629  C.sumIntoGlobalValues(
1630  global_row,
1631  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1632  combined_values.view(OrdinalTraits<size_t>::zero(), last_index))
1633  :
1634  C.insertGlobalValues(
1635  global_row,
1636  combined_index.view(OrdinalTraits<size_t>::zero(), last_index),
1637  combined_values.view(OrdinalTraits<size_t>::zero(), last_index));
1638 
1639  }
1640 }
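
// The row-accumulation pattern in mult_A_B() above relies on a dense scatter
// array (c_index) that records where each column was deposited in the current
// row, and is reset afterwards only at the touched positions. The standalone
// sketch below (guarded by "#if 0", using plain STL containers instead of
// Tpetra types, with hypothetical names) illustrates that technique in
// isolation.
#if 0
#include <cstddef>
#include <limits>
#include <vector>

// Accumulate one sparse row: repeated column indices are summed. 'scatter'
// must have one slot per possible column, all initialized to INVALID; it is
// restored to that state before returning, touching only the used slots.
void accumulateSparseRow (const std::vector<int>& cols,
                          const std::vector<double>& vals,
                          std::vector<std::size_t>& scatter,
                          std::vector<int>& outCols,
                          std::vector<double>& outVals)
{
  const std::size_t INVALID = std::numeric_limits<std::size_t>::max ();
  for (std::size_t k = 0; k < cols.size (); ++k) {
    const int c = cols[k];
    if (scatter[c] == INVALID) {      // first contribution to column c
      scatter[c] = outCols.size ();
      outCols.push_back (c);
      outVals.push_back (vals[k]);
    }
    else {                            // repeated column: sum into the entry
      outVals[scatter[c]] += vals[k];
    }
  }
  for (const int c : outCols) {       // cheap reset: only the touched entries
    scatter[c] = INVALID;
  }
}
#endif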
1641 
1642 /*********************************************************************************************************/
1643 template<class Scalar,
1644  class LocalOrdinal,
1645  class GlobalOrdinal,
1646  class Node>
1647 void setMaxNumEntriesPerRow(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview) {
1648  typedef typename Teuchos::Array<Teuchos::ArrayView<const LocalOrdinal> >::size_type local_length_size;
1649  Mview.maxNumRowEntries = Teuchos::OrdinalTraits<local_length_size>::zero();
1650 
1651  if (Mview.indices.size() > Teuchos::OrdinalTraits<local_length_size>::zero()) {
1652  Mview.maxNumRowEntries = Mview.indices[0].size();
1653 
1654  for (local_length_size i = 1; i < Mview.indices.size(); ++i)
1655  if (Mview.indices[i].size() > Mview.maxNumRowEntries)
1656  Mview.maxNumRowEntries = Mview.indices[i].size();
1657  }
1658 }
1659 
1660 /*********************************************************************************************************/
1661 template<class CrsMatrixType>
1662 size_t C_estimate_nnz(CrsMatrixType & A, CrsMatrixType &B){
1663  // Follows the NZ estimate in ML's ml_matmatmult.c
1664  size_t Aest = 100, Best=100;
1665  if (A.getLocalNumEntries() >= A.getLocalNumRows())
1666  Aest = (A.getLocalNumRows() > 0) ? A.getLocalNumEntries()/A.getLocalNumRows() : 100;
1667  if (B.getLocalNumEntries() >= B.getLocalNumRows())
1668  Best = (B.getLocalNumRows() > 0) ? B.getLocalNumEntries()/B.getLocalNumRows() : 100;
1669 
1670  size_t nnzperrow = (size_t)(sqrt((double)Aest) + sqrt((double)Best) - 1);
1671  nnzperrow *= nnzperrow;
1672 
1673  return (size_t)(A.getLocalNumRows()*nnzperrow*0.75 + 100);
1674 }
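
// Worked example of the heuristic above (numbers are illustrative only): if A
// averages 9 nonzeros per local row and B averages 16, then
//   nnzperrow = (sqrt(9) + sqrt(16) - 1)^2 = (3 + 4 - 1)^2 = 36,
// and for 1000 local rows of A the returned estimate is
//   1000 * 36 * 0.75 + 100 = 27100 entries.
// This is only a starting allocation; callers such as the newmatrix kernel
// below grow the arrays geometrically if a row overflows the estimate.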
1675 
1676 
1677 /*********************************************************************************************************/
1678 // Kernel method for computing the local portion of C = A*B for CrsMatrix
1679 //
1680 // mfh 27 Sep 2016: Currently, mult_AT_B_newmatrix() also calls this
1681 // function, so this is probably the function we want to
1682 // thread-parallelize.
1683 template<class Scalar,
1684  class LocalOrdinal,
1685  class GlobalOrdinal,
1686  class Node>
1687 void mult_A_B_newmatrix(
1688  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1689  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1690  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
1691  const std::string& label,
1692  const Teuchos::RCP<Teuchos::ParameterList>& params)
1693 {
1694  using Teuchos::Array;
1695  using Teuchos::ArrayRCP;
1696  using Teuchos::ArrayView;
1697  using Teuchos::RCP;
1698  using Teuchos::rcp;
1699 
1700  // Tpetra typedefs
1701  typedef LocalOrdinal LO;
1702  typedef GlobalOrdinal GO;
1703  typedef Node NO;
1704  typedef Import<LO,GO,NO> import_type;
1705  typedef Map<LO,GO,NO> map_type;
1706 
1707  // Kokkos typedefs
1708  typedef typename map_type::local_map_type local_map_type;
1709  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
1710  typedef typename KCRS::StaticCrsGraphType graph_t;
1711  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
1712  typedef typename NO::execution_space execution_space;
1713  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1714  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1715 
1716 #ifdef HAVE_TPETRA_MMM_TIMINGS
1717  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
1718  using Teuchos::TimeMonitor;
1719  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*(TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM M5 Cmap")))));
1720 #endif
1721  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1722 
1723  // Build the final importer / column map, hash table lookups for C
1724  RCP<const import_type> Cimport;
1725  RCP<const map_type> Ccolmap;
1726  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1727  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1728  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1729  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1730  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1731  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1732  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1733  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1734 
1735  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
1736  // indices of B, to local column indices of C. (B and C have the
1737  // same number of columns.) The kernel uses this, instead of
1738  // copying the entire input matrix B and converting its column
1739  // indices to those of C.
1740  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1741 
1742  if (Bview.importMatrix.is_null()) {
1743  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1744  Cimport = Bimport;
1745  Ccolmap = Bview.colMap;
1746  const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1747  // Bcol2Ccol is trivial
1748  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1749  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1750  KOKKOS_LAMBDA(const LO i) {
1751  Bcol2Ccol(i) = i;
1752  });
1753  }
1754  else {
1755  // mfh 27 Sep 2016: B has "remotes," so we need to build the
1756  // column Map of C, as well as C's Import object (from its domain
1757  // Map to its column Map). C's column Map is the union of the
1758  // column Maps of (the local part of) B, and the "remote" part of
1759  // B. Ditto for the Import. We have optimized this "setUnion"
1760  // operation on Import objects and Maps.
1761 
1762  // Choose the right variant of setUnion
1763  if (!Bimport.is_null() && !Iimport.is_null()) {
1764  Cimport = Bimport->setUnion(*Iimport);
1765  }
1766  else if (!Bimport.is_null() && Iimport.is_null()) {
1767  Cimport = Bimport->setUnion();
1768  }
1769  else if (Bimport.is_null() && !Iimport.is_null()) {
1770  Cimport = Iimport->setUnion();
1771  }
1772  else {
1773  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1774  }
1775  Ccolmap = Cimport->getTargetMap();
1776 
1777  // FIXME (mfh 27 Sep 2016) This error check requires an all-reduce
1778  // in general. We should get rid of it in order to reduce
1779  // communication costs of sparse matrix-matrix multiply.
1780  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
1781  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
1782 
1783  // NOTE: This is not efficient and should be folded into setUnion
1784  //
1785  // mfh 27 Sep 2016: What the above comment means is that the
1786  // setUnion operation on Import objects could also compute these
1787  // local index - to - local index look-up tables.
1788  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1789  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1790  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1791  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1792  });
1793  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
1794  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1795  });
1796 
1797  }
1798 
1799  // Replace the column map
1800  //
1801  // mfh 27 Sep 2016: We do this because C was originally created
1802  // without a column Map. Now we have its column Map.
1803  C.replaceColMap(Ccolmap);
1804 
1805  // mfh 27 Sep 2016: Construct tables that map from local column
1806  // indices of A, to local row indices of either B_local (the locally
1807  // owned part of B), or B_remote (the "imported" remote part of B).
1808  //
1809  // For column index Aik in row i of A, if the corresponding row of B
1810  // exists in the local part of B ("orig") (which I'll call B_local),
1811  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1812  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1813  //
1814  // For column index Aik in row i of A, if the corresponding row of B
1815  // exists in the remote part of B ("Import") (which I'll call
1816  // B_remote), then targetMapToImportRow[Aik] is the local index of
1817  // that row of B. Otherwise, targetMapToImportRow[Aik] is "invalid"
1818  // (a flag value).
1819 
1820  // Run through all the hash table lookups once and for all
1821  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
1822  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
1823 
1824  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
1825  GO aidx = Acolmap_local.getGlobalElement(i);
1826  LO B_LID = Browmap_local.getLocalElement(aidx);
1827  if (B_LID != LO_INVALID) {
1828  targetMapToOrigRow(i) = B_LID;
1829  targetMapToImportRow(i) = LO_INVALID;
1830  } else {
1831  LO I_LID = Irowmap_local.getLocalElement(aidx);
1832  targetMapToOrigRow(i) = LO_INVALID;
1833  targetMapToImportRow(i) = I_LID;
1834 
1835  }
1836  });
1837 
1838  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
1839  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
1840  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_newmatrix_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
1841 
1842 }
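
// The Bcol2Ccol / targetMapToOrigRow / targetMapToImportRow views above are
// local-index translation tables, built once so that the multiply kernel never
// performs a Map lookup per matrix entry. The standalone sketch below (guarded
// by "#if 0", using std::unordered_map instead of Tpetra Maps, with
// hypothetical names) shows the same precomputation idea.
#if 0
#include <cstddef>
#include <unordered_map>
#include <vector>

// Build a source-LID -> target-LID table: srcGIDs[lid] is the global index of
// source local index 'lid'; tgtGIDToLID maps a global index to its local index
// in the target map (if present). Entries not found are flagged INVALID (-1),
// exactly the role LO_INVALID plays in the tables above.
std::vector<int> buildLidTranslationTable (const std::vector<long long>& srcGIDs,
                                           const std::unordered_map<long long, int>& tgtGIDToLID)
{
  const int INVALID = -1;
  std::vector<int> table (srcGIDs.size (), INVALID);
  for (std::size_t lid = 0; lid < srcGIDs.size (); ++lid) {
    const auto it = tgtGIDToLID.find (srcGIDs[lid]);
    if (it != tgtGIDToLID.end ()) {
      table[lid] = it->second;
    }
  }
  return table; // table[srcLID] is the target LID, or INVALID if absent
}
#endif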
1843 
1844 /*********************************************************************************************************/
1845 // Kernel method for computing the local portion of C = A*B for BlockCrsMatrix
1846 template<class Scalar,
1847  class LocalOrdinal,
1848  class GlobalOrdinal,
1849  class Node>
1850 void mult_A_B_newmatrix(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
1851  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
1852  Teuchos::RCP<BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> > &C)
1853 {
1854  using Teuchos::null;
1855  using Teuchos::Array;
1856  using Teuchos::ArrayRCP;
1857  using Teuchos::ArrayView;
1858  using Teuchos::RCP;
1859  using Teuchos::rcp;
1860 
1861  // Tpetra typedefs
1862  typedef LocalOrdinal LO;
1863  typedef GlobalOrdinal GO;
1864  typedef Node NO;
1865  typedef Import<LO,GO,NO> import_type;
1866  typedef Map<LO,GO,NO> map_type;
1867  typedef BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> block_crs_matrix_type;
1868  typedef typename block_crs_matrix_type::crs_graph_type graph_t;
1869 
1870  // Kokkos typedefs
1871  typedef typename map_type::local_map_type local_map_type;
1872  typedef typename block_crs_matrix_type::local_matrix_device_type KBSR;
1873  typedef typename KBSR::device_type device_t;
1874  typedef typename KBSR::StaticCrsGraphType static_graph_t;
1875  typedef typename static_graph_t::row_map_type::non_const_type lno_view_t;
1876  typedef typename static_graph_t::entries_type::non_const_type lno_nnz_view_t;
1877  typedef typename KBSR::values_type::non_const_type scalar_view_t;
1878  typedef typename NO::execution_space execution_space;
1879  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
1880  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
1881 
1882  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
1883 
1884  // Build the final importer / column map, hash table lookups for C
1885  RCP<const import_type> Cimport;
1886  RCP<const map_type> Ccolmap;
1887  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
1888  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
1889  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
1890  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
1891  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
1892  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
1893  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
1894  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
1895 
1896  // Bcol2Ccol is a table that maps from local column
1897  // indices of B, to local column indices of C. (B and C have the
1898  // same number of columns.) The kernel uses this, instead of
1899  // copying the entire input matrix B and converting its column
1900  // indices to those of C.
1901  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
1902 
1903  if (Bview.importMatrix.is_null()) {
1904  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
1905  Cimport = Bimport;
1906  Ccolmap = Bview.colMap;
1907  const LO colMapSize = static_cast<LO>(Bview.colMap->getLocalNumElements());
1908  // Bcol2Ccol is trivial
1909  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_fill",
1910  Kokkos::RangePolicy<execution_space, LO>(0, colMapSize),
1911  KOKKOS_LAMBDA(const LO i) {
1912  Bcol2Ccol(i) = i;
1913  });
1914  }
1915  else {
1916  // B has "remotes," so we need to build the
1917  // column Map of C, as well as C's Import object (from its domain
1918  // Map to its column Map). C's column Map is the union of the
1919  // column Maps of (the local part of) B, and the "remote" part of
1920  // B. Ditto for the Import. We have optimized this "setUnion"
1921  // operation on Import objects and Maps.
1922 
1923  // Choose the right variant of setUnion
1924  if (!Bimport.is_null() && !Iimport.is_null()) {
1925  Cimport = Bimport->setUnion(*Iimport);
1926  }
1927  else if (!Bimport.is_null() && Iimport.is_null()) {
1928  Cimport = Bimport->setUnion();
1929  }
1930  else if (Bimport.is_null() && !Iimport.is_null()) {
1931  Cimport = Iimport->setUnion();
1932  }
1933  else {
1934  throw std::runtime_error("TpetraExt::MMM status of matrix importers is nonsensical");
1935  }
1936  Ccolmap = Cimport->getTargetMap();
1937 
1938  // NOTE: This is not efficient and should be folded into setUnion
1939  //
1940  // What the above comment means is that the
1941  // setUnion operation on Import objects could also compute these
1942  // local index - to - local index look-up tables.
1943  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
1944  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
1945  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Bcol2Ccol_getGlobalElement",
1946  range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),
1947  KOKKOS_LAMBDA(const LO i) {
1948  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
1949  });
1950  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::Icol2Ccol_getGlobalElement",
1951  range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),
1952  KOKKOS_LAMBDA(const LO i) {
1953  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
1954  });
1955  }
1956 
1957  // Construct tables that map from local column
1958  // indices of A, to local row indices of either B_local (the locally
1959  // owned part of B), or B_remote (the "imported" remote part of B).
1960  //
1961  // For column index Aik in row i of A, if the corresponding row of B
1962  // exists in the local part of B ("orig") (which I'll call B_local),
1963  // then targetMapToOrigRow[Aik] is the local index of that row of B.
1964  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
1965  //
1966  // For column index Aik in row i of A, if the corresponding row of B
1967  // exists in the remote part of B ("Import") (which I'll call
1968  // B_remote), then targetMapToImportRow[Aik] is the local index of
1969  // that row of B. Otherwise, targetMapToImportRow[Aik] is "invalid"
1970  // (a flag value).
1971 
1972  // Run through all the hash table lookups once and for all
1973  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),
1974  Aview.colMap->getLocalNumElements());
1975  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),
1976  Aview.colMap->getLocalNumElements());
1977 
1978  Kokkos::parallel_for("Tpetra::mult_A_B_newmatrix::construct_tables",
1979  range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),
1980  KOKKOS_LAMBDA(const LO i) {
1981  GO aidx = Acolmap_local.getGlobalElement(i);
1982  LO B_LID = Browmap_local.getLocalElement(aidx);
1983  if (B_LID != LO_INVALID) {
1984  targetMapToOrigRow(i) = B_LID;
1985  targetMapToImportRow(i) = LO_INVALID;
1986  } else {
1987  LO I_LID = Irowmap_local.getLocalElement(aidx);
1988  targetMapToOrigRow(i) = LO_INVALID;
1989  targetMapToImportRow(i) = I_LID;
1990  }
1991  });
1992 
1993  // Create the KernelHandle
1994  using KernelHandle =
1995  KokkosKernels::Experimental::KokkosKernelsHandle<typename lno_view_t::const_value_type,
1996  typename lno_nnz_view_t::const_value_type,
1997  typename scalar_view_t::const_value_type,
1998  typename device_t::execution_space,
1999  typename device_t::memory_space,
2000  typename device_t::memory_space>;
2001  int team_work_size = 16; // Defaults to 16 as per Deveci 12/7/16 - csiefer
2002  std::string myalg("SPGEMM_KK_MEMORY");
2003  KokkosSparse::SPGEMMAlgorithm alg_enum = KokkosSparse::StringToSPGEMMAlgorithm(myalg);
2004 
2005  KernelHandle kh;
2006  kh.create_spgemm_handle(alg_enum);
2007  kh.set_team_work_size(team_work_size);
2008 
2009  // Get KokkosSparse::BsrMatrix for A and Bmerged (B and BImport)
2010  const KBSR Amat = Aview.origMatrix->getLocalMatrixDevice();
2011  const KBSR Bmerged = Tpetra::MMdetails::merge_matrices(Aview,Bview,
2012  targetMapToOrigRow,targetMapToImportRow,
2013  Bcol2Ccol,Icol2Ccol,
2014  Ccolmap.getConst()->getLocalNumElements());
2015 
2016  RCP<graph_t> graphC;
2017  typename KBSR::values_type values;
2018  {
2019  // Call KokkosSparse routines to calculate Amat*Bmerged on device.
2020  // NOTE: Need to scope guard this since the BlockCrs constructor will need to copy the host graph
2021  KBSR Cmat;
2022  KokkosSparse::block_spgemm_symbolic(kh, Amat, false, Bmerged, false, Cmat);
2023  KokkosSparse::block_spgemm_numeric (kh, Amat, false, Bmerged, false, Cmat);
2024  kh.destroy_spgemm_handle();
2025 
2026  // Build Tpetra::BlockCrsMatrix from KokkosSparse::BsrMatrix
2027  graphC = rcp(new graph_t(Cmat.graph, Aview.origMatrix->getRowMap(), Ccolmap.getConst()));
2028  values = Cmat.values;
2029  }
2030  C = rcp (new block_crs_matrix_type (*graphC, values, Aview.blocksize));
2031 
2032 }
2033 
2034 /*********************************************************************************************************/
2035 // AB NewMatrix Kernel wrappers (Default non-threaded version for CrsMatrix)
2036 template<class Scalar,
2037  class LocalOrdinal,
2038  class GlobalOrdinal,
2039  class Node,
2040  class LocalOrdinalViewType>
2041 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_newmatrix_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2042  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2043  const LocalOrdinalViewType & targetMapToOrigRow,
2044  const LocalOrdinalViewType & targetMapToImportRow,
2045  const LocalOrdinalViewType & Bcol2Ccol,
2046  const LocalOrdinalViewType & Icol2Ccol,
2047  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2048  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2049  const std::string& label,
2050  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2051 #ifdef HAVE_TPETRA_MMM_TIMINGS
2052  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2053  using Teuchos::TimeMonitor;
2054  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore"))));
2055 #endif
2056 
2057  using Teuchos::Array;
2058  using Teuchos::ArrayRCP;
2059  using Teuchos::ArrayView;
2060  using Teuchos::RCP;
2061  using Teuchos::rcp;
2062 
2063  // Lots and lots of typedefs
2064  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2065  typedef typename KCRS::StaticCrsGraphType graph_t;
2066  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2067  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2068  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2069  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2070 
2071  typedef Scalar SC;
2072  typedef LocalOrdinal LO;
2073  typedef GlobalOrdinal GO;
2074  typedef Node NO;
2075  typedef Map<LO,GO,NO> map_type;
2076  const size_t ST_INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2077  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2078  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2079 
2080  // Sizes
2081  RCP<const map_type> Ccolmap = C.getColMap();
2082  size_t m = Aview.origMatrix->getLocalNumRows();
2083  size_t n = Ccolmap->getLocalNumElements();
2084  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2085 
2086  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2087  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2088  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2089 
2090  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2091  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2092  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2093 
2094  c_lno_view_t Irowptr;
2095  lno_nnz_view_t Icolind;
2096  scalar_view_t Ivals;
2097  if(!Bview.importMatrix.is_null()) {
2098  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2099  Irowptr = lclB.graph.row_map;
2100  Icolind = lclB.graph.entries;
2101  Ivals = lclB.values;
2102  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2103  }
2104 
2105 #ifdef HAVE_TPETRA_MMM_TIMINGS
2106  RCP<TimeMonitor> MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix SerialCore - Compare"))));
2107 #endif
2108 
2109  // Classic csr assembly (low memory edition)
2110  //
2111  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2112  // The method loops over rows of A, and may resize after processing
2113  // each row. Chris Siefert says that this reflects experience in
2114  // ML; for the non-threaded case, ML found it faster to spend less
2115  // effort on estimation and risk an occasional reallocation.
2116  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2117  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2118  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2119  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2120 
2121  // mfh 27 Sep 2016: The c_status array is an implementation detail
2122  // of the local sparse matrix-matrix multiply routine.
2123 
2124  // The status array will contain the index into colind where this entry was last deposited.
2125  // c_status[i] < CSR_ip - not in the row yet
2126  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2127  // We start with this filled with INVALID's indicating that there are no entries yet.
2128  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2129  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2130  std::vector<size_t> c_status(n, ST_INVALID);
2131 
2132  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2133  // routine. The routine computes C := A * (B_local + B_remote).
2134  //
2135  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2136  // you whether the corresponding row of B belongs to B_local
2137  // ("orig") or B_remote ("Import").
2138 
2139  // For each row of A/C
2140  size_t CSR_ip = 0, OLD_ip = 0;
2141  for (size_t i = 0; i < m; i++) {
2142  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2143  // on the calling process.
2144  Crowptr[i] = CSR_ip;
2145 
2146  // mfh 27 Sep 2016: For each entry of A in the current row of A
2147  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2148  LO Aik = Acolind[k]; // local column index of current entry of A
2149  const SC Aval = Avals[k]; // value of current entry of A
2150  if (Aval == SC_ZERO)
2151  continue; // skip explicitly stored zero values in A
2152 
2153  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2154  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2155  // corresponding to the current entry of A is populated, then
2156  // the corresponding row of B is in B_local (i.e., it lives on
2157  // the calling process).
2158 
2159  // Local matrix
2160  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2161 
2162  // mfh 27 Sep 2016: Go through all entries in that row of B_local.
2163  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2164  LO Bkj = Bcolind[j];
2165  LO Cij = Bcol2Ccol[Bkj];
2166 
2167  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2168  // New entry
2169  c_status[Cij] = CSR_ip;
2170  Ccolind[CSR_ip] = Cij;
2171  Cvals[CSR_ip] = Aval*Bvals[j];
2172  CSR_ip++;
2173 
2174  } else {
2175  Cvals[c_status[Cij]] += Aval*Bvals[j];
2176  }
2177  }
2178 
2179  } else {
2180  // mfh 27 Sep 2016: If the entry of targetMapToOrigRow
2181  // corresponding to the current entry of A is NOT populated (has
2182  // a flag "invalid" value), then the corresponding row of B is
2183  // in B_remote (i.e., it was imported from another process).
2184 
2185  // Remote matrix
2186  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2187  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2188  LO Ikj = Icolind[j];
2189  LO Cij = Icol2Ccol[Ikj];
2190 
2191  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip){
2192  // New entry
2193  c_status[Cij] = CSR_ip;
2194  Ccolind[CSR_ip] = Cij;
2195  Cvals[CSR_ip] = Aval*Ivals[j];
2196  CSR_ip++;
2197  } else {
2198  Cvals[c_status[Cij]] += Aval*Ivals[j];
2199  }
2200  }
2201  }
2202  }
2203 
2204  // Resize for next pass if needed
2205  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1])*b_max_nnz_per_row) > CSR_alloc) {
2206  CSR_alloc *= 2;
2207  Kokkos::resize(Ccolind,CSR_alloc);
2208  Kokkos::resize(Cvals,CSR_alloc);
2209  }
2210  OLD_ip = CSR_ip;
2211  }
2212 
2213  Crowptr[m] = CSR_ip;
2214 
2215  // Downward resize
2216  Kokkos::resize(Ccolind,CSR_ip);
2217  Kokkos::resize(Cvals,CSR_ip);
2218 
2219 #ifdef HAVE_TPETRA_MMM_TIMINGS
2220  {
2221  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix Final Sort")));
2222 #endif
2223 
2224  // Final sort & set of CRS arrays
2225  if (params.is_null() || params->get("sort entries",true))
2226  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2227  C.setAllValues(Crowptr,Ccolind, Cvals);
2228 
2229 
2230 #ifdef HAVE_TPETRA_MMM_TIMINGS
2231  }
2232  auto MM4(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Newmatrix ESFC")));
2233  {
2234 #endif
2235 
2236  // Final FillComplete
2237  //
2238  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2239  // Import (from domain Map to column Map) construction (which costs
2240  // lots of communication) by taking the previously constructed
2241  // Import object. We should be able to do this without interfering
2242  // with the implementation of the local part of sparse matrix-matrix
2243  // multiply above.
2244  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2245  labelList->set("Timer Label",label);
2246  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2247  RCP<const Export<LO,GO,NO> > dummyExport;
2248  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport, dummyExport, labelList);
2249 #ifdef HAVE_TPETRA_MMM_TIMINGS
2250  }
2251  MM2 = Teuchos::null;
2252 #endif
2253 
2254 }
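
// The c_status array in the kernel above avoids any per-row reset by storing
// the CSR position of each column's most recent deposit: a value below the
// current row's starting offset is simply stale data from an earlier row. The
// standalone sketch below (guarded by "#if 0", on toy STL inputs with
// hypothetical names) isolates that trick.
#if 0
#include <cstddef>
#include <limits>
#include <vector>

// CSR assembly where duplicate columns within a row are summed, without ever
// clearing the status array between rows: a status value below the current
// row's start is treated as "not yet present in this row".
void assembleCsrRows (const std::size_t numCols,
                      const std::vector<std::vector<int>>& rowCols,
                      const std::vector<std::vector<double>>& rowVals,
                      std::vector<std::size_t>& Crowptr,
                      std::vector<int>& Ccolind,
                      std::vector<double>& Cvals)
{
  const std::size_t INVALID = std::numeric_limits<std::size_t>::max ();
  std::vector<std::size_t> status (numCols, INVALID);
  Crowptr.assign (rowCols.size () + 1, 0);
  for (std::size_t i = 0; i < rowCols.size (); ++i) {
    const std::size_t rowStart = Ccolind.size ();
    Crowptr[i] = rowStart;
    for (std::size_t k = 0; k < rowCols[i].size (); ++k) {
      const int c = rowCols[i][k];
      if (status[c] == INVALID || status[c] < rowStart) { // not yet in row i
        status[c] = Ccolind.size ();
        Ccolind.push_back (c);
        Cvals.push_back (rowVals[i][k]);
      }
      else {                                              // already in row i
        Cvals[status[c]] += rowVals[i][k];
      }
    }
  }
  Crowptr[rowCols.size ()] = Ccolind.size ();
}
#endif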
2255 /*********************************************************************************************************/
2256 // Kernel method for computing the local portion of C = A*B (reuse)
2257 template<class Scalar,
2258  class LocalOrdinal,
2259  class GlobalOrdinal,
2260  class Node>
2261 void mult_A_B_reuse(
2262  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2263  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2264  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2265  const std::string& label,
2266  const Teuchos::RCP<Teuchos::ParameterList>& params)
2267 {
2268  using Teuchos::Array;
2269  using Teuchos::ArrayRCP;
2270  using Teuchos::ArrayView;
2271  using Teuchos::RCP;
2272  using Teuchos::rcp;
2273 
2274  // Tpetra typedefs
2275  typedef LocalOrdinal LO;
2276  typedef GlobalOrdinal GO;
2277  typedef Node NO;
2278  typedef Import<LO,GO,NO> import_type;
2279  typedef Map<LO,GO,NO> map_type;
2280 
2281  // Kokkos typedefs
2282  typedef typename map_type::local_map_type local_map_type;
2283  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
2284  typedef typename KCRS::StaticCrsGraphType graph_t;
2285  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2286  typedef typename NO::execution_space execution_space;
2287  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2288  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2289 
2290 #ifdef HAVE_TPETRA_MMM_TIMINGS
2291  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2292  using Teuchos::TimeMonitor;
2293  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse Cmap"))));
2294 #endif
2295  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2296 
2297  // Grab all the maps
2298  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2299  RCP<const map_type> Ccolmap = C.getColMap();
2300  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2301  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2302  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2303  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2304  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2305  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2306 
2307  // Build the final importer / column map, hash table lookups for C
2308  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2309  {
2310  // Bcol2Ccol may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix.
2311  // So the column Map of C may be a strict subset of the column Map of B.
2312  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2313  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2314  });
2315 
2316  if (!Bview.importMatrix.is_null()) {
2317  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2318  std::runtime_error, "Tpetra::MMM: Import setUnion messed with the DomainMap in an unfortunate way");
2319 
2320  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2321  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2322  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2323  });
2324  }
2325  }
2326 
2327  // Run through all the hash table lookups once and for all
2328  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2329  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2330  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2331  GO aidx = Acolmap_local.getGlobalElement(i);
2332  LO B_LID = Browmap_local.getLocalElement(aidx);
2333  if (B_LID != LO_INVALID) {
2334  targetMapToOrigRow(i) = B_LID;
2335  targetMapToImportRow(i) = LO_INVALID;
2336  } else {
2337  LO I_LID = Irowmap_local.getLocalElement(aidx);
2338  targetMapToOrigRow(i) = LO_INVALID;
2339  targetMapToImportRow(i) = I_LID;
2340 
2341  }
2342  });
2343 
2344  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2345  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2346  KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::mult_A_B_reuse_kernel_wrapper(Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2347 }
2348 
2349 /*********************************************************************************************************/
2350 template<class Scalar,
2351  class LocalOrdinal,
2352  class GlobalOrdinal,
2353  class Node,
2354  class LocalOrdinalViewType>
2355 void KernelWrappers<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::mult_A_B_reuse_kernel_wrapper(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2356  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2357  const LocalOrdinalViewType & targetMapToOrigRow,
2358  const LocalOrdinalViewType & targetMapToImportRow,
2359  const LocalOrdinalViewType & Bcol2Ccol,
2360  const LocalOrdinalViewType & Icol2Ccol,
2361  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2362  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
2363  const std::string& label,
2364  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
2365 #ifdef HAVE_TPETRA_MMM_TIMINGS
2366  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2367  using Teuchos::TimeMonitor;
2368  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore"))));
2369  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
2370 #else
2371  (void)label;
2372 #endif
2373  using Teuchos::RCP;
2374  using Teuchos::rcp;
2375 
2376 
2377  // Lots and lots of typedefs
2378  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2379  typedef typename KCRS::StaticCrsGraphType graph_t;
2380  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2381  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2382  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2383 
2384  typedef Scalar SC;
2385  typedef LocalOrdinal LO;
2386  typedef GlobalOrdinal GO;
2387  typedef Node NO;
2388  typedef Map<LO,GO,NO> map_type;
2389  const size_t ST_INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2390  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2391  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2392 
2393  // Sizes
2394  RCP<const map_type> Ccolmap = C.getColMap();
2395  size_t m = Aview.origMatrix->getLocalNumRows();
2396  size_t n = Ccolmap->getLocalNumElements();
2397 
2398  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2399  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2400  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2401  const KCRS & Cmat = C.getLocalMatrixHost();
2402 
2403  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
2404  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
2405  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2406  scalar_view_t Cvals = Cmat.values;
2407 
2408  c_lno_view_t Irowptr;
2409  lno_nnz_view_t Icolind;
2410  scalar_view_t Ivals;
2411  if(!Bview.importMatrix.is_null()) {
2412  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2413  Irowptr = lclB.graph.row_map;
2414  Icolind = lclB.graph.entries;
2415  Ivals = lclB.values;
2416  }
2417 
2418 #ifdef HAVE_TPETRA_MMM_TIMINGS
2419  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse SerialCore - Compare"))));
2420 #endif
2421 
2422  // Classic csr assembly (low memory edition)
2423  // mfh 27 Sep 2016: The c_status array is an implementation detail
2424  // of the local sparse matrix-matrix multiply routine.
2425 
2426  // The status array will contain the index into colind where this entry was last deposited.
2427  // c_status[i] < CSR_ip - not in the row yet
2428  // c_status[i] >= CSR_ip - this is the entry where you can find the data
2429  // We start with this filled with INVALID's indicating that there are no entries yet.
2430  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2431  std::vector<size_t> c_status(n, ST_INVALID);
2432 
2433  // For each row of A/C
2434  size_t CSR_ip = 0, OLD_ip = 0;
2435  for (size_t i = 0; i < m; i++) {
2436  // First fill the c_status array w/ locations where we're allowed to
2437  // generate nonzeros for this row
2438  OLD_ip = Crowptr[i];
2439  CSR_ip = Crowptr[i+1];
2440  for (size_t k = OLD_ip; k < CSR_ip; k++) {
2441  c_status[Ccolind[k]] = k;
2442 
2443  // Reset values in the row of C
2444  Cvals[k] = SC_ZERO;
2445  }
2446 
2447  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2448  LO Aik = Acolind[k];
2449  const SC Aval = Avals[k];
2450  if (Aval == SC_ZERO)
2451  continue;
2452 
2453  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2454  // Local matrix
2455  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2456 
2457  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2458  LO Bkj = Bcolind[j];
2459  LO Cij = Bcol2Ccol[Bkj];
2460 
2461  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2462  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2463  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2464 
2465  Cvals[c_status[Cij]] += Aval * Bvals[j];
2466  }
2467 
2468  } else {
2469  // Remote matrix
2470  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2471  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2472  LO Ikj = Icolind[j];
2473  LO Cij = Icol2Ccol[Ikj];
2474 
2475  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
2476  std::runtime_error, "Trying to insert a new entry (" << i << "," << Cij << ") into a static graph " <<
2477  "(c_status = " << c_status[Cij] << " of [" << OLD_ip << "," << CSR_ip << "))");
2478 
2479  Cvals[c_status[Cij]] += Aval * Ivals[j];
2480  }
2481  }
2482  }
2483  }
2484 
2485 #ifdef HAVE_TPETRA_MMM_TIMINGS
2486  auto MM3 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM Reuse ESFC"))));
2487 #endif
2488 
2489  C.fillComplete(C.getDomainMap(), C.getRangeMap());
2490 }
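
// The reuse kernel above assumes C already carries the (static) sparsity
// pattern of A*B; only the values are zeroed and recomputed. The sketch below
// is a hypothetical usage example (guarded by "#if 0"; the exact dispatch
// conditions live in the public Multiply() front end, so treat this as
// illustrative rather than normative).
#if 0
#include "Tpetra_CrsMatrix.hpp"
#include "TpetraExt_MatrixMatrix.hpp"

void exampleMultiplyReuse (Tpetra::CrsMatrix<double>& A,
                           const Tpetra::CrsMatrix<double>& B)
{
  using crs_matrix_type = Tpetra::CrsMatrix<double>;
  crs_matrix_type C (A.getRowMap (), 0);

  // First multiply: C has no structure yet, so the "newmatrix" path runs and
  // fillComplete's C.
  Tpetra::MatrixMatrix::Multiply (A, false, B, false, C);

  // Change only the *values* of A; the sparsity pattern must stay the same.
  A.resumeFill ();
  // ... replace or modify A's stored values here ...
  A.fillComplete (A.getDomainMap (), A.getRangeMap ());

  // Second multiply: C is already fillComplete with the right pattern, so the
  // "reuse" path recomputes the values in place.
  Tpetra::MatrixMatrix::Multiply (A, false, B, false, C);
}
#endif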
2491 
2492 
2493 /*********************************************************************************************************/
2494 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2495 template<class Scalar,
2496  class LocalOrdinal,
2497  class GlobalOrdinal,
2498  class Node>
2499 void jacobi_A_B_newmatrix(
2500  Scalar omega,
2501  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2502  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2503  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2504  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2505  const std::string& label,
2506  const Teuchos::RCP<Teuchos::ParameterList>& params)
2507 {
2508  using Teuchos::Array;
2509  using Teuchos::ArrayRCP;
2510  using Teuchos::ArrayView;
2511  using Teuchos::RCP;
2512  using Teuchos::rcp;
2513  // typedef Scalar SC;
2514  typedef LocalOrdinal LO;
2515  typedef GlobalOrdinal GO;
2516  typedef Node NO;
2517 
2518  typedef Import<LO,GO,NO> import_type;
2519  typedef Map<LO,GO,NO> map_type;
2520  typedef typename map_type::local_map_type local_map_type;
2521 
2522  // All of the Kokkos typedefs
2523  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
2524  typedef typename KCRS::StaticCrsGraphType graph_t;
2525  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2526  typedef typename NO::execution_space execution_space;
2527  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2528  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2529 
2530 
2531 #ifdef HAVE_TPETRA_MMM_TIMINGS
2532  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2533  using Teuchos::TimeMonitor;
2534  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi M5 Cmap"))));
2535 #endif
2536  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2537 
2538  // Build the final importer / column map, hash table lookups for C
2539  RCP<const import_type> Cimport;
2540  RCP<const map_type> Ccolmap;
2541  RCP<const import_type> Bimport = Bview.origMatrix->getGraph()->getImporter();
2542  RCP<const import_type> Iimport = Bview.importMatrix.is_null() ?
2543  Teuchos::null : Bview.importMatrix->getGraph()->getImporter();
2544  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2545  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2546  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2547  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2548  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2549 
2550  // mfh 27 Sep 2016: Bcol2Ccol is a table that maps from local column
2551  // indices of B, to local column indices of C. (B and C have the
2552  // same number of columns.) The kernel uses this, instead of
2553  // copying the entire input matrix B and converting its column
2554  // indices to those of C.
2555  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2556 
2557  if (Bview.importMatrix.is_null()) {
2558  // mfh 27 Sep 2016: B has no "remotes," so B and C have the same column Map.
2559  Cimport = Bimport;
2560  Ccolmap = Bview.colMap;
2561  // Bcol2Ccol is trivial
2562  // Bcol2Ccol is trivial
2563 
2564  Kokkos::RangePolicy<execution_space, LO> range (0, static_cast<LO> (Bview.colMap->getLocalNumElements ()));
2565  Kokkos::parallel_for (range, KOKKOS_LAMBDA (const size_t i) {
2566  Bcol2Ccol(i) = static_cast<LO> (i);
2567  });
2568  } else {
2569  // mfh 27 Sep 2016: B has "remotes," so we need to build the
2570  // column Map of C, as well as C's Import object (from its domain
2571  // Map to its column Map). C's column Map is the union of the
2572  // column Maps of (the local part of) B, and the "remote" part of
2573  // B. Ditto for the Import. We have optimized this "setUnion"
2574  // operation on Import objects and Maps.
2575 
2576  // Choose the right variant of setUnion
2577  if (!Bimport.is_null() && !Iimport.is_null()){
2578  Cimport = Bimport->setUnion(*Iimport);
2579  Ccolmap = Cimport->getTargetMap();
2580 
2581  } else if (!Bimport.is_null() && Iimport.is_null()) {
2582  Cimport = Bimport->setUnion();
2583 
2584  } else if(Bimport.is_null() && !Iimport.is_null()) {
2585  Cimport = Iimport->setUnion();
2586 
2587  } else
2588  throw std::runtime_error("TpetraExt::Jacobi status of matrix importers is nonsensical");
2589 
2590  Ccolmap = Cimport->getTargetMap();
2591 
2592  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2593  std::runtime_error, "Tpetra:Jacobi Import setUnion messed with the DomainMap in an unfortunate way");
2594 
2595  // NOTE: This is not efficient and should be folded into setUnion
2596  //
2597  // mfh 27 Sep 2016: What the above comment means, is that the
2598  // setUnion operation on Import objects could also compute these
2599  // local index - to - local index look-up tables.
2600  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2601  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2602  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2603  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2604  });
2605  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2606  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2607  });
2608 
2609  }
2610 
2611  // Replace the column map
2612  //
2613  // mfh 27 Sep 2016: We do this because C was originally created
2614  // without a column Map. Now we have its column Map.
2615  C.replaceColMap(Ccolmap);
2616 
2617  // mfh 27 Sep 2016: Construct tables that map from local column
2618  // indices of A, to local row indices of either B_local (the locally
2619  // owned part of B), or B_remote (the "imported" remote part of B).
2620  //
2621  // For column index Aik in row i of A, if the corresponding row of B
2622  // exists in the local part of B ("orig") (which I'll call B_local),
2623  // then targetMapToOrigRow[Aik] is the local index of that row of B.
2624  // Otherwise, targetMapToOrigRow[Aik] is "invalid" (a flag value).
2625  //
2626  // For column index Aik in row i of A, if the corresponding row of B
2627  // exists in the remote part of B ("Import") (which I'll call
2628  // B_remote), then targetMapToImportRow[Aik] is the local index of
2629  // that row of B. Otherwise, targetMapToImportRow[Aik] is "invalid"
2630  // (a flag value).
2631 
2632  // Run through all the hash table lookups once and for all
2633  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2634  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2635  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2636  GO aidx = Acolmap_local.getGlobalElement(i);
2637  LO B_LID = Browmap_local.getLocalElement(aidx);
2638  if (B_LID != LO_INVALID) {
2639  targetMapToOrigRow(i) = B_LID;
2640  targetMapToImportRow(i) = LO_INVALID;
2641  } else {
2642  LO I_LID = Irowmap_local.getLocalElement(aidx);
2643  targetMapToOrigRow(i) = LO_INVALID;
2644  targetMapToImportRow(i) = I_LID;
2645 
2646  }
2647  });
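  // For example: if column 7 of A has global id 42 and global row 42 of B is owned
  // locally as local row 5, then targetMapToOrigRow(7) == 5 and
  // targetMapToImportRow(7) == LO_INVALID. If instead row 42 had to be imported and sits
  // at local row 3 of Bview.importMatrix, then targetMapToOrigRow(7) == LO_INVALID and
  // targetMapToImportRow(7) == 3. The kernel wrapper called below only tests
  // targetMapToOrigRow against LO_INVALID to decide which of the two row spaces to read.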
2648 
2649  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2650  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2651  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_newmatrix_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2652 
2653 }
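// [Editorial sketch -- not part of Tpetra] The column-map handling above can be pictured
// with ordinary containers: C's column Map is the set-union of B's local column Map and
// the column Map of B's imported rows, and Bcol2Ccol / Icol2Ccol translate each local
// column id into C's numbering by going through the global id, exactly as the two
// parallel_for loops do. All names below are hypothetical.
#if 0
#include <vector>
#include <unordered_map>
#include <cstddef>

// Stand-in for a "local map": local id -> global id, plus the reverse lookup.
struct ExampleLocalMap {
  std::vector<long long> localToGlobal;
  std::unordered_map<long long, int> globalToLocal;
  long long getGlobalElement(int lid) const { return localToGlobal[lid]; }
  int getLocalElement(long long gid) const {
    auto it = globalToLocal.find(gid);
    return it == globalToLocal.end() ? -1 : it->second;  // -1 plays the role of LO_INVALID
  }
};

// Build col2Ccol for one source column map (B's own, or the import matrix's).
std::vector<int> exampleCol2Ccol(const ExampleLocalMap& srcColMap,
                                 const ExampleLocalMap& Ccolmap) {
  std::vector<int> col2Ccol(srcColMap.localToGlobal.size());
  for (std::size_t i = 0; i < col2Ccol.size(); ++i)
    col2Ccol[i] = Ccolmap.getLocalElement(srcColMap.getGlobalElement(i));
  return col2Ccol;
}
#endif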
2654 
2655 
2656 /*********************************************************************************************************/
2657 // Jacobi AB NewMatrix Kernel wrappers (Default non-threaded version)
2658 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2659 
2660 template<class Scalar,
2661  class LocalOrdinal,
2662  class GlobalOrdinal,
2663  class Node,
2664  class LocalOrdinalViewType>
2665 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_newmatrix_kernel_wrapper(Scalar omega,
2666  const Vector<Scalar,LocalOrdinal,GlobalOrdinal,Node> & Dinv,
2667  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2668  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2669  const LocalOrdinalViewType & targetMapToOrigRow,
2670  const LocalOrdinalViewType & targetMapToImportRow,
2671  const LocalOrdinalViewType & Bcol2Ccol,
2672  const LocalOrdinalViewType & Icol2Ccol,
2673  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2674  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > Cimport,
2675  const std::string& label,
2676  const Teuchos::RCP<Teuchos::ParameterList>& params) {
2677 
2678 #ifdef HAVE_TPETRA_MMM_TIMINGS
2679  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2680  using Teuchos::TimeMonitor;
2681  auto MM(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix SerialCore")));
2682 #endif
2683 
2684  using Teuchos::Array;
2685  using Teuchos::ArrayRCP;
2686  using Teuchos::ArrayView;
2687  using Teuchos::RCP;
2688  using Teuchos::rcp;
2689 
2690  // Lots and lots of typedefs
2691  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
2692  typedef typename KCRS::StaticCrsGraphType graph_t;
2693  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
2694  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2695  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
2696  typedef typename KCRS::values_type::non_const_type scalar_view_t;
2697 
2698  // Jacobi-specific
2699  typedef typename scalar_view_t::memory_space scalar_memory_space;
2700 
2701  typedef Scalar SC;
2702  typedef LocalOrdinal LO;
2703  typedef GlobalOrdinal GO;
2704  typedef Node NO;
2705 
2706  typedef Map<LO,GO,NO> map_type;
2707  size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2708  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2709 
2710  // Sizes
2711  RCP<const map_type> Ccolmap = C.getColMap();
2712  size_t m = Aview.origMatrix->getLocalNumRows();
2713  size_t n = Ccolmap->getLocalNumElements();
2714  size_t b_max_nnz_per_row = Bview.origMatrix->getLocalMaxNumRowEntries();
2715 
2716  // Grab the Kokkos::SparseCrsMatrices & inner stuff
2717  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
2718  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
2719 
2720  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map;
2721  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries;
2722  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
2723 
2724  c_lno_view_t Irowptr;
2725  lno_nnz_view_t Icolind;
2726  scalar_view_t Ivals;
2727  if(!Bview.importMatrix.is_null()) {
2728  auto lclB = Bview.importMatrix->getLocalMatrixHost();
2729  Irowptr = lclB.graph.row_map;
2730  Icolind = lclB.graph.entries;
2731  Ivals = lclB.values;
2732  b_max_nnz_per_row = std::max(b_max_nnz_per_row,Bview.importMatrix->getLocalMaxNumRowEntries());
2733  }
2734 
2735  // Jacobi-specific inner stuff
2736  auto Dvals =
2737  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
2738 
2740  // The status array will contain the index into colind where this entry was last deposited.
2741  // c_status[i] < CSR_ip - not in the row yet.
2742  // c_status[i] >= CSR_ip, this is the entry where you can find the data
2743  // We start with this filled with INVALID's indicating that there are no entries yet.
2744  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
2745  size_t INVALID = Teuchos::OrdinalTraits<size_t>::invalid();
2746  Array<size_t> c_status(n, ST_INVALID);
2747 
2748  // Classic csr assembly (low memory edition)
2749  //
2750  // mfh 27 Sep 2016: C_estimate_nnz does not promise an upper bound.
2751  // The method loops over rows of A, and may resize after processing
2752  // each row. Chris Siefert says that this reflects experience in
2753  // ML; for the non-threaded case, ML found it faster to spend less
2754  // effort on estimation and risk an occasional reallocation.
2755  size_t CSR_alloc = std::max(C_estimate_nnz(*Aview.origMatrix, *Bview.origMatrix), n);
2756  lno_view_t Crowptr(Kokkos::ViewAllocateWithoutInitializing("Crowptr"),m+1);
2757  lno_nnz_view_t Ccolind(Kokkos::ViewAllocateWithoutInitializing("Ccolind"),CSR_alloc);
2758  scalar_view_t Cvals(Kokkos::ViewAllocateWithoutInitializing("Cvals"),CSR_alloc);
2759  size_t CSR_ip = 0, OLD_ip = 0;
2760 
2761  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
2762 
2763  // mfh 27 Sep 2016: Here is the local sparse matrix-matrix multiply
2764  // routine. The routine computes
2765  //
2766  // C := (I - omega * D^{-1} * A) * (B_local + B_remote)).
2767  //
2768  // This corresponds to one sweep of (weighted) Jacobi.
2769  //
2770  // For column index Aik in row i of A, targetMapToOrigRow[Aik] tells
2771  // you whether the corresponding row of B belongs to B_local
2772  // ("orig") or B_remote ("Import").
2773 
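  // Concretely, for a single row i this computes
  //
  //   C(i,:) = B(i,:) - omega * Dinv(i) * sum_k A(i,k) * B(k,:),
  //
  // where B(k,:) is read from B_local when row k of B is owned locally and from B_remote
  // otherwise. The "Entries of B" loop below deposits B(i,:) first; the
  // "Entries of -omega * Dinv * A * B" loop then accumulates the scaled product terms on
  // top of it, appending a new entry to C only when a column has not been seen in row i yet.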
2774  // For each row of A/C
2775  for (size_t i = 0; i < m; i++) {
2776  // mfh 27 Sep 2016: m is the number of rows in the input matrix A
2777  // on the calling process.
2778  Crowptr[i] = CSR_ip;
2779  SC minusOmegaDval = -omega*Dvals(i,0);
2780 
2781  // Entries of B
2782  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
2783  Scalar Bval = Bvals[j];
2784  if (Bval == SC_ZERO)
2785  continue;
2786  LO Bij = Bcolind[j];
2787  LO Cij = Bcol2Ccol[Bij];
2788 
2789  // Assume no repeated entries in B
2790  c_status[Cij] = CSR_ip;
2791  Ccolind[CSR_ip] = Cij;
2792  Cvals[CSR_ip] = Bvals[j];
2793  CSR_ip++;
2794  }
2795 
2796  // Entries of -omega * Dinv * A * B
2797  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
2798  LO Aik = Acolind[k];
2799  const SC Aval = Avals[k];
2800  if (Aval == SC_ZERO)
2801  continue;
2802 
2803  if (targetMapToOrigRow[Aik] != LO_INVALID) {
2804  // Local matrix
2805  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
2806 
2807  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
2808  LO Bkj = Bcolind[j];
2809  LO Cij = Bcol2Ccol[Bkj];
2810 
2811  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2812  // New entry
2813  c_status[Cij] = CSR_ip;
2814  Ccolind[CSR_ip] = Cij;
2815  Cvals[CSR_ip] = minusOmegaDval* Aval * Bvals[j];
2816  CSR_ip++;
2817 
2818  } else {
2819  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Bvals[j];
2820  }
2821  }
2822 
2823  } else {
2824  // Remote matrix
2825  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
2826  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
2827  LO Ikj = Icolind[j];
2828  LO Cij = Icol2Ccol[Ikj];
2829 
2830  if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
2831  // New entry
2832  c_status[Cij] = CSR_ip;
2833  Ccolind[CSR_ip] = Cij;
2834  Cvals[CSR_ip] = minusOmegaDval* Aval * Ivals[j];
2835  CSR_ip++;
2836  } else {
2837  Cvals[c_status[Cij]] += minusOmegaDval* Aval * Ivals[j];
2838  }
2839  }
2840  }
2841  }
2842 
2843  // Resize for next pass if needed
2844  if (i+1 < m && CSR_ip + std::min(n,(Arowptr[i+2]-Arowptr[i+1]+1)*b_max_nnz_per_row) > CSR_alloc) {
2845  CSR_alloc *= 2;
2846  Kokkos::resize(Ccolind,CSR_alloc);
2847  Kokkos::resize(Cvals,CSR_alloc);
2848  }
2849  OLD_ip = CSR_ip;
2850  }
2851  Crowptr[m] = CSR_ip;
2852 
2853  // Downward resize
2854  Kokkos::resize(Ccolind,CSR_ip);
2855  Kokkos::resize(Cvals,CSR_ip);
2856 
2857  {
2858 #ifdef HAVE_TPETRA_MMM_TIMINGS
2859  auto MM2(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix Final Sort")));
2860 #endif
2861 
2862  // Replace the column map
2863  //
2864  // mfh 27 Sep 2016: We do this because C was originally created
2865  // without a column Map. Now we have its column Map.
2866  C.replaceColMap(Ccolmap);
2867 
2868  // Final sort & set of CRS arrays
2869  //
2870  // TODO (mfh 27 Sep 2016) Will the thread-parallel "local" sparse
2871  // matrix-matrix multiply routine sort the entries for us?
2873  if (params.is_null() || params->get("sort entries",true))
2874  Import_Util::sortCrsEntries(Crowptr,Ccolind, Cvals);
2875  C.setAllValues(Crowptr,Ccolind, Cvals);
2876  }
2877  {
2878 #ifdef HAVE_TPETRA_MMM_TIMINGS
2879  auto MM3(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Newmatrix ESFC")));
2880 #endif
2881 
2882  // Final FillComplete
2883  //
2884  // mfh 27 Sep 2016: So-called "expert static fill complete" bypasses
2885  // Import (from domain Map to column Map) construction (which costs
2886  // lots of communication) by taking the previously constructed
2887  // Import object. We should be able to do this without interfering
2888  // with the implementation of the local part of sparse matrix-matrix
2889  // multiply above.
2890  RCP<Teuchos::ParameterList> labelList = rcp(new Teuchos::ParameterList);
2891  labelList->set("Timer Label",label);
2892  if(!params.is_null()) labelList->set("compute global constants",params->get("compute global constants",true));
2893  RCP<const Export<LO,GO,NO> > dummyExport;
2894  C.expertStaticFillComplete(Bview.origMatrix->getDomainMap(), Aview.origMatrix->getRangeMap(), Cimport,dummyExport,labelList);
2895 
2896  }
2897 }
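// [Editorial sketch -- not part of Tpetra] The "newmatrix" kernels above assemble C one
// row at a time with a single scratch array: c_status maps a column id of C to the slot
// in Ccolind/Cvals where that column was last written, and a slot is trusted only if it
// was written during the current row (i.e. it is not INVALID and not older than OLD_ip).
// A minimal serial version of that idea, with hypothetical names and push_back in place
// of the preallocate-and-resize strategy used above:
#if 0
#include <vector>
#include <cstddef>
#include <limits>

struct ExampleCsr {
  std::vector<int>    colind;
  std::vector<double> vals;
};

// Deposit (column, value) contributions into the current row of C, merging duplicates.
// c_status must be sized to the number of columns of C and start filled with INVALID.
void exampleAssembleRow(ExampleCsr& C, std::size_t& CSR_ip, const std::size_t OLD_ip,
                        const std::vector<int>& cols, const std::vector<double>& vals,
                        std::vector<std::size_t>& c_status)
{
  const std::size_t INVALID = std::numeric_limits<std::size_t>::max();
  for (std::size_t j = 0; j < cols.size(); ++j) {
    const int Cij = cols[j];
    if (c_status[Cij] == INVALID || c_status[Cij] < OLD_ip) {
      // First time this column shows up in the current row: append a new entry.
      c_status[Cij] = CSR_ip;
      C.colind.push_back(Cij);
      C.vals.push_back(vals[j]);
      ++CSR_ip;
    } else {
      // Column already present in this row: accumulate in place.
      C.vals[c_status[Cij]] += vals[j];
    }
  }
}
#endif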
2898 
2899 
2900 /*********************************************************************************************************/
2901 // Kernel method for computing the local portion of C = (I-omega D^{-1} A)*B
2902 template<class Scalar,
2903  class LocalOrdinal,
2904  class GlobalOrdinal,
2905  class Node>
2906 void jacobi_A_B_reuse(
2907  Scalar omega,
2908  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
2909  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
2910  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
2911  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
2912  const std::string& label,
2913  const Teuchos::RCP<Teuchos::ParameterList>& params)
2914 {
2915  using Teuchos::Array;
2916  using Teuchos::ArrayRCP;
2917  using Teuchos::ArrayView;
2918  using Teuchos::RCP;
2919  using Teuchos::rcp;
2920 
2921  typedef LocalOrdinal LO;
2922  typedef GlobalOrdinal GO;
2923  typedef Node NO;
2924 
2925  typedef Import<LO,GO,NO> import_type;
2926  typedef Map<LO,GO,NO> map_type;
2927 
2928  // Kokkos typedefs
2929  typedef typename map_type::local_map_type local_map_type;
2930  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
2931  typedef typename KCRS::StaticCrsGraphType graph_t;
2932  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
2933  typedef typename NO::execution_space execution_space;
2934  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
2935  typedef Kokkos::View<LO*, typename lno_view_t::array_layout, typename lno_view_t::device_type> lo_view_t;
2936 
2937 #ifdef HAVE_TPETRA_MMM_TIMINGS
2938  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
2939  using Teuchos::TimeMonitor;
2940  RCP<TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse Cmap"))));
2941 #endif
2942  LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
2943 
2944  // Grab all the maps
2945  RCP<const import_type> Cimport = C.getGraph()->getImporter();
2946  RCP<const map_type> Ccolmap = C.getColMap();
2947  local_map_type Acolmap_local = Aview.colMap->getLocalMap();
2948  local_map_type Browmap_local = Bview.origMatrix->getRowMap()->getLocalMap();
2949  local_map_type Irowmap_local; if(!Bview.importMatrix.is_null()) Irowmap_local = Bview.importMatrix->getRowMap()->getLocalMap();
2950  local_map_type Bcolmap_local = Bview.origMatrix->getColMap()->getLocalMap();
2951  local_map_type Icolmap_local; if(!Bview.importMatrix.is_null()) Icolmap_local = Bview.importMatrix->getColMap()->getLocalMap();
2952  local_map_type Ccolmap_local = Ccolmap->getLocalMap();
2953 
2954  // Build the final importer / column map, hash table lookups for C
2955  lo_view_t Bcol2Ccol(Kokkos::ViewAllocateWithoutInitializing("Bcol2Ccol"),Bview.colMap->getLocalNumElements()), Icol2Ccol;
2956  {
2957  // Bcol2Ccol may not be trivial, as Ccolmap is compressed during fillComplete in newmatrix
2958  // So, column map of C may be a strict subset of the column map of B
2959  Kokkos::parallel_for(range_type(0,Bview.origMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2960  Bcol2Ccol(i) = Ccolmap_local.getLocalElement(Bcolmap_local.getGlobalElement(i));
2961  });
2962 
2963  if (!Bview.importMatrix.is_null()) {
2964  TEUCHOS_TEST_FOR_EXCEPTION(!Cimport->getSourceMap()->isSameAs(*Bview.origMatrix->getDomainMap()),
2965  std::runtime_error, "Tpetra::Jacobi: Import setUnion messed with the DomainMap in an unfortunate way");
2966 
2967  Kokkos::resize(Icol2Ccol,Bview.importMatrix->getColMap()->getLocalNumElements());
2968  Kokkos::parallel_for(range_type(0,Bview.importMatrix->getColMap()->getLocalNumElements()),KOKKOS_LAMBDA(const LO i) {
2969  Icol2Ccol(i) = Ccolmap_local.getLocalElement(Icolmap_local.getGlobalElement(i));
2970  });
2971  }
2972  }
2973 
2974  // Run through all the hash table lookups once and for all
2975  lo_view_t targetMapToOrigRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToOrigRow"),Aview.colMap->getLocalNumElements());
2976  lo_view_t targetMapToImportRow(Kokkos::ViewAllocateWithoutInitializing("targetMapToImportRow"),Aview.colMap->getLocalNumElements());
2977  Kokkos::parallel_for(range_type(Aview.colMap->getMinLocalIndex(), Aview.colMap->getMaxLocalIndex()+1),KOKKOS_LAMBDA(const LO i) {
2978  GO aidx = Acolmap_local.getGlobalElement(i);
2979  LO B_LID = Browmap_local.getLocalElement(aidx);
2980  if (B_LID != LO_INVALID) {
2981  targetMapToOrigRow(i) = B_LID;
2982  targetMapToImportRow(i) = LO_INVALID;
2983  } else {
2984  LO I_LID = Irowmap_local.getLocalElement(aidx);
2985  targetMapToOrigRow(i) = LO_INVALID;
2986  targetMapToImportRow(i) = I_LID;
2987 
2988  }
2989  });
2990 
2991 #ifdef HAVE_TPETRA_MMM_TIMINGS
2992  MM = Teuchos::null;
2993 #endif
2994 
2995  // Call the actual kernel. We'll rely on partial template specialization to call the correct one ---
2996  // Either the straight-up Tpetra code (SerialNode) or the KokkosKernels one (other NGP node types)
2997  KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,lo_view_t>::jacobi_A_B_reuse_kernel_wrapper(omega,Dinv,Aview,Bview,targetMapToOrigRow,targetMapToImportRow,Bcol2Ccol,Icol2Ccol,C,Cimport,label,params);
2998 }
2999 
3000 
3001 
3002 /*********************************************************************************************************/
3003 template<class Scalar,
3004  class LocalOrdinal,
3005  class GlobalOrdinal,
3006  class Node,
3007  class LocalOrdinalViewType>
3008 void KernelWrappers2<Scalar,LocalOrdinal,GlobalOrdinal,Node,LocalOrdinalViewType>::jacobi_A_B_reuse_kernel_wrapper(Scalar omega,
3009  const Vector<Scalar, LocalOrdinal, GlobalOrdinal, Node> & Dinv,
3010  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3011  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3012  const LocalOrdinalViewType & targetMapToOrigRow,
3013  const LocalOrdinalViewType & targetMapToImportRow,
3014  const LocalOrdinalViewType & Bcol2Ccol,
3015  const LocalOrdinalViewType & Icol2Ccol,
3016  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& C,
3017  Teuchos::RCP<const Import<LocalOrdinal,GlobalOrdinal,Node> > /* Cimport */,
3018  const std::string& label,
3019  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) {
3020 #ifdef HAVE_TPETRA_MMM_TIMINGS
3021  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
3022  using Teuchos::TimeMonitor;
3023  Teuchos::RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore"))));
3024  Teuchos::RCP<Teuchos::TimeMonitor> MM2;
3025 #else
3026  (void)label;
3027 #endif
3028  using Teuchos::RCP;
3029  using Teuchos::rcp;
3030 
3031  // Lots and lots of typedefs
3032  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_host_type KCRS;
3033  typedef typename KCRS::StaticCrsGraphType graph_t;
3034  typedef typename graph_t::row_map_type::const_type c_lno_view_t;
3035  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3036  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3037  typedef typename scalar_view_t::memory_space scalar_memory_space;
3038 
3039  typedef Scalar SC;
3040  typedef LocalOrdinal LO;
3041  typedef GlobalOrdinal GO;
3042  typedef Node NO;
3043  typedef Map<LO,GO,NO> map_type;
3044  const size_t ST_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3045  const LO LO_INVALID = Teuchos::OrdinalTraits<LO>::invalid();
3046  const SC SC_ZERO = Teuchos::ScalarTraits<Scalar>::zero();
3047 
3048  // Sizes
3049  RCP<const map_type> Ccolmap = C.getColMap();
3050  size_t m = Aview.origMatrix->getLocalNumRows();
3051  size_t n = Ccolmap->getLocalNumElements();
3052 
3053  // Grab the Kokkos::SparseCrsMatrices & inner stuff
3054  const KCRS & Amat = Aview.origMatrix->getLocalMatrixHost();
3055  const KCRS & Bmat = Bview.origMatrix->getLocalMatrixHost();
3056  const KCRS & Cmat = C.getLocalMatrixHost();
3057 
3058  c_lno_view_t Arowptr = Amat.graph.row_map, Browptr = Bmat.graph.row_map, Crowptr = Cmat.graph.row_map;
3059  const lno_nnz_view_t Acolind = Amat.graph.entries, Bcolind = Bmat.graph.entries, Ccolind = Cmat.graph.entries;
3060  const scalar_view_t Avals = Amat.values, Bvals = Bmat.values;
3061  scalar_view_t Cvals = Cmat.values;
3062 
3063  c_lno_view_t Irowptr;
3064  lno_nnz_view_t Icolind;
3065  scalar_view_t Ivals;
3066  if(!Bview.importMatrix.is_null()) {
3067  auto lclB = Bview.importMatrix->getLocalMatrixHost();
3068  Irowptr = lclB.graph.row_map;
3069  Icolind = lclB.graph.entries;
3070  Ivals = lclB.values;
3071  }
3072 
3073  // Jacobi-specific inner stuff
3074  auto Dvals =
3075  Dinv.template getLocalView<scalar_memory_space>(Access::ReadOnly);
3076 
3077 #ifdef HAVE_TPETRA_MMM_TIMINGS
3078  MM2 = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse SerialCore - Compare"))));
3079 #endif
3080 
3081  // The status array will contain the index into colind where this entry was last deposited.
3082  // c_status[i] < CSR_ip - not in the row yet
3083  // c_status[i] >= CSR_ip - this is the entry where you can find the data
3084  // We start with this filled with INVALID's indicating that there are no entries yet.
3085  // Sadly, this complicates the code due to the fact that size_t's are unsigned.
3086  std::vector<size_t> c_status(n, ST_INVALID);
3087 
3088  // For each row of A/C
3089  size_t CSR_ip = 0, OLD_ip = 0;
3090  for (size_t i = 0; i < m; i++) {
3091 
3092  // First fill the c_status array w/ locations where we're allowed to
3093  // generate nonzeros for this row
3094  OLD_ip = Crowptr[i];
3095  CSR_ip = Crowptr[i+1];
3096  for (size_t k = OLD_ip; k < CSR_ip; k++) {
3097  c_status[Ccolind[k]] = k;
3098 
3099  // Reset values in the row of C
3100  Cvals[k] = SC_ZERO;
3101  }
3102 
3103  SC minusOmegaDval = -omega*Dvals(i,0);
3104 
3105  // Entries of B
3106  for (size_t j = Browptr[i]; j < Browptr[i+1]; j++) {
3107  Scalar Bval = Bvals[j];
3108  if (Bval == SC_ZERO)
3109  continue;
3110  LO Bij = Bcolind[j];
3111  LO Cij = Bcol2Ccol[Bij];
3112 
3113  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3114  std::runtime_error, "Trying to insert a new entry into a static graph");
3115 
3116  Cvals[c_status[Cij]] = Bvals[j];
3117  }
3118 
3119  // Entries of -omega * Dinv * A * B
3120  for (size_t k = Arowptr[i]; k < Arowptr[i+1]; k++) {
3121  LO Aik = Acolind[k];
3122  const SC Aval = Avals[k];
3123  if (Aval == SC_ZERO)
3124  continue;
3125 
3126  if (targetMapToOrigRow[Aik] != LO_INVALID) {
3127  // Local matrix
3128  size_t Bk = static_cast<size_t> (targetMapToOrigRow[Aik]);
3129 
3130  for (size_t j = Browptr[Bk]; j < Browptr[Bk+1]; ++j) {
3131  LO Bkj = Bcolind[j];
3132  LO Cij = Bcol2Ccol[Bkj];
3133 
3134  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3135  std::runtime_error, "Trying to insert a new entry into a static graph");
3136 
3137  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Bvals[j];
3138  }
3139 
3140  } else {
3141  // Remote matrix
3142  size_t Ik = static_cast<size_t> (targetMapToImportRow[Aik]);
3143  for (size_t j = Irowptr[Ik]; j < Irowptr[Ik+1]; ++j) {
3144  LO Ikj = Icolind[j];
3145  LO Cij = Icol2Ccol[Ikj];
3146 
3147  TEUCHOS_TEST_FOR_EXCEPTION(c_status[Cij] < OLD_ip || c_status[Cij] >= CSR_ip,
3148  std::runtime_error, "Trying to insert a new entry into a static graph");
3149 
3150  Cvals[c_status[Cij]] += minusOmegaDval * Aval * Ivals[j];
3151  }
3152  }
3153  }
3154  }
3155 
3156 #ifdef HAVE_TPETRA_MMM_TIMINGS
3157  MM2 = Teuchos::null;
3158  MM2 = rcp(new TimeMonitor (*TimeMonitor::getNewTimer(prefix_mmm + std::string("Jacobi Reuse ESFC"))));
3159 #endif
3160 
3161  C.fillComplete(C.getDomainMap(), C.getRangeMap());
3162 #ifdef HAVE_TPETRA_MMM_TIMINGS
3163  MM2 = Teuchos::null;
3164  MM = Teuchos::null;
3165 #endif
3166 
3167 }
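// [Editorial note] Unlike the "newmatrix" kernel above, this reuse kernel never grows C:
// it assumes C already carries the sparsity pattern built by an earlier newmatrix call,
// so any product term that would fall outside the existing pattern of row i triggers the
// std::runtime_error thrown above instead of a reallocation, and the kernel finishes with
// an ordinary fillComplete() rather than an expertStaticFillComplete().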
3168 
3169 
3170 
3171 /*********************************************************************************************************/
3172 template<class Scalar,
3173  class LocalOrdinal,
3174  class GlobalOrdinal,
3175  class Node>
3176 void import_and_extract_views(
3177  const CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& A,
3178  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3179  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3180  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3181  bool userAssertsThereAreNoRemotes,
3182  const std::string& label,
3183  const Teuchos::RCP<Teuchos::ParameterList>& params)
3184 {
3185  using Teuchos::Array;
3186  using Teuchos::ArrayView;
3187  using Teuchos::RCP;
3188  using Teuchos::rcp;
3189  using Teuchos::null;
3190 
3191  typedef Scalar SC;
3192  typedef LocalOrdinal LO;
3193  typedef GlobalOrdinal GO;
3194  typedef Node NO;
3195 
3196  typedef Map<LO,GO,NO> map_type;
3197  typedef Import<LO,GO,NO> import_type;
3198  typedef CrsMatrix<SC,LO,GO,NO> crs_matrix_type;
3199 
3200 #ifdef HAVE_TPETRA_MMM_TIMINGS
3201  std::string prefix_mmm = std::string("TpetraExt ") + label + std::string(": ");
3202  using Teuchos::TimeMonitor;
3203  RCP<Teuchos::TimeMonitor> MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Alloc"))));
3204 #endif
3205  // The goal of this method is to populate the 'Aview' struct with views of the
3206  // rows of A, including all rows that correspond to elements in 'targetMap'.
3207  //
3208  // If targetMap includes local elements that correspond to remotely-owned rows
3209  // of A, then those remotely-owned rows will be imported into
3210  // 'Aview.importMatrix', and views of them will be included in 'Aview'.
3211  Aview.deleteContents();
3212 
3213  Aview.origMatrix = rcp(&A, false);
3214  Aview.origRowMap = A.getRowMap();
3215  Aview.rowMap = targetMap;
3216  Aview.colMap = A.getColMap();
3217  Aview.domainMap = A.getDomainMap();
3218  Aview.importColMap = null;
3219  RCP<const map_type> rowMap = A.getRowMap();
3220  const int numProcs = rowMap->getComm()->getSize();
3221 
3222  // Short circuit if the user swears there are no remotes (or if we're in serial)
3223  if (userAssertsThereAreNoRemotes || numProcs < 2)
3224  return;
3225 
3226  RCP<const import_type> importer;
3227  if (params != null && params->isParameter("importer")) {
3228  importer = params->get<RCP<const import_type> >("importer");
3229 
3230  } else {
3231 #ifdef HAVE_TPETRA_MMM_TIMINGS
3232  MM = Teuchos::null;
3233  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap"))));
3234 #endif
3235 
3236  // Mark each row in targetMap as local or remote, and go ahead and get a view
3237  // for the local rows
3238  RCP<const map_type> remoteRowMap;
3239  size_t numRemote = 0;
3240  int mode = 0;
3241  if (!prototypeImporter.is_null() &&
3242  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3243  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3244  // We have a valid prototype importer --- ask it for the remotes
3245 #ifdef HAVE_TPETRA_MMM_TIMINGS
3246  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode1"));
3247 #endif
3248  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3249  numRemote = prototypeImporter->getNumRemoteIDs();
3250 
3251  Array<GO> remoteRows(numRemote);
3252  for (size_t i = 0; i < numRemote; i++)
3253  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3254 
3255  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3256  rowMap->getIndexBase(), rowMap->getComm()));
3257  mode = 1;
3258 
3259  } else if (prototypeImporter.is_null()) {
3260  // No prototype importer --- count the remotes the hard way
3261 #ifdef HAVE_TPETRA_MMM_TIMINGS
3262  TimeMonitor MM2 = *TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X RemoteMap-Mode2"));
3263 #endif
3264  ArrayView<const GO> rows = targetMap->getLocalElementList();
3265  size_t numRows = targetMap->getLocalNumElements();
3266 
3267  Array<GO> remoteRows(numRows);
3268  for(size_t i = 0; i < numRows; ++i) {
3269  const LO mlid = rowMap->getLocalElement(rows[i]);
3270 
3271  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3272  remoteRows[numRemote++] = rows[i];
3273  }
3274  remoteRows.resize(numRemote);
3275  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3276  rowMap->getIndexBase(), rowMap->getComm()));
3277  mode = 2;
3278 
3279  } else {
3280  // PrototypeImporter is bad. But if we're in serial that's OK.
3281  mode = 3;
3282  }
3283 
3284  if (numProcs < 2) {
3285  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3286  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3287  // If only one processor we don't need to import any remote rows, so return.
3288  return;
3289  }
3290 
3291  //
3292  // Now we will import the needed remote rows of A, if the global maximum
3293  // value of numRemote is greater than 0.
3294  //
3295 #ifdef HAVE_TPETRA_MMM_TIMINGS
3296  MM = Teuchos::null;
3297  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Collective-0"))));
3298 #endif
3299 
3300  global_size_t globalMaxNumRemote = 0;
3301  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3302 
3303  if (globalMaxNumRemote > 0) {
3304 #ifdef HAVE_TPETRA_MMM_TIMINGS
3305  MM = Teuchos::null;
3306  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-2"))));
3307 #endif
3308  // Create an importer with target-map remoteRowMap and source-map rowMap.
3309  if (mode == 1)
3310  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3311  else if (mode == 2)
3312  importer = rcp(new import_type(rowMap, remoteRowMap));
3313  else
3314  throw std::runtime_error("prototypeImporter->SourceMap() does not match A.getRowMap()!");
3315  }
3316 
3317  if (params != null)
3318  params->set("importer", importer);
3319  }
3320 
3321  if (importer != null) {
3322 #ifdef HAVE_TPETRA_MMM_TIMINGS
3323  MM = Teuchos::null;
3324  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-3"))));
3325 #endif
3326 
3327  // Now create a new matrix into which we can import the remote rows of A that we need.
3328  Teuchos::ParameterList labelList;
3329  labelList.set("Timer Label", label);
3330  auto & labelList_subList = labelList.sublist("matrixmatrix: kernel params",false);
3331 
3332  bool isMM = true;
3333  bool overrideAllreduce = false;
3334  int mm_optimization_core_count=::Tpetra::Details::Behavior::TAFC_OptimizationCoreCount();
3335  // Minor speedup tweak - avoid computing the global constants
3336  Teuchos::ParameterList params_sublist;
3337  if(!params.is_null()) {
3338  labelList.set("compute global constants", params->get("compute global constants",false));
3339  params_sublist = params->sublist("matrixmatrix: kernel params",false);
3340  mm_optimization_core_count = params_sublist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3341  int mm_optimization_core_count2 = params->get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3342  if(mm_optimization_core_count2<mm_optimization_core_count) mm_optimization_core_count=mm_optimization_core_count2;
3343  isMM = params_sublist.get("isMatrixMatrix_TransferAndFillComplete",false);
3344  overrideAllreduce = params_sublist.get("MM_TAFC_OverrideAllreduceCheck",false);
3345  }
3346  labelList_subList.set("isMatrixMatrix_TransferAndFillComplete",isMM);
3347  labelList_subList.set("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
3348  labelList_subList.set("MM_TAFC_OverrideAllreduceCheck",overrideAllreduce);
3349 
3350  Aview.importMatrix = Tpetra::importAndFillCompleteCrsMatrix<crs_matrix_type>(rcpFromRef(A), *importer,
3351  A.getDomainMap(), importer->getTargetMap(), rcpFromRef(labelList));
3352 
3353 #if 0
3354  // Disabled code for dumping input matrices
3355  static int count=0;
3356  char str[80];
3357  sprintf(str,"import_matrix.%d.dat",count);
3359  count++;
3360 #endif
3361 
3362 #ifdef HAVE_TPETRA_MMM_STATISTICS
3363  printMultiplicationStatistics(importer, label + std::string(" I&X MMM"));
3364 #endif
3365 
3366 
3367 #ifdef HAVE_TPETRA_MMM_TIMINGS
3368  MM = Teuchos::null;
3369  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix_mmm + std::string("MMM I&X Import-4"))));
3370 #endif
3371 
3372  // Save the column map of the imported matrix, so that we can convert indices back to global for arithmetic later
3373  Aview.importColMap = Aview.importMatrix->getColMap();
3374 #ifdef HAVE_TPETRA_MMM_TIMINGS
3375  MM = Teuchos::null;
3376 #endif
3377  }
3378 }
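// [Editorial sketch -- not part of Tpetra] The "mode 2" branch above classifies each row
// of the target Map as local or remote simply by asking the row Map for a local id; the
// rows that come back invalid are the ones that must be imported into Aview.importMatrix.
// A minimal standalone version of that test, with hypothetical names:
#if 0
#include <vector>
#include <unordered_map>

// Returns the global row ids in targetRows that are NOT owned locally, i.e. the rows
// that would populate remoteRowMap above.
std::vector<long long>
exampleFindRemoteRows(const std::vector<long long>& targetRows,
                      const std::unordered_map<long long, int>& ownedRowGidToLid)
{
  std::vector<long long> remoteRows;
  for (long long gid : targetRows)
    if (ownedRowGidToLid.find(gid) == ownedRowGidToLid.end())  // "mlid == invalid()"
      remoteRows.push_back(gid);
  return remoteRows;
}
#endif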
3379 
3380 /*********************************************************************************************************/
3381 template<class Scalar,
3382  class LocalOrdinal,
3383  class GlobalOrdinal,
3384  class Node>
3385 void import_and_extract_views(
3386  const BlockCrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& M,
3387  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> > targetMap,
3388  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Mview,
3389  Teuchos::RCP<const Import<LocalOrdinal, GlobalOrdinal, Node> > prototypeImporter,
3390  bool userAssertsThereAreNoRemotes)
3391 {
3392  using Teuchos::Array;
3393  using Teuchos::ArrayView;
3394  using Teuchos::RCP;
3395  using Teuchos::rcp;
3396  using Teuchos::null;
3397 
3398  typedef Scalar SC;
3399  typedef LocalOrdinal LO;
3400  typedef GlobalOrdinal GO;
3401  typedef Node NO;
3402 
3403  typedef Map<LO,GO,NO> map_type;
3404  typedef Import<LO,GO,NO> import_type;
3405  typedef BlockCrsMatrix<SC,LO,GO,NO> blockcrs_matrix_type;
3406 
3407  // The goal of this method is to populate the 'Mview' struct with views of the
3408  // rows of M, including all rows that correspond to elements in 'targetMap'.
3409  //
3410  // If targetMap includes local elements that correspond to remotely-owned rows
3411  // of M, then those remotely-owned rows will be imported into
3412  // 'Mview.importMatrix', and views of them will be included in 'Mview'.
3413  Mview.deleteContents();
3414 
3415  Mview.origMatrix = rcp(&M, false);
3416  Mview.origRowMap = M.getRowMap();
3417  Mview.rowMap = targetMap;
3418  Mview.colMap = M.getColMap();
3419  Mview.importColMap = null;
3420  RCP<const map_type> rowMap = M.getRowMap();
3421  const int numProcs = rowMap->getComm()->getSize();
3422 
3423  // Short circuit if the user swears there are no remotes (or if we're in serial)
3424  if (userAssertsThereAreNoRemotes || numProcs < 2) return;
3425 
3426  // Mark each row in targetMap as local or remote, and go ahead and get a view
3427  // for the local rows
3428  RCP<const map_type> remoteRowMap;
3429  size_t numRemote = 0;
3430  int mode = 0;
3431  if (!prototypeImporter.is_null() &&
3432  prototypeImporter->getSourceMap()->isSameAs(*rowMap) &&
3433  prototypeImporter->getTargetMap()->isSameAs(*targetMap)) {
3434 
3435  // We have a valid prototype importer --- ask it for the remotes
3436  ArrayView<const LO> remoteLIDs = prototypeImporter->getRemoteLIDs();
3437  numRemote = prototypeImporter->getNumRemoteIDs();
3438 
3439  Array<GO> remoteRows(numRemote);
3440  for (size_t i = 0; i < numRemote; i++)
3441  remoteRows[i] = targetMap->getGlobalElement(remoteLIDs[i]);
3442 
3443  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3444  rowMap->getIndexBase(), rowMap->getComm()));
3445  mode = 1;
3446 
3447  } else if (prototypeImporter.is_null()) {
3448 
3449  // No prototype importer --- count the remotes the hard way
3450  ArrayView<const GO> rows = targetMap->getLocalElementList();
3451  size_t numRows = targetMap->getLocalNumElements();
3452 
3453  Array<GO> remoteRows(numRows);
3454  for(size_t i = 0; i < numRows; ++i) {
3455  const LO mlid = rowMap->getLocalElement(rows[i]);
3456 
3457  if (mlid == Teuchos::OrdinalTraits<LO>::invalid())
3458  remoteRows[numRemote++] = rows[i];
3459  }
3460  remoteRows.resize(numRemote);
3461  remoteRowMap = rcp(new map_type(Teuchos::OrdinalTraits<global_size_t>::invalid(), remoteRows(),
3462  rowMap->getIndexBase(), rowMap->getComm()));
3463  mode = 2;
3464 
3465  } else {
3466  // PrototypeImporter is bad. But if we're in serial that's OK.
3467  mode = 3;
3468  }
3469 
3470  if (numProcs < 2) {
3471  TEUCHOS_TEST_FOR_EXCEPTION(numRemote > 0, std::runtime_error,
3472  "MatrixMatrix::import_and_extract_views ERROR, numProcs < 2 but attempting to import remote matrix rows.");
3473  // If only one processor we don't need to import any remote rows, so return.
3474  return;
3475  }
3476 
3477  // Now we will import the needed remote rows of M, if the global maximum
3478  // value of numRemote is greater than 0.
3479  global_size_t globalMaxNumRemote = 0;
3480  Teuchos::reduceAll(*(rowMap->getComm()), Teuchos::REDUCE_MAX, (global_size_t)numRemote, Teuchos::outArg(globalMaxNumRemote) );
3481 
3482  RCP<const import_type> importer;
3483 
3484  if (globalMaxNumRemote > 0) {
3485  // Create an importer with target-map remoteRowMap and source-map rowMap.
3486  if (mode == 1)
3487  importer = prototypeImporter->createRemoteOnlyImport(remoteRowMap);
3488  else if (mode == 2)
3489  importer = rcp(new import_type(rowMap, remoteRowMap));
3490  else
3491  throw std::runtime_error("prototypeImporter->SourceMap() does not match M.getRowMap()!");
3492  }
3493 
3494  if (importer != null) {
3495  // Get import matrix
3496  Mview.importMatrix = Tpetra::importAndFillCompleteBlockCrsMatrix<blockcrs_matrix_type>(rcpFromRef(M), *importer);
3497 
3498  // Save the column map of the imported matrix, so that we can convert indices
3499  // back to global for arithmetic later
3500  Mview.importColMap = Mview.importMatrix->getColMap();
3501  }
3502 }
3503 
3504 /*********************************************************************************************************/
3505  // This only merges matrices that look like B & Bimport, i.e., they have no overlapping rows
3506 template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3507 const typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type
3508 merge_matrices(CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3509  CrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3510  const LocalOrdinalViewType & Acol2Brow,
3511  const LocalOrdinalViewType & Acol2Irow,
3512  const LocalOrdinalViewType & Bcol2Ccol,
3513  const LocalOrdinalViewType & Icol2Ccol,
3514  const size_t mergedNodeNumCols) {
3515 
3516  using Teuchos::RCP;
3517  typedef typename Tpetra::CrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KCRS;
3518  typedef typename KCRS::StaticCrsGraphType graph_t;
3519  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3520  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3521  typedef typename KCRS::values_type::non_const_type scalar_view_t;
3522  // Grab the Kokkos::SparseCrsMatrices
3523  const KCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3524  const KCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3525 
3526  // We need to do this dance if either (a) we have a Bimport or (b) A's colMap is not the same as B's rowMap
3527  if(!Bview.importMatrix.is_null() || (Bview.importMatrix.is_null() && (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3528  // We do have a Bimport
3529  // NOTE: We're going to merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3530  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3531  RCP<const KCRS> Ik_;
3532  if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KCRS>(Bview.importMatrix->getLocalMatrixDevice());
3533  const KCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3534  KCRS Iks;
3535  if(Ik!=0) Iks = *Ik;
3536  size_t merge_numrows = Ak.numCols();
3537  // The last entry of this, at least, needs to be initialized
3538  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3539 
3540  const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3541 
3542  // Use a Kokkos::parallel_scan to build the rowptr
3543  typedef typename Node::execution_space execution_space;
3544  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3545  Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3546  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3547  if(final) Mrowptr(i) = update;
3548  // Get the row count
3549  size_t ct=0;
3550  if(Acol2Brow(i)!=LO_INVALID)
3551  ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3552  else
3553  ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3554  update+=ct;
3555 
3556  if(final && i+1==merge_numrows)
3557  Mrowptr(i+1)=update;
3558  });
3559 
3560  // Allocate nnz
3561  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3562  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3563  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz);
3564 
3565  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3566  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3567  Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3568  if(Acol2Brow(i)!=LO_INVALID) {
3569  size_t row = Acol2Brow(i);
3570  size_t start = Bk.graph.row_map(row);
3571  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3572  Mvalues(j) = Bk.values(j-Mrowptr(i)+start);
3573  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3574  }
3575  }
3576  else {
3577  size_t row = Acol2Irow(i);
3578  size_t start = Iks.graph.row_map(row);
3579  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3580  Mvalues(j) = Iks.values(j-Mrowptr(i)+start);
3581  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3582  }
3583  }
3584  });
3585 
3586  KCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind);
3587  return newmat;
3588  }
3589  else {
3590  // We don't have a Bimport (the easy case)
3591  return Bk;
3592  }
3593 }//end merge_matrices
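// [Editorial sketch -- not part of Tpetra] The parallel_scan/parallel_for pair above is a
// prefix sum: every column of A corresponds to exactly one row of either B_local or
// B_import, so the merged row pointer is the running total of those row lengths, and the
// fill step copies each source row into its reserved slot while remapping column ids into
// C's numbering. A serial sketch of the same two passes, with hypothetical names:
#if 0
#include <vector>
#include <cstddef>

struct ExampleCsrView {
  std::vector<std::size_t> rowptr;
  std::vector<int>         colind;
  std::vector<double>      vals;
};

ExampleCsrView exampleMerge(const ExampleCsrView& Blocal, const ExampleCsrView& Bimport,
                            const std::vector<int>& Acol2Brow,  // -1 means "use Bimport"
                            const std::vector<int>& Acol2Irow,
                            const std::vector<int>& Bcol2Ccol,
                            const std::vector<int>& Icol2Ccol)
{
  const std::size_t nrows = Acol2Brow.size();
  ExampleCsrView M;
  M.rowptr.assign(nrows + 1, 0);
  for (std::size_t i = 0; i < nrows; ++i) {            // pass 1: row lengths -> rowptr
    const bool local = (Acol2Brow[i] != -1);
    const ExampleCsrView& src = local ? Blocal : Bimport;
    const std::size_t row = local ? Acol2Brow[i] : Acol2Irow[i];
    M.rowptr[i+1] = M.rowptr[i] + (src.rowptr[row+1] - src.rowptr[row]);
  }
  M.colind.resize(M.rowptr[nrows]);
  M.vals.resize(M.rowptr[nrows]);
  for (std::size_t i = 0; i < nrows; ++i) {            // pass 2: copy + remap columns
    const bool local = (Acol2Brow[i] != -1);
    const ExampleCsrView& src = local ? Blocal : Bimport;
    const std::vector<int>& col2Ccol = local ? Bcol2Ccol : Icol2Ccol;
    const std::size_t row = local ? Acol2Brow[i] : Acol2Irow[i];
    const std::size_t start = src.rowptr[row];
    for (std::size_t j = M.rowptr[i]; j < M.rowptr[i+1]; ++j) {
      M.vals[j]   = src.vals[j - M.rowptr[i] + start];
      M.colind[j] = col2Ccol[src.colind[j - M.rowptr[i] + start]];
    }
  }
  return M;
}
#endif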
3594 
3595 /*********************************************************************************************************/
3596  // This only merges matrices that look like B & Bimport, i.e., they have no overlapping rows
3597 template<class Scalar,class LocalOrdinal,class GlobalOrdinal,class Node, class LocalOrdinalViewType>
3598 const typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type
3599 merge_matrices(BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Aview,
3600  BlockCrsMatrixStruct<Scalar, LocalOrdinal, GlobalOrdinal, Node>& Bview,
3601  const LocalOrdinalViewType & Acol2Brow,
3602  const LocalOrdinalViewType & Acol2Irow,
3603  const LocalOrdinalViewType & Bcol2Ccol,
3604  const LocalOrdinalViewType & Icol2Ccol,
3605  const size_t mergedNodeNumCols)
3606 {
3607  using Teuchos::RCP;
3608  typedef typename Tpetra::BlockCrsMatrix<Scalar,LocalOrdinal,GlobalOrdinal,Node>::local_matrix_device_type KBCRS;
3609  typedef typename KBCRS::StaticCrsGraphType graph_t;
3610  typedef typename graph_t::row_map_type::non_const_type lno_view_t;
3611  typedef typename graph_t::entries_type::non_const_type lno_nnz_view_t;
3612  typedef typename KBCRS::values_type::non_const_type scalar_view_t;
3613 
3614  // Grab the KokkosSparse::BsrMatrix
3615  const KBCRS & Ak = Aview.origMatrix->getLocalMatrixDevice();
3616  const KBCRS & Bk = Bview.origMatrix->getLocalMatrixDevice();
3617 
3618  // We need to do this dance if either (a) We have Bimport or (b) A's colMap is not the same as B's rowMap
3619  if(!Bview.importMatrix.is_null() ||
3620  (Bview.importMatrix.is_null() &&
3621  (&*Aview.origMatrix->getGraph()->getColMap() != &*Bview.origMatrix->getGraph()->getRowMap()))) {
3622 
3623  // We do have a Bimport
3624  // NOTE: We're going to merge Borig and Bimport into a single matrix and reindex the columns *before* we multiply.
3625  // This option was chosen because we know we don't have any duplicate entries, so we can allocate once.
3626  RCP<const KBCRS> Ik_;
3627  if(!Bview.importMatrix.is_null()) Ik_ = Teuchos::rcpFromRef<const KBCRS>(Bview.importMatrix->getLocalMatrixDevice());
3628  const KBCRS * Ik = Bview.importMatrix.is_null() ? 0 : &*Ik_;
3629  KBCRS Iks;
3630  if(Ik!=0) Iks = *Ik;
3631  size_t merge_numrows = Ak.numCols();
3632 
3633  // The last entry of this, at least, needs to be initialized
3634  lno_view_t Mrowptr("Mrowptr", merge_numrows + 1);
3635 
3636  const LocalOrdinal LO_INVALID =Teuchos::OrdinalTraits<LocalOrdinal>::invalid();
3637 
3638  // Use a Kokkos::parallel_scan to build the rowptr
3639  typedef typename Node::execution_space execution_space;
3640  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3641  Kokkos::parallel_scan ("Tpetra_MatrixMatrix_merge_matrices_buildRowptr", range_type (0, merge_numrows),
3642  KOKKOS_LAMBDA(const size_t i, size_t& update, const bool final) {
3643  if(final) Mrowptr(i) = update;
3644  // Get the row count
3645  size_t ct=0;
3646  if(Acol2Brow(i)!=LO_INVALID)
3647  ct = Bk.graph.row_map(Acol2Brow(i)+1) - Bk.graph.row_map(Acol2Brow(i));
3648  else
3649  ct = Iks.graph.row_map(Acol2Irow(i)+1) - Iks.graph.row_map(Acol2Irow(i));
3650  update+=ct;
3651 
3652  if(final && i+1==merge_numrows)
3653  Mrowptr(i+1)=update;
3654  });
3655 
3656  // Allocate nnz
3657  size_t merge_nnz = ::Tpetra::Details::getEntryOnHost(Mrowptr,merge_numrows);
3658  const int blocksize = Ak.blockDim();
3659  lno_nnz_view_t Mcolind(Kokkos::ViewAllocateWithoutInitializing("Mcolind"),merge_nnz);
3660  scalar_view_t Mvalues(Kokkos::ViewAllocateWithoutInitializing("Mvals"),merge_nnz*blocksize*blocksize);
3661 
3662  // Use a Kokkos::parallel_for to fill the rowptr/colind arrays
3663  typedef Kokkos::RangePolicy<execution_space, size_t> range_type;
3664  Kokkos::parallel_for ("Tpetra_MatrixMatrix_merg_matrices_buildColindValues", range_type (0, merge_numrows),KOKKOS_LAMBDA(const size_t i) {
3665  if(Acol2Brow(i)!=LO_INVALID) {
3666  size_t row = Acol2Brow(i);
3667  size_t start = Bk.graph.row_map(row);
3668  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3669  Mcolind(j) = Bcol2Ccol(Bk.graph.entries(j-Mrowptr(i)+start));
3670 
3671  for (int b=0; b<blocksize*blocksize; ++b) {
3672  const int val_indx = j*blocksize*blocksize + b;
3673  const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3674  Mvalues(val_indx) = Bk.values(b_val_indx);
3675  }
3676  }
3677  }
3678  else {
3679  size_t row = Acol2Irow(i);
3680  size_t start = Iks.graph.row_map(row);
3681  for(size_t j= Mrowptr(i); j<Mrowptr(i+1); j++) {
3682  Mcolind(j) = Icol2Ccol(Iks.graph.entries(j-Mrowptr(i)+start));
3683 
3684  for (int b=0; b<blocksize*blocksize; ++b) {
3685  const int val_indx = j*blocksize*blocksize + b;
3686  const int b_val_indx = (j-Mrowptr(i)+start)*blocksize*blocksize + b;
3687  Mvalues(val_indx) = Iks.values(b_val_indx);
3688  }
3689  }
3690  }
3691  });
3692 
3693  // Build and return merged KokkosSparse matrix
3694  KBCRS newmat("CrsMatrix",merge_numrows,mergedNodeNumCols,merge_nnz,Mvalues,Mrowptr,Mcolind, blocksize);
3695  return newmat;
3696  }
3697  else {
3698  // We don't have a Bimport (the easy case)
3699  return Bk;
3700  }
3701 }//end merge_matrices
3702 
3703 /*********************************************************************************************************/
3704 template<typename SC, typename LO, typename GO, typename NO>
3705 void AddKernels<SC, LO, GO, NO>::
3706 addSorted(
3707  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3708  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3709  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3710  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3711  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3712  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3713  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3714  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3715 #if KOKKOSKERNELS_VERSION >= 40299
3716  GO numGlobalCols,
3717 #endif
3718  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3719  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3720  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3721 {
3722  using Teuchos::TimeMonitor;
3723  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3724  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3725  auto nrows = Arowptrs.extent(0) - 1;
3726  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3727  typename AddKern::KKH handle;
3728  handle.create_spadd_handle(true);
3729  auto addHandle = handle.get_spadd_handle();
3730 #ifdef HAVE_TPETRA_MMM_TIMINGS
3731  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted symbolic")));
3732 #endif
3733  KokkosSparse::Experimental::spadd_symbolic
3734  (&handle,
3735 #if KOKKOSKERNELS_VERSION >= 40299
3736  nrows, numGlobalCols,
3737 #endif
3738  Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3739  //KokkosKernels requires values to be zeroed
3740  Cvals = values_array("C values", addHandle->get_c_nnz());
3741  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3742 #ifdef HAVE_TPETRA_MMM_TIMINGS
3743  MM = Teuchos::null;
3744  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() sorted numeric")));
3745 #endif
3746  KokkosSparse::Experimental::spadd_numeric(&handle,
3747 #if KOKKOSKERNELS_VERSION >= 40299
3748  nrows, numGlobalCols,
3749 #endif
3750  Arowptrs, Acolinds, Avals, scalarA,
3751  Browptrs, Bcolinds, Bvals, scalarB,
3752  Crowptrs, Ccolinds, Cvals);
3753 }
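// [Editorial sketch -- not part of Tpetra] addSorted above hands the work to
// KokkosSparse::spadd with a "sorted" handle, which can merge two rows in a single
// forward pass because both rows are ordered by column id. The per-row idea, written out
// serially with hypothetical names:
#if 0
#include <vector>
#include <utility>
#include <cstddef>

// Merge two sorted (column, value) rows into scalarA*rowA + scalarB*rowB.
std::vector<std::pair<int,double>>
exampleAddSortedRows(const std::vector<std::pair<int,double>>& rowA, double scalarA,
                     const std::vector<std::pair<int,double>>& rowB, double scalarB)
{
  std::vector<std::pair<int,double>> out;
  std::size_t a = 0, b = 0;
  while (a < rowA.size() || b < rowB.size()) {
    if (b == rowB.size() || (a < rowA.size() && rowA[a].first < rowB[b].first)) {
      out.emplace_back(rowA[a].first, scalarA * rowA[a].second); ++a;       // only in A
    } else if (a == rowA.size() || rowB[b].first < rowA[a].first) {
      out.emplace_back(rowB[b].first, scalarB * rowB[b].second); ++b;       // only in B
    } else {                                                                // same column
      out.emplace_back(rowA[a].first, scalarA * rowA[a].second + scalarB * rowB[b].second);
      ++a; ++b;
    }
  }
  return out;
}
#endif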
3754 
3755 template<typename SC, typename LO, typename GO, typename NO>
3756 void AddKernels<SC, LO, GO, NO>::
3757 addUnsorted(
3758  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Avals,
3759  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Arowptrs,
3760  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Acolinds,
3761  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3762  const typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Bvals,
3763  const typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array_const& Browptrs,
3764  const typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Bcolinds,
3765  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3766 #if KOKKOSKERNELS_VERSION >= 40299
3767  GO numGlobalCols,
3768 #else
3769  GO /* numGlobalCols */,
3770 #endif
3771  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3772  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3773  typename MMdetails::AddKernels<SC, LO, GO, NO>::col_inds_array& Ccolinds)
3774 {
3775  using Teuchos::TimeMonitor;
3776  using AddKern = MMdetails::AddKernels<SC, LO, GO, NO>;
3777  TEUCHOS_TEST_FOR_EXCEPTION(Arowptrs.extent(0) != Browptrs.extent(0), std::runtime_error, "Can't add matrices with different numbers of rows.");
3778  auto nrows = Arowptrs.extent(0) - 1;
3779  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3780  typedef MMdetails::AddKernels<SC, LO, GO, NO> AddKern;
3781  typename AddKern::KKH handle;
3782  handle.create_spadd_handle(false);
3783  auto addHandle = handle.get_spadd_handle();
3784 #ifdef HAVE_TPETRA_MMM_TIMINGS
3785  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted symbolic")));
3786 #endif
3787  KokkosSparse::Experimental::spadd_symbolic
3788  (&handle,
3789 #if KOKKOSKERNELS_VERSION >= 40299
3790  nrows, numGlobalCols,
3791 #endif
3792  Arowptrs, Acolinds, Browptrs, Bcolinds, Crowptrs);
3793  //Cvals must be zeroed out
3794  Cvals = values_array("C values", addHandle->get_c_nnz());
3795  Ccolinds = col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3796 #ifdef HAVE_TPETRA_MMM_TIMINGS
3797  MM = Teuchos::null;
3798  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() unsorted kernel: unsorted numeric")));
3799 #endif
3800  KokkosSparse::Experimental::spadd_numeric(&handle,
3801 #if KOKKOSKERNELS_VERSION >= 40299
3802  nrows, numGlobalCols,
3803 #endif
3804  Arowptrs, Acolinds, Avals, scalarA,
3805  Browptrs, Bcolinds, Bvals, scalarB,
3806  Crowptrs, Ccolinds, Cvals);
3807 }
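// [Editorial note] addSorted and addUnsorted differ only in the flag passed to
// create_spadd_handle: true tells KokkosSparse spadd that both inputs have rows sorted by
// column id (enabling the cheaper single-pass merge), while false selects the unsorted
// path used here. Both then run the same spadd_symbolic / spadd_numeric pair of calls.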
3808 
3809 template<typename GO,
3810  typename LocalIndicesType,
3811  typename GlobalIndicesType,
3812  typename ColMapType>
3813 struct ConvertLocalToGlobalFunctor
3814 {
3815  ConvertLocalToGlobalFunctor(
3816  const LocalIndicesType& colindsOrig_,
3817  const GlobalIndicesType& colindsConverted_,
3818  const ColMapType& colmap_) :
3819  colindsOrig (colindsOrig_),
3820  colindsConverted (colindsConverted_),
3821  colmap (colmap_)
3822  {}
3823  KOKKOS_INLINE_FUNCTION void
3824  operator() (const GO i) const
3825  {
3826  colindsConverted(i) = colmap.getGlobalElement(colindsOrig(i));
3827  }
3828  LocalIndicesType colindsOrig;
3829  GlobalIndicesType colindsConverted;
3830  ColMapType colmap;
3831 };
3832 
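// Adds the local matrices A and B (scaled by scalarA and scalarB) in the case
// where their column Maps differ.  Both sets of local column indices are first
// converted to global indices with ConvertLocalToGlobalFunctor, so the
// KokkosKernels spadd kernels can run in a common index space; the result is
// therefore returned with global column indices in Ccolinds, and the caller is
// expected to build a column Map for C and convert back to local indices.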
3833 template<typename SC, typename LO, typename GO, typename NO>
3834 void MMdetails::AddKernels<SC, LO, GO, NO>::
3835 convertToGlobalAndAdd(
3836  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& A,
3837  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarA,
3838  const typename MMdetails::AddKernels<SC, LO, GO, NO>::KCRS& B,
3839  const typename MMdetails::AddKernels<SC, LO, GO, NO>::impl_scalar_type scalarB,
3840  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& AcolMap,
3841  const typename MMdetails::AddKernels<SC, LO, GO, NO>::local_map_type& BcolMap,
3842  typename MMdetails::AddKernels<SC, LO, GO, NO>::values_array& Cvals,
3843  typename MMdetails::AddKernels<SC, LO, GO, NO>::row_ptrs_array& Crowptrs,
3844  typename MMdetails::AddKernels<SC, LO, GO, NO>::global_col_inds_array& Ccolinds)
3845 {
3846  using Teuchos::TimeMonitor;
3847  //Need to use a different KokkosKernelsHandle type than other versions,
3848  //since the ordinals are now GO
3849  using KKH_GO = KokkosKernels::Experimental::KokkosKernelsHandle<size_t, GO, impl_scalar_type,
3850  typename NO::execution_space, typename NO::memory_space, typename NO::memory_space>;
3851 
3852  const values_array Avals = A.values;
3853  const values_array Bvals = B.values;
3854  const col_inds_array Acolinds = A.graph.entries;
3855  const col_inds_array Bcolinds = B.graph.entries;
3856  auto Arowptrs = A.graph.row_map;
3857  auto Browptrs = B.graph.row_map;
3858  global_col_inds_array AcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("A colinds (converted)"), Acolinds.extent(0));
3859  global_col_inds_array BcolindsConverted(Kokkos::ViewAllocateWithoutInitializing("B colinds (converted)"), Bcolinds.extent(0));
3860 #ifdef HAVE_TPETRA_MMM_TIMINGS
3861  auto MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: column map conversion")));
3862 #endif
3863  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertA(Acolinds, AcolindsConverted, AcolMap);
3864  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsA", range_type(0, Acolinds.extent(0)), convertA);
3865  ConvertLocalToGlobalFunctor<GO, col_inds_array, global_col_inds_array, local_map_type> convertB(Bcolinds, BcolindsConverted, BcolMap);
3866  Kokkos::parallel_for("Tpetra_MatrixMatrix_convertColIndsB", range_type(0, Bcolinds.extent(0)), convertB);
3867  KKH_GO handle;
3868  handle.create_spadd_handle(false);
3869  auto addHandle = handle.get_spadd_handle();
3870 #ifdef HAVE_TPETRA_MMM_TIMINGS
3871  MM = Teuchos::null;
3872  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted symbolic")));
3873 #endif
3874  auto nrows = Arowptrs.extent(0) - 1;
3875  Crowptrs = row_ptrs_array(Kokkos::ViewAllocateWithoutInitializing("C row ptrs"), nrows + 1);
3876  KokkosSparse::Experimental::spadd_symbolic
3877  (&handle,
3878 #if KOKKOSKERNELS_VERSION >= 40299
3879  nrows, A.numCols(),
3880 #endif
3881  Arowptrs, AcolindsConverted, Browptrs, BcolindsConverted, Crowptrs);
3882  Cvals = values_array("C values", addHandle->get_c_nnz());
3883  Ccolinds = global_col_inds_array(Kokkos::ViewAllocateWithoutInitializing("C colinds"), addHandle->get_c_nnz());
3884 #ifdef HAVE_TPETRA_MMM_TIMINGS
3885  MM = Teuchos::null;
3886  MM = rcp(new TimeMonitor(*TimeMonitor::getNewTimer("TpetraExt::MatrixMatrix::add() diff col map kernel: unsorted numeric")));
3887 #endif
3888  KokkosSparse::Experimental::spadd_numeric(&handle,
3889 #if KOKKOSKERNELS_VERSION >= 40299
3890  nrows, A.numCols(),
3891 #endif
3892  Arowptrs, AcolindsConverted, Avals, scalarA,
3893  Browptrs, BcolindsConverted, Bvals, scalarB,
3894  Crowptrs, Ccolinds, Cvals);
3895 }
3896 
3897 
3898 } //End namespace MMdetails
3899 
3900 } //End namespace Tpetra
3901 
3902 /*********************************************************************************************************/
3903 //
3904 // Explicit instantiation macro
3905 //
3906 // Must be expanded from within the Tpetra namespace!
3907 //
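// A minimal expansion sketch (an assumption, not part of this header): an ETI
// source file generated by the build is expected to expand the macro below once
// per enabled (SCALAR, LO, GO, NODE) combination, from inside the Tpetra
// namespace, e.g.
//
//   namespace Tpetra {
//     TPETRA_MATRIXMATRIX_INSTANT(double, int, long long, DefaultNode)
//   }
//
// where DefaultNode is a placeholder for whatever Kokkos node type the build
// configuration enables.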
3908 namespace Tpetra {
3909 
3910 #define TPETRA_MATRIXMATRIX_INSTANT(SCALAR,LO,GO,NODE) \
3911 template \
3912  void MatrixMatrix::Multiply( \
3913  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3914  bool transposeA, \
3915  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3916  bool transposeB, \
3917  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3918  bool call_FillComplete_on_result, \
3919  const std::string & label, \
3920  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3921 \
3922 template \
3923  void MatrixMatrix::Multiply( \
3924  const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& A, \
3925  bool transposeA, \
3926  const Teuchos::RCP<const BlockCrsMatrix< SCALAR , LO , GO , NODE > >& B, \
3927  bool transposeB, \
3928  Teuchos::RCP<BlockCrsMatrix< SCALAR , LO , GO , NODE > >& C, \
3929  const std::string & label); \
3930 \
3931 template \
3932  void MatrixMatrix::Jacobi( \
3933  SCALAR omega, \
3934  const Vector< SCALAR, LO, GO, NODE > & Dinv, \
3935  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3936  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3937  CrsMatrix< SCALAR , LO , GO , NODE >& C, \
3938  bool call_FillComplete_on_result, \
3939  const std::string & label, \
3940  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3941 \
3942  template \
3943  void MatrixMatrix::Add( \
3944  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3945  bool transposeA, \
3946  SCALAR scalarA, \
3947  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3948  bool transposeB, \
3949  SCALAR scalarB, \
3950  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3951 \
3952  template \
3953  void MatrixMatrix::Add( \
3954  const CrsMatrix< SCALAR , LO , GO , NODE >& A, \
3955  bool transposeA, \
3956  SCALAR scalarA, \
3957  const CrsMatrix< SCALAR , LO , GO , NODE >& B, \
3958  bool transposeB, \
3959  SCALAR scalarB, \
3960  const Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > >& C); \
3961 \
3962  template \
3963  void MatrixMatrix::Add( \
3964  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3965  bool transposeA, \
3966  SCALAR scalarA, \
3967  CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3968  SCALAR scalarB ); \
3969 \
3970  template \
3971  Teuchos::RCP<CrsMatrix< SCALAR , LO , GO , NODE > > \
3972  MatrixMatrix::add<SCALAR, LO, GO, NODE> \
3973  (const SCALAR & alpha, \
3974  const bool transposeA, \
3975  const CrsMatrix<SCALAR, LO, GO, NODE>& A, \
3976  const SCALAR & beta, \
3977  const bool transposeB, \
3978  const CrsMatrix<SCALAR, LO, GO, NODE>& B, \
3979  const Teuchos::RCP<const Map<LO, GO, NODE> >& domainMap, \
3980  const Teuchos::RCP<const Map<LO, GO, NODE> >& rangeMap, \
3981  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3982 \
3983  template \
3984  void \
3985  MatrixMatrix::add< SCALAR , LO, GO , NODE > \
3986  (const SCALAR & alpha, \
3987  const bool transposeA, \
3988  const CrsMatrix< SCALAR , LO, GO , NODE >& A, \
3989  const SCALAR& beta, \
3990  const bool transposeB, \
3991  const CrsMatrix< SCALAR , LO, GO , NODE >& B, \
3992  CrsMatrix< SCALAR , LO, GO , NODE >& C, \
3993  const Teuchos::RCP<const Map<LO, GO , NODE > >& domainMap, \
3994  const Teuchos::RCP<const Map<LO, GO , NODE > >& rangeMap, \
3995  const Teuchos::RCP<Teuchos::ParameterList>& params); \
3996 \
3997  template struct MMdetails::AddKernels<SCALAR, LO, GO, NODE>; \
3998 \
3999  template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const CrsMatrix<SCALAR, LO, GO, NODE>& M, \
4000  Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
4001  CrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
4002  Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
4003  bool userAssertsThereAreNoRemotes, \
4004  const std::string& label, \
4005  const Teuchos::RCP<Teuchos::ParameterList>& params); \
4006 \
4007  template void MMdetails::import_and_extract_views<SCALAR, LO, GO, NODE>(const BlockCrsMatrix<SCALAR, LO, GO, NODE>& M, \
4008  Teuchos::RCP<const Map<LO, GO, NODE> > targetMap, \
4009  BlockCrsMatrixStruct<SCALAR, LO, GO, NODE>& Mview, \
4010  Teuchos::RCP<const Import<LO,GO, NODE> > prototypeImporter, \
4011  bool userAssertsThereAreNoRemotes);
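// For orientation (a sketch, not part of this file): user code typically calls
// the functions instantiated above roughly as
//
//   Tpetra::MatrixMatrix::Multiply(A, false, B, false, C);             // C = A*B
//   auto C2 = Tpetra::MatrixMatrix::add(one, false, A, one, false, B); // C2 = A+B
//
// where A, B, and C are CrsMatrix<SCALAR, LO, GO, NODE> objects (C2 is returned
// as a Teuchos::RCP), "one" is a SCALAR equal to 1, and the optional label and
// parameter-list arguments are left at their defaults.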
4012 } //End namespace Tpetra
4013 
4014 #endif // TPETRA_MATRIXMATRIX_DEF_HPP