Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_CRSMATRIX_DEF_HPP
11 #define TPETRA_CRSMATRIX_DEF_HPP
12 
20 
21 #include "Tpetra_Import_Util.hpp"
22 #include "Tpetra_Import_Util2.hpp"
23 #include "Tpetra_RowMatrix.hpp"
24 #include "Tpetra_LocalCrsMatrixOperator.hpp"
25 
32 #include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
40 #include "Tpetra_Details_packCrsMatrix.hpp"
41 #include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
43 #include "Teuchos_FancyOStream.hpp"
44 #include "Teuchos_RCP.hpp"
45 #include "Teuchos_DataAccess.hpp"
46 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
47 #include "KokkosBlas1_scal.hpp"
48 #include "KokkosSparse_getDiagCopy.hpp"
49 #include "KokkosSparse_spmv.hpp"
50 #include "Kokkos_StdAlgorithms.hpp"
51 
52 #include <memory>
53 #include <sstream>
54 #include <typeinfo>
55 #include <utility>
56 #include <vector>
57 
58 namespace Tpetra {
59 
60 namespace { // (anonymous)
61 
62  template<class T, class BinaryFunction>
63  T atomic_binary_function_update (volatile T* const dest,
64  const T& inputVal,
65  BinaryFunction f)
66  {
67  T oldVal = *dest;
68  T assume;
69 
70  // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
71  // POWER architectures, because 'newval' depends on 'assume',
72  // which depends on 'oldVal', which depends on '*dest'. This
73  // sets up a chain of read dependencies that should ensure
74  // correct behavior given a sane memory model.
75  do {
76  assume = oldVal;
77  T newVal = f (assume, inputVal);
78  oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
79  } while (assume != oldVal);
80 
81  return oldVal;
82  }
83 } // namespace (anonymous)
84 
85 //
86 // Users must never rely on anything in the Details namespace.
87 //
88 namespace Details {
89 
99 template<class Scalar>
100 struct AbsMax {
102  Scalar operator() (const Scalar& x, const Scalar& y) {
103  typedef Teuchos::ScalarTraits<Scalar> STS;
104  return std::max (STS::magnitude (x), STS::magnitude (y));
105  }
106 };
107 
108 } // namespace Details
109 } // namespace Tpetra
110 
111 namespace Tpetra {
112 
113  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
114  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
115  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
116  size_t maxNumEntriesPerRow,
117  const Teuchos::RCP<Teuchos::ParameterList>& params) :
118  dist_object_type (rowMap)
119  {
120  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t "
121  "[, RCP<ParameterList>]): ";
122  Teuchos::RCP<crs_graph_type> graph;
123  try {
124  graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
125  params));
126  }
127  catch (std::exception& e) {
128  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
129  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
130  "size_t [, RCP<ParameterList>]) threw an exception: "
131  << e.what ());
132  }
133  // myGraph_ not null means that the matrix owns the graph. That's
134  // different than the const CrsGraph constructor, where the matrix
135  // does _not_ own the graph.
136  myGraph_ = graph;
137  staticGraph_ = myGraph_;
138  resumeFill (params);
140  }
141 
142  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
144  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
145  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
146  const Teuchos::RCP<Teuchos::ParameterList>& params) :
147  dist_object_type (rowMap)
148  {
149  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
150  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
151  Teuchos::RCP<crs_graph_type> graph;
152  try {
153  using Teuchos::rcp;
154  graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
155  params));
156  }
157  catch (std::exception& e) {
158  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
159  (true, std::runtime_error, "CrsGraph constructor "
160  "(RCP<const Map>, ArrayView<const size_t>"
161  "[, RCP<ParameterList>]) threw an exception: "
162  << e.what ());
163  }
164  // myGraph_ not null means that the matrix owns the graph. That's
165  // different than the const CrsGraph constructor, where the matrix
166  // does _not_ own the graph.
167  myGraph_ = graph;
168  staticGraph_ = graph;
169  resumeFill (params);
171  }
172 
173  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
175  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
176  const Teuchos::RCP<const map_type>& colMap,
177  const size_t maxNumEntPerRow,
178  const Teuchos::RCP<Teuchos::ParameterList>& params) :
179  dist_object_type (rowMap)
180  {
181  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
182  "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
183  const char suffix[] =
184  " Please report this bug to the Tpetra developers.";
185 
186  // An artifact of debugging something a while back.
187  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
188  (! staticGraph_.is_null (), std::logic_error,
189  "staticGraph_ is not null at the beginning of the constructor."
190  << suffix);
191  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
192  (! myGraph_.is_null (), std::logic_error,
193  "myGraph_ is not null at the beginning of the constructor."
194  << suffix);
195  Teuchos::RCP<crs_graph_type> graph;
196  try {
197  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
198  maxNumEntPerRow,
199  params));
200  }
201  catch (std::exception& e) {
202  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
203  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
204  "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
205  "exception: " << e.what ());
206  }
207  // myGraph_ not null means that the matrix owns the graph. That's
208  // different than the const CrsGraph constructor, where the matrix
209  // does _not_ own the graph.
210  myGraph_ = graph;
211  staticGraph_ = myGraph_;
212  resumeFill (params);
214  }
215 
216  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
218  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
219  const Teuchos::RCP<const map_type>& colMap,
220  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
221  const Teuchos::RCP<Teuchos::ParameterList>& params) :
222  dist_object_type (rowMap)
223  {
224  const char tfecfFuncName[] =
225  "CrsMatrix(RCP<const Map>, RCP<const Map>, "
226  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
227  Teuchos::RCP<crs_graph_type> graph;
228  try {
229  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
230  numEntPerRowToAlloc,
231  params));
232  }
233  catch (std::exception& e) {
234  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
235  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
236  "RCP<const Map>, ArrayView<const size_t>[, "
237  "RCP<ParameterList>]) threw an exception: " << e.what ());
238  }
239  // myGraph_ not null means that the matrix owns the graph. That's
240  // different than the const CrsGraph constructor, where the matrix
241  // does _not_ own the graph.
242  myGraph_ = graph;
243  staticGraph_ = graph;
244  resumeFill (params);
246  }
247 
248 
249  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
251  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
252  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
253  dist_object_type (graph->getRowMap ()),
254  staticGraph_ (graph),
255  storageStatus_ (Details::STORAGE_1D_PACKED)
256  {
257  using std::endl;
258  typedef typename local_matrix_device_type::values_type values_type;
259  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
260  "RCP<ParameterList>]): ";
261  const bool verbose = Details::Behavior::verbose("CrsMatrix");
262 
263  std::unique_ptr<std::string> prefix;
264  if (verbose) {
265  prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
266  std::ostringstream os;
267  os << *prefix << "Start" << endl;
268  std::cerr << os.str ();
269  }
270 
271  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
272  (graph.is_null (), std::runtime_error, "Input graph is null.");
273  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
274  (! graph->isFillComplete (), std::runtime_error, "Input graph "
275  "is not fill complete. You must call fillComplete on the "
276  "graph before using it to construct a CrsMatrix. Note that "
277  "calling resumeFill on the graph makes it not fill complete, "
278  "even if you had previously called fillComplete. In that "
279  "case, you must call fillComplete on the graph again.");
280 
281  // The graph is fill complete, so it is locally indexed and has a
282  // fixed structure. This means we can allocate the (1-D) array of
283  // values and build the local matrix right now. Note that the
284  // local matrix's number of columns comes from the column Map, not
285  // the domain Map.
286 
287  const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
288  if (verbose) {
289  std::ostringstream os;
290  os << *prefix << "Allocate values: " << numEnt << endl;
291  std::cerr << os.str ();
292  }
293 
294  values_type val ("Tpetra::CrsMatrix::values", numEnt);
295  valuesPacked_wdv = values_wdv_type(val);
296  valuesUnpacked_wdv = valuesPacked_wdv;
297 
299 
300  if (verbose) {
301  std::ostringstream os;
302  os << *prefix << "Done" << endl;
303  std::cerr << os.str ();
304  }
305  }
306 
307  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
310  const Teuchos::RCP<const crs_graph_type>& graph,
311  const Teuchos::RCP<Teuchos::ParameterList>& params) :
312  dist_object_type (graph->getRowMap ()),
313  staticGraph_ (graph),
314  storageStatus_ (matrix.storageStatus_)
315  {
316  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
317  "local_matrix_device_type::values_type, "
318  "[,RCP<ParameterList>]): ";
319  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
320  (graph.is_null (), std::runtime_error, "Input graph is null.");
321  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
322  (! graph->isFillComplete (), std::runtime_error, "Input graph "
323  "is not fill complete. You must call fillComplete on the "
324  "graph before using it to construct a CrsMatrix. Note that "
325  "calling resumeFill on the graph makes it not fill complete, "
326  "even if you had previously called fillComplete. In that "
327  "case, you must call fillComplete on the graph again.");
328 
329  size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
330  valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
331 
332  size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
333  valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
334 
336  }
337 
338 
339  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
341  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
342  const typename local_matrix_device_type::values_type& values,
343  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
344  dist_object_type (graph->getRowMap ()),
345  staticGraph_ (graph),
346  storageStatus_ (Details::STORAGE_1D_PACKED)
347  {
348  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
349  "local_matrix_device_type::values_type, "
350  "[,RCP<ParameterList>]): ";
351  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
352  (graph.is_null (), std::runtime_error, "Input graph is null.");
353  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
354  (! graph->isFillComplete (), std::runtime_error, "Input graph "
355  "is not fill complete. You must call fillComplete on the "
356  "graph before using it to construct a CrsMatrix. Note that "
357  "calling resumeFill on the graph makes it not fill complete, "
358  "even if you had previously called fillComplete. In that "
359  "case, you must call fillComplete on the graph again.");
360 
361  // The graph is fill complete, so it is locally indexed and has a
362  // fixed structure. This means we can allocate the (1-D) array of
363  // values and build the local matrix right now. Note that the
364  // local matrix's number of columns comes from the column Map, not
365  // the domain Map.
366 
367  valuesPacked_wdv = values_wdv_type(values);
368  valuesUnpacked_wdv = valuesPacked_wdv;
369 
370  // FIXME (22 Jun 2016) I would very much like to get rid of
371  // k_values1D_ at some point. I find it confusing to have all
372  // these extra references lying around.
373  // KDDKDD ALMOST THERE, MARK!
374 // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
375 
377  }
378 
379  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
381  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
382  const Teuchos::RCP<const map_type>& colMap,
383  const typename local_graph_device_type::row_map_type& rowPointers,
384  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
385  const typename local_matrix_device_type::values_type& values,
386  const Teuchos::RCP<Teuchos::ParameterList>& params) :
387  dist_object_type (rowMap),
388  storageStatus_ (Details::STORAGE_1D_PACKED)
389  {
390  using Details::getEntryOnHost;
391  using Teuchos::RCP;
392  using std::endl;
393  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
394  "RCP<const Map>, ptr, ind, val[, params]): ";
395  const char suffix[] =
396  ". Please report this bug to the Tpetra developers.";
397  const bool debug = Details::Behavior::debug("CrsMatrix");
398  const bool verbose = Details::Behavior::verbose("CrsMatrix");
399 
400  std::unique_ptr<std::string> prefix;
401  if (verbose) {
402  prefix = this->createPrefix(
403  "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
404  std::ostringstream os;
405  os << *prefix << "Start" << endl;
406  std::cerr << os.str ();
407  }
408 
409  // Check the user's input. Note that this might throw only on
410  // some processes but not others, causing deadlock. We prefer
411  // deadlock due to exceptions to segfaults, because users can
412  // catch exceptions.
413  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
414  (values.extent(0) != columnIndices.extent(0),
415  std::invalid_argument, "values.extent(0)=" << values.extent(0)
416  << " != columnIndices.extent(0) = " << columnIndices.extent(0)
417  << ".");
418  if (debug && rowPointers.extent(0) != 0) {
419  const size_t numEnt =
420  getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
421  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
422  (numEnt != size_t(columnIndices.extent(0)) ||
423  numEnt != size_t(values.extent(0)),
424  std::invalid_argument, "Last entry of rowPointers says that "
425  "the matrix has " << numEnt << " entr"
426  << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
427  "columnIndices and values don't match this. "
428  "columnIndices.extent(0)=" << columnIndices.extent (0)
429  << " and values.extent(0)=" << values.extent (0) << ".");
430  }
431 
432  RCP<crs_graph_type> graph;
433  try {
434  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
435  columnIndices, params));
436  }
437  catch (std::exception& e) {
438  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
439  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
440  "RCP<const Map>, ptr, ind[, params]) threw an exception: "
441  << e.what ());
442  }
443  // The newly created CrsGraph _must_ have a local graph at this
444  // point. We don't really care whether CrsGraph's constructor
445  // deep-copies or shallow-copies the input, but the dimensions
446  // have to be right. That's how we tell whether the CrsGraph has
447  // a local graph.
448  auto lclGraph = graph->getLocalGraphDevice ();
449  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
450  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
451  lclGraph.entries.extent (0) != columnIndices.extent (0),
452  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
453  "ind[, params]) did not set the local graph correctly." << suffix);
454  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
455  (lclGraph.entries.extent (0) != values.extent (0),
456  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
457  "params]) did not set the local graph correctly. "
458  "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
459  << " != values.extent(0) = " << values.extent (0) << suffix);
460 
461  // myGraph_ not null means that the matrix owns the graph. This
462  // is true because the column indices come in as nonconst,
463  // implying shared ownership.
464  myGraph_ = graph;
465  staticGraph_ = graph;
466 
467  // The graph may not be fill complete yet. However, it is locally
468  // indexed (since we have a column Map) and has a fixed structure
469  // (due to the input arrays). This means we can allocate the
470  // (1-D) array of values and build the local matrix right now.
471  // Note that the local matrix's number of columns comes from the
472  // column Map, not the domain Map.
473 
474  valuesPacked_wdv = values_wdv_type(values);
475  valuesUnpacked_wdv = valuesPacked_wdv;
476 
477  // FIXME (22 Jun 2016) I would very much like to get rid of
478  // k_values1D_ at some point. I find it confusing to have all
479  // these extra references lying around.
480 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
481 
483  if (verbose) {
484  std::ostringstream os;
485  os << *prefix << "Done" << endl;
486  std::cerr << os.str();
487  }
488  }
489 
490  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
492  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
493  const Teuchos::RCP<const map_type>& colMap,
494  const Teuchos::ArrayRCP<size_t>& ptr,
495  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
496  const Teuchos::ArrayRCP<Scalar>& val,
497  const Teuchos::RCP<Teuchos::ParameterList>& params) :
498  dist_object_type (rowMap),
499  storageStatus_ (Details::STORAGE_1D_PACKED)
500  {
501  using Kokkos::Compat::getKokkosViewDeepCopy;
502  using Teuchos::av_reinterpret_cast;
503  using Teuchos::RCP;
504  using values_type = typename local_matrix_device_type::values_type;
505  using IST = impl_scalar_type;
506  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
507  "RCP<const Map>, ptr, ind, val[, params]): ";
508 
509  RCP<crs_graph_type> graph;
510  try {
511  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
512  ind, params));
513  }
514  catch (std::exception& e) {
515  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
516  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
517  "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
518  "RCP<ParameterList>]) threw an exception: " << e.what ());
519  }
520  // myGraph_ not null means that the matrix owns the graph. This
521  // is true because the column indices come in as nonconst,
522  // implying shared ownership.
523  myGraph_ = graph;
524  staticGraph_ = graph;
525 
526  // The graph may not be fill complete yet. However, it is locally
527  // indexed (since we have a column Map) and has a fixed structure
528  // (due to the input arrays). This means we can allocate the
529  // (1-D) array of values and build the local matrix right now.
530  // Note that the local matrix's number of columns comes from the
531  // column Map, not the domain Map.
532 
533  // The graph _must_ have a local graph at this point. We don't
534  // really care whether CrsGraph's constructor deep-copies or
535  // shallow-copies the input, but the dimensions have to be right.
536  // That's how we tell whether the CrsGraph has a local graph.
537  auto lclGraph = staticGraph_->getLocalGraphDevice ();
538  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
539  (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
540  size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
541  std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
542  "ptr, ind[, params]) did not set the local graph correctly. "
543  "Please report this bug to the Tpetra developers.");
544 
545  values_type valIn =
546  getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
547  valuesPacked_wdv = values_wdv_type(valIn);
548  valuesUnpacked_wdv = valuesPacked_wdv;
549 
550  // FIXME (22 Jun 2016) I would very much like to get rid of
551  // k_values1D_ at some point. I find it confusing to have all
552  // these extra references lying around.
553 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
554 
556  }
557 
558  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
560  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
561  const Teuchos::RCP<const map_type>& colMap,
562  const local_matrix_device_type& lclMatrix,
563  const Teuchos::RCP<Teuchos::ParameterList>& params) :
564  dist_object_type (rowMap),
565  storageStatus_ (Details::STORAGE_1D_PACKED),
566  fillComplete_ (true)
567  {
568  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
569  "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
570  const char suffix[] =
571  " Please report this bug to the Tpetra developers.";
572 
573  Teuchos::RCP<crs_graph_type> graph;
574  try {
575  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
576  lclMatrix.graph, params));
577  }
578  catch (std::exception& e) {
579  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
580  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
581  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
582  "exception: " << e.what ());
583  }
584  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
585  (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
586  "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
587  "did not produce a fill-complete graph. Please report this bug to the "
588  "Tpetra developers.");
589  // myGraph_ not null means that the matrix owns the graph. This
590  // is true because the column indices come in as nonconst through
591  // the matrix, implying shared ownership.
592  myGraph_ = graph;
593  staticGraph_ = graph;
594 
595  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
596  valuesUnpacked_wdv = valuesPacked_wdv;
597 
598  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
599  (isFillActive (), std::logic_error,
600  "At the end of a CrsMatrix constructor that should produce "
601  "a fillComplete matrix, isFillActive() is true." << suffix);
602  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
603  (! isFillComplete (), std::logic_error, "At the end of a "
604  "CrsMatrix constructor that should produce a fillComplete "
605  "matrix, isFillComplete() is false." << suffix);
607  }
608 
609  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
612  const Teuchos::RCP<const map_type>& rowMap,
613  const Teuchos::RCP<const map_type>& colMap,
614  const Teuchos::RCP<const map_type>& domainMap,
615  const Teuchos::RCP<const map_type>& rangeMap,
616  const Teuchos::RCP<Teuchos::ParameterList>& params) :
617  dist_object_type (rowMap),
618  storageStatus_ (Details::STORAGE_1D_PACKED),
619  fillComplete_ (true)
620  {
621  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
622  "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
623  "local_matrix_device_type[, RCP<ParameterList>]): ";
624  const char suffix[] =
625  " Please report this bug to the Tpetra developers.";
626 
627  Teuchos::RCP<crs_graph_type> graph;
628  try {
629  graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
630  domainMap, rangeMap, params));
631  }
632  catch (std::exception& e) {
633  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
634  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
635  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
636  "RCP<ParameterList>]) threw an exception: " << e.what ());
637  }
638  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
639  (! graph->isFillComplete (), std::logic_error, "CrsGraph "
640  "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
641  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
642  "not produce a fillComplete graph." << suffix);
643  // myGraph_ not null means that the matrix owns the graph. This
644  // is true because the column indices come in as nonconst through
645  // the matrix, implying shared ownership.
646  myGraph_ = graph;
647  staticGraph_ = graph;
648 
649  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
650  valuesUnpacked_wdv = valuesPacked_wdv;
651 
652  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
653  (isFillActive (), std::logic_error,
654  "At the end of a CrsMatrix constructor that should produce "
655  "a fillComplete matrix, isFillActive() is true." << suffix);
656  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
657  (! isFillComplete (), std::logic_error, "At the end of a "
658  "CrsMatrix constructor that should produce a fillComplete "
659  "matrix, isFillComplete() is false." << suffix);
661  }
662 
663  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
666  const Teuchos::RCP<const map_type>& rowMap,
667  const Teuchos::RCP<const map_type>& colMap,
668  const Teuchos::RCP<const map_type>& domainMap,
669  const Teuchos::RCP<const map_type>& rangeMap,
670  const Teuchos::RCP<const import_type>& importer,
671  const Teuchos::RCP<const export_type>& exporter,
672  const Teuchos::RCP<Teuchos::ParameterList>& params) :
673  dist_object_type (rowMap),
674  storageStatus_ (Details::STORAGE_1D_PACKED),
675  fillComplete_ (true)
676  {
677  using Teuchos::rcp;
678  const char tfecfFuncName[] = "Tpetra::CrsMatrix"
679  "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
680  const char suffix[] =
681  " Please report this bug to the Tpetra developers.";
682 
683  Teuchos::RCP<crs_graph_type> graph;
684  try {
685  graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
686  domainMap, rangeMap, importer,
687  exporter, params));
688  }
689  catch (std::exception& e) {
690  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
691  (true, std::runtime_error, "CrsGraph constructor "
692  "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
693  "params) threw: " << e.what ());
694  }
695  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
696  (!graph->isFillComplete (), std::logic_error, "CrsGraph "
697  "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
698  "Export, params) did not produce a fill-complete graph. "
699  "Please report this bug to the Tpetra developers.");
700  // myGraph_ not null means that the matrix owns the graph. This
701  // is true because the column indices come in as nonconst through
702  // the matrix, implying shared ownership.
703  myGraph_ = graph;
704  staticGraph_ = graph;
705 
706  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
707  valuesUnpacked_wdv = valuesPacked_wdv;
708 
709  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
710  (isFillActive (), std::logic_error,
711  "At the end of a CrsMatrix constructor that should produce "
712  "a fillComplete matrix, isFillActive() is true." << suffix);
713  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
714  (! isFillComplete (), std::logic_error, "At the end of a "
715  "CrsMatrix constructor that should produce a fillComplete "
716  "matrix, isFillComplete() is false." << suffix);
718  }
719 
720  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
723  const Teuchos::DataAccess copyOrView):
724  dist_object_type (source.getCrsGraph()->getRowMap ()),
725  staticGraph_ (source.getCrsGraph()),
726  storageStatus_ (source.storageStatus_)
727  {
728  const char tfecfFuncName[] = "Tpetra::CrsMatrix("
729  "const CrsMatrix&, const Teuchos::DataAccess): ";
730  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
731  (! source.isFillComplete (), std::invalid_argument,
732  "Source graph must be fillComplete().");
733 
734  if (copyOrView == Teuchos::Copy) {
735  using values_type = typename local_matrix_device_type::values_type;
736  auto vals = source.getLocalValuesDevice (Access::ReadOnly);
737  using Kokkos::view_alloc;
738  using Kokkos::WithoutInitializing;
739  values_type newvals (view_alloc ("val", WithoutInitializing),
740  vals.extent (0));
741  // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
742  Kokkos::deep_copy (newvals, vals);
743  valuesPacked_wdv = values_wdv_type(newvals);
744  valuesUnpacked_wdv = valuesPacked_wdv;
745  fillComplete (source.getDomainMap (), source.getRangeMap ());
746  }
747  else if (copyOrView == Teuchos::View) {
748  valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
749  valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
750  fillComplete (source.getDomainMap (), source.getRangeMap ());
751  }
752  else {
753  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
754  (true, std::invalid_argument, "Second argument 'copyOrView' "
755  "has an invalid value " << copyOrView << ". Valid values "
756  "include Teuchos::Copy = " << Teuchos::Copy << " and "
757  "Teuchos::View = " << Teuchos::View << ".");
758  }
760  }
761 
762  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
763  void
766  {
767  std::swap(crs_matrix.importMV_, this->importMV_);
768  std::swap(crs_matrix.exportMV_, this->exportMV_);
769  std::swap(crs_matrix.staticGraph_, this->staticGraph_);
770  std::swap(crs_matrix.myGraph_, this->myGraph_);
771  std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
772  std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
773  std::swap(crs_matrix.storageStatus_, this->storageStatus_);
774  std::swap(crs_matrix.fillComplete_, this->fillComplete_);
775  std::swap(crs_matrix.nonlocals_, this->nonlocals_);
776  }
777 
778  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
779  Teuchos::RCP<const Teuchos::Comm<int> >
781  getComm () const {
782  return getCrsGraphRef ().getComm ();
783  }
784 
785  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
786  bool
788  isFillComplete () const {
789  return fillComplete_;
790  }
791 
792  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
793  bool
795  isFillActive () const {
796  return ! fillComplete_;
797  }
798 
799  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
800  bool
803  return this->getCrsGraphRef ().isStorageOptimized ();
804  }
805 
806  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
807  bool
810  return getCrsGraphRef ().isLocallyIndexed ();
811  }
812 
813  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
814  bool
817  return getCrsGraphRef ().isGloballyIndexed ();
818  }
819 
820  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
821  bool
823  hasColMap () const {
824  return getCrsGraphRef ().hasColMap ();
825  }
826 
827  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
831  return getCrsGraphRef ().getGlobalNumEntries ();
832  }
833 
834  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
835  size_t
838  return getCrsGraphRef ().getLocalNumEntries ();
839  }
840 
841  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
845  return getCrsGraphRef ().getGlobalNumRows ();
846  }
847 
848  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
852  return getCrsGraphRef ().getGlobalNumCols ();
853  }
854 
855  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
856  size_t
859  return getCrsGraphRef ().getLocalNumRows ();
860  }
861 
862 
863  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
864  size_t
867  return getCrsGraphRef ().getLocalNumCols ();
868  }
869 
870 
871  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
872  size_t
874  getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
875  return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
876  }
877 
878  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
879  size_t
881  getNumEntriesInLocalRow (LocalOrdinal localRow) const {
882  return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
883  }
884 
885  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
886  size_t
889  return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
890  }
891 
892  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
893  size_t
896  return getCrsGraphRef ().getLocalMaxNumRowEntries ();
897  }
898 
899  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
900  GlobalOrdinal
902  getIndexBase () const {
903  return getRowMap ()->getIndexBase ();
904  }
905 
906  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
907  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
909  getRowMap () const {
910  return getCrsGraphRef ().getRowMap ();
911  }
912 
913  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
914  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
916  getColMap () const {
917  return getCrsGraphRef ().getColMap ();
918  }
919 
920  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
921  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
923  getDomainMap () const {
924  return getCrsGraphRef ().getDomainMap ();
925  }
926 
927  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
928  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
930  getRangeMap () const {
931  return getCrsGraphRef ().getRangeMap ();
932  }
933 
934  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
935  Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
937  getGraph () const {
938  if (staticGraph_ != Teuchos::null) {
939  return staticGraph_;
940  }
941  return myGraph_;
942  }
943 
944  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
945  Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
947  getCrsGraph () const {
948  if (staticGraph_ != Teuchos::null) {
949  return staticGraph_;
950  }
951  return myGraph_;
952  }
953 
954  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
957  getCrsGraphRef () const
958  {
959 #ifdef HAVE_TPETRA_DEBUG
960  constexpr bool debug = true;
961 #else
962  constexpr bool debug = false;
963 #endif // HAVE_TPETRA_DEBUG
964 
965  if (! this->staticGraph_.is_null ()) {
966  return * (this->staticGraph_);
967  }
968  else {
969  if (debug) {
970  const char tfecfFuncName[] = "getCrsGraphRef: ";
971  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
972  (this->myGraph_.is_null (), std::logic_error,
973  "Both staticGraph_ and myGraph_ are null. "
974  "Please report this bug to the Tpetra developers.");
975  }
976  return * (this->myGraph_);
977  }
978  }
979 
980  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
981  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
984  {
985  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
986  return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
987  numCols,
988  valuesPacked_wdv.getDeviceView(Access::ReadWrite),
989  staticGraph_->getLocalGraphDevice());
990  }
991 
992  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
993  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
995  getLocalMatrixHost () const
996  {
997  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
998  return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
999  valuesPacked_wdv.getHostView(Access::ReadWrite),
1000  staticGraph_->getLocalGraphHost());
1001  }
1002 
1003 #if KOKKOSKERNELS_VERSION < 40299
1004 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1005  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1006  std::shared_ptr<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_multiply_op_type>
1009  {
1010  auto localMatrix = getLocalMatrixDevice();
1011 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL)
1012  if(this->getLocalNumEntries() <= size_t(Teuchos::OrdinalTraits<LocalOrdinal>::max()))
1013  {
1014  if(this->ordinalRowptrs.data() == nullptr)
1015  {
1016  auto originalRowptrs = localMatrix.graph.row_map;
1017  //create LocalOrdinal-typed copy of the local graph's rowptrs.
1018  //This enables the LocalCrsMatrixOperator to use cuSPARSE SpMV.
1019  this->ordinalRowptrs = ordinal_rowptrs_type(
1020  Kokkos::ViewAllocateWithoutInitializing("CrsMatrix::ordinalRowptrs"), originalRowptrs.extent(0));
1021  auto ordinalRowptrs_ = this->ordinalRowptrs; //don't want to capture 'this'
1022  Kokkos::parallel_for("CrsMatrix::getLocalMultiplyOperator::convertRowptrs",
1023  Kokkos::RangePolicy<execution_space>(0, originalRowptrs.extent(0)),
1024  KOKKOS_LAMBDA(LocalOrdinal i)
1025  {
1026  ordinalRowptrs_(i) = originalRowptrs(i);
1027  });
1028  }
1029  //return local operator using ordinalRowptrs
1030  return std::make_shared<local_multiply_op_type>(
1031  std::make_shared<local_matrix_device_type>(localMatrix), this->ordinalRowptrs);
1032  }
1033 #endif
1034 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1035  return std::make_shared<local_multiply_op_type>(
1036  std::make_shared<local_matrix_device_type>(localMatrix));
1037  }
1038 #endif
1039 
1040  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1041  bool
1043  isStaticGraph () const {
1044  return myGraph_.is_null ();
1045  }
1046 
1047  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1048  bool
1051  return true;
1052  }
1053 
1054  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1055  bool
1058  return true;
1059  }
1060 
1061  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1062  void
1064  allocateValues (ELocalGlobal lg, GraphAllocationStatus gas,
1065  const bool verbose)
1066  {
1067  using Details::Behavior;
1069  using std::endl;
1070  const char tfecfFuncName[] = "allocateValues: ";
1071  const char suffix[] =
1072  " Please report this bug to the Tpetra developers.";
1073  ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1074 
1075  std::unique_ptr<std::string> prefix;
1076  if (verbose) {
1077  prefix = this->createPrefix("CrsMatrix", "allocateValues");
1078  std::ostringstream os;
1079  os << *prefix << "lg: "
1080  << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1081  << ", gas: Graph"
1082  << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1083  << "Allocated" << endl;
1084  std::cerr << os.str();
1085  }
1086 
1087  const bool debug = Behavior::debug("CrsMatrix");
1088  if (debug) {
1089  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1090  (this->staticGraph_.is_null (), std::logic_error,
1091  "staticGraph_ is null." << suffix);
1092 
1093  // If the graph indices are already allocated, then gas should be
1094  // GraphAlreadyAllocated. Otherwise, gas should be
1095  // GraphNotYetAllocated.
1096  if ((gas == GraphAlreadyAllocated) !=
1097  staticGraph_->indicesAreAllocated ()) {
1098  const char err1[] = "The caller has asserted that the graph "
1099  "is ";
1100  const char err2[] = "already allocated, but the static graph "
1101  "says that its indices are ";
1102  const char err3[] = "already allocated. ";
1103  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1104  (gas == GraphAlreadyAllocated &&
1105  ! staticGraph_->indicesAreAllocated (), std::logic_error,
1106  err1 << err2 << "not " << err3 << suffix);
1107  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1108  (gas != GraphAlreadyAllocated &&
1109  staticGraph_->indicesAreAllocated (), std::logic_error,
1110  err1 << "not " << err2 << err3 << suffix);
1111  }
1112 
1113  // If the graph is unallocated, then it had better be a
1114  // matrix-owned graph. ("Matrix-owned graph" means that the
1115  // matrix gets to define the graph structure. If the CrsMatrix
1116  // constructor that takes an RCP<const CrsGraph> was used, then
1117  // the matrix does _not_ own the graph.)
1118  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1119  (! this->staticGraph_->indicesAreAllocated () &&
1120  this->myGraph_.is_null (), std::logic_error,
1121  "The static graph says that its indices are not allocated, "
1122  "but the graph is not owned by the matrix." << suffix);
1123  }
1124 
1125  if (gas == GraphNotYetAllocated) {
1126  if (debug) {
1127  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1128  (this->myGraph_.is_null (), std::logic_error,
1129  "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1130  }
1131  try {
1132  this->myGraph_->allocateIndices (lg, verbose);
1133  }
1134  catch (std::exception& e) {
1135  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1136  (true, std::runtime_error, "CrsGraph::allocateIndices "
1137  "threw an exception: " << e.what ());
1138  }
1139  catch (...) {
1140  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1141  (true, std::runtime_error, "CrsGraph::allocateIndices "
1142  "threw an exception not a subclass of std::exception.");
1143  }
1144  }
1145 
1146  // Allocate matrix values.
1147  const size_t lclTotalNumEntries = this->staticGraph_->getLocalAllocationSize();
1148  if (debug) {
1149  const size_t lclNumRows = this->staticGraph_->getLocalNumRows ();
1150  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1151  (this->staticGraph_->getRowPtrsUnpackedHost()(lclNumRows) != lclTotalNumEntries, std::logic_error,
1152  "length of staticGraph's lclIndsUnpacked does not match final entry of rowPtrsUnapcked_host." << suffix);
1153  }
1154 
1155  // Allocate array of (packed???) matrix values.
1156  using values_type = typename local_matrix_device_type::values_type;
1157  if (verbose) {
1158  std::ostringstream os;
1159  os << *prefix << "Allocate values_wdv: Pre "
1160  << valuesUnpacked_wdv.extent(0) << ", post "
1161  << lclTotalNumEntries << endl;
1162  std::cerr << os.str();
1163  }
1164 // this->k_values1D_ =
1165  valuesUnpacked_wdv = values_wdv_type(
1166  values_type("Tpetra::CrsMatrix::values",
1167  lclTotalNumEntries));
1168  }
1169 
1170 
1171  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1172  void
1174  fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1175  {
1177  using ::Tpetra::Details::getEntryOnHost;
1178  using Teuchos::arcp_const_cast;
1179  using Teuchos::Array;
1180  using Teuchos::ArrayRCP;
1181  using Teuchos::null;
1182  using Teuchos::RCP;
1183  using Teuchos::rcp;
1184  using std::endl;
1185  using row_map_type = typename local_graph_device_type::row_map_type;
1186  using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1187  using values_type = typename local_matrix_device_type::values_type;
1188  Details::ProfilingRegion regionFLGAM
1189  ("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1190 
1191  const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1192  "fillComplete or expertStaticFillComplete): ";
1193  const char suffix[] =
1194  " Please report this bug to the Tpetra developers.";
1195  const bool debug = Details::Behavior::debug("CrsMatrix");
1196  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1197 
1198  std::unique_ptr<std::string> prefix;
1199  if (verbose) {
1200  prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1201  std::ostringstream os;
1202  os << *prefix << endl;
1203  std::cerr << os.str ();
1204  }
1205 
1206  if (debug) {
1207  // fillComplete() only calls fillLocalGraphAndMatrix() if the
1208  // matrix owns the graph, which means myGraph_ is not null.
1209  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1210  (myGraph_.is_null (), std::logic_error, "The nonconst graph "
1211  "(myGraph_) is null. This means that the matrix has a "
1212  "const (a.k.a. \"static\") graph. fillComplete or "
1213  "expertStaticFillComplete should never call "
1214  "fillLocalGraphAndMatrix in that case." << suffix);
1215  }
1216 
1217  const size_t lclNumRows = this->getLocalNumRows ();
1218 
1219  // This method's goal is to fill in the three arrays (compressed
1220  // sparse row format) that define the sparse graph's and matrix's
1221  // structure, and the sparse matrix's values.
1222  //
1223  // Get references to the data in myGraph_, so we can modify them
1224  // as well. Note that we only call fillLocalGraphAndMatrix() if
1225  // the matrix owns the graph, which means myGraph_ is not null.
1226 
1227  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1228  // See: https://github.com/trilinos/Trilinos/issues/12237
1229  //using row_entries_type = decltype (myGraph_->k_numRowEntries_);
1230  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1231 
1232  typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1233  myGraph_->rowPtrsUnpacked_dev_;
1234 
1235  if (debug) {
1236  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1237  (curRowOffsets.extent (0) == 0, std::logic_error,
1238  "curRowOffsets.extent(0) == 0.");
1239  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1240  (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1241  "curRowOffsets.extent(0) = "
1242  << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1243  << (lclNumRows + 1) << ".");
1244  const size_t numOffsets = curRowOffsets.extent (0);
1245  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1246  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1247  (numOffsets != 0 &&
1248  myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
1249  std::logic_error, "numOffsets = " <<
1250  numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1251  << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
1252  << numOffsets << ") = " << valToCheck << ".");
1253  }
1254 
1255  if (myGraph_->getLocalNumEntries() !=
1256  myGraph_->getLocalAllocationSize()) {
1257 
1258  // Use the nonconst version of row_map_type for k_ptrs,
1259  // because row_map_type is const and we need to modify k_ptrs here.
1260  typename row_map_type::non_const_type k_ptrs;
1261  row_map_type k_ptrs_const;
1262  lclinds_1d_type k_inds;
1263  values_type k_vals;
1264 
1265  if (verbose) {
1266  std::ostringstream os;
1267  const auto numEnt = myGraph_->getLocalNumEntries();
1268  const auto allocSize = myGraph_->getLocalAllocationSize();
1269  os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1270  << ", allocSize=" << allocSize << endl;
1271  std::cerr << os.str ();
1272  }
1273  // The matrix's current 1-D storage is "unpacked." This means
1274  // the row offsets may differ from what the final row offsets
1275  // should be. This could happen, for example, if the user
1276  // set an upper
1277  // bound on the number of entries per row, but didn't fill all
1278  // those entries.
1279  if (debug && curRowOffsets.extent (0) != 0) {
1280  const size_t numOffsets =
1281  static_cast<size_t> (curRowOffsets.extent (0));
1282  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1283  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1284  (static_cast<size_t> (valToCheck) !=
1285  static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
1286  std::logic_error, "(unpacked branch) Before "
1287  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1288  << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1289  " = " << valuesUnpacked_wdv.extent (0) << ".");
1290  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1291  (static_cast<size_t> (valToCheck) !=
1292  static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
1293  std::logic_error, "(unpacked branch) Before "
1294  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1295  << ") = " << valToCheck
1296  << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1297  << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
1298  }
1299  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1300  // the array of valid entry counts per row.
1301 
1302  // Total number of entries in the matrix on the calling
1303  // process. We will compute this in the loop below. It's
1304  // cheap to compute and useful as a sanity check.
1305  size_t lclTotalNumEntries = 0;
1306  {
1307  // Allocate the packed row offsets array. We use a nonconst
1308  // temporary (packedRowOffsets) here, because k_ptrs is
1309  // const. We will assign packedRowOffsets to k_ptrs below.
1310  if (verbose) {
1311  std::ostringstream os;
1312  os << *prefix << "Allocate packed row offsets: "
1313  << (lclNumRows+1) << endl;
1314  std::cerr << os.str ();
1315  }
1316  typename row_map_type::non_const_type
1317  packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1318  typename row_entries_type::const_type numRowEnt_h =
1319  myGraph_->k_numRowEntries_;
1320  // We're computing offsets on device. This function can
1321  // handle numRowEnt_h being a host View.
1322  lclTotalNumEntries =
1323  computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1324  // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1325  // to use packedRowOffsets in the loop above and assign here.
1326  k_ptrs = packedRowOffsets;
1327  k_ptrs_const = k_ptrs;
1328  }
1329 
1330  if (debug) {
1331  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1332  (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1333  std::logic_error,
1334  "(unpacked branch) After packing k_ptrs, "
1335  "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1336  "lclNumRows+1 = " << (lclNumRows+1) << ".");
1337  const auto valToCheck = getEntryOnHost (k_ptrs, lclNumRows);
1338  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1339  (valToCheck != lclTotalNumEntries, std::logic_error,
1340  "(unpacked branch) After filling k_ptrs, "
1341  "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1342  << " != total number of entries on the calling process = "
1343  << lclTotalNumEntries << ".");
1344  }
1345 
1346  // Allocate the arrays of packed column indices and values.
1347  if (verbose) {
1348  std::ostringstream os;
1349  os << *prefix << "Allocate packed local column indices: "
1350  << lclTotalNumEntries << endl;
1351  std::cerr << os.str ();
1352  }
1353  k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1354  if (verbose) {
1355  std::ostringstream os;
1356  os << *prefix << "Allocate packed values: "
1357  << lclTotalNumEntries << endl;
1358  std::cerr << os.str ();
1359  }
1360  k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1361 
1362  // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1363  // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1364  // the packed row offsets array k_ptrs that we created above.
1365  //
1366  // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1367  // need to keep around the unpacked row offsets, column
1368  // indices, and values arrays.
1369 
1370  // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1371  // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1372  using inds_packer_type = pack_functor<
1373  typename Graph::local_graph_device_type::entries_type::non_const_type,
1374  typename Graph::local_inds_dualv_type::t_dev::const_type,
1375  typename Graph::local_graph_device_type::row_map_type::non_const_type,
1376  typename Graph::local_graph_device_type::row_map_type>;
1377  inds_packer_type indsPacker (
1378  k_inds,
1379  myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1380  k_ptrs, curRowOffsets);
1381  using exec_space = typename decltype (k_inds)::execution_space;
1382  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1383  Kokkos::parallel_for
1384  ("Tpetra::CrsMatrix pack column indices",
1385  range_type (0, lclNumRows), indsPacker);
1386 
1387  // Pack the values from unpacked valuesUnpacked_wdv into packed
1388  // k_vals. We will replace valuesPacked_wdv below.
1389  using vals_packer_type = pack_functor<
1390  typename values_type::non_const_type,
1391  typename values_type::const_type,
1392  typename row_map_type::non_const_type,
1393  typename row_map_type::const_type>;
1394  vals_packer_type valsPacker (
1395  k_vals,
1396  this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1397  k_ptrs, curRowOffsets);
1398  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1399  range_type (0, lclNumRows), valsPacker);
1400 
1401  if (debug) {
1402  const char myPrefix[] = "(\"Optimize Storage\""
1403  "=true branch) After packing, ";
1404  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1405  (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
1406  << "k_ptrs.extent(0) = 0. This probably means that "
1407  "rowPtrsUnpacked_ was never allocated.");
1408  if (k_ptrs.extent (0) != 0) {
1409  const size_t numOffsets (k_ptrs.extent (0));
1410  const auto valToCheck =
1411  getEntryOnHost (k_ptrs, numOffsets - 1);
1412  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1413  (size_t (valToCheck) != k_vals.extent (0),
1414  std::logic_error, myPrefix <<
1415  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1416  " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1417  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1418  (size_t (valToCheck) != k_inds.extent (0),
1419  std::logic_error, myPrefix <<
1420  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1421  " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1422  }
1423  }
1424  // Build the local graph.
1425  myGraph_->setRowPtrsPacked(k_ptrs_const);
1426  myGraph_->lclIndsPacked_wdv =
1427  typename crs_graph_type::local_inds_wdv_type(k_inds);
1428  valuesPacked_wdv = values_wdv_type(k_vals);
1429  }
1430  else { // We don't have to pack, so just set the pointers.
1431  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1432  // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
1433  myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1434  myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1435  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1436  myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1437  valuesPacked_wdv = valuesUnpacked_wdv;
1438 
1439  if (verbose) {
1440  std::ostringstream os;
1441  os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1442  << myGraph_->getRowPtrsUnpackedHost().extent(0) << ", lclIndsUnpacked_wdv: "
1443  << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1444  << valuesUnpacked_wdv.extent(0) << endl;
1445  std::cerr << os.str();
1446  }
1447 
1448  if (debug) {
1449  const char myPrefix[] =
1450  "(\"Optimize Storage\"=false branch) ";
1451  auto rowPtrsUnpackedHost = myGraph_->getRowPtrsUnpackedHost();
1452  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1453  (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
1454  << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1455  "that rowPtrsUnpacked_ was never allocated.");
1456  if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
1457  const size_t numOffsets = rowPtrsUnpackedHost.extent (0);
1458  const auto valToCheck = rowPtrsUnpackedHost(numOffsets - 1);
1459  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1460  (size_t (valToCheck) != valuesPacked_wdv.extent (0),
1461  std::logic_error, myPrefix <<
1462  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1463  << " != valuesPacked_wdv.extent(0) = "
1464  << valuesPacked_wdv.extent (0) << ".");
1465  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1466  (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
1467  std::logic_error, myPrefix <<
1468  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1469  << " != myGraph_->lclIndsPacked.extent(0) = "
1470  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1471  }
1472  }
1473  }
1474 
1475  if (debug) {
1476  const char myPrefix[] = "After packing, ";
1477  auto rowPtrsPackedHost = myGraph_->getRowPtrsPackedHost();
1478  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1479  (size_t (rowPtrsPackedHost.extent (0)) != size_t (lclNumRows + 1),
1480  std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = "
1481  << rowPtrsPackedHost.extent (0) << " != lclNumRows+1 = " <<
1482  (lclNumRows+1) << ".");
1483  if (rowPtrsPackedHost.extent (0) != 0) {
1484  const size_t numOffsets (rowPtrsPackedHost.extent (0));
1485  const size_t valToCheck = rowPtrsPackedHost(numOffsets-1);
1486  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1487  (valToCheck != size_t (valuesPacked_wdv.extent (0)),
1488  std::logic_error, myPrefix << "k_ptrs_const(" <<
1489  (numOffsets-1) << ") = " << valToCheck
1490  << " != valuesPacked_wdv.extent(0) = "
1491  << valuesPacked_wdv.extent (0) << ".");
1492  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1493  (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
1494  std::logic_error, myPrefix << "k_ptrs_const(" <<
1495  (numOffsets-1) << ") = " << valToCheck
1496  << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = "
1497  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1498  }
1499  }
1500 
1501  // May we ditch the old allocations for the packed (and otherwise
1502  // "optimized") allocations, later in this routine? Optimize
1503  // storage if the graph is not static, or if the graph already has
1504  // optimized storage.
1505  const bool defaultOptStorage =
1506  ! isStaticGraph () || staticGraph_->isStorageOptimized ();
1507  const bool requestOptimizedStorage =
1508  (! params.is_null () &&
1509  params->get ("Optimize Storage", defaultOptStorage)) ||
1510  (params.is_null () && defaultOptStorage);
1511 
1512  // The graph has optimized storage when indices are allocated,
1513  // myGraph_->k_numRowEntries_ is empty, and there are more than
1514  // zero rows on this process.
1515  if (requestOptimizedStorage) {
1516  // Free the old, unpacked, unoptimized allocations.
1517  // Free graph data structures that are only needed for
1518  // unpacked 1-D storage.
1519  if (verbose) {
1520  std::ostringstream os;
1521  os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1522  << myGraph_->k_numRowEntries_.extent(0) << endl;
1523  std::cerr << os.str();
1524  }
1525 
1526  myGraph_->k_numRowEntries_ = row_entries_type ();
1527 
1528  // Keep the new 1-D packed allocations.
1529  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1530  // We directly set the memory spaces to avoid a memcpy from device to host
1531  myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1532  myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1533  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1534  myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1535  valuesUnpacked_wdv = valuesPacked_wdv;
1536 
1537  myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1538  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1539  }
1540  else {
1541  if (verbose) {
1542  std::ostringstream os;
1543  os << *prefix << "User requested NOT to optimize storage"
1544  << endl;
1545  std::cerr << os.str();
1546  }
1547  }
1548  }
1549 
1550  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1551  void
1553  fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1554  {
1555  using ::Tpetra::Details::ProfilingRegion;
1556  using Teuchos::ArrayRCP;
1557  using Teuchos::Array;
1558  using Teuchos::null;
1559  using Teuchos::RCP;
1560  using Teuchos::rcp;
1561  using std::endl;
1562  using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1563  using non_const_row_map_type = typename row_map_type::non_const_type;
1564  using values_type = typename local_matrix_device_type::values_type;
1565  ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1566  const size_t lclNumRows = getLocalNumRows();
1567 
1568  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1569  std::unique_ptr<std::string> prefix;
1570  if (verbose) {
1571  prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1572  std::ostringstream os;
1573  os << *prefix << "lclNumRows: " << lclNumRows << endl;
1574  std::cerr << os.str ();
1575  }
1576 
1577  // The goals of this routine are first, to allocate and fill
1578  // packed 1-D storage (see below for an explanation) in the vals
1579  // array, and second, to give vals to the local matrix and
1580  // finalize the local matrix. We only need k_ptrs, the packed 1-D
1581  // row offsets, within the scope of this routine, since we're only
1582  // filling the local matrix here (use fillLocalGraphAndMatrix() to
1583  // fill both the graph and the matrix at the same time).
1584 
1585  // get data from staticGraph_
1586  size_t nodeNumEntries = staticGraph_->getLocalNumEntries ();
1587  size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize ();
1588  row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1589 
1590  row_map_type k_ptrs; // "packed" row offsets array
1591  values_type k_vals; // "packed" values array
1592 
1593  // May we ditch the old allocations for the packed (and otherwise
1594  // "optimized") allocations, later in this routine? Request
1595  // optimized storage by default.
1596  bool requestOptimizedStorage = true;
1597  const bool default_OptimizeStorage =
1598  ! isStaticGraph() || staticGraph_->isStorageOptimized();
1599  if (! params.is_null() &&
1600  ! params->get("Optimize Storage", default_OptimizeStorage)) {
1601  requestOptimizedStorage = false;
1602  }
1603  // If we're not allowed to change a static graph, then we can't
1604  // change the storage of the matrix, either. This means that if
1605  // the graph's storage isn't already optimized, we can't optimize
1606  // the matrix's storage either. Check and give warning, as
1607  // appropriate.
1608  if (! staticGraph_->isStorageOptimized () &&
1609  requestOptimizedStorage) {
1611  (true, std::runtime_error, "You requested optimized storage "
1612  "by setting the \"Optimize Storage\" flag to \"true\" in "
1613  "the ParameterList, or by virtue of default behavior. "
1614  "However, the associated CrsGraph was filled separately and "
1615  "requested not to optimize storage. Therefore, the "
1616  "CrsMatrix cannot optimize storage.");
1617  requestOptimizedStorage = false;
1618  }
1619 
1620  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1621  // See: https://github.com/trilinos/Trilinos/issues/12237
1622  //using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1623  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1624 
1625  // The matrix's values are currently
1626  // stored in a 1-D format. However, this format is "unpacked";
1627  // it doesn't necessarily have the same row offsets as indicated
1628  // by the ptrs array returned by allocRowPtrs. This could
1629  // happen, for example, if the user
1630  // fixed the number of matrix entries in
1631  // each row, but didn't fill all those entries.
1632  //
1633  // As above, we don't need to keep the "packed" row offsets
1634  // array ptrs here, but we do need it here temporarily, so we
1635  // have to allocate it. We'll free ptrs later in this method.
1636  //
1637  // Note that this routine checks whether storage has already
1638  // been packed. This is a common case for solution of nonlinear
1639  // PDEs using the finite element method, as long as the
1640  // structure of the sparse matrix does not change between linear
1641  // solves.
1642  if (nodeNumEntries != nodeNumAllocated) {
1643  if (verbose) {
1644  std::ostringstream os;
1645  os << *prefix << "Unpacked 1-D storage: numEnt="
1646  << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1647  << endl;
1648  std::cerr << os.str();
1649  }
1650  // We have to pack the 1-D storage, since the user didn't fill
1651  // up all requested storage.
1652  if (verbose) {
1653  std::ostringstream os;
1654  os << *prefix << "Allocate packed row offsets: "
1655  << (lclNumRows+1) << endl;
1656  std::cerr << os.str();
1657  }
1658  non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1659  lclNumRows+1);
1660  // Total number of entries in the matrix on the calling
1661  // process. We will compute this in the loop below. It's
1662  // cheap to compute and useful as a sanity check.
1663  size_t lclTotalNumEntries = 0;
1664  k_ptrs = tmpk_ptrs;
1665  {
1666  typename row_entries_type::const_type numRowEnt_h =
1667  staticGraph_->k_numRowEntries_;
1668  // This function can handle the counts being a host View.
1669  lclTotalNumEntries =
1670  Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
1671  }
1672 
1673  // Allocate the "packed" values array.
1674  // It has exactly the right number of entries.
1675  if (verbose) {
1676  std::ostringstream os;
1677  os << *prefix << "Allocate packed values: "
1678  << lclTotalNumEntries << endl;
1679  std::cerr << os.str ();
1680  }
1681  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1682 
1683  // Pack values_wdv into k_vals. We will replace values_wdv below.
1684  pack_functor<
1685  typename values_type::non_const_type,
1686  typename values_type::const_type,
1687  typename row_map_type::non_const_type,
1688  typename row_map_type::const_type> valsPacker
1689  (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1690  tmpk_ptrs, k_rowPtrs);
1691 
1692  using exec_space = typename decltype (k_vals)::execution_space;
1693  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1694  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1695  range_type (0, lclNumRows), valsPacker);
1696  valuesPacked_wdv = values_wdv_type(k_vals);
1697  }
1698  else { // We don't have to pack, so just set the pointer.
1699  valuesPacked_wdv = valuesUnpacked_wdv;
1700  if (verbose) {
1701  std::ostringstream os;
1702  os << *prefix << "Storage already packed: "
1703  << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1704  std::cerr << os.str();
1705  }
1706  }
1707 
1708  // May we ditch the old allocations for the packed one?
1709  if (requestOptimizedStorage) {
1710  // The user requested optimized storage, so we can dump the
1711  // unpacked 1-D storage, and keep the packed storage.
1712  valuesUnpacked_wdv = valuesPacked_wdv;
1713 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1714  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1715  }
1716  }
1717 
1718  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1719  void
1721  insertIndicesAndValues (crs_graph_type& graph,
1722  RowInfo& rowInfo,
1723  const typename crs_graph_type::SLocalGlobalViews& newInds,
1724  const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1725  const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1726  const ELocalGlobal lg,
1727  const ELocalGlobal I)
1728  {
1729  const size_t oldNumEnt = rowInfo.numEntries;
1730  const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
1731 
1732  // Use of memcpy here works around an issue with GCC >= 4.9.0,
1733  // that probably relates to scalar_type vs. impl_scalar_type
1734  // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1735  // details; look for GCC_WORKAROUND macro definition.
1736  if (numInserted > 0) {
1737  const size_t startOffset = oldNumEnt;
1738  memcpy ((void*) &oldRowVals[startOffset], &newRowVals[0],
1739  numInserted * sizeof (impl_scalar_type));
1740  }
1741  }
1742 
1743  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1744  void
1746  insertLocalValues (const LocalOrdinal lclRow,
1747  const Teuchos::ArrayView<const LocalOrdinal>& indices,
1748  const Teuchos::ArrayView<const Scalar>& values,
1749  const CombineMode CM)
1750  {
1751  using std::endl;
1752  const char tfecfFuncName[] = "insertLocalValues: ";
1753 
1754  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1755  (! this->isFillActive (), std::runtime_error,
1756  "Fill is not active. After calling fillComplete, you must call "
1757  "resumeFill before you may insert entries into the matrix again.");
1758  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1759  (this->isStaticGraph (), std::runtime_error,
1760  "Cannot insert indices with static graph; use replaceLocalValues() "
1761  "instead.");
1762  // At this point, we know that myGraph_ is nonnull.
1763  crs_graph_type& graph = * (this->myGraph_);
1764  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1765  (graph.colMap_.is_null (), std::runtime_error,
1766  "Cannot insert local indices without a column map.");
1767  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1768  (graph.isGloballyIndexed (),
1769  std::runtime_error, "Graph indices are global; use "
1770  "insertGlobalValues().");
1771  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1772  (values.size () != indices.size (), std::runtime_error,
1773  "values.size() = " << values.size ()
1774  << " != indices.size() = " << indices.size () << ".");
1775  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1776  ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
1777  "Local row index " << lclRow << " does not belong to this process.");
1778 
1779  if (! graph.indicesAreAllocated ()) {
1780  // We only allocate values at most once per process, so it's OK
1781  // to check TPETRA_VERBOSE here.
1782  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1783  this->allocateValues (LocalIndices, GraphNotYetAllocated, verbose);
1784  }
1785 
1786 #ifdef HAVE_TPETRA_DEBUG
1787  const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
1788  // In a debug build, test whether any of the given column indices
1789  // are not in the column Map. Keep track of the invalid column
1790  // indices so we can tell the user about them.
1791  {
1792  using Teuchos::toString;
1793 
1794  const map_type& colMap = * (graph.colMap_);
1795  Teuchos::Array<LocalOrdinal> badColInds;
1796  bool allInColMap = true;
1797  for (size_t k = 0; k < numEntriesToAdd; ++k) {
1798  if (! colMap.isNodeLocalElement (indices[k])) {
1799  allInColMap = false;
1800  badColInds.push_back (indices[k]);
1801  }
1802  }
1803  if (! allInColMap) {
1804  std::ostringstream os;
1805  os << "You attempted to insert entries in owned row " << lclRow
1806  << ", at the following column indices: " << toString (indices)
1807  << "." << endl;
1808  os << "Of those, the following indices are not in the column Map on "
1809  "this process: " << toString (badColInds) << "." << endl << "Since "
1810  "the matrix has a column Map already, it is invalid to insert "
1811  "entries at those locations.";
1812  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1813  (true, std::invalid_argument, os.str ());
1814  }
1815  }
1816 #endif // HAVE_TPETRA_DEBUG
1817 
1818  RowInfo rowInfo = graph.getRowInfo (lclRow);
1819 
1820  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1821  if (CM == ADD) {
1822  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1823  valsView[offset] += values[k]; };
1824  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1825  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1826  } else if (CM == INSERT) {
1827  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1828  valsView[offset] = values[k]; };
1829  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1830  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1831  } else {
1832  std::ostringstream os;
1833  os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1834  << "but this has not been implemented." << endl;
1835  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1836  (true, std::invalid_argument, os.str ());
1837  }
1838  }
1839 
1840  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1841  void
1843  insertLocalValues (const LocalOrdinal localRow,
1844  const LocalOrdinal numEnt,
1845  const Scalar vals[],
1846  const LocalOrdinal cols[],
1847  const CombineMode CM)
1848  {
1849  Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
1850  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
1851  this->insertLocalValues (localRow, colsT, valsT, CM);
1852  }
1853 
1854  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1855  void
1858  RowInfo& rowInfo,
1859  const GlobalOrdinal gblColInds[],
1860  const impl_scalar_type vals[],
1861  const size_t numInputEnt)
1862  {
1863 #ifdef HAVE_TPETRA_DEBUG
1864  const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
1865  const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
1866  const size_t curNumEnt = rowInfo.numEntries;
1867 #endif // HAVE_TPETRA_DEBUG
1868 
1869  if (! graph.indicesAreAllocated ()) {
1870  // We only allocate values at most once per process, so it's OK
1871  // to check TPETRA_VERBOSE here.
1872  using ::Tpetra::Details::Behavior;
1873  const bool verbose = Behavior::verbose("CrsMatrix");
1874  this->allocateValues (GlobalIndices, GraphNotYetAllocated, verbose);
1875  // mfh 23 Jul 2017: allocateValues invalidates existing
1876  // getRowInfo results. Once we get rid of lazy graph
1877  // allocation, we'll be able to move the getRowInfo call outside
1878  // of this method.
1879  rowInfo = graph.getRowInfo (rowInfo.localRow);
1880  }
1881 
1882  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1883  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
1884  valsView[offset] += vals[k];
1885  };
1886  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1887 #ifdef HAVE_TPETRA_DEBUG
1888  //numInserted is only used inside the debug code below.
1889  auto numInserted =
1890 #endif
1891  graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1892 
1893 #ifdef HAVE_TPETRA_DEBUG
1894  size_t newNumEnt = curNumEnt + numInserted;
1895  const size_t chkNewNumEnt =
1896  graph.getNumEntriesInLocalRow (rowInfo.localRow);
1897  if (chkNewNumEnt != newNumEnt) {
1898  std::ostringstream os;
1899  os << std::endl << "newNumEnt = " << newNumEnt
1900  << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1901  << ") = " << chkNewNumEnt << "." << std::endl
1902  << "\torigNumEnt: " << origNumEnt << std::endl
1903  << "\tnumInputEnt: " << numInputEnt << std::endl
1904  << "\tgblColInds: [";
1905  for (size_t k = 0; k < numInputEnt; ++k) {
1906  os << gblColInds[k];
1907  if (k + size_t (1) < numInputEnt) {
1908  os << ",";
1909  }
1910  }
1911  os << "]" << std::endl
1912  << "\tvals: [";
1913  for (size_t k = 0; k < numInputEnt; ++k) {
1914  os << vals[k];
1915  if (k + size_t (1) < numInputEnt) {
1916  os << ",";
1917  }
1918  }
1919  os << "]" << std::endl;
1920 
1921  if (this->supportsRowViews ()) {
1922  values_host_view_type vals2;
1923  if (this->isGloballyIndexed ()) {
1924  global_inds_host_view_type gblColInds2;
1925  const GlobalOrdinal gblRow =
1926  graph.rowMap_->getGlobalElement (rowInfo.localRow);
1927  if (gblRow ==
1928  Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
1929  os << "Local row index " << rowInfo.localRow << " is invalid!"
1930  << std::endl;
1931  }
1932  else {
1933  bool getViewThrew = false;
1934  try {
1935  this->getGlobalRowView (gblRow, gblColInds2, vals2);
1936  }
1937  catch (std::exception& e) {
1938  getViewThrew = true;
1939  os << "getGlobalRowView threw exception:" << std::endl
1940  << e.what () << std::endl;
1941  }
1942  if (! getViewThrew) {
1943  os << "\tNew global column indices: ";
1944  for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1945  os << gblColInds2[jjj] << " ";
1946  os << std::endl;
1947  os << "\tNew values: ";
1948  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1949  os << vals2[jjj] << " ";
1950  os << std::endl;
1951  }
1952  }
1953  }
1954  else if (this->isLocallyIndexed ()) {
1955  local_inds_host_view_type lclColInds2;
1956  this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
1957  os << "\tNew local column indices: ";
1958  for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1959  os << lclColInds2[jjj] << " ";
1960  os << std::endl;
1961  os << "\tNew values: ";
1962  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1963  os << vals2[jjj] << " ";
1964  os << std::endl;
1965  }
1966  }
1967 
1968  os << "Please report this bug to the Tpetra developers.";
1969  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1970  (true, std::logic_error, os.str ());
1971  }
1972 #endif // HAVE_TPETRA_DEBUG
1973  }
1974 
1975  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1976  void
1978  insertGlobalValues (const GlobalOrdinal gblRow,
1979  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
1980  const Teuchos::ArrayView<const Scalar>& values)
1981  {
1982  using Teuchos::toString;
1983  using std::endl;
1984  typedef impl_scalar_type IST;
1985  typedef LocalOrdinal LO;
1986  typedef GlobalOrdinal GO;
1987  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
1988  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
1989  const char tfecfFuncName[] = "insertGlobalValues: ";
1990 
1991 #ifdef HAVE_TPETRA_DEBUG
1992  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1993  (values.size () != indices.size (), std::runtime_error,
1994  "values.size() = " << values.size () << " != indices.size() = "
1995  << indices.size () << ".");
1996 #endif // HAVE_TPETRA_DEBUG
1997 
1998  // getRowMap() is not thread safe, because it increments RCP's
1999  // reference count. getCrsGraphRef() is thread safe.
2000  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2001  const LO lclRow = rowMap.getLocalElement (gblRow);
2002 
2003  if (lclRow == OTLO::invalid ()) {
2004  // Input row is _not_ owned by the calling process.
2005  //
2006  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2007  // is not in the row Map, it doesn't matter whether or not the
2008  // graph is static; the data just get stashed for later use by
2009  // globalAssemble().
2010  this->insertNonownedGlobalValues (gblRow, indices, values);
2011  }
2012  else { // Input row _is_ owned by the calling process
2013  if (this->isStaticGraph ()) {
2014  // Uh oh! Not allowed to insert into owned rows in that case.
2015  const int myRank = rowMap.getComm ()->getRank ();
2016  const int numProcs = rowMap.getComm ()->getSize ();
2017  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2018  (true, std::runtime_error,
2019  "The matrix was constructed with a constant (\"static\") graph, "
2020  "yet the given global row index " << gblRow << " is in the row "
2021  "Map on the calling process (with rank " << myRank << ", of " <<
2022  numProcs << " process(es)). In this case, you may not insert "
2023  "new entries into rows owned by the calling process.");
2024  }
2025 
2026  crs_graph_type& graph = * (this->myGraph_);
2027  const IST* const inputVals =
2028  reinterpret_cast<const IST*> (values.getRawPtr ());
2029  const GO* const inputGblColInds = indices.getRawPtr ();
2030  const size_t numInputEnt = indices.size ();
2031  RowInfo rowInfo = graph.getRowInfo (lclRow);
2032 
2033  // If the matrix has a column Map, check at this point whether
2034  // the column indices belong to the column Map.
2035  //
2036  // FIXME (mfh 16 May 2013) We may want to consider deferring the
2037  // test to the CrsGraph method, since it may have to do this
2038  // anyway.
2039  if (! graph.colMap_.is_null ()) {
2040  const map_type& colMap = * (graph.colMap_);
2041  // In a debug build, keep track of the nonowned ("bad") column
2042  // indices, so that we can display them in the exception
2043  // message. In a release build, just ditch the loop early if
2044  // we encounter a nonowned column index.
2045 #ifdef HAVE_TPETRA_DEBUG
2046  Teuchos::Array<GO> badColInds;
2047 #endif // HAVE_TPETRA_DEBUG
2048  const size_type numEntriesToInsert = indices.size ();
2049  bool allInColMap = true;
2050  for (size_type k = 0; k < numEntriesToInsert; ++k) {
2051  if (! colMap.isNodeGlobalElement (indices[k])) {
2052  allInColMap = false;
2053 #ifdef HAVE_TPETRA_DEBUG
2054  badColInds.push_back (indices[k]);
2055 #else
2056  break;
2057 #endif // HAVE_TPETRA_DEBUG
2058  }
2059  }
2060  if (! allInColMap) {
2061  std::ostringstream os;
2062  os << "You attempted to insert entries in owned row " << gblRow
2063  << ", at the following column indices: " << toString (indices)
2064  << "." << endl;
2065 #ifdef HAVE_TPETRA_DEBUG
2066  os << "Of those, the following indices are not in the column Map "
2067  "on this process: " << toString (badColInds) << "." << endl
2068  << "Since the matrix has a column Map already, it is invalid "
2069  "to insert entries at those locations.";
2070 #else
2071  os << "At least one of those indices is not in the column Map "
2072  "on this process." << endl << "It is invalid to insert into "
2073  "columns not in the column Map on the process that owns the "
2074  "row.";
2075 #endif // HAVE_TPETRA_DEBUG
2076  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2077  (true, std::invalid_argument, os.str ());
2078  }
2079  }
2080 
2081  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2082  inputVals, numInputEnt);
2083  }
2084  }
2085 
2086 
2087  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2088  void
2090  insertGlobalValues (const GlobalOrdinal globalRow,
2091  const LocalOrdinal numEnt,
2092  const Scalar vals[],
2093  const GlobalOrdinal inds[])
2094  {
2095  Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2096  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2097  this->insertGlobalValues (globalRow, indsT, valsT);
2098  }
2099 
2100 
2101  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2102  void
2105  const GlobalOrdinal gblRow,
2106  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2107  const Teuchos::ArrayView<const Scalar>& values,
2108  const bool debug)
2109  {
2110  typedef impl_scalar_type IST;
2111  typedef LocalOrdinal LO;
2112  typedef GlobalOrdinal GO;
2113  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2114  const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2115 
2116  if (debug) {
2117  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2118  (values.size () != indices.size (), std::runtime_error,
2119  "values.size() = " << values.size () << " != indices.size() = "
2120  << indices.size () << ".");
2121  }
2122 
2123  // getRowMap() is not thread safe, because it increments RCP's
2124  // reference count. getCrsGraphRef() is thread safe.
2125  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2126  const LO lclRow = rowMap.getLocalElement (gblRow);
2127  if (lclRow == OTLO::invalid ()) {
2128  // Input row is _not_ owned by the calling process.
2129  //
2130  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2131  // is not in the row Map, it doesn't matter whether or not the
2132  // graph is static; the data just get stashed for later use by
2133  // globalAssemble().
2134  this->insertNonownedGlobalValues (gblRow, indices, values);
2135  }
2136  else { // Input row _is_ owned by the calling process
2137  if (this->isStaticGraph ()) {
2138  // Uh oh! Not allowed to insert into owned rows in that case.
2139  const int myRank = rowMap.getComm ()->getRank ();
2140  const int numProcs = rowMap.getComm ()->getSize ();
2141  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2142  (true, std::runtime_error,
2143  "The matrix was constructed with a constant (\"static\") graph, "
2144  "yet the given global row index " << gblRow << " is in the row "
2145  "Map on the calling process (with rank " << myRank << ", of " <<
2146  numProcs << " process(es)). In this case, you may not insert "
2147  "new entries into rows owned by the calling process.");
2148  }
2149 
2150  crs_graph_type& graph = * (this->myGraph_);
2151  const IST* const inputVals =
2152  reinterpret_cast<const IST*> (values.getRawPtr ());
2153  const GO* const inputGblColInds = indices.getRawPtr ();
2154  const size_t numInputEnt = indices.size ();
2155  RowInfo rowInfo = graph.getRowInfo (lclRow);
2156 
2157  if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2158  // This branch is similar in function to the following branch, but for
2159  // the special case that the target graph is locally indexed.
2160  // In this case, we cannot simply filter
2161  // out global indices that don't exist on the receiving process and
2162  // insert the remaining (global) indices, but we must convert them (the
2163  // remaining global indices) to local and call `insertLocalValues`.
2164  const map_type& colMap = * (graph.colMap_);
2165  size_t curOffset = 0;
2166  while (curOffset < numInputEnt) {
2167  // Find a sequence of input indices that are in the column Map on the
2168  // calling process. Doing a sequence at a time, instead of one at a
2169  // time, amortizes some overhead.
2170  Teuchos::Array<LO> lclIndices;
2171  size_t endOffset = curOffset;
2172  for ( ; endOffset < numInputEnt; ++endOffset) {
2173  auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2174  if (lclIndex != OTLO::invalid())
2175  lclIndices.push_back(lclIndex);
2176  else
2177  break;
2178  }
2179  // curOffset, endOffset: half-exclusive range of indices in the column
2180  // Map on the calling process. If endOffset == curOffset, the range is
2181  // empty.
2182  const LO numIndInSeq = (endOffset - curOffset);
2183  if (numIndInSeq != 0) {
2184  this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2185  }
2186  // Invariant before the increment line: Either endOffset ==
2187  // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2188  // on the calling process.
2189  if (debug) {
2190  const bool invariant = endOffset == numInputEnt ||
2191  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2192  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2193  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2194  }
2195  curOffset = endOffset + 1;
2196  }
2197  }
2198  else if (! graph.colMap_.is_null ()) { // We have a column Map.
2199  const map_type& colMap = * (graph.colMap_);
2200  size_t curOffset = 0;
2201  while (curOffset < numInputEnt) {
2202  // Find a sequence of input indices that are in the column
2203  // Map on the calling process. Doing a sequence at a time,
2204  // instead of one at a time, amortizes some overhead.
2205  size_t endOffset = curOffset;
2206  for ( ; endOffset < numInputEnt &&
2207  colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2208  ++endOffset)
2209  {}
2210  // curOffset, endOffset: half-exclusive range of indices in
2211  // the column Map on the calling process. If endOffset ==
2212  // curOffset, the range is empty.
2213  const LO numIndInSeq = (endOffset - curOffset);
2214  if (numIndInSeq != 0) {
2215  rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2216  this->insertGlobalValuesImpl (graph, rowInfo,
2217  inputGblColInds + curOffset,
2218  inputVals + curOffset,
2219  numIndInSeq);
2220  }
2221  // Invariant before the increment line: Either endOffset ==
2222  // numInputEnt, or inputGblColInds[endOffset] is not in the
2223  // column Map on the calling process.
2224  if (debug) {
2225  const bool invariant = endOffset == numInputEnt ||
2226  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2227  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2228  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2229  }
2230  curOffset = endOffset + 1;
2231  }
2232  }
2233  else { // we don't have a column Map.
2234  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2235  inputVals, numInputEnt);
2236  }
2237  }
2238  }
2239 
2240  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2241  void
2242  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2243  insertGlobalValuesFilteredChecked(
2244  const GlobalOrdinal gblRow,
2245  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2246  const Teuchos::ArrayView<const Scalar>& values,
2247  const char* const prefix,
2248  const bool debug,
2249  const bool verbose)
2250  {
2252  using std::endl;
2253 
2254  try {
2255  insertGlobalValuesFiltered(gblRow, indices, values, debug);
2256  }
2257  catch(std::exception& e) {
2258  std::ostringstream os;
2259  if (verbose) {
2260  const size_t maxNumToPrint =
2262  os << *prefix << ": insertGlobalValuesFiltered threw an "
2263  "exception: " << e.what() << endl
2264  << "Global row index: " << gblRow << endl;
2265  verbosePrintArray(os, indices, "Global column indices",
2266  maxNumToPrint);
2267  os << endl;
2268  verbosePrintArray(os, values, "Values", maxNumToPrint);
2269  os << endl;
2270  }
2271  else {
2272  os << ": insertGlobalValuesFiltered threw an exception: "
2273  << e.what();
2274  }
2275  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2276  }
2277  }
2278 
2279  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2280  LocalOrdinal
2283  const crs_graph_type& graph,
2284  const RowInfo& rowInfo,
2285  const LocalOrdinal inds[],
2286  const impl_scalar_type newVals[],
2287  const LocalOrdinal numElts)
2288  {
2289  typedef LocalOrdinal LO;
2290  typedef GlobalOrdinal GO;
2291  const bool sorted = graph.isSorted ();
2292 
2293  size_t hint = 0; // Guess for the current index k into rowVals
2294  LO numValid = 0; // number of valid local column indices
2295 
2296  if (graph.isLocallyIndexed ()) {
2297  // Get a view of the column indices in the row. This amortizes
2298  // the cost of getting the view over all the entries of inds.
2299  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2300 
2301  for (LO j = 0; j < numElts; ++j) {
2302  const LO lclColInd = inds[j];
2303  const size_t offset =
2304  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2305  lclColInd, hint, sorted);
2306  if (offset != rowInfo.numEntries) {
2307  rowVals[offset] = newVals[j];
2308  hint = offset + 1;
2309  ++numValid;
2310  }
2311  }
2312  }
2313  else if (graph.isGloballyIndexed ()) {
2314  if (graph.colMap_.is_null ()) {
2315  return Teuchos::OrdinalTraits<LO>::invalid ();
2316  }
2317  const map_type colMap = * (graph.colMap_);
2318 
2319  // Get a view of the column indices in the row. This amortizes
2320  // the cost of getting the view over all the entries of inds.
2321  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2322 
2323  for (LO j = 0; j < numElts; ++j) {
2324  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2325  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2326  const size_t offset =
2327  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2328  gblColInd, hint, sorted);
2329  if (offset != rowInfo.numEntries) {
2330  rowVals[offset] = newVals[j];
2331  hint = offset + 1;
2332  ++numValid;
2333  }
2334  }
2335  }
2336  }
2337  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2338  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2339  // to be neither locally nor globally indexed on a process.
2340  // This means that the graph or matrix has no entries on that
2341  // process. Epetra also works like this. It's related to lazy
2342  // allocation (on first insertion, not at graph / matrix
2343  // construction). Lazy allocation will go away because it is
2344  // not thread scalable.
2345 
2346  return numValid;
2347  }
2348 
2349  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2350  LocalOrdinal
2352  replaceLocalValues (const LocalOrdinal localRow,
2353  const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2354  const Teuchos::ArrayView<const Scalar>& vals)
2355  {
2356  typedef LocalOrdinal LO;
2357 
2358  const LO numInputEnt = static_cast<LO> (lclCols.size ());
2359  if (static_cast<LO> (vals.size ()) != numInputEnt) {
2360  return Teuchos::OrdinalTraits<LO>::invalid ();
2361  }
2362  const LO* const inputInds = lclCols.getRawPtr ();
2363  const Scalar* const inputVals = vals.getRawPtr ();
2364  return this->replaceLocalValues (localRow, numInputEnt,
2365  inputVals, inputInds);
2366  }
2367 
2368  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2370  local_ordinal_type
2373  const local_ordinal_type localRow,
2374  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2375  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2376  {
2377  using LO = local_ordinal_type;
2378  const LO numInputEnt = inputInds.extent(0);
2379  if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2380  return Teuchos::OrdinalTraits<LO>::invalid();
2381  }
2382  const Scalar* const inVals =
2383  reinterpret_cast<const Scalar*>(inputVals.data());
2384  return this->replaceLocalValues(localRow, numInputEnt,
2385  inVals, inputInds.data());
2386  }
2387 
2388  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2389  LocalOrdinal
2391  replaceLocalValues (const LocalOrdinal localRow,
2392  const LocalOrdinal numEnt,
2393  const Scalar inputVals[],
2394  const LocalOrdinal inputCols[])
2395  {
2396  typedef impl_scalar_type IST;
2397  typedef LocalOrdinal LO;
2398 
2399  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2400  // Fill must be active and the "nonconst" graph must exist.
2401  return Teuchos::OrdinalTraits<LO>::invalid ();
2402  }
2403  const crs_graph_type& graph = * (this->staticGraph_);
2404  const RowInfo rowInfo = graph.getRowInfo (localRow);
2405 
2406  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2407  // The calling process does not own this row, so it is not
2408  // allowed to modify its values.
2409  return static_cast<LO> (0);
2410  }
2411  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2412  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2413  return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2414  inputCols, inVals, numEnt);
2415  }
2416 
2417  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2418  LocalOrdinal
2421  const crs_graph_type& graph,
2422  const RowInfo& rowInfo,
2423  const GlobalOrdinal inds[],
2424  const impl_scalar_type newVals[],
2425  const LocalOrdinal numElts)
2426  {
2427  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2428  auto fun =
2429  [&](size_t const k, size_t const /*start*/, size_t const offset) {
2430  rowVals[offset] = newVals[k];
2431  };
2432  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2433  return graph.findGlobalIndices(rowInfo, indsT, cb);
2434  }
2435 
2436  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2437  LocalOrdinal
2439  replaceGlobalValues (const GlobalOrdinal globalRow,
2440  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2441  const Teuchos::ArrayView<const Scalar>& inputVals)
2442  {
2443  typedef LocalOrdinal LO;
2444 
2445  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2446  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2447  return Teuchos::OrdinalTraits<LO>::invalid ();
2448  }
2449  return this->replaceGlobalValues (globalRow, numInputEnt,
2450  inputVals.getRawPtr (),
2451  inputGblColInds.getRawPtr ());
2452  }
2453 
2454  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2455  LocalOrdinal
2457  replaceGlobalValues (const GlobalOrdinal globalRow,
2458  const LocalOrdinal numEnt,
2459  const Scalar inputVals[],
2460  const GlobalOrdinal inputGblColInds[])
2461  {
2462  typedef impl_scalar_type IST;
2463  typedef LocalOrdinal LO;
2464 
2465  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2466  // Fill must be active and the "nonconst" graph must exist.
2467  return Teuchos::OrdinalTraits<LO>::invalid ();
2468  }
2469  const crs_graph_type& graph = * (this->staticGraph_);
2470 
2471  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2472  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2473  // The input local row is invalid on the calling process,
2474  // which means that the calling process summed 0 entries.
2475  return static_cast<LO> (0);
2476  }
2477 
2478  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2479  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2480  return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2481  inputGblColInds, inVals, numEnt);
2482  }
2483 
2484  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2486  local_ordinal_type
2489  const global_ordinal_type globalRow,
2490  const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2491  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2492  {
2493  // We use static_assert here to check the template parameters,
2494  // rather than std::enable_if (e.g., on the return value, to
2495  // enable compilation only if the template parameters match the
2496  // desired attributes). This turns obscure link errors into
2497  // clear compilation errors. It also makes the return value a
2498  // lot easier to see.
2499  using LO = local_ordinal_type;
2500  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2501  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2502  return Teuchos::OrdinalTraits<LO>::invalid();
2503  }
2504  const Scalar* const inVals =
2505  reinterpret_cast<const Scalar*>(inputVals.data());
2506  return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2507  inputInds.data());
2508  }
2509 
2510  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2511  LocalOrdinal
2514  const crs_graph_type& graph,
2515  const RowInfo& rowInfo,
2516  const GlobalOrdinal inds[],
2517  const impl_scalar_type newVals[],
2518  const LocalOrdinal numElts,
2519  const bool atomic)
2520  {
2521  typedef LocalOrdinal LO;
2522  typedef GlobalOrdinal GO;
2523 
2524  const bool sorted = graph.isSorted ();
2525 
2526  size_t hint = 0; // guess at the index's relative offset in the row
2527  LO numValid = 0; // number of valid input column indices
2528 
2529  if (graph.isLocallyIndexed ()) {
2530  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2531  // pointer does NOT change its reference count. Thus, this
2532  // code is still thread safe.
2533  if (graph.colMap_.is_null ()) {
2534  // NO input column indices are valid in this case, since if
2535  // the column Map is null on the calling process, then the
2536  // calling process owns no graph entries.
2537  return numValid;
2538  }
2539  const map_type& colMap = * (graph.colMap_);
2540 
2541  // Get a view of the column indices in the row. This amortizes
2542  // the cost of getting the view over all the entries of inds.
2543  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2544  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2545 
2546  for (LO j = 0; j < numElts; ++j) {
2547  const LO lclColInd = colMap.getLocalElement (inds[j]);
2548  if (lclColInd != LINV) {
2549  const size_t offset =
2550  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2551  lclColInd, hint, sorted);
2552  if (offset != rowInfo.numEntries) {
2553  if (atomic) {
2554  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2555  }
2556  else {
2557  rowVals[offset] += newVals[j];
2558  }
2559  hint = offset + 1;
2560  numValid++;
2561  }
2562  }
2563  }
2564  }
2565  else if (graph.isGloballyIndexed ()) {
2566  // Get a view of the column indices in the row. This amortizes
2567  // the cost of getting the view over all the entries of inds.
2568  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2569 
2570  for (LO j = 0; j < numElts; ++j) {
2571  const GO gblColInd = inds[j];
2572  const size_t offset =
2573  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2574  gblColInd, hint, sorted);
2575  if (offset != rowInfo.numEntries) {
2576  if (atomic) {
2577  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2578  }
2579  else {
2580  rowVals[offset] += newVals[j];
2581  }
2582  hint = offset + 1;
2583  numValid++;
2584  }
2585  }
2586  }
2587  // If the graph is neither locally nor globally indexed on the
2588  // calling process, that means the calling process has no graph
2589  // entries. Thus, none of the input column indices are valid.
2590 
2591  return numValid;
2592  }
2593 
2594  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2595  LocalOrdinal
2597  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2598  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2599  const Teuchos::ArrayView<const Scalar>& inputVals,
2600  const bool atomic)
2601  {
2602  typedef LocalOrdinal LO;
2603 
2604  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2605  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2606  return Teuchos::OrdinalTraits<LO>::invalid ();
2607  }
2608  return this->sumIntoGlobalValues (gblRow, numInputEnt,
2609  inputVals.getRawPtr (),
2610  inputGblColInds.getRawPtr (),
2611  atomic);
2612  }
2613 
2614  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2615  LocalOrdinal
2617  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2618  const LocalOrdinal numInputEnt,
2619  const Scalar inputVals[],
2620  const GlobalOrdinal inputGblColInds[],
2621  const bool atomic)
2622  {
2623  typedef impl_scalar_type IST;
2624  typedef LocalOrdinal LO;
2625  typedef GlobalOrdinal GO;
2626 
2627  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2628  // Fill must be active and the "nonconst" graph must exist.
2629  return Teuchos::OrdinalTraits<LO>::invalid ();
2630  }
2631  const crs_graph_type& graph = * (this->staticGraph_);
2632 
2633  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2634  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2635  // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be not
2636  // thread safe in a debug build, in part because it uses
2637  // Teuchos::ArrayView, and in part because of the data structure
2638  // used to stash outgoing entries.
2639  using Teuchos::ArrayView;
2640  ArrayView<const GO> inputGblColInds_av(
2641  numInputEnt == 0 ? nullptr : inputGblColInds,
2642  numInputEnt);
2643  ArrayView<const Scalar> inputVals_av(
2644  numInputEnt == 0 ? nullptr :
2645  inputVals, numInputEnt);
2646  // gblRow is not in the row Map on the calling process, so stash
2647  // the given entries away in a separate data structure.
2648  // globalAssemble() (called during fillComplete()) will exchange
2649  // that data and sum it in using sumIntoGlobalValues().
2650  this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
2651  inputVals_av);
2652  // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2653  // since we won't know whether the given indices were valid
2654  // until globalAssemble (called in fillComplete) is called.
2655  // That's why insertNonownedGlobalValues doesn't return
2656  // anything. Just for consistency, I'll return the number of
2657  // entries that the user gave us.
2658  return numInputEnt;
2659  }
2660  else { // input row is in the row Map on the calling process
2661  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2662  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2663  return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2664  inputGblColInds, inVals,
2665  numInputEnt, atomic);
2666  }
2667  }
2668 
2669  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2670  LocalOrdinal
2672  transformLocalValues (const LocalOrdinal lclRow,
2673  const LocalOrdinal numInputEnt,
2674  const impl_scalar_type inputVals[],
2675  const LocalOrdinal inputCols[],
2676  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2677  const bool atomic)
2678  {
2679  using Tpetra::Details::OrdinalTraits;
2680  typedef LocalOrdinal LO;
2681 
2682  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2683  // Fill must be active and the "nonconst" graph must exist.
2684  return Teuchos::OrdinalTraits<LO>::invalid ();
2685  }
2686  const crs_graph_type& graph = * (this->staticGraph_);
2687  const RowInfo rowInfo = graph.getRowInfo (lclRow);
2688 
2689  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2690  // The calling process does not own this row, so it is not
2691  // allowed to modify its values.
2692  return static_cast<LO> (0);
2693  }
2694  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2695  return this->transformLocalValues (curRowVals.data (), graph,
2696  rowInfo, inputCols, inputVals,
2697  numInputEnt, f, atomic);
2698  }
2699 
2700  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2701  LocalOrdinal
2702  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2703  transformGlobalValues (const GlobalOrdinal gblRow,
2704  const LocalOrdinal numInputEnt,
2705  const impl_scalar_type inputVals[],
2706  const GlobalOrdinal inputCols[],
2707  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2708  const bool atomic)
2709  {
2710  using Tpetra::Details::OrdinalTraits;
2711  typedef LocalOrdinal LO;
2712 
2713  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2714  // Fill must be active and the "nonconst" graph must exist.
2715  return OrdinalTraits<LO>::invalid ();
2716  }
2717  const crs_graph_type& graph = * (this->staticGraph_);
2718  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2719 
2720  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2721  // The calling process does not own this row, so it is not
2722  // allowed to modify its values.
2723  return static_cast<LO> (0);
2724  }
2725  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2726  return this->transformGlobalValues (curRowVals.data (), graph,
2727  rowInfo, inputCols, inputVals,
2728  numInputEnt, f, atomic);
2729  }
2730 
2731  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2732  LocalOrdinal
2733  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2734  transformLocalValues (impl_scalar_type rowVals[],
2735  const crs_graph_type& graph,
2736  const RowInfo& rowInfo,
2737  const LocalOrdinal inds[],
2738  const impl_scalar_type newVals[],
2739  const LocalOrdinal numElts,
2740  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2741  const bool atomic)
2742  {
2743  typedef impl_scalar_type ST;
2744  typedef LocalOrdinal LO;
2745  typedef GlobalOrdinal GO;
2746 
2747  //if (newVals.extent (0) != inds.extent (0)) {
2748  // The sizes of the input arrays must match.
2749  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2750  //}
2751  //const LO numElts = static_cast<LO> (inds.extent (0));
2752  const bool sorted = graph.isSorted ();
2753 
2754  LO numValid = 0; // number of valid input column indices
2755  size_t hint = 0; // Guess for the current index k into rowVals
2756 
2757  if (graph.isLocallyIndexed ()) {
2758  // Get a view of the column indices in the row. This amortizes
2759  // the cost of getting the view over all the entries of inds.
2760  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2761 
2762  for (LO j = 0; j < numElts; ++j) {
2763  const LO lclColInd = inds[j];
2764  const size_t offset =
2765  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2766  lclColInd, hint, sorted);
2767  if (offset != rowInfo.numEntries) {
2768  if (atomic) {
2769  // NOTE (mfh 30 Nov 2015) The commented-out code is
2770  // wrong because another thread may have changed
2771  // rowVals[offset] between those two lines of code.
2772  //
2773  //const ST newVal = f (rowVals[offset], newVals[j]);
2774  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2775 
2776  volatile ST* const dest = &rowVals[offset];
2777  (void) atomic_binary_function_update (dest, newVals[j], f);
2778  }
2779  else {
2780  // use binary function f
2781  rowVals[offset] = f (rowVals[offset], newVals[j]);
2782  }
2783  hint = offset + 1;
2784  ++numValid;
2785  }
2786  }
2787  }
2788  else if (graph.isGloballyIndexed ()) {
2789  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2790  // pointer does NOT change its reference count. Thus, this
2791  // code is still thread safe.
2792  if (graph.colMap_.is_null ()) {
2793  // NO input column indices are valid in this case. Either
2794  // the column Map hasn't been set yet (so local indices
2795  // don't exist yet), or the calling process owns no graph
2796  // entries.
2797  return numValid;
2798  }
2799  const map_type& colMap = * (graph.colMap_);
2800  // Get a view of the column indices in the row. This amortizes
2801  // the cost of getting the view over all the entries of inds.
2802  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2803 
2804  const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
2805  for (LO j = 0; j < numElts; ++j) {
2806  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2807  if (gblColInd != GINV) {
2808  const size_t offset =
2809  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2810  gblColInd, hint, sorted);
2811  if (offset != rowInfo.numEntries) {
2812  if (atomic) {
2813  // NOTE (mfh 30 Nov 2015) The commented-out code is
2814  // wrong because another thread may have changed
2815  // rowVals[offset] between those two lines of code.
2816  //
2817  //const ST newVal = f (rowVals[offset], newVals[j]);
2818  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2819 
2820  volatile ST* const dest = &rowVals[offset];
2821  (void) atomic_binary_function_update (dest, newVals[j], f);
2822  }
2823  else {
2824  // use binary function f
2825  rowVals[offset] = f (rowVals[offset], newVals[j]);
2826  }
2827  hint = offset + 1;
2828  numValid++;
2829  }
2830  }
2831  }
2832  }
2833  // If the graph is neither locally nor globally indexed on the
2834  // calling process, that means the calling process has no graph
2835  // entries. Thus, none of the input column indices are valid.
2836 
2837  return numValid;
2838  }
2839 
2840  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2841  LocalOrdinal
2842  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2843  transformGlobalValues (impl_scalar_type rowVals[],
2844  const crs_graph_type& graph,
2845  const RowInfo& rowInfo,
2846  const GlobalOrdinal inds[],
2847  const impl_scalar_type newVals[],
2848  const LocalOrdinal numElts,
2849  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2850  const bool atomic)
2851  {
2852  typedef impl_scalar_type ST;
2853  typedef LocalOrdinal LO;
2854  typedef GlobalOrdinal GO;
2855 
2856  //if (newVals.extent (0) != inds.extent (0)) {
2857  // The sizes of the input arrays must match.
2858  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2859  //}
2860  //const LO numElts = static_cast<LO> (inds.extent (0));
2861  const bool sorted = graph.isSorted ();
2862 
2863  LO numValid = 0; // number of valid input column indices
2864  size_t hint = 0; // Guess for the current index k into rowVals
2865 
2866  if (graph.isGloballyIndexed ()) {
2867  // Get a view of the column indices in the row. This amortizes
2868  // the cost of getting the view over all the entries of inds.
2869  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2870 
2871  for (LO j = 0; j < numElts; ++j) {
2872  const GO gblColInd = inds[j];
2873  const size_t offset =
2874  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2875  gblColInd, hint, sorted);
2876  if (offset != rowInfo.numEntries) {
2877  if (atomic) {
2878  // NOTE (mfh 30 Nov 2015) The commented-out code is
2879  // wrong because another thread may have changed
2880  // rowVals[offset] between those two lines of code.
2881  //
2882  //const ST newVal = f (rowVals[offset], newVals[j]);
2883  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2884 
2885  volatile ST* const dest = &rowVals[offset];
2886  (void) atomic_binary_function_update (dest, newVals[j], f);
2887  }
2888  else {
2889  // use binary function f
2890  rowVals[offset] = f (rowVals[offset], newVals[j]);
2891  }
2892  hint = offset + 1;
2893  ++numValid;
2894  }
2895  }
2896  }
2897  else if (graph.isLocallyIndexed ()) {
2898  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2899  // pointer does NOT change its reference count. Thus, this
2900  // code is still thread safe.
2901  if (graph.colMap_.is_null ()) {
2902  // NO input column indices are valid in this case. Either the
2903  // column Map hasn't been set yet (so local indices don't
2904  // exist yet), or the calling process owns no graph entries.
2905  return numValid;
2906  }
2907  const map_type& colMap = * (graph.colMap_);
2908  // Get a view of the column indices in the row. This amortizes
2909  // the cost of getting the view over all the entries of inds.
2910  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2911 
2912  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2913  for (LO j = 0; j < numElts; ++j) {
2914  const LO lclColInd = colMap.getLocalElement (inds[j]);
2915  if (lclColInd != LINV) {
2916  const size_t offset =
2917  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2918  lclColInd, hint, sorted);
2919  if (offset != rowInfo.numEntries) {
2920  if (atomic) {
2921  // NOTE (mfh 30 Nov 2015) The commented-out code is
2922  // wrong because another thread may have changed
2923  // rowVals[offset] between those two lines of code.
2924  //
2925  //const ST newVal = f (rowVals[offset], newVals[j]);
2926  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2927 
2928  volatile ST* const dest = &rowVals[offset];
2929  (void) atomic_binary_function_update (dest, newVals[j], f);
2930  }
2931  else {
2932  // use binary function f
2933  rowVals[offset] = f (rowVals[offset], newVals[j]);
2934  }
2935  hint = offset + 1;
2936  numValid++;
2937  }
2938  }
2939  }
2940  }
2941  // If the graph is neither locally nor globally indexed on the
2942  // calling process, that means the calling process has no graph
2943  // entries. Thus, none of the input column indices are valid.
2944 
2945  return numValid;
2946  }
2947 
2948  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2949  LocalOrdinal
2952  const crs_graph_type& graph,
2953  const RowInfo& rowInfo,
2954  const LocalOrdinal inds[],
2955  const impl_scalar_type newVals[],
2956  const LocalOrdinal numElts,
2957  const bool atomic)
2958  {
2959  typedef LocalOrdinal LO;
2960  typedef GlobalOrdinal GO;
2961 
2962  const bool sorted = graph.isSorted ();
2963 
2964  size_t hint = 0; // Guess for the current index k into rowVals
2965  LO numValid = 0; // number of valid local column indices
2966 
2967  if (graph.isLocallyIndexed ()) {
2968  // Get a view of the column indices in the row. This amortizes
2969  // the cost of getting the view over all the entries of inds.
2970  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2971 
2972  for (LO j = 0; j < numElts; ++j) {
2973  const LO lclColInd = inds[j];
2974  const size_t offset =
2975  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2976  lclColInd, hint, sorted);
2977  if (offset != rowInfo.numEntries) {
2978  if (atomic) {
2979  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2980  }
2981  else {
2982  rowVals[offset] += newVals[j];
2983  }
2984  hint = offset + 1;
2985  ++numValid;
2986  }
2987  }
2988  }
2989  else if (graph.isGloballyIndexed ()) {
2990  if (graph.colMap_.is_null ()) {
2991  return Teuchos::OrdinalTraits<LO>::invalid ();
2992  }
2993  const map_type colMap = * (graph.colMap_);
2994 
2995  // Get a view of the column indices in the row. This amortizes
2996  // the cost of getting the view over all the entries of inds.
2997  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2998 
2999  for (LO j = 0; j < numElts; ++j) {
3000  const GO gblColInd = colMap.getGlobalElement (inds[j]);
3001  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
3002  const size_t offset =
3003  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3004  gblColInd, hint, sorted);
3005  if (offset != rowInfo.numEntries) {
3006  if (atomic) {
3007  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3008  }
3009  else {
3010  rowVals[offset] += newVals[j];
3011  }
3012  hint = offset + 1;
3013  ++numValid;
3014  }
3015  }
3016  }
3017  }
3018  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
3019  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
3020  // to be neither locally nor globally indexed on a process.
3021  // This means that the graph or matrix has no entries on that
3022  // process. Epetra also works like this. It's related to lazy
3023  // allocation (on first insertion, not at graph / matrix
3024  // construction). Lazy allocation will go away because it is
3025  // not thread scalable.
3026 
3027  return numValid;
3028  }
3029 
3030  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3031  LocalOrdinal
3033  sumIntoLocalValues (const LocalOrdinal localRow,
3034  const Teuchos::ArrayView<const LocalOrdinal>& indices,
3035  const Teuchos::ArrayView<const Scalar>& values,
3036  const bool atomic)
3037  {
3038  using LO = local_ordinal_type;
3039  const LO numInputEnt = static_cast<LO>(indices.size());
3040  if (static_cast<LO>(values.size()) != numInputEnt) {
3041  return Teuchos::OrdinalTraits<LO>::invalid();
3042  }
3043  const LO* const inputInds = indices.getRawPtr();
3044  const scalar_type* const inputVals = values.getRawPtr();
3045  return this->sumIntoLocalValues(localRow, numInputEnt,
3046  inputVals, inputInds, atomic);
3047  }
3048 
3049  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3051  local_ordinal_type
3054  const local_ordinal_type localRow,
3055  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
3056  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
3057  const bool atomic)
3058  {
3059  using LO = local_ordinal_type;
3060  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
3061  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
3062  return Teuchos::OrdinalTraits<LO>::invalid();
3063  }
3064  const scalar_type* inVals =
3065  reinterpret_cast<const scalar_type*>(inputVals.data());
3066  return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
3067  inputInds.data(), atomic);
3068  }
3069 
3070  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3071  LocalOrdinal
3073  sumIntoLocalValues (const LocalOrdinal localRow,
3074  const LocalOrdinal numEnt,
3075  const Scalar vals[],
3076  const LocalOrdinal cols[],
3077  const bool atomic)
3078  {
3079  typedef impl_scalar_type IST;
3080  typedef LocalOrdinal LO;
3081 
3082  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3083  // Fill must be active and the "nonconst" graph must exist.
3084  return Teuchos::OrdinalTraits<LO>::invalid ();
3085  }
3086  const crs_graph_type& graph = * (this->staticGraph_);
3087  const RowInfo rowInfo = graph.getRowInfo (localRow);
3088 
3089  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3090  // The calling process does not own this row, so it is not
3091  // allowed to modify its values.
3092  return static_cast<LO> (0);
3093  }
3094  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
3095  const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3096  return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3097  cols, inputVals, numEnt, atomic);
3098  }
3099 
3100  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3102  values_dualv_type::t_host::const_type
3104  getValuesViewHost (const RowInfo& rowinfo) const
3105  {
3106  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3107  return typename values_dualv_type::t_host::const_type ();
3108  else
3109  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3110  rowinfo.allocSize,
3111  Access::ReadOnly);
3112  }
3113 
3114  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3116  values_dualv_type::t_host
3119  {
3120  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3121  return typename values_dualv_type::t_host ();
3122  else
3123  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3124  rowinfo.allocSize,
3125  Access::ReadWrite);
3126  }
3127 
3128  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3130  values_dualv_type::t_dev::const_type
3132  getValuesViewDevice (const RowInfo& rowinfo) const
3133  {
3134  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3135  return typename values_dualv_type::t_dev::const_type ();
3136  else
3137  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3138  rowinfo.allocSize,
3139  Access::ReadOnly);
3140  }
3141 
3142  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3144  values_dualv_type::t_dev
3147  {
3148  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3149  return typename values_dualv_type::t_dev ();
3150  else
3151  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3152  rowinfo.allocSize,
3153  Access::ReadWrite);
3154  }
3155 
3156 
3157  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3158  void
3161  nonconst_local_inds_host_view_type &indices,
3162  nonconst_values_host_view_type &values,
3163  size_t& numEntries) const
3164  {
3165  using Teuchos::ArrayView;
3166  using Teuchos::av_reinterpret_cast;
3167  const char tfecfFuncName[] = "getLocalRowCopy: ";
3168 
3169  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3170  (! this->hasColMap (), std::runtime_error,
3171  "The matrix does not have a column Map yet. This means we don't have "
3172  "local indices for columns yet, so it doesn't make sense to call this "
3173  "method. If the matrix doesn't have a column Map yet, you should call "
3174  "fillComplete on it first.");
3175 
3176  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3177  const size_t theNumEntries = rowinfo.numEntries;
3178  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3179  (static_cast<size_t> (indices.size ()) < theNumEntries ||
3180  static_cast<size_t> (values.size ()) < theNumEntries,
3181  std::runtime_error, "Row with local index " << localRow << " has " <<
3182  theNumEntries << " entry/ies, but indices.size() = " <<
3183  indices.size () << " and values.size() = " << values.size () << ".");
3184  numEntries = theNumEntries; // first side effect
3185 
3186  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3187  if (staticGraph_->isLocallyIndexed ()) {
3188  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3189  auto curVals = getValuesViewHost(rowinfo);
3190 
3191  for (size_t j = 0; j < theNumEntries; ++j) {
3192  values[j] = curVals[j];
3193  indices[j] = curLclInds(j);
3194  }
3195  }
3196  else if (staticGraph_->isGloballyIndexed ()) {
3197  // Don't call getColMap(), because it touches RCP's reference count.
3198  const map_type& colMap = * (staticGraph_->colMap_);
3199  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3200  auto curVals = getValuesViewHost(rowinfo);
3201 
3202  for (size_t j = 0; j < theNumEntries; ++j) {
3203  values[j] = curVals[j];
3204  indices[j] = colMap.getLocalElement (curGblInds(j));
3205  }
3206  }
3207  }
3208  }
3209 
3210 
3211 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3212 void
3215  nonconst_global_inds_host_view_type &indices,
3216  nonconst_values_host_view_type &values,
3217  size_t& numEntries) const
3218  {
3219  using Teuchos::ArrayView;
3220  using Teuchos::av_reinterpret_cast;
3221  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3222 
3223  const RowInfo rowinfo =
3224  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3225  const size_t theNumEntries = rowinfo.numEntries;
3226  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3227  static_cast<size_t> (indices.size ()) < theNumEntries ||
3228  static_cast<size_t> (values.size ()) < theNumEntries,
3229  std::runtime_error, "Row with global index " << globalRow << " has "
3230  << theNumEntries << " entry/ies, but indices.size() = " <<
3231  indices.size () << " and values.size() = " << values.size () << ".");
3232  numEntries = theNumEntries; // first side effect
3233 
3234  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3235  if (staticGraph_->isLocallyIndexed ()) {
3236  const map_type& colMap = * (staticGraph_->colMap_);
3237  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3238  auto curVals = getValuesViewHost(rowinfo);
3239 
3240  for (size_t j = 0; j < theNumEntries; ++j) {
3241  values[j] = curVals[j];
3242  indices[j] = colMap.getGlobalElement (curLclInds(j));
3243  }
3244  }
3245  else if (staticGraph_->isGloballyIndexed ()) {
3246  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3247  auto curVals = getValuesViewHost(rowinfo);
3248 
3249  for (size_t j = 0; j < theNumEntries; ++j) {
3250  values[j] = curVals[j];
3251  indices[j] = curGblInds(j);
3252  }
3253  }
3254  }
3255  }
3256 
3257 
3258  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3259  void
3261  getLocalRowView(LocalOrdinal localRow,
3262  local_inds_host_view_type &indices,
3263  values_host_view_type &values) const
3264  {
3265  const char tfecfFuncName[] = "getLocalRowView: ";
3266 
3267  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3268  isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3269  "its indices as global indices, so you cannot get a view with local "
3270  "column indices. If the matrix has a column Map, you may call "
3271  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3272  "a view with global column indices by calling getGlobalRowCopy().");
3273 
3274  const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
3275  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3276  rowInfo.numEntries > 0) {
3277  indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3278  rowInfo.offset1D,
3279  rowInfo.numEntries,
3280  Access::ReadOnly);
3281  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3282  rowInfo.numEntries,
3283  Access::ReadOnly);
3284  }
3285  else {
3286  // This does the right thing (reports an empty row) if the input
3287  // row is invalid.
3288  indices = local_inds_host_view_type();
3289  values = values_host_view_type();
3290  }
3291 
3292 #ifdef HAVE_TPETRA_DEBUG
3293  const char suffix[] = ". This should never happen. Please report this "
3294  "bug to the Tpetra developers.";
3295  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3296  (static_cast<size_t> (indices.size ()) !=
3297  static_cast<size_t> (values.size ()), std::logic_error,
3298  "At the end of this method, for local row " << localRow << ", "
3299  "indices.size() = " << indices.size () << " != values.size () = "
3300  << values.size () << suffix);
3301  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3302  (static_cast<size_t> (indices.size ()) !=
3303  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3304  "At the end of this method, for local row " << localRow << ", "
3305  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3306  << rowInfo.numEntries << suffix);
3307  const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3308  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3309  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3310  "of this method, for local row " << localRow << ", rowInfo.numEntries = "
3311  << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3312  expectedNumEntries << suffix);
3313 #endif // HAVE_TPETRA_DEBUG
3314  }
3315 
3316 
3317  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3318  void
3320  getGlobalRowView (GlobalOrdinal globalRow,
3321  global_inds_host_view_type &indices,
3322  values_host_view_type &values) const
3323  {
3324  const char tfecfFuncName[] = "getGlobalRowView: ";
3325 
3326  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3327  isLocallyIndexed (), std::runtime_error,
3328  "The matrix is locally indexed, so we cannot return a view of the row "
3329  "with global column indices. Use getGlobalRowCopy() instead.");
3330 
3331  // This does the right thing (reports an empty row) if the input
3332  // row is invalid.
3333  const RowInfo rowInfo =
3334  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3335  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3336  rowInfo.numEntries > 0) {
3337  indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3338  rowInfo.numEntries,
3339  Access::ReadOnly);
3340  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3341  rowInfo.numEntries,
3342  Access::ReadOnly);
3343  }
3344  else {
3345  indices = global_inds_host_view_type();
3346  values = values_host_view_type();
3347  }
3348 
3349 #ifdef HAVE_TPETRA_DEBUG
3350  const char suffix[] = ". This should never happen. Please report this "
3351  "bug to the Tpetra developers.";
3352  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3353  (static_cast<size_t> (indices.size ()) !=
3354  static_cast<size_t> (values.size ()), std::logic_error,
3355  "At the end of this method, for global row " << globalRow << ", "
3356  "indices.size() = " << indices.size () << " != values.size () = "
3357  << values.size () << suffix);
3358  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3359  (static_cast<size_t> (indices.size ()) !=
3360  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3361  "At the end of this method, for global row " << globalRow << ", "
3362  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3363  << rowInfo.numEntries << suffix);
3364  const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3365  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3366  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3367  "of this method, for global row " << globalRow << ", rowInfo.numEntries "
3368  "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3369  " " << expectedNumEntries << suffix);
3370 #endif // HAVE_TPETRA_DEBUG
3371  }
3372 
3373 
3374  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3375  void
3377  scale (const Scalar& alpha)
3378  {
3379  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3380 
3381  const size_t nlrs = staticGraph_->getLocalNumRows ();
3382  const size_t numEntries = staticGraph_->getLocalNumEntries ();
3383  if (! staticGraph_->indicesAreAllocated () ||
3384  nlrs == 0 || numEntries == 0) {
3385  // do nothing
3386  }
3387  else {
3388 
3389  auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3390  KokkosBlas::scal(vals, theAlpha, vals);
3391 
3392  }
3393  }
3394 
3395  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3396  void
3398  setAllToScalar (const Scalar& alpha)
3399  {
3400  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3401 
3402  // replace all values in the matrix
3403  // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
3404  // however, if there are no valid entries, we can short-circuit
3405  // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
3406  const size_t numEntries = staticGraph_->getLocalNumEntries();
3407  if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
3408  // do nothing
3409  }
3410  else {
3411  // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3412  Kokkos::deep_copy (execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3413  theAlpha);
3414  // CAG: This fence was found to be required on Cuda with UVM=on.
3415  Kokkos::fence("CrsMatrix::setAllToScalar");
3416  }
3417  }
3418 
3419  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3420  void
3422  setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
3423  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3424  const typename local_matrix_device_type::values_type& values)
3425  {
3426  using ProfilingRegion=Details::ProfilingRegion;
3427  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
3428  const char tfecfFuncName[] = "setAllValues: ";
3429  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3430  (columnIndices.size () != values.size (), std::invalid_argument,
3431  "columnIndices.size() = " << columnIndices.size () << " != values.size()"
3432  " = " << values.size () << ".");
3433  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3434  (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
3435 
3436  try {
3437  myGraph_->setAllIndices (rowPointers, columnIndices);
3438  }
3439  catch (std::exception &e) {
3440  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3441  (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
3442  "exception: " << e.what ());
3443  }
3444 
3445  // Make sure that myGraph_ now has a local graph. It may not be
3446  // fillComplete yet, so it's important to check. We don't care
3447  // whether setAllIndices() did a shallow copy or a deep copy, so a
3448  // good way to check is to compare dimensions.
3449  auto lclGraph = myGraph_->getLocalGraphDevice ();
3450  const size_t numEnt = lclGraph.entries.extent (0);
3451  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3452  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
3453  numEnt != static_cast<size_t> (columnIndices.extent (0)),
3454  std::logic_error, "myGraph_->setAllIndices() did not correctly create "
3455  "local graph. Please report this bug to the Tpetra developers.");
3456 
3457  valuesPacked_wdv = values_wdv_type(values);
3458  valuesUnpacked_wdv = valuesPacked_wdv;
3459 
3460  // Storage MUST be packed, since the interface doesn't give any
3461  // way to indicate any extra space at the end of each row.
3462  this->storageStatus_ = Details::STORAGE_1D_PACKED;
3463 
3464  checkInternalState ();
3465  }
3466 
3467  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3468  void
3470  setAllValues ( const local_matrix_device_type& localDeviceMatrix)
3471  {
3472  using ProfilingRegion=Details::ProfilingRegion;
3473  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3474 
3475  auto graph = localDeviceMatrix.graph;
3476  //FIXME how to check whether graph is allocated
3477 
3478  auto rows = graph.row_map;
3479  auto columns = graph.entries;
3480  auto values = localDeviceMatrix.values;
3481 
3482  setAllValues(rows,columns,values);
3483  }
3484 
3485  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3486  void
3488  setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
3489  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3490  const Teuchos::ArrayRCP<Scalar>& val)
3491  {
3492  using Kokkos::Compat::getKokkosViewDeepCopy;
3493  using Teuchos::ArrayRCP;
3494  using Teuchos::av_reinterpret_cast;
3495  typedef device_type DT;
3496  typedef impl_scalar_type IST;
3497  typedef typename local_graph_device_type::row_map_type row_map_type;
3498  //typedef typename row_map_type::non_const_value_type row_offset_type;
3499  const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3500 
3501  // The row offset type may depend on the execution space. It may
3502  // not necessarily be size_t. If it's not, we need to make a deep
3503  // copy. We need to make a deep copy anyway so that Kokkos can
3504  // own the memory. Regardless, ptrIn gets the copy.
3505  typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
3506  Kokkos::View<const size_t*,
3507  typename row_map_type::array_layout,
3508  Kokkos::HostSpace,
3509  Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
3510  ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
3511 
3512  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3513  (ptrNative.extent (0) != ptrSizeT.extent (0),
3514  std::logic_error, "ptrNative.extent(0) = " <<
3515  ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
3516  << ptrSizeT.extent (0) << ". Please report this bug to the "
3517  "Tpetra developers.");
3518 
3519  auto indIn = getKokkosViewDeepCopy<DT> (ind ());
3520  auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
3521  this->setAllValues (ptrNative, indIn, valIn);
3522  }
3523 
3524  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3525  void
3527  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
3528  {
3529  const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3530  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3531  (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
3532 
3533  // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3534  // this method in CrsGraph too, so don't call it (otherwise build
3535  // warnings will show up and annoy users). Instead, copy results
3536  // in and out, if the memory space requires it.
3537 
3538  const size_t lclNumRows = staticGraph_->getLocalNumRows ();
3539  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
3540  offsets.resize (lclNumRows);
3541  }
3542 
3543  // The input ArrayRCP must always be a host pointer. Thus, if
3544  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3545  // to write to that allocation directly as a Kokkos::View.
3546  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3547  // It is always syntactically correct to assign a raw host
3548  // pointer to a device View, so this code will compile correctly
3549  // even if this branch never runs.
3550  typedef Kokkos::View<size_t*, device_type,
3551  Kokkos::MemoryUnmanaged> output_type;
3552  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3553  staticGraph_->getLocalDiagOffsets (offsetsOut);
3554  }
3555  else {
3556  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
3557  staticGraph_->getLocalDiagOffsets (offsetsTmp);
3558  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3559  Kokkos::MemoryUnmanaged> output_type;
3560  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3561  // DEEP_COPY REVIEW - DEVICE-TO-HOST
3562  Kokkos::deep_copy (execution_space(), offsetsOut, offsetsTmp);
3563  }
3564  }
3565 
3566  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3567  void
3570  {
3571  using Teuchos::ArrayRCP;
3572  using Teuchos::ArrayView;
3573  using Teuchos::av_reinterpret_cast;
3574  const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3575  typedef local_ordinal_type LO;
3576 
3577 
3578  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3579  staticGraph_.is_null (), std::runtime_error,
3580  "This method requires that the matrix have a graph.");
3581  auto rowMapPtr = this->getRowMap ();
3582  if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
3583  // Processes on which the row Map or its communicator is null
3584  // don't participate. Users shouldn't even call this method on
3585  // those processes.
3586  return;
3587  }
3588  auto colMapPtr = this->getColMap ();
3589  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3590  (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
3591  "This method requires that the matrix have a column Map.");
3592  const map_type& rowMap = * rowMapPtr;
3593  const map_type& colMap = * colMapPtr;
3594  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3595 
3596 #ifdef HAVE_TPETRA_DEBUG
3597  // isCompatible() requires an all-reduce, and thus this check
3598  // should only be done in debug mode.
3599  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3600  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3601  "The input Vector's Map must be compatible with the CrsMatrix's row "
3602  "Map. You may check this by using Map's isCompatible method: "
3603  "diag.getMap ()->isCompatible (A.getRowMap ());");
3604 #endif // HAVE_TPETRA_DEBUG
3605 
3606  if (this->isFillComplete ()) {
3607  const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3608  // 1-D subview of the first (and only) column of D_lcl.
3609  const auto D_lcl_1d =
3610  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3611 
3612  const auto lclRowMap = rowMap.getLocalMap ();
3613  const auto lclColMap = colMap.getLocalMap ();
3615  (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
3616  lclColMap,
3617  getLocalMatrixDevice ());
3618  }
3619  else {
3621  (void) getLocalDiagCopyWithoutOffsetsNotFillComplete (diag, *this);
3622  }
3623  }
3624 
3625  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3626  void
3629  const Kokkos::View<const size_t*, device_type,
3630  Kokkos::MemoryUnmanaged>& offsets) const
3631  {
3632  typedef LocalOrdinal LO;
3633 
3634 #ifdef HAVE_TPETRA_DEBUG
3635  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3636  const map_type& rowMap = * (this->getRowMap ());
3637  // isCompatible() requires an all-reduce, and thus this check
3638  // should only be done in debug mode.
3639  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3640  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3641  "The input Vector's Map must be compatible with (in the sense of Map::"
3642  "isCompatible) the CrsMatrix's row Map.");
3643 #endif // HAVE_TPETRA_DEBUG
3644 
3645  // For now, we fill the Vector on the host and sync to device.
3646  // Later, we may write a parallel kernel that works entirely on
3647  // device.
3648  //
3649  // NOTE (mfh 21 Jan 2016): The host kernel here assumes UVM. Once
3650  // we write a device kernel, it will not need to assume UVM.
3651 
3652  auto D_lcl = diag.getLocalViewDevice (Access::OverwriteAll);
3653  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3654  // Get 1-D subview of the first (and only) column of D_lcl.
3655  auto D_lcl_1d =
3656  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3657 
3658  KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
3659  getLocalMatrixDevice ());
3660  }
3661 
3662  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3663  void
3666  const Teuchos::ArrayView<const size_t>& offsets) const
3667  {
3668  using LO = LocalOrdinal;
3669  using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3670  using IST = impl_scalar_type;
3671 
3672 #ifdef HAVE_TPETRA_DEBUG
3673  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3674  const map_type& rowMap = * (this->getRowMap ());
3675  // isCompatible() requires an all-reduce, and thus this check
3676  // should only be done in debug mode.
3677  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3678  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3679  "The input Vector's Map must be compatible with (in the sense of Map::"
3680  "isCompatible) the CrsMatrix's row Map.");
3681 #endif // HAVE_TPETRA_DEBUG
3682 
3683  // See #1510. In case diag has already been marked modified on
3684  // device, we need to clear that flag, since the code below works
3685  // on host.
3686  //diag.clear_sync_state ();
3687 
3688  // For now, we fill the Vector on the host and sync to device.
3689  // Later, we may write a parallel kernel that works entirely on
3690  // device.
3691  auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3692  // 1-D subview of the first (and only) column of lclVecHost.
3693  auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
3694 
3695  using host_offsets_view_type =
3696  Kokkos::View<const size_t*, Kokkos::HostSpace,
3697  Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
3698  host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
3699  // Find the diagonal entries and put them in lclVecHost1d.
3700  using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3701  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3702  const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
3703 
3704  auto rowPtrsPackedHost = staticGraph_->getRowPtrsPackedHost();
3705  auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
3706  Kokkos::parallel_for
3707  ("Tpetra::CrsMatrix::getLocalDiagCopy",
3708  range_type (0, myNumRows),
3709  [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3710  lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
3711  if (h_offsets[lclRow] != INV) {
3712  auto curRowOffset = rowPtrsPackedHost (lclRow);
3713  lclVecHost1d(lclRow) =
3714  static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
3715  }
3716  });
3717  //diag.sync_device ();
3718  }
3719 
3720 
3721  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3722  void
3725  {
3726  using ::Tpetra::Details::ProfilingRegion;
3727  using Teuchos::ArrayRCP;
3728  using Teuchos::ArrayView;
3729  using Teuchos::null;
3730  using Teuchos::RCP;
3731  using Teuchos::rcp;
3732  using Teuchos::rcpFromRef;
3734  const char tfecfFuncName[] = "leftScale: ";
3735 
3736  ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
3737 
3738  RCP<const vec_type> xp;
3739  if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
3740  // Take from Epetra: If we have a non-trivial exporter, we must
3741  // import elements that are permuted or are on other processors.
3742  auto exporter = this->getCrsGraphRef ().getExporter ();
3743  if (exporter.get () != nullptr) {
3744  RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
3745  tempVec->doImport (x, *exporter, REPLACE); // reverse mode
3746  xp = tempVec;
3747  }
3748  else {
3749  xp = rcpFromRef (x);
3750  }
3751  }
3752  else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
3753  xp = rcpFromRef (x);
3754  }
3755  else {
3756  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3757  (true, std::invalid_argument, "x's Map must be the same as "
3758  "either the row Map or the range Map of the CrsMatrix.");
3759  }
3760 
3761  if (this->isFillComplete()) {
3762  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3763  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3765  leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
3766  x_lcl_1d, false, false);
3767  }
3768  else {
3769  // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3770  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3771  (true, std::runtime_error, "CrsMatrix::leftScale requires matrix to be"
3772  " fillComplete");
3773  }
3774  }
3775 
3776  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3777  void
3780  {
3781  using ::Tpetra::Details::ProfilingRegion;
3782  using Teuchos::ArrayRCP;
3783  using Teuchos::ArrayView;
3784  using Teuchos::null;
3785  using Teuchos::RCP;
3786  using Teuchos::rcp;
3787  using Teuchos::rcpFromRef;
3789  const char tfecfFuncName[] = "rightScale: ";
3790 
3791  ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
3792 
3793  RCP<const vec_type> xp;
3794  if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
3795  // Take from Epetra: If we have a non-trivial exporter, we must
3796  // import elements that are permuted or are on other processors.
3797  auto importer = this->getCrsGraphRef ().getImporter ();
3798  if (importer.get () != nullptr) {
3799  RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
3800  tempVec->doImport (x, *importer, REPLACE);
3801  xp = tempVec;
3802  }
3803  else {
3804  xp = rcpFromRef (x);
3805  }
3806  }
3807  else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
3808  xp = rcpFromRef (x);
3809  } else {
3810  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3811  (true, std::runtime_error, "x's Map must be the same as "
3812  "either the domain Map or the column Map of the CrsMatrix.");
3813  }
3814 
3815  if (this->isFillComplete()) {
3816  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3817  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3819  rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
3820  x_lcl_1d, false, false);
3821  }
3822  else {
3823  // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3824  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3825  (true, std::runtime_error, "CrsMatrix::rightScale requires matrix to be"
3826  " fillComplete");
3827  }
3828  }
3829 
3830  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3834  {
3835  using Teuchos::ArrayView;
3836  using Teuchos::outArg;
3837  using Teuchos::REDUCE_SUM;
3838  using Teuchos::reduceAll;
3839 
3840  // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3841  // local part of this computation. It could make sense to put
3842  // this operation in the Kokkos::CrsMatrix.
3843 
3844  // check the cache first
3845  mag_type mySum = STM::zero ();
3846  if (getLocalNumEntries() > 0) {
3847  if (isStorageOptimized ()) {
3848  // "Optimized" storage is packed storage. That means we can
3849  // iterate in one pass through the 1-D values array.
3850  const size_t numEntries = getLocalNumEntries ();
3851  auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3852  for (size_t k = 0; k < numEntries; ++k) {
3853  auto val = values[k];
3854  // Note (etp 06 Jan 2015) We need abs() here for composite types
3855  // (in general, if mag_type is on the left-hand-side, we need
3856  // abs() on the right-hand-side)
3857  const mag_type val_abs = STS::abs (val);
3858  mySum += val_abs * val_abs;
3859  }
3860  }
3861  else {
3862  const LocalOrdinal numRows =
3863  static_cast<LocalOrdinal> (this->getLocalNumRows ());
3864  for (LocalOrdinal r = 0; r < numRows; ++r) {
3865  const RowInfo rowInfo = myGraph_->getRowInfo (r);
3866  const size_t numEntries = rowInfo.numEntries;
3867  auto A_r = this->getValuesViewHost(rowInfo);
3868  for (size_t k = 0; k < numEntries; ++k) {
3869  const impl_scalar_type val = A_r[k];
3870  const mag_type val_abs = STS::abs (val);
3871  mySum += val_abs * val_abs;
3872  }
3873  }
3874  }
3875  }
3876  mag_type totalSum = STM::zero ();
3877  reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
3878  mySum, outArg (totalSum));
3879  return STM::sqrt (totalSum);
3880  }
3881 
3882  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3883  void
3885  replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
3886  {
3887  const char tfecfFuncName[] = "replaceColMap: ";
3888  // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3889  // Then replacing the column Map might mean that we need to
3890  // reindex the column indices.
3891  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3892  myGraph_.is_null (), std::runtime_error,
3893  "This method does not work if the matrix has a const graph. The whole "
3894  "idea of a const graph is that you are not allowed to change it, but "
3895  "this method necessarily must modify the graph, since the graph owns "
3896  "the matrix's column Map.");
3897  myGraph_->replaceColMap (newColMap);
3898  }
3899 
3900  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3901  void
3904  const Teuchos::RCP<const map_type>& newColMap,
3905  const Teuchos::RCP<const import_type>& newImport,
3906  const bool sortEachRow)
3907  {
3908  const char tfecfFuncName[] = "reindexColumns: ";
3909  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3910  graph == nullptr && myGraph_.is_null (), std::invalid_argument,
3911  "The input graph is null, but the matrix does not own its graph.");
3912 
3913  crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3914  const bool sortGraph = false; // we'll sort graph & matrix together below
3915 
3916  theGraph.reindexColumns (newColMap, newImport, sortGraph);
3917 
3918  if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
3919  const LocalOrdinal lclNumRows =
3920  static_cast<LocalOrdinal> (theGraph.getLocalNumRows ());
3921 
3922  for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3923 
3924  const RowInfo rowInfo = theGraph.getRowInfo (row);
3925  auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
3926  auto vals = this->getValuesViewHostNonConst (rowInfo);
3927 
3928  sort2 (lclColInds.data (),
3929  lclColInds.data () + rowInfo.numEntries,
3930  vals.data ());
3931  }
3932  theGraph.indicesAreSorted_ = true;
3933  }
3934  }
3935 
3936  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3937  void
3939  replaceDomainMap (const Teuchos::RCP<const map_type>& newDomainMap)
3940  {
3941  const char tfecfFuncName[] = "replaceDomainMap: ";
3942  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3943  myGraph_.is_null (), std::runtime_error,
3944  "This method does not work if the matrix has a const graph. The whole "
3945  "idea of a const graph is that you are not allowed to change it, but this"
3946  " method necessarily must modify the graph, since the graph owns the "
3947  "matrix's domain Map and Import objects.");
3948  myGraph_->replaceDomainMap (newDomainMap);
3949  }
3950 
3951  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3952  void
3954  replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
3955  Teuchos::RCP<const import_type>& newImporter)
3956  {
3957  const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3958  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3959  myGraph_.is_null (), std::runtime_error,
3960  "This method does not work if the matrix has a const graph. The whole "
3961  "idea of a const graph is that you are not allowed to change it, but this"
3962  " method necessarily must modify the graph, since the graph owns the "
3963  "matrix's domain Map and Import objects.");
3964  myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
3965  }
3966 
3967  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3968  void
3970  replaceRangeMap (const Teuchos::RCP<const map_type>& newRangeMap)
3971  {
3972  const char tfecfFuncName[] = "replaceRangeMap: ";
3973  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3974  myGraph_.is_null (), std::runtime_error,
3975  "This method does not work if the matrix has a const graph. The whole "
3976  "idea of a const graph is that you are not allowed to change it, but this"
3977  " method necessarily must modify the graph, since the graph owns the "
3978  "matrix's domain Map and Import objects.");
3979  myGraph_->replaceRangeMap (newRangeMap);
3980  }
3981 
3982  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3983  void
3985  replaceRangeMapAndExporter (const Teuchos::RCP<const map_type>& newRangeMap,
3986  Teuchos::RCP<const export_type>& newExporter)
3987  {
3988  const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
3989  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3990  myGraph_.is_null (), std::runtime_error,
3991  "This method does not work if the matrix has a const graph. The whole "
3992  "idea of a const graph is that you are not allowed to change it, but this"
3993  " method necessarily must modify the graph, since the graph owns the "
3994  "matrix's domain Map and Import objects.");
3995  myGraph_->replaceRangeMapAndExporter (newRangeMap, newExporter);
3996  }
3997 
3998  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3999  void
4001  insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4002  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4003  const Teuchos::ArrayView<const Scalar>& values)
4004  {
4005  using Teuchos::Array;
4006  typedef GlobalOrdinal GO;
4007  typedef typename Array<GO>::size_type size_type;
4008 
4009  const size_type numToInsert = indices.size ();
4010  // Add the new data to the list of nonlocals.
4011  // This creates the arrays if they don't exist yet.
4012  std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4013  Array<GO>& curRowInds = curRow.first;
4014  Array<Scalar>& curRowVals = curRow.second;
4015  const size_type newCapacity = curRowInds.size () + numToInsert;
4016  curRowInds.reserve (newCapacity);
4017  curRowVals.reserve (newCapacity);
4018  for (size_type k = 0; k < numToInsert; ++k) {
4019  curRowInds.push_back (indices[k]);
4020  curRowVals.push_back (values[k]);
4021  }
4022  }
4023 
4024  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4025  void
4028  {
4029  using Details::Behavior;
4031  using Teuchos::Comm;
4032  using Teuchos::outArg;
4033  using Teuchos::RCP;
4034  using Teuchos::rcp;
4035  using Teuchos::REDUCE_MAX;
4036  using Teuchos::REDUCE_MIN;
4037  using Teuchos::reduceAll;
4038  using std::endl;
4040  //typedef LocalOrdinal LO;
4041  typedef GlobalOrdinal GO;
4042  typedef typename Teuchos::Array<GO>::size_type size_type;
4043  const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4044  ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4045 
4046  const bool verbose = Behavior::verbose("CrsMatrix");
4047  std::unique_ptr<std::string> prefix;
4048  if (verbose) {
4049  prefix = this->createPrefix("CrsMatrix", "globalAssemble");
4050  std::ostringstream os;
4051  os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
4052  << endl;
4053  std::cerr << os.str();
4054  }
4055  RCP<const Comm<int> > comm = getComm ();
4056 
4057  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4058  (! isFillActive (), std::runtime_error, "Fill must be active before "
4059  "you may call this method.");
4060 
4061  const size_t myNumNonlocalRows = nonlocals_.size ();
4062 
4063  // If no processes have nonlocal rows, then we don't have to do
4064  // anything. Checking this is probably cheaper than constructing
4065  // the Map of nonlocal rows (see below) and noticing that it has
4066  // zero global entries.
4067  {
4068  const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4069  int someoneHasNonlocalRows = 0;
4070  reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4071  outArg (someoneHasNonlocalRows));
4072  if (someoneHasNonlocalRows == 0) {
4073  return; // no process has nonlocal rows, so nothing to do
4074  }
4075  }
4076 
4077  // 1. Create a list of the "nonlocal" rows on each process. this
4078  // requires iterating over nonlocals_, so while we do this,
4079  // deduplicate the entries and get a count for each nonlocal
4080  // row on this process.
4081  // 2. Construct a new row Map corresponding to those rows. This
4082  // Map is likely overlapping. We know that the Map is not
4083  // empty on all processes, because the above all-reduce and
4084  // return exclude that case.
4085 
4086  RCP<const map_type> nonlocalRowMap;
4087  Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4088  {
4089  Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4090  size_type curPos = 0;
4091  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4092  ++mapIter, ++curPos) {
4093  myNonlocalGblRows[curPos] = mapIter->first;
4094  // Get the values and column indices by reference, since we
4095  // intend to change them in place (that's what "erase" does).
4096  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4097  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4098 
4099  // Sort both arrays jointly, using the column indices as keys,
4100  // then merge them jointly. "Merge" here adds values
4101  // corresponding to the same column indices. The first 2 args
4102  // of merge2 are output arguments that work just like the
4103  // return value of std::unique.
4104  sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4105  typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4106  typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4107  merge2 (gblCols_newEnd, vals_newEnd,
4108  gblCols.begin (), gblCols.end (),
4109  vals.begin (), vals.end ());
4110  gblCols.erase (gblCols_newEnd, gblCols.end ());
4111  vals.erase (vals_newEnd, vals.end ());
4112  numEntPerNonlocalRow[curPos] = gblCols.size ();
4113  }
4114 
4115  // Currently, Map requires that its indexBase be the global min
4116  // of all its global indices. Map won't compute this for us, so
4117  // we must do it. If our process has no nonlocal rows, set the
4118  // "min" to the max possible GO value. This ensures that if
4119  // some process has at least one nonlocal row, then it will pick
4120  // that up as the min. We know that at least one process has a
4121  // nonlocal row, since the all-reduce and return at the top of
4122  // this method excluded that case.
4123  GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4124  {
4125  auto iter = std::min_element (myNonlocalGblRows.begin (),
4126  myNonlocalGblRows.end ());
4127  if (iter != myNonlocalGblRows.end ()) {
4128  myMinNonlocalGblRow = *iter;
4129  }
4130  }
4131  GO gblMinNonlocalGblRow = 0;
4132  reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4133  outArg (gblMinNonlocalGblRow));
4134  const GO indexBase = gblMinNonlocalGblRow;
4135  const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4136  nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4137  }
4138 
4139  // 3. Use the values and column indices for each nonlocal row, as
4140  // stored in nonlocals_, to construct a CrsMatrix corresponding
4141  // to nonlocal rows. We have
4142  // exact counts of the number of entries in each nonlocal row.
4143 
4144  if (verbose) {
4145  std::ostringstream os;
4146  os << *prefix << "Create nonlocal matrix" << endl;
4147  std::cerr << os.str();
4148  }
4149  RCP<crs_matrix_type> nonlocalMatrix =
4150  rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow ()));
4151  {
4152  size_type curPos = 0;
4153  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4154  ++mapIter, ++curPos) {
4155  const GO gblRow = mapIter->first;
4156  // Get values & column indices by ref, just to avoid copy.
4157  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4158  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4159  //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4160  nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4161  }
4162  }
4163  // There's no need to fill-complete the nonlocals matrix.
4164  // We just use it as a temporary container for the Export.
4165 
4166  // 4. If the original row Map is one to one, then we can Export
4167  // directly from nonlocalMatrix into this. Otherwise, we have
4168  // to create a temporary matrix with a one-to-one row Map,
4169  // Export into that, then Import from the temporary matrix into
4170  // *this.
4171 
4172  auto origRowMap = this->getRowMap ();
4173  const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4174 
4175  int isLocallyComplete = 1; // true by default
4176 
4177  if (origRowMapIsOneToOne) {
4178  if (verbose) {
4179  std::ostringstream os;
4180  os << *prefix << "Original row Map is 1-to-1" << endl;
4181  std::cerr << os.str();
4182  }
4183  export_type exportToOrig (nonlocalRowMap, origRowMap);
4184  if (! exportToOrig.isLocallyComplete ()) {
4185  isLocallyComplete = 0;
4186  }
4187  if (verbose) {
4188  std::ostringstream os;
4189  os << *prefix << "doExport from nonlocalMatrix" << endl;
4190  std::cerr << os.str();
4191  }
4192  this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4193  // We're done at this point!
4194  }
4195  else {
4196  if (verbose) {
4197  std::ostringstream os;
4198  os << *prefix << "Original row Map is NOT 1-to-1" << endl;
4199  std::cerr << os.str();
4200  }
4201  // If you ask a Map whether it is one to one, it does some
4202  // communication and stashes intermediate results for later use
4203  // by createOneToOne. Thus, calling createOneToOne doesn't cost
4204  // much more then the original cost of calling isOneToOne.
4205  auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4206  export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4207  if (! exportToOneToOne.isLocallyComplete ()) {
4208  isLocallyComplete = 0;
4209  }
4210 
4211  // Create a temporary matrix with the one-to-one row Map.
4212  //
4213  // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4214  // each row, to avoid reallocation during the Export operation.
4215  if (verbose) {
4216  std::ostringstream os;
4217  os << *prefix << "Create & doExport into 1-to-1 matrix"
4218  << endl;
4219  std::cerr << os.str();
4220  }
4221  crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4222  // Export from matrix of nonlocals into the temp one-to-one matrix.
4223  oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4224  Tpetra::ADD);
4225 
4226  // We don't need the matrix of nonlocals anymore, so get rid of
4227  // it, to keep the memory high-water mark down.
4228  if (verbose) {
4229  std::ostringstream os;
4230  os << *prefix << "Free nonlocalMatrix" << endl;
4231  std::cerr << os.str();
4232  }
4233  nonlocalMatrix = Teuchos::null;
4234 
4235  // Import from the one-to-one matrix to the original matrix.
4236  if (verbose) {
4237  std::ostringstream os;
4238  os << *prefix << "doImport from 1-to-1 matrix" << endl;
4239  std::cerr << os.str();
4240  }
4241  import_type importToOrig (oneToOneRowMap, origRowMap);
4242  this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4243  }
4244 
4245  // It's safe now to clear out nonlocals_, since we've already
4246  // committed side effects to *this. The standard idiom for
4247  // clearing a Container like std::map, is to swap it with an empty
4248  // Container and let the swapped Container fall out of scope.
4249  if (verbose) {
4250  std::ostringstream os;
4251  os << *prefix << "Free nonlocals_ (std::map)" << endl;
4252  std::cerr << os.str();
4253  }
4254  decltype (nonlocals_) newNonlocals;
4255  std::swap (nonlocals_, newNonlocals);
4256 
4257  // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4258  // don't like throwing an exception here. A local return value
4259  // would likely be more useful to users. However, if users find
4260  // themselves exercising nonlocal inserts often, then they are
4261  // probably novice users who need the help. See Gibhub Issues
4262  // #603 and #601 (esp. the latter) for discussion.
4263 
4264  int isGloballyComplete = 0; // output argument of reduceAll
4265  reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4266  outArg (isGloballyComplete));
4267  TEUCHOS_TEST_FOR_EXCEPTION
4268  (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4269  "you called insertGlobalValues with a global row index which is not in "
4270  "the matrix's row Map on any process in its communicator.");
4271  }
4272 
4273  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4274  void
4276  resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4277  {
4278  if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4279  myGraph_->resumeFill (params);
4280  }
4281 #if KOKKOSKERNELS_VERSION >= 40299
4282  // Delete the apply helper (if it exists)
4283  applyHelper.reset();
4284 #endif
4285  fillComplete_ = false;
4286  }
4287 
4288  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4289  bool
4292  return getCrsGraphRef ().haveGlobalConstants ();
4293  }
4294 
4295  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4296  void
4298  fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4299  {
4300  const char tfecfFuncName[] = "fillComplete(params): ";
4301 
4302  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4303  (this->getCrsGraph ().is_null (), std::logic_error,
4304  "getCrsGraph() returns null. This should not happen at this point. "
4305  "Please report this bug to the Tpetra developers.");
4306 
4307  const crs_graph_type& graph = this->getCrsGraphRef ();
4308  if (this->isStaticGraph () && graph.isFillComplete ()) {
4309  // If this matrix's graph is fill complete and the user did not
4310  // supply a domain or range Map, use the graph's domain and
4311  // range Maps.
4312  this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4313  }
4314  else { // assume that user's row Map is the domain and range Map
4315  Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4316  Teuchos::RCP<const map_type> domainMap = rangeMap;
4317  this->fillComplete (domainMap, rangeMap, params);
4318  }
4319  }
4320 
4321  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4322  void
4324  fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4325  const Teuchos::RCP<const map_type>& rangeMap,
4326  const Teuchos::RCP<Teuchos::ParameterList>& params)
4327  {
4328  using Details::Behavior;
4330  using Teuchos::ArrayRCP;
4331  using Teuchos::RCP;
4332  using Teuchos::rcp;
4333  using std::endl;
4334  const char tfecfFuncName[] = "fillComplete: ";
4335  ProfilingRegion regionFillComplete
4336  ("Tpetra::CrsMatrix::fillComplete");
4337  const bool verbose = Behavior::verbose("CrsMatrix");
4338  std::unique_ptr<std::string> prefix;
4339  if (verbose) {
4340  prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4341  std::ostringstream os;
4342  os << *prefix << endl;
4343  std::cerr << os.str ();
4344  }
4345  Details::ProfilingRegion region(
4346  "Tpetra::CrsMatrix::fillCompete",
4347  "fillCompete");
4348 
4349  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4350  (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4351  "Matrix fill state must be active (isFillActive() "
4352  "must be true) before you may call fillComplete().");
4353  const int numProcs = this->getComm ()->getSize ();
4354 
4355  //
4356  // Read parameters from the input ParameterList.
4357  //
4358  {
4359  Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4360 
4361  // If true, the caller promises that no process did nonlocal
4362  // changes since the last call to fillComplete.
4363  bool assertNoNonlocalInserts = false;
4364  // If true, makeColMap sorts remote GIDs (within each remote
4365  // process' group).
4366  bool sortGhosts = true;
4367 
4368  if (! params.is_null ()) {
4369  assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
4370  assertNoNonlocalInserts);
4371  if (params->isParameter ("sort column map ghost gids")) {
4372  sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
4373  }
4374  else if (params->isParameter ("Sort column Map ghost GIDs")) {
4375  sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
4376  }
4377  }
4378  // We also don't need to do global assembly if there is only one
4379  // process in the communicator.
4380  const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
4381  // This parameter only matters if this matrix owns its graph.
4382  if (! this->myGraph_.is_null ()) {
4383  this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4384  }
4385 
4386  if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
4387  if (this->hasColMap ()) { // use local indices
4388  allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4389  }
4390  else { // no column Map, so use global indices
4391  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4392  }
4393  }
4394  // Global assemble, if we need to. This call only costs a single
4395  // all-reduce if we didn't need global assembly after all.
4396  if (needGlobalAssemble) {
4397  this->globalAssemble ();
4398  }
4399  else {
4400  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4401  (numProcs == 1 && nonlocals_.size() > 0,
4402  std::runtime_error, "Cannot have nonlocal entries on a serial run. "
4403  "An invalid entry (i.e., with row index not in the row Map) must have "
4404  "been submitted to the CrsMatrix.");
4405  }
4406  }
4407  if (this->isStaticGraph ()) {
4408  Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4409  // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4410  // checks below only in debug mode. It would be nicer to do a
4411  // local check, then propagate the error state in a deferred
4412  // way, whenever communication happens. That would reduce the
4413  // cost of checking, to the point where it may make sense to
4414  // enable it even in release mode.
4415 #ifdef HAVE_TPETRA_DEBUG
4416  // FIXME (mfh 18 Jun 2014) This check for correctness of the
4417  // input Maps incurs a penalty of two all-reduces for the
4418  // otherwise optimal const graph case.
4419  //
4420  // We could turn these (max) 2 all-reduces into (max) 1, by
4421  // fusing them. We could do this by adding a "locallySameAs"
4422  // method to Map, which would return one of four states:
4423  //
4424  // a. Certainly globally the same
4425  // b. Certainly globally not the same
4426  // c. Locally the same
4427  // d. Locally not the same
4428  //
4429  // The first two states don't require further communication.
4430  // The latter two states require an all-reduce to communicate
4431  // globally, but we only need one all-reduce, since we only need
4432  // to check whether at least one of the Maps is wrong.
4433  const bool domainMapsMatch =
4434  this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
4435  const bool rangeMapsMatch =
4436  this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
4437 
4438  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4439  (! domainMapsMatch, std::runtime_error,
4440  "The CrsMatrix's domain Map does not match the graph's domain Map. "
4441  "The graph cannot be changed because it was given to the CrsMatrix "
4442  "constructor as const. You can fix this by passing in the graph's "
4443  "domain Map and range Map to the matrix's fillComplete call.");
4444 
4445  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4446  (! rangeMapsMatch, std::runtime_error,
4447  "The CrsMatrix's range Map does not match the graph's range Map. "
4448  "The graph cannot be changed because it was given to the CrsMatrix "
4449  "constructor as const. You can fix this by passing in the graph's "
4450  "domain Map and range Map to the matrix's fillComplete call.");
4451 #endif // HAVE_TPETRA_DEBUG
4452 
4453  // The matrix does _not_ own the graph, and the graph's
4454  // structure is already fixed, so just fill the local matrix.
4455  this->fillLocalMatrix (params);
4456  }
4457  else {
4458  Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4459  // Set the graph's domain and range Maps. This will clear the
4460  // Import if the domain Map has changed (is a different
4461  // pointer), and the Export if the range Map has changed (is a
4462  // different pointer).
4463  this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
4464 
4465  // Make the graph's column Map, if necessary.
4466  Teuchos::Array<int> remotePIDs (0);
4467  const bool mustBuildColMap = ! this->hasColMap ();
4468  if (mustBuildColMap) {
4469  this->myGraph_->makeColMap (remotePIDs);
4470  }
4471 
4472  // Make indices local, if necessary. The method won't do
4473  // anything if the graph is already locally indexed.
4474  const std::pair<size_t, std::string> makeIndicesLocalResult =
4475  this->myGraph_->makeIndicesLocal(verbose);
4476  // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4477  // the error state to makeImportExport
4478  // which may do all-reduces and thus may
4479  // have the opportunity to communicate that error state.
4480  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4481  (makeIndicesLocalResult.first != 0, std::runtime_error,
4482  makeIndicesLocalResult.second);
4483 
4484  const bool sorted = this->myGraph_->isSorted ();
4485  const bool merged = this->myGraph_->isMerged ();
4486  this->sortAndMergeIndicesAndValues (sorted, merged);
4487 
4488  // Make Import and Export objects, if they haven't been made
4489  // already. If we made a column Map above, reuse information
4490  // from that process to avoid communiation in the Import setup.
4491  this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
4492 
4493  // The matrix _does_ own the graph, so fill the local graph at
4494  // the same time as the local matrix.
4495  this->fillLocalGraphAndMatrix (params);
4496 
4497  const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
4498  params->get ("compute global constants", true);
4499  if (callGraphComputeGlobalConstants) {
4500  this->myGraph_->computeGlobalConstants ();
4501  }
4502  else {
4503  this->myGraph_->computeLocalConstants ();
4504  }
4505  this->myGraph_->fillComplete_ = true;
4506  this->myGraph_->checkInternalState ();
4507  }
4508 
4509  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4510 
4511  this->fillComplete_ = true; // Now we're fill complete!
4512  {
4513  Details::ProfilingRegion region_cis(
4514  "Tpetra::CrsMatrix::fillCompete", "checkInternalState"
4515  );
4516  this->checkInternalState ();
4517  }
4518  } //fillComplete(domainMap, rangeMap, params)
4519 
4520  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4521  void
4523  expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
4524  const Teuchos::RCP<const map_type> & rangeMap,
4525  const Teuchos::RCP<const import_type>& importer,
4526  const Teuchos::RCP<const export_type>& exporter,
4527  const Teuchos::RCP<Teuchos::ParameterList> &params)
4528  {
4529 #ifdef HAVE_TPETRA_MMM_TIMINGS
4530  std::string label;
4531  if(!params.is_null())
4532  label = params->get("Timer Label",label);
4533  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
4534  using Teuchos::TimeMonitor;
4535 
4536  Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4537 #endif
4538 
4539  const char tfecfFuncName[] = "expertStaticFillComplete: ";
4540  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
4541  std::runtime_error, "Matrix fill state must be active (isFillActive() "
4542  "must be true) before calling fillComplete().");
4543  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4544  myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
4545 
4546  {
4547 #ifdef HAVE_TPETRA_MMM_TIMINGS
4548  Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4549 #endif
4550  // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4551  myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
4552  }
4553 
4554  {
4555 #ifdef HAVE_TPETRA_MMM_TIMINGS
4556  TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4557 #endif
4558  // Fill the local graph and matrix
4559  fillLocalGraphAndMatrix (params);
4560  }
4561  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4562 
4563  // Now we're fill complete!
4564  fillComplete_ = true;
4565 
4566  // Sanity checks at the end.
4567 #ifdef HAVE_TPETRA_DEBUG
4568  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4569  ": We're at the end of fillComplete(), but isFillActive() is true. "
4570  "Please report this bug to the Tpetra developers.");
4571  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
4572  ": We're at the end of fillComplete(), but isFillActive() is true. "
4573  "Please report this bug to the Tpetra developers.");
4574 #endif // HAVE_TPETRA_DEBUG
4575  {
4576 #ifdef HAVE_TPETRA_MMM_TIMINGS
4577  Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4578 #endif
4579 
4580  checkInternalState();
4581  }
4582  }
4583 
4584  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4586  mergeRowIndicesAndValues (size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals)
4587  {
4588  impl_scalar_type* rowValueIter = vals;
4589  // beg,end define a half-exclusive interval over which to iterate.
4590  LocalOrdinal* beg = cols;
4591  LocalOrdinal* end = cols + rowLen;
4592  LocalOrdinal* newend = beg;
4593  if (beg != end) {
4594  LocalOrdinal* cur = beg + 1;
4595  impl_scalar_type* vcur = rowValueIter + 1;
4596  impl_scalar_type* vend = rowValueIter;
4597  cur = beg+1;
4598  while (cur != end) {
4599  if (*cur != *newend) {
4600  // new entry; save it
4601  ++newend;
4602  ++vend;
4603  (*newend) = (*cur);
4604  (*vend) = (*vcur);
4605  }
4606  else {
4607  // old entry; merge it
4608  //(*vend) = f (*vend, *vcur);
4609  (*vend) += *vcur;
4610  }
4611  ++cur;
4612  ++vcur;
4613  }
4614  ++newend; // one past the last entry, per typical [beg,end) semantics
4615  }
4616  return newend - beg;
4617  }
4618 
4619  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4620  void
4622  sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
4623  {
4624  using ::Tpetra::Details::ProfilingRegion;
4625  typedef LocalOrdinal LO;
4626  typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
4627  host_execution_space;
4628  typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4629  const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4630  ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4631 
4632  if (! sorted || ! merged) {
4633  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4634  (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
4635  "\"static\" (const) graph, since the matrix does not own the graph.");
4636  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4637  (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
4638  "this matrix claims ! isStaticGraph(). "
4639  "Please report this bug to the Tpetra developers.");
4640  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4641  (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
4642  "this method if the graph's storage has already been optimized. "
4643  "Please report this bug to the Tpetra developers.");
4644 
4645  crs_graph_type& graph = * (this->myGraph_);
4646  const LO lclNumRows = static_cast<LO> (this->getLocalNumRows ());
4647  size_t totalNumDups = 0;
4648  {
4649  //Accessing host unpacked (4-array CRS) local matrix.
4650  auto rowBegins_ = graph.getRowPtrsUnpackedHost();
4651  auto rowLengths_ = graph.k_numRowEntries_;
4652  auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4653  auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
4654  Kokkos::parallel_reduce ("sortAndMergeIndicesAndValues", range_type (0, lclNumRows),
4655  [=] (const LO lclRow, size_t& numDups) {
4656  size_t rowBegin = rowBegins_(lclRow);
4657  size_t rowLen = rowLengths_(lclRow);
4658  LO* cols = cols_.data() + rowBegin;
4659  impl_scalar_type* vals = vals_.data() + rowBegin;
4660  if (! sorted) {
4661  sort2 (cols, cols + rowLen, vals);
4662  }
4663  if (! merged) {
4664  size_t newRowLength = mergeRowIndicesAndValues (rowLen, cols, vals);
4665  rowLengths_(lclRow) = newRowLength;
4666  numDups += rowLen - newRowLength;
4667  }
4668  }, totalNumDups);
4669  }
4670  if (! sorted) {
4671  graph.indicesAreSorted_ = true; // we just sorted every row
4672  }
4673  if (! merged) {
4674  graph.noRedundancies_ = true; // we just merged every row
4675  }
4676  }
4677  }
4678 
4679  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4680  void
4684  Scalar alpha,
4685  Scalar beta) const
4686  {
4688  using Teuchos::RCP;
4689  using Teuchos::rcp;
4690  using Teuchos::rcp_const_cast;
4691  using Teuchos::rcpFromRef;
4692  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4693  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
4694 
4695  // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4696  // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4697  // failing only for the Kokkos refactor version of Tpetra. It's a
4698  // good idea regardless to have the bypass.
4699  if (alpha == ZERO) {
4700  if (beta == ZERO) {
4701  Y_in.putScalar (ZERO);
4702  } else if (beta != ONE) {
4703  Y_in.scale (beta);
4704  }
4705  return;
4706  }
4707 
4708  // It's possible that X is a view of Y or vice versa. We don't
4709  // allow this (apply() requires that X and Y not alias one
4710  // another), but it's helpful to detect and work around this case.
4711  // We don't try to to detect the more subtle cases (e.g., one is a
4712  // subview of the other, but their initial pointers differ). We
4713  // only need to do this if this matrix's Import is trivial;
4714  // otherwise, we don't actually apply the operator from X into Y.
4715 
4716  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4717  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4718 
4719  // If beta == 0, then the output MV will be overwritten; none of
4720  // its entries should be read. (Sparse BLAS semantics say that we
4721  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4722  // This matters if we need to do an Export operation; see below.
4723  const bool Y_is_overwritten = (beta == ZERO);
4724 
4725  // We treat the case of a replicated MV output specially.
4726  const bool Y_is_replicated =
4727  (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4728 
4729  // This is part of the special case for replicated MV output.
4730  // We'll let each process do its thing, but do an all-reduce at
4731  // the end to sum up the results. Setting beta=0 on all processes
4732  // but Proc 0 makes the math work out for the all-reduce. (This
4733  // assumes that the replicated data is correctly replicated, so
4734  // that the data are the same on all processes.)
4735  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4736  beta = ZERO;
4737  }
4738 
4739  // Temporary MV for Import operation. After the block of code
4740  // below, this will be an (Imported if necessary) column Map MV
4741  // ready to give to localApply(...).
4742  RCP<const MV> X_colMap;
4743  if (importer.is_null ()) {
4744  if (! X_in.isConstantStride ()) {
4745  // Not all sparse mat-vec kernels can handle an input MV with
4746  // nonconstant stride correctly, so we have to copy it in that
4747  // case into a constant stride MV. To make a constant stride
4748  // copy of X_in, we force creation of the column (== domain)
4749  // Map MV (if it hasn't already been created, else fetch the
4750  // cached copy). This avoids creating a new MV each time.
4751  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
4752  Tpetra::deep_copy (*X_colMapNonConst, X_in);
4753  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4754  }
4755  else {
4756  // The domain and column Maps are the same, so do the local
4757  // multiply using the domain Map input MV X_in.
4758  X_colMap = rcpFromRef (X_in);
4759  }
4760  }
4761  else { // need to Import source (multi)vector
4762  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
4763 
4764  // We're doing an Import anyway, which will copy the relevant
4765  // elements of the domain Map MV X_in into a separate column Map
4766  // MV. Thus, we don't have to worry whether X_in is constant
4767  // stride.
4768  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
4769 
4770  // Import from the domain Map MV to the column Map MV.
4771  X_colMapNonConst->doImport (X_in, *importer, INSERT);
4772  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4773  }
4774 
4775  // Temporary MV for doExport (if needed), or for copying a
4776  // nonconstant stride output MV into a constant stride MV. This
4777  // is null if we don't need the temporary MV, that is, if the
4778  // Export is trivial (null).
4779  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
4780 
4781  // If we have a nontrivial Export object, we must perform an
4782  // Export. In that case, the local multiply result will go into
4783  // the row Map multivector. We don't have to make a
4784  // constant-stride version of Y_in in this case, because we had to
4785  // make a constant stride Y_rowMap MV and do an Export anyway.
4786  if (! exporter.is_null ()) {
4787  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4788  {
4789  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
4790 
4791  // If we're overwriting the output MV Y_in completely (beta ==
4792  // 0), then make sure that it is filled with zeros before we
4793  // do the Export. Otherwise, the ADD combine mode will use
4794  // data in Y_in, which is supposed to be zero.
4795  if (Y_is_overwritten) {
4796  Y_in.putScalar (ZERO);
4797  }
4798  else {
4799  // Scale output MV by beta, so that doExport sums in the
4800  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4801  Y_in.scale (beta);
4802  }
4803  // Do the Export operation.
4804  Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
4805  }
4806  }
4807  else { // Don't do an Export: row Map and range Map are the same.
4808  //
4809  // If Y_in does not have constant stride, or if the column Map
4810  // MV aliases Y_in, then we can't let the kernel write directly
4811  // to Y_in. Instead, we have to use the cached row (== range)
4812  // Map MV as temporary storage.
4813  //
4814  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4815  // the user passed in the same MultiVector for both X and Y. It
4816  // won't detect whether one MultiVector views the other. We
4817  // should also check the MultiVectors' raw data pointers.
4818  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
4819  // Force creating the MV if it hasn't been created already.
4820  // This will reuse a previously created cached MV.
4821  Y_rowMap = getRowMapMultiVector (Y_in, true);
4822 
4823  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4824  // since we're overwriting it anyway.
4825  if (beta != ZERO) {
4826  Tpetra::deep_copy (*Y_rowMap, Y_in);
4827  }
4828  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
4829  Tpetra::deep_copy (Y_in, *Y_rowMap);
4830  }
4831  else {
4832  this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4833  }
4834  }
4835 
4836  // If the range Map is a locally replicated Map, sum up
4837  // contributions from each process. We set beta = 0 on all
4838  // processes but Proc 0 initially, so this will handle the scaling
4839  // factor beta correctly.
4840  if (Y_is_replicated) {
4841  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
4842  Y_in.reduce ();
4843  }
4844  }
4845 
4846  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4847  void
4851  const Teuchos::ETransp mode,
4852  Scalar alpha,
4853  Scalar beta) const
4854  {
4856  using Teuchos::null;
4857  using Teuchos::RCP;
4858  using Teuchos::rcp;
4859  using Teuchos::rcp_const_cast;
4860  using Teuchos::rcpFromRef;
4861  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4862 
4863  // Take shortcuts for alpha == 0.
4864  if (alpha == ZERO) {
4865  // Follow the Sparse BLAS convention by ignoring both the matrix
4866  // and X_in, in this case.
4867  if (beta == ZERO) {
4868  // Follow the Sparse BLAS convention by overwriting any Inf or
4869  // NaN values in Y_in, in this case.
4870  Y_in.putScalar (ZERO);
4871  }
4872  else {
4873  Y_in.scale (beta);
4874  }
4875  return;
4876  }
4877  else if (beta == ZERO) {
4878  //Thyra was implicitly assuming that Y gets set to zero / or is overwritten
4879  //when bets==0. This was not the case with transpose in a multithreaded
4880  //environment where a multiplication with subsequent atomic_adds is used
4881  //since 0 is effectively not special cased. Doing the explicit set to zero here
4882  //This catches cases where Y is nan or inf.
4883  Y_in.putScalar (ZERO);
4884  }
4885 
4886  const size_t numVectors = X_in.getNumVectors ();
4887 
4888  // We don't allow X_in and Y_in to alias one another. It's hard
4889  // to check this, because advanced users could create views from
4890  // raw pointers. However, if X_in and Y_in reference the same
4891  // object, we will do the user a favor by copying X into new
4892  // storage (with a warning). We only need to do this if we have
4893  // trivial importers; otherwise, we don't actually apply the
4894  // operator from X into Y.
4895  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4896  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4897  // access X indirectly, in case we need to create temporary storage
4898  RCP<const MV> X;
4899 
4900  // some parameters for below
4901  const bool Y_is_replicated = (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4902  const bool Y_is_overwritten = (beta == ZERO);
4903  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4904  beta = ZERO;
4905  }
4906 
4907  // The kernels do not allow input or output with nonconstant stride.
4908  if (! X_in.isConstantStride () && importer.is_null ()) {
4909  X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4910  } else {
4911  X = rcpFromRef (X_in); // Reference to X_in
4912  }
4913 
4914  // Set up temporary multivectors for Import and/or Export.
4915  if (importer != Teuchos::null) {
4916  if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4917  importMV_ = null;
4918  }
4919  if (importMV_ == null) {
4920  importMV_ = rcp (new MV (this->getColMap (), numVectors));
4921  }
4922  }
4923  if (exporter != Teuchos::null) {
4924  if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4925  exportMV_ = null;
4926  }
4927  if (exportMV_ == null) {
4928  exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
4929  }
4930  }
4931 
4932  // If we have a non-trivial exporter, we must import elements that
4933  // are permuted or are on other processors.
4934  if (! exporter.is_null ()) {
4935  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
4936  exportMV_->doImport (X_in, *exporter, INSERT);
4937  X = exportMV_; // multiply out of exportMV_
4938  }
4939 
4940  // If we have a non-trivial importer, we must export elements that
4941  // are permuted or belong to other processors. We will compute
4942  // solution into the to-be-exported MV; get a view.
4943  if (importer != Teuchos::null) {
4944  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
4945 
4946  // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4947  // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4948  // back and figure out why this helps. importMV_ SHOULD be
4949  // completely overwritten in the localApply(...) call
4950  // below, because beta == ZERO there.
4951  importMV_->putScalar (ZERO);
4952  // Do the local computation.
4953  this->localApply (*X, *importMV_, mode, alpha, ZERO);
4954 
4955  if (Y_is_overwritten) {
4956  Y_in.putScalar (ZERO);
4957  } else {
4958  Y_in.scale (beta);
4959  }
4960  Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
4961  }
4962  // otherwise, multiply into Y
4963  else {
4964  // can't multiply in-situ; can't multiply into non-strided multivector
4965  //
4966  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4967  // the user passed in the same MultiVector for both X and Y. It
4968  // won't detect whether one MultiVector views the other. We
4969  // should also check the MultiVectors' raw data pointers.
4970  if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
4971  // Make a deep copy of Y_in, into which to write the multiply result.
4972  MV Y (Y_in, Teuchos::Copy);
4973  this->localApply (*X, Y, mode, alpha, beta);
4974  Tpetra::deep_copy (Y_in, Y);
4975  } else {
4976  this->localApply (*X, Y_in, mode, alpha, beta);
4977  }
4978  }
4979 
4980  // If the range Map is a locally replicated map, sum the
4981  // contributions from each process. (That's why we set beta=0
4982  // above for all processes but Proc 0.)
4983  if (Y_is_replicated) {
4984  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
4985  Y_in.reduce ();
4986  }
4987  }
4988 
4989  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4990  void
4994  const Teuchos::ETransp mode,
4995  const Scalar& alpha,
4996  const Scalar& beta) const
4997  {
4999  using Teuchos::NO_TRANS;
5000  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
5001 
5002  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
5003  auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
5004 
5005  const bool debug = ::Tpetra::Details::Behavior::debug ();
5006  if (debug) {
5007  const char tfecfFuncName[] = "localApply: ";
5008  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5009  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5010  "X.getNumVectors() = " << X.getNumVectors () << " != "
5011  "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5012  const bool transpose = (mode != Teuchos::NO_TRANS);
5013  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5014  (! transpose && X.getLocalLength () !=
5015  getColMap ()->getLocalNumElements (), std::runtime_error,
5016  "NO_TRANS case: X has the wrong number of local rows. "
5017  "X.getLocalLength() = " << X.getLocalLength () << " != "
5018  "getColMap()->getLocalNumElements() = " <<
5019  getColMap ()->getLocalNumElements () << ".");
5020  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5021  (! transpose && Y.getLocalLength () !=
5022  getRowMap ()->getLocalNumElements (), std::runtime_error,
5023  "NO_TRANS case: Y has the wrong number of local rows. "
5024  "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5025  "getRowMap()->getLocalNumElements() = " <<
5026  getRowMap ()->getLocalNumElements () << ".");
5027  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5028  (transpose && X.getLocalLength () !=
5029  getRowMap ()->getLocalNumElements (), std::runtime_error,
5030  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5031  "rows. X.getLocalLength() = " << X.getLocalLength ()
5032  << " != getRowMap()->getLocalNumElements() = "
5033  << getRowMap ()->getLocalNumElements () << ".");
5034  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5035  (transpose && Y.getLocalLength () !=
5036  getColMap ()->getLocalNumElements (), std::runtime_error,
5037  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5038  "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5039  << " != getColMap()->getLocalNumElements() = "
5040  << getColMap ()->getLocalNumElements () << ".");
5041  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5042  (! isFillComplete (), std::runtime_error, "The matrix is not "
5043  "fill complete. You must call fillComplete() (possibly with "
5044  "domain and range Map arguments) without an intervening "
5045  "resumeFill() call before you may call this method.");
5046  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5047  (! X.isConstantStride () || ! Y.isConstantStride (),
5048  std::runtime_error, "X and Y must be constant stride.");
5049  // If the two pointers are null, then they don't alias one
5050  // another, even though they are equal.
5051  // Kokkos does not guarantee that zero row-extent vectors
5052  // point to different places, so we have to check that too.
5053  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5054  (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr
5055  && X_lcl.extent(0) != 0,
5056  std::runtime_error, "X and Y may not alias one another.");
5057  }
5058 
5059 #if KOKKOSKERNELS_VERSION >= 40299
5060  auto A_lcl = getLocalMatrixDevice();
5061 
5062  if(!applyHelper.get()) {
5063  // The apply helper does not exist, so create it.
5064  // Decide now whether to use the imbalanced row path, or the default.
5065  bool useMergePath = false;
5066 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
5067  //TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed and,
5068  //we can use SPMV_MERGE_PATH for the native spmv as well.
5069  //Take out this ifdef to enable that.
5070  //
5071  //Until then, only use SPMV_MERGE_PATH when calling cuSPARSE.
5072  if constexpr(std::is_same_v<execution_space, Kokkos::Cuda>) {
5073  LocalOrdinal nrows = getLocalNumRows();
5074  LocalOrdinal maxRowImbalance = 0;
5075  if(nrows != 0)
5076  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5077 
5078  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5079  useMergePath = true;
5080  }
5081 #endif
5082  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map,
5083  useMergePath ? KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT);
5084  }
5085 
5086  // Translate mode (Teuchos enum) to KokkosKernels (1-character string)
5087  const char* modeKK = nullptr;
5088  switch(mode)
5089  {
5090  case Teuchos::NO_TRANS:
5091  modeKK = KokkosSparse::NoTranspose; break;
5092  case Teuchos::TRANS:
5093  modeKK = KokkosSparse::Transpose; break;
5094  case Teuchos::CONJ_TRANS:
5095  modeKK = KokkosSparse::ConjugateTranspose; break;
5096  default:
5097  throw std::invalid_argument("Tpetra::CrsMatrix::localApply: invalid mode");
5098  }
5099 
5100  if(applyHelper->shouldUseIntRowptrs())
5101  {
5102  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
5103  KokkosSparse::spmv(
5104  &applyHelper->handle_int, modeKK,
5105  impl_scalar_type(alpha), A_lcl_int_rowptrs, X_lcl, impl_scalar_type(beta), Y_lcl);
5106  }
5107  else
5108  {
5109  KokkosSparse::spmv(
5110  &applyHelper->handle, modeKK,
5111  impl_scalar_type(alpha), A_lcl, X_lcl, impl_scalar_type(beta), Y_lcl);
5112  }
5113 #else
5114  LocalOrdinal nrows = getLocalNumRows();
5115  LocalOrdinal maxRowImbalance = 0;
5116  if(nrows != 0)
5117  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5118 
5119  auto matrix_lcl = getLocalMultiplyOperator();
5120  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5121  matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
5122  else
5123  matrix_lcl->apply (X_lcl, Y_lcl, mode, alpha, beta);
5124 #endif
5125  }
5126 
5127  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5128  void
5132  Teuchos::ETransp mode,
5133  Scalar alpha,
5134  Scalar beta) const
5135  {
5137  const char fnName[] = "Tpetra::CrsMatrix::apply";
5138 
5139  TEUCHOS_TEST_FOR_EXCEPTION
5140  (! isFillComplete (), std::runtime_error,
5141  fnName << ": Cannot call apply() until fillComplete() "
5142  "has been called.");
5143 
5144  if (mode == Teuchos::NO_TRANS) {
5145  ProfilingRegion regionNonTranspose (fnName);
5146  this->applyNonTranspose (X, Y, alpha, beta);
5147  }
5148  else {
5149  ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5150  this->applyTranspose (X, Y, mode, alpha, beta);
5151  }
5152  }
5153 
5154 
5155  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5156  template<class T>
5157  Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
5159  convert () const
5160  {
5161  using Teuchos::RCP;
5162  typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
5163  const char tfecfFuncName[] = "convert: ";
5164 
5165  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5166  (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
5167  "of the conversion) is not fill complete. You must first call "
5168  "fillComplete() (possibly with the domain and range Map) without an "
5169  "intervening call to resumeFill(), before you may call this method.");
5170 
5171  RCP<output_matrix_type> newMatrix
5172  (new output_matrix_type (this->getCrsGraph ()));
5173  // Copy old values into new values. impl_scalar_type and T may
5174  // differ, so we can't use Kokkos::deep_copy.
5176  copyConvert (newMatrix->getLocalMatrixDevice ().values,
5177  this->getLocalMatrixDevice ().values);
5178  // Since newmat has a static (const) graph, the graph already has
5179  // a column Map, and Import and Export objects already exist (if
5180  // applicable). Thus, calling fillComplete is cheap.
5181  newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
5182 
5183  return newMatrix;
5184  }
5185 
5186 
5187  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5188  void
5191  {
5192  const bool debug = ::Tpetra::Details::Behavior::debug ("CrsGraph");
5193  if (debug) {
5194  const char tfecfFuncName[] = "checkInternalState: ";
5195  const char err[] = "Internal state is not consistent. "
5196  "Please report this bug to the Tpetra developers.";
5197 
5198  // This version of the graph (RCP<const crs_graph_type>) must
5199  // always be nonnull.
5200  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5201  (staticGraph_.is_null (), std::logic_error, err);
5202  // myGraph == null means that the matrix has a const ("static")
5203  // graph. Otherwise, the matrix has a dynamic graph (it owns its
5204  // graph).
5205  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5206  (! myGraph_.is_null () && myGraph_ != staticGraph_,
5207  std::logic_error, err);
5208  // if matrix is fill complete, then graph must be fill complete
5209  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5210  (isFillComplete () && ! staticGraph_->isFillComplete (),
5211  std::logic_error, err << " Specifically, the matrix is fill complete, "
5212  "but its graph is NOT fill complete.");
5213  // if values are allocated and they are non-zero in number, then
5214  // one of the allocations should be present
5215  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5216  (staticGraph_->indicesAreAllocated () &&
5217  staticGraph_->getLocalAllocationSize() > 0 &&
5218  staticGraph_->getLocalNumRows() > 0 &&
5219  valuesUnpacked_wdv.extent (0) == 0,
5220  std::logic_error, err);
5221  }
5222  }
5223 
5224  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5225  std::string
5228  {
5229  std::ostringstream os;
5230 
5231  os << "Tpetra::CrsMatrix (Kokkos refactor): {";
5232  if (this->getObjectLabel () != "") {
5233  os << "Label: \"" << this->getObjectLabel () << "\", ";
5234  }
5235  if (isFillComplete ()) {
5236  os << "isFillComplete: true"
5237  << ", global dimensions: [" << getGlobalNumRows () << ", "
5238  << getGlobalNumCols () << "]"
5239  << ", global number of entries: " << getGlobalNumEntries ()
5240  << "}";
5241  }
5242  else {
5243  os << "isFillComplete: false"
5244  << ", global dimensions: [" << getGlobalNumRows () << ", "
5245  << getGlobalNumCols () << "]}";
5246  }
5247  return os.str ();
5248  }
5249 
5250  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5251  void
5253  describe (Teuchos::FancyOStream &out,
5254  const Teuchos::EVerbosityLevel verbLevel) const
5255  {
5256  using std::endl;
5257  using std::setw;
5258  using Teuchos::ArrayView;
5259  using Teuchos::Comm;
5260  using Teuchos::RCP;
5261  using Teuchos::TypeNameTraits;
5262  using Teuchos::VERB_DEFAULT;
5263  using Teuchos::VERB_NONE;
5264  using Teuchos::VERB_LOW;
5265  using Teuchos::VERB_MEDIUM;
5266  using Teuchos::VERB_HIGH;
5267  using Teuchos::VERB_EXTREME;
5268 
5269  const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
5270 
5271  if (vl == VERB_NONE) {
5272  return; // Don't print anything at all
5273  }
5274 
5275  // By convention, describe() always begins with a tab.
5276  Teuchos::OSTab tab0 (out);
5277 
5278  RCP<const Comm<int> > comm = this->getComm();
5279  const int myRank = comm->getRank();
5280  const int numProcs = comm->getSize();
5281  size_t width = 1;
5282  for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
5283  ++width;
5284  }
5285  width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
5286 
5287  // none: print nothing
5288  // low: print O(1) info from node 0
5289  // medium: print O(P) info, num entries per process
5290  // high: print O(N) info, num entries per row
5291  // extreme: print O(NNZ) info: print indices and values
5292  //
5293  // for medium and higher, print constituent objects at specified verbLevel
5294  if (myRank == 0) {
5295  out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5296  }
5297  Teuchos::OSTab tab1 (out);
5298 
5299  if (myRank == 0) {
5300  if (this->getObjectLabel () != "") {
5301  out << "Label: \"" << this->getObjectLabel () << "\", ";
5302  }
5303  {
5304  out << "Template parameters:" << endl;
5305  Teuchos::OSTab tab2 (out);
5306  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
5307  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
5308  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
5309  << "Node: " << TypeNameTraits<Node>::name () << endl;
5310  }
5311  if (isFillComplete()) {
5312  out << "isFillComplete: true" << endl
5313  << "Global dimensions: [" << getGlobalNumRows () << ", "
5314  << getGlobalNumCols () << "]" << endl
5315  << "Global number of entries: " << getGlobalNumEntries () << endl
5316  << endl << "Global max number of entries in a row: "
5317  << getGlobalMaxNumRowEntries () << endl;
5318  }
5319  else {
5320  out << "isFillComplete: false" << endl
5321  << "Global dimensions: [" << getGlobalNumRows () << ", "
5322  << getGlobalNumCols () << "]" << endl;
5323  }
5324  }
5325 
5326  if (vl < VERB_MEDIUM) {
5327  return; // all done!
5328  }
5329 
5330  // Describe the row Map.
5331  if (myRank == 0) {
5332  out << endl << "Row Map:" << endl;
5333  }
5334  if (getRowMap ().is_null ()) {
5335  if (myRank == 0) {
5336  out << "null" << endl;
5337  }
5338  }
5339  else {
5340  if (myRank == 0) {
5341  out << endl;
5342  }
5343  getRowMap ()->describe (out, vl);
5344  }
5345 
5346  // Describe the column Map.
5347  if (myRank == 0) {
5348  out << "Column Map: ";
5349  }
5350  if (getColMap ().is_null ()) {
5351  if (myRank == 0) {
5352  out << "null" << endl;
5353  }
5354  } else if (getColMap () == getRowMap ()) {
5355  if (myRank == 0) {
5356  out << "same as row Map" << endl;
5357  }
5358  } else {
5359  if (myRank == 0) {
5360  out << endl;
5361  }
5362  getColMap ()->describe (out, vl);
5363  }
5364 
5365  // Describe the domain Map.
5366  if (myRank == 0) {
5367  out << "Domain Map: ";
5368  }
5369  if (getDomainMap ().is_null ()) {
5370  if (myRank == 0) {
5371  out << "null" << endl;
5372  }
5373  } else if (getDomainMap () == getRowMap ()) {
5374  if (myRank == 0) {
5375  out << "same as row Map" << endl;
5376  }
5377  } else if (getDomainMap () == getColMap ()) {
5378  if (myRank == 0) {
5379  out << "same as column Map" << endl;
5380  }
5381  } else {
5382  if (myRank == 0) {
5383  out << endl;
5384  }
5385  getDomainMap ()->describe (out, vl);
5386  }
5387 
5388  // Describe the range Map.
5389  if (myRank == 0) {
5390  out << "Range Map: ";
5391  }
5392  if (getRangeMap ().is_null ()) {
5393  if (myRank == 0) {
5394  out << "null" << endl;
5395  }
5396  } else if (getRangeMap () == getDomainMap ()) {
5397  if (myRank == 0) {
5398  out << "same as domain Map" << endl;
5399  }
5400  } else if (getRangeMap () == getRowMap ()) {
5401  if (myRank == 0) {
5402  out << "same as row Map" << endl;
5403  }
5404  } else {
5405  if (myRank == 0) {
5406  out << endl;
5407  }
5408  getRangeMap ()->describe (out, vl);
5409  }
5410 
5411  // O(P) data
5412  for (int curRank = 0; curRank < numProcs; ++curRank) {
5413  if (myRank == curRank) {
5414  out << "Process rank: " << curRank << endl;
5415  Teuchos::OSTab tab2 (out);
5416  if (! staticGraph_->indicesAreAllocated ()) {
5417  out << "Graph indices not allocated" << endl;
5418  }
5419  else {
5420  out << "Number of allocated entries: "
5421  << staticGraph_->getLocalAllocationSize () << endl;
5422  }
5423  out << "Number of entries: " << getLocalNumEntries () << endl
5424  << "Max number of entries per row: " << getLocalMaxNumRowEntries ()
5425  << endl;
5426  }
5427  // Give output time to complete by executing some barriers.
5428  comm->barrier ();
5429  comm->barrier ();
5430  comm->barrier ();
5431  }
5432 
5433  if (vl < VERB_HIGH) {
5434  return; // all done!
5435  }
5436 
5437  // O(N) and O(NNZ) data
5438  for (int curRank = 0; curRank < numProcs; ++curRank) {
5439  if (myRank == curRank) {
5440  out << std::setw(width) << "Proc Rank"
5441  << std::setw(width) << "Global Row"
5442  << std::setw(width) << "Num Entries";
5443  if (vl == VERB_EXTREME) {
5444  out << std::setw(width) << "(Index,Value)";
5445  }
5446  out << endl;
5447  for (size_t r = 0; r < getLocalNumRows (); ++r) {
5448  const size_t nE = getNumEntriesInLocalRow(r);
5449  GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5450  out << std::setw(width) << myRank
5451  << std::setw(width) << gid
5452  << std::setw(width) << nE;
5453  if (vl == VERB_EXTREME) {
5454  if (isGloballyIndexed()) {
5455  global_inds_host_view_type rowinds;
5456  values_host_view_type rowvals;
5457  getGlobalRowView (gid, rowinds, rowvals);
5458  for (size_t j = 0; j < nE; ++j) {
5459  out << " (" << rowinds[j]
5460  << ", " << rowvals[j]
5461  << ") ";
5462  }
5463  }
5464  else if (isLocallyIndexed()) {
5465  local_inds_host_view_type rowinds;
5466  values_host_view_type rowvals;
5467  getLocalRowView (r, rowinds, rowvals);
5468  for (size_t j=0; j < nE; ++j) {
5469  out << " (" << getColMap()->getGlobalElement(rowinds[j])
5470  << ", " << rowvals[j]
5471  << ") ";
5472  }
5473  } // globally or locally indexed
5474  } // vl == VERB_EXTREME
5475  out << endl;
5476  } // for each row r on this process
5477  } // if (myRank == curRank)
5478 
5479  // Give output time to complete
5480  comm->barrier ();
5481  comm->barrier ();
5482  comm->barrier ();
5483  } // for each process p
5484  }
5485 
5486  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5487  bool
5490  {
5491  // It's not clear what kind of compatibility checks on sizes can
5492  // be performed here. Epetra_CrsGraph doesn't check any sizes for
5493  // compatibility.
5494 
5495  // Currently, the source object must be a RowMatrix with the same
5496  // four template parameters as the target CrsMatrix. We might
5497  // relax this requirement later.
5498  const row_matrix_type* srcRowMat =
5499  dynamic_cast<const row_matrix_type*> (&source);
5500  return (srcRowMat != nullptr);
5501  }
5502 
5503  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5504  void
5507  const typename crs_graph_type::padding_type& padding,
5508  const bool verbose)
5509  {
5511  using Details::padCrsArrays;
5512  using std::endl;
5513  using LO = local_ordinal_type;
5514  using row_ptrs_type =
5515  typename local_graph_device_type::row_map_type::non_const_type;
5516  using range_policy =
5517  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5518  const char tfecfFuncName[] = "applyCrsPadding";
5519  const char suffix[] =
5520  ". Please report this bug to the Tpetra developers.";
5521  ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5522 
5523  std::unique_ptr<std::string> prefix;
5524  if (verbose) {
5525  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5526  std::ostringstream os;
5527  os << *prefix << "padding: ";
5528  padding.print(os);
5529  os << endl;
5530  std::cerr << os.str();
5531  }
5532  const int myRank = ! verbose ? -1 : [&] () {
5533  auto map = this->getMap();
5534  if (map.is_null()) {
5535  return -1;
5536  }
5537  auto comm = map->getComm();
5538  if (comm.is_null()) {
5539  return -1;
5540  }
5541  return comm->getRank();
5542  } ();
5543 
5544  // NOTE (mfh 29 Jan 2020) This allocates the values array.
5545  if (! myGraph_->indicesAreAllocated()) {
5546  if (verbose) {
5547  std::ostringstream os;
5548  os << *prefix << "Call allocateIndices" << endl;
5549  std::cerr << os.str();
5550  }
5551  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5552  }
5553 
5554  // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5555  // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5556  // size needs to increase. That should be the job of
5557  // padCrsArrays.
5558 
5559  // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5560  // would use it directly.
5561 
5562  if (verbose) {
5563  std::ostringstream os;
5564  os << *prefix << "Allocate row_ptrs_beg: "
5565  << myGraph_->getRowPtrsUnpackedHost().extent(0) << endl;
5566  std::cerr << os.str();
5567  }
5568  using Kokkos::view_alloc;
5569  using Kokkos::WithoutInitializing;
5570  row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5571  myGraph_->rowPtrsUnpacked_dev_.extent(0));
5572  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5573  Kokkos::deep_copy(execution_space(),row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5574 
5575  const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
5576  size_t(row_ptr_beg.extent(0) - 1);
5577  if (verbose) {
5578  std::ostringstream os;
5579  os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5580  std::cerr << os.str();
5581  }
5582  row_ptrs_type row_ptr_end(
5583  view_alloc("row_ptr_end", WithoutInitializing), N);
5584 
5585  row_ptrs_type num_row_entries_d;
5586 
5587  const bool refill_num_row_entries =
5588  myGraph_->k_numRowEntries_.extent(0) != 0;
5589 
5590  if (refill_num_row_entries) { // unpacked storage
5591  // We can't assume correct *this capture until C++17, and it's
5592  // likely more efficient just to capture what we need anyway.
5593  num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5594  myGraph_->k_numRowEntries_);
5595  Kokkos::parallel_for
5596  ("Fill end row pointers", range_policy(0, N),
5597  KOKKOS_LAMBDA (const size_t i) {
5598  row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5599  });
5600  }
5601  else {
5602  // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5603  // storage, we don't need row_ptr_end to be separate allocation;
5604  // could just have it alias row_ptr_beg+1.
5605  Kokkos::parallel_for
5606  ("Fill end row pointers", range_policy(0, N),
5607  KOKKOS_LAMBDA (const size_t i) {
5608  row_ptr_end(i) = row_ptr_beg(i+1);
5609  });
5610  }
5611 
5612  if (myGraph_->isGloballyIndexed()) {
5613  padCrsArrays(row_ptr_beg, row_ptr_end,
5614  myGraph_->gblInds_wdv,
5615  valuesUnpacked_wdv, padding, myRank, verbose);
5616  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5617  const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5618  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5619  (newValuesLen != newColIndsLen, std::logic_error,
5620  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5621  << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5622  << suffix);
5623  }
5624  else {
5625  padCrsArrays(row_ptr_beg, row_ptr_end,
5626  myGraph_->lclIndsUnpacked_wdv,
5627  valuesUnpacked_wdv, padding, myRank, verbose);
5628  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5629  const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5630  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5631  (newValuesLen != newColIndsLen, std::logic_error,
5632  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5633  << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5634  << suffix);
5635  }
5636 
5637  if (refill_num_row_entries) {
5638  Kokkos::parallel_for
5639  ("Fill num entries", range_policy(0, N),
5640  KOKKOS_LAMBDA (const size_t i) {
5641  num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5642  });
5643  Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5644  }
5645 
5646  if (verbose) {
5647  std::ostringstream os;
5648  os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5649  << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5650  << ", new size: " << row_ptr_beg.extent(0) << endl;
5651  std::cerr << os.str();
5652  TEUCHOS_ASSERT( myGraph_->getRowPtrsUnpackedHost().extent(0) ==
5653  row_ptr_beg.extent(0) );
5654  }
5655  myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5656  }
5657 
5658  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5659  void
5660  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5661  copyAndPermuteStaticGraph(
5662  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5663  const size_t numSameIDs,
5664  const LocalOrdinal permuteToLIDs[],
5665  const LocalOrdinal permuteFromLIDs[],
5666  const size_t numPermutes)
5667  {
5668  using Details::ProfilingRegion;
5669  using Teuchos::Array;
5670  using Teuchos::ArrayView;
5671  using std::endl;
5672  using LO = LocalOrdinal;
5673  using GO = GlobalOrdinal;
5674  const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5675  const char suffix[] =
5676  " Please report this bug to the Tpetra developers.";
5677  ProfilingRegion regionCAP
5678  ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5679 
5680  const bool debug = Details::Behavior::debug("CrsGraph");
5681  const bool verbose = Details::Behavior::verbose("CrsGraph");
5682  std::unique_ptr<std::string> prefix;
5683  if (verbose) {
5684  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5685  std::ostringstream os;
5686  os << *prefix << "Start" << endl;
5687  }
5688  const char* const prefix_raw =
5689  verbose ? prefix.get()->c_str() : nullptr;
5690 
5691  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5692  //
5693  // Copy the first numSame row from source to target (this matrix).
5694  // This involves copying rows corresponding to LIDs [0, numSame-1].
5695  //
5696  const map_type& srcRowMap = * (srcMat.getRowMap ());
5697  nonconst_global_inds_host_view_type rowInds;
5698  nonconst_values_host_view_type rowVals;
5699  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5700  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5701  // Global ID for the current row index in the source matrix.
5702  // The first numSameIDs GIDs in the two input lists are the
5703  // same, so sourceGID == targetGID in this case.
5704  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5705  const GO targetGID = sourceGID;
5706 
5707  ArrayView<const GO>rowIndsConstView;
5708  ArrayView<const Scalar> rowValsConstView;
5709 
5710  if (sourceIsLocallyIndexed) {
5711  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5712  if (rowLength > static_cast<size_t> (rowInds.size())) {
5713  Kokkos::resize(rowInds,rowLength);
5714  Kokkos::resize(rowVals,rowLength);
5715  }
5716  // Resizing invalidates an Array's views, so we must make new
5717  // ones, even if rowLength hasn't changed.
5718  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5719  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5720 
5721  // The source matrix is locally indexed, so we have to get a
5722  // copy. Really it's the GIDs that have to be copied (because
5723  // they have to be converted from LIDs).
5724  size_t checkRowLength = 0;
5725  srcMat.getGlobalRowCopy (sourceGID, rowIndsView,
5726  rowValsView, checkRowLength);
5727  if (debug) {
5728  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5729  (rowLength != checkRowLength, std::logic_error, "For "
5730  "global row index " << sourceGID << ", the source "
5731  "matrix's getNumEntriesInGlobalRow returns a row length "
5732  "of " << rowLength << ", but getGlobalRowCopy reports "
5733  "a row length of " << checkRowLength << "." << suffix);
5734  }
5735 
5736  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5737  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5738  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5739  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5740  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5741  rowIndsView.data(), rowIndsView.extent(0),
5742  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5743  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5744  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5745  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5746  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5747  // KDDKDD UVM TEMPORARY: KokkosView interface
5748  }
5749  else { // source matrix is globally indexed.
5750  global_inds_host_view_type rowIndsView;
5751  values_host_view_type rowValsView;
5752  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5753  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5754  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5755  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5756  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5757  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5758  rowIndsView.data(), rowIndsView.extent(0),
5759  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5760  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5761  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5762  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5763  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5764  // KDDKDD UVM TEMPORARY: KokkosView interface
5765 
5766  }
5767 
5768  // Applying a permutation to a matrix with a static graph
5769  // means REPLACE-ing entries.
5770  combineGlobalValues(targetGID, rowIndsConstView,
5771  rowValsConstView, REPLACE,
5772  prefix_raw, debug, verbose);
5773  }
5774 
5775  if (verbose) {
5776  std::ostringstream os;
5777  os << *prefix << "Do permutes" << endl;
5778  }
5779 
5780  const map_type& tgtRowMap = * (this->getRowMap ());
5781  for (size_t p = 0; p < numPermutes; ++p) {
5782  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5783  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5784 
5785  ArrayView<const GO> rowIndsConstView;
5786  ArrayView<const Scalar> rowValsConstView;
5787 
5788  if (sourceIsLocallyIndexed) {
5789  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5790  if (rowLength > static_cast<size_t> (rowInds.size ())) {
5791  Kokkos::resize(rowInds,rowLength);
5792  Kokkos::resize(rowVals,rowLength);
5793  }
5794  // Resizing invalidates an Array's views, so we must make new
5795  // ones, even if rowLength hasn't changed.
5796  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5797  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5798 
5799  // The source matrix is locally indexed, so we have to get a
5800  // copy. Really it's the GIDs that have to be copied (because
5801  // they have to be converted from LIDs).
5802  size_t checkRowLength = 0;
5803  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5804  rowValsView, checkRowLength);
5805  if (debug) {
5806  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5807  (rowLength != checkRowLength, std::logic_error, "For "
5808  "source matrix global row index " << sourceGID << ", "
5809  "getNumEntriesInGlobalRow returns a row length of " <<
5810  rowLength << ", but getGlobalRowCopy a row length of "
5811  << checkRowLength << "." << suffix);
5812  }
5813 
5814  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5815  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5816  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5817  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5818  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5819  rowIndsView.data(), rowIndsView.extent(0),
5820  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5821  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5822  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5823  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5824  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5825  // KDDKDD UVM TEMPORARY: KokkosView interface
5826  }
5827  else {
5828  global_inds_host_view_type rowIndsView;
5829  values_host_view_type rowValsView;
5830  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5831  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5832  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5833  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5834  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5835  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5836  rowIndsView.data(), rowIndsView.extent(0),
5837  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5838  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5839  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5840  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5841  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5842  // KDDKDD UVM TEMPORARY: KokkosView interface
5843  }
5844 
5845  combineGlobalValues(targetGID, rowIndsConstView,
5846  rowValsConstView, REPLACE,
5847  prefix_raw, debug, verbose);
5848  }
5849 
5850  if (verbose) {
5851  std::ostringstream os;
5852  os << *prefix << "Done" << endl;
5853  }
5854  }
5855 
5856  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5857  void
5858  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5859  copyAndPermuteNonStaticGraph(
5860  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5861  const size_t numSameIDs,
5862  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5863  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5864  const size_t numPermutes)
5865  {
5866  using Details::ProfilingRegion;
5867  using Teuchos::Array;
5868  using Teuchos::ArrayView;
5869  using std::endl;
5870  using LO = LocalOrdinal;
5871  using GO = GlobalOrdinal;
5872  const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5873  const char suffix[] =
5874  " Please report this bug to the Tpetra developers.";
5875  ProfilingRegion regionCAP
5876  ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5877 
5878  const bool debug = Details::Behavior::debug("CrsGraph");
5879  const bool verbose = Details::Behavior::verbose("CrsGraph");
5880  std::unique_ptr<std::string> prefix;
5881  if (verbose) {
5882  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5883  std::ostringstream os;
5884  os << *prefix << "Start" << endl;
5885  }
5886  const char* const prefix_raw =
5887  verbose ? prefix.get()->c_str() : nullptr;
5888 
5889  {
5890  using row_graph_type = RowGraph<LO, GO, Node>;
5891  const row_graph_type& srcGraph = *(srcMat.getGraph());
5892  auto padding =
5893  myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5894  permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5895  applyCrsPadding(*padding, verbose);
5896  }
5897  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5898  //
5899  // Copy the first numSame row from source to target (this matrix).
5900  // This involves copying rows corresponding to LIDs [0, numSame-1].
5901  //
5902  const map_type& srcRowMap = * (srcMat.getRowMap ());
5903  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5904  using gids_type = nonconst_global_inds_host_view_type;
5905  using vals_type = nonconst_values_host_view_type;
5906  gids_type rowInds;
5907  vals_type rowVals;
5908  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5909  // Global ID for the current row index in the source matrix.
5910  // The first numSameIDs GIDs in the two input lists are the
5911  // same, so sourceGID == targetGID in this case.
5912  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5913  const GO targetGID = sourceGID;
5914 
5915  ArrayView<const GO> rowIndsConstView;
5916  ArrayView<const Scalar> rowValsConstView;
5917 
5918  if (sourceIsLocallyIndexed) {
5919 
5920  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5921  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5922  Kokkos::resize(rowInds,rowLength);
5923  Kokkos::resize(rowVals,rowLength);
5924  }
5925  // Resizing invalidates an Array's views, so we must make new
5926  // ones, even if rowLength hasn't changed.
5927  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5928  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5929 
5930  // The source matrix is locally indexed, so we have to get a
5931  // copy. Really it's the GIDs that have to be copied (because
5932  // they have to be converted from LIDs).
5933  size_t checkRowLength = 0;
5934  srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView,
5935  checkRowLength);
5936  if (debug) {
5937  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5938  (rowLength != checkRowLength, std::logic_error, ": For "
5939  "global row index " << sourceGID << ", the source "
5940  "matrix's getNumEntriesInGlobalRow returns a row length "
5941  "of " << rowLength << ", but getGlobalRowCopy reports "
5942  "a row length of " << checkRowLength << "." << suffix);
5943  }
5944  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5945  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5946  }
5947  else { // source matrix is globally indexed.
5948  global_inds_host_view_type rowIndsView;
5949  values_host_view_type rowValsView;
5950  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5951 
5952  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5953  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5954  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5955  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5956  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5957  rowIndsView.data(), rowIndsView.extent(0),
5958  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5959  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5960  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5961  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5962  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5963  // KDDKDD UVM TEMPORARY: KokkosView interface
5964  }
5965 
5966  // Combine the data into the target matrix.
5967  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5968  rowValsConstView, prefix_raw, debug, verbose);
5969  }
5970 
5971  if (verbose) {
5972  std::ostringstream os;
5973  os << *prefix << "Do permutes" << endl;
5974  }
5975  const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
5976  const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
5977 
5978  const map_type& tgtRowMap = * (this->getRowMap ());
5979  for (size_t p = 0; p < numPermutes; ++p) {
5980  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5981  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5982 
5983  ArrayView<const GO> rowIndsConstView;
5984  ArrayView<const Scalar> rowValsConstView;
5985 
5986  if (sourceIsLocallyIndexed) {
5987  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5988  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5989  Kokkos::resize(rowInds,rowLength);
5990  Kokkos::resize(rowVals,rowLength);
5991  }
5992  // Resizing invalidates an Array's views, so we must make new
5993  // ones, even if rowLength hasn't changed.
5994  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5995  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5996 
5997  // The source matrix is locally indexed, so we have to get a
5998  // copy. Really it's the GIDs that have to be copied (because
5999  // they have to be converted from LIDs).
6000  size_t checkRowLength = 0;
6001  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
6002  rowValsView, checkRowLength);
6003  if (debug) {
6004  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6005  (rowLength != checkRowLength, std::logic_error, "For "
6006  "source matrix global row index " << sourceGID << ", "
6007  "getNumEntriesInGlobalRow returns a row length of " <<
6008  rowLength << ", but getGlobalRowCopy a row length of "
6009  << checkRowLength << "." << suffix);
6010  }
6011  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
6012  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
6013  }
6014  else {
6015  global_inds_host_view_type rowIndsView;
6016  values_host_view_type rowValsView;
6017  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6018 
6019  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6020  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6021  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6022  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6023  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6024  rowIndsView.data(), rowIndsView.extent(0),
6025  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6026  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6027  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6028  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6029  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6030  // KDDKDD UVM TEMPORARY: KokkosView interface
6031  }
6032 
6033  // Combine the data into the target matrix.
6034  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
6035  rowValsConstView, prefix_raw, debug, verbose);
6036  }
6037 
6038  if (verbose) {
6039  std::ostringstream os;
6040  os << *prefix << "Done" << endl;
6041  }
6042  }
6043 
6044  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6045  void
6048  const SrcDistObject& srcObj,
6049  const size_t numSameIDs,
6050  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6051  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
6052  const CombineMode /*CM*/)
6053  {
6054  using Details::Behavior;
6057  using std::endl;
6058 
6059  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6060  const char tfecfFuncName[] = "copyAndPermute: ";
6061  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
6062 
6063  const bool verbose = Behavior::verbose("CrsMatrix");
6064  std::unique_ptr<std::string> prefix;
6065  if (verbose) {
6066  prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
6067  std::ostringstream os;
6068  os << *prefix << endl
6069  << *prefix << " numSameIDs: " << numSameIDs << endl
6070  << *prefix << " numPermute: " << permuteToLIDs.extent(0)
6071  << endl
6072  << *prefix << " "
6073  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
6074  << endl
6075  << *prefix << " "
6076  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
6077  << endl
6078  << *prefix << " "
6079  << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
6080  << endl;
6081  std::cerr << os.str ();
6082  }
6083 
6084  const auto numPermute = permuteToLIDs.extent (0);
6085  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6086  (numPermute != permuteFromLIDs.extent (0),
6087  std::invalid_argument, "permuteToLIDs.extent(0) = "
6088  << numPermute << "!= permuteFromLIDs.extent(0) = "
6089  << permuteFromLIDs.extent (0) << ".");
6090 
6091  // This dynamic cast should succeed, because we've already tested
6092  // it in checkSizes().
6094  const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
6095  if (isStaticGraph ()) {
6096  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
6097  auto permuteToLIDs_h = permuteToLIDs.view_host ();
6098  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
6099  auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
6100 
6101  copyAndPermuteStaticGraph(srcMat, numSameIDs,
6102  permuteToLIDs_h.data(),
6103  permuteFromLIDs_h.data(),
6104  numPermute);
6105  }
6106  else {
6107  copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
6108  permuteFromLIDs, numPermute);
6109  }
6110 
6111  if (verbose) {
6112  std::ostringstream os;
6113  os << *prefix << "Done" << endl;
6114  std::cerr << os.str();
6115  }
6116  }
6117 
6118  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6119  void
6122  (const SrcDistObject& source,
6123  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6124  Kokkos::DualView<char*, buffer_device_type>& exports,
6125  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6126  size_t& constantNumPackets)
6127  {
6128  using Details::Behavior;
6131  using Teuchos::outArg;
6132  using Teuchos::REDUCE_MAX;
6133  using Teuchos::reduceAll;
6134  using std::endl;
6135  typedef LocalOrdinal LO;
6136  typedef GlobalOrdinal GO;
6137  const char tfecfFuncName[] = "packAndPrepare: ";
6138  ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
6139 
6140  const bool debug = Behavior::debug("CrsMatrix");
6141  const bool verbose = Behavior::verbose("CrsMatrix");
6142 
6143  // Processes on which the communicator is null should not participate.
6144  Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
6145  if (pComm.is_null ()) {
6146  return;
6147  }
6148  const Teuchos::Comm<int>& comm = *pComm;
6149  const int myRank = comm.getSize ();
6150 
6151  std::unique_ptr<std::string> prefix;
6152  if (verbose) {
6153  prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
6154  std::ostringstream os;
6155  os << *prefix << "Start" << endl
6156  << *prefix << " "
6157  << dualViewStatusToString (exportLIDs, "exportLIDs")
6158  << endl
6159  << *prefix << " "
6160  << dualViewStatusToString (exports, "exports")
6161  << endl
6162  << *prefix << " "
6163  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6164  << endl;
6165  std::cerr << os.str ();
6166  }
6167 
6168  // Attempt to cast the source object to CrsMatrix. If successful,
6169  // use the source object's packNew() method to pack its data for
6170  // communication. Otherwise, attempt to cast to RowMatrix; if
6171  // successful, use the source object's pack() method. Otherwise,
6172  // the source object doesn't have the right type.
6173  //
6174  // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
6175  // RowMatrix to have the same Node type. Unfortunately, we don't
6176  // have a way to ask if the RowMatrix is "a RowMatrix with any
6177  // Node type," since RowMatrix doesn't have a base class. A
6178  // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
6179  // not currently exist, would satisfy this requirement.
6180  //
6181  // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
6182  // type doesn't technically need to match the target object's
6183  // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
6184  // and GO need not be the same, as long as there is no overflow of
6185  // the indices. However, checking for index overflow is global
6186  // and therefore undesirable.
6187 
6188  std::ostringstream msg; // for collecting error messages
6189  int lclBad = 0; // to be set below
6190 
6191  using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
6192  const crs_matrix_type* srcCrsMat =
6193  dynamic_cast<const crs_matrix_type*> (&source);
6194  if (srcCrsMat != nullptr) {
6195  if (verbose) {
6196  std::ostringstream os;
6197  os << *prefix << "Source matrix same (CrsMatrix) type as target; "
6198  "calling packNew" << endl;
6199  std::cerr << os.str ();
6200  }
6201  try {
6202  srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
6203  constantNumPackets);
6204  }
6205  catch (std::exception& e) {
6206  lclBad = 1;
6207  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6208  }
6209  }
6210  else {
6211  using Kokkos::HostSpace;
6212  using Kokkos::subview;
6213  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6214  using range_type = Kokkos::pair<size_t, size_t>;
6215 
6216  if (verbose) {
6217  std::ostringstream os;
6218  os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
6219  << endl;
6220  std::cerr << os.str ();
6221  }
6222 
6223  const row_matrix_type* srcRowMat =
6224  dynamic_cast<const row_matrix_type*> (&source);
6225  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6226  (srcRowMat == nullptr, std::invalid_argument,
6227  "The source object of the Import or Export operation is neither a "
6228  "CrsMatrix (with the same template parameters as the target object), "
6229  "nor a RowMatrix (with the same first four template parameters as the "
6230  "target object).");
6231 
6232  // For the RowMatrix case, we need to convert from
6233  // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
6234  // so terribly efficient, since packing a non-CrsMatrix
6235  // RowMatrix for Import/Export into a CrsMatrix is not a
6236  // critical case. Thus, we may allocate Teuchos::Array objects
6237  // here and copy to and from Kokkos::*View.
6238 
6239  // View exportLIDs's host data as a Teuchos::ArrayView.
6240  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6241  auto exportLIDs_h = exportLIDs.view_host ();
6242  Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
6243  exportLIDs_h.size ());
6244 
6245  // pack() will allocate exports_a as needed. We'll copy back
6246  // into exports (after (re)allocating exports if needed) below.
6247  Teuchos::Array<char> exports_a;
6248 
6249  // View exportLIDs' host data as a Teuchos::ArrayView. We don't
6250  // need to sync, since we're doing write-only access, but we do
6251  // need to mark the DualView as modified on host.
6252 
6253  numPacketsPerLID.clear_sync_state (); // write-only access
6254  numPacketsPerLID.modify_host ();
6255  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6256  Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
6257  numPacketsPerLID_h.size ());
6258 
6259  // Invoke RowMatrix's legacy pack() interface, using above
6260  // Teuchos::Array* objects.
6261  try {
6262  srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
6263  constantNumPackets);
6264  }
6265  catch (std::exception& e) {
6266  lclBad = 1;
6267  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6268  }
6269 
6270  // Allocate 'exports', and copy exports_a back into it.
6271  const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
6272  if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
6273  const std::string oldLabel = exports.d_view.label ();
6274  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6275  exports = exports_type (newLabel, newAllocSize);
6276  }
6277  // It's safe to assume that we're working on host anyway, so
6278  // just keep exports sync'd to host.
6279  // ignore current device contents
6280  exports.modify_host();
6281 
6282  auto exports_h = exports.view_host ();
6283  auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
6284 
6285  // Kokkos::deep_copy needs a Kokkos::View input, so turn
6286  // exports_a into a nonowning Kokkos::View first before copying.
6287  typedef typename exports_type::t_host::execution_space HES;
6288  typedef Kokkos::Device<HES, HostSpace> host_device_type;
6289  Kokkos::View<const char*, host_device_type>
6290  exports_a_kv (exports_a.getRawPtr (), newAllocSize);
6291  // DEEP_COPY REVIEW - NOT TESTED
6292  Kokkos::deep_copy (exports_h_sub, exports_a_kv);
6293  }
6294 
6295  if (debug) {
6296  int gblBad = 0; // output argument; to be set below
6297  reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
6298  if (gblBad != 0) {
6299  Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
6300  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6301  (true, std::logic_error, "packNew() or pack() threw an exception on "
6302  "one or more participating processes.");
6303  }
6304  }
6305  else {
6306  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6307  (lclBad != 0, std::logic_error, "packNew threw an exception on one "
6308  "or more participating processes. Here is this process' error "
6309  "message: " << msg.str ());
6310  }
6311 
6312  if (verbose) {
6313  std::ostringstream os;
6314  os << *prefix << "packAndPrepare: Done!" << endl
6315  << *prefix << " "
6316  << dualViewStatusToString (exportLIDs, "exportLIDs")
6317  << endl
6318  << *prefix << " "
6319  << dualViewStatusToString (exports, "exports")
6320  << endl
6321  << *prefix << " "
6322  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6323  << endl;
6324  std::cerr << os.str ();
6325  }
6326  }
6327 
6328  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6329  size_t
6330  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6331  packRow (char exports[],
6332  const size_t offset,
6333  const size_t numEnt,
6334  const GlobalOrdinal gidsIn[],
6335  const impl_scalar_type valsIn[],
6336  const size_t numBytesPerValue) const
6337  {
6338  using Kokkos::View;
6339  using Kokkos::subview;
6341  typedef LocalOrdinal LO;
6342  typedef GlobalOrdinal GO;
6343  typedef impl_scalar_type ST;
6344 
6345  if (numEnt == 0) {
6346  // Empty rows always take zero bytes, to ensure sparsity.
6347  return 0;
6348  }
6349 
6350  const GO gid = 0; // packValueCount wants this
6351  const LO numEntLO = static_cast<size_t> (numEnt);
6352 
6353  const size_t numEntBeg = offset;
6354  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
6355  const size_t gidsBeg = numEntBeg + numEntLen;
6356  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6357  const size_t valsBeg = gidsBeg + gidsLen;
6358  const size_t valsLen = numEnt * numBytesPerValue;
6359 
6360  char* const numEntOut = exports + numEntBeg;
6361  char* const gidsOut = exports + gidsBeg;
6362  char* const valsOut = exports + valsBeg;
6363 
6364  size_t numBytesOut = 0;
6365  int errorCode = 0;
6366  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
6367 
6368  {
6369  Kokkos::pair<int, size_t> p;
6370  p = PackTraits<GO>::packArray (gidsOut, gidsIn, numEnt);
6371  errorCode += p.first;
6372  numBytesOut += p.second;
6373 
6374  p = PackTraits<ST>::packArray (valsOut, valsIn, numEnt);
6375  errorCode += p.first;
6376  numBytesOut += p.second;
6377  }
6378 
6379  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6380  TEUCHOS_TEST_FOR_EXCEPTION
6381  (numBytesOut != expectedNumBytes, std::logic_error, "packRow: "
6382  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6383  << expectedNumBytes << ".");
6384  TEUCHOS_TEST_FOR_EXCEPTION
6385  (errorCode != 0, std::runtime_error, "packRow: "
6386  "PackTraits::packArray returned a nonzero error code");
6387 
6388  return numBytesOut;
6389  }
6390 
6391  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6392  size_t
6393  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6394  unpackRow (GlobalOrdinal gidsOut[],
6395  impl_scalar_type valsOut[],
6396  const char imports[],
6397  const size_t offset,
6398  const size_t numBytes,
6399  const size_t numEnt,
6400  const size_t numBytesPerValue)
6401  {
6402  using Kokkos::View;
6403  using Kokkos::subview;
6405  typedef LocalOrdinal LO;
6406  typedef GlobalOrdinal GO;
6407  typedef impl_scalar_type ST;
6408 
6409  Details::ProfilingRegion region_upack_row(
6410  "Tpetra::CrsMatrix::unpackRow",
6411  "Import/Export"
6412  );
6413 
6414  if (numBytes == 0) {
6415  // Rows with zero bytes should always have zero entries.
6416  if (numEnt != 0) {
6417  const int myRank = this->getMap ()->getComm ()->getRank ();
6418  TEUCHOS_TEST_FOR_EXCEPTION
6419  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6420  "unpackRow: The number of bytes to unpack numBytes=0, but the "
6421  "number of entries to unpack (as reported by numPacketsPerLID) "
6422  "for this row numEnt=" << numEnt << " != 0.");
6423  }
6424  return 0;
6425  }
6426 
6427  if (numEnt == 0 && numBytes != 0) {
6428  const int myRank = this->getMap ()->getComm ()->getRank ();
6429  TEUCHOS_TEST_FOR_EXCEPTION
6430  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6431  "unpackRow: The number of entries to unpack (as reported by "
6432  "numPacketsPerLID) numEnt=0, but the number of bytes to unpack "
6433  "numBytes=" << numBytes << " != 0.");
6434  }
6435 
6436  const GO gid = 0; // packValueCount wants this
6437  const LO lid = 0; // packValueCount wants this
6438 
6439  const size_t numEntBeg = offset;
6440  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
6441  const size_t gidsBeg = numEntBeg + numEntLen;
6442  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6443  const size_t valsBeg = gidsBeg + gidsLen;
6444  const size_t valsLen = numEnt * numBytesPerValue;
6445 
6446  const char* const numEntIn = imports + numEntBeg;
6447  const char* const gidsIn = imports + gidsBeg;
6448  const char* const valsIn = imports + valsBeg;
6449 
6450  size_t numBytesOut = 0;
6451  int errorCode = 0;
6452  LO numEntOut;
6453  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
6454  if (static_cast<size_t> (numEntOut) != numEnt ||
6455  numEntOut == static_cast<LO> (0)) {
6456  const int myRank = this->getMap ()->getComm ()->getRank ();
6457  std::ostringstream os;
6458  os << "(Proc " << myRank << ") CrsMatrix::unpackRow: ";
6459  bool firstErrorCondition = false;
6460  if (static_cast<size_t> (numEntOut) != numEnt) {
6461  os << "Number of entries from numPacketsPerLID numEnt=" << numEnt
6462  << " does not equal number of entries unpacked from imports "
6463  "buffer numEntOut=" << numEntOut << ".";
6464  firstErrorCondition = true;
6465  }
6466  if (numEntOut == static_cast<LO> (0)) {
6467  if (firstErrorCondition) {
6468  os << " Also, ";
6469  }
6470  os << "Number of entries unpacked from imports buffer numEntOut=0, "
6471  "but number of bytes to unpack for this row numBytes=" << numBytes
6472  << " != 0. This should never happen, since packRow should only "
6473  "ever pack rows with a nonzero number of entries. In this case, "
6474  "the number of entries from numPacketsPerLID is numEnt=" << numEnt
6475  << ".";
6476  }
6477  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str ());
6478  }
6479 
6480  {
6481  Kokkos::pair<int, size_t> p;
6482  p = PackTraits<GO>::unpackArray (gidsOut, gidsIn, numEnt);
6483  errorCode += p.first;
6484  numBytesOut += p.second;
6485 
6486  p = PackTraits<ST>::unpackArray (valsOut, valsIn, numEnt);
6487  errorCode += p.first;
6488  numBytesOut += p.second;
6489  }
6490 
6491  TEUCHOS_TEST_FOR_EXCEPTION
6492  (numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
6493  << numBytesOut << " != numBytes = " << numBytes << ".");
6494 
6495  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6496  TEUCHOS_TEST_FOR_EXCEPTION
6497  (numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
6498  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6499  << expectedNumBytes << ".");
6500 
6501  TEUCHOS_TEST_FOR_EXCEPTION
6502  (errorCode != 0, std::runtime_error, "unpackRow: "
6503  "PackTraits::unpackArray returned a nonzero error code");
6504 
6505  return numBytesOut;
6506  }
6507 
6508  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6509  void
6510  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6511  allocatePackSpaceNew (Kokkos::DualView<char*, buffer_device_type>& exports,
6512  size_t& totalNumEntries,
6513  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs) const
6514  {
6515  using Details::Behavior;
6517  using std::endl;
6518  typedef impl_scalar_type IST;
6519  typedef LocalOrdinal LO;
6520  typedef GlobalOrdinal GO;
6521  //const char tfecfFuncName[] = "allocatePackSpaceNew: ";
6522 
6523  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
6524  // output to std::cerr on every MPI process. This is unwise for
6525  // runs with large numbers of MPI processes.
6526  const bool verbose = Behavior::verbose("CrsMatrix");
6527  std::unique_ptr<std::string> prefix;
6528  if (verbose) {
6529  prefix = this->createPrefix("CrsMatrix", "allocatePackSpaceNew");
6530  std::ostringstream os;
6531  os << *prefix << "Before:"
6532  << endl
6533  << *prefix << " "
6534  << dualViewStatusToString (exports, "exports")
6535  << endl
6536  << *prefix << " "
6537  << dualViewStatusToString (exportLIDs, "exportLIDs")
6538  << endl;
6539  std::cerr << os.str ();
6540  }
6541 
6542  // The number of export LIDs must fit in LocalOrdinal, assuming
6543  // that the LIDs are distinct and valid on the calling process.
6544  const LO numExportLIDs = static_cast<LO> (exportLIDs.extent (0));
6545 
6546  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6547  auto exportLIDs_h = exportLIDs.view_host ();
6548 
6549  // Count the total number of matrix entries to send.
6550  totalNumEntries = 0;
6551  for (LO i = 0; i < numExportLIDs; ++i) {
6552  const LO lclRow = exportLIDs_h[i];
6553  size_t curNumEntries = this->getNumEntriesInLocalRow (lclRow);
6554  // FIXME (mfh 25 Jan 2015) We should actually report invalid row
6555  // indices as an error. Just consider them nonowned for now.
6556  if (curNumEntries == Teuchos::OrdinalTraits<size_t>::invalid ()) {
6557  curNumEntries = 0;
6558  }
6559  totalNumEntries += curNumEntries;
6560  }
6561 
6562  // FIXME (mfh 24 Feb 2013, 24 Mar 2017) This code is only correct
6563  // if sizeof(IST) is a meaningful representation of the amount of
6564  // data in a Scalar instance. (LO and GO are always built-in
6565  // integer types.)
6566  //
6567  // Allocate the exports array. It does NOT need padding for
6568  // alignment, since we use memcpy to write to / read from send /
6569  // receive buffers.
6570  const size_t allocSize =
6571  static_cast<size_t> (numExportLIDs) * sizeof (LO) +
6572  totalNumEntries * (sizeof (IST) + sizeof (GO));
6573  if (static_cast<size_t> (exports.extent (0)) < allocSize) {
6574  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6575 
6576  const std::string oldLabel = exports.d_view.label ();
6577  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6578  exports = exports_type (newLabel, allocSize);
6579  }
6580 
6581  if (verbose) {
6582  std::ostringstream os;
6583  os << *prefix << "After:"
6584  << endl
6585  << *prefix << " "
6586  << dualViewStatusToString (exports, "exports")
6587  << endl
6588  << *prefix << " "
6589  << dualViewStatusToString (exportLIDs, "exportLIDs")
6590  << endl;
6591  std::cerr << os.str ();
6592  }
6593  }
6594 
6595  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6596  void
6598  packNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6599  Kokkos::DualView<char*, buffer_device_type>& exports,
6600  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6601  size_t& constantNumPackets) const
6602  {
6603  // The call to packNew in packAndPrepare catches and handles any exceptions.
6604  Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export");
6605  if (this->isStaticGraph ()) {
6607  packCrsMatrixNew (*this, exports, numPacketsPerLID, exportLIDs,
6608  constantNumPackets);
6609  }
6610  else {
6611  this->packNonStaticNew (exportLIDs, exports, numPacketsPerLID,
6612  constantNumPackets);
6613  }
6614  }
6615 
6616  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6617  void
6619  packNonStaticNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6620  Kokkos::DualView<char*, buffer_device_type>& exports,
6621  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6622  size_t& constantNumPackets) const
6623  {
6624  using Details::Behavior;
6626  using Details::PackTraits;
6628  using Kokkos::View;
6629  using std::endl;
6630  using LO = LocalOrdinal;
6631  using GO = GlobalOrdinal;
6632  using ST = impl_scalar_type;
6633  const char tfecfFuncName[] = "packNonStaticNew: ";
6634 
6635  const bool verbose = Behavior::verbose("CrsMatrix");
6636  std::unique_ptr<std::string> prefix;
6637  if (verbose) {
6638  prefix = this->createPrefix("CrsMatrix", "packNonStaticNew");
6639  std::ostringstream os;
6640  os << *prefix << "Start" << endl;
6641  std::cerr << os.str ();
6642  }
6643 
6644  const size_t numExportLIDs = static_cast<size_t> (exportLIDs.extent (0));
6645  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6646  (numExportLIDs != static_cast<size_t> (numPacketsPerLID.extent (0)),
6647  std::invalid_argument, "exportLIDs.size() = " << numExportLIDs
6648  << " != numPacketsPerLID.size() = " << numPacketsPerLID.extent (0)
6649  << ".");
6650 
6651  // Setting this to zero tells the caller to expect a possibly
6652  // different ("nonconstant") number of packets per local index
6653  // (i.e., a possibly different number of entries per row).
6654  constantNumPackets = 0;
6655 
6656  // The pack buffer 'exports' enters this method possibly
6657  // unallocated. Do the first two parts of "Count, allocate, fill,
6658  // compute."
6659  size_t totalNumEntries = 0;
6660  this->allocatePackSpaceNew (exports, totalNumEntries, exportLIDs);
6661  const size_t bufSize = static_cast<size_t> (exports.extent (0));
6662 
6663  // Write-only host access
6664  exports.clear_sync_state();
6665  exports.modify_host();
6666  auto exports_h = exports.view_host ();
6667  if (verbose) {
6668  std::ostringstream os;
6669  os << *prefix << "After marking exports as modified on host, "
6670  << dualViewStatusToString (exports, "exports") << endl;
6671  std::cerr << os.str ();
6672  }
6673 
6674  // Read-only host access
6675  auto exportLIDs_h = exportLIDs.view_host ();
6676 
6677  // Write-only host access
6678  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->clear_sync_state();
6679  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->modify_host();
6680  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6681 
6682  // Compute the number of "packets" (in this case, bytes) per
6683  // export LID (in this case, local index of the row to send), and
6684  // actually pack the data.
6685  auto maxRowNumEnt = this->getLocalMaxNumRowEntries();
6686 
6687 
6688  // Temporary buffer for global column indices.
6689  typename global_inds_host_view_type::non_const_type gidsIn_k;
6690  if (this->isLocallyIndexed()) { // Need storage for Global IDs
6691  gidsIn_k =
6692  typename global_inds_host_view_type::non_const_type("packGids",
6693  maxRowNumEnt);
6694  }
6695 
6696  size_t offset = 0; // current index into 'exports' array.
6697  for (size_t i = 0; i < numExportLIDs; ++i) {
6698  const LO lclRow = exportLIDs_h[i];
6699 
6700  size_t numBytes = 0;
6701  size_t numEnt = this->getNumEntriesInLocalRow (lclRow);
6702 
6703  // Only pack this row's data if it has a nonzero number of
6704  // entries. We can do this because receiving processes get the
6705  // number of packets, and will know that zero packets means zero
6706  // entries.
6707  if (numEnt == 0) {
6708  numPacketsPerLID_h[i] = 0;
6709  continue;
6710  }
6711 
6712  if (this->isLocallyIndexed ()) {
6713  typename global_inds_host_view_type::non_const_type gidsIn;
6714  values_host_view_type valsIn;
6715  // If the matrix is locally indexed on the calling process, we
6716  // have to use its column Map (which it _must_ have in this
6717  // case) to convert to global indices.
6718  local_inds_host_view_type lidsIn;
6719  this->getLocalRowView (lclRow, lidsIn, valsIn);
6720  const map_type& colMap = * (this->getColMap ());
6721  for (size_t k = 0; k < numEnt; ++k) {
6722  gidsIn_k[k] = colMap.getGlobalElement (lidsIn[k]);
6723  }
6724  gidsIn = Kokkos::subview(gidsIn_k, Kokkos::make_pair(GO(0),GO(numEnt)));
6725 
6726  const size_t numBytesPerValue =
6727  PackTraits<ST>::packValueCount (valsIn[0]);
6728  numBytes = this->packRow (exports_h.data (), offset, numEnt,
6729  gidsIn.data (), valsIn.data (),
6730  numBytesPerValue);
6731  }
6732  else if (this->isGloballyIndexed ()) {
6733  global_inds_host_view_type gidsIn;
6734  values_host_view_type valsIn;
6735  // If the matrix is globally indexed on the calling process,
6736  // then we can use the column indices directly. However, we
6737  // have to get the global row index. The calling process must
6738  // have a row Map, since otherwise it shouldn't be participating
6739  // in packing operations.
6740  const map_type& rowMap = * (this->getRowMap ());
6741  const GO gblRow = rowMap.getGlobalElement (lclRow);
6742  this->getGlobalRowView (gblRow, gidsIn, valsIn);
6743 
6744  const size_t numBytesPerValue =
6745  PackTraits<ST>::packValueCount (valsIn[0]);
6746  numBytes = this->packRow (exports_h.data (), offset, numEnt,
6747  gidsIn.data (), valsIn.data (),
6748  numBytesPerValue);
6749  }
6750  // mfh 11 Sep 2017: Currently, if the matrix is neither globally
6751  // nor locally indexed, then it has no entries. Therefore,
6752  // there is nothing to pack. No worries!
6753 
6754  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6755  (offset > bufSize || offset + numBytes > bufSize, std::logic_error,
6756  "First invalid offset into 'exports' pack buffer at index i = " << i
6757  << ". exportLIDs_h[i]: " << exportLIDs_h[i] << ", bufSize: " <<
6758  bufSize << ", offset: " << offset << ", numBytes: " << numBytes <<
6759  ".");
6760  // numPacketsPerLID_h[i] is the number of "packets" in the
6761  // current local row i. Packet=char (really "byte") so use the
6762  // number of bytes of the packed data for that row.
6763  numPacketsPerLID_h[i] = numBytes;
6764  offset += numBytes;
6765  }
6766 
6767  if (verbose) {
6768  std::ostringstream os;
6769  os << *prefix << "Tpetra::CrsMatrix::packNonStaticNew: After:" << endl
6770  << *prefix << " "
6771  << dualViewStatusToString (exports, "exports")
6772  << endl
6773  << *prefix << " "
6774  << dualViewStatusToString (exportLIDs, "exportLIDs")
6775  << endl;
6776  std::cerr << os.str ();
6777  }
6778  }
6779 
6780  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6781  LocalOrdinal
6782  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6783  combineGlobalValuesRaw(const LocalOrdinal lclRow,
6784  const LocalOrdinal numEnt,
6785  const impl_scalar_type vals[],
6786  const GlobalOrdinal cols[],
6787  const Tpetra::CombineMode combMode,
6788  const char* const prefix,
6789  const bool debug,
6790  const bool verbose)
6791  {
6792  using GO = GlobalOrdinal;
6793 
6794  // mfh 23 Mar 2017: This branch is not thread safe in a debug
6795  // build, due to use of Teuchos::ArrayView; see #229.
6796  const GO gblRow = myGraph_->rowMap_->getGlobalElement(lclRow);
6797  Teuchos::ArrayView<const GO> cols_av
6798  (numEnt == 0 ? nullptr : cols, numEnt);
6799  Teuchos::ArrayView<const Scalar> vals_av
6800  (numEnt == 0 ? nullptr : reinterpret_cast<const Scalar*> (vals), numEnt);
6801 
6802  // FIXME (mfh 23 Mar 2017) This is a work-around for less common
6803  // combine modes. combineGlobalValues throws on error; it does
6804  // not return an error code. Thus, if it returns, it succeeded.
6805  combineGlobalValues(gblRow, cols_av, vals_av, combMode,
6806  prefix, debug, verbose);
6807  return numEnt;
6808  }
6809 
6810  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6811  void
6812  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6813  combineGlobalValues(
6814  const GlobalOrdinal globalRowIndex,
6815  const Teuchos::ArrayView<const GlobalOrdinal>& columnIndices,
6816  const Teuchos::ArrayView<const Scalar>& values,
6817  const Tpetra::CombineMode combineMode,
6818  const char* const prefix,
6819  const bool debug,
6820  const bool verbose)
6821  {
6822  const char tfecfFuncName[] = "combineGlobalValues: ";
6823 
6824  if (isStaticGraph ()) {
6825  // INSERT doesn't make sense for a static graph, since you
6826  // aren't allowed to change the structure of the graph.
6827  // However, all the other combine modes work.
6828  if (combineMode == ADD) {
6829  sumIntoGlobalValues (globalRowIndex, columnIndices, values);
6830  }
6831  else if (combineMode == REPLACE) {
6832  replaceGlobalValues (globalRowIndex, columnIndices, values);
6833  }
6834  else if (combineMode == ABSMAX) {
6835  using ::Tpetra::Details::AbsMax;
6836  AbsMax<Scalar> f;
6837  this->template transformGlobalValues<AbsMax<Scalar> > (globalRowIndex,
6838  columnIndices,
6839  values, f);
6840  }
6841  else if (combineMode == INSERT) {
6842  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6843  (isStaticGraph() && combineMode == INSERT,
6844  std::invalid_argument, "INSERT combine mode is forbidden "
6845  "if the matrix has a static (const) graph (i.e., was "
6846  "constructed with the CrsMatrix constructor that takes a "
6847  "const CrsGraph pointer).");
6848  }
6849  else {
6850  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6851  (true, std::logic_error, "Invalid combine mode; should "
6852  "never get here! "
6853  "Please report this bug to the Tpetra developers.");
6854  }
6855  }
6856  else { // The matrix has a dynamic graph.
6857  if (combineMode == ADD || combineMode == INSERT) {
6858  // For a dynamic graph, all incoming column indices are
6859  // inserted into the target graph. Duplicate indices will
6860  // have their values summed. In this context, ADD and INSERT
6861  // are equivalent. We need to call insertGlobalValues()
6862  // anyway if the column indices don't yet exist in this row,
6863  // so we just call insertGlobalValues() for both cases.
6864  insertGlobalValuesFilteredChecked(globalRowIndex,
6865  columnIndices, values, prefix, debug, verbose);
6866  }
6867  // FIXME (mfh 14 Mar 2012):
6868  //
6869  // Implementing ABSMAX or REPLACE for a dynamic graph would
6870  // require modifying assembly to attach a possibly different
6871  // combine mode to each inserted (i, j, A_ij) entry. For
6872  // example, consider two different Export operations to the same
6873  // target CrsMatrix, the first with ABSMAX combine mode and the
6874  // second with REPLACE. This isn't a common use case, so we
6875  // won't mess with it for now.
6876  else if (combineMode == ABSMAX) {
6877  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6878  ! isStaticGraph () && combineMode == ABSMAX, std::logic_error,
6879  "ABSMAX combine mode when the matrix has a dynamic graph is not yet "
6880  "implemented.");
6881  }
6882  else if (combineMode == REPLACE) {
6883  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6884  ! isStaticGraph () && combineMode == REPLACE, std::logic_error,
6885  "REPLACE combine mode when the matrix has a dynamic graph is not yet "
6886  "implemented.");
6887  }
6888  else {
6889  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6890  true, std::logic_error, "Should never get here! Please report this "
6891  "bug to the Tpetra developers.");
6892  }
6893  }
6894  }
6895 
6896  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6897  void
6900  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
6901  Kokkos::DualView<char*, buffer_device_type> imports,
6902  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6903  const size_t constantNumPackets,
6904  const CombineMode combineMode)
6905  {
6906  using Details::Behavior;
6909  using std::endl;
6910  const char tfecfFuncName[] = "unpackAndCombine: ";
6911  ProfilingRegion regionUAC ("Tpetra::CrsMatrix::unpackAndCombine");
6912 
6913  const bool debug = Behavior::debug("CrsMatrix");
6914  const bool verbose = Behavior::verbose("CrsMatrix");
6915  constexpr int numValidModes = 5;
6916  const CombineMode validModes[numValidModes] =
6917  {ADD, REPLACE, ABSMAX, INSERT, ZERO};
6918  const char* validModeNames[numValidModes] =
6919  {"ADD", "REPLACE", "ABSMAX", "INSERT", "ZERO"};
6920 
6921  std::unique_ptr<std::string> prefix;
6922  if (verbose) {
6923  prefix = this->createPrefix("CrsMatrix", "unpackAndCombine");
6924  std::ostringstream os;
6925  os << *prefix << "Start:" << endl
6926  << *prefix << " "
6927  << dualViewStatusToString (importLIDs, "importLIDs")
6928  << endl
6929  << *prefix << " "
6930  << dualViewStatusToString (imports, "imports")
6931  << endl
6932  << *prefix << " "
6933  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6934  << endl
6935  << *prefix << " constantNumPackets: " << constantNumPackets
6936  << endl
6937  << *prefix << " combineMode: " << combineModeToString (combineMode)
6938  << endl;
6939  std::cerr << os.str ();
6940  }
6941 
6942  if (debug) {
6943  if (std::find (validModes, validModes+numValidModes, combineMode) ==
6944  validModes+numValidModes) {
6945  std::ostringstream os;
6946  os << "Invalid combine mode. Valid modes are {";
6947  for (int k = 0; k < numValidModes; ++k) {
6948  os << validModeNames[k];
6949  if (k < numValidModes - 1) {
6950  os << ", ";
6951  }
6952  }
6953  os << "}.";
6954  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6955  (true, std::invalid_argument, os.str ());
6956  }
6957  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6958  (importLIDs.extent(0) != numPacketsPerLID.extent(0),
6959  std::invalid_argument, "importLIDs.extent(0)="
6960  << importLIDs.extent(0)
6961  << " != numPacketsPerLID.extent(0)="
6962  << numPacketsPerLID.extent(0) << ".");
6963  }
6964 
6965  if (combineMode == ZERO) {
6966  return; // nothing to do
6967  }
6968 
6969  if (debug) {
6970  using Teuchos::reduceAll;
6971  std::unique_ptr<std::ostringstream> msg (new std::ostringstream ());
6972  int lclBad = 0;
6973  try {
6974  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
6975  constantNumPackets, combineMode,
6976  verbose);
6977  } catch (std::exception& e) {
6978  lclBad = 1;
6979  *msg << e.what ();
6980  }
6981  int gblBad = 0;
6982  const Teuchos::Comm<int>& comm = * (this->getComm ());
6983  reduceAll<int, int> (comm, Teuchos::REDUCE_MAX,
6984  lclBad, Teuchos::outArg (gblBad));
6985  if (gblBad != 0) {
6986  // mfh 22 Oct 2017: 'prefix' might be null, since it is only
6987  // initialized in a debug build. Thus, we get the process
6988  // rank again here. This is an error message, so the small
6989  // run-time cost doesn't matter. See #1887.
6990  std::ostringstream os;
6991  os << "Proc " << comm.getRank () << ": " << msg->str () << endl;
6992  msg = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
6993  ::Tpetra::Details::gathervPrint (*msg, os.str (), comm);
6994  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6995  (true, std::logic_error, std::endl << "unpackAndCombineImpl "
6996  "threw an exception on one or more participating processes: "
6997  << endl << msg->str ());
6998  }
6999  }
7000  else {
7001  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
7002  constantNumPackets, combineMode,
7003  verbose);
7004  }
7005 
7006  if (verbose) {
7007  std::ostringstream os;
7008  os << *prefix << "Done!" << endl
7009  << *prefix << " "
7010  << dualViewStatusToString (importLIDs, "importLIDs")
7011  << endl
7012  << *prefix << " "
7013  << dualViewStatusToString (imports, "imports")
7014  << endl
7015  << *prefix << " "
7016  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
7017  << endl;
7018  std::cerr << os.str ();
7019  }
7020  }
7021 
7022  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7023  void
7026  const Kokkos::DualView<const local_ordinal_type*,
7027  buffer_device_type>& importLIDs,
7028  Kokkos::DualView<char*, buffer_device_type> imports,
7029  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7030  const size_t constantNumPackets,
7031  const CombineMode combineMode,
7032  const bool verbose)
7033  {
7034  Details::ProfilingRegion region_unpack_and_combine_impl(
7035  "Tpetra::CrsMatrix::unpackAndCombineImpl",
7036  "Import/Export"
7037  );
7038  using std::endl;
7039  const char tfecfFuncName[] = "unpackAndCombineImpl";
7040  std::unique_ptr<std::string> prefix;
7041  if (verbose) {
7042  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7043  std::ostringstream os;
7044  os << *prefix << "isStaticGraph(): "
7045  << (isStaticGraph() ? "true" : "false")
7046  << ", importLIDs.extent(0): "
7047  << importLIDs.extent(0)
7048  << ", imports.extent(0): "
7049  << imports.extent(0)
7050  << ", numPacketsPerLID.extent(0): "
7051  << numPacketsPerLID.extent(0)
7052  << endl;
7053  std::cerr << os.str();
7054  }
7055 
7056  if (isStaticGraph ()) {
7057  using Details::unpackCrsMatrixAndCombineNew;
7058  unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID,
7059  importLIDs, constantNumPackets,
7060  combineMode);
7061  }
7062  else {
7063  {
7064  using padding_type = typename crs_graph_type::padding_type;
7065  std::unique_ptr<padding_type> padding;
7066  try {
7067  padding = myGraph_->computePaddingForCrsMatrixUnpack(
7068  importLIDs, imports, numPacketsPerLID, verbose);
7069  }
7070  catch (std::exception& e) {
7071  const auto rowMap = getRowMap();
7072  const auto comm = rowMap.is_null() ? Teuchos::null :
7073  rowMap->getComm();
7074  const int myRank = comm.is_null() ? -1 : comm->getRank();
7075  TEUCHOS_TEST_FOR_EXCEPTION
7076  (true, std::runtime_error, "Proc " << myRank << ": "
7077  "Tpetra::CrsGraph::computePaddingForCrsMatrixUnpack "
7078  "threw an exception: " << e.what());
7079  }
7080  if (verbose) {
7081  std::ostringstream os;
7082  os << *prefix << "Call applyCrsPadding" << endl;
7083  std::cerr << os.str();
7084  }
7085  applyCrsPadding(*padding, verbose);
7086  }
7087  if (verbose) {
7088  std::ostringstream os;
7089  os << *prefix << "Call unpackAndCombineImplNonStatic" << endl;
7090  std::cerr << os.str();
7091  }
7092  unpackAndCombineImplNonStatic(importLIDs, imports,
7093  numPacketsPerLID,
7094  constantNumPackets,
7095  combineMode);
7096  }
7097 
7098  if (verbose) {
7099  std::ostringstream os;
7100  os << *prefix << "Done" << endl;
7101  std::cerr << os.str();
7102  }
7103  }
7104 
7105  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7106  void
7107  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7108  unpackAndCombineImplNonStatic(
7109  const Kokkos::DualView<const local_ordinal_type*,
7110  buffer_device_type>& importLIDs,
7111  Kokkos::DualView<char*, buffer_device_type> imports,
7112  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7113  const size_t constantNumPackets,
7114  const CombineMode combineMode)
7115  {
7116  using Kokkos::View;
7117  using Kokkos::subview;
7118  using Kokkos::MemoryUnmanaged;
7119  using Details::Behavior;
7122  using Details::PackTraits;
7123  using Details::ScalarViewTraits;
7124  using std::endl;
7125  using LO = LocalOrdinal;
7126  using GO = GlobalOrdinal;
7127  using ST = impl_scalar_type;
7128  using size_type = typename Teuchos::ArrayView<LO>::size_type;
7129  using HES =
7130  typename View<int*, device_type>::HostMirror::execution_space;
7131  using pair_type = std::pair<typename View<int*, HES>::size_type,
7132  typename View<int*, HES>::size_type>;
7133  using gids_out_type = View<GO*, HES, MemoryUnmanaged>;
7134  using vals_out_type = View<ST*, HES, MemoryUnmanaged>;
7135  const char tfecfFuncName[] = "unpackAndCombineImplNonStatic";
7136 
7137  const bool debug = Behavior::debug("CrsMatrix");
7138  const bool verbose = Behavior::verbose("CrsMatrix");
7139  std::unique_ptr<std::string> prefix;
7140  if (verbose) {
7141  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7142  std::ostringstream os;
7143  os << *prefix << endl; // we've already printed DualViews' statuses
7144  std::cerr << os.str ();
7145  }
7146  const char* const prefix_raw =
7147  verbose ? prefix.get()->c_str() : nullptr;
7148 
7149  const size_type numImportLIDs = importLIDs.extent (0);
7150  if (combineMode == ZERO || numImportLIDs == 0) {
7151  return; // nothing to do; no need to combine entries
7152  }
7153 
7154  Details::ProfilingRegion region_unpack_and_combine_impl_non_static(
7155  "Tpetra::CrsMatrix::unpackAndCombineImplNonStatic",
7156  "Import/Export"
7157  );
7158 
7159  // We're unpacking on host. This is read-only host access.
7160  if (imports.need_sync_host()) {
7161  imports.sync_host ();
7162  }
7163  auto imports_h = imports.view_host();
7164 
7165  // Read-only host access.
7166  if (numPacketsPerLID.need_sync_host()) {
7167  numPacketsPerLID.sync_host ();
7168  }
7169  auto numPacketsPerLID_h = numPacketsPerLID.view_host();
7170 
7171  TEUCHOS_ASSERT( ! importLIDs.need_sync_host() );
7172  auto importLIDs_h = importLIDs.view_host();
7173 
7174  size_t numBytesPerValue;
7175  {
7176  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7177  // with run-time size? We already assume that all entries in both the
7178  // source and target matrices have the same size. If the calling process
7179  // owns at least one entry in either matrix, we can use that entry to set
7180  // the size. However, it is possible that the calling process owns no
7181  // entries. In that case, we're in trouble. One way to fix this would be
7182  // for each row's data to contain the run-time size. This is only
7183  // necessary if the size is not a compile-time constant.
7184  Scalar val;
7185  numBytesPerValue = PackTraits<ST>::packValueCount (val);
7186  }
7187 
7188  // Determine the maximum number of entries in any one row
7189  size_t offset = 0;
7190  size_t maxRowNumEnt = 0;
7191  for (size_type i = 0; i < numImportLIDs; ++i) {
7192  const size_t numBytes = numPacketsPerLID_h[i];
7193  if (numBytes == 0) {
7194  continue; // empty buffer for that row means that the row is empty
7195  }
7196  // We need to unpack a nonzero number of entries for this row.
7197  if (debug) {
7198  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7199  (offset + numBytes > size_t(imports_h.extent (0)),
7200  std::logic_error, ": At local row index importLIDs_h[i="
7201  << i << "]=" << importLIDs_h[i] << ", offset (=" << offset
7202  << ") + numBytes (=" << numBytes << ") > "
7203  "imports_h.extent(0)=" << imports_h.extent (0) << ".");
7204  }
7205  LO numEntLO = 0;
7206 
7207  if (debug) {
7208  const size_t theNumBytes =
7209  PackTraits<LO>::packValueCount (numEntLO);
7210  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7211  (theNumBytes > numBytes, std::logic_error, ": theNumBytes="
7212  << theNumBytes << " > numBytes = " << numBytes << ".");
7213  }
7214  const char* const inBuf = imports_h.data () + offset;
7215  const size_t actualNumBytes =
7216  PackTraits<LO>::unpackValue (numEntLO, inBuf);
7217 
7218  if (debug) {
7219  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7220  (actualNumBytes > numBytes, std::logic_error, ": At i=" << i
7221  << ", actualNumBytes=" << actualNumBytes
7222  << " > numBytes=" << numBytes << ".");
7223  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7224  (numEntLO == 0, std::logic_error, ": At local row index "
7225  "importLIDs_h[i=" << i << "]=" << importLIDs_h[i] << ", "
7226  "the number of entries read from the packed data is "
7227  "numEntLO=" << numEntLO << ", but numBytes=" << numBytes
7228  << " != 0.");
7229  }
7230 
7231  maxRowNumEnt = std::max(size_t(numEntLO), maxRowNumEnt);
7232  offset += numBytes;
7233  }
7234 
7235  // Temporary space to cache incoming global column indices and
7236  // values. Column indices come in as global indices, in case the
7237  // source object's column Map differs from the target object's
7238  // (this's) column Map.
7239  View<GO*, HES> gblColInds;
7240  View<LO*, HES> lclColInds;
7241  View<ST*, HES> vals;
7242  {
7243  GO gid = 0;
7244  LO lid = 0;
7245  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7246  // with run-time size? We already assume that all entries in both the
7247  // source and target matrices have the same size. If the calling process
7248  // owns at least one entry in either matrix, we can use that entry to set
7249  // the size. However, it is possible that the calling process owns no
7250  // entries. In that case, we're in trouble. One way to fix this would be
7251  // for each row's data to contain the run-time size. This is only
7252  // necessary if the size is not a compile-time constant.
7253  Scalar val;
7254  gblColInds = ScalarViewTraits<GO, HES>::allocateArray(
7255  gid, maxRowNumEnt, "gids");
7256  lclColInds = ScalarViewTraits<LO, HES>::allocateArray(
7257  lid, maxRowNumEnt, "lids");
7258  vals = ScalarViewTraits<ST, HES>::allocateArray(
7259  val, maxRowNumEnt, "vals");
7260  }
7261 
7262  offset = 0;
7263  for (size_type i = 0; i < numImportLIDs; ++i) {
7264  const size_t numBytes = numPacketsPerLID_h[i];
7265  if (numBytes == 0) {
7266  continue; // empty buffer for that row means that the row is empty
7267  }
7268  LO numEntLO = 0;
7269  const char* const inBuf = imports_h.data () + offset;
7270  (void) PackTraits<LO>::unpackValue (numEntLO, inBuf);
7271 
7272  const size_t numEnt = static_cast<size_t>(numEntLO);;
7273  const LO lclRow = importLIDs_h[i];
7274 
7275  gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
7276  vals_out_type valsOut = subview (vals, pair_type (0, numEnt));
7277 
7278  const size_t numBytesOut =
7279  unpackRow (gidsOut.data (), valsOut.data (), imports_h.data (),
7280  offset, numBytes, numEnt, numBytesPerValue);
7281  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7282  (numBytes != numBytesOut, std::logic_error, ": At i=" << i
7283  << ", numBytes=" << numBytes << " != numBytesOut="
7284  << numBytesOut << ".");
7285 
7286  const ST* const valsRaw = const_cast<const ST*> (valsOut.data ());
7287  const GO* const gidsRaw = const_cast<const GO*> (gidsOut.data ());
7288  combineGlobalValuesRaw(lclRow, numEnt, valsRaw, gidsRaw,
7289  combineMode, prefix_raw, debug, verbose);
7290  // Don't update offset until current LID has succeeded.
7291  offset += numBytes;
7292  } // for each import LID i
7293 
7294  if (verbose) {
7295  std::ostringstream os;
7296  os << *prefix << "Done" << endl;
7297  std::cerr << os.str();
7298  }
7299  }
7300 
7301  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7302  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7304  getColumnMapMultiVector (const MV& X_domainMap,
7305  const bool force) const
7306  {
7307  using Teuchos::null;
7308  using Teuchos::RCP;
7309  using Teuchos::rcp;
7310 
7311  TEUCHOS_TEST_FOR_EXCEPTION(
7312  ! this->hasColMap (), std::runtime_error, "Tpetra::CrsMatrix::getColumn"
7313  "MapMultiVector: You may only call this method if the matrix has a "
7314  "column Map. If the matrix does not yet have a column Map, you should "
7315  "first call fillComplete (with domain and range Map if necessary).");
7316 
7317  // If the graph is not fill complete, then the Import object (if
7318  // one should exist) hasn't been constructed yet.
7319  TEUCHOS_TEST_FOR_EXCEPTION(
7320  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7321  "CrsMatrix::getColumnMapMultiVector: You may only call this method if "
7322  "this matrix's graph is fill complete.");
7323 
7324  const size_t numVecs = X_domainMap.getNumVectors ();
7325  RCP<const import_type> importer = this->getGraph ()->getImporter ();
7326  RCP<const map_type> colMap = this->getColMap ();
7327 
7328  RCP<MV> X_colMap; // null by default
7329 
7330  // If the Import object is trivial (null), then we don't need a
7331  // separate column Map multivector. Just return null in that
7332  // case. The caller is responsible for knowing not to use the
7333  // returned null pointer.
7334  //
7335  // If the Import is nontrivial, then we do need a separate
7336  // column Map multivector for the Import operation. Check in
7337  // that case if we have to (re)create the column Map
7338  // multivector.
7339  if (! importer.is_null () || force) {
7340  if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
7341  X_colMap = rcp (new MV (colMap, numVecs));
7342 
7343  // Cache the newly created multivector for later reuse.
7344  importMV_ = X_colMap;
7345  }
7346  else { // Yay, we can reuse the cached multivector!
7347  X_colMap = importMV_;
7348  // mfh 09 Jan 2013: We don't have to fill with zeros first,
7349  // because the Import uses INSERT combine mode, which overwrites
7350  // existing entries.
7351  //
7352  //X_colMap->putScalar (ZERO);
7353  }
7354  }
7355  return X_colMap;
7356  }
7357 
7358  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7359  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7362  const bool force) const
7363  {
7364  using Teuchos::null;
7365  using Teuchos::RCP;
7366  using Teuchos::rcp;
7367 
7368  // If the graph is not fill complete, then the Export object (if
7369  // one should exist) hasn't been constructed yet.
7370  TEUCHOS_TEST_FOR_EXCEPTION(
7371  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7372  "CrsMatrix::getRowMapMultiVector: You may only call this method if this "
7373  "matrix's graph is fill complete.");
7374 
7375  const size_t numVecs = Y_rangeMap.getNumVectors ();
7376  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
7377  // Every version of the constructor takes either a row Map, or a
7378  // graph (all of whose constructors take a row Map). Thus, the
7379  // matrix always has a row Map.
7380  RCP<const map_type> rowMap = this->getRowMap ();
7381 
7382  RCP<MV> Y_rowMap; // null by default
7383 
7384  // If the Export object is trivial (null), then we don't need a
7385  // separate row Map multivector. Just return null in that case.
7386  // The caller is responsible for knowing not to use the returned
7387  // null pointer.
7388  //
7389  // If the Export is nontrivial, then we do need a separate row
7390  // Map multivector for the Export operation. Check in that case
7391  // if we have to (re)create the row Map multivector.
7392  if (! exporter.is_null () || force) {
7393  if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
7394  Y_rowMap = rcp (new MV (rowMap, numVecs));
7395  exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse.
7396  }
7397  else { // Yay, we can reuse the cached multivector!
7398  Y_rowMap = exportMV_;
7399  }
7400  }
7401  return Y_rowMap;
7402  }
7403 
7404  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7405  void
7407  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
7408  {
7409  TEUCHOS_TEST_FOR_EXCEPTION(
7410  myGraph_.is_null (), std::logic_error, "Tpetra::CrsMatrix::"
7411  "removeEmptyProcessesInPlace: This method does not work when the matrix "
7412  "was created with a constant graph (that is, when it was created using "
7413  "the version of its constructor that takes an RCP<const CrsGraph>). "
7414  "This is because the matrix is not allowed to modify the graph in that "
7415  "case, but removing empty processes requires modifying the graph.");
7416  myGraph_->removeEmptyProcessesInPlace (newMap);
7417  // Even though CrsMatrix's row Map (as returned by getRowMap())
7418  // comes from its CrsGraph, CrsMatrix still implements DistObject,
7419  // so we also have to change the DistObject's Map.
7420  this->map_ = this->getRowMap ();
7421  // In the nonconst graph case, staticGraph_ is just a const
7422  // pointer to myGraph_. This assignment is probably redundant,
7423  // but it doesn't hurt.
7424  staticGraph_ = Teuchos::rcp_const_cast<const Graph> (myGraph_);
7425  }
7426 
7427  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7428  Teuchos::RCP<RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7430  add (const Scalar& alpha,
7432  const Scalar& beta,
7433  const Teuchos::RCP<const map_type>& domainMap,
7434  const Teuchos::RCP<const map_type>& rangeMap,
7435  const Teuchos::RCP<Teuchos::ParameterList>& params) const
7436  {
7437  using Teuchos::Array;
7438  using Teuchos::ArrayView;
7439  using Teuchos::ParameterList;
7440  using Teuchos::RCP;
7441  using Teuchos::rcp;
7442  using Teuchos::rcp_implicit_cast;
7443  using Teuchos::sublist;
7444  using std::endl;
7445  using LO = local_ordinal_type;
7446  using GO = global_ordinal_type;
7447  using crs_matrix_type =
7449  const char errPfx[] = "Tpetra::CrsMatrix::add: ";
7450 
7451  const bool debug = Details::Behavior::debug("CrsMatrix");
7452  const bool verbose = Details::Behavior::verbose("CrsMatrix");
7453  std::unique_ptr<std::string> prefix;
7454  if (verbose) {
7455  prefix = this->createPrefix("CrsMatrix", "add");
7456  std::ostringstream os;
7457  os << *prefix << "Start" << endl;
7458  std::cerr << os.str ();
7459  }
7460 
7461  const crs_matrix_type& B = *this; // a convenient abbreviation
7462  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
7463  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one();
7464 
7465  // If the user didn't supply a domain or range Map, then try to
7466  // get one from B first (if it has them), then from A (if it has
7467  // them). If we don't have any domain or range Maps, scold the
7468  // user.
7469  RCP<const map_type> A_domainMap = A.getDomainMap ();
7470  RCP<const map_type> A_rangeMap = A.getRangeMap ();
7471  RCP<const map_type> B_domainMap = B.getDomainMap ();
7472  RCP<const map_type> B_rangeMap = B.getRangeMap ();
7473 
7474  RCP<const map_type> theDomainMap = domainMap;
7475  RCP<const map_type> theRangeMap = rangeMap;
7476 
7477  if (domainMap.is_null ()) {
7478  if (B_domainMap.is_null ()) {
7479  TEUCHOS_TEST_FOR_EXCEPTION(
7480  A_domainMap.is_null (), std::invalid_argument,
7481  "Tpetra::CrsMatrix::add: If neither A nor B have a domain Map, "
7482  "then you must supply a nonnull domain Map to this method.");
7483  theDomainMap = A_domainMap;
7484  } else {
7485  theDomainMap = B_domainMap;
7486  }
7487  }
7488  if (rangeMap.is_null ()) {
7489  if (B_rangeMap.is_null ()) {
7490  TEUCHOS_TEST_FOR_EXCEPTION(
7491  A_rangeMap.is_null (), std::invalid_argument,
7492  "Tpetra::CrsMatrix::add: If neither A nor B have a range Map, "
7493  "then you must supply a nonnull range Map to this method.");
7494  theRangeMap = A_rangeMap;
7495  } else {
7496  theRangeMap = B_rangeMap;
7497  }
7498  }
7499 
7500  if (debug) {
7501  // In debug mode, check that A and B have matching domain and
7502  // range Maps, if they have domain and range Maps at all. (If
7503  // they aren't fill complete, then they may not yet have them.)
7504  if (! A_domainMap.is_null() && ! A_rangeMap.is_null()) {
7505  if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7506  TEUCHOS_TEST_FOR_EXCEPTION
7507  (! B_domainMap->isSameAs(*A_domainMap),
7508  std::invalid_argument,
7509  errPfx << "The input RowMatrix A must have a domain Map "
7510  "which is the same as (isSameAs) this RowMatrix's "
7511  "domain Map.");
7512  TEUCHOS_TEST_FOR_EXCEPTION
7513  (! B_rangeMap->isSameAs(*A_rangeMap), std::invalid_argument,
7514  errPfx << "The input RowMatrix A must have a range Map "
7515  "which is the same as (isSameAs) this RowMatrix's range "
7516  "Map.");
7517  TEUCHOS_TEST_FOR_EXCEPTION
7518  (! domainMap.is_null() &&
7519  ! domainMap->isSameAs(*B_domainMap),
7520  std::invalid_argument,
7521  errPfx << "The input domain Map must be the same as "
7522  "(isSameAs) this RowMatrix's domain Map.");
7523  TEUCHOS_TEST_FOR_EXCEPTION
7524  (! rangeMap.is_null() &&
7525  ! rangeMap->isSameAs(*B_rangeMap),
7526  std::invalid_argument,
7527  errPfx << "The input range Map must be the same as "
7528  "(isSameAs) this RowMatrix's range Map.");
7529  }
7530  }
7531  else if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7532  TEUCHOS_TEST_FOR_EXCEPTION
7533  (! domainMap.is_null() &&
7534  ! domainMap->isSameAs(*B_domainMap),
7535  std::invalid_argument,
7536  errPfx << "The input domain Map must be the same as "
7537  "(isSameAs) this RowMatrix's domain Map.");
7538  TEUCHOS_TEST_FOR_EXCEPTION
7539  (! rangeMap.is_null() && ! rangeMap->isSameAs(*B_rangeMap),
7540  std::invalid_argument,
7541  errPfx << "The input range Map must be the same as "
7542  "(isSameAs) this RowMatrix's range Map.");
7543  }
7544  else {
7545  TEUCHOS_TEST_FOR_EXCEPTION
7546  (domainMap.is_null() || rangeMap.is_null(),
7547  std::invalid_argument, errPfx << "If neither A nor B "
7548  "have a domain and range Map, then you must supply a "
7549  "nonnull domain and range Map to this method.");
7550  }
7551  }
7552 
7553  // What parameters do we pass to C's constructor? Do we call
7554  // fillComplete on C after filling it? And if so, what parameters
7555  // do we pass to C's fillComplete call?
7556  bool callFillComplete = true;
7557  RCP<ParameterList> constructorSublist;
7558  RCP<ParameterList> fillCompleteSublist;
7559  if (! params.is_null()) {
7560  callFillComplete =
7561  params->get("Call fillComplete", callFillComplete);
7562  constructorSublist = sublist(params, "Constructor parameters");
7563  fillCompleteSublist = sublist(params, "fillComplete parameters");
7564  }
7565 
7566  RCP<const map_type> A_rowMap = A.getRowMap ();
7567  RCP<const map_type> B_rowMap = B.getRowMap ();
7568  RCP<const map_type> C_rowMap = B_rowMap; // see discussion in documentation
7569  RCP<crs_matrix_type> C; // The result matrix.
7570 
7571  // If A and B's row Maps are the same, we can compute an upper
7572  // bound on the number of entries in each row of C, before
7573  // actually computing the sum. A reasonable upper bound is the
7574  // sum of the two entry counts in each row.
7575  if (A_rowMap->isSameAs (*B_rowMap)) {
7576  const LO localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7577  Array<size_t> C_maxNumEntriesPerRow (localNumRows, 0);
7578 
7579  // Get the number of entries in each row of A.
7580  if (alpha != ZERO) {
7581  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7582  const size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7583  C_maxNumEntriesPerRow[localRow] += A_numEntries;
7584  }
7585  }
7586  // Get the number of entries in each row of B.
7587  if (beta != ZERO) {
7588  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7589  const size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7590  C_maxNumEntriesPerRow[localRow] += B_numEntries;
7591  }
7592  }
7593  // Construct the result matrix C.
7594  if (constructorSublist.is_null ()) {
7595  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow ()));
7596  } else {
7597  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow (),
7598  constructorSublist));
7599  }
7600  // Since A and B have the same row Maps, we could add them
7601  // together all at once and merge values before we call
7602  // insertGlobalValues. However, we don't really need to, since
7603  // we've already allocated enough space in each row of C for C
7604  // to do the merge itself.
7605  }
7606  else { // the row Maps of A and B are not the same
7607  // Construct the result matrix C.
7608  // true: !A_rowMap->isSameAs (*B_rowMap)
7609  TEUCHOS_TEST_FOR_EXCEPTION
7610  (true, std::invalid_argument, errPfx << "The row maps must "
7611  "be the same for statically allocated matrices, to ensure "
7612  "that there is sufficient space to do the addition.");
7613  }
7614 
7615  TEUCHOS_TEST_FOR_EXCEPTION
7616  (C.is_null (), std::logic_error,
7617  errPfx << "C should not be null at this point. "
7618  "Please report this bug to the Tpetra developers.");
7619 
7620  if (verbose) {
7621  std::ostringstream os;
7622  os << *prefix << "Compute C = alpha*A + beta*B" << endl;
7623  std::cerr << os.str ();
7624  }
7625  using gids_type = nonconst_global_inds_host_view_type;
7626  using vals_type = nonconst_values_host_view_type;
7627  gids_type ind;
7628  vals_type val;
7629 
7630  if (alpha != ZERO) {
7631  const LO A_localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7632  for (LO localRow = 0; localRow < A_localNumRows; ++localRow) {
7633  size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7634  const GO globalRow = A_rowMap->getGlobalElement (localRow);
7635  if (A_numEntries > static_cast<size_t> (ind.size ())) {
7636  Kokkos::resize(ind,A_numEntries);
7637  Kokkos::resize(val,A_numEntries);
7638  }
7639  gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, A_numEntries));
7640  vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, A_numEntries));
7641  A.getGlobalRowCopy (globalRow, indView, valView, A_numEntries);
7642 
7643  if (alpha != ONE) {
7644  for (size_t k = 0; k < A_numEntries; ++k) {
7645  valView[k] *= alpha;
7646  }
7647  }
7648  C->insertGlobalValues (globalRow, A_numEntries,
7649  reinterpret_cast<Scalar *>(valView.data()),
7650  indView.data());
7651  }
7652  }
7653 
7654  if (beta != ZERO) {
7655  const LO B_localNumRows = static_cast<LO> (B_rowMap->getLocalNumElements ());
7656  for (LO localRow = 0; localRow < B_localNumRows; ++localRow) {
7657  size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7658  const GO globalRow = B_rowMap->getGlobalElement (localRow);
7659  if (B_numEntries > static_cast<size_t> (ind.size ())) {
7660  Kokkos::resize(ind,B_numEntries);
7661  Kokkos::resize(val,B_numEntries);
7662  }
7663  gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, B_numEntries));
7664  vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, B_numEntries));
7665  B.getGlobalRowCopy (globalRow, indView, valView, B_numEntries);
7666 
7667  if (beta != ONE) {
7668  for (size_t k = 0; k < B_numEntries; ++k) {
7669  valView[k] *= beta;
7670  }
7671  }
7672  C->insertGlobalValues (globalRow, B_numEntries,
7673  reinterpret_cast<Scalar *>(valView.data()),
7674  indView.data());
7675  }
7676  }
7677 
7678  if (callFillComplete) {
7679  if (verbose) {
7680  std::ostringstream os;
7681  os << *prefix << "Call fillComplete on C" << endl;
7682  std::cerr << os.str ();
7683  }
7684  if (fillCompleteSublist.is_null ()) {
7685  C->fillComplete (theDomainMap, theRangeMap);
7686  } else {
7687  C->fillComplete (theDomainMap, theRangeMap, fillCompleteSublist);
7688  }
7689  }
7690  else if (verbose) {
7691  std::ostringstream os;
7692  os << *prefix << "Do NOT call fillComplete on C" << endl;
7693  std::cerr << os.str ();
7694  }
7695 
7696  if (verbose) {
7697  std::ostringstream os;
7698  os << *prefix << "Done" << endl;
7699  std::cerr << os.str ();
7700  }
7701  return rcp_implicit_cast<row_matrix_type> (C);
7702  }
7703 
7704 
7705 
7706  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7707  void
7710  const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& rowTransfer,
7711  const Teuchos::RCP<const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node> > & domainTransfer,
7712  const Teuchos::RCP<const map_type>& domainMap,
7713  const Teuchos::RCP<const map_type>& rangeMap,
7714  const Teuchos::RCP<Teuchos::ParameterList>& params) const
7715  {
7716  using Details::Behavior;
7721  using Teuchos::ArrayRCP;
7722  using Teuchos::ArrayView;
7723  using Teuchos::Comm;
7724  using Teuchos::ParameterList;
7725  using Teuchos::RCP;
7726  using std::endl;
7727  typedef LocalOrdinal LO;
7728  typedef GlobalOrdinal GO;
7729  typedef node_type NT;
7730  typedef CrsMatrix<Scalar, LO, GO, NT> this_CRS_type;
7731  typedef Vector<int, LO, GO, NT> IntVectorType;
7732  using Teuchos::as;
7733 
7734  const bool debug = Behavior::debug("CrsMatrix");
7735  const bool verbose = Behavior::verbose("CrsMatrix");
7736  int MyPID = getComm ()->getRank ();
7737 
7738  std::unique_ptr<std::string> verbosePrefix;
7739  if (verbose) {
7740  verbosePrefix =
7741  this->createPrefix("CrsMatrix", "transferAndFillComplete");
7742  std::ostringstream os;
7743  os << "Start" << endl;
7744  std::cerr << os.str();
7745  }
7746 
7747  //
7748  // Get the caller's parameters
7749  //
7750  bool isMM = false; // optimize for matrix-matrix ops.
7751  bool reverseMode = false; // Are we in reverse mode?
7752  bool restrictComm = false; // Do we need to restrict the communicator?
7753 
7754  int mm_optimization_core_count =
7755  Behavior::TAFC_OptimizationCoreCount();
7756  RCP<ParameterList> matrixparams; // parameters for the destination matrix
7757  bool overrideAllreduce = false;
7758  bool useKokkosPath = false;
7759  if (! params.is_null ()) {
7760  matrixparams = sublist (params, "CrsMatrix");
7761  reverseMode = params->get ("Reverse Mode", reverseMode);
7762  useKokkosPath = params->get ("TAFC: use kokkos path", useKokkosPath);
7763  restrictComm = params->get ("Restrict Communicator", restrictComm);
7764  auto & slist = params->sublist("matrixmatrix: kernel params",false);
7765  isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
7766  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
7767 
7768  overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
7769  if(getComm()->getSize() < mm_optimization_core_count && isMM) isMM = false;
7770  if(reverseMode) isMM = false;
7771  }
7772 
7773  // Only used in the sparse matrix-matrix multiply (isMM) case.
7774  std::shared_ptr< ::Tpetra::Details::CommRequest> iallreduceRequest;
7775  int mismatch = 0;
7776  int reduced_mismatch = 0;
7777  if (isMM && !overrideAllreduce) {
7778 
7779  // Test for pathological matrix transfer
7780  const bool source_vals = ! getGraph ()->getImporter ().is_null();
7781  const bool target_vals = ! (rowTransfer.getExportLIDs ().size() == 0 ||
7782  rowTransfer.getRemoteLIDs ().size() == 0);
7783  mismatch = (source_vals != target_vals) ? 1 : 0;
7784  iallreduceRequest =
7785  ::Tpetra::Details::iallreduce (mismatch, reduced_mismatch,
7786  Teuchos::REDUCE_MAX, * (getComm ()));
7787  }
7788 
7789 #ifdef HAVE_TPETRA_MMM_TIMINGS
7790  using Teuchos::TimeMonitor;
7791  std::string label;
7792  if(!params.is_null())
7793  label = params->get("Timer Label",label);
7794  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
7795  std::string tlstr;
7796  {
7797  std::ostringstream os;
7798  if(isMM) os<<":MMOpt";
7799  else os<<":MMLegacy";
7800  tlstr = os.str();
7801  }
7802 
7803  Teuchos::TimeMonitor MMall(*TimeMonitor::getNewTimer(prefix + std::string("TAFC All") +tlstr ));
7804 #endif
7805 
7806  // Make sure that the input argument rowTransfer is either an
7807  // Import or an Export. Import and Export are the only two
7808  // subclasses of Transfer that we defined, but users might
7809  // (unwisely, for now at least) decide to implement their own
7810  // subclasses. Exclude this possibility.
7811  const import_type* xferAsImport = dynamic_cast<const import_type*> (&rowTransfer);
7812  const export_type* xferAsExport = dynamic_cast<const export_type*> (&rowTransfer);
7813  TEUCHOS_TEST_FOR_EXCEPTION(
7814  xferAsImport == nullptr && xferAsExport == nullptr, std::invalid_argument,
7815  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' input "
7816  "argument must be either an Import or an Export, and its template "
7817  "parameters must match the corresponding template parameters of the "
7818  "CrsMatrix.");
7819 
7820  // Make sure that the input argument domainTransfer is either an
7821  // Import or an Export. Import and Export are the only two
7822  // subclasses of Transfer that we defined, but users might
7823  // (unwisely, for now at least) decide to implement their own
7824  // subclasses. Exclude this possibility.
7825  Teuchos::RCP<const import_type> xferDomainAsImport = Teuchos::rcp_dynamic_cast<const import_type> (domainTransfer);
7826  Teuchos::RCP<const export_type> xferDomainAsExport = Teuchos::rcp_dynamic_cast<const export_type> (domainTransfer);
7827 
7828  if(! domainTransfer.is_null()) {
7829  TEUCHOS_TEST_FOR_EXCEPTION(
7830  (xferDomainAsImport.is_null() && xferDomainAsExport.is_null()), std::invalid_argument,
7831  "Tpetra::CrsMatrix::transferAndFillComplete: The 'domainTransfer' input "
7832  "argument must be either an Import or an Export, and its template "
7833  "parameters must match the corresponding template parameters of the "
7834  "CrsMatrix.");
7835 
7836  TEUCHOS_TEST_FOR_EXCEPTION(
7837  ( xferAsImport != nullptr || ! xferDomainAsImport.is_null() ) &&
7838  (( xferAsImport != nullptr && xferDomainAsImport.is_null() ) ||
7839  ( xferAsImport == nullptr && ! xferDomainAsImport.is_null() )), std::invalid_argument,
7840  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7841  "arguments must be of the same type (either Import or Export).");
7842 
7843  TEUCHOS_TEST_FOR_EXCEPTION(
7844  ( xferAsExport != nullptr || ! xferDomainAsExport.is_null() ) &&
7845  (( xferAsExport != nullptr && xferDomainAsExport.is_null() ) ||
7846  ( xferAsExport == nullptr && ! xferDomainAsExport.is_null() )), std::invalid_argument,
7847  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7848  "arguments must be of the same type (either Import or Export).");
7849  } // domainTransfer != null
7850 
7851 
7852  // FIXME (mfh 15 May 2014) Wouldn't communication still be needed,
7853  // if the source Map is not distributed but the target Map is?
7854  const bool communication_needed = rowTransfer.getSourceMap ()->isDistributed ();
7855 
7856  // Get the new domain and range Maps. We need some of them for
7857  // error checking, now that we have the reverseMode parameter.
7858  RCP<const map_type> MyRowMap = reverseMode ?
7859  rowTransfer.getSourceMap () : rowTransfer.getTargetMap ();
7860  RCP<const map_type> MyColMap; // create this below
7861  RCP<const map_type> MyDomainMap = ! domainMap.is_null () ?
7862  domainMap : getDomainMap ();
7863  RCP<const map_type> MyRangeMap = ! rangeMap.is_null () ?
7864  rangeMap : getRangeMap ();
7865  RCP<const map_type> BaseRowMap = MyRowMap;
7866  RCP<const map_type> BaseDomainMap = MyDomainMap;
7867 
7868  // If the user gave us a nonnull destMat, then check whether it's
7869  // "pristine." That means that it has no entries.
7870  //
7871  // FIXME (mfh 15 May 2014) If this is not true on all processes,
7872  // then this exception test may hang. It would be better to
7873  // forward an error flag to the next communication phase.
7874  if (! destMat.is_null ()) {
7875  // FIXME (mfh 15 May 2014): The Epetra idiom for checking
7876  // whether a graph or matrix has no entries on the calling
7877  // process, is that it is neither locally nor globally indexed.
7878  // This may change eventually with the Kokkos refactor version
7879  // of Tpetra, so it would be better just to check the quantity
7880  // of interest directly. Note that with the Kokkos refactor
7881  // version of Tpetra, asking for the total number of entries in
7882  // a graph or matrix that is not fill complete might require
7883  // computation (kernel launch), since it is not thread scalable
7884  // to update a count every time an entry is inserted.
7885  const bool NewFlag = ! destMat->getGraph ()->isLocallyIndexed () &&
7886  ! destMat->getGraph ()->isGloballyIndexed ();
7887  TEUCHOS_TEST_FOR_EXCEPTION(
7888  ! NewFlag, std::invalid_argument, "Tpetra::CrsMatrix::"
7889  "transferAndFillComplete: The input argument 'destMat' is only allowed "
7890  "to be nonnull, if its graph is empty (neither locally nor globally "
7891  "indexed).");
7892  // FIXME (mfh 15 May 2014) At some point, we want to change
7893  // graphs and matrices so that their DistObject Map
7894  // (this->getMap()) may differ from their row Map. This will
7895  // make redistribution for 2-D distributions more efficient. I
7896  // hesitate to change this check, because I'm not sure how much
7897  // the code here depends on getMap() and getRowMap() being the
7898  // same.
7899  TEUCHOS_TEST_FOR_EXCEPTION(
7900  ! destMat->getRowMap ()->isSameAs (*MyRowMap), std::invalid_argument,
7901  "Tpetra::CrsMatrix::transferAndFillComplete: The (row) Map of the "
7902  "input argument 'destMat' is not the same as the (row) Map specified "
7903  "by the input argument 'rowTransfer'.");
7904  TEUCHOS_TEST_FOR_EXCEPTION(
7905  ! destMat->checkSizes (*this), std::invalid_argument,
7906  "Tpetra::CrsMatrix::transferAndFillComplete: You provided a nonnull "
7907  "destination matrix, but checkSizes() indicates that it is not a legal "
7908  "legal target for redistribution from the source matrix (*this). This "
7909  "may mean that they do not have the same dimensions.");
7910  }
7911 
7912  // If forward mode (the default), then *this's (row) Map must be
7913  // the same as the source Map of the Transfer. If reverse mode,
7914  // then *this's (row) Map must be the same as the target Map of
7915  // the Transfer.
7916  //
7917  // FIXME (mfh 15 May 2014) At some point, we want to change graphs
7918  // and matrices so that their DistObject Map (this->getMap()) may
7919  // differ from their row Map. This will make redistribution for
7920  // 2-D distributions more efficient. I hesitate to change this
7921  // check, because I'm not sure how much the code here depends on
7922  // getMap() and getRowMap() being the same.
7923  TEUCHOS_TEST_FOR_EXCEPTION(
7924  ! (reverseMode || getRowMap ()->isSameAs (*rowTransfer.getSourceMap ())),
7925  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7926  "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode.");
7927  TEUCHOS_TEST_FOR_EXCEPTION(
7928  ! (! reverseMode || getRowMap ()->isSameAs (*rowTransfer.getTargetMap ())),
7929  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7930  "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode.");
7931 
7932  // checks for domainTransfer
7933  TEUCHOS_TEST_FOR_EXCEPTION(
7934  ! xferDomainAsImport.is_null() && ! xferDomainAsImport->getTargetMap()->isSameAs(*domainMap),
7935  std::invalid_argument,
7936  "Tpetra::CrsMatrix::transferAndFillComplete: The target map of the 'domainTransfer' input "
7937  "argument must be the same as the rebalanced domain map 'domainMap'");
7938 
7939  TEUCHOS_TEST_FOR_EXCEPTION(
7940  ! xferDomainAsExport.is_null() && ! xferDomainAsExport->getSourceMap()->isSameAs(*domainMap),
7941  std::invalid_argument,
7942  "Tpetra::CrsMatrix::transferAndFillComplete: The source map of the 'domainTransfer' input "
7943  "argument must be the same as the rebalanced domain map 'domainMap'");
7944 
7945  // The basic algorithm here is:
7946  //
7947  // 1. Call the moral equivalent of "Distor.do" to handle the import.
7948  // 2. Copy all the Imported and Copy/Permuted data into the raw
7949  // CrsMatrix / CrsGraphData pointers, still using GIDs.
7950  // 3. Call an optimized version of MakeColMap that avoids the
7951  // Directory lookups (since the importer knows who owns all the
7952  // GIDs) AND reindexes to LIDs.
7953  // 4. Call expertStaticFillComplete()
7954 
7955  // Get information from the Importer
7956  const size_t NumSameIDs = rowTransfer.getNumSameIDs();
7957  ArrayView<const LO> ExportLIDs = reverseMode ?
7958  rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs ();
7959  auto RemoteLIDs = reverseMode ?
7960  rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv();
7961  auto PermuteToLIDs = reverseMode ?
7962  rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv();
7963  auto PermuteFromLIDs = reverseMode ?
7964  rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv();
7965  Distributor& Distor = rowTransfer.getDistributor ();
7966 
7967  // Owning PIDs
7968  Teuchos::Array<int> SourcePids;
7969 
7970  // Temp variables for sub-communicators
7971  RCP<const map_type> ReducedRowMap, ReducedColMap,
7972  ReducedDomainMap, ReducedRangeMap;
7973  RCP<const Comm<int> > ReducedComm;
7974 
7975  // If the user gave us a null destMat, then construct the new
7976  // destination matrix. We will replace its column Map later.
7977  if (destMat.is_null ()) {
7978  destMat = rcp (new this_CRS_type (MyRowMap, 0, matrixparams));
7979  }
7980 
7981  /***************************************************/
7982  /***** 1) First communicator restriction phase ****/
7983  /***************************************************/
7984  if (restrictComm) {
7985 #ifdef HAVE_TPETRA_MMM_TIMINGS
7986  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrictComm")));
7987 #endif
7988  ReducedRowMap = MyRowMap->removeEmptyProcesses ();
7989  ReducedComm = ReducedRowMap.is_null () ?
7990  Teuchos::null :
7991  ReducedRowMap->getComm ();
7992  destMat->removeEmptyProcessesInPlace (ReducedRowMap);
7993 
7994  ReducedDomainMap = MyRowMap.getRawPtr () == MyDomainMap.getRawPtr () ?
7995  ReducedRowMap :
7996  MyDomainMap->replaceCommWithSubset (ReducedComm);
7997  ReducedRangeMap = MyRowMap.getRawPtr () == MyRangeMap.getRawPtr () ?
7998  ReducedRowMap :
7999  MyRangeMap->replaceCommWithSubset (ReducedComm);
8000 
8001  // Reset the "my" maps
8002  MyRowMap = ReducedRowMap;
8003  MyDomainMap = ReducedDomainMap;
8004  MyRangeMap = ReducedRangeMap;
8005 
8006  // Update my PID, if we've restricted the communicator
8007  if (! ReducedComm.is_null ()) {
8008  MyPID = ReducedComm->getRank ();
8009  }
8010  else {
8011  MyPID = -2; // For debugging
8012  }
8013  }
8014  else {
8015  ReducedComm = MyRowMap->getComm ();
8016  }
8017 
8018 
8019 
8020  /***************************************************/
8021  /***** 2) From Tpetra::DistObject::doTransfer() ****/
8022  /***************************************************/
8023  // Get the owning PIDs
8024  RCP<const import_type> MyImporter = getGraph ()->getImporter ();
8025 
8026  // check whether domain maps of source matrix and base domain map is the same
8027  bool bSameDomainMap = BaseDomainMap->isSameAs (*getDomainMap ());
8028 
8029  if (! restrictComm && ! MyImporter.is_null () && bSameDomainMap ) {
8030 #ifdef HAVE_TPETRA_MMM_TIMINGS
8031  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs same map")));
8032 #endif
8033  // Same domain map as source matrix
8034  //
8035  // NOTE: This won't work for restrictComm (because the Import
8036  // doesn't know the restricted PIDs), though writing an
8037  // optimized version for that case would be easy (Import an
8038  // IntVector of the new PIDs). Might want to add this later.
8039  Import_Util::getPids (*MyImporter, SourcePids, false);
8040  }
8041  else if (restrictComm && ! MyImporter.is_null () && bSameDomainMap) {
8042  // Same domain map as source matrix (restricted communicator)
8043  // We need one import from the domain to the column map
8044 #ifdef HAVE_TPETRA_MMM_TIMINGS
8045  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs restricted comm")));
8046 #endif
8047  IntVectorType SourceDomain_pids(getDomainMap (),true);
8048  IntVectorType SourceCol_pids(getColMap());
8049  // SourceDomain_pids contains the restricted pids
8050  SourceDomain_pids.putScalar(MyPID);
8051 
8052  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8053  SourcePids.resize (getColMap ()->getLocalNumElements ());
8054  SourceCol_pids.get1dCopy (SourcePids ());
8055  }
8056  else if (MyImporter.is_null ()) {
8057  // Matrix has no off-process entries
8058 #ifdef HAVE_TPETRA_MMM_TIMINGS
8059  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs all local entries")));
8060 #endif
8061  SourcePids.resize (getColMap ()->getLocalNumElements ());
8062  SourcePids.assign (getColMap ()->getLocalNumElements (), MyPID);
8063  }
8064  else if ( ! MyImporter.is_null () &&
8065  ! domainTransfer.is_null () ) {
8066  // general implementation for rectangular matrices with
8067  // domain map different than SourceMatrix domain map.
8068  // User has to provide a DomainTransfer object. We need
8069  // to communications (import/export)
8070 #ifdef HAVE_TPETRA_MMM_TIMINGS
8071  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs rectangular case")));
8072 #endif
8073 
8074  // TargetDomain_pids lives on the rebalanced new domain map
8075  IntVectorType TargetDomain_pids (domainMap);
8076  TargetDomain_pids.putScalar (MyPID);
8077 
8078  // SourceDomain_pids lives on the non-rebalanced old domain map
8079  IntVectorType SourceDomain_pids (getDomainMap ());
8080 
8081  // SourceCol_pids lives on the non-rebalanced old column map
8082  IntVectorType SourceCol_pids (getColMap ());
8083 
8084  if (! reverseMode && ! xferDomainAsImport.is_null() ) {
8085  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8086  }
8087  else if (reverseMode && ! xferDomainAsExport.is_null() ) {
8088  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8089  }
8090  else if (! reverseMode && ! xferDomainAsExport.is_null() ) {
8091  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8092  }
8093  else if (reverseMode && ! xferDomainAsImport.is_null() ) {
8094  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8095  }
8096  else {
8097  TEUCHOS_TEST_FOR_EXCEPTION(
8098  true, std::logic_error, "Tpetra::CrsMatrix::"
8099  "transferAndFillComplete: Should never get here! "
8100  "Please report this bug to a Tpetra developer.");
8101  }
8102  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8103  SourcePids.resize (getColMap ()->getLocalNumElements ());
8104  SourceCol_pids.get1dCopy (SourcePids ());
8105  }
8106  else if ( ! MyImporter.is_null () &&
8107  BaseDomainMap->isSameAs (*BaseRowMap) &&
8108  getDomainMap ()->isSameAs (*getRowMap ())) {
8109  // We can use the rowTransfer + SourceMatrix's Import to find out who owns what.
8110 #ifdef HAVE_TPETRA_MMM_TIMINGS
8111  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs query import")));
8112 #endif
8113 
8114  IntVectorType TargetRow_pids (domainMap);
8115  IntVectorType SourceRow_pids (getRowMap ());
8116  IntVectorType SourceCol_pids (getColMap ());
8117 
8118  TargetRow_pids.putScalar (MyPID);
8119  if (! reverseMode && xferAsImport != nullptr) {
8120  SourceRow_pids.doExport (TargetRow_pids, *xferAsImport, INSERT);
8121  }
8122  else if (reverseMode && xferAsExport != nullptr) {
8123  SourceRow_pids.doExport (TargetRow_pids, *xferAsExport, INSERT);
8124  }
8125  else if (! reverseMode && xferAsExport != nullptr) {
8126  SourceRow_pids.doImport (TargetRow_pids, *xferAsExport, INSERT);
8127  }
8128  else if (reverseMode && xferAsImport != nullptr) {
8129  SourceRow_pids.doImport (TargetRow_pids, *xferAsImport, INSERT);
8130  }
8131  else {
8132  TEUCHOS_TEST_FOR_EXCEPTION(
8133  true, std::logic_error, "Tpetra::CrsMatrix::"
8134  "transferAndFillComplete: Should never get here! "
8135  "Please report this bug to a Tpetra developer.");
8136  }
8137 
8138  SourceCol_pids.doImport (SourceRow_pids, *MyImporter, INSERT);
8139  SourcePids.resize (getColMap ()->getLocalNumElements ());
8140  SourceCol_pids.get1dCopy (SourcePids ());
8141  }
8142  else {
8143  TEUCHOS_TEST_FOR_EXCEPTION(
8144  true, std::invalid_argument, "Tpetra::CrsMatrix::"
8145  "transferAndFillComplete: This method only allows either domainMap == "
8146  "getDomainMap (), or (domainMap == rowTransfer.getTargetMap () and "
8147  "getDomainMap () == getRowMap ()).");
8148  }
8149 
8150  // Tpetra-specific stuff
8151  size_t constantNumPackets = destMat->constantNumberOfPackets ();
8152  {
8153 #ifdef HAVE_TPETRA_MMM_TIMINGS
8154  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC reallocate buffers")));
8155 #endif
8156  if (constantNumPackets == 0) {
8157  destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (),
8158  RemoteLIDs.view_host().size ());
8159  }
8160  else {
8161  // There are a constant number of packets per element. We
8162  // already know (from the number of "remote" (incoming)
8163  // elements) how many incoming elements we expect, so we can
8164  // resize the buffer accordingly.
8165  const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets;
8166  destMat->reallocImportsIfNeeded (rbufLen, false, nullptr);
8167  }
8168  }
8169 
8170  // Pack & Prepare w/ owning PIDs
8171  {
8172 #ifdef HAVE_TPETRA_MMM_TIMINGS
8173  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC pack and prepare")));
8174 #endif
8175  if (debug) {
8176  using Teuchos::outArg;
8177  using Teuchos::REDUCE_MAX;
8178  using Teuchos::reduceAll;
8179  using std::cerr;
8180  using std::endl;
8181  RCP<const Teuchos::Comm<int> > comm = this->getComm ();
8182  const int myRank = comm->getRank ();
8183 
8184  std::ostringstream errStrm;
8185  int lclErr = 0;
8186  int gblErr = 0;
8187 
8188  Teuchos::ArrayView<size_t> numExportPacketsPerLID;
8189  try {
8190  // packAndPrepare* methods modify numExportPacketsPerLID_.
8191  destMat->numExportPacketsPerLID_.modify_host ();
8192  numExportPacketsPerLID =
8193  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8194  }
8195  catch (std::exception& e) {
8196  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw: "
8197  << e.what () << std::endl;
8198  lclErr = 1;
8199  }
8200  catch (...) {
8201  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw "
8202  "an exception not a subclass of std::exception" << std::endl;
8203  lclErr = 1;
8204  }
8205 
8206  if (! comm.is_null ()) {
8207  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8208  }
8209  if (gblErr != 0) {
8210  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8211  TEUCHOS_TEST_FOR_EXCEPTION(
8212  true, std::runtime_error, "getArrayViewFromDualView threw an "
8213  "exception on at least one process.");
8214  }
8215 
8216  if (verbose) {
8217  std::ostringstream os;
8218  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8219  << std::endl;
8220  std::cerr << os.str ();
8221  }
8222  try {
8224  destMat->exports_,
8225  numExportPacketsPerLID,
8226  ExportLIDs,
8227  SourcePids,
8228  constantNumPackets);
8229  }
8230  catch (std::exception& e) {
8231  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw: "
8232  << e.what () << std::endl;
8233  lclErr = 1;
8234  }
8235  catch (...) {
8236  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw "
8237  "an exception not a subclass of std::exception" << std::endl;
8238  lclErr = 1;
8239  }
8240 
8241  if (verbose) {
8242  std::ostringstream os;
8243  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8244  << std::endl;
8245  std::cerr << os.str ();
8246  }
8247 
8248  if (! comm.is_null ()) {
8249  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8250  }
8251  if (gblErr != 0) {
8252  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8253  TEUCHOS_TEST_FOR_EXCEPTION(
8254  true, std::runtime_error, "packCrsMatrixWithOwningPIDs threw an "
8255  "exception on at least one process.");
8256  }
8257  }
8258  else {
8259  // packAndPrepare* methods modify numExportPacketsPerLID_.
8260  destMat->numExportPacketsPerLID_.modify_host ();
8261  Teuchos::ArrayView<size_t> numExportPacketsPerLID =
8262  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8263  if (verbose) {
8264  std::ostringstream os;
8265  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8266  << std::endl;
8267  std::cerr << os.str ();
8268  }
8270  destMat->exports_,
8271  numExportPacketsPerLID,
8272  ExportLIDs,
8273  SourcePids,
8274  constantNumPackets);
8275  if (verbose) {
8276  std::ostringstream os;
8277  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8278  << std::endl;
8279  std::cerr << os.str ();
8280  }
8281  }
8282  }
8283 
8284  // Do the exchange of remote data.
8285  {
8286 #ifdef HAVE_TPETRA_MMM_TIMINGS
8287  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs exchange remote data")));
8288 #endif
8289  if (! communication_needed) {
8290  if (verbose) {
8291  std::ostringstream os;
8292  os << *verbosePrefix << "Communication not needed" << std::endl;
8293  std::cerr << os.str ();
8294  }
8295  }
8296  else {
8297  if (reverseMode) {
8298  if (constantNumPackets == 0) { // variable number of packets per LID
8299  if (verbose) {
8300  std::ostringstream os;
8301  os << *verbosePrefix << "Reverse mode, variable # packets / LID"
8302  << std::endl;
8303  std::cerr << os.str ();
8304  }
8305  destMat->numExportPacketsPerLID_.sync_device();
8306  auto numExportPacketsPerLID = destMat->numExportPacketsPerLID_.view_device();
8307  auto numImportPacketsPerLID = destMat->numImportPacketsPerLID_.view_device();
8308  if (verbose) {
8309  std::ostringstream os;
8310  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8311  << std::endl;
8312  std::cerr << os.str ();
8313  }
8314  Distor.doReversePostsAndWaits(numExportPacketsPerLID, 1, numImportPacketsPerLID);
8315  if (verbose) {
8316  std::ostringstream os;
8317  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8318  << std::endl;
8319  std::cerr << os.str ();
8320  }
8321 
8322  size_t totalImportPackets = Kokkos::Experimental::reduce(typename Node::execution_space(), numImportPacketsPerLID);
8323 
8324  // Reallocation MUST go before setting the modified flag,
8325  // because it may clear out the flags.
8326  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8327  verbosePrefix.get ());
8328  destMat->imports_.modify_host ();
8329  auto deviceImports = destMat->imports_.view_device();
8330  auto deviceExports = destMat->exports_.view_device();
8331  if (verbose) {
8332  std::ostringstream os;
8333  os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaitsKokkos"
8334  << std::endl;
8335  std::cerr << os.str ();
8336  }
8337  destMat->imports_.sync_device();
8338  Distor.doReversePostsAndWaitsKokkos (deviceExports, numExportPacketsPerLID, deviceImports, numImportPacketsPerLID);
8339  if (verbose) {
8340  std::ostringstream os;
8341  os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaitsKokkos"
8342  << std::endl;
8343  std::cerr << os.str ();
8344  }
8345  }
8346  else { // constant number of packets per LID
8347  if (verbose) {
8348  std::ostringstream os;
8349  os << *verbosePrefix << "Reverse mode, constant # packets / LID"
8350  << std::endl;
8351  std::cerr << os.str ();
8352  }
8353  destMat->imports_.modify_host ();
8354  auto hostImports = destMat->imports_.view_host();
8355  // This is a legacy host pack/unpack path, so use the host
8356  // version of exports_.
8357  destMat->exports_.sync_host ();
8358  auto hostExports = destMat->exports_.view_host();
8359  if (verbose) {
8360  std::ostringstream os;
8361  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8362  << std::endl;
8363  std::cerr << os.str ();
8364  }
8365  Distor.doReversePostsAndWaits (hostExports,
8366  constantNumPackets,
8367  hostImports);
8368  if (verbose) {
8369  std::ostringstream os;
8370  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8371  << std::endl;
8372  std::cerr << os.str ();
8373  }
8374  }
8375  }
8376  else { // forward mode (the default)
8377  if (constantNumPackets == 0) { // variable number of packets per LID
8378  if (verbose) {
8379  std::ostringstream os;
8380  os << *verbosePrefix << "Forward mode, variable # packets / LID"
8381  << std::endl;
8382  std::cerr << os.str ();
8383  }
8384  destMat->numExportPacketsPerLID_.sync_device ();
8385  auto numExportPacketsPerLID = destMat->numExportPacketsPerLID_.view_device();
8386  auto numImportPacketsPerLID = destMat->numImportPacketsPerLID_.view_device();
8387  if (verbose) {
8388  std::ostringstream os;
8389  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8390  << std::endl;
8391  std::cerr << os.str ();
8392  }
8393  Distor.doPostsAndWaits(numExportPacketsPerLID, 1, numImportPacketsPerLID);
8394  if (verbose) {
8395  std::ostringstream os;
8396  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8397  << std::endl;
8398  std::cerr << os.str ();
8399  }
8400 
8401  size_t totalImportPackets = Kokkos::Experimental::reduce(typename Node::execution_space(), numImportPacketsPerLID);
8402 
8403  // Reallocation MUST go before setting the modified flag,
8404  // because it may clear out the flags.
8405  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8406  verbosePrefix.get ());
8407  destMat->imports_.modify_host ();
8408  auto deviceImports = destMat->imports_.view_device();
8409  auto deviceExports = destMat->exports_.view_device();
8410  if (verbose) {
8411  std::ostringstream os;
8412  os << *verbosePrefix << "Calling 4-arg doPostsAndWaitsKokkos"
8413  << std::endl;
8414  std::cerr << os.str ();
8415  }
8416  destMat->imports_.sync_device ();
8417  Distor.doPostsAndWaitsKokkos (deviceExports, numExportPacketsPerLID, deviceImports, numImportPacketsPerLID);
8418  if (verbose) {
8419  std::ostringstream os;
8420  os << *verbosePrefix << "Finished 4-arg doPostsAndWaitsKokkos"
8421  << std::endl;
8422  std::cerr << os.str ();
8423  }
8424  }
8425  else { // constant number of packets per LID
8426  if (verbose) {
8427  std::ostringstream os;
8428  os << *verbosePrefix << "Forward mode, constant # packets / LID"
8429  << std::endl;
8430  std::cerr << os.str ();
8431  }
8432  destMat->imports_.modify_host ();
8433  auto hostImports = destMat->imports_.view_host();
8434  // This is a legacy host pack/unpack path, so use the host
8435  // version of exports_.
8436  destMat->exports_.sync_host ();
8437  auto hostExports = destMat->exports_.view_host();
8438  if (verbose) {
8439  std::ostringstream os;
8440  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8441  << std::endl;
8442  std::cerr << os.str ();
8443  }
8444  Distor.doPostsAndWaits (hostExports,
8445  constantNumPackets,
8446  hostImports);
8447  if (verbose) {
8448  std::ostringstream os;
8449  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8450  << std::endl;
8451  std::cerr << os.str ();
8452  }
8453  }
8454  }
8455  }
8456  }
8457 
8458  /*********************************************************************/
8459  /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
8460  /*********************************************************************/
8461 
8462  bool runOnHost = std::is_same_v<typename device_type::memory_space, Kokkos::HostSpace> && !useKokkosPath;
8463 
8464  Teuchos::Array<int> RemotePids;
8465  if (runOnHost) {
8466  Teuchos::Array<int> TargetPids;
8467 
8468 # ifdef HAVE_TPETRA_MMM_TIMINGS
8469  RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8470 # endif
8471  ArrayRCP<size_t> CSR_rowptr;
8472  ArrayRCP<GO> CSR_colind_GID;
8473  ArrayRCP<LO> CSR_colind_LID;
8474  ArrayRCP<Scalar> CSR_vals;
8475 
8476  destMat->imports_.sync_device ();
8477  destMat->numImportPacketsPerLID_.sync_device ();
8478 
8479  size_t N = BaseRowMap->getLocalNumElements ();
8480 
8481  auto RemoteLIDs_d = RemoteLIDs.view_device();
8482  auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8483  auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8484 
8486  *this,
8487  RemoteLIDs_d,
8488  destMat->imports_.view_device(), //hostImports
8489  destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
8490  NumSameIDs,
8491  PermuteToLIDs_d,
8492  PermuteFromLIDs_d,
8493  N,
8494  MyPID,
8495  CSR_rowptr,
8496  CSR_colind_GID,
8497  CSR_vals,
8498  SourcePids(),
8499  TargetPids);
8500 
8501  // If LO and GO are the same, we can reuse memory when
8502  // converting the column indices from global to local indices.
8503  if (typeid (LO) == typeid (GO)) {
8504  CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
8505  }
8506  else {
8507  CSR_colind_LID.resize (CSR_colind_GID.size());
8508  }
8509  CSR_colind_LID.resize (CSR_colind_GID.size());
8510 
8511  // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally
8512  // owned entries. Convert them to the actual PID.
8513  // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for.
8514  for(size_t i=0; i<static_cast<size_t>(TargetPids.size()); i++)
8515  {
8516  if(TargetPids[i] == -1) TargetPids[i] = MyPID;
8517  }
8518 #ifdef HAVE_TPETRA_MMM_TIMINGS
8519  tmCopySPRdata = Teuchos::null;
8520 #endif
8521  /**************************************************************/
8522  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8523  /**************************************************************/
8524  // Call an optimized version of makeColMap that avoids the
8525  // Directory lookups (since the Import object knows who owns all
8526  // the GIDs).
8527  if (verbose) {
8528  std::ostringstream os;
8529  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8530  << std::endl;
8531  std::cerr << os.str ();
8532  }
8533  {
8534 #ifdef HAVE_TPETRA_MMM_TIMINGS
8535  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8536 #endif
8537  Import_Util::lowCommunicationMakeColMapAndReindexSerial(CSR_rowptr (),
8538  CSR_colind_LID (),
8539  CSR_colind_GID (),
8540  BaseDomainMap,
8541  TargetPids,
8542  RemotePids,
8543  MyColMap);
8544  }
8545 
8546  if (verbose) {
8547  std::ostringstream os;
8548  os << *verbosePrefix << "restrictComm="
8549  << (restrictComm ? "true" : "false") << std::endl;
8550  std::cerr << os.str ();
8551  }
8552 
8553  /*******************************************************/
8554  /**** 4) Second communicator restriction phase ****/
8555  /*******************************************************/
8556  {
8557 #ifdef HAVE_TPETRA_MMM_TIMINGS
8558  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8559 #endif
8560  if (restrictComm) {
8561  ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
8562  ReducedRowMap :
8563  MyColMap->replaceCommWithSubset (ReducedComm);
8564  MyColMap = ReducedColMap; // Reset the "my" maps
8565  }
8566 
8567  // Replace the col map
8568  if (verbose) {
8569  std::ostringstream os;
8570  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8571  std::cerr << os.str ();
8572  }
8573  destMat->replaceColMap (MyColMap);
8574 
8575  // Short circuit if the processor is no longer in the communicator
8576  //
8577  // NOTE: Epetra replaces modifies all "removed" processes so they
8578  // have a dummy (serial) Map that doesn't touch the original
8579  // communicator. Duplicating that here might be a good idea.
8580  if (ReducedComm.is_null ()) {
8581  if (verbose) {
8582  std::ostringstream os;
8583  os << *verbosePrefix << "I am no longer in the communicator; "
8584  "returning" << std::endl;
8585  std::cerr << os.str ();
8586  }
8587  return;
8588  }
8589  }
8590 
8591  /***************************************************/
8592  /**** 5) Sort ****/
8593  /***************************************************/
8594  if ((! reverseMode && xferAsImport != nullptr) ||
8595  (reverseMode && xferAsExport != nullptr)) {
8596  if (verbose) {
8597  std::ostringstream os;
8598  os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8599  std::cerr << os.str ();
8600  }
8601 #ifdef HAVE_TPETRA_MMM_TIMINGS
8602  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8603 #endif
8604  Import_Util::sortCrsEntries (CSR_rowptr(),
8605  CSR_colind_LID(),
8606  CSR_vals());
8607  }
8608  else if ((! reverseMode && xferAsExport != nullptr) ||
8609  (reverseMode && xferAsImport != nullptr)) {
8610  if (verbose) {
8611  std::ostringstream os;
8612  os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8613  << endl;
8614  std::cerr << os.str();
8615  }
8616 #ifdef HAVE_TPETRA_MMM_TIMINGS
8617  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8618 #endif
8619  Import_Util::sortAndMergeCrsEntries (CSR_rowptr(),
8620  CSR_colind_LID(),
8621  CSR_vals());
8622  if (CSR_rowptr[N] != static_cast<size_t>(CSR_vals.size())) {
8623  CSR_colind_LID.resize (CSR_rowptr[N]);
8624  CSR_vals.resize (CSR_rowptr[N]);
8625  }
8626  }
8627  else {
8628  TEUCHOS_TEST_FOR_EXCEPTION(
8629  true, std::logic_error, "Tpetra::CrsMatrix::"
8630  "transferAndFillComplete: Should never get here! "
8631  "Please report this bug to a Tpetra developer.");
8632  }
8633  /***************************************************/
8634  /**** 6) Reset the colmap and the arrays ****/
8635  /***************************************************/
8636 
8637  if (verbose) {
8638  std::ostringstream os;
8639  os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8640  std::cerr << os.str ();
8641  }
8642 
8643  // Call constructor for the new matrix (restricted as needed)
8644  //
8645  // NOTE (mfh 15 May 2014) This should work fine for the Kokkos
8646  // refactor version of CrsMatrix, though it reserves the right to
8647  // make a deep copy of the arrays.
8648  {
8649 #ifdef HAVE_TPETRA_MMM_TIMINGS
8650  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8651 #endif
8652  destMat->setAllValues (CSR_rowptr, CSR_colind_LID, CSR_vals);
8653  }
8654 
8655  } else {
8656  // run on device
8657 
8658 # ifdef HAVE_TPETRA_MMM_TIMINGS
8659  RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8660 # endif
8661  ArrayRCP<size_t> CSR_rowptr;
8662  ArrayRCP<GO> CSR_colind_GID;
8663  ArrayRCP<LO> CSR_colind_LID;
8664  ArrayRCP<Scalar> CSR_vals;
8665 
8666  destMat->imports_.sync_device ();
8667  destMat->numImportPacketsPerLID_.sync_device ();
8668 
8669  size_t N = BaseRowMap->getLocalNumElements ();
8670 
8671  auto RemoteLIDs_d = RemoteLIDs.view_device();
8672  auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8673  auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8674 
8675  Kokkos::View<size_t*,device_type> CSR_rowptr_d;
8676  Kokkos::View<GO*,device_type> CSR_colind_GID_d;
8677  Kokkos::View<LO*,device_type> CSR_colind_LID_d;
8678  Kokkos::View<impl_scalar_type*,device_type> CSR_vals_d;
8679  Kokkos::View<int*,device_type> TargetPids_d;
8680 
8682  *this,
8683  RemoteLIDs_d,
8684  destMat->imports_.view_device(), //hostImports
8685  destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
8686  NumSameIDs,
8687  PermuteToLIDs_d,
8688  PermuteFromLIDs_d,
8689  N,
8690  MyPID,
8691  CSR_rowptr_d,
8692  CSR_colind_GID_d,
8693  CSR_vals_d,
8694  SourcePids(),
8695  TargetPids_d);
8696 
8697  Kokkos::resize (CSR_colind_LID_d, CSR_colind_GID_d.size());
8698 
8699 #ifdef HAVE_TPETRA_MMM_TIMINGS
8700  tmCopySPRdata = Teuchos::null;
8701 #endif
8702  /**************************************************************/
8703  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8704  /**************************************************************/
8705  // Call an optimized version of makeColMap that avoids the
8706  // Directory lookups (since the Import object knows who owns all
8707  // the GIDs).
8708  if (verbose) {
8709  std::ostringstream os;
8710  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8711  << std::endl;
8712  std::cerr << os.str ();
8713  }
8714  {
8715 #ifdef HAVE_TPETRA_MMM_TIMINGS
8716  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8717 #endif
8718  Import_Util::lowCommunicationMakeColMapAndReindex(CSR_rowptr_d,
8719  CSR_colind_LID_d,
8720  CSR_colind_GID_d,
8721  BaseDomainMap,
8722  TargetPids_d,
8723  RemotePids,
8724  MyColMap);
8725  }
8726 
8727  if (verbose) {
8728  std::ostringstream os;
8729  os << *verbosePrefix << "restrictComm="
8730  << (restrictComm ? "true" : "false") << std::endl;
8731  std::cerr << os.str ();
8732  }
8733 
8734  /*******************************************************/
8735  /**** 4) Second communicator restriction phase ****/
8736  /*******************************************************/
8737  {
8738 #ifdef HAVE_TPETRA_MMM_TIMINGS
8739  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8740 #endif
8741  if (restrictComm) {
8742  ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
8743  ReducedRowMap :
8744  MyColMap->replaceCommWithSubset (ReducedComm);
8745  MyColMap = ReducedColMap; // Reset the "my" maps
8746  }
8747 
8748  // Replace the col map
8749  if (verbose) {
8750  std::ostringstream os;
8751  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8752  std::cerr << os.str ();
8753  }
8754  destMat->replaceColMap (MyColMap);
8755 
8756  // Short circuit if the processor is no longer in the communicator
8757  //
8758  // NOTE: Epetra replaces modifies all "removed" processes so they
8759  // have a dummy (serial) Map that doesn't touch the original
8760  // communicator. Duplicating that here might be a good idea.
8761  if (ReducedComm.is_null ()) {
8762  if (verbose) {
8763  std::ostringstream os;
8764  os << *verbosePrefix << "I am no longer in the communicator; "
8765  "returning" << std::endl;
8766  std::cerr << os.str ();
8767  }
8768  return;
8769  }
8770  }
8771 
8772  /***************************************************/
8773  /**** 5) Sort ****/
8774  /***************************************************/
8775 
8776  if ((! reverseMode && xferAsImport != nullptr) ||
8777  (reverseMode && xferAsExport != nullptr)) {
8778  if (verbose) {
8779  std::ostringstream os;
8780  os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8781  std::cerr << os.str ();
8782  }
8783 #ifdef HAVE_TPETRA_MMM_TIMINGS
8784  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8785 #endif
8786  Import_Util::sortCrsEntries (CSR_rowptr_d,
8787  CSR_colind_LID_d,
8788  CSR_vals_d);
8789  }
8790  else if ((! reverseMode && xferAsExport != nullptr) ||
8791  (reverseMode && xferAsImport != nullptr)) {
8792  if (verbose) {
8793  std::ostringstream os;
8794  os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8795  << endl;
8796  std::cerr << os.str();
8797  }
8798 #ifdef HAVE_TPETRA_MMM_TIMINGS
8799  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8800 #endif
8801  Import_Util::sortAndMergeCrsEntries (CSR_rowptr_d,
8802  CSR_colind_LID_d,
8803  CSR_vals_d);
8804  }
8805  else {
8806  TEUCHOS_TEST_FOR_EXCEPTION(
8807  true, std::logic_error, "Tpetra::CrsMatrix::"
8808  "transferAndFillComplete: Should never get here! "
8809  "Please report this bug to a Tpetra developer.");
8810  }
8811 
8812  /***************************************************/
8813  /**** 6) Reset the colmap and the arrays ****/
8814  /***************************************************/
8815 
8816  if (verbose) {
8817  std::ostringstream os;
8818  os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8819  std::cerr << os.str ();
8820  }
8821 
8822  {
8823 #ifdef HAVE_TPETRA_MMM_TIMINGS
8824  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8825 #endif
8826  destMat->setAllValues (CSR_rowptr_d, CSR_colind_LID_d, CSR_vals_d);
8827  }
8828 
8829  } //if (runOnHost) .. else ..
8830 
8831  /***************************************************/
8832  /**** 7) Build Importer & Call ESFC ****/
8833  /***************************************************/
8834 #ifdef HAVE_TPETRA_MMM_TIMINGS
8835  RCP<TimeMonitor> tmIESFC = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC build importer and esfc"))));
8836 #endif
8837  // Pre-build the importer using the existing PIDs
8838  Teuchos::ParameterList esfc_params;
8839 
8840  RCP<import_type> MyImport;
8841 
8842  // Fulfull the non-blocking allreduce on reduced_mismatch.
8843  if (iallreduceRequest.get () != nullptr) {
8844  if (verbose) {
8845  std::ostringstream os;
8846  os << *verbosePrefix << "Calling iallreduceRequest->wait()"
8847  << endl;
8848  std::cerr << os.str ();
8849  }
8850  iallreduceRequest->wait ();
8851  if (reduced_mismatch != 0) {
8852  isMM = false;
8853  }
8854  }
8855 
8856  if( isMM ) {
8857 #ifdef HAVE_TPETRA_MMM_TIMINGS
8858  Teuchos::TimeMonitor MMisMM (*TimeMonitor::getNewTimer(prefix + std::string("isMM Block")));
8859 #endif
8860  // Combine all type1/2/3 lists, [filter them], then call the expert import constructor.
8861 
8862  if (verbose) {
8863  std::ostringstream os;
8864  os << *verbosePrefix << "Getting CRS pointers" << endl;
8865  std::cerr << os.str ();
8866  }
8867 
8868  Teuchos::ArrayRCP<LocalOrdinal> type3LIDs;
8869  Teuchos::ArrayRCP<int> type3PIDs;
8870  auto rowptr = getCrsGraph()->getLocalRowPtrsHost();
8871  auto colind = getCrsGraph()->getLocalIndicesHost();
8872 
8873  if (verbose) {
8874  std::ostringstream os;
8875  os << *verbosePrefix << "Calling reverseNeighborDiscovery" << std::endl;
8876  std::cerr << os.str ();
8877  }
8878 
8879  {
8880 #ifdef HAVE_TPETRA_MMM_TIMINGS
8881  TimeMonitor tm_rnd (*TimeMonitor::getNewTimer(prefix + std::string("isMMrevNeighDis")));
8882 #endif
8883  Import_Util::reverseNeighborDiscovery(*this,
8884  rowptr,
8885  colind,
8886  rowTransfer,
8887  MyImporter,
8888  MyDomainMap,
8889  type3PIDs,
8890  type3LIDs,
8891  ReducedComm);
8892  }
8893 
8894  if (verbose) {
8895  std::ostringstream os;
8896  os << *verbosePrefix << "Done with reverseNeighborDiscovery" << std::endl;
8897  std::cerr << os.str ();
8898  }
8899 
8900  Teuchos::ArrayView<const int> EPID1 = MyImporter.is_null() ? Teuchos::ArrayView<const int>() : MyImporter->getExportPIDs();
8901  Teuchos::ArrayView<const LO> ELID1 = MyImporter.is_null() ? Teuchos::ArrayView<const LO>() : MyImporter->getExportLIDs();
8902 
8903  Teuchos::ArrayView<const int> TEPID2 = rowTransfer.getExportPIDs(); // row matrix
8904  Teuchos::ArrayView<const LO> TELID2 = rowTransfer.getExportLIDs();
8905 
8906  const int numCols = getGraph()->getColMap()->getLocalNumElements(); // may be dup
8907  // from EpetraExt_MMHelpers.cpp: build_type2_exports
8908  std::vector<bool> IsOwned(numCols,true);
8909  std::vector<int> SentTo(numCols,-1);
8910  if (! MyImporter.is_null ()) {
8911  for (auto && rlid : MyImporter->getRemoteLIDs()) { // the remoteLIDs must be from sourcematrix
8912  IsOwned[rlid]=false;
8913  }
8914  }
8915 
8916  std::vector<std::pair<int,GO> > usrtg;
8917  usrtg.reserve(TEPID2.size());
8918 
8919  {
8920  const auto& colMap = * (this->getColMap ()); // *this is sourcematrix
8921  for (Array_size_type i = 0; i < TEPID2.size (); ++i) {
8922  const LO row = TELID2[i];
8923  const int pid = TEPID2[i];
8924  for (auto j = rowptr[row]; j < rowptr[row+1]; ++j) {
8925  const int col = colind[j];
8926  if (IsOwned[col] && SentTo[col] != pid) {
8927  SentTo[col] = pid;
8928  GO gid = colMap.getGlobalElement (col);
8929  usrtg.push_back (std::pair<int,GO> (pid, gid));
8930  }
8931  }
8932  }
8933  }
8934 
8935 // This sort can _not_ be omitted.[
8936  std::sort(usrtg.begin(),usrtg.end()); // default comparator does the right thing, now sorted in gid order
8937  auto eopg = std ::unique(usrtg.begin(),usrtg.end());
8938  // 25 Jul 2018: Could just ignore the entries at and after eopg.
8939  usrtg.erase(eopg,usrtg.end());
8940 
8941  const Array_size_type type2_us_size = usrtg.size();
8942  Teuchos::ArrayRCP<int> EPID2=Teuchos::arcp(new int[type2_us_size],0,type2_us_size,true);
8943  Teuchos::ArrayRCP< LO> ELID2=Teuchos::arcp(new LO[type2_us_size],0,type2_us_size,true);
8944 
8945  int pos=0;
8946  for(auto && p : usrtg) {
8947  EPID2[pos]= p.first;
8948  ELID2[pos]= this->getDomainMap()->getLocalElement(p.second);
8949  pos++;
8950  }
8951 
8952  Teuchos::ArrayView<int> EPID3 = type3PIDs();
8953  Teuchos::ArrayView< LO> ELID3 = type3LIDs();
8954  GO InfGID = std::numeric_limits<GO>::max();
8955  int InfPID = INT_MAX;
8956 #ifdef TPETRA_MIN3
8957 # undef TPETRA_MIN3
8958 #endif // TPETRA_MIN3
8959 #define TPETRA_MIN3(x,y,z) ((x)<(y)?(std::min(x,z)):(std::min(y,z)))
8960  int i1=0, i2=0, i3=0;
8961  int Len1 = EPID1.size();
8962  int Len2 = EPID2.size();
8963  int Len3 = EPID3.size();
8964 
8965  int MyLen=Len1+Len2+Len3;
8966  Teuchos::ArrayRCP<LO> userExportLIDs = Teuchos::arcp(new LO[MyLen],0,MyLen,true);
8967  Teuchos::ArrayRCP<int> userExportPIDs = Teuchos::arcp(new int[MyLen],0,MyLen,true);
8968  int iloc = 0; // will be the size of the userExportLID/PIDs
8969 
8970  while(i1 < Len1 || i2 < Len2 || i3 < Len3){
8971  int PID1 = (i1<Len1)?(EPID1[i1]):InfPID;
8972  int PID2 = (i2<Len2)?(EPID2[i2]):InfPID;
8973  int PID3 = (i3<Len3)?(EPID3[i3]):InfPID;
8974 
8975  GO GID1 = (i1<Len1)?getDomainMap()->getGlobalElement(ELID1[i1]):InfGID;
8976  GO GID2 = (i2<Len2)?getDomainMap()->getGlobalElement(ELID2[i2]):InfGID;
8977  GO GID3 = (i3<Len3)?getDomainMap()->getGlobalElement(ELID3[i3]):InfGID;
8978 
8979  int MIN_PID = TPETRA_MIN3(PID1,PID2,PID3);
8980  GO MIN_GID = TPETRA_MIN3( ((PID1==MIN_PID)?GID1:InfGID), ((PID2==MIN_PID)?GID2:InfGID), ((PID3==MIN_PID)?GID3:InfGID));
8981 #ifdef TPETRA_MIN3
8982 # undef TPETRA_MIN3
8983 #endif // TPETRA_MIN3
8984  bool added_entry=false;
8985 
8986  if(PID1 == MIN_PID && GID1 == MIN_GID){
8987  userExportLIDs[iloc]=ELID1[i1];
8988  userExportPIDs[iloc]=EPID1[i1];
8989  i1++;
8990  added_entry=true;
8991  iloc++;
8992  }
8993  if(PID2 == MIN_PID && GID2 == MIN_GID){
8994  if(!added_entry) {
8995  userExportLIDs[iloc]=ELID2[i2];
8996  userExportPIDs[iloc]=EPID2[i2];
8997  added_entry=true;
8998  iloc++;
8999  }
9000  i2++;
9001  }
9002  if(PID3 == MIN_PID && GID3 == MIN_GID){
9003  if(!added_entry) {
9004  userExportLIDs[iloc]=ELID3[i3];
9005  userExportPIDs[iloc]=EPID3[i3];
9006  iloc++;
9007  }
9008  i3++;
9009  }
9010  }
9011 
9012  if (verbose) {
9013  std::ostringstream os;
9014  os << *verbosePrefix << "Create Import" << std::endl;
9015  std::cerr << os.str ();
9016  }
9017 
9018 #ifdef HAVE_TPETRA_MMM_TIMINGS
9019  auto ismmIctor(*TimeMonitor::getNewTimer(prefix + std::string("isMMIportCtor")));
9020 #endif
9021  Teuchos::RCP<Teuchos::ParameterList> plist = rcp(new Teuchos::ParameterList());
9022  // 25 Jul 2018: Test for equality with the non-isMM path's Import object.
9023  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9024  MyImport = rcp ( new import_type (MyDomainMap,
9025  MyColMap,
9026  RemotePids,
9027  userExportLIDs.view(0,iloc).getConst(),
9028  userExportPIDs.view(0,iloc).getConst(),
9029  plist)
9030  );
9031 
9032  if (verbose) {
9033  std::ostringstream os;
9034  os << *verbosePrefix << "Call expertStaticFillComplete" << std::endl;
9035  std::cerr << os.str ();
9036  }
9037 
9038  {
9039 #ifdef HAVE_TPETRA_MMM_TIMINGS
9040  TimeMonitor esfc (*TimeMonitor::getNewTimer(prefix + std::string("isMM::destMat->eSFC")));
9041  esfc_params.set("Timer Label",label+std::string("isMM eSFC"));
9042 #endif
9043  if(!params.is_null())
9044  esfc_params.set("compute global constants",params->get("compute global constants",true));
9045  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap, MyImport,Teuchos::null,rcp(new Teuchos::ParameterList(esfc_params)));
9046 
9047  }
9048 
9049  } // if(isMM)
9050  else {
9051 #ifdef HAVE_TPETRA_MMM_TIMINGS
9052  TimeMonitor MMnotMMblock (*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMblock")));
9053 #endif
9054  if (verbose) {
9055  std::ostringstream os;
9056  os << *verbosePrefix << "Create Import" << std::endl;
9057  std::cerr << os.str ();
9058  }
9059 
9060 #ifdef HAVE_TPETRA_MMM_TIMINGS
9061  TimeMonitor notMMIcTor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMCreateImporter")));
9062 #endif
9063  Teuchos::RCP<Teuchos::ParameterList> mypars = rcp(new Teuchos::ParameterList);
9064  mypars->set("Timer Label","notMMFrom_tAFC");
9065  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9066  MyImport = rcp (new import_type (MyDomainMap, MyColMap, RemotePids, mypars));
9067 
9068  if (verbose) {
9069  std::ostringstream os;
9070  os << *verbosePrefix << "Call expertStaticFillComplete" << endl;
9071  std::cerr << os.str ();
9072  }
9073 
9074 #ifdef HAVE_TPETRA_MMM_TIMINGS
9075  TimeMonitor esfcnotmm(*TimeMonitor::getNewTimer(prefix + std::string("notMMdestMat->expertStaticFillComplete")));
9076  esfc_params.set("Timer Label",prefix+std::string("notMM eSFC"));
9077 #else
9078  esfc_params.set("Timer Label",std::string("notMM eSFC"));
9079 #endif
9080 
9081  if (!params.is_null ()) {
9082  esfc_params.set ("compute global constants",
9083  params->get ("compute global constants", true));
9084  }
9085  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap,
9086  MyImport, Teuchos::null,
9087  rcp (new Teuchos::ParameterList (esfc_params)));
9088  }
9089 
9090 #ifdef HAVE_TPETRA_MMM_TIMINGS
9091  tmIESFC = Teuchos::null;
9092 #endif
9093 
9094  if (verbose) {
9095  std::ostringstream os;
9096  os << *verbosePrefix << "Done" << endl;
9097  std::cerr << os.str ();
9098  }
9099  } //transferAndFillComplete
9100 
9101 
9102  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9103  void
9106  const import_type& importer,
9107  const Teuchos::RCP<const map_type>& domainMap,
9108  const Teuchos::RCP<const map_type>& rangeMap,
9109  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9110  {
9111  transferAndFillComplete (destMatrix, importer, Teuchos::null, domainMap, rangeMap, params);
9112  }
9113 
9114  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9115  void
9118  const import_type& rowImporter,
9119  const import_type& domainImporter,
9120  const Teuchos::RCP<const map_type>& domainMap,
9121  const Teuchos::RCP<const map_type>& rangeMap,
9122  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9123  {
9124  transferAndFillComplete (destMatrix, rowImporter, Teuchos::rcpFromRef(domainImporter), domainMap, rangeMap, params);
9125  }
9126 
9127  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9128  void
9131  const export_type& exporter,
9132  const Teuchos::RCP<const map_type>& domainMap,
9133  const Teuchos::RCP<const map_type>& rangeMap,
9134  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9135  {
9136  transferAndFillComplete (destMatrix, exporter, Teuchos::null, domainMap, rangeMap, params);
9137  }
9138 
9139  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9140  void
9143  const export_type& rowExporter,
9144  const export_type& domainExporter,
9145  const Teuchos::RCP<const map_type>& domainMap,
9146  const Teuchos::RCP<const map_type>& rangeMap,
9147  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9148  {
9149  transferAndFillComplete (destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params);
9150  }
9151 
9152 } // namespace Tpetra
9153 
9154 //
9155 // Explicit instantiation macro
9156 //
9157 // Must be expanded from within the Tpetra namespace!
9158 //
9159 
9160 #define TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR,LO,GO,NODE) \
9161  \
9162  template class CrsMatrix< SCALAR , LO , GO , NODE >;
9163 
9164 #define TPETRA_CRSMATRIX_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
9165  \
9166  template Teuchos::RCP< CrsMatrix< SO , LO , GO , NODE > > \
9167  CrsMatrix< SI , LO , GO , NODE >::convert< SO > () const;
9168 
9169 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9170  template<> \
9171  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9172  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9173  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9174  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9175  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& importer, \
9176  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9177  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9178  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9179  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9180  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9181  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9182  const Teuchos::RCP<Teuchos::ParameterList>& params);
9183 
9184 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9185  template<> \
9186  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9187  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9188  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9189  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9190  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowImporter, \
9191  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9192  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9193  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainImporter, \
9194  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9195  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9196  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9197  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9198  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9199  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9200  const Teuchos::RCP<Teuchos::ParameterList>& params);
9201 
9202 
9203 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9204  template<> \
9205  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9206  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9207  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9208  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9209  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& exporter, \
9210  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9211  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9212  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9213  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9214  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9215  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9216  const Teuchos::RCP<Teuchos::ParameterList>& params);
9217 
9218 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9219  template<> \
9220  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9221  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9222  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9223  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9224  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowExporter, \
9225  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9226  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9227  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainExporter, \
9228  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9229  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9230  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9231  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9232  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9233  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9234  const Teuchos::RCP<Teuchos::ParameterList>& params);
9235 
9236 
9237 #define TPETRA_CRSMATRIX_INSTANT(SCALAR, LO, GO ,NODE) \
9238  TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
9239  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9240  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9241  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9242  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
9243 
9244 #endif // TPETRA_CRSMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Teuchos::RCP< const map_type > getRowMap() const override
Returns the Map that describes the row distribution in this graph.
bool hasColMap() const override
Whether the matrix has a well-defined column Map.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
bool indicesAreSorted_
Whether the graph&#39;s indices are sorted in each row, on this process.
global_size_t getGlobalNumCols() const override
The number of global columns in the matrix.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Functor for the the ABSMAX CombineMode of Import and Export operations.
void checkInternalState() const
Check that this object&#39;s state is sane; throw if it&#39;s not.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void copyOffsets(const OutputViewType &dst, const InputViewType &src)
Copy row offsets (in a sparse graph or matrix) from src to dst. The offsets may have different types...
CrsGraph< LocalOrdinal, GlobalOrdinal, Node > crs_graph_type
The CrsGraph specialization suitable for this CrsMatrix specialization.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix&#39;s column Map with the given Map.
virtual LocalOrdinal replaceGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceGlobalValues.
virtual bool supportsRowViews() const override
Return true if getLocalRowView() and getGlobalRowView() are valid for this object.
void replaceDomainMapAndImporter(const Teuchos::RCP< const map_type > &newDomainMap, Teuchos::RCP< const import_type > &newImporter)
Replace the current domain Map and Import with the given objects.
local_inds_dualv_type::t_host::const_type getLocalIndsViewHost(const RowInfo &rowinfo) const
Get a const, locally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myRo...
void merge2(IT1 &indResultOut, IT2 &valResultOut, IT1 indBeg, IT1 indEnd, IT2 valBeg, IT2)
Merge values in place, additively, with the same index.
static size_t mergeRowIndicesAndValues(size_t rowLen, local_ordinal_type *cols, impl_scalar_type *vals)
Merge duplicate row indices in the given row, along with their corresponding values.
void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
size_t insertGlobalIndicesImpl(const local_ordinal_type lclRow, const global_ordinal_type inputGblColInds[], const size_t numInputInds)
Insert global indices, using an input local row index.
std::map< GlobalOrdinal, std::pair< Teuchos::Array< GlobalOrdinal >, Teuchos::Array< Scalar > > > nonlocals_
Nonlocal data added using insertGlobalValues().
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given output buffer.
void sortAndMergeIndicesAndValues(const bool sorted, const bool merged)
Sort and merge duplicate local column indices in all rows on the calling process, along with their co...
typename device_type::execution_space execution_space
The Kokkos execution space.
void packNew(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< char *, buffer_device_type > &exports, const Kokkos::DualView< size_t *, buffer_device_type > &numPacketsPerLID, size_t &constantNumPackets) const
Pack this object&#39;s data for an Import or Export.
virtual void insertGlobalValuesImpl(crs_graph_type &graph, RowInfo &rowInfo, const GlobalOrdinal gblColInds[], const impl_scalar_type vals[], const size_t numInputEnt)
Common implementation detail of insertGlobalValues and insertGlobalValuesFiltered.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
size_t getNumVectors() const
Number of columns in the multivector.
void getGlobalRowView(GlobalOrdinal GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting view of a row of this matrix, using global row and column indices...
size_t getLocalLength() const
Local number of rows on the calling process.
Declaration of a function that prints strings from each process.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
Traits class for packing / unpacking data of type T.
void replaceRangeMapAndExporter(const Teuchos::RCP< const map_type > &newRangeMap, Teuchos::RCP< const export_type > &newExporter)
Replace the current Range Map and Export with the given objects.
virtual size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const =0
The current number of entries on the calling process in the specified local row.
void scale(const Scalar &alpha)
Scale the matrix&#39;s values: this := alpha*this.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
size_t getLocalNumCols() const override
The number of columns connected to the locally owned rows of this matrix.
Declare and define Tpetra::Details::copyOffsets, an implementation detail of Tpetra (in particular...
void fillLocalGraphAndMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local graph and matrix.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
void leftScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the left with the given Vector.
bool noRedundancies_
Whether the graph&#39;s indices are non-redundant (merged) in each row, on this process.
GlobalOrdinal global_ordinal_type
The type of each global index in the matrix.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries...
bool isDistributed() const
Whether this is a globally distributed object.
void reindexColumns(crs_graph_type *const graph, const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix&#39;s graph, as a CrsGraph.
global_inds_dualv_type::t_host::const_type getGlobalIndsViewHost(const RowInfo &rowinfo) const
Get a const, globally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myR...
static bool debug()
Whether Tpetra is in debug mode.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, over all processes in the matrix&#39;s communicator...
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRangeMap() const =0
The Map associated with the range of this operator, which must be compatible with Y...
void applyNonTranspose(const MV &X_in, MV &Y_in, Scalar alpha, Scalar beta) const
Special case of apply() for mode == Teuchos::NO_TRANS.
Teuchos::RCP< CrsMatrix< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another CrsMatrix with the same entries, but converted to a different Scalar type T...
Scalar scalar_type
The type of each entry in the matrix.
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.
void swap(CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &matrix)
Swaps the data from *this with the data and maps from crsMatrix.
void fillLocalMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local matrix.
local_inds_wdv_type lclIndsUnpacked_wdv
Local ordinals of column indices for all rows Valid when isLocallyIndexed is true If OptimizedStorage...
void verbosePrintArray(std::ostream &out, const ArrayType &x, const char name[], const size_t maxNumToPrint)
Print min(x.size(), maxNumToPrint) entries of x.
bool isGloballyIndexed() const override
Whether the graph&#39;s column indices are stored as global indices.
void leftScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Left-scale a KokkosSparse::CrsMatrix.
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
void globalAssemble()
Communicate nonlocal contributions to other processes.
void getLocalDiagCopy(Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &diag) const override
Get a constant, nonpersisting view of a row of this matrix, using local row and column indices...
size_t findGlobalIndices(const RowInfo &rowInfo, const Teuchos::ArrayView< const global_ordinal_type > &indices, std::function< void(const size_t, const size_t, const size_t)> fun) const
Finds indices in the given row.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
virtual LocalOrdinal sumIntoLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoLocalValues.
void sort(View &view, const size_t &size)
Convenience wrapper for std::sort for host-accessible views.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator, in rank order.
bool isFillActive() const
Whether the matrix is not fill complete.
Teuchos::RCP< MV > importMV_
Column Map MultiVector used in apply().
Declare and define Tpetra::Details::copyConvert, an implementation detail of Tpetra (in particular...
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
bool hasTransposeApply() const override
Whether apply() allows applying the transpose or conjugate transpose.
void reindexColumns(const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortIndicesInEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector&#39;s local data on host. This requires that ther...
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void exportAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const export_type &exporter, const Teuchos::RCP< const map_type > &domainMap=Teuchos::null, const Teuchos::RCP< const map_type > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Export from this to the given destination matrix, and make the result fill complete.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don&#39;t currently exist.
values_dualv_type::t_dev::const_type getValuesViewDevice(const RowInfo &rowinfo) const
Get a const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow)...
bool isFillComplete() const override
Whether the matrix is fill complete.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > createOneToOne(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &M)
Nonmember constructor for a contiguous Map with user-defined weights and a user-specified, possibly nondefault Kokkos Node type.
void importAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const import_type &importer, const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Import from this to the given destination matrix, and make the result fill complete.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const =0
Get a copy of the given global row&#39;s entries.
Teuchos::RCP< const map_type > rowMap_
The Map describing the distribution of rows of the graph.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Functions for manipulating CRS arrays.
Kokkos::View< size_t *, Kokkos::LayoutLeft, device_type >::HostMirror num_row_entries_type
Row offsets for &quot;1-D&quot; storage.
GlobalOrdinal getIndexBase() const override
The index base for global indices for this matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2, const bool stableSort=false)
Sort the first array, and apply the resulting permutation to the second array.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getDomainMap() const =0
The Map associated with the domain of this operator, which must be compatible with X...
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
#define TPETRA_ABUSE_WARNING(throw_exception_test, Exception, msg)
Handle an abuse warning, according to HAVE_TPETRA_THROW_ABUSE_WARNINGS and HAVE_TPETRA_PRINT_ABUSE_WA...
bool isNodeGlobalElement(global_ordinal_type globalIndex) const
Whether the given global index is owned by this Map on the calling process.
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for &quot;new&quot; DistObject inter...
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print this object with the given verbosity level to the given output stream.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix&#39;s graph, as a RowGraph.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< char *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode CM) override
Unpack the imported column indices and values, and combine into matrix.
bool isFillComplete() const override
Whether fillComplete() has been called and the graph is in compute mode.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
RowInfo getRowInfoFromGlobalRowIndex(const global_ordinal_type gblRow) const
Get information about the locally owned row with global index gblRow.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void copyConvert(const OutputViewType &dst, const InputViewType &src)
Copy values from the 1-D Kokkos::View src, to the 1-D Kokkos::View dst, of the same length...
virtual Teuchos::RCP< RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params) const override
Implementation of RowMatrix::add: return alpha*A + beta*this.
bool fillComplete_
Whether the matrix is fill complete.
local_ordinal_type replaceGlobalValues(const global_ordinal_type globalRow, const Kokkos::View< const global_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries&#39; values, using global indices.
Replace old value with maximum of magnitudes of old and new values.
virtual LocalOrdinal sumIntoGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoGlobalValues.
Abstract base class for objects that can be the source of an Import or Export operation.
local_ordinal_type sumIntoLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using local row and column indices.
typename Node::device_type device_type
The Kokkos device type.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Number of entries in the sparse matrix in the given local row, on the calling (MPI) process...
static LocalMapType::local_ordinal_type getDiagCopyWithoutOffsets(const DiagType &D, const LocalMapType &rowMap, const LocalMapType &colMap, const CrsMatrixType &A)
Given a locally indexed, local sparse matrix, and corresponding local row and column Maps...
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create a (or fetch a cached) row Map MultiVector.
std::string description() const override
A one-line description of this object.
void getLocalDiagOffsets(Teuchos::ArrayRCP< size_t > &offsets) const
Get offsets of the diagonal entries in the matrix.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute a sparse matrix-MultiVector multiply.
size_t getLocalNumEntries() const override
The local number of entries in this matrix.
LO getLocalDiagCopyWithoutOffsetsNotFillComplete(::Tpetra::Vector< SC, LO, GO, NT > &diag, const ::Tpetra::RowMatrix< SC, LO, GO, NT > &A, const bool debug=false)
Given a locally indexed, global sparse matrix, extract the matrix&#39;s diagonal entries into a Tpetra::V...
Replace existing values with new values.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the domain of this graph.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector&#39;s local data on device. This requires that th...
Replace old values with zero.
const row_ptrs_host_view_type & getRowPtrsUnpackedHost() const
Get the unpacked row pointers on host. Lazily make a copy from device.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is &quot;imbalanced&quot; in the number of entries per row...
bool isLocallyComplete() const
Is this Export or Import locally complete?
virtual LocalOrdinal replaceLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceLocalValues.
Declaration and definition of Tpetra::Details::leftScaleLocalCrsMatrix.
RowInfo getRowInfo(const local_ordinal_type myRow) const
Get information about the locally owned row with local index myRow.
values_dualv_type::t_dev getValuesViewDeviceNonConst(const RowInfo &rowinfo)
Get a non-const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myR...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Maps and their communicator.
virtual bool checkSizes(const SrcDistObject &source) override
Compare the source and target (this) objects for compatibility.
local_map_type getLocalMap() const
Get the LocalMap for Kokkos-Kernels.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
A parallel distribution of indices over processes.
void getLocalRowCopy(LocalOrdinal LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object (&quot;forward mode&quot;).
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
void insertLocalValues(const LocalOrdinal localRow, const Teuchos::ArrayView< const LocalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const CombineMode CM=ADD)
Insert one or more entries into the matrix, using local column indices.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Details::EStorageStatus storageStatus_
Status of the matrix&#39;s storage, when not in a fill-complete state.
A read-only, row-oriented interface to a sparse matrix.
void rightScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the right with the given Vector.
void replaceDomainMap(const Teuchos::RCP< const map_type > &newDomainMap)
Replace the current domain Map with the given objects.
Scalar operator()(const Scalar &x, const Scalar &y)
Return the maximum of the magnitudes (absolute values) of x and y.
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
void getLocalRowView(LocalOrdinal LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant view of a row of this matrix, using local row and column indices.
values_dualv_type::t_host::const_type getValuesViewHost(const RowInfo &rowinfo) const
Get a const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow).
bool isLocallyIndexed() const override
Whether the graph&#39;s column indices are stored as local indices.
A distributed dense vector.
Declaration of Tpetra::Details::iallreduce.
void reduce()
Sum values of a locally replicated multivector across all processes.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
std::shared_ptr< CommRequest > iallreduce(const InputViewType &sendbuf, const OutputViewType &recvbuf, const ::Teuchos::EReductionType op, const ::Teuchos::Comm< int > &comm)
Nonblocking all-reduce, for either rank-1 or rank-0 Kokkos::View objects.
void applyTranspose(const MV &X_in, MV &Y_in, const Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Special case of apply() for mode != Teuchos::NO_TRANS.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
void allocateValues(ELocalGlobal lg, GraphAllocationStatus gas, const bool verbose)
Allocate values (and optionally indices) using the Node.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const override
Number of entries in the sparse matrix in the given global row, on the calling (MPI) process...
static size_t verbosePrintCountThreshold()
Number of entries below which arrays, lists, etc. will be printed in debug mode.
local_matrix_device_type::values_type::const_type getLocalValuesDevice(Access::ReadOnlyStruct s) const
Get the Kokkos local values on device, read only.
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
local_ordinal_type replaceLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries&#39; values, using local row and column indices.
Declaration and definition of Tpetra::Details::rightScaleLocalCrsMatrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
void replaceRangeMap(const Teuchos::RCP< const map_type > &newRangeMap)
Replace the current range Map with the given objects.
std::shared_ptr< local_multiply_op_type > getLocalMultiplyOperator() const
The local sparse matrix operator (a wrapper of getLocalMatrixDevice() that supports local matrix-vect...
Teuchos::RCP< const map_type > colMap_
The Map describing the distribution of columns of the graph.
LocalOrdinal local_ordinal_type
The type of each local index in the matrix.
mag_type getFrobeniusNorm() const override
Compute and return the Frobenius norm of the matrix.
std::unique_ptr< std::string > createPrefix(const int myRank, const char prefix[])
Create string prefix for each line of verbose output.
Definition: Tpetra_Util.cpp:71
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRowMap() const =0
The Map that describes the distribution of rows over processes.
Accumulate new values into existing values (may not be supported in all classes)
void localApply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, const Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar &alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar &beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute the local part of a sparse matrix-(Multi)Vector multiply.
void rightScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Right-scale a KokkosSparse::CrsMatrix.
bool isStorageOptimized() const
Returns true if storage has been optimized.
Description of Tpetra&#39;s behavior.
virtual void copyAndPermute(const SrcDistObject &source, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
values_dualv_type::t_host getValuesViewHostNonConst(const RowInfo &rowinfo)
Get a non-const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra&#39;s behavior.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< MV > exportMV_
Row Map MultiVector used in apply().
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
global_size_t getGlobalNumEntries() const override
The global number of entries in this matrix.