Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_CRSMATRIX_DEF_HPP
11 #define TPETRA_CRSMATRIX_DEF_HPP
12 
20 
21 #include "Tpetra_Import_Util.hpp"
22 #include "Tpetra_Import_Util2.hpp"
23 #include "Tpetra_RowMatrix.hpp"
24 #include "Tpetra_LocalCrsMatrixOperator.hpp"
25 
32 #include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
40 #include "Tpetra_Details_packCrsMatrix.hpp"
41 #include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
43 #include "Teuchos_FancyOStream.hpp"
44 #include "Teuchos_RCP.hpp"
45 #include "Teuchos_DataAccess.hpp"
46 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
47 #include "KokkosBlas1_scal.hpp"
48 #include "KokkosSparse_getDiagCopy.hpp"
49 #include "KokkosSparse_spmv.hpp"
50 #include "Kokkos_StdAlgorithms.hpp"
51 
52 #include <memory>
53 #include <sstream>
54 #include <typeinfo>
55 #include <utility>
56 #include <vector>
57 
58 namespace Tpetra {
59 
60 namespace { // (anonymous)
61 
62  template<class T, class BinaryFunction>
63  T atomic_binary_function_update (volatile T* const dest,
64  const T& inputVal,
65  BinaryFunction f)
66  {
67  T oldVal = *dest;
68  T assume;
69 
70  // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
71  // POWER architectures, because 'newval' depends on 'assume',
72  // which depends on 'oldVal', which depends on '*dest'. This
73  // sets up a chain of read dependencies that should ensure
74  // correct behavior given a sane memory model.
75  do {
76  assume = oldVal;
77  T newVal = f (assume, inputVal);
78  oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
79  } while (assume != oldVal);
80 
81  return oldVal;
82  }
83 } // namespace (anonymous)
84 
85 //
86 // Users must never rely on anything in the Details namespace.
87 //
88 namespace Details {
89 
99 template<class Scalar>
100 struct AbsMax {
102  Scalar operator() (const Scalar& x, const Scalar& y) {
103  typedef Teuchos::ScalarTraits<Scalar> STS;
104  return std::max (STS::magnitude (x), STS::magnitude (y));
105  }
106 };
107 
108 } // namespace Details
109 } // namespace Tpetra
110 
111 namespace Tpetra {
112 
113  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
114  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
115  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
116  size_t maxNumEntriesPerRow,
117  const Teuchos::RCP<Teuchos::ParameterList>& params) :
118  dist_object_type (rowMap)
119  {
120  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t "
121  "[, RCP<ParameterList>]): ";
122  Teuchos::RCP<crs_graph_type> graph;
123  try {
124  graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
125  params));
126  }
127  catch (std::exception& e) {
128  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
129  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
130  "size_t [, RCP<ParameterList>]) threw an exception: "
131  << e.what ());
132  }
133  // myGraph_ not null means that the matrix owns the graph. That's
134  // different than the const CrsGraph constructor, where the matrix
135  // does _not_ own the graph.
136  myGraph_ = graph;
137  staticGraph_ = myGraph_;
138  resumeFill (params);
140  }
141 
142  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
144  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
145  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
146  const Teuchos::RCP<Teuchos::ParameterList>& params) :
147  dist_object_type (rowMap)
148  {
149  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
150  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
151  Teuchos::RCP<crs_graph_type> graph;
152  try {
153  using Teuchos::rcp;
154  graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
155  params));
156  }
157  catch (std::exception& e) {
158  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
159  (true, std::runtime_error, "CrsGraph constructor "
160  "(RCP<const Map>, ArrayView<const size_t>"
161  "[, RCP<ParameterList>]) threw an exception: "
162  << e.what ());
163  }
164  // myGraph_ not null means that the matrix owns the graph. That's
165  // different than the const CrsGraph constructor, where the matrix
166  // does _not_ own the graph.
167  myGraph_ = graph;
168  staticGraph_ = graph;
169  resumeFill (params);
171  }
172 
173  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
175  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
176  const Teuchos::RCP<const map_type>& colMap,
177  const size_t maxNumEntPerRow,
178  const Teuchos::RCP<Teuchos::ParameterList>& params) :
179  dist_object_type (rowMap)
180  {
181  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
182  "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
183  const char suffix[] =
184  " Please report this bug to the Tpetra developers.";
185 
186  // An artifact of debugging something a while back.
187  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
188  (! staticGraph_.is_null (), std::logic_error,
189  "staticGraph_ is not null at the beginning of the constructor."
190  << suffix);
191  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
192  (! myGraph_.is_null (), std::logic_error,
193  "myGraph_ is not null at the beginning of the constructor."
194  << suffix);
195  Teuchos::RCP<crs_graph_type> graph;
196  try {
197  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
198  maxNumEntPerRow,
199  params));
200  }
201  catch (std::exception& e) {
202  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
203  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
204  "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
205  "exception: " << e.what ());
206  }
207  // myGraph_ not null means that the matrix owns the graph. That's
208  // different than the const CrsGraph constructor, where the matrix
209  // does _not_ own the graph.
210  myGraph_ = graph;
211  staticGraph_ = myGraph_;
212  resumeFill (params);
214  }
215 
216  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
218  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
219  const Teuchos::RCP<const map_type>& colMap,
220  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
221  const Teuchos::RCP<Teuchos::ParameterList>& params) :
222  dist_object_type (rowMap)
223  {
224  const char tfecfFuncName[] =
225  "CrsMatrix(RCP<const Map>, RCP<const Map>, "
226  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
227  Teuchos::RCP<crs_graph_type> graph;
228  try {
229  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
230  numEntPerRowToAlloc,
231  params));
232  }
233  catch (std::exception& e) {
234  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
235  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
236  "RCP<const Map>, ArrayView<const size_t>[, "
237  "RCP<ParameterList>]) threw an exception: " << e.what ());
238  }
239  // myGraph_ not null means that the matrix owns the graph. That's
240  // different than the const CrsGraph constructor, where the matrix
241  // does _not_ own the graph.
242  myGraph_ = graph;
243  staticGraph_ = graph;
244  resumeFill (params);
246  }
247 
248 
249  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
251  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
252  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
253  dist_object_type (graph->getRowMap ()),
254  staticGraph_ (graph),
255  storageStatus_ (Details::STORAGE_1D_PACKED)
256  {
257  using std::endl;
258  typedef typename local_matrix_device_type::values_type values_type;
259  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
260  "RCP<ParameterList>]): ";
261  const bool verbose = Details::Behavior::verbose("CrsMatrix");
262 
263  std::unique_ptr<std::string> prefix;
264  if (verbose) {
265  prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
266  std::ostringstream os;
267  os << *prefix << "Start" << endl;
268  std::cerr << os.str ();
269  }
270 
271  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
272  (graph.is_null (), std::runtime_error, "Input graph is null.");
273  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
274  (! graph->isFillComplete (), std::runtime_error, "Input graph "
275  "is not fill complete. You must call fillComplete on the "
276  "graph before using it to construct a CrsMatrix. Note that "
277  "calling resumeFill on the graph makes it not fill complete, "
278  "even if you had previously called fillComplete. In that "
279  "case, you must call fillComplete on the graph again.");
280 
281  // The graph is fill complete, so it is locally indexed and has a
282  // fixed structure. This means we can allocate the (1-D) array of
283  // values and build the local matrix right now. Note that the
284  // local matrix's number of columns comes from the column Map, not
285  // the domain Map.
286 
287  const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
288  if (verbose) {
289  std::ostringstream os;
290  os << *prefix << "Allocate values: " << numEnt << endl;
291  std::cerr << os.str ();
292  }
293 
294  values_type val ("Tpetra::CrsMatrix::values", numEnt);
295  valuesPacked_wdv = values_wdv_type(val);
296  valuesUnpacked_wdv = valuesPacked_wdv;
297 
299 
300  if (verbose) {
301  std::ostringstream os;
302  os << *prefix << "Done" << endl;
303  std::cerr << os.str ();
304  }
305  }
306 
307  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
310  const Teuchos::RCP<const crs_graph_type>& graph,
311  const Teuchos::RCP<Teuchos::ParameterList>& params) :
312  dist_object_type (graph->getRowMap ()),
313  staticGraph_ (graph),
314  storageStatus_ (matrix.storageStatus_)
315  {
316  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
317  "local_matrix_device_type::values_type, "
318  "[,RCP<ParameterList>]): ";
319  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
320  (graph.is_null (), std::runtime_error, "Input graph is null.");
321  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
322  (! graph->isFillComplete (), std::runtime_error, "Input graph "
323  "is not fill complete. You must call fillComplete on the "
324  "graph before using it to construct a CrsMatrix. Note that "
325  "calling resumeFill on the graph makes it not fill complete, "
326  "even if you had previously called fillComplete. In that "
327  "case, you must call fillComplete on the graph again.");
328 
329  size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
330  valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
331 
332  size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
333  valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
334 
336  }
337 
338 
339  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
341  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
342  const typename local_matrix_device_type::values_type& values,
343  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
344  dist_object_type (graph->getRowMap ()),
345  staticGraph_ (graph),
346  storageStatus_ (Details::STORAGE_1D_PACKED)
347  {
348  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
349  "local_matrix_device_type::values_type, "
350  "[,RCP<ParameterList>]): ";
351  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
352  (graph.is_null (), std::runtime_error, "Input graph is null.");
353  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
354  (! graph->isFillComplete (), std::runtime_error, "Input graph "
355  "is not fill complete. You must call fillComplete on the "
356  "graph before using it to construct a CrsMatrix. Note that "
357  "calling resumeFill on the graph makes it not fill complete, "
358  "even if you had previously called fillComplete. In that "
359  "case, you must call fillComplete on the graph again.");
360 
361  // The graph is fill complete, so it is locally indexed and has a
362  // fixed structure. This means we can allocate the (1-D) array of
363  // values and build the local matrix right now. Note that the
364  // local matrix's number of columns comes from the column Map, not
365  // the domain Map.
366 
367  valuesPacked_wdv = values_wdv_type(values);
368  valuesUnpacked_wdv = valuesPacked_wdv;
369 
370  // FIXME (22 Jun 2016) I would very much like to get rid of
371  // k_values1D_ at some point. I find it confusing to have all
372  // these extra references lying around.
373  // KDDKDD ALMOST THERE, MARK!
374 // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
375 
377  }
378 
379  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
381  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
382  const Teuchos::RCP<const map_type>& colMap,
383  const typename local_graph_device_type::row_map_type& rowPointers,
384  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
385  const typename local_matrix_device_type::values_type& values,
386  const Teuchos::RCP<Teuchos::ParameterList>& params) :
387  dist_object_type (rowMap),
388  storageStatus_ (Details::STORAGE_1D_PACKED)
389  {
390  using Details::getEntryOnHost;
391  using Teuchos::RCP;
392  using std::endl;
393  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
394  "RCP<const Map>, ptr, ind, val[, params]): ";
395  const char suffix[] =
396  ". Please report this bug to the Tpetra developers.";
397  const bool debug = Details::Behavior::debug("CrsMatrix");
398  const bool verbose = Details::Behavior::verbose("CrsMatrix");
399 
400  std::unique_ptr<std::string> prefix;
401  if (verbose) {
402  prefix = this->createPrefix(
403  "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
404  std::ostringstream os;
405  os << *prefix << "Start" << endl;
406  std::cerr << os.str ();
407  }
408 
409  // Check the user's input. Note that this might throw only on
410  // some processes but not others, causing deadlock. We prefer
411  // deadlock due to exceptions to segfaults, because users can
412  // catch exceptions.
413  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
414  (values.extent(0) != columnIndices.extent(0),
415  std::invalid_argument, "values.extent(0)=" << values.extent(0)
416  << " != columnIndices.extent(0) = " << columnIndices.extent(0)
417  << ".");
418  if (debug && rowPointers.extent(0) != 0) {
419  const size_t numEnt =
420  getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
421  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
422  (numEnt != size_t(columnIndices.extent(0)) ||
423  numEnt != size_t(values.extent(0)),
424  std::invalid_argument, "Last entry of rowPointers says that "
425  "the matrix has " << numEnt << " entr"
426  << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
427  "columnIndices and values don't match this. "
428  "columnIndices.extent(0)=" << columnIndices.extent (0)
429  << " and values.extent(0)=" << values.extent (0) << ".");
430  }
431 
432  RCP<crs_graph_type> graph;
433  try {
434  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
435  columnIndices, params));
436  }
437  catch (std::exception& e) {
438  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
439  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
440  "RCP<const Map>, ptr, ind[, params]) threw an exception: "
441  << e.what ());
442  }
443  // The newly created CrsGraph _must_ have a local graph at this
444  // point. We don't really care whether CrsGraph's constructor
445  // deep-copies or shallow-copies the input, but the dimensions
446  // have to be right. That's how we tell whether the CrsGraph has
447  // a local graph.
448  auto lclGraph = graph->getLocalGraphDevice ();
449  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
450  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
451  lclGraph.entries.extent (0) != columnIndices.extent (0),
452  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
453  "ind[, params]) did not set the local graph correctly." << suffix);
454  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
455  (lclGraph.entries.extent (0) != values.extent (0),
456  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
457  "params]) did not set the local graph correctly. "
458  "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
459  << " != values.extent(0) = " << values.extent (0) << suffix);
460 
461  // myGraph_ not null means that the matrix owns the graph. This
462  // is true because the column indices come in as nonconst,
463  // implying shared ownership.
464  myGraph_ = graph;
465  staticGraph_ = graph;
466 
467  // The graph may not be fill complete yet. However, it is locally
468  // indexed (since we have a column Map) and has a fixed structure
469  // (due to the input arrays). This means we can allocate the
470  // (1-D) array of values and build the local matrix right now.
471  // Note that the local matrix's number of columns comes from the
472  // column Map, not the domain Map.
473 
474  valuesPacked_wdv = values_wdv_type(values);
475  valuesUnpacked_wdv = valuesPacked_wdv;
476 
477  // FIXME (22 Jun 2016) I would very much like to get rid of
478  // k_values1D_ at some point. I find it confusing to have all
479  // these extra references lying around.
480 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
481 
483  if (verbose) {
484  std::ostringstream os;
485  os << *prefix << "Done" << endl;
486  std::cerr << os.str();
487  }
488  }
489 
490  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
492  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
493  const Teuchos::RCP<const map_type>& colMap,
494  const Teuchos::ArrayRCP<size_t>& ptr,
495  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
496  const Teuchos::ArrayRCP<Scalar>& val,
497  const Teuchos::RCP<Teuchos::ParameterList>& params) :
498  dist_object_type (rowMap),
499  storageStatus_ (Details::STORAGE_1D_PACKED)
500  {
501  using Kokkos::Compat::getKokkosViewDeepCopy;
502  using Teuchos::av_reinterpret_cast;
503  using Teuchos::RCP;
504  using values_type = typename local_matrix_device_type::values_type;
505  using IST = impl_scalar_type;
506  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
507  "RCP<const Map>, ptr, ind, val[, params]): ";
508 
509  RCP<crs_graph_type> graph;
510  try {
511  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
512  ind, params));
513  }
514  catch (std::exception& e) {
515  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
516  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
517  "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
518  "RCP<ParameterList>]) threw an exception: " << e.what ());
519  }
520  // myGraph_ not null means that the matrix owns the graph. This
521  // is true because the column indices come in as nonconst,
522  // implying shared ownership.
523  myGraph_ = graph;
524  staticGraph_ = graph;
525 
526  // The graph may not be fill complete yet. However, it is locally
527  // indexed (since we have a column Map) and has a fixed structure
528  // (due to the input arrays). This means we can allocate the
529  // (1-D) array of values and build the local matrix right now.
530  // Note that the local matrix's number of columns comes from the
531  // column Map, not the domain Map.
532 
533  // The graph _must_ have a local graph at this point. We don't
534  // really care whether CrsGraph's constructor deep-copies or
535  // shallow-copies the input, but the dimensions have to be right.
536  // That's how we tell whether the CrsGraph has a local graph.
537  auto lclGraph = staticGraph_->getLocalGraphDevice ();
538  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
539  (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
540  size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
541  std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
542  "ptr, ind[, params]) did not set the local graph correctly. "
543  "Please report this bug to the Tpetra developers.");
544 
545  values_type valIn =
546  getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
547  valuesPacked_wdv = values_wdv_type(valIn);
548  valuesUnpacked_wdv = valuesPacked_wdv;
549 
550  // FIXME (22 Jun 2016) I would very much like to get rid of
551  // k_values1D_ at some point. I find it confusing to have all
552  // these extra references lying around.
553 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
554 
556  }
557 
558  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
560  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
561  const Teuchos::RCP<const map_type>& colMap,
562  const local_matrix_device_type& lclMatrix,
563  const Teuchos::RCP<Teuchos::ParameterList>& params) :
564  dist_object_type (rowMap),
565  storageStatus_ (Details::STORAGE_1D_PACKED),
566  fillComplete_ (true)
567  {
568  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
569  "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
570  const char suffix[] =
571  " Please report this bug to the Tpetra developers.";
572 
573  Teuchos::RCP<crs_graph_type> graph;
574  try {
575  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
576  lclMatrix.graph, params));
577  }
578  catch (std::exception& e) {
579  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
580  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
581  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
582  "exception: " << e.what ());
583  }
584  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
585  (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
586  "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
587  "did not produce a fill-complete graph. Please report this bug to the "
588  "Tpetra developers.");
589  // myGraph_ not null means that the matrix owns the graph. This
590  // is true because the column indices come in as nonconst through
591  // the matrix, implying shared ownership.
592  myGraph_ = graph;
593  staticGraph_ = graph;
594 
595  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
596  valuesUnpacked_wdv = valuesPacked_wdv;
597 
598  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
599  (isFillActive (), std::logic_error,
600  "At the end of a CrsMatrix constructor that should produce "
601  "a fillComplete matrix, isFillActive() is true." << suffix);
602  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
603  (! isFillComplete (), std::logic_error, "At the end of a "
604  "CrsMatrix constructor that should produce a fillComplete "
605  "matrix, isFillComplete() is false." << suffix);
607  }
608 
609  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
612  const Teuchos::RCP<const map_type>& rowMap,
613  const Teuchos::RCP<const map_type>& colMap,
614  const Teuchos::RCP<const map_type>& domainMap,
615  const Teuchos::RCP<const map_type>& rangeMap,
616  const Teuchos::RCP<Teuchos::ParameterList>& params) :
617  dist_object_type (rowMap),
618  storageStatus_ (Details::STORAGE_1D_PACKED),
619  fillComplete_ (true)
620  {
621  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
622  "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
623  "local_matrix_device_type[, RCP<ParameterList>]): ";
624  const char suffix[] =
625  " Please report this bug to the Tpetra developers.";
626 
627  Teuchos::RCP<crs_graph_type> graph;
628  try {
629  graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
630  domainMap, rangeMap, params));
631  }
632  catch (std::exception& e) {
633  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
634  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
635  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
636  "RCP<ParameterList>]) threw an exception: " << e.what ());
637  }
638  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
639  (! graph->isFillComplete (), std::logic_error, "CrsGraph "
640  "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
641  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
642  "not produce a fillComplete graph." << suffix);
643  // myGraph_ not null means that the matrix owns the graph. This
644  // is true because the column indices come in as nonconst through
645  // the matrix, implying shared ownership.
646  myGraph_ = graph;
647  staticGraph_ = graph;
648 
649  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
650  valuesUnpacked_wdv = valuesPacked_wdv;
651 
652  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
653  (isFillActive (), std::logic_error,
654  "At the end of a CrsMatrix constructor that should produce "
655  "a fillComplete matrix, isFillActive() is true." << suffix);
656  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
657  (! isFillComplete (), std::logic_error, "At the end of a "
658  "CrsMatrix constructor that should produce a fillComplete "
659  "matrix, isFillComplete() is false." << suffix);
661  }
662 
663  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
666  const Teuchos::RCP<const map_type>& rowMap,
667  const Teuchos::RCP<const map_type>& colMap,
668  const Teuchos::RCP<const map_type>& domainMap,
669  const Teuchos::RCP<const map_type>& rangeMap,
670  const Teuchos::RCP<const import_type>& importer,
671  const Teuchos::RCP<const export_type>& exporter,
672  const Teuchos::RCP<Teuchos::ParameterList>& params) :
673  dist_object_type (rowMap),
674  storageStatus_ (Details::STORAGE_1D_PACKED),
675  fillComplete_ (true)
676  {
677  using Teuchos::rcp;
678  const char tfecfFuncName[] = "Tpetra::CrsMatrix"
679  "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
680  const char suffix[] =
681  " Please report this bug to the Tpetra developers.";
682 
683  Teuchos::RCP<crs_graph_type> graph;
684  try {
685  graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
686  domainMap, rangeMap, importer,
687  exporter, params));
688  }
689  catch (std::exception& e) {
690  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
691  (true, std::runtime_error, "CrsGraph constructor "
692  "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
693  "params) threw: " << e.what ());
694  }
695  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
696  (!graph->isFillComplete (), std::logic_error, "CrsGraph "
697  "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
698  "Export, params) did not produce a fill-complete graph. "
699  "Please report this bug to the Tpetra developers.");
700  // myGraph_ not null means that the matrix owns the graph. This
701  // is true because the column indices come in as nonconst through
702  // the matrix, implying shared ownership.
703  myGraph_ = graph;
704  staticGraph_ = graph;
705 
706  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
707  valuesUnpacked_wdv = valuesPacked_wdv;
708 
709  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
710  (isFillActive (), std::logic_error,
711  "At the end of a CrsMatrix constructor that should produce "
712  "a fillComplete matrix, isFillActive() is true." << suffix);
713  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
714  (! isFillComplete (), std::logic_error, "At the end of a "
715  "CrsMatrix constructor that should produce a fillComplete "
716  "matrix, isFillComplete() is false." << suffix);
718  }
719 
720  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
723  const Teuchos::DataAccess copyOrView):
724  dist_object_type (source.getCrsGraph()->getRowMap ()),
725  staticGraph_ (source.getCrsGraph()),
726  storageStatus_ (source.storageStatus_)
727  {
728  const char tfecfFuncName[] = "Tpetra::CrsMatrix("
729  "const CrsMatrix&, const Teuchos::DataAccess): ";
730  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
731  (! source.isFillComplete (), std::invalid_argument,
732  "Source graph must be fillComplete().");
733 
734  if (copyOrView == Teuchos::Copy) {
735  using values_type = typename local_matrix_device_type::values_type;
736  auto vals = source.getLocalValuesDevice (Access::ReadOnly);
737  using Kokkos::view_alloc;
738  using Kokkos::WithoutInitializing;
739  values_type newvals (view_alloc ("val", WithoutInitializing),
740  vals.extent (0));
741  // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
742  Kokkos::deep_copy (newvals, vals);
743  valuesPacked_wdv = values_wdv_type(newvals);
744  valuesUnpacked_wdv = valuesPacked_wdv;
745  fillComplete (source.getDomainMap (), source.getRangeMap ());
746  }
747  else if (copyOrView == Teuchos::View) {
748  valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
749  valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
750  fillComplete (source.getDomainMap (), source.getRangeMap ());
751  }
752  else {
753  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
754  (true, std::invalid_argument, "Second argument 'copyOrView' "
755  "has an invalid value " << copyOrView << ". Valid values "
756  "include Teuchos::Copy = " << Teuchos::Copy << " and "
757  "Teuchos::View = " << Teuchos::View << ".");
758  }
760  }
761 
762  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
763  void
766  {
767  std::swap(crs_matrix.importMV_, this->importMV_);
768  std::swap(crs_matrix.exportMV_, this->exportMV_);
769  std::swap(crs_matrix.staticGraph_, this->staticGraph_);
770  std::swap(crs_matrix.myGraph_, this->myGraph_);
771  std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
772  std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
773  std::swap(crs_matrix.storageStatus_, this->storageStatus_);
774  std::swap(crs_matrix.fillComplete_, this->fillComplete_);
775  std::swap(crs_matrix.nonlocals_, this->nonlocals_);
776  }
777 
778  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
779  Teuchos::RCP<const Teuchos::Comm<int> >
781  getComm () const {
782  return getCrsGraphRef ().getComm ();
783  }
784 
785  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
786  bool
788  isFillComplete () const {
789  return fillComplete_;
790  }
791 
792  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
793  bool
795  isFillActive () const {
796  return ! fillComplete_;
797  }
798 
799  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
800  bool
803  return this->getCrsGraphRef ().isStorageOptimized ();
804  }
805 
806  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
807  bool
810  return getCrsGraphRef ().isLocallyIndexed ();
811  }
812 
813  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
814  bool
817  return getCrsGraphRef ().isGloballyIndexed ();
818  }
819 
820  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
821  bool
823  hasColMap () const {
824  return getCrsGraphRef ().hasColMap ();
825  }
826 
827  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
831  return getCrsGraphRef ().getGlobalNumEntries ();
832  }
833 
834  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
835  size_t
838  return getCrsGraphRef ().getLocalNumEntries ();
839  }
840 
841  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
845  return getCrsGraphRef ().getGlobalNumRows ();
846  }
847 
848  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
852  return getCrsGraphRef ().getGlobalNumCols ();
853  }
854 
855  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
856  size_t
859  return getCrsGraphRef ().getLocalNumRows ();
860  }
861 
862 
863  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
864  size_t
867  return getCrsGraphRef ().getLocalNumCols ();
868  }
869 
870 
871  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
872  size_t
874  getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
875  return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
876  }
877 
878  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
879  size_t
881  getNumEntriesInLocalRow (LocalOrdinal localRow) const {
882  return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
883  }
884 
885  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
886  size_t
889  return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
890  }
891 
892  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
893  size_t
896  return getCrsGraphRef ().getLocalMaxNumRowEntries ();
897  }
898 
899  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
900  GlobalOrdinal
902  getIndexBase () const {
903  return getRowMap ()->getIndexBase ();
904  }
905 
906  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
907  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
909  getRowMap () const {
910  return getCrsGraphRef ().getRowMap ();
911  }
912 
913  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
914  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
916  getColMap () const {
917  return getCrsGraphRef ().getColMap ();
918  }
919 
920  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
921  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
923  getDomainMap () const {
924  return getCrsGraphRef ().getDomainMap ();
925  }
926 
927  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
928  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
930  getRangeMap () const {
931  return getCrsGraphRef ().getRangeMap ();
932  }
933 
934  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
935  Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
937  getGraph () const {
938  if (staticGraph_ != Teuchos::null) {
939  return staticGraph_;
940  }
941  return myGraph_;
942  }
943 
944  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
945  Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
947  getCrsGraph () const {
948  if (staticGraph_ != Teuchos::null) {
949  return staticGraph_;
950  }
951  return myGraph_;
952  }
953 
954  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
957  getCrsGraphRef () const
958  {
959 #ifdef HAVE_TPETRA_DEBUG
960  constexpr bool debug = true;
961 #else
962  constexpr bool debug = false;
963 #endif // HAVE_TPETRA_DEBUG
964 
965  if (! this->staticGraph_.is_null ()) {
966  return * (this->staticGraph_);
967  }
968  else {
969  if (debug) {
970  const char tfecfFuncName[] = "getCrsGraphRef: ";
971  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
972  (this->myGraph_.is_null (), std::logic_error,
973  "Both staticGraph_ and myGraph_ are null. "
974  "Please report this bug to the Tpetra developers.");
975  }
976  return * (this->myGraph_);
977  }
978  }
979 
980  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
981  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
984  {
985  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
986  return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
987  numCols,
988  valuesPacked_wdv.getDeviceView(Access::ReadWrite),
989  staticGraph_->getLocalGraphDevice());
990  }
991 
992  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
993  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
995  getLocalMatrixHost () const
996  {
997  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
998  return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
999  valuesPacked_wdv.getHostView(Access::ReadWrite),
1000  staticGraph_->getLocalGraphHost());
1001  }
1002 
1003 #if KOKKOSKERNELS_VERSION < 40299
1004 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1005  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1006  std::shared_ptr<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_multiply_op_type>
1009  {
1010  auto localMatrix = getLocalMatrixDevice();
1011 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL)
1012  if(this->getLocalNumEntries() <= size_t(Teuchos::OrdinalTraits<LocalOrdinal>::max()))
1013  {
1014  if(this->ordinalRowptrs.data() == nullptr)
1015  {
1016  auto originalRowptrs = localMatrix.graph.row_map;
1017  //create LocalOrdinal-typed copy of the local graph's rowptrs.
1018  //This enables the LocalCrsMatrixOperator to use cuSPARSE SpMV.
1019  this->ordinalRowptrs = ordinal_rowptrs_type(
1020  Kokkos::ViewAllocateWithoutInitializing("CrsMatrix::ordinalRowptrs"), originalRowptrs.extent(0));
1021  auto ordinalRowptrs_ = this->ordinalRowptrs; //don't want to capture 'this'
1022  Kokkos::parallel_for("CrsMatrix::getLocalMultiplyOperator::convertRowptrs",
1023  Kokkos::RangePolicy<execution_space>(0, originalRowptrs.extent(0)),
1024  KOKKOS_LAMBDA(LocalOrdinal i)
1025  {
1026  ordinalRowptrs_(i) = originalRowptrs(i);
1027  });
1028  }
1029  //return local operator using ordinalRowptrs
1030  return std::make_shared<local_multiply_op_type>(
1031  std::make_shared<local_matrix_device_type>(localMatrix), this->ordinalRowptrs);
1032  }
1033 #endif
1034 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1035  return std::make_shared<local_multiply_op_type>(
1036  std::make_shared<local_matrix_device_type>(localMatrix));
1037  }
1038 #endif
1039 
1040  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1041  bool
1043  isStaticGraph () const {
1044  return myGraph_.is_null ();
1045  }
1046 
1047  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1048  bool
1051  return true;
1052  }
1053 
1054  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1055  bool
1058  return true;
1059  }
1060 
1061  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1062  void
1064  allocateValues (ELocalGlobal lg, GraphAllocationStatus gas,
1065  const bool verbose)
1066  {
1067  using Details::Behavior;
1069  using std::endl;
1070  const char tfecfFuncName[] = "allocateValues: ";
1071  const char suffix[] =
1072  " Please report this bug to the Tpetra developers.";
1073  ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1074 
1075  std::unique_ptr<std::string> prefix;
1076  if (verbose) {
1077  prefix = this->createPrefix("CrsMatrix", "allocateValues");
1078  std::ostringstream os;
1079  os << *prefix << "lg: "
1080  << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1081  << ", gas: Graph"
1082  << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1083  << "Allocated" << endl;
1084  std::cerr << os.str();
1085  }
1086 
1087  const bool debug = Behavior::debug("CrsMatrix");
1088  if (debug) {
1089  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1090  (this->staticGraph_.is_null (), std::logic_error,
1091  "staticGraph_ is null." << suffix);
1092 
1093  // If the graph indices are already allocated, then gas should be
1094  // GraphAlreadyAllocated. Otherwise, gas should be
1095  // GraphNotYetAllocated.
1096  if ((gas == GraphAlreadyAllocated) !=
1097  staticGraph_->indicesAreAllocated ()) {
1098  const char err1[] = "The caller has asserted that the graph "
1099  "is ";
1100  const char err2[] = "already allocated, but the static graph "
1101  "says that its indices are ";
1102  const char err3[] = "already allocated. ";
1103  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1104  (gas == GraphAlreadyAllocated &&
1105  ! staticGraph_->indicesAreAllocated (), std::logic_error,
1106  err1 << err2 << "not " << err3 << suffix);
1107  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1108  (gas != GraphAlreadyAllocated &&
1109  staticGraph_->indicesAreAllocated (), std::logic_error,
1110  err1 << "not " << err2 << err3 << suffix);
1111  }
1112 
1113  // If the graph is unallocated, then it had better be a
1114  // matrix-owned graph. ("Matrix-owned graph" means that the
1115  // matrix gets to define the graph structure. If the CrsMatrix
1116  // constructor that takes an RCP<const CrsGraph> was used, then
1117  // the matrix does _not_ own the graph.)
1118  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1119  (! this->staticGraph_->indicesAreAllocated () &&
1120  this->myGraph_.is_null (), std::logic_error,
1121  "The static graph says that its indices are not allocated, "
1122  "but the graph is not owned by the matrix." << suffix);
1123  }
1124 
1125  if (gas == GraphNotYetAllocated) {
1126  if (debug) {
1127  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1128  (this->myGraph_.is_null (), std::logic_error,
1129  "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1130  }
1131  try {
1132  this->myGraph_->allocateIndices (lg, verbose);
1133  }
1134  catch (std::exception& e) {
1135  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1136  (true, std::runtime_error, "CrsGraph::allocateIndices "
1137  "threw an exception: " << e.what ());
1138  }
1139  catch (...) {
1140  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1141  (true, std::runtime_error, "CrsGraph::allocateIndices "
1142  "threw an exception not a subclass of std::exception.");
1143  }
1144  }
1145 
1146  // Allocate matrix values.
1147  const size_t lclTotalNumEntries = this->staticGraph_->getLocalAllocationSize();
1148  if (debug) {
1149  const size_t lclNumRows = this->staticGraph_->getLocalNumRows ();
1150  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1151  (this->staticGraph_->getRowPtrsUnpackedHost()(lclNumRows) != lclTotalNumEntries, std::logic_error,
1152  "length of staticGraph's lclIndsUnpacked does not match final entry of rowPtrsUnapcked_host." << suffix);
1153  }
1154 
1155  // Allocate array of (packed???) matrix values.
1156  using values_type = typename local_matrix_device_type::values_type;
1157  if (verbose) {
1158  std::ostringstream os;
1159  os << *prefix << "Allocate values_wdv: Pre "
1160  << valuesUnpacked_wdv.extent(0) << ", post "
1161  << lclTotalNumEntries << endl;
1162  std::cerr << os.str();
1163  }
1164 // this->k_values1D_ =
1165  valuesUnpacked_wdv = values_wdv_type(
1166  values_type("Tpetra::CrsMatrix::values",
1167  lclTotalNumEntries));
1168  }
1169 
1170 
1171  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1172  void
1174  fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1175  {
1177  using ::Tpetra::Details::getEntryOnHost;
1178  using Teuchos::arcp_const_cast;
1179  using Teuchos::Array;
1180  using Teuchos::ArrayRCP;
1181  using Teuchos::null;
1182  using Teuchos::RCP;
1183  using Teuchos::rcp;
1184  using std::endl;
1185  using row_map_type = typename local_graph_device_type::row_map_type;
1186  using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1187  using values_type = typename local_matrix_device_type::values_type;
1188  Details::ProfilingRegion regionFLGAM
1189  ("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1190 
1191  const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1192  "fillComplete or expertStaticFillComplete): ";
1193  const char suffix[] =
1194  " Please report this bug to the Tpetra developers.";
1195  const bool debug = Details::Behavior::debug("CrsMatrix");
1196  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1197 
1198  std::unique_ptr<std::string> prefix;
1199  if (verbose) {
1200  prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1201  std::ostringstream os;
1202  os << *prefix << endl;
1203  std::cerr << os.str ();
1204  }
1205 
1206  if (debug) {
1207  // fillComplete() only calls fillLocalGraphAndMatrix() if the
1208  // matrix owns the graph, which means myGraph_ is not null.
1209  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1210  (myGraph_.is_null (), std::logic_error, "The nonconst graph "
1211  "(myGraph_) is null. This means that the matrix has a "
1212  "const (a.k.a. \"static\") graph. fillComplete or "
1213  "expertStaticFillComplete should never call "
1214  "fillLocalGraphAndMatrix in that case." << suffix);
1215  }
1216 
1217  const size_t lclNumRows = this->getLocalNumRows ();
1218 
1219  // This method's goal is to fill in the three arrays (compressed
1220  // sparse row format) that define the sparse graph's and matrix's
1221  // structure, and the sparse matrix's values.
1222  //
1223  // Get references to the data in myGraph_, so we can modify them
1224  // as well. Note that we only call fillLocalGraphAndMatrix() if
1225  // the matrix owns the graph, which means myGraph_ is not null.
1226 
1227  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1228  // See: https://github.com/trilinos/Trilinos/issues/12237
1229  //using row_entries_type = decltype (myGraph_->k_numRowEntries_);
1230  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1231 
1232  typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1233  myGraph_->rowPtrsUnpacked_dev_;
1234 
1235  if (debug) {
1236  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1237  (curRowOffsets.extent (0) == 0, std::logic_error,
1238  "curRowOffsets.extent(0) == 0.");
1239  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1240  (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1241  "curRowOffsets.extent(0) = "
1242  << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1243  << (lclNumRows + 1) << ".");
1244  const size_t numOffsets = curRowOffsets.extent (0);
1245  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1246  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1247  (numOffsets != 0 &&
1248  myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
1249  std::logic_error, "numOffsets = " <<
1250  numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1251  << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
1252  << numOffsets << ") = " << valToCheck << ".");
1253  }
1254 
1255  if (myGraph_->getLocalNumEntries() !=
1256  myGraph_->getLocalAllocationSize()) {
1257 
1258  // Use the nonconst version of row_map_type for k_ptrs,
1259  // because row_map_type is const and we need to modify k_ptrs here.
1260  typename row_map_type::non_const_type k_ptrs;
1261  row_map_type k_ptrs_const;
1262  lclinds_1d_type k_inds;
1263  values_type k_vals;
1264 
1265  if (verbose) {
1266  std::ostringstream os;
1267  const auto numEnt = myGraph_->getLocalNumEntries();
1268  const auto allocSize = myGraph_->getLocalAllocationSize();
1269  os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1270  << ", allocSize=" << allocSize << endl;
1271  std::cerr << os.str ();
1272  }
1273  // The matrix's current 1-D storage is "unpacked." This means
1274  // the row offsets may differ from what the final row offsets
1275  // should be. This could happen, for example, if the user
1276  // set an upper
1277  // bound on the number of entries per row, but didn't fill all
1278  // those entries.
1279  if (debug && curRowOffsets.extent (0) != 0) {
1280  const size_t numOffsets =
1281  static_cast<size_t> (curRowOffsets.extent (0));
1282  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1283  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1284  (static_cast<size_t> (valToCheck) !=
1285  static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
1286  std::logic_error, "(unpacked branch) Before "
1287  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1288  << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1289  " = " << valuesUnpacked_wdv.extent (0) << ".");
1290  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1291  (static_cast<size_t> (valToCheck) !=
1292  static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
1293  std::logic_error, "(unpacked branch) Before "
1294  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1295  << ") = " << valToCheck
1296  << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1297  << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
1298  }
1299  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1300  // the array of valid entry counts per row.
1301 
1302  // Total number of entries in the matrix on the calling
1303  // process. We will compute this in the loop below. It's
1304  // cheap to compute and useful as a sanity check.
1305  size_t lclTotalNumEntries = 0;
1306  {
1307  // Allocate the packed row offsets array. We use a nonconst
1308  // temporary (packedRowOffsets) here, because k_ptrs is
1309  // const. We will assign packedRowOffsets to k_ptrs below.
1310  if (verbose) {
1311  std::ostringstream os;
1312  os << *prefix << "Allocate packed row offsets: "
1313  << (lclNumRows+1) << endl;
1314  std::cerr << os.str ();
1315  }
1316  typename row_map_type::non_const_type
1317  packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1318  typename row_entries_type::const_type numRowEnt_h =
1319  myGraph_->k_numRowEntries_;
1320  // We're computing offsets on device. This function can
1321  // handle numRowEnt_h being a host View.
1322  lclTotalNumEntries =
1323  computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1324  // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1325  // to use packedRowOffsets in the loop above and assign here.
1326  k_ptrs = packedRowOffsets;
1327  k_ptrs_const = k_ptrs;
1328  }
1329 
1330  if (debug) {
1331  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1332  (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1333  std::logic_error,
1334  "(unpacked branch) After packing k_ptrs, "
1335  "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1336  "lclNumRows+1 = " << (lclNumRows+1) << ".");
1337  const auto valToCheck = getEntryOnHost (k_ptrs, lclNumRows);
1338  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1339  (valToCheck != lclTotalNumEntries, std::logic_error,
1340  "(unpacked branch) After filling k_ptrs, "
1341  "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1342  << " != total number of entries on the calling process = "
1343  << lclTotalNumEntries << ".");
1344  }
1345 
1346  // Allocate the arrays of packed column indices and values.
1347  if (verbose) {
1348  std::ostringstream os;
1349  os << *prefix << "Allocate packed local column indices: "
1350  << lclTotalNumEntries << endl;
1351  std::cerr << os.str ();
1352  }
1353  k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1354  if (verbose) {
1355  std::ostringstream os;
1356  os << *prefix << "Allocate packed values: "
1357  << lclTotalNumEntries << endl;
1358  std::cerr << os.str ();
1359  }
1360  k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1361 
1362  // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1363  // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1364  // the packed row offsets array k_ptrs that we created above.
1365  //
1366  // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1367  // need to keep around the unpacked row offsets, column
1368  // indices, and values arrays.
1369 
1370  // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1371  // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1372  using inds_packer_type = pack_functor<
1373  typename Graph::local_graph_device_type::entries_type::non_const_type,
1374  typename Graph::local_inds_dualv_type::t_dev::const_type,
1375  typename Graph::local_graph_device_type::row_map_type::non_const_type,
1376  typename Graph::local_graph_device_type::row_map_type>;
1377  inds_packer_type indsPacker (
1378  k_inds,
1379  myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1380  k_ptrs, curRowOffsets);
1381  using exec_space = typename decltype (k_inds)::execution_space;
1382  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1383  Kokkos::parallel_for
1384  ("Tpetra::CrsMatrix pack column indices",
1385  range_type (0, lclNumRows), indsPacker);
1386 
1387  // Pack the values from unpacked valuesUnpacked_wdv into packed
1388  // k_vals. We will replace valuesPacked_wdv below.
1389  using vals_packer_type = pack_functor<
1390  typename values_type::non_const_type,
1391  typename values_type::const_type,
1392  typename row_map_type::non_const_type,
1393  typename row_map_type::const_type>;
1394  vals_packer_type valsPacker (
1395  k_vals,
1396  this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1397  k_ptrs, curRowOffsets);
1398  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1399  range_type (0, lclNumRows), valsPacker);
1400 
1401  if (debug) {
1402  const char myPrefix[] = "(\"Optimize Storage\""
1403  "=true branch) After packing, ";
1404  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1405  (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
1406  << "k_ptrs.extent(0) = 0. This probably means that "
1407  "rowPtrsUnpacked_ was never allocated.");
1408  if (k_ptrs.extent (0) != 0) {
1409  const size_t numOffsets (k_ptrs.extent (0));
1410  const auto valToCheck =
1411  getEntryOnHost (k_ptrs, numOffsets - 1);
1412  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1413  (size_t (valToCheck) != k_vals.extent (0),
1414  std::logic_error, myPrefix <<
1415  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1416  " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1417  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1418  (size_t (valToCheck) != k_inds.extent (0),
1419  std::logic_error, myPrefix <<
1420  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1421  " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1422  }
1423  }
1424  // Build the local graph.
1425  myGraph_->setRowPtrsPacked(k_ptrs_const);
1426  myGraph_->lclIndsPacked_wdv =
1427  typename crs_graph_type::local_inds_wdv_type(k_inds);
1428  valuesPacked_wdv = values_wdv_type(k_vals);
1429  }
1430  else { // We don't have to pack, so just set the pointers.
1431  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1432  // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
1433  myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1434  myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1435  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1436  myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1437  valuesPacked_wdv = valuesUnpacked_wdv;
1438 
1439  if (verbose) {
1440  std::ostringstream os;
1441  os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1442  << myGraph_->getRowPtrsUnpackedHost().extent(0) << ", lclIndsUnpacked_wdv: "
1443  << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1444  << valuesUnpacked_wdv.extent(0) << endl;
1445  std::cerr << os.str();
1446  }
1447 
1448  if (debug) {
1449  const char myPrefix[] =
1450  "(\"Optimize Storage\"=false branch) ";
1451  auto rowPtrsUnpackedHost = myGraph_->getRowPtrsUnpackedHost();
1452  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1453  (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
1454  << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1455  "that rowPtrsUnpacked_ was never allocated.");
1456  if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
1457  const size_t numOffsets = rowPtrsUnpackedHost.extent (0);
1458  const auto valToCheck = rowPtrsUnpackedHost(numOffsets - 1);
1459  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1460  (size_t (valToCheck) != valuesPacked_wdv.extent (0),
1461  std::logic_error, myPrefix <<
1462  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1463  << " != valuesPacked_wdv.extent(0) = "
1464  << valuesPacked_wdv.extent (0) << ".");
1465  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1466  (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
1467  std::logic_error, myPrefix <<
1468  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1469  << " != myGraph_->lclIndsPacked.extent(0) = "
1470  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1471  }
1472  }
1473  }
1474 
1475  if (debug) {
1476  const char myPrefix[] = "After packing, ";
1477  auto rowPtrsPackedHost = myGraph_->getRowPtrsPackedHost();
1478  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1479  (size_t (rowPtrsPackedHost.extent (0)) != size_t (lclNumRows + 1),
1480  std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = "
1481  << rowPtrsPackedHost.extent (0) << " != lclNumRows+1 = " <<
1482  (lclNumRows+1) << ".");
1483  if (rowPtrsPackedHost.extent (0) != 0) {
1484  const size_t numOffsets (rowPtrsPackedHost.extent (0));
1485  const size_t valToCheck = rowPtrsPackedHost(numOffsets-1);
1486  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1487  (valToCheck != size_t (valuesPacked_wdv.extent (0)),
1488  std::logic_error, myPrefix << "k_ptrs_const(" <<
1489  (numOffsets-1) << ") = " << valToCheck
1490  << " != valuesPacked_wdv.extent(0) = "
1491  << valuesPacked_wdv.extent (0) << ".");
1492  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1493  (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
1494  std::logic_error, myPrefix << "k_ptrs_const(" <<
1495  (numOffsets-1) << ") = " << valToCheck
1496  << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = "
1497  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1498  }
1499  }
1500 
1501  // May we ditch the old allocations for the packed (and otherwise
1502  // "optimized") allocations, later in this routine? Optimize
1503  // storage if the graph is not static, or if the graph already has
1504  // optimized storage.
1505  const bool defaultOptStorage =
1506  ! isStaticGraph () || staticGraph_->isStorageOptimized ();
1507  const bool requestOptimizedStorage =
1508  (! params.is_null () &&
1509  params->get ("Optimize Storage", defaultOptStorage)) ||
1510  (params.is_null () && defaultOptStorage);
1511 
1512  // The graph has optimized storage when indices are allocated,
1513  // myGraph_->k_numRowEntries_ is empty, and there are more than
1514  // zero rows on this process.
1515  if (requestOptimizedStorage) {
1516  // Free the old, unpacked, unoptimized allocations.
1517  // Free graph data structures that are only needed for
1518  // unpacked 1-D storage.
1519  if (verbose) {
1520  std::ostringstream os;
1521  os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1522  << myGraph_->k_numRowEntries_.extent(0) << endl;
1523  std::cerr << os.str();
1524  }
1525 
1526  myGraph_->k_numRowEntries_ = row_entries_type ();
1527 
1528  // Keep the new 1-D packed allocations.
1529  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1530  // We directly set the memory spaces to avoid a memcpy from device to host
1531  myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1532  myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1533  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1534  myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1535  valuesUnpacked_wdv = valuesPacked_wdv;
1536 
1537  myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1538  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1539  }
1540  else {
1541  if (verbose) {
1542  std::ostringstream os;
1543  os << *prefix << "User requested NOT to optimize storage"
1544  << endl;
1545  std::cerr << os.str();
1546  }
1547  }
1548  }
1549 
1550  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1551  void
1553  fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1554  {
1555  using ::Tpetra::Details::ProfilingRegion;
1556  using Teuchos::ArrayRCP;
1557  using Teuchos::Array;
1558  using Teuchos::null;
1559  using Teuchos::RCP;
1560  using Teuchos::rcp;
1561  using std::endl;
1562  using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1563  using non_const_row_map_type = typename row_map_type::non_const_type;
1564  using values_type = typename local_matrix_device_type::values_type;
1565  ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1566  const size_t lclNumRows = getLocalNumRows();
1567 
1568  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1569  std::unique_ptr<std::string> prefix;
1570  if (verbose) {
1571  prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1572  std::ostringstream os;
1573  os << *prefix << "lclNumRows: " << lclNumRows << endl;
1574  std::cerr << os.str ();
1575  }
1576 
1577  // The goals of this routine are first, to allocate and fill
1578  // packed 1-D storage (see below for an explanation) in the vals
1579  // array, and second, to give vals to the local matrix and
1580  // finalize the local matrix. We only need k_ptrs, the packed 1-D
1581  // row offsets, within the scope of this routine, since we're only
1582  // filling the local matrix here (use fillLocalGraphAndMatrix() to
1583  // fill both the graph and the matrix at the same time).
1584 
1585  // get data from staticGraph_
1586  size_t nodeNumEntries = staticGraph_->getLocalNumEntries ();
1587  size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize ();
1588  row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1589 
1590  row_map_type k_ptrs; // "packed" row offsets array
1591  values_type k_vals; // "packed" values array
1592 
1593  // May we ditch the old allocations for the packed (and otherwise
1594  // "optimized") allocations, later in this routine? Request
1595  // optimized storage by default.
1596  bool requestOptimizedStorage = true;
1597  const bool default_OptimizeStorage =
1598  ! isStaticGraph() || staticGraph_->isStorageOptimized();
1599  if (! params.is_null() &&
1600  ! params->get("Optimize Storage", default_OptimizeStorage)) {
1601  requestOptimizedStorage = false;
1602  }
1603  // If we're not allowed to change a static graph, then we can't
1604  // change the storage of the matrix, either. This means that if
1605  // the graph's storage isn't already optimized, we can't optimize
1606  // the matrix's storage either. Check and give warning, as
1607  // appropriate.
1608  if (! staticGraph_->isStorageOptimized () &&
1609  requestOptimizedStorage) {
1611  (true, std::runtime_error, "You requested optimized storage "
1612  "by setting the \"Optimize Storage\" flag to \"true\" in "
1613  "the ParameterList, or by virtue of default behavior. "
1614  "However, the associated CrsGraph was filled separately and "
1615  "requested not to optimize storage. Therefore, the "
1616  "CrsMatrix cannot optimize storage.");
1617  requestOptimizedStorage = false;
1618  }
1619 
1620  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1621  // See: https://github.com/trilinos/Trilinos/issues/12237
1622  //using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1623  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1624 
1625  // The matrix's values are currently
1626  // stored in a 1-D format. However, this format is "unpacked";
1627  // it doesn't necessarily have the same row offsets as indicated
1628  // by the ptrs array returned by allocRowPtrs. This could
1629  // happen, for example, if the user
1630  // fixed the number of matrix entries in
1631  // each row, but didn't fill all those entries.
1632  //
1633  // As above, we don't need to keep the "packed" row offsets
1634  // array ptrs here, but we do need it here temporarily, so we
1635  // have to allocate it. We'll free ptrs later in this method.
1636  //
1637  // Note that this routine checks whether storage has already
1638  // been packed. This is a common case for solution of nonlinear
1639  // PDEs using the finite element method, as long as the
1640  // structure of the sparse matrix does not change between linear
1641  // solves.
1642  if (nodeNumEntries != nodeNumAllocated) {
1643  if (verbose) {
1644  std::ostringstream os;
1645  os << *prefix << "Unpacked 1-D storage: numEnt="
1646  << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1647  << endl;
1648  std::cerr << os.str();
1649  }
1650  // We have to pack the 1-D storage, since the user didn't fill
1651  // up all requested storage.
1652  if (verbose) {
1653  std::ostringstream os;
1654  os << *prefix << "Allocate packed row offsets: "
1655  << (lclNumRows+1) << endl;
1656  std::cerr << os.str();
1657  }
1658  non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1659  lclNumRows+1);
1660  // Total number of entries in the matrix on the calling
1661  // process. We will compute this in the loop below. It's
1662  // cheap to compute and useful as a sanity check.
1663  size_t lclTotalNumEntries = 0;
1664  k_ptrs = tmpk_ptrs;
1665  {
1666  typename row_entries_type::const_type numRowEnt_h =
1667  staticGraph_->k_numRowEntries_;
1668  // This function can handle the counts being a host View.
1669  lclTotalNumEntries =
1670  Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
1671  }
1672 
1673  // Allocate the "packed" values array.
1674  // It has exactly the right number of entries.
1675  if (verbose) {
1676  std::ostringstream os;
1677  os << *prefix << "Allocate packed values: "
1678  << lclTotalNumEntries << endl;
1679  std::cerr << os.str ();
1680  }
1681  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1682 
1683  // Pack values_wdv into k_vals. We will replace values_wdv below.
1684  pack_functor<
1685  typename values_type::non_const_type,
1686  typename values_type::const_type,
1687  typename row_map_type::non_const_type,
1688  typename row_map_type::const_type> valsPacker
1689  (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1690  tmpk_ptrs, k_rowPtrs);
1691 
1692  using exec_space = typename decltype (k_vals)::execution_space;
1693  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1694  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1695  range_type (0, lclNumRows), valsPacker);
1696  valuesPacked_wdv = values_wdv_type(k_vals);
1697  }
1698  else { // We don't have to pack, so just set the pointer.
1699  valuesPacked_wdv = valuesUnpacked_wdv;
1700  if (verbose) {
1701  std::ostringstream os;
1702  os << *prefix << "Storage already packed: "
1703  << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1704  std::cerr << os.str();
1705  }
1706  }
1707 
1708  // May we ditch the old allocations for the packed one?
1709  if (requestOptimizedStorage) {
1710  // The user requested optimized storage, so we can dump the
1711  // unpacked 1-D storage, and keep the packed storage.
1712  valuesUnpacked_wdv = valuesPacked_wdv;
1713 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1714  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1715  }
1716  }
1717 
1718  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1719  void
1721  insertIndicesAndValues (crs_graph_type& graph,
1722  RowInfo& rowInfo,
1723  const typename crs_graph_type::SLocalGlobalViews& newInds,
1724  const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1725  const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1726  const ELocalGlobal lg,
1727  const ELocalGlobal I)
1728  {
1729  const size_t oldNumEnt = rowInfo.numEntries;
1730  const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
1731 
1732  // Use of memcpy here works around an issue with GCC >= 4.9.0,
1733  // that probably relates to scalar_type vs. impl_scalar_type
1734  // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1735  // details; look for GCC_WORKAROUND macro definition.
1736  if (numInserted > 0) {
1737  const size_t startOffset = oldNumEnt;
1738  memcpy ((void*) &oldRowVals[startOffset], &newRowVals[0],
1739  numInserted * sizeof (impl_scalar_type));
1740  }
1741  }
1742 
1743  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1744  void
1746  insertLocalValues (const LocalOrdinal lclRow,
1747  const Teuchos::ArrayView<const LocalOrdinal>& indices,
1748  const Teuchos::ArrayView<const Scalar>& values,
1749  const CombineMode CM)
1750  {
1751  using std::endl;
1752  const char tfecfFuncName[] = "insertLocalValues: ";
1753 
1754  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1755  (! this->isFillActive (), std::runtime_error,
1756  "Fill is not active. After calling fillComplete, you must call "
1757  "resumeFill before you may insert entries into the matrix again.");
1758  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1759  (this->isStaticGraph (), std::runtime_error,
1760  "Cannot insert indices with static graph; use replaceLocalValues() "
1761  "instead.");
1762  // At this point, we know that myGraph_ is nonnull.
1763  crs_graph_type& graph = * (this->myGraph_);
1764  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1765  (graph.colMap_.is_null (), std::runtime_error,
1766  "Cannot insert local indices without a column map.");
1767  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1768  (graph.isGloballyIndexed (),
1769  std::runtime_error, "Graph indices are global; use "
1770  "insertGlobalValues().");
1771  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1772  (values.size () != indices.size (), std::runtime_error,
1773  "values.size() = " << values.size ()
1774  << " != indices.size() = " << indices.size () << ".");
1775  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1776  ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
1777  "Local row index " << lclRow << " does not belong to this process.");
1778 
1779  if (! graph.indicesAreAllocated ()) {
1780  // We only allocate values at most once per process, so it's OK
1781  // to check TPETRA_VERBOSE here.
1782  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1783  this->allocateValues (LocalIndices, GraphNotYetAllocated, verbose);
1784  }
1785 
1786 #ifdef HAVE_TPETRA_DEBUG
1787  const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
1788  // In a debug build, test whether any of the given column indices
1789  // are not in the column Map. Keep track of the invalid column
1790  // indices so we can tell the user about them.
1791  {
1792  using Teuchos::toString;
1793 
1794  const map_type& colMap = * (graph.colMap_);
1795  Teuchos::Array<LocalOrdinal> badColInds;
1796  bool allInColMap = true;
1797  for (size_t k = 0; k < numEntriesToAdd; ++k) {
1798  if (! colMap.isNodeLocalElement (indices[k])) {
1799  allInColMap = false;
1800  badColInds.push_back (indices[k]);
1801  }
1802  }
1803  if (! allInColMap) {
1804  std::ostringstream os;
1805  os << "You attempted to insert entries in owned row " << lclRow
1806  << ", at the following column indices: " << toString (indices)
1807  << "." << endl;
1808  os << "Of those, the following indices are not in the column Map on "
1809  "this process: " << toString (badColInds) << "." << endl << "Since "
1810  "the matrix has a column Map already, it is invalid to insert "
1811  "entries at those locations.";
1812  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1813  (true, std::invalid_argument, os.str ());
1814  }
1815  }
1816 #endif // HAVE_TPETRA_DEBUG
1817 
1818  RowInfo rowInfo = graph.getRowInfo (lclRow);
1819 
1820  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1821  if (CM == ADD) {
1822  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1823  valsView[offset] += values[k]; };
1824  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1825  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1826  } else if (CM == INSERT) {
1827  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1828  valsView[offset] = values[k]; };
1829  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1830  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1831  } else {
1832  std::ostringstream os;
1833  os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1834  << "but this has not been implemented." << endl;
1835  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1836  (true, std::invalid_argument, os.str ());
1837  }
1838  }
1839 
1840  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1841  void
1843  insertLocalValues (const LocalOrdinal localRow,
1844  const LocalOrdinal numEnt,
1845  const Scalar vals[],
1846  const LocalOrdinal cols[],
1847  const CombineMode CM)
1848  {
1849  Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
1850  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
1851  this->insertLocalValues (localRow, colsT, valsT, CM);
1852  }
1853 
1854  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1855  void
1858  RowInfo& rowInfo,
1859  const GlobalOrdinal gblColInds[],
1860  const impl_scalar_type vals[],
1861  const size_t numInputEnt)
1862  {
1863 #ifdef HAVE_TPETRA_DEBUG
1864  const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
1865  const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
1866  const size_t curNumEnt = rowInfo.numEntries;
1867 #endif // HAVE_TPETRA_DEBUG
1868 
1869  if (! graph.indicesAreAllocated ()) {
1870  // We only allocate values at most once per process, so it's OK
1871  // to check TPETRA_VERBOSE here.
1872  using ::Tpetra::Details::Behavior;
1873  const bool verbose = Behavior::verbose("CrsMatrix");
1874  this->allocateValues (GlobalIndices, GraphNotYetAllocated, verbose);
1875  // mfh 23 Jul 2017: allocateValues invalidates existing
1876  // getRowInfo results. Once we get rid of lazy graph
1877  // allocation, we'll be able to move the getRowInfo call outside
1878  // of this method.
1879  rowInfo = graph.getRowInfo (rowInfo.localRow);
1880  }
1881 
1882  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1883  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
1884  valsView[offset] += vals[k];
1885  };
1886  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1887 #ifdef HAVE_TPETRA_DEBUG
1888  //numInserted is only used inside the debug code below.
1889  auto numInserted =
1890 #endif
1891  graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1892 
1893 #ifdef HAVE_TPETRA_DEBUG
1894  size_t newNumEnt = curNumEnt + numInserted;
1895  const size_t chkNewNumEnt =
1896  graph.getNumEntriesInLocalRow (rowInfo.localRow);
1897  if (chkNewNumEnt != newNumEnt) {
1898  std::ostringstream os;
1899  os << std::endl << "newNumEnt = " << newNumEnt
1900  << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1901  << ") = " << chkNewNumEnt << "." << std::endl
1902  << "\torigNumEnt: " << origNumEnt << std::endl
1903  << "\tnumInputEnt: " << numInputEnt << std::endl
1904  << "\tgblColInds: [";
1905  for (size_t k = 0; k < numInputEnt; ++k) {
1906  os << gblColInds[k];
1907  if (k + size_t (1) < numInputEnt) {
1908  os << ",";
1909  }
1910  }
1911  os << "]" << std::endl
1912  << "\tvals: [";
1913  for (size_t k = 0; k < numInputEnt; ++k) {
1914  os << vals[k];
1915  if (k + size_t (1) < numInputEnt) {
1916  os << ",";
1917  }
1918  }
1919  os << "]" << std::endl;
1920 
1921  if (this->supportsRowViews ()) {
1922  values_host_view_type vals2;
1923  if (this->isGloballyIndexed ()) {
1924  global_inds_host_view_type gblColInds2;
1925  const GlobalOrdinal gblRow =
1926  graph.rowMap_->getGlobalElement (rowInfo.localRow);
1927  if (gblRow ==
1928  Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
1929  os << "Local row index " << rowInfo.localRow << " is invalid!"
1930  << std::endl;
1931  }
1932  else {
1933  bool getViewThrew = false;
1934  try {
1935  this->getGlobalRowView (gblRow, gblColInds2, vals2);
1936  }
1937  catch (std::exception& e) {
1938  getViewThrew = true;
1939  os << "getGlobalRowView threw exception:" << std::endl
1940  << e.what () << std::endl;
1941  }
1942  if (! getViewThrew) {
1943  os << "\tNew global column indices: ";
1944  for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1945  os << gblColInds2[jjj] << " ";
1946  os << std::endl;
1947  os << "\tNew values: ";
1948  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1949  os << vals2[jjj] << " ";
1950  os << std::endl;
1951  }
1952  }
1953  }
1954  else if (this->isLocallyIndexed ()) {
1955  local_inds_host_view_type lclColInds2;
1956  this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
1957  os << "\tNew local column indices: ";
1958  for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1959  os << lclColInds2[jjj] << " ";
1960  os << std::endl;
1961  os << "\tNew values: ";
1962  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1963  os << vals2[jjj] << " ";
1964  os << std::endl;
1965  }
1966  }
1967 
1968  os << "Please report this bug to the Tpetra developers.";
1969  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1970  (true, std::logic_error, os.str ());
1971  }
1972 #endif // HAVE_TPETRA_DEBUG
1973  }
1974 
1975  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1976  void
1978  insertGlobalValues (const GlobalOrdinal gblRow,
1979  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
1980  const Teuchos::ArrayView<const Scalar>& values)
1981  {
1982  using Teuchos::toString;
1983  using std::endl;
1984  typedef impl_scalar_type IST;
1985  typedef LocalOrdinal LO;
1986  typedef GlobalOrdinal GO;
1987  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
1988  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
1989  const char tfecfFuncName[] = "insertGlobalValues: ";
1990 
1991 #ifdef HAVE_TPETRA_DEBUG
1992  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1993  (values.size () != indices.size (), std::runtime_error,
1994  "values.size() = " << values.size () << " != indices.size() = "
1995  << indices.size () << ".");
1996 #endif // HAVE_TPETRA_DEBUG
1997 
1998  // getRowMap() is not thread safe, because it increments RCP's
1999  // reference count. getCrsGraphRef() is thread safe.
2000  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2001  const LO lclRow = rowMap.getLocalElement (gblRow);
2002 
2003  if (lclRow == OTLO::invalid ()) {
2004  // Input row is _not_ owned by the calling process.
2005  //
2006  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2007  // is not in the row Map, it doesn't matter whether or not the
2008  // graph is static; the data just get stashed for later use by
2009  // globalAssemble().
2010  this->insertNonownedGlobalValues (gblRow, indices, values);
2011  }
2012  else { // Input row _is_ owned by the calling process
2013  if (this->isStaticGraph ()) {
2014  // Uh oh! Not allowed to insert into owned rows in that case.
2015  const int myRank = rowMap.getComm ()->getRank ();
2016  const int numProcs = rowMap.getComm ()->getSize ();
2017  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2018  (true, std::runtime_error,
2019  "The matrix was constructed with a constant (\"static\") graph, "
2020  "yet the given global row index " << gblRow << " is in the row "
2021  "Map on the calling process (with rank " << myRank << ", of " <<
2022  numProcs << " process(es)). In this case, you may not insert "
2023  "new entries into rows owned by the calling process.");
2024  }
2025 
2026  crs_graph_type& graph = * (this->myGraph_);
2027  const IST* const inputVals =
2028  reinterpret_cast<const IST*> (values.getRawPtr ());
2029  const GO* const inputGblColInds = indices.getRawPtr ();
2030  const size_t numInputEnt = indices.size ();
2031  RowInfo rowInfo = graph.getRowInfo (lclRow);
2032 
2033  // If the matrix has a column Map, check at this point whether
2034  // the column indices belong to the column Map.
2035  //
2036  // FIXME (mfh 16 May 2013) We may want to consider deferring the
2037  // test to the CrsGraph method, since it may have to do this
2038  // anyway.
2039  if (! graph.colMap_.is_null ()) {
2040  const map_type& colMap = * (graph.colMap_);
2041  // In a debug build, keep track of the nonowned ("bad") column
2042  // indices, so that we can display them in the exception
2043  // message. In a release build, just ditch the loop early if
2044  // we encounter a nonowned column index.
2045 #ifdef HAVE_TPETRA_DEBUG
2046  Teuchos::Array<GO> badColInds;
2047 #endif // HAVE_TPETRA_DEBUG
2048  const size_type numEntriesToInsert = indices.size ();
2049  bool allInColMap = true;
2050  for (size_type k = 0; k < numEntriesToInsert; ++k) {
2051  if (! colMap.isNodeGlobalElement (indices[k])) {
2052  allInColMap = false;
2053 #ifdef HAVE_TPETRA_DEBUG
2054  badColInds.push_back (indices[k]);
2055 #else
2056  break;
2057 #endif // HAVE_TPETRA_DEBUG
2058  }
2059  }
2060  if (! allInColMap) {
2061  std::ostringstream os;
2062  os << "You attempted to insert entries in owned row " << gblRow
2063  << ", at the following column indices: " << toString (indices)
2064  << "." << endl;
2065 #ifdef HAVE_TPETRA_DEBUG
2066  os << "Of those, the following indices are not in the column Map "
2067  "on this process: " << toString (badColInds) << "." << endl
2068  << "Since the matrix has a column Map already, it is invalid "
2069  "to insert entries at those locations.";
2070 #else
2071  os << "At least one of those indices is not in the column Map "
2072  "on this process." << endl << "It is invalid to insert into "
2073  "columns not in the column Map on the process that owns the "
2074  "row.";
2075 #endif // HAVE_TPETRA_DEBUG
2076  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2077  (true, std::invalid_argument, os.str ());
2078  }
2079  }
2080 
2081  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2082  inputVals, numInputEnt);
2083  }
2084  }
2085 
2086 
2087  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2088  void
2090  insertGlobalValues (const GlobalOrdinal globalRow,
2091  const LocalOrdinal numEnt,
2092  const Scalar vals[],
2093  const GlobalOrdinal inds[])
2094  {
2095  Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2096  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2097  this->insertGlobalValues (globalRow, indsT, valsT);
2098  }
2099 
2100 
2101  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2102  void
2105  const GlobalOrdinal gblRow,
2106  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2107  const Teuchos::ArrayView<const Scalar>& values,
2108  const bool debug)
2109  {
2110  typedef impl_scalar_type IST;
2111  typedef LocalOrdinal LO;
2112  typedef GlobalOrdinal GO;
2113  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2114  const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2115 
2116  if (debug) {
2117  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2118  (values.size () != indices.size (), std::runtime_error,
2119  "values.size() = " << values.size () << " != indices.size() = "
2120  << indices.size () << ".");
2121  }
2122 
2123  // getRowMap() is not thread safe, because it increments RCP's
2124  // reference count. getCrsGraphRef() is thread safe.
2125  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2126  const LO lclRow = rowMap.getLocalElement (gblRow);
2127  if (lclRow == OTLO::invalid ()) {
2128  // Input row is _not_ owned by the calling process.
2129  //
2130  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2131  // is not in the row Map, it doesn't matter whether or not the
2132  // graph is static; the data just get stashed for later use by
2133  // globalAssemble().
2134  this->insertNonownedGlobalValues (gblRow, indices, values);
2135  }
2136  else { // Input row _is_ owned by the calling process
2137  if (this->isStaticGraph ()) {
2138  // Uh oh! Not allowed to insert into owned rows in that case.
2139  const int myRank = rowMap.getComm ()->getRank ();
2140  const int numProcs = rowMap.getComm ()->getSize ();
2141  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2142  (true, std::runtime_error,
2143  "The matrix was constructed with a constant (\"static\") graph, "
2144  "yet the given global row index " << gblRow << " is in the row "
2145  "Map on the calling process (with rank " << myRank << ", of " <<
2146  numProcs << " process(es)). In this case, you may not insert "
2147  "new entries into rows owned by the calling process.");
2148  }
2149 
2150  crs_graph_type& graph = * (this->myGraph_);
2151  const IST* const inputVals =
2152  reinterpret_cast<const IST*> (values.getRawPtr ());
2153  const GO* const inputGblColInds = indices.getRawPtr ();
2154  const size_t numInputEnt = indices.size ();
2155  RowInfo rowInfo = graph.getRowInfo (lclRow);
2156 
2157  if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2158  // This branch is similar in function to the following branch, but for
2159  // the special case that the target graph is locally indexed.
2160  // In this case, we cannot simply filter
2161  // out global indices that don't exist on the receiving process and
2162  // insert the remaining (global) indices, but we must convert them (the
2163  // remaining global indices) to local and call `insertLocalValues`.
2164  const map_type& colMap = * (graph.colMap_);
2165  size_t curOffset = 0;
2166  while (curOffset < numInputEnt) {
2167  // Find a sequence of input indices that are in the column Map on the
2168  // calling process. Doing a sequence at a time, instead of one at a
2169  // time, amortizes some overhead.
2170  Teuchos::Array<LO> lclIndices;
2171  size_t endOffset = curOffset;
2172  for ( ; endOffset < numInputEnt; ++endOffset) {
2173  auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2174  if (lclIndex != OTLO::invalid())
2175  lclIndices.push_back(lclIndex);
2176  else
2177  break;
2178  }
2179  // curOffset, endOffset: half-exclusive range of indices in the column
2180  // Map on the calling process. If endOffset == curOffset, the range is
2181  // empty.
2182  const LO numIndInSeq = (endOffset - curOffset);
2183  if (numIndInSeq != 0) {
2184  this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2185  }
2186  // Invariant before the increment line: Either endOffset ==
2187  // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2188  // on the calling process.
2189  if (debug) {
2190  const bool invariant = endOffset == numInputEnt ||
2191  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2192  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2193  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2194  }
2195  curOffset = endOffset + 1;
2196  }
2197  }
2198  else if (! graph.colMap_.is_null ()) { // We have a column Map.
2199  const map_type& colMap = * (graph.colMap_);
2200  size_t curOffset = 0;
2201  while (curOffset < numInputEnt) {
2202  // Find a sequence of input indices that are in the column
2203  // Map on the calling process. Doing a sequence at a time,
2204  // instead of one at a time, amortizes some overhead.
2205  size_t endOffset = curOffset;
2206  for ( ; endOffset < numInputEnt &&
2207  colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2208  ++endOffset)
2209  {}
2210  // curOffset, endOffset: half-exclusive range of indices in
2211  // the column Map on the calling process. If endOffset ==
2212  // curOffset, the range is empty.
2213  const LO numIndInSeq = (endOffset - curOffset);
2214  if (numIndInSeq != 0) {
2215  rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2216  this->insertGlobalValuesImpl (graph, rowInfo,
2217  inputGblColInds + curOffset,
2218  inputVals + curOffset,
2219  numIndInSeq);
2220  }
2221  // Invariant before the increment line: Either endOffset ==
2222  // numInputEnt, or inputGblColInds[endOffset] is not in the
2223  // column Map on the calling process.
2224  if (debug) {
2225  const bool invariant = endOffset == numInputEnt ||
2226  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2227  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2228  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2229  }
2230  curOffset = endOffset + 1;
2231  }
2232  }
2233  else { // we don't have a column Map.
2234  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2235  inputVals, numInputEnt);
2236  }
2237  }
2238  }
2239 
2240  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2241  void
2242  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2243  insertGlobalValuesFilteredChecked(
2244  const GlobalOrdinal gblRow,
2245  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2246  const Teuchos::ArrayView<const Scalar>& values,
2247  const char* const prefix,
2248  const bool debug,
2249  const bool verbose)
2250  {
2252  using std::endl;
2253 
2254  try {
2255  insertGlobalValuesFiltered(gblRow, indices, values, debug);
2256  }
2257  catch(std::exception& e) {
2258  std::ostringstream os;
2259  if (verbose) {
2260  const size_t maxNumToPrint =
2262  os << *prefix << ": insertGlobalValuesFiltered threw an "
2263  "exception: " << e.what() << endl
2264  << "Global row index: " << gblRow << endl;
2265  verbosePrintArray(os, indices, "Global column indices",
2266  maxNumToPrint);
2267  os << endl;
2268  verbosePrintArray(os, values, "Values", maxNumToPrint);
2269  os << endl;
2270  }
2271  else {
2272  os << ": insertGlobalValuesFiltered threw an exception: "
2273  << e.what();
2274  }
2275  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2276  }
2277  }
2278 
2279  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2280  LocalOrdinal
2283  const crs_graph_type& graph,
2284  const RowInfo& rowInfo,
2285  const LocalOrdinal inds[],
2286  const impl_scalar_type newVals[],
2287  const LocalOrdinal numElts)
2288  {
2289  typedef LocalOrdinal LO;
2290  typedef GlobalOrdinal GO;
2291  const bool sorted = graph.isSorted ();
2292 
2293  size_t hint = 0; // Guess for the current index k into rowVals
2294  LO numValid = 0; // number of valid local column indices
2295 
2296  if (graph.isLocallyIndexed ()) {
2297  // Get a view of the column indices in the row. This amortizes
2298  // the cost of getting the view over all the entries of inds.
2299  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2300 
2301  for (LO j = 0; j < numElts; ++j) {
2302  const LO lclColInd = inds[j];
2303  const size_t offset =
2304  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2305  lclColInd, hint, sorted);
2306  if (offset != rowInfo.numEntries) {
2307  rowVals[offset] = newVals[j];
2308  hint = offset + 1;
2309  ++numValid;
2310  }
2311  }
2312  }
2313  else if (graph.isGloballyIndexed ()) {
2314  if (graph.colMap_.is_null ()) {
2315  return Teuchos::OrdinalTraits<LO>::invalid ();
2316  }
2317  const map_type colMap = * (graph.colMap_);
2318 
2319  // Get a view of the column indices in the row. This amortizes
2320  // the cost of getting the view over all the entries of inds.
2321  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2322 
2323  for (LO j = 0; j < numElts; ++j) {
2324  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2325  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2326  const size_t offset =
2327  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2328  gblColInd, hint, sorted);
2329  if (offset != rowInfo.numEntries) {
2330  rowVals[offset] = newVals[j];
2331  hint = offset + 1;
2332  ++numValid;
2333  }
2334  }
2335  }
2336  }
2337  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2338  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2339  // to be neither locally nor globally indexed on a process.
2340  // This means that the graph or matrix has no entries on that
2341  // process. Epetra also works like this. It's related to lazy
2342  // allocation (on first insertion, not at graph / matrix
2343  // construction). Lazy allocation will go away because it is
2344  // not thread scalable.
2345 
2346  return numValid;
2347  }
2348 
2349  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2350  LocalOrdinal
2352  replaceLocalValues (const LocalOrdinal localRow,
2353  const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2354  const Teuchos::ArrayView<const Scalar>& vals)
2355  {
2356  typedef LocalOrdinal LO;
2357 
2358  const LO numInputEnt = static_cast<LO> (lclCols.size ());
2359  if (static_cast<LO> (vals.size ()) != numInputEnt) {
2360  return Teuchos::OrdinalTraits<LO>::invalid ();
2361  }
2362  const LO* const inputInds = lclCols.getRawPtr ();
2363  const Scalar* const inputVals = vals.getRawPtr ();
2364  return this->replaceLocalValues (localRow, numInputEnt,
2365  inputVals, inputInds);
2366  }
2367 
2368  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2370  local_ordinal_type
2373  const local_ordinal_type localRow,
2374  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2375  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2376  {
2377  using LO = local_ordinal_type;
2378  const LO numInputEnt = inputInds.extent(0);
2379  if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2380  return Teuchos::OrdinalTraits<LO>::invalid();
2381  }
2382  const Scalar* const inVals =
2383  reinterpret_cast<const Scalar*>(inputVals.data());
2384  return this->replaceLocalValues(localRow, numInputEnt,
2385  inVals, inputInds.data());
2386  }
2387 
2388  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2389  LocalOrdinal
2391  replaceLocalValues (const LocalOrdinal localRow,
2392  const LocalOrdinal numEnt,
2393  const Scalar inputVals[],
2394  const LocalOrdinal inputCols[])
2395  {
2396  typedef impl_scalar_type IST;
2397  typedef LocalOrdinal LO;
2398 
2399  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2400  // Fill must be active and the "nonconst" graph must exist.
2401  return Teuchos::OrdinalTraits<LO>::invalid ();
2402  }
2403  const crs_graph_type& graph = * (this->staticGraph_);
2404  const RowInfo rowInfo = graph.getRowInfo (localRow);
2405 
2406  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2407  // The calling process does not own this row, so it is not
2408  // allowed to modify its values.
2409  return static_cast<LO> (0);
2410  }
2411  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2412  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2413  return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2414  inputCols, inVals, numEnt);
2415  }
2416 
2417  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2418  LocalOrdinal
2421  const crs_graph_type& graph,
2422  const RowInfo& rowInfo,
2423  const GlobalOrdinal inds[],
2424  const impl_scalar_type newVals[],
2425  const LocalOrdinal numElts)
2426  {
2427  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2428  auto fun =
2429  [&](size_t const k, size_t const /*start*/, size_t const offset) {
2430  rowVals[offset] = newVals[k];
2431  };
2432  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2433  return graph.findGlobalIndices(rowInfo, indsT, cb);
2434  }
2435 
2436  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2437  LocalOrdinal
2439  replaceGlobalValues (const GlobalOrdinal globalRow,
2440  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2441  const Teuchos::ArrayView<const Scalar>& inputVals)
2442  {
2443  typedef LocalOrdinal LO;
2444 
2445  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2446  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2447  return Teuchos::OrdinalTraits<LO>::invalid ();
2448  }
2449  return this->replaceGlobalValues (globalRow, numInputEnt,
2450  inputVals.getRawPtr (),
2451  inputGblColInds.getRawPtr ());
2452  }
2453 
2454  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2455  LocalOrdinal
2457  replaceGlobalValues (const GlobalOrdinal globalRow,
2458  const LocalOrdinal numEnt,
2459  const Scalar inputVals[],
2460  const GlobalOrdinal inputGblColInds[])
2461  {
2462  typedef impl_scalar_type IST;
2463  typedef LocalOrdinal LO;
2464 
2465  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2466  // Fill must be active and the "nonconst" graph must exist.
2467  return Teuchos::OrdinalTraits<LO>::invalid ();
2468  }
2469  const crs_graph_type& graph = * (this->staticGraph_);
2470 
2471  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2472  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2473  // The input local row is invalid on the calling process,
2474  // which means that the calling process summed 0 entries.
2475  return static_cast<LO> (0);
2476  }
2477 
2478  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2479  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2480  return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2481  inputGblColInds, inVals, numEnt);
2482  }
2483 
2484  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2486  local_ordinal_type
2489  const global_ordinal_type globalRow,
2490  const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2491  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2492  {
2493  // We use static_assert here to check the template parameters,
2494  // rather than std::enable_if (e.g., on the return value, to
2495  // enable compilation only if the template parameters match the
2496  // desired attributes). This turns obscure link errors into
2497  // clear compilation errors. It also makes the return value a
2498  // lot easier to see.
2499  using LO = local_ordinal_type;
2500  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2501  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2502  return Teuchos::OrdinalTraits<LO>::invalid();
2503  }
2504  const Scalar* const inVals =
2505  reinterpret_cast<const Scalar*>(inputVals.data());
2506  return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2507  inputInds.data());
2508  }
2509 
2510  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2511  LocalOrdinal
2514  const crs_graph_type& graph,
2515  const RowInfo& rowInfo,
2516  const GlobalOrdinal inds[],
2517  const impl_scalar_type newVals[],
2518  const LocalOrdinal numElts,
2519  const bool atomic)
2520  {
2521  typedef LocalOrdinal LO;
2522  typedef GlobalOrdinal GO;
2523 
2524  const bool sorted = graph.isSorted ();
2525 
2526  size_t hint = 0; // guess at the index's relative offset in the row
2527  LO numValid = 0; // number of valid input column indices
2528 
2529  if (graph.isLocallyIndexed ()) {
2530  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2531  // pointer does NOT change its reference count. Thus, this
2532  // code is still thread safe.
2533  if (graph.colMap_.is_null ()) {
2534  // NO input column indices are valid in this case, since if
2535  // the column Map is null on the calling process, then the
2536  // calling process owns no graph entries.
2537  return numValid;
2538  }
2539  const map_type& colMap = * (graph.colMap_);
2540 
2541  // Get a view of the column indices in the row. This amortizes
2542  // the cost of getting the view over all the entries of inds.
2543  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2544  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2545 
2546  for (LO j = 0; j < numElts; ++j) {
2547  const LO lclColInd = colMap.getLocalElement (inds[j]);
2548  if (lclColInd != LINV) {
2549  const size_t offset =
2550  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2551  lclColInd, hint, sorted);
2552  if (offset != rowInfo.numEntries) {
2553  if (atomic) {
2554  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2555  }
2556  else {
2557  rowVals[offset] += newVals[j];
2558  }
2559  hint = offset + 1;
2560  numValid++;
2561  }
2562  }
2563  }
2564  }
2565  else if (graph.isGloballyIndexed ()) {
2566  // Get a view of the column indices in the row. This amortizes
2567  // the cost of getting the view over all the entries of inds.
2568  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2569 
2570  for (LO j = 0; j < numElts; ++j) {
2571  const GO gblColInd = inds[j];
2572  const size_t offset =
2573  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2574  gblColInd, hint, sorted);
2575  if (offset != rowInfo.numEntries) {
2576  if (atomic) {
2577  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2578  }
2579  else {
2580  rowVals[offset] += newVals[j];
2581  }
2582  hint = offset + 1;
2583  numValid++;
2584  }
2585  }
2586  }
2587  // If the graph is neither locally nor globally indexed on the
2588  // calling process, that means the calling process has no graph
2589  // entries. Thus, none of the input column indices are valid.
2590 
2591  return numValid;
2592  }
2593 
2594  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2595  LocalOrdinal
2597  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2598  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2599  const Teuchos::ArrayView<const Scalar>& inputVals,
2600  const bool atomic)
2601  {
2602  typedef LocalOrdinal LO;
2603 
2604  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2605  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2606  return Teuchos::OrdinalTraits<LO>::invalid ();
2607  }
2608  return this->sumIntoGlobalValues (gblRow, numInputEnt,
2609  inputVals.getRawPtr (),
2610  inputGblColInds.getRawPtr (),
2611  atomic);
2612  }
2613 
2614  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2615  LocalOrdinal
2617  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2618  const LocalOrdinal numInputEnt,
2619  const Scalar inputVals[],
2620  const GlobalOrdinal inputGblColInds[],
2621  const bool atomic)
2622  {
2623  typedef impl_scalar_type IST;
2624  typedef LocalOrdinal LO;
2625  typedef GlobalOrdinal GO;
2626 
2627  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2628  // Fill must be active and the "nonconst" graph must exist.
2629  return Teuchos::OrdinalTraits<LO>::invalid ();
2630  }
2631  const crs_graph_type& graph = * (this->staticGraph_);
2632 
2633  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2634  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2635  // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be not
2636  // thread safe in a debug build, in part because it uses
2637  // Teuchos::ArrayView, and in part because of the data structure
2638  // used to stash outgoing entries.
2639  using Teuchos::ArrayView;
2640  ArrayView<const GO> inputGblColInds_av(
2641  numInputEnt == 0 ? nullptr : inputGblColInds,
2642  numInputEnt);
2643  ArrayView<const Scalar> inputVals_av(
2644  numInputEnt == 0 ? nullptr :
2645  inputVals, numInputEnt);
2646  // gblRow is not in the row Map on the calling process, so stash
2647  // the given entries away in a separate data structure.
2648  // globalAssemble() (called during fillComplete()) will exchange
2649  // that data and sum it in using sumIntoGlobalValues().
2650  this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
2651  inputVals_av);
2652  // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2653  // since we won't know whether the given indices were valid
2654  // until globalAssemble (called in fillComplete) is called.
2655  // That's why insertNonownedGlobalValues doesn't return
2656  // anything. Just for consistency, I'll return the number of
2657  // entries that the user gave us.
2658  return numInputEnt;
2659  }
2660  else { // input row is in the row Map on the calling process
2661  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2662  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2663  return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2664  inputGblColInds, inVals,
2665  numInputEnt, atomic);
2666  }
2667  }
2668 
2669  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2670  LocalOrdinal
2672  transformLocalValues (const LocalOrdinal lclRow,
2673  const LocalOrdinal numInputEnt,
2674  const impl_scalar_type inputVals[],
2675  const LocalOrdinal inputCols[],
2676  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2677  const bool atomic)
2678  {
2679  using Tpetra::Details::OrdinalTraits;
2680  typedef LocalOrdinal LO;
2681 
2682  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2683  // Fill must be active and the "nonconst" graph must exist.
2684  return Teuchos::OrdinalTraits<LO>::invalid ();
2685  }
2686  const crs_graph_type& graph = * (this->staticGraph_);
2687  const RowInfo rowInfo = graph.getRowInfo (lclRow);
2688 
2689  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2690  // The calling process does not own this row, so it is not
2691  // allowed to modify its values.
2692  return static_cast<LO> (0);
2693  }
2694  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2695  return this->transformLocalValues (curRowVals.data (), graph,
2696  rowInfo, inputCols, inputVals,
2697  numInputEnt, f, atomic);
2698  }
2699 
2700  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2701  LocalOrdinal
2702  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2703  transformGlobalValues (const GlobalOrdinal gblRow,
2704  const LocalOrdinal numInputEnt,
2705  const impl_scalar_type inputVals[],
2706  const GlobalOrdinal inputCols[],
2707  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2708  const bool atomic)
2709  {
2710  using Tpetra::Details::OrdinalTraits;
2711  typedef LocalOrdinal LO;
2712 
2713  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2714  // Fill must be active and the "nonconst" graph must exist.
2715  return OrdinalTraits<LO>::invalid ();
2716  }
2717  const crs_graph_type& graph = * (this->staticGraph_);
2718  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2719 
2720  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2721  // The calling process does not own this row, so it is not
2722  // allowed to modify its values.
2723  return static_cast<LO> (0);
2724  }
2725  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2726  return this->transformGlobalValues (curRowVals.data (), graph,
2727  rowInfo, inputCols, inputVals,
2728  numInputEnt, f, atomic);
2729  }
2730 
2731  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2732  LocalOrdinal
2733  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2734  transformLocalValues (impl_scalar_type rowVals[],
2735  const crs_graph_type& graph,
2736  const RowInfo& rowInfo,
2737  const LocalOrdinal inds[],
2738  const impl_scalar_type newVals[],
2739  const LocalOrdinal numElts,
2740  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2741  const bool atomic)
2742  {
2743  typedef impl_scalar_type ST;
2744  typedef LocalOrdinal LO;
2745  typedef GlobalOrdinal GO;
2746 
2747  //if (newVals.extent (0) != inds.extent (0)) {
2748  // The sizes of the input arrays must match.
2749  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2750  //}
2751  //const LO numElts = static_cast<LO> (inds.extent (0));
2752  const bool sorted = graph.isSorted ();
2753 
2754  LO numValid = 0; // number of valid input column indices
2755  size_t hint = 0; // Guess for the current index k into rowVals
2756 
2757  if (graph.isLocallyIndexed ()) {
2758  // Get a view of the column indices in the row. This amortizes
2759  // the cost of getting the view over all the entries of inds.
2760  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2761 
2762  for (LO j = 0; j < numElts; ++j) {
2763  const LO lclColInd = inds[j];
2764  const size_t offset =
2765  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2766  lclColInd, hint, sorted);
2767  if (offset != rowInfo.numEntries) {
2768  if (atomic) {
2769  // NOTE (mfh 30 Nov 2015) The commented-out code is
2770  // wrong because another thread may have changed
2771  // rowVals[offset] between those two lines of code.
2772  //
2773  //const ST newVal = f (rowVals[offset], newVals[j]);
2774  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2775 
2776  volatile ST* const dest = &rowVals[offset];
2777  (void) atomic_binary_function_update (dest, newVals[j], f);
2778  }
2779  else {
2780  // use binary function f
2781  rowVals[offset] = f (rowVals[offset], newVals[j]);
2782  }
2783  hint = offset + 1;
2784  ++numValid;
2785  }
2786  }
2787  }
2788  else if (graph.isGloballyIndexed ()) {
2789  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2790  // pointer does NOT change its reference count. Thus, this
2791  // code is still thread safe.
2792  if (graph.colMap_.is_null ()) {
2793  // NO input column indices are valid in this case. Either
2794  // the column Map hasn't been set yet (so local indices
2795  // don't exist yet), or the calling process owns no graph
2796  // entries.
2797  return numValid;
2798  }
2799  const map_type& colMap = * (graph.colMap_);
2800  // Get a view of the column indices in the row. This amortizes
2801  // the cost of getting the view over all the entries of inds.
2802  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2803 
2804  const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
2805  for (LO j = 0; j < numElts; ++j) {
2806  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2807  if (gblColInd != GINV) {
2808  const size_t offset =
2809  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2810  gblColInd, hint, sorted);
2811  if (offset != rowInfo.numEntries) {
2812  if (atomic) {
2813  // NOTE (mfh 30 Nov 2015) The commented-out code is
2814  // wrong because another thread may have changed
2815  // rowVals[offset] between those two lines of code.
2816  //
2817  //const ST newVal = f (rowVals[offset], newVals[j]);
2818  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2819 
2820  volatile ST* const dest = &rowVals[offset];
2821  (void) atomic_binary_function_update (dest, newVals[j], f);
2822  }
2823  else {
2824  // use binary function f
2825  rowVals[offset] = f (rowVals[offset], newVals[j]);
2826  }
2827  hint = offset + 1;
2828  numValid++;
2829  }
2830  }
2831  }
2832  }
2833  // If the graph is neither locally nor globally indexed on the
2834  // calling process, that means the calling process has no graph
2835  // entries. Thus, none of the input column indices are valid.
2836 
2837  return numValid;
2838  }
2839 
2840  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2841  LocalOrdinal
2842  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2843  transformGlobalValues (impl_scalar_type rowVals[],
2844  const crs_graph_type& graph,
2845  const RowInfo& rowInfo,
2846  const GlobalOrdinal inds[],
2847  const impl_scalar_type newVals[],
2848  const LocalOrdinal numElts,
2849  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2850  const bool atomic)
2851  {
2852  typedef impl_scalar_type ST;
2853  typedef LocalOrdinal LO;
2854  typedef GlobalOrdinal GO;
2855 
2856  //if (newVals.extent (0) != inds.extent (0)) {
2857  // The sizes of the input arrays must match.
2858  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2859  //}
2860  //const LO numElts = static_cast<LO> (inds.extent (0));
2861  const bool sorted = graph.isSorted ();
2862 
2863  LO numValid = 0; // number of valid input column indices
2864  size_t hint = 0; // Guess for the current index k into rowVals
2865 
2866  if (graph.isGloballyIndexed ()) {
2867  // Get a view of the column indices in the row. This amortizes
2868  // the cost of getting the view over all the entries of inds.
2869  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2870 
2871  for (LO j = 0; j < numElts; ++j) {
2872  const GO gblColInd = inds[j];
2873  const size_t offset =
2874  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2875  gblColInd, hint, sorted);
2876  if (offset != rowInfo.numEntries) {
2877  if (atomic) {
2878  // NOTE (mfh 30 Nov 2015) The commented-out code is
2879  // wrong because another thread may have changed
2880  // rowVals[offset] between those two lines of code.
2881  //
2882  //const ST newVal = f (rowVals[offset], newVals[j]);
2883  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2884 
2885  volatile ST* const dest = &rowVals[offset];
2886  (void) atomic_binary_function_update (dest, newVals[j], f);
2887  }
2888  else {
2889  // use binary function f
2890  rowVals[offset] = f (rowVals[offset], newVals[j]);
2891  }
2892  hint = offset + 1;
2893  ++numValid;
2894  }
2895  }
2896  }
2897  else if (graph.isLocallyIndexed ()) {
2898  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2899  // pointer does NOT change its reference count. Thus, this
2900  // code is still thread safe.
2901  if (graph.colMap_.is_null ()) {
2902  // NO input column indices are valid in this case. Either the
2903  // column Map hasn't been set yet (so local indices don't
2904  // exist yet), or the calling process owns no graph entries.
2905  return numValid;
2906  }
2907  const map_type& colMap = * (graph.colMap_);
2908  // Get a view of the column indices in the row. This amortizes
2909  // the cost of getting the view over all the entries of inds.
2910  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2911 
2912  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2913  for (LO j = 0; j < numElts; ++j) {
2914  const LO lclColInd = colMap.getLocalElement (inds[j]);
2915  if (lclColInd != LINV) {
2916  const size_t offset =
2917  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2918  lclColInd, hint, sorted);
2919  if (offset != rowInfo.numEntries) {
2920  if (atomic) {
2921  // NOTE (mfh 30 Nov 2015) The commented-out code is
2922  // wrong because another thread may have changed
2923  // rowVals[offset] between those two lines of code.
2924  //
2925  //const ST newVal = f (rowVals[offset], newVals[j]);
2926  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2927 
2928  volatile ST* const dest = &rowVals[offset];
2929  (void) atomic_binary_function_update (dest, newVals[j], f);
2930  }
2931  else {
2932  // use binary function f
2933  rowVals[offset] = f (rowVals[offset], newVals[j]);
2934  }
2935  hint = offset + 1;
2936  numValid++;
2937  }
2938  }
2939  }
2940  }
2941  // If the graph is neither locally nor globally indexed on the
2942  // calling process, that means the calling process has no graph
2943  // entries. Thus, none of the input column indices are valid.
2944 
2945  return numValid;
2946  }
2947 
2948  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2949  LocalOrdinal
2952  const crs_graph_type& graph,
2953  const RowInfo& rowInfo,
2954  const LocalOrdinal inds[],
2955  const impl_scalar_type newVals[],
2956  const LocalOrdinal numElts,
2957  const bool atomic)
2958  {
2959  typedef LocalOrdinal LO;
2960  typedef GlobalOrdinal GO;
2961 
2962  const bool sorted = graph.isSorted ();
2963 
2964  size_t hint = 0; // Guess for the current index k into rowVals
2965  LO numValid = 0; // number of valid local column indices
2966 
2967  if (graph.isLocallyIndexed ()) {
2968  // Get a view of the column indices in the row. This amortizes
2969  // the cost of getting the view over all the entries of inds.
2970  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2971 
2972  for (LO j = 0; j < numElts; ++j) {
2973  const LO lclColInd = inds[j];
2974  const size_t offset =
2975  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2976  lclColInd, hint, sorted);
2977  if (offset != rowInfo.numEntries) {
2978  if (atomic) {
2979  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2980  }
2981  else {
2982  rowVals[offset] += newVals[j];
2983  }
2984  hint = offset + 1;
2985  ++numValid;
2986  }
2987  }
2988  }
2989  else if (graph.isGloballyIndexed ()) {
2990  if (graph.colMap_.is_null ()) {
2991  return Teuchos::OrdinalTraits<LO>::invalid ();
2992  }
2993  const map_type colMap = * (graph.colMap_);
2994 
2995  // Get a view of the column indices in the row. This amortizes
2996  // the cost of getting the view over all the entries of inds.
2997  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2998 
2999  for (LO j = 0; j < numElts; ++j) {
3000  const GO gblColInd = colMap.getGlobalElement (inds[j]);
3001  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
3002  const size_t offset =
3003  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3004  gblColInd, hint, sorted);
3005  if (offset != rowInfo.numEntries) {
3006  if (atomic) {
3007  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3008  }
3009  else {
3010  rowVals[offset] += newVals[j];
3011  }
3012  hint = offset + 1;
3013  ++numValid;
3014  }
3015  }
3016  }
3017  }
3018  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
3019  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
3020  // to be neither locally nor globally indexed on a process.
3021  // This means that the graph or matrix has no entries on that
3022  // process. Epetra also works like this. It's related to lazy
3023  // allocation (on first insertion, not at graph / matrix
3024  // construction). Lazy allocation will go away because it is
3025  // not thread scalable.
3026 
3027  return numValid;
3028  }
3029 
3030  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3031  LocalOrdinal
3033  sumIntoLocalValues (const LocalOrdinal localRow,
3034  const Teuchos::ArrayView<const LocalOrdinal>& indices,
3035  const Teuchos::ArrayView<const Scalar>& values,
3036  const bool atomic)
3037  {
3038  using LO = local_ordinal_type;
3039  const LO numInputEnt = static_cast<LO>(indices.size());
3040  if (static_cast<LO>(values.size()) != numInputEnt) {
3041  return Teuchos::OrdinalTraits<LO>::invalid();
3042  }
3043  const LO* const inputInds = indices.getRawPtr();
3044  const scalar_type* const inputVals = values.getRawPtr();
3045  return this->sumIntoLocalValues(localRow, numInputEnt,
3046  inputVals, inputInds, atomic);
3047  }
3048 
3049  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3051  local_ordinal_type
3054  const local_ordinal_type localRow,
3055  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
3056  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
3057  const bool atomic)
3058  {
3059  using LO = local_ordinal_type;
3060  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
3061  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
3062  return Teuchos::OrdinalTraits<LO>::invalid();
3063  }
3064  const scalar_type* inVals =
3065  reinterpret_cast<const scalar_type*>(inputVals.data());
3066  return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
3067  inputInds.data(), atomic);
3068  }
3069 
3070  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3071  LocalOrdinal
3073  sumIntoLocalValues (const LocalOrdinal localRow,
3074  const LocalOrdinal numEnt,
3075  const Scalar vals[],
3076  const LocalOrdinal cols[],
3077  const bool atomic)
3078  {
3079  typedef impl_scalar_type IST;
3080  typedef LocalOrdinal LO;
3081 
3082  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3083  // Fill must be active and the "nonconst" graph must exist.
3084  return Teuchos::OrdinalTraits<LO>::invalid ();
3085  }
3086  const crs_graph_type& graph = * (this->staticGraph_);
3087  const RowInfo rowInfo = graph.getRowInfo (localRow);
3088 
3089  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3090  // The calling process does not own this row, so it is not
3091  // allowed to modify its values.
3092  return static_cast<LO> (0);
3093  }
3094  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
3095  const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3096  return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3097  cols, inputVals, numEnt, atomic);
3098  }
3099 
3100  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3102  values_dualv_type::t_host::const_type
3104  getValuesViewHost (const RowInfo& rowinfo) const
3105  {
3106  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3107  return typename values_dualv_type::t_host::const_type ();
3108  else
3109  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3110  rowinfo.allocSize,
3111  Access::ReadOnly);
3112  }
3113 
3114  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3116  values_dualv_type::t_host
3119  {
3120  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3121  return typename values_dualv_type::t_host ();
3122  else
3123  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3124  rowinfo.allocSize,
3125  Access::ReadWrite);
3126  }
3127 
3128  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3130  values_dualv_type::t_dev::const_type
3132  getValuesViewDevice (const RowInfo& rowinfo) const
3133  {
3134  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3135  return typename values_dualv_type::t_dev::const_type ();
3136  else
3137  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3138  rowinfo.allocSize,
3139  Access::ReadOnly);
3140  }
3141 
3142  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3144  values_dualv_type::t_dev
3147  {
3148  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3149  return typename values_dualv_type::t_dev ();
3150  else
3151  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3152  rowinfo.allocSize,
3153  Access::ReadWrite);
3154  }
3155 
3156 
3157  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3158  void
3161  nonconst_local_inds_host_view_type &indices,
3162  nonconst_values_host_view_type &values,
3163  size_t& numEntries) const
3164  {
3165  using Teuchos::ArrayView;
3166  using Teuchos::av_reinterpret_cast;
3167  const char tfecfFuncName[] = "getLocalRowCopy: ";
3168 
3169  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3170  (! this->hasColMap (), std::runtime_error,
3171  "The matrix does not have a column Map yet. This means we don't have "
3172  "local indices for columns yet, so it doesn't make sense to call this "
3173  "method. If the matrix doesn't have a column Map yet, you should call "
3174  "fillComplete on it first.");
3175 
3176  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3177  const size_t theNumEntries = rowinfo.numEntries;
3178  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3179  (static_cast<size_t> (indices.size ()) < theNumEntries ||
3180  static_cast<size_t> (values.size ()) < theNumEntries,
3181  std::runtime_error, "Row with local index " << localRow << " has " <<
3182  theNumEntries << " entry/ies, but indices.size() = " <<
3183  indices.size () << " and values.size() = " << values.size () << ".");
3184  numEntries = theNumEntries; // first side effect
3185 
3186  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3187  if (staticGraph_->isLocallyIndexed ()) {
3188  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3189  auto curVals = getValuesViewHost(rowinfo);
3190 
3191  for (size_t j = 0; j < theNumEntries; ++j) {
3192  values[j] = curVals[j];
3193  indices[j] = curLclInds(j);
3194  }
3195  }
3196  else if (staticGraph_->isGloballyIndexed ()) {
3197  // Don't call getColMap(), because it touches RCP's reference count.
3198  const map_type& colMap = * (staticGraph_->colMap_);
3199  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3200  auto curVals = getValuesViewHost(rowinfo);
3201 
3202  for (size_t j = 0; j < theNumEntries; ++j) {
3203  values[j] = curVals[j];
3204  indices[j] = colMap.getLocalElement (curGblInds(j));
3205  }
3206  }
3207  }
3208  }
3209 
3210 
3211 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3212 void
3215  nonconst_global_inds_host_view_type &indices,
3216  nonconst_values_host_view_type &values,
3217  size_t& numEntries) const
3218  {
3219  using Teuchos::ArrayView;
3220  using Teuchos::av_reinterpret_cast;
3221  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3222 
3223  const RowInfo rowinfo =
3224  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3225  const size_t theNumEntries = rowinfo.numEntries;
3226  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3227  static_cast<size_t> (indices.size ()) < theNumEntries ||
3228  static_cast<size_t> (values.size ()) < theNumEntries,
3229  std::runtime_error, "Row with global index " << globalRow << " has "
3230  << theNumEntries << " entry/ies, but indices.size() = " <<
3231  indices.size () << " and values.size() = " << values.size () << ".");
3232  numEntries = theNumEntries; // first side effect
3233 
3234  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3235  if (staticGraph_->isLocallyIndexed ()) {
3236  const map_type& colMap = * (staticGraph_->colMap_);
3237  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3238  auto curVals = getValuesViewHost(rowinfo);
3239 
3240  for (size_t j = 0; j < theNumEntries; ++j) {
3241  values[j] = curVals[j];
3242  indices[j] = colMap.getGlobalElement (curLclInds(j));
3243  }
3244  }
3245  else if (staticGraph_->isGloballyIndexed ()) {
3246  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3247  auto curVals = getValuesViewHost(rowinfo);
3248 
3249  for (size_t j = 0; j < theNumEntries; ++j) {
3250  values[j] = curVals[j];
3251  indices[j] = curGblInds(j);
3252  }
3253  }
3254  }
3255  }
3256 
3257 
3258  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3259  void
3261  getLocalRowView(LocalOrdinal localRow,
3262  local_inds_host_view_type &indices,
3263  values_host_view_type &values) const
3264  {
3265  const char tfecfFuncName[] = "getLocalRowView: ";
3266 
3267  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3268  isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3269  "its indices as global indices, so you cannot get a view with local "
3270  "column indices. If the matrix has a column Map, you may call "
3271  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3272  "a view with global column indices by calling getGlobalRowCopy().");
3273 
3274  const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
3275  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3276  rowInfo.numEntries > 0) {
3277  indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3278  rowInfo.offset1D,
3279  rowInfo.numEntries,
3280  Access::ReadOnly);
3281  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3282  rowInfo.numEntries,
3283  Access::ReadOnly);
3284  }
3285  else {
3286  // This does the right thing (reports an empty row) if the input
3287  // row is invalid.
3288  indices = local_inds_host_view_type();
3289  values = values_host_view_type();
3290  }
3291 
3292 #ifdef HAVE_TPETRA_DEBUG
3293  const char suffix[] = ". This should never happen. Please report this "
3294  "bug to the Tpetra developers.";
3295  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3296  (static_cast<size_t> (indices.size ()) !=
3297  static_cast<size_t> (values.size ()), std::logic_error,
3298  "At the end of this method, for local row " << localRow << ", "
3299  "indices.size() = " << indices.size () << " != values.size () = "
3300  << values.size () << suffix);
3301  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3302  (static_cast<size_t> (indices.size ()) !=
3303  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3304  "At the end of this method, for local row " << localRow << ", "
3305  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3306  << rowInfo.numEntries << suffix);
3307  const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3308  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3309  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3310  "of this method, for local row " << localRow << ", rowInfo.numEntries = "
3311  << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3312  expectedNumEntries << suffix);
3313 #endif // HAVE_TPETRA_DEBUG
3314  }
3315 
3316 
3317  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3318  void
3320  getGlobalRowView (GlobalOrdinal globalRow,
3321  global_inds_host_view_type &indices,
3322  values_host_view_type &values) const
3323  {
3324  const char tfecfFuncName[] = "getGlobalRowView: ";
3325 
3326  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3327  isLocallyIndexed (), std::runtime_error,
3328  "The matrix is locally indexed, so we cannot return a view of the row "
3329  "with global column indices. Use getGlobalRowCopy() instead.");
3330 
3331  // This does the right thing (reports an empty row) if the input
3332  // row is invalid.
3333  const RowInfo rowInfo =
3334  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3335  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3336  rowInfo.numEntries > 0) {
3337  indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3338  rowInfo.numEntries,
3339  Access::ReadOnly);
3340  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3341  rowInfo.numEntries,
3342  Access::ReadOnly);
3343  }
3344  else {
3345  indices = global_inds_host_view_type();
3346  values = values_host_view_type();
3347  }
3348 
3349 #ifdef HAVE_TPETRA_DEBUG
3350  const char suffix[] = ". This should never happen. Please report this "
3351  "bug to the Tpetra developers.";
3352  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3353  (static_cast<size_t> (indices.size ()) !=
3354  static_cast<size_t> (values.size ()), std::logic_error,
3355  "At the end of this method, for global row " << globalRow << ", "
3356  "indices.size() = " << indices.size () << " != values.size () = "
3357  << values.size () << suffix);
3358  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3359  (static_cast<size_t> (indices.size ()) !=
3360  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3361  "At the end of this method, for global row " << globalRow << ", "
3362  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3363  << rowInfo.numEntries << suffix);
3364  const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3365  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3366  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3367  "of this method, for global row " << globalRow << ", rowInfo.numEntries "
3368  "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3369  " " << expectedNumEntries << suffix);
3370 #endif // HAVE_TPETRA_DEBUG
3371  }
3372 
3373 
3374  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3375  void
3377  scale (const Scalar& alpha)
3378  {
3379  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3380 
3381  const size_t nlrs = staticGraph_->getLocalNumRows ();
3382  const size_t numEntries = staticGraph_->getLocalNumEntries ();
3383  if (! staticGraph_->indicesAreAllocated () ||
3384  nlrs == 0 || numEntries == 0) {
3385  // do nothing
3386  }
3387  else {
3388 
3389  auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3390  KokkosBlas::scal(vals, theAlpha, vals);
3391 
3392  }
3393  }
3394 
3395  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3396  void
3398  setAllToScalar (const Scalar& alpha)
3399  {
3400  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3401 
3402  // replace all values in the matrix
3403  // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
3404  // however, if there are no valid entries, we can short-circuit
3405  // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
3406  const size_t numEntries = staticGraph_->getLocalNumEntries();
3407  if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
3408  // do nothing
3409  }
3410  else {
3411  // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3412  Kokkos::deep_copy (execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3413  theAlpha);
3414  // CAG: This fence was found to be required on Cuda with UVM=on.
3415  Kokkos::fence("CrsMatrix::setAllToScalar");
3416  }
3417  }
3418 
3419  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3420  void
3422  setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
3423  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3424  const typename local_matrix_device_type::values_type& values)
3425  {
3426  using ProfilingRegion=Details::ProfilingRegion;
3427  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
3428  const char tfecfFuncName[] = "setAllValues: ";
3429  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3430  (columnIndices.size () != values.size (), std::invalid_argument,
3431  "columnIndices.size() = " << columnIndices.size () << " != values.size()"
3432  " = " << values.size () << ".");
3433  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3434  (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
3435 
3436  try {
3437  myGraph_->setAllIndices (rowPointers, columnIndices);
3438  }
3439  catch (std::exception &e) {
3440  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3441  (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
3442  "exception: " << e.what ());
3443  }
3444 
3445  // Make sure that myGraph_ now has a local graph. It may not be
3446  // fillComplete yet, so it's important to check. We don't care
3447  // whether setAllIndices() did a shallow copy or a deep copy, so a
3448  // good way to check is to compare dimensions.
3449  auto lclGraph = myGraph_->getLocalGraphDevice ();
3450  const size_t numEnt = lclGraph.entries.extent (0);
3451  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3452  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
3453  numEnt != static_cast<size_t> (columnIndices.extent (0)),
3454  std::logic_error, "myGraph_->setAllIndices() did not correctly create "
3455  "local graph. Please report this bug to the Tpetra developers.");
3456 
3457  valuesPacked_wdv = values_wdv_type(values);
3458  valuesUnpacked_wdv = valuesPacked_wdv;
3459 
3460  // Storage MUST be packed, since the interface doesn't give any
3461  // way to indicate any extra space at the end of each row.
3462  this->storageStatus_ = Details::STORAGE_1D_PACKED;
3463 
3464  checkInternalState ();
3465  }
3466 
3467  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3468  void
3470  setAllValues ( const local_matrix_device_type& localDeviceMatrix)
3471  {
3472  using ProfilingRegion=Details::ProfilingRegion;
3473  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3474 
3475  auto graph = localDeviceMatrix.graph;
3476  //FIXME how to check whether graph is allocated
3477 
3478  auto rows = graph.row_map;
3479  auto columns = graph.entries;
3480  auto values = localDeviceMatrix.values;
3481 
3482  setAllValues(rows,columns,values);
3483  }
3484 
3485  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3486  void
3488  setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
3489  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3490  const Teuchos::ArrayRCP<Scalar>& val)
3491  {
3492  using Kokkos::Compat::getKokkosViewDeepCopy;
3493  using Teuchos::ArrayRCP;
3494  using Teuchos::av_reinterpret_cast;
3495  typedef device_type DT;
3496  typedef impl_scalar_type IST;
3497  typedef typename local_graph_device_type::row_map_type row_map_type;
3498  //typedef typename row_map_type::non_const_value_type row_offset_type;
3499  const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3500 
3501  // The row offset type may depend on the execution space. It may
3502  // not necessarily be size_t. If it's not, we need to make a deep
3503  // copy. We need to make a deep copy anyway so that Kokkos can
3504  // own the memory. Regardless, ptrIn gets the copy.
3505  typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
3506  Kokkos::View<const size_t*,
3507  typename row_map_type::array_layout,
3508  Kokkos::HostSpace,
3509  Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
3510  ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
3511 
3512  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3513  (ptrNative.extent (0) != ptrSizeT.extent (0),
3514  std::logic_error, "ptrNative.extent(0) = " <<
3515  ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
3516  << ptrSizeT.extent (0) << ". Please report this bug to the "
3517  "Tpetra developers.");
3518 
3519  auto indIn = getKokkosViewDeepCopy<DT> (ind ());
3520  auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
3521  this->setAllValues (ptrNative, indIn, valIn);
3522  }
3523 
3524  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3525  void
3527  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
3528  {
3529  const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3530  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3531  (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
3532 
3533  // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3534  // this method in CrsGraph too, so don't call it (otherwise build
3535  // warnings will show up and annoy users). Instead, copy results
3536  // in and out, if the memory space requires it.
3537 
3538  const size_t lclNumRows = staticGraph_->getLocalNumRows ();
3539  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
3540  offsets.resize (lclNumRows);
3541  }
3542 
3543  // The input ArrayRCP must always be a host pointer. Thus, if
3544  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3545  // to write to that allocation directly as a Kokkos::View.
3546  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3547  // It is always syntactically correct to assign a raw host
3548  // pointer to a device View, so this code will compile correctly
3549  // even if this branch never runs.
3550  typedef Kokkos::View<size_t*, device_type,
3551  Kokkos::MemoryUnmanaged> output_type;
3552  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3553  staticGraph_->getLocalDiagOffsets (offsetsOut);
3554  }
3555  else {
3556  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
3557  staticGraph_->getLocalDiagOffsets (offsetsTmp);
3558  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3559  Kokkos::MemoryUnmanaged> output_type;
3560  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3561  // DEEP_COPY REVIEW - DEVICE-TO-HOST
3562  Kokkos::deep_copy (execution_space(), offsetsOut, offsetsTmp);
3563  }
3564  }
3565 
3566  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3567  void
3570  {
3571  using Teuchos::ArrayRCP;
3572  using Teuchos::ArrayView;
3573  using Teuchos::av_reinterpret_cast;
3574  const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3575  typedef local_ordinal_type LO;
3576 
3577 
3578  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3579  staticGraph_.is_null (), std::runtime_error,
3580  "This method requires that the matrix have a graph.");
3581  auto rowMapPtr = this->getRowMap ();
3582  if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
3583  // Processes on which the row Map or its communicator is null
3584  // don't participate. Users shouldn't even call this method on
3585  // those processes.
3586  return;
3587  }
3588  auto colMapPtr = this->getColMap ();
3589  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3590  (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
3591  "This method requires that the matrix have a column Map.");
3592  const map_type& rowMap = * rowMapPtr;
3593  const map_type& colMap = * colMapPtr;
3594  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3595 
3596 #ifdef HAVE_TPETRA_DEBUG
3597  // isCompatible() requires an all-reduce, and thus this check
3598  // should only be done in debug mode.
3599  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3600  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3601  "The input Vector's Map must be compatible with the CrsMatrix's row "
3602  "Map. You may check this by using Map's isCompatible method: "
3603  "diag.getMap ()->isCompatible (A.getRowMap ());");
3604 #endif // HAVE_TPETRA_DEBUG
3605 
3606  if (this->isFillComplete ()) {
3607  const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3608  // 1-D subview of the first (and only) column of D_lcl.
3609  const auto D_lcl_1d =
3610  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3611 
3612  const auto lclRowMap = rowMap.getLocalMap ();
3613  const auto lclColMap = colMap.getLocalMap ();
3615  (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
3616  lclColMap,
3617  getLocalMatrixDevice ());
3618  }
3619  else {
3621  (void) getLocalDiagCopyWithoutOffsetsNotFillComplete (diag, *this);
3622  }
3623  }
3624 
3625  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3626  void
3629  const Kokkos::View<const size_t*, device_type,
3630  Kokkos::MemoryUnmanaged>& offsets) const
3631  {
3632  typedef LocalOrdinal LO;
3633 
3634 #ifdef HAVE_TPETRA_DEBUG
3635  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3636  const map_type& rowMap = * (this->getRowMap ());
3637  // isCompatible() requires an all-reduce, and thus this check
3638  // should only be done in debug mode.
3639  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3640  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3641  "The input Vector's Map must be compatible with (in the sense of Map::"
3642  "isCompatible) the CrsMatrix's row Map.");
3643 #endif // HAVE_TPETRA_DEBUG
3644 
3645  // For now, we fill the Vector on the host and sync to device.
3646  // Later, we may write a parallel kernel that works entirely on
3647  // device.
3648  //
3649  // NOTE (mfh 21 Jan 2016): The host kernel here assumes UVM. Once
3650  // we write a device kernel, it will not need to assume UVM.
3651 
3652  auto D_lcl = diag.getLocalViewDevice (Access::OverwriteAll);
3653  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3654  // Get 1-D subview of the first (and only) column of D_lcl.
3655  auto D_lcl_1d =
3656  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3657 
3658  KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
3659  getLocalMatrixDevice ());
3660  }
3661 
3662  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3663  void
3666  const Teuchos::ArrayView<const size_t>& offsets) const
3667  {
3668  using LO = LocalOrdinal;
3669  using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3670  using IST = impl_scalar_type;
3671 
3672 #ifdef HAVE_TPETRA_DEBUG
3673  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3674  const map_type& rowMap = * (this->getRowMap ());
3675  // isCompatible() requires an all-reduce, and thus this check
3676  // should only be done in debug mode.
3677  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3678  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3679  "The input Vector's Map must be compatible with (in the sense of Map::"
3680  "isCompatible) the CrsMatrix's row Map.");
3681 #endif // HAVE_TPETRA_DEBUG
3682 
3683  // See #1510. In case diag has already been marked modified on
3684  // device, we need to clear that flag, since the code below works
3685  // on host.
3686  //diag.clear_sync_state ();
3687 
3688  // For now, we fill the Vector on the host and sync to device.
3689  // Later, we may write a parallel kernel that works entirely on
3690  // device.
3691  auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3692  // 1-D subview of the first (and only) column of lclVecHost.
3693  auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
3694 
3695  using host_offsets_view_type =
3696  Kokkos::View<const size_t*, Kokkos::HostSpace,
3697  Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
3698  host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
3699  // Find the diagonal entries and put them in lclVecHost1d.
3700  using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3701  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3702  const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
3703 
3704  auto rowPtrsPackedHost = staticGraph_->getRowPtrsPackedHost();
3705  auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
3706  Kokkos::parallel_for
3707  ("Tpetra::CrsMatrix::getLocalDiagCopy",
3708  range_type (0, myNumRows),
3709  [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3710  lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
3711  if (h_offsets[lclRow] != INV) {
3712  auto curRowOffset = rowPtrsPackedHost (lclRow);
3713  lclVecHost1d(lclRow) =
3714  static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
3715  }
3716  });
3717  //diag.sync_device ();
3718  }
3719 
3720 
3721  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3722  void
3725  {
3726  using ::Tpetra::Details::ProfilingRegion;
3727  using Teuchos::ArrayRCP;
3728  using Teuchos::ArrayView;
3729  using Teuchos::null;
3730  using Teuchos::RCP;
3731  using Teuchos::rcp;
3732  using Teuchos::rcpFromRef;
3734  const char tfecfFuncName[] = "leftScale: ";
3735 
3736  ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
3737 
3738  RCP<const vec_type> xp;
3739  if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
3740  // Take from Epetra: If we have a non-trivial exporter, we must
3741  // import elements that are permuted or are on other processors.
3742  auto exporter = this->getCrsGraphRef ().getExporter ();
3743  if (exporter.get () != nullptr) {
3744  RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
3745  tempVec->doImport (x, *exporter, REPLACE); // reverse mode
3746  xp = tempVec;
3747  }
3748  else {
3749  xp = rcpFromRef (x);
3750  }
3751  }
3752  else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
3753  xp = rcpFromRef (x);
3754  }
3755  else {
3756  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3757  (true, std::invalid_argument, "x's Map must be the same as "
3758  "either the row Map or the range Map of the CrsMatrix.");
3759  }
3760 
3761  if (this->isFillComplete()) {
3762  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3763  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3765  leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
3766  x_lcl_1d, false, false);
3767  }
3768  else {
3769  // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3770  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3771  (true, std::runtime_error, "CrsMatrix::leftScale requires matrix to be"
3772  " fillComplete");
3773  }
3774  }
3775 
3776  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3777  void
3780  {
3781  using ::Tpetra::Details::ProfilingRegion;
3782  using Teuchos::ArrayRCP;
3783  using Teuchos::ArrayView;
3784  using Teuchos::null;
3785  using Teuchos::RCP;
3786  using Teuchos::rcp;
3787  using Teuchos::rcpFromRef;
3789  const char tfecfFuncName[] = "rightScale: ";
3790 
3791  ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
3792 
3793  RCP<const vec_type> xp;
3794  if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
3795  // Take from Epetra: If we have a non-trivial exporter, we must
3796  // import elements that are permuted or are on other processors.
3797  auto importer = this->getCrsGraphRef ().getImporter ();
3798  if (importer.get () != nullptr) {
3799  RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
3800  tempVec->doImport (x, *importer, REPLACE);
3801  xp = tempVec;
3802  }
3803  else {
3804  xp = rcpFromRef (x);
3805  }
3806  }
3807  else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
3808  xp = rcpFromRef (x);
3809  } else {
3810  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3811  (true, std::runtime_error, "x's Map must be the same as "
3812  "either the domain Map or the column Map of the CrsMatrix.");
3813  }
3814 
3815  if (this->isFillComplete()) {
3816  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3817  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3819  rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
3820  x_lcl_1d, false, false);
3821  }
3822  else {
3823  // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3824  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3825  (true, std::runtime_error, "CrsMatrix::rightScale requires matrix to be"
3826  " fillComplete");
3827  }
3828  }
3829 
3830  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3834  {
3835  using Teuchos::ArrayView;
3836  using Teuchos::outArg;
3837  using Teuchos::REDUCE_SUM;
3838  using Teuchos::reduceAll;
3839 
3840  // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3841  // local part of this computation. It could make sense to put
3842  // this operation in the Kokkos::CrsMatrix.
3843 
3844  // check the cache first
3845  mag_type mySum = STM::zero ();
3846  if (getLocalNumEntries() > 0) {
3847  if (isStorageOptimized ()) {
3848  // "Optimized" storage is packed storage. That means we can
3849  // iterate in one pass through the 1-D values array.
3850  const size_t numEntries = getLocalNumEntries ();
3851  auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3852  for (size_t k = 0; k < numEntries; ++k) {
3853  auto val = values[k];
3854  // Note (etp 06 Jan 2015) We need abs() here for composite types
3855  // (in general, if mag_type is on the left-hand-side, we need
3856  // abs() on the right-hand-side)
3857  const mag_type val_abs = STS::abs (val);
3858  mySum += val_abs * val_abs;
3859  }
3860  }
3861  else {
3862  const LocalOrdinal numRows =
3863  static_cast<LocalOrdinal> (this->getLocalNumRows ());
3864  for (LocalOrdinal r = 0; r < numRows; ++r) {
3865  const RowInfo rowInfo = myGraph_->getRowInfo (r);
3866  const size_t numEntries = rowInfo.numEntries;
3867  auto A_r = this->getValuesViewHost(rowInfo);
3868  for (size_t k = 0; k < numEntries; ++k) {
3869  const impl_scalar_type val = A_r[k];
3870  const mag_type val_abs = STS::abs (val);
3871  mySum += val_abs * val_abs;
3872  }
3873  }
3874  }
3875  }
3876  mag_type totalSum = STM::zero ();
3877  reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
3878  mySum, outArg (totalSum));
3879  return STM::sqrt (totalSum);
3880  }
3881 
3882  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3883  void
3885  replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
3886  {
3887  const char tfecfFuncName[] = "replaceColMap: ";
3888  // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3889  // Then replacing the column Map might mean that we need to
3890  // reindex the column indices.
3891  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3892  myGraph_.is_null (), std::runtime_error,
3893  "This method does not work if the matrix has a const graph. The whole "
3894  "idea of a const graph is that you are not allowed to change it, but "
3895  "this method necessarily must modify the graph, since the graph owns "
3896  "the matrix's column Map.");
3897  myGraph_->replaceColMap (newColMap);
3898  }
3899 
3900  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3901  void
3904  const Teuchos::RCP<const map_type>& newColMap,
3905  const Teuchos::RCP<const import_type>& newImport,
3906  const bool sortEachRow)
3907  {
3908  const char tfecfFuncName[] = "reindexColumns: ";
3909  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3910  graph == nullptr && myGraph_.is_null (), std::invalid_argument,
3911  "The input graph is null, but the matrix does not own its graph.");
3912 
3913  crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3914  const bool sortGraph = false; // we'll sort graph & matrix together below
3915 
3916  theGraph.reindexColumns (newColMap, newImport, sortGraph);
3917 
3918  if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
3919  const LocalOrdinal lclNumRows =
3920  static_cast<LocalOrdinal> (theGraph.getLocalNumRows ());
3921 
3922  for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3923 
3924  const RowInfo rowInfo = theGraph.getRowInfo (row);
3925  auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
3926  auto vals = this->getValuesViewHostNonConst (rowInfo);
3927 
3928  sort2 (lclColInds.data (),
3929  lclColInds.data () + rowInfo.numEntries,
3930  vals.data ());
3931  }
3932  theGraph.indicesAreSorted_ = true;
3933  }
3934  }
3935 
3936  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3937  void
3939  replaceDomainMap (const Teuchos::RCP<const map_type>& newDomainMap)
3940  {
3941  const char tfecfFuncName[] = "replaceDomainMap: ";
3942  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3943  myGraph_.is_null (), std::runtime_error,
3944  "This method does not work if the matrix has a const graph. The whole "
3945  "idea of a const graph is that you are not allowed to change it, but this"
3946  " method necessarily must modify the graph, since the graph owns the "
3947  "matrix's domain Map and Import objects.");
3948  myGraph_->replaceDomainMap (newDomainMap);
3949  }
3950 
3951  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3952  void
3954  replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
3955  Teuchos::RCP<const import_type>& newImporter)
3956  {
3957  const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3958  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3959  myGraph_.is_null (), std::runtime_error,
3960  "This method does not work if the matrix has a const graph. The whole "
3961  "idea of a const graph is that you are not allowed to change it, but this"
3962  " method necessarily must modify the graph, since the graph owns the "
3963  "matrix's domain Map and Import objects.");
3964  myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
3965  }
3966 
3967  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3968  void
3970  replaceRangeMap (const Teuchos::RCP<const map_type>& newRangeMap)
3971  {
3972  const char tfecfFuncName[] = "replaceRangeMap: ";
3973  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3974  myGraph_.is_null (), std::runtime_error,
3975  "This method does not work if the matrix has a const graph. The whole "
3976  "idea of a const graph is that you are not allowed to change it, but this"
3977  " method necessarily must modify the graph, since the graph owns the "
3978  "matrix's domain Map and Import objects.");
3979  myGraph_->replaceRangeMap (newRangeMap);
3980  }
3981 
3982  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3983  void
3985  replaceRangeMapAndExporter (const Teuchos::RCP<const map_type>& newRangeMap,
3986  Teuchos::RCP<const export_type>& newExporter)
3987  {
3988  const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
3989  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3990  myGraph_.is_null (), std::runtime_error,
3991  "This method does not work if the matrix has a const graph. The whole "
3992  "idea of a const graph is that you are not allowed to change it, but this"
3993  " method necessarily must modify the graph, since the graph owns the "
3994  "matrix's domain Map and Import objects.");
3995  myGraph_->replaceRangeMapAndExporter (newRangeMap, newExporter);
3996  }
3997 
3998  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3999  void
4001  insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4002  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4003  const Teuchos::ArrayView<const Scalar>& values)
4004  {
4005  using Teuchos::Array;
4006  typedef GlobalOrdinal GO;
4007  typedef typename Array<GO>::size_type size_type;
4008 
4009  const size_type numToInsert = indices.size ();
4010  // Add the new data to the list of nonlocals.
4011  // This creates the arrays if they don't exist yet.
4012  std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4013  Array<GO>& curRowInds = curRow.first;
4014  Array<Scalar>& curRowVals = curRow.second;
4015  const size_type newCapacity = curRowInds.size () + numToInsert;
4016  curRowInds.reserve (newCapacity);
4017  curRowVals.reserve (newCapacity);
4018  for (size_type k = 0; k < numToInsert; ++k) {
4019  curRowInds.push_back (indices[k]);
4020  curRowVals.push_back (values[k]);
4021  }
4022  }
4023 
4024  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4025  void
4028  {
4029  using Details::Behavior;
4031  using Teuchos::Comm;
4032  using Teuchos::outArg;
4033  using Teuchos::RCP;
4034  using Teuchos::rcp;
4035  using Teuchos::REDUCE_MAX;
4036  using Teuchos::REDUCE_MIN;
4037  using Teuchos::reduceAll;
4038  using std::endl;
4040  //typedef LocalOrdinal LO;
4041  typedef GlobalOrdinal GO;
4042  typedef typename Teuchos::Array<GO>::size_type size_type;
4043  const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4044  ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4045 
4046  const bool verbose = Behavior::verbose("CrsMatrix");
4047  std::unique_ptr<std::string> prefix;
4048  if (verbose) {
4049  prefix = this->createPrefix("CrsMatrix", "globalAssemble");
4050  std::ostringstream os;
4051  os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
4052  << endl;
4053  std::cerr << os.str();
4054  }
4055  RCP<const Comm<int> > comm = getComm ();
4056 
4057  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4058  (! isFillActive (), std::runtime_error, "Fill must be active before "
4059  "you may call this method.");
4060 
4061  const size_t myNumNonlocalRows = nonlocals_.size ();
4062 
4063  // If no processes have nonlocal rows, then we don't have to do
4064  // anything. Checking this is probably cheaper than constructing
4065  // the Map of nonlocal rows (see below) and noticing that it has
4066  // zero global entries.
4067  {
4068  const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4069  int someoneHasNonlocalRows = 0;
4070  reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4071  outArg (someoneHasNonlocalRows));
4072  if (someoneHasNonlocalRows == 0) {
4073  return; // no process has nonlocal rows, so nothing to do
4074  }
4075  }
4076 
4077  // 1. Create a list of the "nonlocal" rows on each process. this
4078  // requires iterating over nonlocals_, so while we do this,
4079  // deduplicate the entries and get a count for each nonlocal
4080  // row on this process.
4081  // 2. Construct a new row Map corresponding to those rows. This
4082  // Map is likely overlapping. We know that the Map is not
4083  // empty on all processes, because the above all-reduce and
4084  // return exclude that case.
4085 
4086  RCP<const map_type> nonlocalRowMap;
4087  Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4088  {
4089  Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4090  size_type curPos = 0;
4091  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4092  ++mapIter, ++curPos) {
4093  myNonlocalGblRows[curPos] = mapIter->first;
4094  // Get the values and column indices by reference, since we
4095  // intend to change them in place (that's what "erase" does).
4096  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4097  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4098 
4099  // Sort both arrays jointly, using the column indices as keys,
4100  // then merge them jointly. "Merge" here adds values
4101  // corresponding to the same column indices. The first 2 args
4102  // of merge2 are output arguments that work just like the
4103  // return value of std::unique.
4104  sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4105  typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4106  typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4107  merge2 (gblCols_newEnd, vals_newEnd,
4108  gblCols.begin (), gblCols.end (),
4109  vals.begin (), vals.end ());
4110  gblCols.erase (gblCols_newEnd, gblCols.end ());
4111  vals.erase (vals_newEnd, vals.end ());
4112  numEntPerNonlocalRow[curPos] = gblCols.size ();
4113  }
4114 
4115  // Currently, Map requires that its indexBase be the global min
4116  // of all its global indices. Map won't compute this for us, so
4117  // we must do it. If our process has no nonlocal rows, set the
4118  // "min" to the max possible GO value. This ensures that if
4119  // some process has at least one nonlocal row, then it will pick
4120  // that up as the min. We know that at least one process has a
4121  // nonlocal row, since the all-reduce and return at the top of
4122  // this method excluded that case.
4123  GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4124  {
4125  auto iter = std::min_element (myNonlocalGblRows.begin (),
4126  myNonlocalGblRows.end ());
4127  if (iter != myNonlocalGblRows.end ()) {
4128  myMinNonlocalGblRow = *iter;
4129  }
4130  }
4131  GO gblMinNonlocalGblRow = 0;
4132  reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4133  outArg (gblMinNonlocalGblRow));
4134  const GO indexBase = gblMinNonlocalGblRow;
4135  const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4136  nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4137  }
4138 
4139  // 3. Use the values and column indices for each nonlocal row, as
4140  // stored in nonlocals_, to construct a CrsMatrix corresponding
4141  // to nonlocal rows. We have
4142  // exact counts of the number of entries in each nonlocal row.
4143 
4144  if (verbose) {
4145  std::ostringstream os;
4146  os << *prefix << "Create nonlocal matrix" << endl;
4147  std::cerr << os.str();
4148  }
4149  RCP<crs_matrix_type> nonlocalMatrix =
4150  rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow ()));
4151  {
4152  size_type curPos = 0;
4153  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4154  ++mapIter, ++curPos) {
4155  const GO gblRow = mapIter->first;
4156  // Get values & column indices by ref, just to avoid copy.
4157  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4158  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4159  //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4160  nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4161  }
4162  }
4163  // There's no need to fill-complete the nonlocals matrix.
4164  // We just use it as a temporary container for the Export.
4165 
4166  // 4. If the original row Map is one to one, then we can Export
4167  // directly from nonlocalMatrix into this. Otherwise, we have
4168  // to create a temporary matrix with a one-to-one row Map,
4169  // Export into that, then Import from the temporary matrix into
4170  // *this.
4171 
4172  auto origRowMap = this->getRowMap ();
4173  const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4174 
4175  int isLocallyComplete = 1; // true by default
4176 
4177  if (origRowMapIsOneToOne) {
4178  if (verbose) {
4179  std::ostringstream os;
4180  os << *prefix << "Original row Map is 1-to-1" << endl;
4181  std::cerr << os.str();
4182  }
4183  export_type exportToOrig (nonlocalRowMap, origRowMap);
4184  if (! exportToOrig.isLocallyComplete ()) {
4185  isLocallyComplete = 0;
4186  }
4187  if (verbose) {
4188  std::ostringstream os;
4189  os << *prefix << "doExport from nonlocalMatrix" << endl;
4190  std::cerr << os.str();
4191  }
4192  this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4193  // We're done at this point!
4194  }
4195  else {
4196  if (verbose) {
4197  std::ostringstream os;
4198  os << *prefix << "Original row Map is NOT 1-to-1" << endl;
4199  std::cerr << os.str();
4200  }
4201  // If you ask a Map whether it is one to one, it does some
4202  // communication and stashes intermediate results for later use
4203  // by createOneToOne. Thus, calling createOneToOne doesn't cost
4204  // much more then the original cost of calling isOneToOne.
4205  auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4206  export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4207  if (! exportToOneToOne.isLocallyComplete ()) {
4208  isLocallyComplete = 0;
4209  }
4210 
4211  // Create a temporary matrix with the one-to-one row Map.
4212  //
4213  // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4214  // each row, to avoid reallocation during the Export operation.
4215  if (verbose) {
4216  std::ostringstream os;
4217  os << *prefix << "Create & doExport into 1-to-1 matrix"
4218  << endl;
4219  std::cerr << os.str();
4220  }
4221  crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4222  // Export from matrix of nonlocals into the temp one-to-one matrix.
4223  oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4224  Tpetra::ADD);
4225 
4226  // We don't need the matrix of nonlocals anymore, so get rid of
4227  // it, to keep the memory high-water mark down.
4228  if (verbose) {
4229  std::ostringstream os;
4230  os << *prefix << "Free nonlocalMatrix" << endl;
4231  std::cerr << os.str();
4232  }
4233  nonlocalMatrix = Teuchos::null;
4234 
4235  // Import from the one-to-one matrix to the original matrix.
4236  if (verbose) {
4237  std::ostringstream os;
4238  os << *prefix << "doImport from 1-to-1 matrix" << endl;
4239  std::cerr << os.str();
4240  }
4241  import_type importToOrig (oneToOneRowMap, origRowMap);
4242  this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4243  }
4244 
4245  // It's safe now to clear out nonlocals_, since we've already
4246  // committed side effects to *this. The standard idiom for
4247  // clearing a Container like std::map, is to swap it with an empty
4248  // Container and let the swapped Container fall out of scope.
4249  if (verbose) {
4250  std::ostringstream os;
4251  os << *prefix << "Free nonlocals_ (std::map)" << endl;
4252  std::cerr << os.str();
4253  }
4254  decltype (nonlocals_) newNonlocals;
4255  std::swap (nonlocals_, newNonlocals);
4256 
4257  // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4258  // don't like throwing an exception here. A local return value
4259  // would likely be more useful to users. However, if users find
4260  // themselves exercising nonlocal inserts often, then they are
4261  // probably novice users who need the help. See Gibhub Issues
4262  // #603 and #601 (esp. the latter) for discussion.
4263 
4264  int isGloballyComplete = 0; // output argument of reduceAll
4265  reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4266  outArg (isGloballyComplete));
4267  TEUCHOS_TEST_FOR_EXCEPTION
4268  (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4269  "you called insertGlobalValues with a global row index which is not in "
4270  "the matrix's row Map on any process in its communicator.");
4271  }
4272 
4273  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4274  void
4276  resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4277  {
4278  if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4279  myGraph_->resumeFill (params);
4280  }
4281 #if KOKKOSKERNELS_VERSION >= 40299
4282  // Delete the apply helper (if it exists)
4283  applyHelper.reset();
4284 #endif
4285  fillComplete_ = false;
4286  }
4287 
4288  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4289  bool
4292  return getCrsGraphRef ().haveGlobalConstants ();
4293  }
4294 
4295  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4296  void
4298  fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4299  {
4300  const char tfecfFuncName[] = "fillComplete(params): ";
4301 
4302  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4303  (this->getCrsGraph ().is_null (), std::logic_error,
4304  "getCrsGraph() returns null. This should not happen at this point. "
4305  "Please report this bug to the Tpetra developers.");
4306 
4307  const crs_graph_type& graph = this->getCrsGraphRef ();
4308  if (this->isStaticGraph () && graph.isFillComplete ()) {
4309  // If this matrix's graph is fill complete and the user did not
4310  // supply a domain or range Map, use the graph's domain and
4311  // range Maps.
4312  this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4313  }
4314  else { // assume that user's row Map is the domain and range Map
4315  Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4316  Teuchos::RCP<const map_type> domainMap = rangeMap;
4317  this->fillComplete (domainMap, rangeMap, params);
4318  }
4319  }
4320 
4321  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4322  void
4324  fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4325  const Teuchos::RCP<const map_type>& rangeMap,
4326  const Teuchos::RCP<Teuchos::ParameterList>& params)
4327  {
4328  using Details::Behavior;
4330  using Teuchos::ArrayRCP;
4331  using Teuchos::RCP;
4332  using Teuchos::rcp;
4333  using std::endl;
4334  const char tfecfFuncName[] = "fillComplete: ";
4335  ProfilingRegion regionFillComplete
4336  ("Tpetra::CrsMatrix::fillComplete");
4337  const bool verbose = Behavior::verbose("CrsMatrix");
4338  std::unique_ptr<std::string> prefix;
4339  if (verbose) {
4340  prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4341  std::ostringstream os;
4342  os << *prefix << endl;
4343  std::cerr << os.str ();
4344  }
4345  Details::ProfilingRegion region(
4346  "Tpetra::CrsMatrix::fillCompete",
4347  "fillCompete");
4348 
4349  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4350  (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4351  "Matrix fill state must be active (isFillActive() "
4352  "must be true) before you may call fillComplete().");
4353  const int numProcs = this->getComm ()->getSize ();
4354 
4355  //
4356  // Read parameters from the input ParameterList.
4357  //
4358  {
4359  Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4360 
4361  // If true, the caller promises that no process did nonlocal
4362  // changes since the last call to fillComplete.
4363  bool assertNoNonlocalInserts = false;
4364  // If true, makeColMap sorts remote GIDs (within each remote
4365  // process' group).
4366  bool sortGhosts = true;
4367 
4368  if (! params.is_null ()) {
4369  assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
4370  assertNoNonlocalInserts);
4371  if (params->isParameter ("sort column map ghost gids")) {
4372  sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
4373  }
4374  else if (params->isParameter ("Sort column Map ghost GIDs")) {
4375  sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
4376  }
4377  }
4378  // We also don't need to do global assembly if there is only one
4379  // process in the communicator.
4380  const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
4381  // This parameter only matters if this matrix owns its graph.
4382  if (! this->myGraph_.is_null ()) {
4383  this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4384  }
4385 
4386  if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
4387  if (this->hasColMap ()) { // use local indices
4388  allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4389  }
4390  else { // no column Map, so use global indices
4391  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4392  }
4393  }
4394  // Global assemble, if we need to. This call only costs a single
4395  // all-reduce if we didn't need global assembly after all.
4396  if (needGlobalAssemble) {
4397  this->globalAssemble ();
4398  }
4399  else {
4400  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4401  (numProcs == 1 && nonlocals_.size() > 0,
4402  std::runtime_error, "Cannot have nonlocal entries on a serial run. "
4403  "An invalid entry (i.e., with row index not in the row Map) must have "
4404  "been submitted to the CrsMatrix.");
4405  }
4406  }
4407  if (this->isStaticGraph ()) {
4408  Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4409  // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4410  // checks below only in debug mode. It would be nicer to do a
4411  // local check, then propagate the error state in a deferred
4412  // way, whenever communication happens. That would reduce the
4413  // cost of checking, to the point where it may make sense to
4414  // enable it even in release mode.
4415 #ifdef HAVE_TPETRA_DEBUG
4416  // FIXME (mfh 18 Jun 2014) This check for correctness of the
4417  // input Maps incurs a penalty of two all-reduces for the
4418  // otherwise optimal const graph case.
4419  //
4420  // We could turn these (max) 2 all-reduces into (max) 1, by
4421  // fusing them. We could do this by adding a "locallySameAs"
4422  // method to Map, which would return one of four states:
4423  //
4424  // a. Certainly globally the same
4425  // b. Certainly globally not the same
4426  // c. Locally the same
4427  // d. Locally not the same
4428  //
4429  // The first two states don't require further communication.
4430  // The latter two states require an all-reduce to communicate
4431  // globally, but we only need one all-reduce, since we only need
4432  // to check whether at least one of the Maps is wrong.
4433  const bool domainMapsMatch =
4434  this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
4435  const bool rangeMapsMatch =
4436  this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
4437 
4438  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4439  (! domainMapsMatch, std::runtime_error,
4440  "The CrsMatrix's domain Map does not match the graph's domain Map. "
4441  "The graph cannot be changed because it was given to the CrsMatrix "
4442  "constructor as const. You can fix this by passing in the graph's "
4443  "domain Map and range Map to the matrix's fillComplete call.");
4444 
4445  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4446  (! rangeMapsMatch, std::runtime_error,
4447  "The CrsMatrix's range Map does not match the graph's range Map. "
4448  "The graph cannot be changed because it was given to the CrsMatrix "
4449  "constructor as const. You can fix this by passing in the graph's "
4450  "domain Map and range Map to the matrix's fillComplete call.");
4451 #endif // HAVE_TPETRA_DEBUG
4452 
4453  // The matrix does _not_ own the graph, and the graph's
4454  // structure is already fixed, so just fill the local matrix.
4455  this->fillLocalMatrix (params);
4456  }
4457  else {
4458  Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4459  // Set the graph's domain and range Maps. This will clear the
4460  // Import if the domain Map has changed (is a different
4461  // pointer), and the Export if the range Map has changed (is a
4462  // different pointer).
4463  this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
4464 
4465  // Make the graph's column Map, if necessary.
4466  Teuchos::Array<int> remotePIDs (0);
4467  const bool mustBuildColMap = ! this->hasColMap ();
4468  if (mustBuildColMap) {
4469  this->myGraph_->makeColMap (remotePIDs);
4470  }
4471 
4472  // Make indices local, if necessary. The method won't do
4473  // anything if the graph is already locally indexed.
4474  const std::pair<size_t, std::string> makeIndicesLocalResult =
4475  this->myGraph_->makeIndicesLocal(verbose);
4476  // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4477  // the error state to makeImportExport
4478  // which may do all-reduces and thus may
4479  // have the opportunity to communicate that error state.
4480  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4481  (makeIndicesLocalResult.first != 0, std::runtime_error,
4482  makeIndicesLocalResult.second);
4483 
4484  const bool sorted = this->myGraph_->isSorted ();
4485  const bool merged = this->myGraph_->isMerged ();
4486  this->sortAndMergeIndicesAndValues (sorted, merged);
4487 
4488  // Make Import and Export objects, if they haven't been made
4489  // already. If we made a column Map above, reuse information
4490  // from that process to avoid communiation in the Import setup.
4491  this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
4492 
4493  // The matrix _does_ own the graph, so fill the local graph at
4494  // the same time as the local matrix.
4495  this->fillLocalGraphAndMatrix (params);
4496 
4497  const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
4498  params->get ("compute global constants", true);
4499  if (callGraphComputeGlobalConstants) {
4500  this->myGraph_->computeGlobalConstants ();
4501  }
4502  else {
4503  this->myGraph_->computeLocalConstants ();
4504  }
4505  this->myGraph_->fillComplete_ = true;
4506  this->myGraph_->checkInternalState ();
4507  }
4508 
4509  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4510 
4511  this->fillComplete_ = true; // Now we're fill complete!
4512  {
4513  Details::ProfilingRegion region_cis(
4514  "Tpetra::CrsMatrix::fillCompete", "checkInternalState"
4515  );
4516  this->checkInternalState ();
4517  }
4518  } //fillComplete(domainMap, rangeMap, params)
4519 
4520  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4521  void
4523  expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
4524  const Teuchos::RCP<const map_type> & rangeMap,
4525  const Teuchos::RCP<const import_type>& importer,
4526  const Teuchos::RCP<const export_type>& exporter,
4527  const Teuchos::RCP<Teuchos::ParameterList> &params)
4528  {
4529 #ifdef HAVE_TPETRA_MMM_TIMINGS
4530  std::string label;
4531  if(!params.is_null())
4532  label = params->get("Timer Label",label);
4533  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
4534  using Teuchos::TimeMonitor;
4535 
4536  Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4537 #endif
4538 
4539  const char tfecfFuncName[] = "expertStaticFillComplete: ";
4540  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
4541  std::runtime_error, "Matrix fill state must be active (isFillActive() "
4542  "must be true) before calling fillComplete().");
4543  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4544  myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
4545 
4546  {
4547 #ifdef HAVE_TPETRA_MMM_TIMINGS
4548  Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4549 #endif
4550  // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4551  myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
4552  }
4553 
4554  {
4555 #ifdef HAVE_TPETRA_MMM_TIMINGS
4556  TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4557 #endif
4558  // Fill the local graph and matrix
4559  fillLocalGraphAndMatrix (params);
4560  }
4561  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4562 
4563  // Now we're fill complete!
4564  fillComplete_ = true;
4565 
4566  // Sanity checks at the end.
4567 #ifdef HAVE_TPETRA_DEBUG
4568  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4569  ": We're at the end of fillComplete(), but isFillActive() is true. "
4570  "Please report this bug to the Tpetra developers.");
4571  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
4572  ": We're at the end of fillComplete(), but isFillActive() is true. "
4573  "Please report this bug to the Tpetra developers.");
4574 #endif // HAVE_TPETRA_DEBUG
4575  {
4576 #ifdef HAVE_TPETRA_MMM_TIMINGS
4577  Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4578 #endif
4579 
4580  checkInternalState();
4581  }
4582  }
4583 
4584  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4586  mergeRowIndicesAndValues (size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals)
4587  {
4588  impl_scalar_type* rowValueIter = vals;
4589  // beg,end define a half-exclusive interval over which to iterate.
4590  LocalOrdinal* beg = cols;
4591  LocalOrdinal* end = cols + rowLen;
4592  LocalOrdinal* newend = beg;
4593  if (beg != end) {
4594  LocalOrdinal* cur = beg + 1;
4595  impl_scalar_type* vcur = rowValueIter + 1;
4596  impl_scalar_type* vend = rowValueIter;
4597  cur = beg+1;
4598  while (cur != end) {
4599  if (*cur != *newend) {
4600  // new entry; save it
4601  ++newend;
4602  ++vend;
4603  (*newend) = (*cur);
4604  (*vend) = (*vcur);
4605  }
4606  else {
4607  // old entry; merge it
4608  //(*vend) = f (*vend, *vcur);
4609  (*vend) += *vcur;
4610  }
4611  ++cur;
4612  ++vcur;
4613  }
4614  ++newend; // one past the last entry, per typical [beg,end) semantics
4615  }
4616  return newend - beg;
4617  }
4618 
4619  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4620  void
4622  sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
4623  {
4624  using ::Tpetra::Details::ProfilingRegion;
4625  typedef LocalOrdinal LO;
4626  typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
4627  host_execution_space;
4628  typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4629  const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4630  ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4631 
4632  if (! sorted || ! merged) {
4633  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4634  (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
4635  "\"static\" (const) graph, since the matrix does not own the graph.");
4636  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4637  (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
4638  "this matrix claims ! isStaticGraph(). "
4639  "Please report this bug to the Tpetra developers.");
4640  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4641  (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
4642  "this method if the graph's storage has already been optimized. "
4643  "Please report this bug to the Tpetra developers.");
4644 
4645  crs_graph_type& graph = * (this->myGraph_);
4646  const LO lclNumRows = static_cast<LO> (this->getLocalNumRows ());
4647  size_t totalNumDups = 0;
4648  {
4649  //Accessing host unpacked (4-array CRS) local matrix.
4650  auto rowBegins_ = graph.getRowPtrsUnpackedHost();
4651  auto rowLengths_ = graph.k_numRowEntries_;
4652  auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4653  auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
4654  Kokkos::parallel_reduce ("sortAndMergeIndicesAndValues", range_type (0, lclNumRows),
4655  [=] (const LO lclRow, size_t& numDups) {
4656  size_t rowBegin = rowBegins_(lclRow);
4657  size_t rowLen = rowLengths_(lclRow);
4658  LO* cols = cols_.data() + rowBegin;
4659  impl_scalar_type* vals = vals_.data() + rowBegin;
4660  if (! sorted) {
4661  sort2 (cols, cols + rowLen, vals);
4662  }
4663  if (! merged) {
4664  size_t newRowLength = mergeRowIndicesAndValues (rowLen, cols, vals);
4665  rowLengths_(lclRow) = newRowLength;
4666  numDups += rowLen - newRowLength;
4667  }
4668  }, totalNumDups);
4669  }
4670  if (! sorted) {
4671  graph.indicesAreSorted_ = true; // we just sorted every row
4672  }
4673  if (! merged) {
4674  graph.noRedundancies_ = true; // we just merged every row
4675  }
4676  }
4677  }
4678 
4679  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4680  void
4684  Scalar alpha,
4685  Scalar beta) const
4686  {
4688  using Teuchos::RCP;
4689  using Teuchos::rcp;
4690  using Teuchos::rcp_const_cast;
4691  using Teuchos::rcpFromRef;
4692  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4693  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
4694 
4695  // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4696  // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4697  // failing only for the Kokkos refactor version of Tpetra. It's a
4698  // good idea regardless to have the bypass.
4699  if (alpha == ZERO) {
4700  if (beta == ZERO) {
4701  Y_in.putScalar (ZERO);
4702  } else if (beta != ONE) {
4703  Y_in.scale (beta);
4704  }
4705  return;
4706  }
4707 
4708  // It's possible that X is a view of Y or vice versa. We don't
4709  // allow this (apply() requires that X and Y not alias one
4710  // another), but it's helpful to detect and work around this case.
4711  // We don't try to to detect the more subtle cases (e.g., one is a
4712  // subview of the other, but their initial pointers differ). We
4713  // only need to do this if this matrix's Import is trivial;
4714  // otherwise, we don't actually apply the operator from X into Y.
4715 
4716  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4717  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4718 
4719  // If beta == 0, then the output MV will be overwritten; none of
4720  // its entries should be read. (Sparse BLAS semantics say that we
4721  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4722  // This matters if we need to do an Export operation; see below.
4723  const bool Y_is_overwritten = (beta == ZERO);
4724 
4725  // We treat the case of a replicated MV output specially.
4726  const bool Y_is_replicated =
4727  (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4728 
4729  // This is part of the special case for replicated MV output.
4730  // We'll let each process do its thing, but do an all-reduce at
4731  // the end to sum up the results. Setting beta=0 on all processes
4732  // but Proc 0 makes the math work out for the all-reduce. (This
4733  // assumes that the replicated data is correctly replicated, so
4734  // that the data are the same on all processes.)
4735  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4736  beta = ZERO;
4737  }
4738 
4739  // Temporary MV for Import operation. After the block of code
4740  // below, this will be an (Imported if necessary) column Map MV
4741  // ready to give to localApply(...).
4742  RCP<const MV> X_colMap;
4743  if (importer.is_null ()) {
4744  if (! X_in.isConstantStride ()) {
4745  // Not all sparse mat-vec kernels can handle an input MV with
4746  // nonconstant stride correctly, so we have to copy it in that
4747  // case into a constant stride MV. To make a constant stride
4748  // copy of X_in, we force creation of the column (== domain)
4749  // Map MV (if it hasn't already been created, else fetch the
4750  // cached copy). This avoids creating a new MV each time.
4751  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
4752  Tpetra::deep_copy (*X_colMapNonConst, X_in);
4753  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4754  }
4755  else {
4756  // The domain and column Maps are the same, so do the local
4757  // multiply using the domain Map input MV X_in.
4758  X_colMap = rcpFromRef (X_in);
4759  }
4760  }
4761  else { // need to Import source (multi)vector
4762  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
4763 
4764  // We're doing an Import anyway, which will copy the relevant
4765  // elements of the domain Map MV X_in into a separate column Map
4766  // MV. Thus, we don't have to worry whether X_in is constant
4767  // stride.
4768  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
4769 
4770  // Import from the domain Map MV to the column Map MV.
4771  X_colMapNonConst->doImport (X_in, *importer, INSERT);
4772  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4773  }
4774 
4775  // Temporary MV for doExport (if needed), or for copying a
4776  // nonconstant stride output MV into a constant stride MV. This
4777  // is null if we don't need the temporary MV, that is, if the
4778  // Export is trivial (null).
4779  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
4780 
4781  // If we have a nontrivial Export object, we must perform an
4782  // Export. In that case, the local multiply result will go into
4783  // the row Map multivector. We don't have to make a
4784  // constant-stride version of Y_in in this case, because we had to
4785  // make a constant stride Y_rowMap MV and do an Export anyway.
4786  if (! exporter.is_null ()) {
4787  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4788  {
4789  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
4790 
4791  // If we're overwriting the output MV Y_in completely (beta ==
4792  // 0), then make sure that it is filled with zeros before we
4793  // do the Export. Otherwise, the ADD combine mode will use
4794  // data in Y_in, which is supposed to be zero.
4795  if (Y_is_overwritten) {
4796  Y_in.putScalar (ZERO);
4797  }
4798  else {
4799  // Scale output MV by beta, so that doExport sums in the
4800  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4801  Y_in.scale (beta);
4802  }
4803  // Do the Export operation.
4804  Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
4805  }
4806  }
4807  else { // Don't do an Export: row Map and range Map are the same.
4808  //
4809  // If Y_in does not have constant stride, or if the column Map
4810  // MV aliases Y_in, then we can't let the kernel write directly
4811  // to Y_in. Instead, we have to use the cached row (== range)
4812  // Map MV as temporary storage.
4813  //
4814  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4815  // the user passed in the same MultiVector for both X and Y. It
4816  // won't detect whether one MultiVector views the other. We
4817  // should also check the MultiVectors' raw data pointers.
4818  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
4819  // Force creating the MV if it hasn't been created already.
4820  // This will reuse a previously created cached MV.
4821  Y_rowMap = getRowMapMultiVector (Y_in, true);
4822 
4823  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4824  // since we're overwriting it anyway.
4825  if (beta != ZERO) {
4826  Tpetra::deep_copy (*Y_rowMap, Y_in);
4827  }
4828  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
4829  Tpetra::deep_copy (Y_in, *Y_rowMap);
4830  }
4831  else {
4832  this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4833  }
4834  }
4835 
4836  // If the range Map is a locally replicated Map, sum up
4837  // contributions from each process. We set beta = 0 on all
4838  // processes but Proc 0 initially, so this will handle the scaling
4839  // factor beta correctly.
4840  if (Y_is_replicated) {
4841  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
4842  Y_in.reduce ();
4843  }
4844  }
4845 
4846  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4847  void
4851  const Teuchos::ETransp mode,
4852  Scalar alpha,
4853  Scalar beta) const
4854  {
4856  using Teuchos::null;
4857  using Teuchos::RCP;
4858  using Teuchos::rcp;
4859  using Teuchos::rcp_const_cast;
4860  using Teuchos::rcpFromRef;
4861  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4862 
4863  // Take shortcuts for alpha == 0.
4864  if (alpha == ZERO) {
4865  // Follow the Sparse BLAS convention by ignoring both the matrix
4866  // and X_in, in this case.
4867  if (beta == ZERO) {
4868  // Follow the Sparse BLAS convention by overwriting any Inf or
4869  // NaN values in Y_in, in this case.
4870  Y_in.putScalar (ZERO);
4871  }
4872  else {
4873  Y_in.scale (beta);
4874  }
4875  return;
4876  }
4877  else if (beta == ZERO) {
4878  //Thyra was implicitly assuming that Y gets set to zero / or is overwritten
4879  //when bets==0. This was not the case with transpose in a multithreaded
4880  //environment where a multiplication with subsequent atomic_adds is used
4881  //since 0 is effectively not special cased. Doing the explicit set to zero here
4882  //This catches cases where Y is nan or inf.
4883  Y_in.putScalar (ZERO);
4884  }
4885 
4886  const size_t numVectors = X_in.getNumVectors ();
4887 
4888  // We don't allow X_in and Y_in to alias one another. It's hard
4889  // to check this, because advanced users could create views from
4890  // raw pointers. However, if X_in and Y_in reference the same
4891  // object, we will do the user a favor by copying X into new
4892  // storage (with a warning). We only need to do this if we have
4893  // trivial importers; otherwise, we don't actually apply the
4894  // operator from X into Y.
4895  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4896  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4897  // access X indirectly, in case we need to create temporary storage
4898  RCP<const MV> X;
4899 
4900  // some parameters for below
4901  const bool Y_is_replicated = (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4902  const bool Y_is_overwritten = (beta == ZERO);
4903  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4904  beta = ZERO;
4905  }
4906 
4907  // The kernels do not allow input or output with nonconstant stride.
4908  if (! X_in.isConstantStride () && importer.is_null ()) {
4909  X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4910  } else {
4911  X = rcpFromRef (X_in); // Reference to X_in
4912  }
4913 
4914  // Set up temporary multivectors for Import and/or Export.
4915  if (importer != Teuchos::null) {
4916  if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4917  importMV_ = null;
4918  }
4919  if (importMV_ == null) {
4920  importMV_ = rcp (new MV (this->getColMap (), numVectors));
4921  }
4922  }
4923  if (exporter != Teuchos::null) {
4924  if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4925  exportMV_ = null;
4926  }
4927  if (exportMV_ == null) {
4928  exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
4929  }
4930  }
4931 
4932  // If we have a non-trivial exporter, we must import elements that
4933  // are permuted or are on other processors.
4934  if (! exporter.is_null ()) {
4935  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
4936  exportMV_->doImport (X_in, *exporter, INSERT);
4937  X = exportMV_; // multiply out of exportMV_
4938  }
4939 
4940  // If we have a non-trivial importer, we must export elements that
4941  // are permuted or belong to other processors. We will compute
4942  // solution into the to-be-exported MV; get a view.
4943  if (importer != Teuchos::null) {
4944  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
4945 
4946  // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4947  // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4948  // back and figure out why this helps. importMV_ SHOULD be
4949  // completely overwritten in the localApply(...) call
4950  // below, because beta == ZERO there.
4951  importMV_->putScalar (ZERO);
4952  // Do the local computation.
4953  this->localApply (*X, *importMV_, mode, alpha, ZERO);
4954 
4955  if (Y_is_overwritten) {
4956  Y_in.putScalar (ZERO);
4957  } else {
4958  Y_in.scale (beta);
4959  }
4960  Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
4961  }
4962  // otherwise, multiply into Y
4963  else {
4964  // can't multiply in-situ; can't multiply into non-strided multivector
4965  //
4966  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4967  // the user passed in the same MultiVector for both X and Y. It
4968  // won't detect whether one MultiVector views the other. We
4969  // should also check the MultiVectors' raw data pointers.
4970  if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
4971  // Make a deep copy of Y_in, into which to write the multiply result.
4972  MV Y (Y_in, Teuchos::Copy);
4973  this->localApply (*X, Y, mode, alpha, beta);
4974  Tpetra::deep_copy (Y_in, Y);
4975  } else {
4976  this->localApply (*X, Y_in, mode, alpha, beta);
4977  }
4978  }
4979 
4980  // If the range Map is a locally replicated map, sum the
4981  // contributions from each process. (That's why we set beta=0
4982  // above for all processes but Proc 0.)
4983  if (Y_is_replicated) {
4984  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
4985  Y_in.reduce ();
4986  }
4987  }
4988 
4989  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4990  void
4994  const Teuchos::ETransp mode,
4995  const Scalar& alpha,
4996  const Scalar& beta) const
4997  {
4999  using Teuchos::NO_TRANS;
5000  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
5001 
5002  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
5003  auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
5004 
5005  const bool debug = ::Tpetra::Details::Behavior::debug ();
5006  if (debug) {
5007  const char tfecfFuncName[] = "localApply: ";
5008  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5009  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5010  "X.getNumVectors() = " << X.getNumVectors () << " != "
5011  "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5012  const bool transpose = (mode != Teuchos::NO_TRANS);
5013  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5014  (! transpose && X.getLocalLength () !=
5015  getColMap ()->getLocalNumElements (), std::runtime_error,
5016  "NO_TRANS case: X has the wrong number of local rows. "
5017  "X.getLocalLength() = " << X.getLocalLength () << " != "
5018  "getColMap()->getLocalNumElements() = " <<
5019  getColMap ()->getLocalNumElements () << ".");
5020  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5021  (! transpose && Y.getLocalLength () !=
5022  getRowMap ()->getLocalNumElements (), std::runtime_error,
5023  "NO_TRANS case: Y has the wrong number of local rows. "
5024  "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5025  "getRowMap()->getLocalNumElements() = " <<
5026  getRowMap ()->getLocalNumElements () << ".");
5027  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5028  (transpose && X.getLocalLength () !=
5029  getRowMap ()->getLocalNumElements (), std::runtime_error,
5030  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5031  "rows. X.getLocalLength() = " << X.getLocalLength ()
5032  << " != getRowMap()->getLocalNumElements() = "
5033  << getRowMap ()->getLocalNumElements () << ".");
5034  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5035  (transpose && Y.getLocalLength () !=
5036  getColMap ()->getLocalNumElements (), std::runtime_error,
5037  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5038  "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5039  << " != getColMap()->getLocalNumElements() = "
5040  << getColMap ()->getLocalNumElements () << ".");
5041  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5042  (! isFillComplete (), std::runtime_error, "The matrix is not "
5043  "fill complete. You must call fillComplete() (possibly with "
5044  "domain and range Map arguments) without an intervening "
5045  "resumeFill() call before you may call this method.");
5046  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5047  (! X.isConstantStride () || ! Y.isConstantStride (),
5048  std::runtime_error, "X and Y must be constant stride.");
5049  // If the two pointers are null, then they don't alias one
5050  // another, even though they are equal.
5051  // Kokkos does not guarantee that zero row-extent vectors
5052  // point to different places, so we have to check that too.
5053  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5054  (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr
5055  && X_lcl.extent(0) != 0,
5056  std::runtime_error, "X and Y may not alias one another.");
5057  }
5058 
5059 #if KOKKOSKERNELS_VERSION >= 40299
5060  auto A_lcl = getLocalMatrixDevice();
5061 
5062  if(!applyHelper.get()) {
5063  // The apply helper does not exist, so create it.
5064  // Decide now whether to use the imbalanced row path, or the default.
5065  bool useMergePath = false;
5066 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
5067  //TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed and,
5068  //we can use SPMV_MERGE_PATH for the native spmv as well.
5069  //Take out this ifdef to enable that.
5070  //
5071  //Until then, only use SPMV_MERGE_PATH when calling cuSPARSE.
5072  if constexpr(std::is_same_v<execution_space, Kokkos::Cuda>) {
5073  LocalOrdinal nrows = getLocalNumRows();
5074  LocalOrdinal maxRowImbalance = 0;
5075  if(nrows != 0)
5076  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5077 
5078  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5079  useMergePath = true;
5080  }
5081 #endif
5082  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map,
5083  useMergePath ? KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT);
5084  }
5085 
5086  // Translate mode (Teuchos enum) to KokkosKernels (1-character string)
5087  const char* modeKK = nullptr;
5088  switch(mode)
5089  {
5090  case Teuchos::NO_TRANS:
5091  modeKK = KokkosSparse::NoTranspose; break;
5092  case Teuchos::TRANS:
5093  modeKK = KokkosSparse::Transpose; break;
5094  case Teuchos::CONJ_TRANS:
5095  modeKK = KokkosSparse::ConjugateTranspose; break;
5096  default:
5097  throw std::invalid_argument("Tpetra::CrsMatrix::localApply: invalid mode");
5098  }
5099 
5100  if(applyHelper->shouldUseIntRowptrs())
5101  {
5102  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
5103  KokkosSparse::spmv(
5104  &applyHelper->handle_int, modeKK,
5105  impl_scalar_type(alpha), A_lcl_int_rowptrs, X_lcl, impl_scalar_type(beta), Y_lcl);
5106  }
5107  else
5108  {
5109  KokkosSparse::spmv(
5110  &applyHelper->handle, modeKK,
5111  impl_scalar_type(alpha), A_lcl, X_lcl, impl_scalar_type(beta), Y_lcl);
5112  }
5113 #else
5114  LocalOrdinal nrows = getLocalNumRows();
5115  LocalOrdinal maxRowImbalance = 0;
5116  if(nrows != 0)
5117  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5118 
5119  auto matrix_lcl = getLocalMultiplyOperator();
5120  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5121  matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
5122  else
5123  matrix_lcl->apply (X_lcl, Y_lcl, mode, alpha, beta);
5124 #endif
5125  }
5126 
5127  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5128  void
5132  Teuchos::ETransp mode,
5133  Scalar alpha,
5134  Scalar beta) const
5135  {
5137  const char fnName[] = "Tpetra::CrsMatrix::apply";
5138 
5139  TEUCHOS_TEST_FOR_EXCEPTION
5140  (! isFillComplete (), std::runtime_error,
5141  fnName << ": Cannot call apply() until fillComplete() "
5142  "has been called.");
5143 
5144  if (mode == Teuchos::NO_TRANS) {
5145  ProfilingRegion regionNonTranspose (fnName);
5146  this->applyNonTranspose (X, Y, alpha, beta);
5147  }
5148  else {
5149  ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5150  this->applyTranspose (X, Y, mode, alpha, beta);
5151  }
5152  }
5153 
5154 
5155  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5156  template<class T>
5157  Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
5159  convert () const
5160  {
5161  using Teuchos::RCP;
5162  typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
5163  const char tfecfFuncName[] = "convert: ";
5164 
5165  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5166  (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
5167  "of the conversion) is not fill complete. You must first call "
5168  "fillComplete() (possibly with the domain and range Map) without an "
5169  "intervening call to resumeFill(), before you may call this method.");
5170 
5171  RCP<output_matrix_type> newMatrix
5172  (new output_matrix_type (this->getCrsGraph ()));
5173  // Copy old values into new values. impl_scalar_type and T may
5174  // differ, so we can't use Kokkos::deep_copy.
5176  copyConvert (newMatrix->getLocalMatrixDevice ().values,
5177  this->getLocalMatrixDevice ().values);
5178  // Since newmat has a static (const) graph, the graph already has
5179  // a column Map, and Import and Export objects already exist (if
5180  // applicable). Thus, calling fillComplete is cheap.
5181  newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
5182 
5183  return newMatrix;
5184  }
5185 
5186 
5187  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5188  void
5191  {
5192  const bool debug = ::Tpetra::Details::Behavior::debug ("CrsGraph");
5193  if (debug) {
5194  const char tfecfFuncName[] = "checkInternalState: ";
5195  const char err[] = "Internal state is not consistent. "
5196  "Please report this bug to the Tpetra developers.";
5197 
5198  // This version of the graph (RCP<const crs_graph_type>) must
5199  // always be nonnull.
5200  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5201  (staticGraph_.is_null (), std::logic_error, err);
5202  // myGraph == null means that the matrix has a const ("static")
5203  // graph. Otherwise, the matrix has a dynamic graph (it owns its
5204  // graph).
5205  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5206  (! myGraph_.is_null () && myGraph_ != staticGraph_,
5207  std::logic_error, err);
5208  // if matrix is fill complete, then graph must be fill complete
5209  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5210  (isFillComplete () && ! staticGraph_->isFillComplete (),
5211  std::logic_error, err << " Specifically, the matrix is fill complete, "
5212  "but its graph is NOT fill complete.");
5213  // if values are allocated and they are non-zero in number, then
5214  // one of the allocations should be present
5215  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5216  (staticGraph_->indicesAreAllocated () &&
5217  staticGraph_->getLocalAllocationSize() > 0 &&
5218  staticGraph_->getLocalNumRows() > 0 &&
5219  valuesUnpacked_wdv.extent (0) == 0,
5220  std::logic_error, err);
5221  }
5222  }
5223 
5224  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5225  std::string
5228  {
5229  std::ostringstream os;
5230 
5231  os << "Tpetra::CrsMatrix (Kokkos refactor): {";
5232  if (this->getObjectLabel () != "") {
5233  os << "Label: \"" << this->getObjectLabel () << "\", ";
5234  }
5235  if (isFillComplete ()) {
5236  os << "isFillComplete: true"
5237  << ", global dimensions: [" << getGlobalNumRows () << ", "
5238  << getGlobalNumCols () << "]"
5239  << ", global number of entries: " << getGlobalNumEntries ()
5240  << "}";
5241  }
5242  else {
5243  os << "isFillComplete: false"
5244  << ", global dimensions: [" << getGlobalNumRows () << ", "
5245  << getGlobalNumCols () << "]}";
5246  }
5247  return os.str ();
5248  }
5249 
5250  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5251  void
5253  describe (Teuchos::FancyOStream &out,
5254  const Teuchos::EVerbosityLevel verbLevel) const
5255  {
5256  using std::endl;
5257  using std::setw;
5258  using Teuchos::ArrayView;
5259  using Teuchos::Comm;
5260  using Teuchos::RCP;
5261  using Teuchos::TypeNameTraits;
5262  using Teuchos::VERB_DEFAULT;
5263  using Teuchos::VERB_NONE;
5264  using Teuchos::VERB_LOW;
5265  using Teuchos::VERB_MEDIUM;
5266  using Teuchos::VERB_HIGH;
5267  using Teuchos::VERB_EXTREME;
5268 
5269  const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
5270 
5271  if (vl == VERB_NONE) {
5272  return; // Don't print anything at all
5273  }
5274 
5275  // By convention, describe() always begins with a tab.
5276  Teuchos::OSTab tab0 (out);
5277 
5278  RCP<const Comm<int> > comm = this->getComm();
5279  const int myRank = comm->getRank();
5280  const int numProcs = comm->getSize();
5281  size_t width = 1;
5282  for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
5283  ++width;
5284  }
5285  width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
5286 
5287  // none: print nothing
5288  // low: print O(1) info from node 0
5289  // medium: print O(P) info, num entries per process
5290  // high: print O(N) info, num entries per row
5291  // extreme: print O(NNZ) info: print indices and values
5292  //
5293  // for medium and higher, print constituent objects at specified verbLevel
5294  if (myRank == 0) {
5295  out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5296  }
5297  Teuchos::OSTab tab1 (out);
5298 
5299  if (myRank == 0) {
5300  if (this->getObjectLabel () != "") {
5301  out << "Label: \"" << this->getObjectLabel () << "\", ";
5302  }
5303  {
5304  out << "Template parameters:" << endl;
5305  Teuchos::OSTab tab2 (out);
5306  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
5307  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
5308  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
5309  << "Node: " << TypeNameTraits<Node>::name () << endl;
5310  }
5311  if (isFillComplete()) {
5312  out << "isFillComplete: true" << endl
5313  << "Global dimensions: [" << getGlobalNumRows () << ", "
5314  << getGlobalNumCols () << "]" << endl
5315  << "Global number of entries: " << getGlobalNumEntries () << endl
5316  << endl << "Global max number of entries in a row: "
5317  << getGlobalMaxNumRowEntries () << endl;
5318  }
5319  else {
5320  out << "isFillComplete: false" << endl
5321  << "Global dimensions: [" << getGlobalNumRows () << ", "
5322  << getGlobalNumCols () << "]" << endl;
5323  }
5324  }
5325 
5326  if (vl < VERB_MEDIUM) {
5327  return; // all done!
5328  }
5329 
5330  // Describe the row Map.
5331  if (myRank == 0) {
5332  out << endl << "Row Map:" << endl;
5333  }
5334  if (getRowMap ().is_null ()) {
5335  if (myRank == 0) {
5336  out << "null" << endl;
5337  }
5338  }
5339  else {
5340  if (myRank == 0) {
5341  out << endl;
5342  }
5343  getRowMap ()->describe (out, vl);
5344  }
5345 
5346  // Describe the column Map.
5347  if (myRank == 0) {
5348  out << "Column Map: ";
5349  }
5350  if (getColMap ().is_null ()) {
5351  if (myRank == 0) {
5352  out << "null" << endl;
5353  }
5354  } else if (getColMap () == getRowMap ()) {
5355  if (myRank == 0) {
5356  out << "same as row Map" << endl;
5357  }
5358  } else {
5359  if (myRank == 0) {
5360  out << endl;
5361  }
5362  getColMap ()->describe (out, vl);
5363  }
5364 
5365  // Describe the domain Map.
5366  if (myRank == 0) {
5367  out << "Domain Map: ";
5368  }
5369  if (getDomainMap ().is_null ()) {
5370  if (myRank == 0) {
5371  out << "null" << endl;
5372  }
5373  } else if (getDomainMap () == getRowMap ()) {
5374  if (myRank == 0) {
5375  out << "same as row Map" << endl;
5376  }
5377  } else if (getDomainMap () == getColMap ()) {
5378  if (myRank == 0) {
5379  out << "same as column Map" << endl;
5380  }
5381  } else {
5382  if (myRank == 0) {
5383  out << endl;
5384  }
5385  getDomainMap ()->describe (out, vl);
5386  }
5387 
5388  // Describe the range Map.
5389  if (myRank == 0) {
5390  out << "Range Map: ";
5391  }
5392  if (getRangeMap ().is_null ()) {
5393  if (myRank == 0) {
5394  out << "null" << endl;
5395  }
5396  } else if (getRangeMap () == getDomainMap ()) {
5397  if (myRank == 0) {
5398  out << "same as domain Map" << endl;
5399  }
5400  } else if (getRangeMap () == getRowMap ()) {
5401  if (myRank == 0) {
5402  out << "same as row Map" << endl;
5403  }
5404  } else {
5405  if (myRank == 0) {
5406  out << endl;
5407  }
5408  getRangeMap ()->describe (out, vl);
5409  }
5410 
5411  // O(P) data
5412  for (int curRank = 0; curRank < numProcs; ++curRank) {
5413  if (myRank == curRank) {
5414  out << "Process rank: " << curRank << endl;
5415  Teuchos::OSTab tab2 (out);
5416  if (! staticGraph_->indicesAreAllocated ()) {
5417  out << "Graph indices not allocated" << endl;
5418  }
5419  else {
5420  out << "Number of allocated entries: "
5421  << staticGraph_->getLocalAllocationSize () << endl;
5422  }
5423  out << "Number of entries: " << getLocalNumEntries () << endl
5424  << "Max number of entries per row: " << getLocalMaxNumRowEntries ()
5425  << endl;
5426  }
5427  // Give output time to complete by executing some barriers.
5428  comm->barrier ();
5429  comm->barrier ();
5430  comm->barrier ();
5431  }
5432 
5433  if (vl < VERB_HIGH) {
5434  return; // all done!
5435  }
5436 
5437  // O(N) and O(NNZ) data
5438  for (int curRank = 0; curRank < numProcs; ++curRank) {
5439  if (myRank == curRank) {
5440  out << std::setw(width) << "Proc Rank"
5441  << std::setw(width) << "Global Row"
5442  << std::setw(width) << "Num Entries";
5443  if (vl == VERB_EXTREME) {
5444  out << std::setw(width) << "(Index,Value)";
5445  }
5446  out << endl;
5447  for (size_t r = 0; r < getLocalNumRows (); ++r) {
5448  const size_t nE = getNumEntriesInLocalRow(r);
5449  GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5450  out << std::setw(width) << myRank
5451  << std::setw(width) << gid
5452  << std::setw(width) << nE;
5453  if (vl == VERB_EXTREME) {
5454  if (isGloballyIndexed()) {
5455  global_inds_host_view_type rowinds;
5456  values_host_view_type rowvals;
5457  getGlobalRowView (gid, rowinds, rowvals);
5458  for (size_t j = 0; j < nE; ++j) {
5459  out << " (" << rowinds[j]
5460  << ", " << rowvals[j]
5461  << ") ";
5462  }
5463  }
5464  else if (isLocallyIndexed()) {
5465  local_inds_host_view_type rowinds;
5466  values_host_view_type rowvals;
5467  getLocalRowView (r, rowinds, rowvals);
5468  for (size_t j=0; j < nE; ++j) {
5469  out << " (" << getColMap()->getGlobalElement(rowinds[j])
5470  << ", " << rowvals[j]
5471  << ") ";
5472  }
5473  } // globally or locally indexed
5474  } // vl == VERB_EXTREME
5475  out << endl;
5476  } // for each row r on this process
5477  } // if (myRank == curRank)
5478 
5479  // Give output time to complete
5480  comm->barrier ();
5481  comm->barrier ();
5482  comm->barrier ();
5483  } // for each process p
5484  }
5485 
5486  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5487  bool
5490  {
5491  // It's not clear what kind of compatibility checks on sizes can
5492  // be performed here. Epetra_CrsGraph doesn't check any sizes for
5493  // compatibility.
5494 
5495  // Currently, the source object must be a RowMatrix with the same
5496  // four template parameters as the target CrsMatrix. We might
5497  // relax this requirement later.
5498  const row_matrix_type* srcRowMat =
5499  dynamic_cast<const row_matrix_type*> (&source);
5500  return (srcRowMat != nullptr);
5501  }
5502 
5503  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5504  void
5507  const typename crs_graph_type::padding_type& padding,
5508  const bool verbose)
5509  {
5511  using Details::padCrsArrays;
5512  using std::endl;
5513  using LO = local_ordinal_type;
5514  using row_ptrs_type =
5515  typename local_graph_device_type::row_map_type::non_const_type;
5516  using range_policy =
5517  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5518  const char tfecfFuncName[] = "applyCrsPadding";
5519  const char suffix[] =
5520  ". Please report this bug to the Tpetra developers.";
5521  ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5522 
5523  std::unique_ptr<std::string> prefix;
5524  if (verbose) {
5525  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5526  std::ostringstream os;
5527  os << *prefix << "padding: ";
5528  padding.print(os);
5529  os << endl;
5530  std::cerr << os.str();
5531  }
5532  const int myRank = ! verbose ? -1 : [&] () {
5533  auto map = this->getMap();
5534  if (map.is_null()) {
5535  return -1;
5536  }
5537  auto comm = map->getComm();
5538  if (comm.is_null()) {
5539  return -1;
5540  }
5541  return comm->getRank();
5542  } ();
5543 
5544  // NOTE (mfh 29 Jan 2020) This allocates the values array.
5545  if (! myGraph_->indicesAreAllocated()) {
5546  if (verbose) {
5547  std::ostringstream os;
5548  os << *prefix << "Call allocateIndices" << endl;
5549  std::cerr << os.str();
5550  }
5551  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5552  }
5553 
5554  // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5555  // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5556  // size needs to increase. That should be the job of
5557  // padCrsArrays.
5558 
5559  // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5560  // would use it directly.
5561 
5562  if (verbose) {
5563  std::ostringstream os;
5564  os << *prefix << "Allocate row_ptrs_beg: "
5565  << myGraph_->getRowPtrsUnpackedHost().extent(0) << endl;
5566  std::cerr << os.str();
5567  }
5568  using Kokkos::view_alloc;
5569  using Kokkos::WithoutInitializing;
5570  row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5571  myGraph_->rowPtrsUnpacked_dev_.extent(0));
5572  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5573  Kokkos::deep_copy(execution_space(),row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5574 
5575  const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
5576  size_t(row_ptr_beg.extent(0) - 1);
5577  if (verbose) {
5578  std::ostringstream os;
5579  os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5580  std::cerr << os.str();
5581  }
5582  row_ptrs_type row_ptr_end(
5583  view_alloc("row_ptr_end", WithoutInitializing), N);
5584 
5585  row_ptrs_type num_row_entries_d;
5586 
5587  const bool refill_num_row_entries =
5588  myGraph_->k_numRowEntries_.extent(0) != 0;
5589 
5590  if (refill_num_row_entries) { // unpacked storage
5591  // We can't assume correct *this capture until C++17, and it's
5592  // likely more efficient just to capture what we need anyway.
5593  num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5594  myGraph_->k_numRowEntries_);
5595  Kokkos::parallel_for
5596  ("Fill end row pointers", range_policy(0, N),
5597  KOKKOS_LAMBDA (const size_t i) {
5598  row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5599  });
5600  }
5601  else {
5602  // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5603  // storage, we don't need row_ptr_end to be separate allocation;
5604  // could just have it alias row_ptr_beg+1.
5605  Kokkos::parallel_for
5606  ("Fill end row pointers", range_policy(0, N),
5607  KOKKOS_LAMBDA (const size_t i) {
5608  row_ptr_end(i) = row_ptr_beg(i+1);
5609  });
5610  }
5611 
5612  if (myGraph_->isGloballyIndexed()) {
5613  padCrsArrays(row_ptr_beg, row_ptr_end,
5614  myGraph_->gblInds_wdv,
5615  valuesUnpacked_wdv, padding, myRank, verbose);
5616  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5617  const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5618  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5619  (newValuesLen != newColIndsLen, std::logic_error,
5620  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5621  << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5622  << suffix);
5623  }
5624  else {
5625  padCrsArrays(row_ptr_beg, row_ptr_end,
5626  myGraph_->lclIndsUnpacked_wdv,
5627  valuesUnpacked_wdv, padding, myRank, verbose);
5628  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5629  const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5630  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5631  (newValuesLen != newColIndsLen, std::logic_error,
5632  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5633  << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5634  << suffix);
5635  }
5636 
5637  if (refill_num_row_entries) {
5638  Kokkos::parallel_for
5639  ("Fill num entries", range_policy(0, N),
5640  KOKKOS_LAMBDA (const size_t i) {
5641  num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5642  });
5643  Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5644  }
5645 
5646  if (verbose) {
5647  std::ostringstream os;
5648  os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5649  << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5650  << ", new size: " << row_ptr_beg.extent(0) << endl;
5651  std::cerr << os.str();
5652  TEUCHOS_ASSERT( myGraph_->getRowPtrsUnpackedHost().extent(0) ==
5653  row_ptr_beg.extent(0) );
5654  }
5655  myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5656  }
5657 
5658  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5659  void
5660  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5661  copyAndPermuteStaticGraph(
5662  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5663  const size_t numSameIDs,
5664  const LocalOrdinal permuteToLIDs[],
5665  const LocalOrdinal permuteFromLIDs[],
5666  const size_t numPermutes)
5667  {
5668  using Details::ProfilingRegion;
5669  using Teuchos::Array;
5670  using Teuchos::ArrayView;
5671  using std::endl;
5672  using LO = LocalOrdinal;
5673  using GO = GlobalOrdinal;
5674  const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5675  const char suffix[] =
5676  " Please report this bug to the Tpetra developers.";
5677  ProfilingRegion regionCAP
5678  ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5679 
5680  const bool debug = Details::Behavior::debug("CrsGraph");
5681  const bool verbose = Details::Behavior::verbose("CrsGraph");
5682  std::unique_ptr<std::string> prefix;
5683  if (verbose) {
5684  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5685  std::ostringstream os;
5686  os << *prefix << "Start" << endl;
5687  }
5688  const char* const prefix_raw =
5689  verbose ? prefix.get()->c_str() : nullptr;
5690 
5691  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5692  //
5693  // Copy the first numSame row from source to target (this matrix).
5694  // This involves copying rows corresponding to LIDs [0, numSame-1].
5695  //
5696  const map_type& srcRowMap = * (srcMat.getRowMap ());
5697  nonconst_global_inds_host_view_type rowInds;
5698  nonconst_values_host_view_type rowVals;
5699  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5700  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5701  // Global ID for the current row index in the source matrix.
5702  // The first numSameIDs GIDs in the two input lists are the
5703  // same, so sourceGID == targetGID in this case.
5704  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5705  const GO targetGID = sourceGID;
5706 
5707  ArrayView<const GO>rowIndsConstView;
5708  ArrayView<const Scalar> rowValsConstView;
5709 
5710  if (sourceIsLocallyIndexed) {
5711  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5712  if (rowLength > static_cast<size_t> (rowInds.size())) {
5713  Kokkos::resize(rowInds,rowLength);
5714  Kokkos::resize(rowVals,rowLength);
5715  }
5716  // Resizing invalidates an Array's views, so we must make new
5717  // ones, even if rowLength hasn't changed.
5718  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5719  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5720 
5721  // The source matrix is locally indexed, so we have to get a
5722  // copy. Really it's the GIDs that have to be copied (because
5723  // they have to be converted from LIDs).
5724  size_t checkRowLength = 0;
5725  srcMat.getGlobalRowCopy (sourceGID, rowIndsView,
5726  rowValsView, checkRowLength);
5727  if (debug) {
5728  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5729  (rowLength != checkRowLength, std::logic_error, "For "
5730  "global row index " << sourceGID << ", the source "
5731  "matrix's getNumEntriesInGlobalRow returns a row length "
5732  "of " << rowLength << ", but getGlobalRowCopy reports "
5733  "a row length of " << checkRowLength << "." << suffix);
5734  }
5735 
5736  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5737  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5738  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5739  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5740  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5741  rowIndsView.data(), rowIndsView.extent(0),
5742  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5743  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5744  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5745  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5746  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5747  // KDDKDD UVM TEMPORARY: KokkosView interface
5748  }
5749  else { // source matrix is globally indexed.
5750  global_inds_host_view_type rowIndsView;
5751  values_host_view_type rowValsView;
5752  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5753  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5754  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5755  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5756  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5757  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5758  rowIndsView.data(), rowIndsView.extent(0),
5759  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5760  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5761  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5762  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5763  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5764  // KDDKDD UVM TEMPORARY: KokkosView interface
5765 
5766  }
5767 
5768  // Applying a permutation to a matrix with a static graph
5769  // means REPLACE-ing entries.
5770  combineGlobalValues(targetGID, rowIndsConstView,
5771  rowValsConstView, REPLACE,
5772  prefix_raw, debug, verbose);
5773  }
5774 
5775  if (verbose) {
5776  std::ostringstream os;
5777  os << *prefix << "Do permutes" << endl;
5778  }
5779 
5780  const map_type& tgtRowMap = * (this->getRowMap ());
5781  for (size_t p = 0; p < numPermutes; ++p) {
5782  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5783  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5784 
5785  ArrayView<const GO> rowIndsConstView;
5786  ArrayView<const Scalar> rowValsConstView;
5787 
5788  if (sourceIsLocallyIndexed) {
5789  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5790  if (rowLength > static_cast<size_t> (rowInds.size ())) {
5791  Kokkos::resize(rowInds,rowLength);
5792  Kokkos::resize(rowVals,rowLength);
5793  }
5794  // Resizing invalidates an Array's views, so we must make new
5795  // ones, even if rowLength hasn't changed.
5796  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5797  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5798 
5799  // The source matrix is locally indexed, so we have to get a
5800  // copy. Really it's the GIDs that have to be copied (because
5801  // they have to be converted from LIDs).
5802  size_t checkRowLength = 0;
5803  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5804  rowValsView, checkRowLength);
5805  if (debug) {
5806  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5807  (rowLength != checkRowLength, std::logic_error, "For "
5808  "source matrix global row index " << sourceGID << ", "
5809  "getNumEntriesInGlobalRow returns a row length of " <<
5810  rowLength << ", but getGlobalRowCopy a row length of "
5811  << checkRowLength << "." << suffix);
5812  }
5813 
5814  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5815  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5816  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5817  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5818  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5819  rowIndsView.data(), rowIndsView.extent(0),
5820  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5821  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5822  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5823  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5824  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5825  // KDDKDD UVM TEMPORARY: KokkosView interface
5826  }
5827  else {
5828  global_inds_host_view_type rowIndsView;
5829  values_host_view_type rowValsView;
5830  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5831  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5832  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5833  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5834  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5835  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5836  rowIndsView.data(), rowIndsView.extent(0),
5837  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5838  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5839  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5840  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5841  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5842  // KDDKDD UVM TEMPORARY: KokkosView interface
5843  }
5844 
5845  combineGlobalValues(targetGID, rowIndsConstView,
5846  rowValsConstView, REPLACE,
5847  prefix_raw, debug, verbose);
5848  }
5849 
5850  if (verbose) {
5851  std::ostringstream os;
5852  os << *prefix << "Done" << endl;
5853  }
5854  }
5855 
5856  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5857  void
5858  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5859  copyAndPermuteNonStaticGraph(
5860  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5861  const size_t numSameIDs,
5862  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5863  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5864  const size_t numPermutes)
5865  {
5866  using Details::ProfilingRegion;
5867  using Teuchos::Array;
5868  using Teuchos::ArrayView;
5869  using std::endl;
5870  using LO = LocalOrdinal;
5871  using GO = GlobalOrdinal;
5872  const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5873  const char suffix[] =
5874  " Please report this bug to the Tpetra developers.";
5875  ProfilingRegion regionCAP
5876  ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5877 
5878  const bool debug = Details::Behavior::debug("CrsGraph");
5879  const bool verbose = Details::Behavior::verbose("CrsGraph");
5880  std::unique_ptr<std::string> prefix;
5881  if (verbose) {
5882  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5883  std::ostringstream os;
5884  os << *prefix << "Start" << endl;
5885  }
5886  const char* const prefix_raw =
5887  verbose ? prefix.get()->c_str() : nullptr;
5888 
5889  {
5890  using row_graph_type = RowGraph<LO, GO, Node>;
5891  const row_graph_type& srcGraph = *(srcMat.getGraph());
5892  auto padding =
5893  myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5894  permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5895  applyCrsPadding(*padding, verbose);
5896  }
5897  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5898  //
5899  // Copy the first numSame row from source to target (this matrix).
5900  // This involves copying rows corresponding to LIDs [0, numSame-1].
5901  //
5902  const map_type& srcRowMap = * (srcMat.getRowMap ());
5903  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5904  using gids_type = nonconst_global_inds_host_view_type;
5905  using vals_type = nonconst_values_host_view_type;
5906  gids_type rowInds;
5907  vals_type rowVals;
5908  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5909  // Global ID for the current row index in the source matrix.
5910  // The first numSameIDs GIDs in the two input lists are the
5911  // same, so sourceGID == targetGID in this case.
5912  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5913  const GO targetGID = sourceGID;
5914 
5915  ArrayView<const GO> rowIndsConstView;
5916  ArrayView<const Scalar> rowValsConstView;
5917 
5918  if (sourceIsLocallyIndexed) {
5919 
5920  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5921  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5922  Kokkos::resize(rowInds,rowLength);
5923  Kokkos::resize(rowVals,rowLength);
5924  }
5925  // Resizing invalidates an Array's views, so we must make new
5926  // ones, even if rowLength hasn't changed.
5927  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5928  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5929 
5930  // The source matrix is locally indexed, so we have to get a
5931  // copy. Really it's the GIDs that have to be copied (because
5932  // they have to be converted from LIDs).
5933  size_t checkRowLength = 0;
5934  srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView,
5935  checkRowLength);
5936  if (debug) {
5937  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5938  (rowLength != checkRowLength, std::logic_error, ": For "
5939  "global row index " << sourceGID << ", the source "
5940  "matrix's getNumEntriesInGlobalRow returns a row length "
5941  "of " << rowLength << ", but getGlobalRowCopy reports "
5942  "a row length of " << checkRowLength << "." << suffix);
5943  }
5944  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5945  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5946  }
5947  else { // source matrix is globally indexed.
5948  global_inds_host_view_type rowIndsView;
5949  values_host_view_type rowValsView;
5950  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5951 
5952  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5953  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5954  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5955  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5956  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5957  rowIndsView.data(), rowIndsView.extent(0),
5958  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5959  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5960  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5961  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5962  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5963  // KDDKDD UVM TEMPORARY: KokkosView interface
5964  }
5965 
5966  // Combine the data into the target matrix.
5967  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5968  rowValsConstView, prefix_raw, debug, verbose);
5969  }
5970 
5971  if (verbose) {
5972  std::ostringstream os;
5973  os << *prefix << "Do permutes" << endl;
5974  }
5975  const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
5976  const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
5977 
5978  const map_type& tgtRowMap = * (this->getRowMap ());
5979  for (size_t p = 0; p < numPermutes; ++p) {
5980  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5981  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5982 
5983  ArrayView<const GO> rowIndsConstView;
5984  ArrayView<const Scalar> rowValsConstView;
5985 
5986  if (sourceIsLocallyIndexed) {
5987  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5988  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5989  Kokkos::resize(rowInds,rowLength);
5990  Kokkos::resize(rowVals,rowLength);
5991  }
5992  // Resizing invalidates an Array's views, so we must make new
5993  // ones, even if rowLength hasn't changed.
5994  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5995  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5996 
5997  // The source matrix is locally indexed, so we have to get a
5998  // copy. Really it's the GIDs that have to be copied (because
5999  // they have to be converted from LIDs).
6000  size_t checkRowLength = 0;
6001  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
6002  rowValsView, checkRowLength);
6003  if (debug) {
6004  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6005  (rowLength != checkRowLength, std::logic_error, "For "
6006  "source matrix global row index " << sourceGID << ", "
6007  "getNumEntriesInGlobalRow returns a row length of " <<
6008  rowLength << ", but getGlobalRowCopy a row length of "
6009  << checkRowLength << "." << suffix);
6010  }
6011  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
6012  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
6013  }
6014  else {
6015  global_inds_host_view_type rowIndsView;
6016  values_host_view_type rowValsView;
6017  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6018 
6019  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6020  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6021  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6022  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6023  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6024  rowIndsView.data(), rowIndsView.extent(0),
6025  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6026  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6027  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6028  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6029  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6030  // KDDKDD UVM TEMPORARY: KokkosView interface
6031  }
6032 
6033  // Combine the data into the target matrix.
6034  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
6035  rowValsConstView, prefix_raw, debug, verbose);
6036  }
6037 
6038  if (verbose) {
6039  std::ostringstream os;
6040  os << *prefix << "Done" << endl;
6041  }
6042  }
6043 
6044  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6045  void
6048  const SrcDistObject& srcObj,
6049  const size_t numSameIDs,
6050  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6051  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
6052  const CombineMode /*CM*/)
6053  {
6054  using Details::Behavior;
6057  using std::endl;
6058 
6059  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6060  const char tfecfFuncName[] = "copyAndPermute: ";
6061  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
6062 
6063  const bool verbose = Behavior::verbose("CrsMatrix");
6064  std::unique_ptr<std::string> prefix;
6065  if (verbose) {
6066  prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
6067  std::ostringstream os;
6068  os << *prefix << endl
6069  << *prefix << " numSameIDs: " << numSameIDs << endl
6070  << *prefix << " numPermute: " << permuteToLIDs.extent(0)
6071  << endl
6072  << *prefix << " "
6073  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
6074  << endl
6075  << *prefix << " "
6076  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
6077  << endl
6078  << *prefix << " "
6079  << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
6080  << endl;
6081  std::cerr << os.str ();
6082  }
6083 
6084  const auto numPermute = permuteToLIDs.extent (0);
6085  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6086  (numPermute != permuteFromLIDs.extent (0),
6087  std::invalid_argument, "permuteToLIDs.extent(0) = "
6088  << numPermute << "!= permuteFromLIDs.extent(0) = "
6089  << permuteFromLIDs.extent (0) << ".");
6090 
6091  // This dynamic cast should succeed, because we've already tested
6092  // it in checkSizes().
6094  const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
6095  if (isStaticGraph ()) {
6096  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
6097  auto permuteToLIDs_h = permuteToLIDs.view_host ();
6098  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
6099  auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
6100 
6101  copyAndPermuteStaticGraph(srcMat, numSameIDs,
6102  permuteToLIDs_h.data(),
6103  permuteFromLIDs_h.data(),
6104  numPermute);
6105  }
6106  else {
6107  copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
6108  permuteFromLIDs, numPermute);
6109  }
6110 
6111  if (verbose) {
6112  std::ostringstream os;
6113  os << *prefix << "Done" << endl;
6114  std::cerr << os.str();
6115  }
6116  }
6117 
6118  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6119  void
6122  (const SrcDistObject& source,
6123  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6124  Kokkos::DualView<char*, buffer_device_type>& exports,
6125  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6126  size_t& constantNumPackets)
6127  {
6128  using Details::Behavior;
6131  using Teuchos::outArg;
6132  using Teuchos::REDUCE_MAX;
6133  using Teuchos::reduceAll;
6134  using std::endl;
6135  typedef LocalOrdinal LO;
6136  typedef GlobalOrdinal GO;
6137  const char tfecfFuncName[] = "packAndPrepare: ";
6138  ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
6139 
6140  const bool debug = Behavior::debug("CrsMatrix");
6141  const bool verbose = Behavior::verbose("CrsMatrix");
6142 
6143  // Processes on which the communicator is null should not participate.
6144  Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
6145  if (pComm.is_null ()) {
6146  return;
6147  }
6148  const Teuchos::Comm<int>& comm = *pComm;
6149  const int myRank = comm.getSize ();
6150 
6151  std::unique_ptr<std::string> prefix;
6152  if (verbose) {
6153  prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
6154  std::ostringstream os;
6155  os << *prefix << "Start" << endl
6156  << *prefix << " "
6157  << dualViewStatusToString (exportLIDs, "exportLIDs")
6158  << endl
6159  << *prefix << " "
6160  << dualViewStatusToString (exports, "exports")
6161  << endl
6162  << *prefix << " "
6163  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6164  << endl;
6165  std::cerr << os.str ();
6166  }
6167 
6168  // Attempt to cast the source object to CrsMatrix. If successful,
6169  // use the source object's packNew() method to pack its data for
6170  // communication. Otherwise, attempt to cast to RowMatrix; if
6171  // successful, use the source object's pack() method. Otherwise,
6172  // the source object doesn't have the right type.
6173  //
6174  // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
6175  // RowMatrix to have the same Node type. Unfortunately, we don't
6176  // have a way to ask if the RowMatrix is "a RowMatrix with any
6177  // Node type," since RowMatrix doesn't have a base class. A
6178  // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
6179  // not currently exist, would satisfy this requirement.
6180  //
6181  // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
6182  // type doesn't technically need to match the target object's
6183  // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
6184  // and GO need not be the same, as long as there is no overflow of
6185  // the indices. However, checking for index overflow is global
6186  // and therefore undesirable.
6187 
6188  std::ostringstream msg; // for collecting error messages
6189  int lclBad = 0; // to be set below
6190 
6191  using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
6192  const crs_matrix_type* srcCrsMat =
6193  dynamic_cast<const crs_matrix_type*> (&source);
6194  if (srcCrsMat != nullptr) {
6195  if (verbose) {
6196  std::ostringstream os;
6197  os << *prefix << "Source matrix same (CrsMatrix) type as target; "
6198  "calling packNew" << endl;
6199  std::cerr << os.str ();
6200  }
6201  try {
6202  srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
6203  constantNumPackets);
6204  }
6205  catch (std::exception& e) {
6206  lclBad = 1;
6207  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6208  }
6209  }
6210  else {
6211  using Kokkos::HostSpace;
6212  using Kokkos::subview;
6213  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6214  using range_type = Kokkos::pair<size_t, size_t>;
6215 
6216  if (verbose) {
6217  std::ostringstream os;
6218  os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
6219  << endl;
6220  std::cerr << os.str ();
6221  }
6222 
6223  const row_matrix_type* srcRowMat =
6224  dynamic_cast<const row_matrix_type*> (&source);
6225  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6226  (srcRowMat == nullptr, std::invalid_argument,
6227  "The source object of the Import or Export operation is neither a "
6228  "CrsMatrix (with the same template parameters as the target object), "
6229  "nor a RowMatrix (with the same first four template parameters as the "
6230  "target object).");
6231 
6232  // For the RowMatrix case, we need to convert from
6233  // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
6234  // so terribly efficient, since packing a non-CrsMatrix
6235  // RowMatrix for Import/Export into a CrsMatrix is not a
6236  // critical case. Thus, we may allocate Teuchos::Array objects
6237  // here and copy to and from Kokkos::*View.
6238 
6239  // View exportLIDs's host data as a Teuchos::ArrayView.
6240  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6241  auto exportLIDs_h = exportLIDs.view_host ();
6242  Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
6243  exportLIDs_h.size ());
6244 
6245  // pack() will allocate exports_a as needed. We'll copy back
6246  // into exports (after (re)allocating exports if needed) below.
6247  Teuchos::Array<char> exports_a;
6248 
6249  // View exportLIDs' host data as a Teuchos::ArrayView. We don't
6250  // need to sync, since we're doing write-only access, but we do
6251  // need to mark the DualView as modified on host.
6252 
6253  numPacketsPerLID.clear_sync_state (); // write-only access
6254  numPacketsPerLID.modify_host ();
6255  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6256  Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
6257  numPacketsPerLID_h.size ());
6258 
6259  // Invoke RowMatrix's legacy pack() interface, using above
6260  // Teuchos::Array* objects.
6261  try {
6262  srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
6263  constantNumPackets);
6264  }
6265  catch (std::exception& e) {
6266  lclBad = 1;
6267  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6268  }
6269 
6270  // Allocate 'exports', and copy exports_a back into it.
6271  const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
6272  if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
6273  const std::string oldLabel = exports.d_view.label ();
6274  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6275  exports = exports_type (newLabel, newAllocSize);
6276  }
6277  // It's safe to assume that we're working on host anyway, so
6278  // just keep exports sync'd to host.
6279  // ignore current device contents
6280  exports.modify_host();
6281 
6282  auto exports_h = exports.view_host ();
6283  auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
6284 
6285  // Kokkos::deep_copy needs a Kokkos::View input, so turn
6286  // exports_a into a nonowning Kokkos::View first before copying.
6287  typedef typename exports_type::t_host::execution_space HES;
6288  typedef Kokkos::Device<HES, HostSpace> host_device_type;
6289  Kokkos::View<const char*, host_device_type>
6290  exports_a_kv (exports_a.getRawPtr (), newAllocSize);
6291  // DEEP_COPY REVIEW - NOT TESTED
6292  Kokkos::deep_copy (exports_h_sub, exports_a_kv);
6293  }
6294 
6295  if (debug) {
6296  int gblBad = 0; // output argument; to be set below
6297  reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
6298  if (gblBad != 0) {
6299  Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
6300  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6301  (true, std::logic_error, "packNew() or pack() threw an exception on "
6302  "one or more participating processes.");
6303  }
6304  }
6305  else {
6306  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6307  (lclBad != 0, std::logic_error, "packNew threw an exception on one "
6308  "or more participating processes. Here is this process' error "
6309  "message: " << msg.str ());
6310  }
6311 
6312  if (verbose) {
6313  std::ostringstream os;
6314  os << *prefix << "packAndPrepare: Done!" << endl
6315  << *prefix << " "
6316  << dualViewStatusToString (exportLIDs, "exportLIDs")
6317  << endl
6318  << *prefix << " "
6319  << dualViewStatusToString (exports, "exports")
6320  << endl
6321  << *prefix << " "
6322  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6323  << endl;
6324  std::cerr << os.str ();
6325  }
6326  }
6327 
6328  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6329  size_t
6330  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6331  packRow (char exports[],
6332  const size_t offset,
6333