Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_CRSMATRIX_DEF_HPP
11 #define TPETRA_CRSMATRIX_DEF_HPP
12 
20 
21 #include "Tpetra_Import_Util.hpp"
22 #include "Tpetra_Import_Util2.hpp"
23 #include "Tpetra_RowMatrix.hpp"
24 #include "Tpetra_LocalCrsMatrixOperator.hpp"
25 
32 #include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
40 #include "Tpetra_Details_packCrsMatrix.hpp"
41 #include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
43 #include "Teuchos_FancyOStream.hpp"
44 #include "Teuchos_RCP.hpp"
45 #include "Teuchos_DataAccess.hpp"
46 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
47 #include "KokkosBlas1_scal.hpp"
48 #include "KokkosSparse_getDiagCopy.hpp"
49 #include "KokkosSparse_spmv.hpp"
50 
51 #include <memory>
52 #include <sstream>
53 #include <typeinfo>
54 #include <utility>
55 #include <vector>
56 
57 namespace Tpetra {
58 
59 namespace { // (anonymous)
60 
61  template<class T, class BinaryFunction>
62  T atomic_binary_function_update (volatile T* const dest,
63  const T& inputVal,
64  BinaryFunction f)
65  {
66  T oldVal = *dest;
67  T assume;
68 
69  // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
70  // POWER architectures, because 'newval' depends on 'assume',
71  // which depends on 'oldVal', which depends on '*dest'. This
72  // sets up a chain of read dependencies that should ensure
73  // correct behavior given a sane memory model.
74  do {
75  assume = oldVal;
76  T newVal = f (assume, inputVal);
77  oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
78  } while (assume != oldVal);
79 
80  return oldVal;
81  }
82 } // namespace (anonymous)
83 
84 //
85 // Users must never rely on anything in the Details namespace.
86 //
87 namespace Details {
88 
98 template<class Scalar>
99 struct AbsMax {
101  Scalar operator() (const Scalar& x, const Scalar& y) {
102  typedef Teuchos::ScalarTraits<Scalar> STS;
103  return std::max (STS::magnitude (x), STS::magnitude (y));
104  }
105 };
106 
107 } // namespace Details
108 } // namespace Tpetra
109 
110 namespace Tpetra {
111 
112  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
113  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
114  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
115  size_t maxNumEntriesPerRow,
116  const Teuchos::RCP<Teuchos::ParameterList>& params) :
117  dist_object_type (rowMap)
118  {
119  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t "
120  "[, RCP<ParameterList>]): ";
121  Teuchos::RCP<crs_graph_type> graph;
122  try {
123  graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
124  params));
125  }
126  catch (std::exception& e) {
127  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
128  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
129  "size_t [, RCP<ParameterList>]) threw an exception: "
130  << e.what ());
131  }
132  // myGraph_ not null means that the matrix owns the graph. That's
133  // different than the const CrsGraph constructor, where the matrix
134  // does _not_ own the graph.
135  myGraph_ = graph;
136  staticGraph_ = myGraph_;
137  resumeFill (params);
139  }
140 
141  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
143  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
144  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
145  const Teuchos::RCP<Teuchos::ParameterList>& params) :
146  dist_object_type (rowMap)
147  {
148  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
149  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
150  Teuchos::RCP<crs_graph_type> graph;
151  try {
152  using Teuchos::rcp;
153  graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
154  params));
155  }
156  catch (std::exception& e) {
157  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
158  (true, std::runtime_error, "CrsGraph constructor "
159  "(RCP<const Map>, ArrayView<const size_t>"
160  "[, RCP<ParameterList>]) threw an exception: "
161  << e.what ());
162  }
163  // myGraph_ not null means that the matrix owns the graph. That's
164  // different than the const CrsGraph constructor, where the matrix
165  // does _not_ own the graph.
166  myGraph_ = graph;
167  staticGraph_ = graph;
168  resumeFill (params);
170  }
171 
172  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
174  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
175  const Teuchos::RCP<const map_type>& colMap,
176  const size_t maxNumEntPerRow,
177  const Teuchos::RCP<Teuchos::ParameterList>& params) :
178  dist_object_type (rowMap)
179  {
180  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
181  "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
182  const char suffix[] =
183  " Please report this bug to the Tpetra developers.";
184 
185  // An artifact of debugging something a while back.
186  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
187  (! staticGraph_.is_null (), std::logic_error,
188  "staticGraph_ is not null at the beginning of the constructor."
189  << suffix);
190  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
191  (! myGraph_.is_null (), std::logic_error,
192  "myGraph_ is not null at the beginning of the constructor."
193  << suffix);
194  Teuchos::RCP<crs_graph_type> graph;
195  try {
196  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
197  maxNumEntPerRow,
198  params));
199  }
200  catch (std::exception& e) {
201  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
202  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
203  "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
204  "exception: " << e.what ());
205  }
206  // myGraph_ not null means that the matrix owns the graph. That's
207  // different than the const CrsGraph constructor, where the matrix
208  // does _not_ own the graph.
209  myGraph_ = graph;
210  staticGraph_ = myGraph_;
211  resumeFill (params);
213  }
214 
215  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
217  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
218  const Teuchos::RCP<const map_type>& colMap,
219  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
220  const Teuchos::RCP<Teuchos::ParameterList>& params) :
221  dist_object_type (rowMap)
222  {
223  const char tfecfFuncName[] =
224  "CrsMatrix(RCP<const Map>, RCP<const Map>, "
225  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
226  Teuchos::RCP<crs_graph_type> graph;
227  try {
228  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
229  numEntPerRowToAlloc,
230  params));
231  }
232  catch (std::exception& e) {
233  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
234  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
235  "RCP<const Map>, ArrayView<const size_t>[, "
236  "RCP<ParameterList>]) threw an exception: " << e.what ());
237  }
238  // myGraph_ not null means that the matrix owns the graph. That's
239  // different than the const CrsGraph constructor, where the matrix
240  // does _not_ own the graph.
241  myGraph_ = graph;
242  staticGraph_ = graph;
243  resumeFill (params);
245  }
246 
247 
248  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
250  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
251  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
252  dist_object_type (graph->getRowMap ()),
253  staticGraph_ (graph),
254  storageStatus_ (Details::STORAGE_1D_PACKED)
255  {
256  using std::endl;
257  typedef typename local_matrix_device_type::values_type values_type;
258  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
259  "RCP<ParameterList>]): ";
260  const bool verbose = Details::Behavior::verbose("CrsMatrix");
261 
262  std::unique_ptr<std::string> prefix;
263  if (verbose) {
264  prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
265  std::ostringstream os;
266  os << *prefix << "Start" << endl;
267  std::cerr << os.str ();
268  }
269 
270  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
271  (graph.is_null (), std::runtime_error, "Input graph is null.");
272  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
273  (! graph->isFillComplete (), std::runtime_error, "Input graph "
274  "is not fill complete. You must call fillComplete on the "
275  "graph before using it to construct a CrsMatrix. Note that "
276  "calling resumeFill on the graph makes it not fill complete, "
277  "even if you had previously called fillComplete. In that "
278  "case, you must call fillComplete on the graph again.");
279 
280  // The graph is fill complete, so it is locally indexed and has a
281  // fixed structure. This means we can allocate the (1-D) array of
282  // values and build the local matrix right now. Note that the
283  // local matrix's number of columns comes from the column Map, not
284  // the domain Map.
285 
286  const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
287  if (verbose) {
288  std::ostringstream os;
289  os << *prefix << "Allocate values: " << numEnt << endl;
290  std::cerr << os.str ();
291  }
292 
293  values_type val ("Tpetra::CrsMatrix::values", numEnt);
294  valuesPacked_wdv = values_wdv_type(val);
295  valuesUnpacked_wdv = valuesPacked_wdv;
296 
298 
299  if (verbose) {
300  std::ostringstream os;
301  os << *prefix << "Done" << endl;
302  std::cerr << os.str ();
303  }
304  }
305 
306  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
309  const Teuchos::RCP<const crs_graph_type>& graph,
310  const Teuchos::RCP<Teuchos::ParameterList>& params) :
311  dist_object_type (graph->getRowMap ()),
312  staticGraph_ (graph),
313  storageStatus_ (matrix.storageStatus_)
314  {
315  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
316  "local_matrix_device_type::values_type, "
317  "[,RCP<ParameterList>]): ";
318  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
319  (graph.is_null (), std::runtime_error, "Input graph is null.");
320  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
321  (! graph->isFillComplete (), std::runtime_error, "Input graph "
322  "is not fill complete. You must call fillComplete on the "
323  "graph before using it to construct a CrsMatrix. Note that "
324  "calling resumeFill on the graph makes it not fill complete, "
325  "even if you had previously called fillComplete. In that "
326  "case, you must call fillComplete on the graph again.");
327 
328  size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
329  valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
330 
331  size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
332  valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
333 
335  }
336 
337 
338  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
340  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
341  const typename local_matrix_device_type::values_type& values,
342  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
343  dist_object_type (graph->getRowMap ()),
344  staticGraph_ (graph),
345  storageStatus_ (Details::STORAGE_1D_PACKED)
346  {
347  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
348  "local_matrix_device_type::values_type, "
349  "[,RCP<ParameterList>]): ";
350  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
351  (graph.is_null (), std::runtime_error, "Input graph is null.");
352  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
353  (! graph->isFillComplete (), std::runtime_error, "Input graph "
354  "is not fill complete. You must call fillComplete on the "
355  "graph before using it to construct a CrsMatrix. Note that "
356  "calling resumeFill on the graph makes it not fill complete, "
357  "even if you had previously called fillComplete. In that "
358  "case, you must call fillComplete on the graph again.");
359 
360  // The graph is fill complete, so it is locally indexed and has a
361  // fixed structure. This means we can allocate the (1-D) array of
362  // values and build the local matrix right now. Note that the
363  // local matrix's number of columns comes from the column Map, not
364  // the domain Map.
365 
366  valuesPacked_wdv = values_wdv_type(values);
367  valuesUnpacked_wdv = valuesPacked_wdv;
368 
369  // FIXME (22 Jun 2016) I would very much like to get rid of
370  // k_values1D_ at some point. I find it confusing to have all
371  // these extra references lying around.
372  // KDDKDD ALMOST THERE, MARK!
373 // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
374 
376  }
377 
378  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
380  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
381  const Teuchos::RCP<const map_type>& colMap,
382  const typename local_graph_device_type::row_map_type& rowPointers,
383  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
384  const typename local_matrix_device_type::values_type& values,
385  const Teuchos::RCP<Teuchos::ParameterList>& params) :
386  dist_object_type (rowMap),
387  storageStatus_ (Details::STORAGE_1D_PACKED)
388  {
389  using Details::getEntryOnHost;
390  using Teuchos::RCP;
391  using std::endl;
392  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
393  "RCP<const Map>, ptr, ind, val[, params]): ";
394  const char suffix[] =
395  ". Please report this bug to the Tpetra developers.";
396  const bool debug = Details::Behavior::debug("CrsMatrix");
397  const bool verbose = Details::Behavior::verbose("CrsMatrix");
398 
399  std::unique_ptr<std::string> prefix;
400  if (verbose) {
401  prefix = this->createPrefix(
402  "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
403  std::ostringstream os;
404  os << *prefix << "Start" << endl;
405  std::cerr << os.str ();
406  }
407 
408  // Check the user's input. Note that this might throw only on
409  // some processes but not others, causing deadlock. We prefer
410  // deadlock due to exceptions to segfaults, because users can
411  // catch exceptions.
412  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
413  (values.extent(0) != columnIndices.extent(0),
414  std::invalid_argument, "values.extent(0)=" << values.extent(0)
415  << " != columnIndices.extent(0) = " << columnIndices.extent(0)
416  << ".");
417  if (debug && rowPointers.extent(0) != 0) {
418  const size_t numEnt =
419  getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
420  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
421  (numEnt != size_t(columnIndices.extent(0)) ||
422  numEnt != size_t(values.extent(0)),
423  std::invalid_argument, "Last entry of rowPointers says that "
424  "the matrix has " << numEnt << " entr"
425  << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
426  "columnIndices and values don't match this. "
427  "columnIndices.extent(0)=" << columnIndices.extent (0)
428  << " and values.extent(0)=" << values.extent (0) << ".");
429  }
430 
431  RCP<crs_graph_type> graph;
432  try {
433  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
434  columnIndices, params));
435  }
436  catch (std::exception& e) {
437  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
438  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
439  "RCP<const Map>, ptr, ind[, params]) threw an exception: "
440  << e.what ());
441  }
442  // The newly created CrsGraph _must_ have a local graph at this
443  // point. We don't really care whether CrsGraph's constructor
444  // deep-copies or shallow-copies the input, but the dimensions
445  // have to be right. That's how we tell whether the CrsGraph has
446  // a local graph.
447  auto lclGraph = graph->getLocalGraphDevice ();
448  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
449  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
450  lclGraph.entries.extent (0) != columnIndices.extent (0),
451  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
452  "ind[, params]) did not set the local graph correctly." << suffix);
453  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
454  (lclGraph.entries.extent (0) != values.extent (0),
455  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
456  "params]) did not set the local graph correctly. "
457  "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
458  << " != values.extent(0) = " << values.extent (0) << suffix);
459 
460  // myGraph_ not null means that the matrix owns the graph. This
461  // is true because the column indices come in as nonconst,
462  // implying shared ownership.
463  myGraph_ = graph;
464  staticGraph_ = graph;
465 
466  // The graph may not be fill complete yet. However, it is locally
467  // indexed (since we have a column Map) and has a fixed structure
468  // (due to the input arrays). This means we can allocate the
469  // (1-D) array of values and build the local matrix right now.
470  // Note that the local matrix's number of columns comes from the
471  // column Map, not the domain Map.
472 
473  valuesPacked_wdv = values_wdv_type(values);
474  valuesUnpacked_wdv = valuesPacked_wdv;
475 
476  // FIXME (22 Jun 2016) I would very much like to get rid of
477  // k_values1D_ at some point. I find it confusing to have all
478  // these extra references lying around.
479 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
480 
482  if (verbose) {
483  std::ostringstream os;
484  os << *prefix << "Done" << endl;
485  std::cerr << os.str();
486  }
487  }
488 
489  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
491  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
492  const Teuchos::RCP<const map_type>& colMap,
493  const Teuchos::ArrayRCP<size_t>& ptr,
494  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
495  const Teuchos::ArrayRCP<Scalar>& val,
496  const Teuchos::RCP<Teuchos::ParameterList>& params) :
497  dist_object_type (rowMap),
498  storageStatus_ (Details::STORAGE_1D_PACKED)
499  {
500  using Kokkos::Compat::getKokkosViewDeepCopy;
501  using Teuchos::av_reinterpret_cast;
502  using Teuchos::RCP;
503  using values_type = typename local_matrix_device_type::values_type;
504  using IST = impl_scalar_type;
505  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
506  "RCP<const Map>, ptr, ind, val[, params]): ";
507 
508  RCP<crs_graph_type> graph;
509  try {
510  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
511  ind, params));
512  }
513  catch (std::exception& e) {
514  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
515  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
516  "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
517  "RCP<ParameterList>]) threw an exception: " << e.what ());
518  }
519  // myGraph_ not null means that the matrix owns the graph. This
520  // is true because the column indices come in as nonconst,
521  // implying shared ownership.
522  myGraph_ = graph;
523  staticGraph_ = graph;
524 
525  // The graph may not be fill complete yet. However, it is locally
526  // indexed (since we have a column Map) and has a fixed structure
527  // (due to the input arrays). This means we can allocate the
528  // (1-D) array of values and build the local matrix right now.
529  // Note that the local matrix's number of columns comes from the
530  // column Map, not the domain Map.
531 
532  // The graph _must_ have a local graph at this point. We don't
533  // really care whether CrsGraph's constructor deep-copies or
534  // shallow-copies the input, but the dimensions have to be right.
535  // That's how we tell whether the CrsGraph has a local graph.
536  auto lclGraph = staticGraph_->getLocalGraphDevice ();
537  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
538  (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
539  size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
540  std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
541  "ptr, ind[, params]) did not set the local graph correctly. "
542  "Please report this bug to the Tpetra developers.");
543 
544  values_type valIn =
545  getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
546  valuesPacked_wdv = values_wdv_type(valIn);
547  valuesUnpacked_wdv = valuesPacked_wdv;
548 
549  // FIXME (22 Jun 2016) I would very much like to get rid of
550  // k_values1D_ at some point. I find it confusing to have all
551  // these extra references lying around.
552 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
553 
555  }
556 
557  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
559  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
560  const Teuchos::RCP<const map_type>& colMap,
561  const local_matrix_device_type& lclMatrix,
562  const Teuchos::RCP<Teuchos::ParameterList>& params) :
563  dist_object_type (rowMap),
564  storageStatus_ (Details::STORAGE_1D_PACKED),
565  fillComplete_ (true)
566  {
567  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
568  "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
569  const char suffix[] =
570  " Please report this bug to the Tpetra developers.";
571 
572  Teuchos::RCP<crs_graph_type> graph;
573  try {
574  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
575  lclMatrix.graph, params));
576  }
577  catch (std::exception& e) {
578  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
579  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
580  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
581  "exception: " << e.what ());
582  }
583  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
584  (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
585  "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
586  "did not produce a fill-complete graph. Please report this bug to the "
587  "Tpetra developers.");
588  // myGraph_ not null means that the matrix owns the graph. This
589  // is true because the column indices come in as nonconst through
590  // the matrix, implying shared ownership.
591  myGraph_ = graph;
592  staticGraph_ = graph;
593 
594  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
595  valuesUnpacked_wdv = valuesPacked_wdv;
596 
597  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
598  (isFillActive (), std::logic_error,
599  "At the end of a CrsMatrix constructor that should produce "
600  "a fillComplete matrix, isFillActive() is true." << suffix);
601  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
602  (! isFillComplete (), std::logic_error, "At the end of a "
603  "CrsMatrix constructor that should produce a fillComplete "
604  "matrix, isFillComplete() is false." << suffix);
606  }
607 
608  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
611  const Teuchos::RCP<const map_type>& rowMap,
612  const Teuchos::RCP<const map_type>& colMap,
613  const Teuchos::RCP<const map_type>& domainMap,
614  const Teuchos::RCP<const map_type>& rangeMap,
615  const Teuchos::RCP<Teuchos::ParameterList>& params) :
616  dist_object_type (rowMap),
617  storageStatus_ (Details::STORAGE_1D_PACKED),
618  fillComplete_ (true)
619  {
620  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
621  "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
622  "local_matrix_device_type[, RCP<ParameterList>]): ";
623  const char suffix[] =
624  " Please report this bug to the Tpetra developers.";
625 
626  Teuchos::RCP<crs_graph_type> graph;
627  try {
628  graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
629  domainMap, rangeMap, params));
630  }
631  catch (std::exception& e) {
632  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
633  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
634  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
635  "RCP<ParameterList>]) threw an exception: " << e.what ());
636  }
637  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
638  (! graph->isFillComplete (), std::logic_error, "CrsGraph "
639  "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
640  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
641  "not produce a fillComplete graph." << suffix);
642  // myGraph_ not null means that the matrix owns the graph. This
643  // is true because the column indices come in as nonconst through
644  // the matrix, implying shared ownership.
645  myGraph_ = graph;
646  staticGraph_ = graph;
647 
648  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
649  valuesUnpacked_wdv = valuesPacked_wdv;
650 
651  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
652  (isFillActive (), std::logic_error,
653  "At the end of a CrsMatrix constructor that should produce "
654  "a fillComplete matrix, isFillActive() is true." << suffix);
655  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
656  (! isFillComplete (), std::logic_error, "At the end of a "
657  "CrsMatrix constructor that should produce a fillComplete "
658  "matrix, isFillComplete() is false." << suffix);
660  }
661 
662  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
665  const Teuchos::RCP<const map_type>& rowMap,
666  const Teuchos::RCP<const map_type>& colMap,
667  const Teuchos::RCP<const map_type>& domainMap,
668  const Teuchos::RCP<const map_type>& rangeMap,
669  const Teuchos::RCP<const import_type>& importer,
670  const Teuchos::RCP<const export_type>& exporter,
671  const Teuchos::RCP<Teuchos::ParameterList>& params) :
672  dist_object_type (rowMap),
673  storageStatus_ (Details::STORAGE_1D_PACKED),
674  fillComplete_ (true)
675  {
676  using Teuchos::rcp;
677  const char tfecfFuncName[] = "Tpetra::CrsMatrix"
678  "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
679  const char suffix[] =
680  " Please report this bug to the Tpetra developers.";
681 
682  Teuchos::RCP<crs_graph_type> graph;
683  try {
684  graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
685  domainMap, rangeMap, importer,
686  exporter, params));
687  }
688  catch (std::exception& e) {
689  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
690  (true, std::runtime_error, "CrsGraph constructor "
691  "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
692  "params) threw: " << e.what ());
693  }
694  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
695  (!graph->isFillComplete (), std::logic_error, "CrsGraph "
696  "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
697  "Export, params) did not produce a fill-complete graph. "
698  "Please report this bug to the Tpetra developers.");
699  // myGraph_ not null means that the matrix owns the graph. This
700  // is true because the column indices come in as nonconst through
701  // the matrix, implying shared ownership.
702  myGraph_ = graph;
703  staticGraph_ = graph;
704 
705  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
706  valuesUnpacked_wdv = valuesPacked_wdv;
707 
708  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
709  (isFillActive (), std::logic_error,
710  "At the end of a CrsMatrix constructor that should produce "
711  "a fillComplete matrix, isFillActive() is true." << suffix);
712  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
713  (! isFillComplete (), std::logic_error, "At the end of a "
714  "CrsMatrix constructor that should produce a fillComplete "
715  "matrix, isFillComplete() is false." << suffix);
717  }
718 
719  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
722  const Teuchos::DataAccess copyOrView):
723  dist_object_type (source.getCrsGraph()->getRowMap ()),
724  staticGraph_ (source.getCrsGraph()),
725  storageStatus_ (source.storageStatus_)
726  {
727  const char tfecfFuncName[] = "Tpetra::CrsMatrix("
728  "const CrsMatrix&, const Teuchos::DataAccess): ";
729  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
730  (! source.isFillComplete (), std::invalid_argument,
731  "Source graph must be fillComplete().");
732 
733  if (copyOrView == Teuchos::Copy) {
734  using values_type = typename local_matrix_device_type::values_type;
735  auto vals = source.getLocalValuesDevice (Access::ReadOnly);
736  using Kokkos::view_alloc;
737  using Kokkos::WithoutInitializing;
738  values_type newvals (view_alloc ("val", WithoutInitializing),
739  vals.extent (0));
740  // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
741  Kokkos::deep_copy (newvals, vals);
742  valuesPacked_wdv = values_wdv_type(newvals);
743  valuesUnpacked_wdv = valuesPacked_wdv;
744  fillComplete (source.getDomainMap (), source.getRangeMap ());
745  }
746  else if (copyOrView == Teuchos::View) {
747  valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
748  valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
749  fillComplete (source.getDomainMap (), source.getRangeMap ());
750  }
751  else {
752  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
753  (true, std::invalid_argument, "Second argument 'copyOrView' "
754  "has an invalid value " << copyOrView << ". Valid values "
755  "include Teuchos::Copy = " << Teuchos::Copy << " and "
756  "Teuchos::View = " << Teuchos::View << ".");
757  }
759  }
760 
761  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
762  void
765  {
766  std::swap(crs_matrix.importMV_, this->importMV_);
767  std::swap(crs_matrix.exportMV_, this->exportMV_);
768  std::swap(crs_matrix.staticGraph_, this->staticGraph_);
769  std::swap(crs_matrix.myGraph_, this->myGraph_);
770  std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
771  std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
772  std::swap(crs_matrix.storageStatus_, this->storageStatus_);
773  std::swap(crs_matrix.fillComplete_, this->fillComplete_);
774  std::swap(crs_matrix.nonlocals_, this->nonlocals_);
775  }
776 
777  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
778  Teuchos::RCP<const Teuchos::Comm<int> >
780  getComm () const {
781  return getCrsGraphRef ().getComm ();
782  }
783 
784  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
785  bool
787  isFillComplete () const {
788  return fillComplete_;
789  }
790 
791  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
792  bool
794  isFillActive () const {
795  return ! fillComplete_;
796  }
797 
798  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
799  bool
802  return this->getCrsGraphRef ().isStorageOptimized ();
803  }
804 
805  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
806  bool
809  return getCrsGraphRef ().isLocallyIndexed ();
810  }
811 
812  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
813  bool
816  return getCrsGraphRef ().isGloballyIndexed ();
817  }
818 
819  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
820  bool
822  hasColMap () const {
823  return getCrsGraphRef ().hasColMap ();
824  }
825 
826  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
830  return getCrsGraphRef ().getGlobalNumEntries ();
831  }
832 
833  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
834  size_t
837  return getCrsGraphRef ().getLocalNumEntries ();
838  }
839 
840  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
844  return getCrsGraphRef ().getGlobalNumRows ();
845  }
846 
847  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
851  return getCrsGraphRef ().getGlobalNumCols ();
852  }
853 
854  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
855  size_t
858  return getCrsGraphRef ().getLocalNumRows ();
859  }
860 
861 
862  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
863  size_t
866  return getCrsGraphRef ().getLocalNumCols ();
867  }
868 
869 
870  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
871  size_t
873  getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
874  return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
875  }
876 
877  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
878  size_t
880  getNumEntriesInLocalRow (LocalOrdinal localRow) const {
881  return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
882  }
883 
884  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
885  size_t
888  return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
889  }
890 
891  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
892  size_t
895  return getCrsGraphRef ().getLocalMaxNumRowEntries ();
896  }
897 
898  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
899  GlobalOrdinal
901  getIndexBase () const {
902  return getRowMap ()->getIndexBase ();
903  }
904 
905  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
906  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
908  getRowMap () const {
909  return getCrsGraphRef ().getRowMap ();
910  }
911 
912  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
913  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
915  getColMap () const {
916  return getCrsGraphRef ().getColMap ();
917  }
918 
919  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
920  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
922  getDomainMap () const {
923  return getCrsGraphRef ().getDomainMap ();
924  }
925 
926  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
927  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
929  getRangeMap () const {
930  return getCrsGraphRef ().getRangeMap ();
931  }
932 
933  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
934  Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
936  getGraph () const {
937  if (staticGraph_ != Teuchos::null) {
938  return staticGraph_;
939  }
940  return myGraph_;
941  }
942 
943  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
944  Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
946  getCrsGraph () const {
947  if (staticGraph_ != Teuchos::null) {
948  return staticGraph_;
949  }
950  return myGraph_;
951  }
952 
953  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
956  getCrsGraphRef () const
957  {
958 #ifdef HAVE_TPETRA_DEBUG
959  constexpr bool debug = true;
960 #else
961  constexpr bool debug = false;
962 #endif // HAVE_TPETRA_DEBUG
963 
964  if (! this->staticGraph_.is_null ()) {
965  return * (this->staticGraph_);
966  }
967  else {
968  if (debug) {
969  const char tfecfFuncName[] = "getCrsGraphRef: ";
970  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
971  (this->myGraph_.is_null (), std::logic_error,
972  "Both staticGraph_ and myGraph_ are null. "
973  "Please report this bug to the Tpetra developers.");
974  }
975  return * (this->myGraph_);
976  }
977  }
978 
979  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
980  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
983  {
984  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
985  return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
986  numCols,
987  valuesPacked_wdv.getDeviceView(Access::ReadWrite),
988  staticGraph_->getLocalGraphDevice());
989  }
990 
991  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
992  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
994  getLocalMatrixHost () const
995  {
996  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
997  return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
998  valuesPacked_wdv.getHostView(Access::ReadWrite),
999  staticGraph_->getLocalGraphHost());
1000  }
1001 
1002 #if KOKKOSKERNELS_VERSION < 40299
1003 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1004  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1005  std::shared_ptr<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_multiply_op_type>
1008  {
1009  auto localMatrix = getLocalMatrixDevice();
1010 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL)
1011  if(this->getLocalNumEntries() <= size_t(Teuchos::OrdinalTraits<LocalOrdinal>::max()))
1012  {
1013  if(this->ordinalRowptrs.data() == nullptr)
1014  {
1015  auto originalRowptrs = localMatrix.graph.row_map;
1016  //create LocalOrdinal-typed copy of the local graph's rowptrs.
1017  //This enables the LocalCrsMatrixOperator to use cuSPARSE SpMV.
1018  this->ordinalRowptrs = ordinal_rowptrs_type(
1019  Kokkos::ViewAllocateWithoutInitializing("CrsMatrix::ordinalRowptrs"), originalRowptrs.extent(0));
1020  auto ordinalRowptrs_ = this->ordinalRowptrs; //don't want to capture 'this'
1021  Kokkos::parallel_for("CrsMatrix::getLocalMultiplyOperator::convertRowptrs",
1022  Kokkos::RangePolicy<execution_space>(0, originalRowptrs.extent(0)),
1023  KOKKOS_LAMBDA(LocalOrdinal i)
1024  {
1025  ordinalRowptrs_(i) = originalRowptrs(i);
1026  });
1027  }
1028  //return local operator using ordinalRowptrs
1029  return std::make_shared<local_multiply_op_type>(
1030  std::make_shared<local_matrix_device_type>(localMatrix), this->ordinalRowptrs);
1031  }
1032 #endif
1033 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
1034  return std::make_shared<local_multiply_op_type>(
1035  std::make_shared<local_matrix_device_type>(localMatrix));
1036  }
1037 #endif
1038 
1039  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1040  bool
1042  isStaticGraph () const {
1043  return myGraph_.is_null ();
1044  }
1045 
1046  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1047  bool
1050  return true;
1051  }
1052 
1053  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1054  bool
1057  return true;
1058  }
1059 
1060  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1061  void
1063  allocateValues (ELocalGlobal lg, GraphAllocationStatus gas,
1064  const bool verbose)
1065  {
1066  using Details::Behavior;
1068  using std::endl;
1069  const char tfecfFuncName[] = "allocateValues: ";
1070  const char suffix[] =
1071  " Please report this bug to the Tpetra developers.";
1072  ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1073 
1074  std::unique_ptr<std::string> prefix;
1075  if (verbose) {
1076  prefix = this->createPrefix("CrsMatrix", "allocateValues");
1077  std::ostringstream os;
1078  os << *prefix << "lg: "
1079  << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1080  << ", gas: Graph"
1081  << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1082  << "Allocated" << endl;
1083  std::cerr << os.str();
1084  }
1085 
1086  const bool debug = Behavior::debug("CrsMatrix");
1087  if (debug) {
1088  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1089  (this->staticGraph_.is_null (), std::logic_error,
1090  "staticGraph_ is null." << suffix);
1091 
1092  // If the graph indices are already allocated, then gas should be
1093  // GraphAlreadyAllocated. Otherwise, gas should be
1094  // GraphNotYetAllocated.
1095  if ((gas == GraphAlreadyAllocated) !=
1096  staticGraph_->indicesAreAllocated ()) {
1097  const char err1[] = "The caller has asserted that the graph "
1098  "is ";
1099  const char err2[] = "already allocated, but the static graph "
1100  "says that its indices are ";
1101  const char err3[] = "already allocated. ";
1102  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1103  (gas == GraphAlreadyAllocated &&
1104  ! staticGraph_->indicesAreAllocated (), std::logic_error,
1105  err1 << err2 << "not " << err3 << suffix);
1106  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1107  (gas != GraphAlreadyAllocated &&
1108  staticGraph_->indicesAreAllocated (), std::logic_error,
1109  err1 << "not " << err2 << err3 << suffix);
1110  }
1111 
1112  // If the graph is unallocated, then it had better be a
1113  // matrix-owned graph. ("Matrix-owned graph" means that the
1114  // matrix gets to define the graph structure. If the CrsMatrix
1115  // constructor that takes an RCP<const CrsGraph> was used, then
1116  // the matrix does _not_ own the graph.)
1117  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1118  (! this->staticGraph_->indicesAreAllocated () &&
1119  this->myGraph_.is_null (), std::logic_error,
1120  "The static graph says that its indices are not allocated, "
1121  "but the graph is not owned by the matrix." << suffix);
1122  }
1123 
1124  if (gas == GraphNotYetAllocated) {
1125  if (debug) {
1126  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1127  (this->myGraph_.is_null (), std::logic_error,
1128  "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1129  }
1130  try {
1131  this->myGraph_->allocateIndices (lg, verbose);
1132  }
1133  catch (std::exception& e) {
1134  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1135  (true, std::runtime_error, "CrsGraph::allocateIndices "
1136  "threw an exception: " << e.what ());
1137  }
1138  catch (...) {
1139  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1140  (true, std::runtime_error, "CrsGraph::allocateIndices "
1141  "threw an exception not a subclass of std::exception.");
1142  }
1143  }
1144 
1145  // Allocate matrix values.
1146  const size_t lclTotalNumEntries = this->staticGraph_->getLocalAllocationSize();
1147  if (debug) {
1148  const size_t lclNumRows = this->staticGraph_->getLocalNumRows ();
1149  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1150  (this->staticGraph_->getRowPtrsUnpackedHost()(lclNumRows) != lclTotalNumEntries, std::logic_error,
1151  "length of staticGraph's lclIndsUnpacked does not match final entry of rowPtrsUnapcked_host." << suffix);
1152  }
1153 
1154  // Allocate array of (packed???) matrix values.
1155  using values_type = typename local_matrix_device_type::values_type;
1156  if (verbose) {
1157  std::ostringstream os;
1158  os << *prefix << "Allocate values_wdv: Pre "
1159  << valuesUnpacked_wdv.extent(0) << ", post "
1160  << lclTotalNumEntries << endl;
1161  std::cerr << os.str();
1162  }
1163 // this->k_values1D_ =
1164  valuesUnpacked_wdv = values_wdv_type(
1165  values_type("Tpetra::CrsMatrix::values",
1166  lclTotalNumEntries));
1167  }
1168 
1169 
1170  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1171  void
1173  fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1174  {
1176  using ::Tpetra::Details::getEntryOnHost;
1177  using Teuchos::arcp_const_cast;
1178  using Teuchos::Array;
1179  using Teuchos::ArrayRCP;
1180  using Teuchos::null;
1181  using Teuchos::RCP;
1182  using Teuchos::rcp;
1183  using std::endl;
1184  using row_map_type = typename local_graph_device_type::row_map_type;
1185  using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1186  using values_type = typename local_matrix_device_type::values_type;
1187  Details::ProfilingRegion regionFLGAM
1188  ("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1189 
1190  const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1191  "fillComplete or expertStaticFillComplete): ";
1192  const char suffix[] =
1193  " Please report this bug to the Tpetra developers.";
1194  const bool debug = Details::Behavior::debug("CrsMatrix");
1195  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1196 
1197  std::unique_ptr<std::string> prefix;
1198  if (verbose) {
1199  prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1200  std::ostringstream os;
1201  os << *prefix << endl;
1202  std::cerr << os.str ();
1203  }
1204 
1205  if (debug) {
1206  // fillComplete() only calls fillLocalGraphAndMatrix() if the
1207  // matrix owns the graph, which means myGraph_ is not null.
1208  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1209  (myGraph_.is_null (), std::logic_error, "The nonconst graph "
1210  "(myGraph_) is null. This means that the matrix has a "
1211  "const (a.k.a. \"static\") graph. fillComplete or "
1212  "expertStaticFillComplete should never call "
1213  "fillLocalGraphAndMatrix in that case." << suffix);
1214  }
1215 
1216  const size_t lclNumRows = this->getLocalNumRows ();
1217 
1218  // This method's goal is to fill in the three arrays (compressed
1219  // sparse row format) that define the sparse graph's and matrix's
1220  // structure, and the sparse matrix's values.
1221  //
1222  // Get references to the data in myGraph_, so we can modify them
1223  // as well. Note that we only call fillLocalGraphAndMatrix() if
1224  // the matrix owns the graph, which means myGraph_ is not null.
1225 
1226  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1227  // See: https://github.com/trilinos/Trilinos/issues/12237
1228  //using row_entries_type = decltype (myGraph_->k_numRowEntries_);
1229  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1230 
1231  typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1232  myGraph_->rowPtrsUnpacked_dev_;
1233 
1234  if (debug) {
1235  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1236  (curRowOffsets.extent (0) == 0, std::logic_error,
1237  "curRowOffsets.extent(0) == 0.");
1238  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1239  (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1240  "curRowOffsets.extent(0) = "
1241  << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1242  << (lclNumRows + 1) << ".");
1243  const size_t numOffsets = curRowOffsets.extent (0);
1244  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1245  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1246  (numOffsets != 0 &&
1247  myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
1248  std::logic_error, "numOffsets = " <<
1249  numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1250  << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
1251  << numOffsets << ") = " << valToCheck << ".");
1252  }
1253 
1254  if (myGraph_->getLocalNumEntries() !=
1255  myGraph_->getLocalAllocationSize()) {
1256 
1257  // Use the nonconst version of row_map_type for k_ptrs,
1258  // because row_map_type is const and we need to modify k_ptrs here.
1259  typename row_map_type::non_const_type k_ptrs;
1260  row_map_type k_ptrs_const;
1261  lclinds_1d_type k_inds;
1262  values_type k_vals;
1263 
1264  if (verbose) {
1265  std::ostringstream os;
1266  const auto numEnt = myGraph_->getLocalNumEntries();
1267  const auto allocSize = myGraph_->getLocalAllocationSize();
1268  os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1269  << ", allocSize=" << allocSize << endl;
1270  std::cerr << os.str ();
1271  }
1272  // The matrix's current 1-D storage is "unpacked." This means
1273  // the row offsets may differ from what the final row offsets
1274  // should be. This could happen, for example, if the user
1275  // set an upper
1276  // bound on the number of entries per row, but didn't fill all
1277  // those entries.
1278  if (debug && curRowOffsets.extent (0) != 0) {
1279  const size_t numOffsets =
1280  static_cast<size_t> (curRowOffsets.extent (0));
1281  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1282  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1283  (static_cast<size_t> (valToCheck) !=
1284  static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
1285  std::logic_error, "(unpacked branch) Before "
1286  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1287  << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1288  " = " << valuesUnpacked_wdv.extent (0) << ".");
1289  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1290  (static_cast<size_t> (valToCheck) !=
1291  static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
1292  std::logic_error, "(unpacked branch) Before "
1293  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1294  << ") = " << valToCheck
1295  << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1296  << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
1297  }
1298  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1299  // the array of valid entry counts per row.
1300 
1301  // Total number of entries in the matrix on the calling
1302  // process. We will compute this in the loop below. It's
1303  // cheap to compute and useful as a sanity check.
1304  size_t lclTotalNumEntries = 0;
1305  {
1306  // Allocate the packed row offsets array. We use a nonconst
1307  // temporary (packedRowOffsets) here, because k_ptrs is
1308  // const. We will assign packedRowOffsets to k_ptrs below.
1309  if (verbose) {
1310  std::ostringstream os;
1311  os << *prefix << "Allocate packed row offsets: "
1312  << (lclNumRows+1) << endl;
1313  std::cerr << os.str ();
1314  }
1315  typename row_map_type::non_const_type
1316  packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1317  typename row_entries_type::const_type numRowEnt_h =
1318  myGraph_->k_numRowEntries_;
1319  // We're computing offsets on device. This function can
1320  // handle numRowEnt_h being a host View.
1321  lclTotalNumEntries =
1322  computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1323  // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1324  // to use packedRowOffsets in the loop above and assign here.
1325  k_ptrs = packedRowOffsets;
1326  k_ptrs_const = k_ptrs;
1327  }
1328 
1329  if (debug) {
1330  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1331  (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1332  std::logic_error,
1333  "(unpacked branch) After packing k_ptrs, "
1334  "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1335  "lclNumRows+1 = " << (lclNumRows+1) << ".");
1336  const auto valToCheck = getEntryOnHost (k_ptrs, lclNumRows);
1337  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1338  (valToCheck != lclTotalNumEntries, std::logic_error,
1339  "(unpacked branch) After filling k_ptrs, "
1340  "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1341  << " != total number of entries on the calling process = "
1342  << lclTotalNumEntries << ".");
1343  }
1344 
1345  // Allocate the arrays of packed column indices and values.
1346  if (verbose) {
1347  std::ostringstream os;
1348  os << *prefix << "Allocate packed local column indices: "
1349  << lclTotalNumEntries << endl;
1350  std::cerr << os.str ();
1351  }
1352  k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1353  if (verbose) {
1354  std::ostringstream os;
1355  os << *prefix << "Allocate packed values: "
1356  << lclTotalNumEntries << endl;
1357  std::cerr << os.str ();
1358  }
1359  k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1360 
1361  // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1362  // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1363  // the packed row offsets array k_ptrs that we created above.
1364  //
1365  // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1366  // need to keep around the unpacked row offsets, column
1367  // indices, and values arrays.
1368 
1369  // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1370  // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1371  using inds_packer_type = pack_functor<
1372  typename Graph::local_graph_device_type::entries_type::non_const_type,
1373  typename Graph::local_inds_dualv_type::t_dev::const_type,
1374  typename Graph::local_graph_device_type::row_map_type::non_const_type,
1375  typename Graph::local_graph_device_type::row_map_type>;
1376  inds_packer_type indsPacker (
1377  k_inds,
1378  myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1379  k_ptrs, curRowOffsets);
1380  using exec_space = typename decltype (k_inds)::execution_space;
1381  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1382  Kokkos::parallel_for
1383  ("Tpetra::CrsMatrix pack column indices",
1384  range_type (0, lclNumRows), indsPacker);
1385 
1386  // Pack the values from unpacked valuesUnpacked_wdv into packed
1387  // k_vals. We will replace valuesPacked_wdv below.
1388  using vals_packer_type = pack_functor<
1389  typename values_type::non_const_type,
1390  typename values_type::const_type,
1391  typename row_map_type::non_const_type,
1392  typename row_map_type::const_type>;
1393  vals_packer_type valsPacker (
1394  k_vals,
1395  this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1396  k_ptrs, curRowOffsets);
1397  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1398  range_type (0, lclNumRows), valsPacker);
1399 
1400  if (debug) {
1401  const char myPrefix[] = "(\"Optimize Storage\""
1402  "=true branch) After packing, ";
1403  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1404  (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
1405  << "k_ptrs.extent(0) = 0. This probably means that "
1406  "rowPtrsUnpacked_ was never allocated.");
1407  if (k_ptrs.extent (0) != 0) {
1408  const size_t numOffsets (k_ptrs.extent (0));
1409  const auto valToCheck =
1410  getEntryOnHost (k_ptrs, numOffsets - 1);
1411  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1412  (size_t (valToCheck) != k_vals.extent (0),
1413  std::logic_error, myPrefix <<
1414  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1415  " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1416  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1417  (size_t (valToCheck) != k_inds.extent (0),
1418  std::logic_error, myPrefix <<
1419  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1420  " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1421  }
1422  }
1423  // Build the local graph.
1424  myGraph_->setRowPtrsPacked(k_ptrs_const);
1425  myGraph_->lclIndsPacked_wdv =
1426  typename crs_graph_type::local_inds_wdv_type(k_inds);
1427  valuesPacked_wdv = values_wdv_type(k_vals);
1428  }
1429  else { // We don't have to pack, so just set the pointers.
1430  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1431  // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
1432  myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1433  myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1434  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1435  myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1436  valuesPacked_wdv = valuesUnpacked_wdv;
1437 
1438  if (verbose) {
1439  std::ostringstream os;
1440  os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1441  << myGraph_->getRowPtrsUnpackedHost().extent(0) << ", lclIndsUnpacked_wdv: "
1442  << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1443  << valuesUnpacked_wdv.extent(0) << endl;
1444  std::cerr << os.str();
1445  }
1446 
1447  if (debug) {
1448  const char myPrefix[] =
1449  "(\"Optimize Storage\"=false branch) ";
1450  auto rowPtrsUnpackedHost = myGraph_->getRowPtrsUnpackedHost();
1451  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1452  (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
1453  << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1454  "that rowPtrsUnpacked_ was never allocated.");
1455  if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
1456  const size_t numOffsets = rowPtrsUnpackedHost.extent (0);
1457  const auto valToCheck = rowPtrsUnpackedHost(numOffsets - 1);
1458  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1459  (size_t (valToCheck) != valuesPacked_wdv.extent (0),
1460  std::logic_error, myPrefix <<
1461  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1462  << " != valuesPacked_wdv.extent(0) = "
1463  << valuesPacked_wdv.extent (0) << ".");
1464  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1465  (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
1466  std::logic_error, myPrefix <<
1467  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1468  << " != myGraph_->lclIndsPacked.extent(0) = "
1469  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1470  }
1471  }
1472  }
1473 
1474  if (debug) {
1475  const char myPrefix[] = "After packing, ";
1476  auto rowPtrsPackedHost = myGraph_->getRowPtrsPackedHost();
1477  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1478  (size_t (rowPtrsPackedHost.extent (0)) != size_t (lclNumRows + 1),
1479  std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = "
1480  << rowPtrsPackedHost.extent (0) << " != lclNumRows+1 = " <<
1481  (lclNumRows+1) << ".");
1482  if (rowPtrsPackedHost.extent (0) != 0) {
1483  const size_t numOffsets (rowPtrsPackedHost.extent (0));
1484  const size_t valToCheck = rowPtrsPackedHost(numOffsets-1);
1485  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1486  (valToCheck != size_t (valuesPacked_wdv.extent (0)),
1487  std::logic_error, myPrefix << "k_ptrs_const(" <<
1488  (numOffsets-1) << ") = " << valToCheck
1489  << " != valuesPacked_wdv.extent(0) = "
1490  << valuesPacked_wdv.extent (0) << ".");
1491  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1492  (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
1493  std::logic_error, myPrefix << "k_ptrs_const(" <<
1494  (numOffsets-1) << ") = " << valToCheck
1495  << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = "
1496  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1497  }
1498  }
1499 
1500  // May we ditch the old allocations for the packed (and otherwise
1501  // "optimized") allocations, later in this routine? Optimize
1502  // storage if the graph is not static, or if the graph already has
1503  // optimized storage.
1504  const bool defaultOptStorage =
1505  ! isStaticGraph () || staticGraph_->isStorageOptimized ();
1506  const bool requestOptimizedStorage =
1507  (! params.is_null () &&
1508  params->get ("Optimize Storage", defaultOptStorage)) ||
1509  (params.is_null () && defaultOptStorage);
1510 
1511  // The graph has optimized storage when indices are allocated,
1512  // myGraph_->k_numRowEntries_ is empty, and there are more than
1513  // zero rows on this process.
1514  if (requestOptimizedStorage) {
1515  // Free the old, unpacked, unoptimized allocations.
1516  // Free graph data structures that are only needed for
1517  // unpacked 1-D storage.
1518  if (verbose) {
1519  std::ostringstream os;
1520  os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1521  << myGraph_->k_numRowEntries_.extent(0) << endl;
1522  std::cerr << os.str();
1523  }
1524 
1525  myGraph_->k_numRowEntries_ = row_entries_type ();
1526 
1527  // Keep the new 1-D packed allocations.
1528  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1529  // We directly set the memory spaces to avoid a memcpy from device to host
1530  myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1531  myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1532  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1533  myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1534  valuesUnpacked_wdv = valuesPacked_wdv;
1535 
1536  myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1537  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1538  }
1539  else {
1540  if (verbose) {
1541  std::ostringstream os;
1542  os << *prefix << "User requested NOT to optimize storage"
1543  << endl;
1544  std::cerr << os.str();
1545  }
1546  }
1547  }
1548 
1549  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1550  void
1552  fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1553  {
1554  using ::Tpetra::Details::ProfilingRegion;
1555  using Teuchos::ArrayRCP;
1556  using Teuchos::Array;
1557  using Teuchos::null;
1558  using Teuchos::RCP;
1559  using Teuchos::rcp;
1560  using std::endl;
1561  using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1562  using non_const_row_map_type = typename row_map_type::non_const_type;
1563  using values_type = typename local_matrix_device_type::values_type;
1564  ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1565  const size_t lclNumRows = getLocalNumRows();
1566 
1567  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1568  std::unique_ptr<std::string> prefix;
1569  if (verbose) {
1570  prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1571  std::ostringstream os;
1572  os << *prefix << "lclNumRows: " << lclNumRows << endl;
1573  std::cerr << os.str ();
1574  }
1575 
1576  // The goals of this routine are first, to allocate and fill
1577  // packed 1-D storage (see below for an explanation) in the vals
1578  // array, and second, to give vals to the local matrix and
1579  // finalize the local matrix. We only need k_ptrs, the packed 1-D
1580  // row offsets, within the scope of this routine, since we're only
1581  // filling the local matrix here (use fillLocalGraphAndMatrix() to
1582  // fill both the graph and the matrix at the same time).
1583 
1584  // get data from staticGraph_
1585  size_t nodeNumEntries = staticGraph_->getLocalNumEntries ();
1586  size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize ();
1587  row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1588 
1589  row_map_type k_ptrs; // "packed" row offsets array
1590  values_type k_vals; // "packed" values array
1591 
1592  // May we ditch the old allocations for the packed (and otherwise
1593  // "optimized") allocations, later in this routine? Request
1594  // optimized storage by default.
1595  bool requestOptimizedStorage = true;
1596  const bool default_OptimizeStorage =
1597  ! isStaticGraph() || staticGraph_->isStorageOptimized();
1598  if (! params.is_null() &&
1599  ! params->get("Optimize Storage", default_OptimizeStorage)) {
1600  requestOptimizedStorage = false;
1601  }
1602  // If we're not allowed to change a static graph, then we can't
1603  // change the storage of the matrix, either. This means that if
1604  // the graph's storage isn't already optimized, we can't optimize
1605  // the matrix's storage either. Check and give warning, as
1606  // appropriate.
1607  if (! staticGraph_->isStorageOptimized () &&
1608  requestOptimizedStorage) {
1610  (true, std::runtime_error, "You requested optimized storage "
1611  "by setting the \"Optimize Storage\" flag to \"true\" in "
1612  "the ParameterList, or by virtue of default behavior. "
1613  "However, the associated CrsGraph was filled separately and "
1614  "requested not to optimize storage. Therefore, the "
1615  "CrsMatrix cannot optimize storage.");
1616  requestOptimizedStorage = false;
1617  }
1618 
1619  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1620  // See: https://github.com/trilinos/Trilinos/issues/12237
1621  //using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1622  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1623 
1624  // The matrix's values are currently
1625  // stored in a 1-D format. However, this format is "unpacked";
1626  // it doesn't necessarily have the same row offsets as indicated
1627  // by the ptrs array returned by allocRowPtrs. This could
1628  // happen, for example, if the user
1629  // fixed the number of matrix entries in
1630  // each row, but didn't fill all those entries.
1631  //
1632  // As above, we don't need to keep the "packed" row offsets
1633  // array ptrs here, but we do need it here temporarily, so we
1634  // have to allocate it. We'll free ptrs later in this method.
1635  //
1636  // Note that this routine checks whether storage has already
1637  // been packed. This is a common case for solution of nonlinear
1638  // PDEs using the finite element method, as long as the
1639  // structure of the sparse matrix does not change between linear
1640  // solves.
1641  if (nodeNumEntries != nodeNumAllocated) {
1642  if (verbose) {
1643  std::ostringstream os;
1644  os << *prefix << "Unpacked 1-D storage: numEnt="
1645  << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1646  << endl;
1647  std::cerr << os.str();
1648  }
1649  // We have to pack the 1-D storage, since the user didn't fill
1650  // up all requested storage.
1651  if (verbose) {
1652  std::ostringstream os;
1653  os << *prefix << "Allocate packed row offsets: "
1654  << (lclNumRows+1) << endl;
1655  std::cerr << os.str();
1656  }
1657  non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1658  lclNumRows+1);
1659  // Total number of entries in the matrix on the calling
1660  // process. We will compute this in the loop below. It's
1661  // cheap to compute and useful as a sanity check.
1662  size_t lclTotalNumEntries = 0;
1663  k_ptrs = tmpk_ptrs;
1664  {
1665  typename row_entries_type::const_type numRowEnt_h =
1666  staticGraph_->k_numRowEntries_;
1667  // This function can handle the counts being a host View.
1668  lclTotalNumEntries =
1669  Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
1670  }
1671 
1672  // Allocate the "packed" values array.
1673  // It has exactly the right number of entries.
1674  if (verbose) {
1675  std::ostringstream os;
1676  os << *prefix << "Allocate packed values: "
1677  << lclTotalNumEntries << endl;
1678  std::cerr << os.str ();
1679  }
1680  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1681 
1682  // Pack values_wdv into k_vals. We will replace values_wdv below.
1683  pack_functor<
1684  typename values_type::non_const_type,
1685  typename values_type::const_type,
1686  typename row_map_type::non_const_type,
1687  typename row_map_type::const_type> valsPacker
1688  (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1689  tmpk_ptrs, k_rowPtrs);
1690 
1691  using exec_space = typename decltype (k_vals)::execution_space;
1692  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1693  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1694  range_type (0, lclNumRows), valsPacker);
1695  valuesPacked_wdv = values_wdv_type(k_vals);
1696  }
1697  else { // We don't have to pack, so just set the pointer.
1698  valuesPacked_wdv = valuesUnpacked_wdv;
1699  if (verbose) {
1700  std::ostringstream os;
1701  os << *prefix << "Storage already packed: "
1702  << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1703  std::cerr << os.str();
1704  }
1705  }
1706 
1707  // May we ditch the old allocations for the packed one?
1708  if (requestOptimizedStorage) {
1709  // The user requested optimized storage, so we can dump the
1710  // unpacked 1-D storage, and keep the packed storage.
1711  valuesUnpacked_wdv = valuesPacked_wdv;
1712 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1713  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1714  }
1715  }
1716 
1717  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1718  void
1720  insertIndicesAndValues (crs_graph_type& graph,
1721  RowInfo& rowInfo,
1722  const typename crs_graph_type::SLocalGlobalViews& newInds,
1723  const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1724  const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1725  const ELocalGlobal lg,
1726  const ELocalGlobal I)
1727  {
1728  const size_t oldNumEnt = rowInfo.numEntries;
1729  const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
1730 
1731  // Use of memcpy here works around an issue with GCC >= 4.9.0,
1732  // that probably relates to scalar_type vs. impl_scalar_type
1733  // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1734  // details; look for GCC_WORKAROUND macro definition.
1735  if (numInserted > 0) {
1736  const size_t startOffset = oldNumEnt;
1737  memcpy ((void*) &oldRowVals[startOffset], &newRowVals[0],
1738  numInserted * sizeof (impl_scalar_type));
1739  }
1740  }
1741 
1742  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1743  void
1745  insertLocalValues (const LocalOrdinal lclRow,
1746  const Teuchos::ArrayView<const LocalOrdinal>& indices,
1747  const Teuchos::ArrayView<const Scalar>& values,
1748  const CombineMode CM)
1749  {
1750  using std::endl;
1751  const char tfecfFuncName[] = "insertLocalValues: ";
1752 
1753  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1754  (! this->isFillActive (), std::runtime_error,
1755  "Fill is not active. After calling fillComplete, you must call "
1756  "resumeFill before you may insert entries into the matrix again.");
1757  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1758  (this->isStaticGraph (), std::runtime_error,
1759  "Cannot insert indices with static graph; use replaceLocalValues() "
1760  "instead.");
1761  // At this point, we know that myGraph_ is nonnull.
1762  crs_graph_type& graph = * (this->myGraph_);
1763  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1764  (graph.colMap_.is_null (), std::runtime_error,
1765  "Cannot insert local indices without a column map.");
1766  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1767  (graph.isGloballyIndexed (),
1768  std::runtime_error, "Graph indices are global; use "
1769  "insertGlobalValues().");
1770  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1771  (values.size () != indices.size (), std::runtime_error,
1772  "values.size() = " << values.size ()
1773  << " != indices.size() = " << indices.size () << ".");
1774  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1775  ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
1776  "Local row index " << lclRow << " does not belong to this process.");
1777 
1778  if (! graph.indicesAreAllocated ()) {
1779  // We only allocate values at most once per process, so it's OK
1780  // to check TPETRA_VERBOSE here.
1781  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1782  this->allocateValues (LocalIndices, GraphNotYetAllocated, verbose);
1783  }
1784 
1785 #ifdef HAVE_TPETRA_DEBUG
1786  const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
1787  // In a debug build, test whether any of the given column indices
1788  // are not in the column Map. Keep track of the invalid column
1789  // indices so we can tell the user about them.
1790  {
1791  using Teuchos::toString;
1792 
1793  const map_type& colMap = * (graph.colMap_);
1794  Teuchos::Array<LocalOrdinal> badColInds;
1795  bool allInColMap = true;
1796  for (size_t k = 0; k < numEntriesToAdd; ++k) {
1797  if (! colMap.isNodeLocalElement (indices[k])) {
1798  allInColMap = false;
1799  badColInds.push_back (indices[k]);
1800  }
1801  }
1802  if (! allInColMap) {
1803  std::ostringstream os;
1804  os << "You attempted to insert entries in owned row " << lclRow
1805  << ", at the following column indices: " << toString (indices)
1806  << "." << endl;
1807  os << "Of those, the following indices are not in the column Map on "
1808  "this process: " << toString (badColInds) << "." << endl << "Since "
1809  "the matrix has a column Map already, it is invalid to insert "
1810  "entries at those locations.";
1811  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1812  (true, std::invalid_argument, os.str ());
1813  }
1814  }
1815 #endif // HAVE_TPETRA_DEBUG
1816 
1817  RowInfo rowInfo = graph.getRowInfo (lclRow);
1818 
1819  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1820  if (CM == ADD) {
1821  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1822  valsView[offset] += values[k]; };
1823  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1824  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1825  } else if (CM == INSERT) {
1826  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1827  valsView[offset] = values[k]; };
1828  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1829  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1830  } else {
1831  std::ostringstream os;
1832  os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1833  << "but this has not been implemented." << endl;
1834  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1835  (true, std::invalid_argument, os.str ());
1836  }
1837  }
1838 
1839  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1840  void
1842  insertLocalValues (const LocalOrdinal localRow,
1843  const LocalOrdinal numEnt,
1844  const Scalar vals[],
1845  const LocalOrdinal cols[],
1846  const CombineMode CM)
1847  {
1848  Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
1849  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
1850  this->insertLocalValues (localRow, colsT, valsT, CM);
1851  }
1852 
1853  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1854  void
1857  RowInfo& rowInfo,
1858  const GlobalOrdinal gblColInds[],
1859  const impl_scalar_type vals[],
1860  const size_t numInputEnt)
1861  {
1862 #ifdef HAVE_TPETRA_DEBUG
1863  const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
1864  const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
1865  const size_t curNumEnt = rowInfo.numEntries;
1866 #endif // HAVE_TPETRA_DEBUG
1867 
1868  if (! graph.indicesAreAllocated ()) {
1869  // We only allocate values at most once per process, so it's OK
1870  // to check TPETRA_VERBOSE here.
1871  using ::Tpetra::Details::Behavior;
1872  const bool verbose = Behavior::verbose("CrsMatrix");
1873  this->allocateValues (GlobalIndices, GraphNotYetAllocated, verbose);
1874  // mfh 23 Jul 2017: allocateValues invalidates existing
1875  // getRowInfo results. Once we get rid of lazy graph
1876  // allocation, we'll be able to move the getRowInfo call outside
1877  // of this method.
1878  rowInfo = graph.getRowInfo (rowInfo.localRow);
1879  }
1880 
1881  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1882  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
1883  valsView[offset] += vals[k];
1884  };
1885  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1886 #ifdef HAVE_TPETRA_DEBUG
1887  //numInserted is only used inside the debug code below.
1888  auto numInserted =
1889 #endif
1890  graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1891 
1892 #ifdef HAVE_TPETRA_DEBUG
1893  size_t newNumEnt = curNumEnt + numInserted;
1894  const size_t chkNewNumEnt =
1895  graph.getNumEntriesInLocalRow (rowInfo.localRow);
1896  if (chkNewNumEnt != newNumEnt) {
1897  std::ostringstream os;
1898  os << std::endl << "newNumEnt = " << newNumEnt
1899  << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1900  << ") = " << chkNewNumEnt << "." << std::endl
1901  << "\torigNumEnt: " << origNumEnt << std::endl
1902  << "\tnumInputEnt: " << numInputEnt << std::endl
1903  << "\tgblColInds: [";
1904  for (size_t k = 0; k < numInputEnt; ++k) {
1905  os << gblColInds[k];
1906  if (k + size_t (1) < numInputEnt) {
1907  os << ",";
1908  }
1909  }
1910  os << "]" << std::endl
1911  << "\tvals: [";
1912  for (size_t k = 0; k < numInputEnt; ++k) {
1913  os << vals[k];
1914  if (k + size_t (1) < numInputEnt) {
1915  os << ",";
1916  }
1917  }
1918  os << "]" << std::endl;
1919 
1920  if (this->supportsRowViews ()) {
1921  values_host_view_type vals2;
1922  if (this->isGloballyIndexed ()) {
1923  global_inds_host_view_type gblColInds2;
1924  const GlobalOrdinal gblRow =
1925  graph.rowMap_->getGlobalElement (rowInfo.localRow);
1926  if (gblRow ==
1927  Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
1928  os << "Local row index " << rowInfo.localRow << " is invalid!"
1929  << std::endl;
1930  }
1931  else {
1932  bool getViewThrew = false;
1933  try {
1934  this->getGlobalRowView (gblRow, gblColInds2, vals2);
1935  }
1936  catch (std::exception& e) {
1937  getViewThrew = true;
1938  os << "getGlobalRowView threw exception:" << std::endl
1939  << e.what () << std::endl;
1940  }
1941  if (! getViewThrew) {
1942  os << "\tNew global column indices: ";
1943  for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1944  os << gblColInds2[jjj] << " ";
1945  os << std::endl;
1946  os << "\tNew values: ";
1947  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1948  os << vals2[jjj] << " ";
1949  os << std::endl;
1950  }
1951  }
1952  }
1953  else if (this->isLocallyIndexed ()) {
1954  local_inds_host_view_type lclColInds2;
1955  this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
1956  os << "\tNew local column indices: ";
1957  for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1958  os << lclColInds2[jjj] << " ";
1959  os << std::endl;
1960  os << "\tNew values: ";
1961  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1962  os << vals2[jjj] << " ";
1963  os << std::endl;
1964  }
1965  }
1966 
1967  os << "Please report this bug to the Tpetra developers.";
1968  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1969  (true, std::logic_error, os.str ());
1970  }
1971 #endif // HAVE_TPETRA_DEBUG
1972  }
1973 
1974  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1975  void
1977  insertGlobalValues (const GlobalOrdinal gblRow,
1978  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
1979  const Teuchos::ArrayView<const Scalar>& values)
1980  {
1981  using Teuchos::toString;
1982  using std::endl;
1983  typedef impl_scalar_type IST;
1984  typedef LocalOrdinal LO;
1985  typedef GlobalOrdinal GO;
1986  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
1987  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
1988  const char tfecfFuncName[] = "insertGlobalValues: ";
1989 
1990 #ifdef HAVE_TPETRA_DEBUG
1991  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1992  (values.size () != indices.size (), std::runtime_error,
1993  "values.size() = " << values.size () << " != indices.size() = "
1994  << indices.size () << ".");
1995 #endif // HAVE_TPETRA_DEBUG
1996 
1997  // getRowMap() is not thread safe, because it increments RCP's
1998  // reference count. getCrsGraphRef() is thread safe.
1999  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2000  const LO lclRow = rowMap.getLocalElement (gblRow);
2001 
2002  if (lclRow == OTLO::invalid ()) {
2003  // Input row is _not_ owned by the calling process.
2004  //
2005  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2006  // is not in the row Map, it doesn't matter whether or not the
2007  // graph is static; the data just get stashed for later use by
2008  // globalAssemble().
2009  this->insertNonownedGlobalValues (gblRow, indices, values);
2010  }
2011  else { // Input row _is_ owned by the calling process
2012  if (this->isStaticGraph ()) {
2013  // Uh oh! Not allowed to insert into owned rows in that case.
2014  const int myRank = rowMap.getComm ()->getRank ();
2015  const int numProcs = rowMap.getComm ()->getSize ();
2016  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2017  (true, std::runtime_error,
2018  "The matrix was constructed with a constant (\"static\") graph, "
2019  "yet the given global row index " << gblRow << " is in the row "
2020  "Map on the calling process (with rank " << myRank << ", of " <<
2021  numProcs << " process(es)). In this case, you may not insert "
2022  "new entries into rows owned by the calling process.");
2023  }
2024 
2025  crs_graph_type& graph = * (this->myGraph_);
2026  const IST* const inputVals =
2027  reinterpret_cast<const IST*> (values.getRawPtr ());
2028  const GO* const inputGblColInds = indices.getRawPtr ();
2029  const size_t numInputEnt = indices.size ();
2030  RowInfo rowInfo = graph.getRowInfo (lclRow);
2031 
2032  // If the matrix has a column Map, check at this point whether
2033  // the column indices belong to the column Map.
2034  //
2035  // FIXME (mfh 16 May 2013) We may want to consider deferring the
2036  // test to the CrsGraph method, since it may have to do this
2037  // anyway.
2038  if (! graph.colMap_.is_null ()) {
2039  const map_type& colMap = * (graph.colMap_);
2040  // In a debug build, keep track of the nonowned ("bad") column
2041  // indices, so that we can display them in the exception
2042  // message. In a release build, just ditch the loop early if
2043  // we encounter a nonowned column index.
2044 #ifdef HAVE_TPETRA_DEBUG
2045  Teuchos::Array<GO> badColInds;
2046 #endif // HAVE_TPETRA_DEBUG
2047  const size_type numEntriesToInsert = indices.size ();
2048  bool allInColMap = true;
2049  for (size_type k = 0; k < numEntriesToInsert; ++k) {
2050  if (! colMap.isNodeGlobalElement (indices[k])) {
2051  allInColMap = false;
2052 #ifdef HAVE_TPETRA_DEBUG
2053  badColInds.push_back (indices[k]);
2054 #else
2055  break;
2056 #endif // HAVE_TPETRA_DEBUG
2057  }
2058  }
2059  if (! allInColMap) {
2060  std::ostringstream os;
2061  os << "You attempted to insert entries in owned row " << gblRow
2062  << ", at the following column indices: " << toString (indices)
2063  << "." << endl;
2064 #ifdef HAVE_TPETRA_DEBUG
2065  os << "Of those, the following indices are not in the column Map "
2066  "on this process: " << toString (badColInds) << "." << endl
2067  << "Since the matrix has a column Map already, it is invalid "
2068  "to insert entries at those locations.";
2069 #else
2070  os << "At least one of those indices is not in the column Map "
2071  "on this process." << endl << "It is invalid to insert into "
2072  "columns not in the column Map on the process that owns the "
2073  "row.";
2074 #endif // HAVE_TPETRA_DEBUG
2075  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2076  (true, std::invalid_argument, os.str ());
2077  }
2078  }
2079 
2080  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2081  inputVals, numInputEnt);
2082  }
2083  }
2084 
2085 
2086  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2087  void
2089  insertGlobalValues (const GlobalOrdinal globalRow,
2090  const LocalOrdinal numEnt,
2091  const Scalar vals[],
2092  const GlobalOrdinal inds[])
2093  {
2094  Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2095  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2096  this->insertGlobalValues (globalRow, indsT, valsT);
2097  }
2098 
2099 
2100  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2101  void
2104  const GlobalOrdinal gblRow,
2105  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2106  const Teuchos::ArrayView<const Scalar>& values,
2107  const bool debug)
2108  {
2109  typedef impl_scalar_type IST;
2110  typedef LocalOrdinal LO;
2111  typedef GlobalOrdinal GO;
2112  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2113  const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2114 
2115  if (debug) {
2116  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2117  (values.size () != indices.size (), std::runtime_error,
2118  "values.size() = " << values.size () << " != indices.size() = "
2119  << indices.size () << ".");
2120  }
2121 
2122  // getRowMap() is not thread safe, because it increments RCP's
2123  // reference count. getCrsGraphRef() is thread safe.
2124  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2125  const LO lclRow = rowMap.getLocalElement (gblRow);
2126  if (lclRow == OTLO::invalid ()) {
2127  // Input row is _not_ owned by the calling process.
2128  //
2129  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2130  // is not in the row Map, it doesn't matter whether or not the
2131  // graph is static; the data just get stashed for later use by
2132  // globalAssemble().
2133  this->insertNonownedGlobalValues (gblRow, indices, values);
2134  }
2135  else { // Input row _is_ owned by the calling process
2136  if (this->isStaticGraph ()) {
2137  // Uh oh! Not allowed to insert into owned rows in that case.
2138  const int myRank = rowMap.getComm ()->getRank ();
2139  const int numProcs = rowMap.getComm ()->getSize ();
2140  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2141  (true, std::runtime_error,
2142  "The matrix was constructed with a constant (\"static\") graph, "
2143  "yet the given global row index " << gblRow << " is in the row "
2144  "Map on the calling process (with rank " << myRank << ", of " <<
2145  numProcs << " process(es)). In this case, you may not insert "
2146  "new entries into rows owned by the calling process.");
2147  }
2148 
2149  crs_graph_type& graph = * (this->myGraph_);
2150  const IST* const inputVals =
2151  reinterpret_cast<const IST*> (values.getRawPtr ());
2152  const GO* const inputGblColInds = indices.getRawPtr ();
2153  const size_t numInputEnt = indices.size ();
2154  RowInfo rowInfo = graph.getRowInfo (lclRow);
2155 
2156  if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2157  // This branch is similar in function to the following branch, but for
2158  // the special case that the target graph is locally indexed.
2159  // In this case, we cannot simply filter
2160  // out global indices that don't exist on the receiving process and
2161  // insert the remaining (global) indices, but we must convert them (the
2162  // remaining global indices) to local and call `insertLocalValues`.
2163  const map_type& colMap = * (graph.colMap_);
2164  size_t curOffset = 0;
2165  while (curOffset < numInputEnt) {
2166  // Find a sequence of input indices that are in the column Map on the
2167  // calling process. Doing a sequence at a time, instead of one at a
2168  // time, amortizes some overhead.
2169  Teuchos::Array<LO> lclIndices;
2170  size_t endOffset = curOffset;
2171  for ( ; endOffset < numInputEnt; ++endOffset) {
2172  auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2173  if (lclIndex != OTLO::invalid())
2174  lclIndices.push_back(lclIndex);
2175  else
2176  break;
2177  }
2178  // curOffset, endOffset: half-exclusive range of indices in the column
2179  // Map on the calling process. If endOffset == curOffset, the range is
2180  // empty.
2181  const LO numIndInSeq = (endOffset - curOffset);
2182  if (numIndInSeq != 0) {
2183  this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2184  }
2185  // Invariant before the increment line: Either endOffset ==
2186  // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2187  // on the calling process.
2188  if (debug) {
2189  const bool invariant = endOffset == numInputEnt ||
2190  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2191  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2192  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2193  }
2194  curOffset = endOffset + 1;
2195  }
2196  }
2197  else if (! graph.colMap_.is_null ()) { // We have a column Map.
2198  const map_type& colMap = * (graph.colMap_);
2199  size_t curOffset = 0;
2200  while (curOffset < numInputEnt) {
2201  // Find a sequence of input indices that are in the column
2202  // Map on the calling process. Doing a sequence at a time,
2203  // instead of one at a time, amortizes some overhead.
2204  size_t endOffset = curOffset;
2205  for ( ; endOffset < numInputEnt &&
2206  colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2207  ++endOffset)
2208  {}
2209  // curOffset, endOffset: half-exclusive range of indices in
2210  // the column Map on the calling process. If endOffset ==
2211  // curOffset, the range is empty.
2212  const LO numIndInSeq = (endOffset - curOffset);
2213  if (numIndInSeq != 0) {
2214  rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2215  this->insertGlobalValuesImpl (graph, rowInfo,
2216  inputGblColInds + curOffset,
2217  inputVals + curOffset,
2218  numIndInSeq);
2219  }
2220  // Invariant before the increment line: Either endOffset ==
2221  // numInputEnt, or inputGblColInds[endOffset] is not in the
2222  // column Map on the calling process.
2223  if (debug) {
2224  const bool invariant = endOffset == numInputEnt ||
2225  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2226  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2227  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2228  }
2229  curOffset = endOffset + 1;
2230  }
2231  }
2232  else { // we don't have a column Map.
2233  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2234  inputVals, numInputEnt);
2235  }
2236  }
2237  }
2238 
2239  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2240  void
2241  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2242  insertGlobalValuesFilteredChecked(
2243  const GlobalOrdinal gblRow,
2244  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2245  const Teuchos::ArrayView<const Scalar>& values,
2246  const char* const prefix,
2247  const bool debug,
2248  const bool verbose)
2249  {
2251  using std::endl;
2252 
2253  try {
2254  insertGlobalValuesFiltered(gblRow, indices, values, debug);
2255  }
2256  catch(std::exception& e) {
2257  std::ostringstream os;
2258  if (verbose) {
2259  const size_t maxNumToPrint =
2261  os << *prefix << ": insertGlobalValuesFiltered threw an "
2262  "exception: " << e.what() << endl
2263  << "Global row index: " << gblRow << endl;
2264  verbosePrintArray(os, indices, "Global column indices",
2265  maxNumToPrint);
2266  os << endl;
2267  verbosePrintArray(os, values, "Values", maxNumToPrint);
2268  os << endl;
2269  }
2270  else {
2271  os << ": insertGlobalValuesFiltered threw an exception: "
2272  << e.what();
2273  }
2274  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2275  }
2276  }
2277 
2278  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2279  LocalOrdinal
2282  const crs_graph_type& graph,
2283  const RowInfo& rowInfo,
2284  const LocalOrdinal inds[],
2285  const impl_scalar_type newVals[],
2286  const LocalOrdinal numElts)
2287  {
2288  typedef LocalOrdinal LO;
2289  typedef GlobalOrdinal GO;
2290  const bool sorted = graph.isSorted ();
2291 
2292  size_t hint = 0; // Guess for the current index k into rowVals
2293  LO numValid = 0; // number of valid local column indices
2294 
2295  if (graph.isLocallyIndexed ()) {
2296  // Get a view of the column indices in the row. This amortizes
2297  // the cost of getting the view over all the entries of inds.
2298  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2299 
2300  for (LO j = 0; j < numElts; ++j) {
2301  const LO lclColInd = inds[j];
2302  const size_t offset =
2303  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2304  lclColInd, hint, sorted);
2305  if (offset != rowInfo.numEntries) {
2306  rowVals[offset] = newVals[j];
2307  hint = offset + 1;
2308  ++numValid;
2309  }
2310  }
2311  }
2312  else if (graph.isGloballyIndexed ()) {
2313  if (graph.colMap_.is_null ()) {
2314  return Teuchos::OrdinalTraits<LO>::invalid ();
2315  }
2316  const map_type colMap = * (graph.colMap_);
2317 
2318  // Get a view of the column indices in the row. This amortizes
2319  // the cost of getting the view over all the entries of inds.
2320  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2321 
2322  for (LO j = 0; j < numElts; ++j) {
2323  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2324  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2325  const size_t offset =
2326  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2327  gblColInd, hint, sorted);
2328  if (offset != rowInfo.numEntries) {
2329  rowVals[offset] = newVals[j];
2330  hint = offset + 1;
2331  ++numValid;
2332  }
2333  }
2334  }
2335  }
2336  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2337  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2338  // to be neither locally nor globally indexed on a process.
2339  // This means that the graph or matrix has no entries on that
2340  // process. Epetra also works like this. It's related to lazy
2341  // allocation (on first insertion, not at graph / matrix
2342  // construction). Lazy allocation will go away because it is
2343  // not thread scalable.
2344 
2345  return numValid;
2346  }
2347 
2348  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2349  LocalOrdinal
2351  replaceLocalValues (const LocalOrdinal localRow,
2352  const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2353  const Teuchos::ArrayView<const Scalar>& vals)
2354  {
2355  typedef LocalOrdinal LO;
2356 
2357  const LO numInputEnt = static_cast<LO> (lclCols.size ());
2358  if (static_cast<LO> (vals.size ()) != numInputEnt) {
2359  return Teuchos::OrdinalTraits<LO>::invalid ();
2360  }
2361  const LO* const inputInds = lclCols.getRawPtr ();
2362  const Scalar* const inputVals = vals.getRawPtr ();
2363  return this->replaceLocalValues (localRow, numInputEnt,
2364  inputVals, inputInds);
2365  }
2366 
2367  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2369  local_ordinal_type
2372  const local_ordinal_type localRow,
2373  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2374  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2375  {
2376  using LO = local_ordinal_type;
2377  const LO numInputEnt = inputInds.extent(0);
2378  if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2379  return Teuchos::OrdinalTraits<LO>::invalid();
2380  }
2381  const Scalar* const inVals =
2382  reinterpret_cast<const Scalar*>(inputVals.data());
2383  return this->replaceLocalValues(localRow, numInputEnt,
2384  inVals, inputInds.data());
2385  }
2386 
2387  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2388  LocalOrdinal
2390  replaceLocalValues (const LocalOrdinal localRow,
2391  const LocalOrdinal numEnt,
2392  const Scalar inputVals[],
2393  const LocalOrdinal inputCols[])
2394  {
2395  typedef impl_scalar_type IST;
2396  typedef LocalOrdinal LO;
2397 
2398  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2399  // Fill must be active and the "nonconst" graph must exist.
2400  return Teuchos::OrdinalTraits<LO>::invalid ();
2401  }
2402  const crs_graph_type& graph = * (this->staticGraph_);
2403  const RowInfo rowInfo = graph.getRowInfo (localRow);
2404 
2405  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2406  // The calling process does not own this row, so it is not
2407  // allowed to modify its values.
2408  return static_cast<LO> (0);
2409  }
2410  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2411  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2412  return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2413  inputCols, inVals, numEnt);
2414  }
2415 
2416  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2417  LocalOrdinal
2420  const crs_graph_type& graph,
2421  const RowInfo& rowInfo,
2422  const GlobalOrdinal inds[],
2423  const impl_scalar_type newVals[],
2424  const LocalOrdinal numElts)
2425  {
2426  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2427  auto fun =
2428  [&](size_t const k, size_t const /*start*/, size_t const offset) {
2429  rowVals[offset] = newVals[k];
2430  };
2431  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2432  return graph.findGlobalIndices(rowInfo, indsT, cb);
2433  }
2434 
2435  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2436  LocalOrdinal
2438  replaceGlobalValues (const GlobalOrdinal globalRow,
2439  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2440  const Teuchos::ArrayView<const Scalar>& inputVals)
2441  {
2442  typedef LocalOrdinal LO;
2443 
2444  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2445  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2446  return Teuchos::OrdinalTraits<LO>::invalid ();
2447  }
2448  return this->replaceGlobalValues (globalRow, numInputEnt,
2449  inputVals.getRawPtr (),
2450  inputGblColInds.getRawPtr ());
2451  }
2452 
2453  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2454  LocalOrdinal
2456  replaceGlobalValues (const GlobalOrdinal globalRow,
2457  const LocalOrdinal numEnt,
2458  const Scalar inputVals[],
2459  const GlobalOrdinal inputGblColInds[])
2460  {
2461  typedef impl_scalar_type IST;
2462  typedef LocalOrdinal LO;
2463 
2464  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2465  // Fill must be active and the "nonconst" graph must exist.
2466  return Teuchos::OrdinalTraits<LO>::invalid ();
2467  }
2468  const crs_graph_type& graph = * (this->staticGraph_);
2469 
2470  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2471  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2472  // The input local row is invalid on the calling process,
2473  // which means that the calling process summed 0 entries.
2474  return static_cast<LO> (0);
2475  }
2476 
2477  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2478  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2479  return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2480  inputGblColInds, inVals, numEnt);
2481  }
2482 
2483  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2485  local_ordinal_type
2488  const global_ordinal_type globalRow,
2489  const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2490  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2491  {
2492  // We use static_assert here to check the template parameters,
2493  // rather than std::enable_if (e.g., on the return value, to
2494  // enable compilation only if the template parameters match the
2495  // desired attributes). This turns obscure link errors into
2496  // clear compilation errors. It also makes the return value a
2497  // lot easier to see.
2498  using LO = local_ordinal_type;
2499  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2500  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2501  return Teuchos::OrdinalTraits<LO>::invalid();
2502  }
2503  const Scalar* const inVals =
2504  reinterpret_cast<const Scalar*>(inputVals.data());
2505  return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2506  inputInds.data());
2507  }
2508 
2509  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2510  LocalOrdinal
2513  const crs_graph_type& graph,
2514  const RowInfo& rowInfo,
2515  const GlobalOrdinal inds[],
2516  const impl_scalar_type newVals[],
2517  const LocalOrdinal numElts,
2518  const bool atomic)
2519  {
2520  typedef LocalOrdinal LO;
2521  typedef GlobalOrdinal GO;
2522 
2523  const bool sorted = graph.isSorted ();
2524 
2525  size_t hint = 0; // guess at the index's relative offset in the row
2526  LO numValid = 0; // number of valid input column indices
2527 
2528  if (graph.isLocallyIndexed ()) {
2529  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2530  // pointer does NOT change its reference count. Thus, this
2531  // code is still thread safe.
2532  if (graph.colMap_.is_null ()) {
2533  // NO input column indices are valid in this case, since if
2534  // the column Map is null on the calling process, then the
2535  // calling process owns no graph entries.
2536  return numValid;
2537  }
2538  const map_type& colMap = * (graph.colMap_);
2539 
2540  // Get a view of the column indices in the row. This amortizes
2541  // the cost of getting the view over all the entries of inds.
2542  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2543  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2544 
2545  for (LO j = 0; j < numElts; ++j) {
2546  const LO lclColInd = colMap.getLocalElement (inds[j]);
2547  if (lclColInd != LINV) {
2548  const size_t offset =
2549  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2550  lclColInd, hint, sorted);
2551  if (offset != rowInfo.numEntries) {
2552  if (atomic) {
2553  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2554  }
2555  else {
2556  rowVals[offset] += newVals[j];
2557  }
2558  hint = offset + 1;
2559  numValid++;
2560  }
2561  }
2562  }
2563  }
2564  else if (graph.isGloballyIndexed ()) {
2565  // Get a view of the column indices in the row. This amortizes
2566  // the cost of getting the view over all the entries of inds.
2567  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2568 
2569  for (LO j = 0; j < numElts; ++j) {
2570  const GO gblColInd = inds[j];
2571  const size_t offset =
2572  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2573  gblColInd, hint, sorted);
2574  if (offset != rowInfo.numEntries) {
2575  if (atomic) {
2576  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2577  }
2578  else {
2579  rowVals[offset] += newVals[j];
2580  }
2581  hint = offset + 1;
2582  numValid++;
2583  }
2584  }
2585  }
2586  // If the graph is neither locally nor globally indexed on the
2587  // calling process, that means the calling process has no graph
2588  // entries. Thus, none of the input column indices are valid.
2589 
2590  return numValid;
2591  }
2592 
2593  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2594  LocalOrdinal
2596  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2597  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2598  const Teuchos::ArrayView<const Scalar>& inputVals,
2599  const bool atomic)
2600  {
2601  typedef LocalOrdinal LO;
2602 
2603  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2604  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2605  return Teuchos::OrdinalTraits<LO>::invalid ();
2606  }
2607  return this->sumIntoGlobalValues (gblRow, numInputEnt,
2608  inputVals.getRawPtr (),
2609  inputGblColInds.getRawPtr (),
2610  atomic);
2611  }
2612 
2613  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2614  LocalOrdinal
2616  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2617  const LocalOrdinal numInputEnt,
2618  const Scalar inputVals[],
2619  const GlobalOrdinal inputGblColInds[],
2620  const bool atomic)
2621  {
2622  typedef impl_scalar_type IST;
2623  typedef LocalOrdinal LO;
2624  typedef GlobalOrdinal GO;
2625 
2626  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2627  // Fill must be active and the "nonconst" graph must exist.
2628  return Teuchos::OrdinalTraits<LO>::invalid ();
2629  }
2630  const crs_graph_type& graph = * (this->staticGraph_);
2631 
2632  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2633  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2634  // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be not
2635  // thread safe in a debug build, in part because it uses
2636  // Teuchos::ArrayView, and in part because of the data structure
2637  // used to stash outgoing entries.
2638  using Teuchos::ArrayView;
2639  ArrayView<const GO> inputGblColInds_av(
2640  numInputEnt == 0 ? nullptr : inputGblColInds,
2641  numInputEnt);
2642  ArrayView<const Scalar> inputVals_av(
2643  numInputEnt == 0 ? nullptr :
2644  inputVals, numInputEnt);
2645  // gblRow is not in the row Map on the calling process, so stash
2646  // the given entries away in a separate data structure.
2647  // globalAssemble() (called during fillComplete()) will exchange
2648  // that data and sum it in using sumIntoGlobalValues().
2649  this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
2650  inputVals_av);
2651  // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2652  // since we won't know whether the given indices were valid
2653  // until globalAssemble (called in fillComplete) is called.
2654  // That's why insertNonownedGlobalValues doesn't return
2655  // anything. Just for consistency, I'll return the number of
2656  // entries that the user gave us.
2657  return numInputEnt;
2658  }
2659  else { // input row is in the row Map on the calling process
2660  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2661  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2662  return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2663  inputGblColInds, inVals,
2664  numInputEnt, atomic);
2665  }
2666  }
2667 
2668  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2669  LocalOrdinal
2671  transformLocalValues (const LocalOrdinal lclRow,
2672  const LocalOrdinal numInputEnt,
2673  const impl_scalar_type inputVals[],
2674  const LocalOrdinal inputCols[],
2675  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2676  const bool atomic)
2677  {
2678  using Tpetra::Details::OrdinalTraits;
2679  typedef LocalOrdinal LO;
2680 
2681  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2682  // Fill must be active and the "nonconst" graph must exist.
2683  return Teuchos::OrdinalTraits<LO>::invalid ();
2684  }
2685  const crs_graph_type& graph = * (this->staticGraph_);
2686  const RowInfo rowInfo = graph.getRowInfo (lclRow);
2687 
2688  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2689  // The calling process does not own this row, so it is not
2690  // allowed to modify its values.
2691  return static_cast<LO> (0);
2692  }
2693  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2694  return this->transformLocalValues (curRowVals.data (), graph,
2695  rowInfo, inputCols, inputVals,
2696  numInputEnt, f, atomic);
2697  }
2698 
2699  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2700  LocalOrdinal
2701  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2702  transformGlobalValues (const GlobalOrdinal gblRow,
2703  const LocalOrdinal numInputEnt,
2704  const impl_scalar_type inputVals[],
2705  const GlobalOrdinal inputCols[],
2706  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2707  const bool atomic)
2708  {
2709  using Tpetra::Details::OrdinalTraits;
2710  typedef LocalOrdinal LO;
2711 
2712  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2713  // Fill must be active and the "nonconst" graph must exist.
2714  return OrdinalTraits<LO>::invalid ();
2715  }
2716  const crs_graph_type& graph = * (this->staticGraph_);
2717  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2718 
2719  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2720  // The calling process does not own this row, so it is not
2721  // allowed to modify its values.
2722  return static_cast<LO> (0);
2723  }
2724  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2725  return this->transformGlobalValues (curRowVals.data (), graph,
2726  rowInfo, inputCols, inputVals,
2727  numInputEnt, f, atomic);
2728  }
2729 
2730  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2731  LocalOrdinal
2732  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2733  transformLocalValues (impl_scalar_type rowVals[],
2734  const crs_graph_type& graph,
2735  const RowInfo& rowInfo,
2736  const LocalOrdinal inds[],
2737  const impl_scalar_type newVals[],
2738  const LocalOrdinal numElts,
2739  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2740  const bool atomic)
2741  {
2742  typedef impl_scalar_type ST;
2743  typedef LocalOrdinal LO;
2744  typedef GlobalOrdinal GO;
2745 
2746  //if (newVals.extent (0) != inds.extent (0)) {
2747  // The sizes of the input arrays must match.
2748  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2749  //}
2750  //const LO numElts = static_cast<LO> (inds.extent (0));
2751  const bool sorted = graph.isSorted ();
2752 
2753  LO numValid = 0; // number of valid input column indices
2754  size_t hint = 0; // Guess for the current index k into rowVals
2755 
2756  if (graph.isLocallyIndexed ()) {
2757  // Get a view of the column indices in the row. This amortizes
2758  // the cost of getting the view over all the entries of inds.
2759  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2760 
2761  for (LO j = 0; j < numElts; ++j) {
2762  const LO lclColInd = inds[j];
2763  const size_t offset =
2764  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2765  lclColInd, hint, sorted);
2766  if (offset != rowInfo.numEntries) {
2767  if (atomic) {
2768  // NOTE (mfh 30 Nov 2015) The commented-out code is
2769  // wrong because another thread may have changed
2770  // rowVals[offset] between those two lines of code.
2771  //
2772  //const ST newVal = f (rowVals[offset], newVals[j]);
2773  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2774 
2775  volatile ST* const dest = &rowVals[offset];
2776  (void) atomic_binary_function_update (dest, newVals[j], f);
2777  }
2778  else {
2779  // use binary function f
2780  rowVals[offset] = f (rowVals[offset], newVals[j]);
2781  }
2782  hint = offset + 1;
2783  ++numValid;
2784  }
2785  }
2786  }
2787  else if (graph.isGloballyIndexed ()) {
2788  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2789  // pointer does NOT change its reference count. Thus, this
2790  // code is still thread safe.
2791  if (graph.colMap_.is_null ()) {
2792  // NO input column indices are valid in this case. Either
2793  // the column Map hasn't been set yet (so local indices
2794  // don't exist yet), or the calling process owns no graph
2795  // entries.
2796  return numValid;
2797  }
2798  const map_type& colMap = * (graph.colMap_);
2799  // Get a view of the column indices in the row. This amortizes
2800  // the cost of getting the view over all the entries of inds.
2801  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2802 
2803  const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
2804  for (LO j = 0; j < numElts; ++j) {
2805  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2806  if (gblColInd != GINV) {
2807  const size_t offset =
2808  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2809  gblColInd, hint, sorted);
2810  if (offset != rowInfo.numEntries) {
2811  if (atomic) {
2812  // NOTE (mfh 30 Nov 2015) The commented-out code is
2813  // wrong because another thread may have changed
2814  // rowVals[offset] between those two lines of code.
2815  //
2816  //const ST newVal = f (rowVals[offset], newVals[j]);
2817  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2818 
2819  volatile ST* const dest = &rowVals[offset];
2820  (void) atomic_binary_function_update (dest, newVals[j], f);
2821  }
2822  else {
2823  // use binary function f
2824  rowVals[offset] = f (rowVals[offset], newVals[j]);
2825  }
2826  hint = offset + 1;
2827  numValid++;
2828  }
2829  }
2830  }
2831  }
2832  // If the graph is neither locally nor globally indexed on the
2833  // calling process, that means the calling process has no graph
2834  // entries. Thus, none of the input column indices are valid.
2835 
2836  return numValid;
2837  }
2838 
2839  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2840  LocalOrdinal
2841  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2842  transformGlobalValues (impl_scalar_type rowVals[],
2843  const crs_graph_type& graph,
2844  const RowInfo& rowInfo,
2845  const GlobalOrdinal inds[],
2846  const impl_scalar_type newVals[],
2847  const LocalOrdinal numElts,
2848  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2849  const bool atomic)
2850  {
2851  typedef impl_scalar_type ST;
2852  typedef LocalOrdinal LO;
2853  typedef GlobalOrdinal GO;
2854 
2855  //if (newVals.extent (0) != inds.extent (0)) {
2856  // The sizes of the input arrays must match.
2857  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2858  //}
2859  //const LO numElts = static_cast<LO> (inds.extent (0));
2860  const bool sorted = graph.isSorted ();
2861 
2862  LO numValid = 0; // number of valid input column indices
2863  size_t hint = 0; // Guess for the current index k into rowVals
2864 
2865  if (graph.isGloballyIndexed ()) {
2866  // Get a view of the column indices in the row. This amortizes
2867  // the cost of getting the view over all the entries of inds.
2868  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2869 
2870  for (LO j = 0; j < numElts; ++j) {
2871  const GO gblColInd = inds[j];
2872  const size_t offset =
2873  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2874  gblColInd, hint, sorted);
2875  if (offset != rowInfo.numEntries) {
2876  if (atomic) {
2877  // NOTE (mfh 30 Nov 2015) The commented-out code is
2878  // wrong because another thread may have changed
2879  // rowVals[offset] between those two lines of code.
2880  //
2881  //const ST newVal = f (rowVals[offset], newVals[j]);
2882  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2883 
2884  volatile ST* const dest = &rowVals[offset];
2885  (void) atomic_binary_function_update (dest, newVals[j], f);
2886  }
2887  else {
2888  // use binary function f
2889  rowVals[offset] = f (rowVals[offset], newVals[j]);
2890  }
2891  hint = offset + 1;
2892  ++numValid;
2893  }
2894  }
2895  }
2896  else if (graph.isLocallyIndexed ()) {
2897  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2898  // pointer does NOT change its reference count. Thus, this
2899  // code is still thread safe.
2900  if (graph.colMap_.is_null ()) {
2901  // NO input column indices are valid in this case. Either the
2902  // column Map hasn't been set yet (so local indices don't
2903  // exist yet), or the calling process owns no graph entries.
2904  return numValid;
2905  }
2906  const map_type& colMap = * (graph.colMap_);
2907  // Get a view of the column indices in the row. This amortizes
2908  // the cost of getting the view over all the entries of inds.
2909  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2910 
2911  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2912  for (LO j = 0; j < numElts; ++j) {
2913  const LO lclColInd = colMap.getLocalElement (inds[j]);
2914  if (lclColInd != LINV) {
2915  const size_t offset =
2916  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2917  lclColInd, hint, sorted);
2918  if (offset != rowInfo.numEntries) {
2919  if (atomic) {
2920  // NOTE (mfh 30 Nov 2015) The commented-out code is
2921  // wrong because another thread may have changed
2922  // rowVals[offset] between those two lines of code.
2923  //
2924  //const ST newVal = f (rowVals[offset], newVals[j]);
2925  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2926 
2927  volatile ST* const dest = &rowVals[offset];
2928  (void) atomic_binary_function_update (dest, newVals[j], f);
2929  }
2930  else {
2931  // use binary function f
2932  rowVals[offset] = f (rowVals[offset], newVals[j]);
2933  }
2934  hint = offset + 1;
2935  numValid++;
2936  }
2937  }
2938  }
2939  }
2940  // If the graph is neither locally nor globally indexed on the
2941  // calling process, that means the calling process has no graph
2942  // entries. Thus, none of the input column indices are valid.
2943 
2944  return numValid;
2945  }
2946 
2947  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2948  LocalOrdinal
2951  const crs_graph_type& graph,
2952  const RowInfo& rowInfo,
2953  const LocalOrdinal inds[],
2954  const impl_scalar_type newVals[],
2955  const LocalOrdinal numElts,
2956  const bool atomic)
2957  {
2958  typedef LocalOrdinal LO;
2959  typedef GlobalOrdinal GO;
2960 
2961  const bool sorted = graph.isSorted ();
2962 
2963  size_t hint = 0; // Guess for the current index k into rowVals
2964  LO numValid = 0; // number of valid local column indices
2965 
2966  if (graph.isLocallyIndexed ()) {
2967  // Get a view of the column indices in the row. This amortizes
2968  // the cost of getting the view over all the entries of inds.
2969  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2970 
2971  for (LO j = 0; j < numElts; ++j) {
2972  const LO lclColInd = inds[j];
2973  const size_t offset =
2974  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2975  lclColInd, hint, sorted);
2976  if (offset != rowInfo.numEntries) {
2977  if (atomic) {
2978  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2979  }
2980  else {
2981  rowVals[offset] += newVals[j];
2982  }
2983  hint = offset + 1;
2984  ++numValid;
2985  }
2986  }
2987  }
2988  else if (graph.isGloballyIndexed ()) {
2989  if (graph.colMap_.is_null ()) {
2990  return Teuchos::OrdinalTraits<LO>::invalid ();
2991  }
2992  const map_type colMap = * (graph.colMap_);
2993 
2994  // Get a view of the column indices in the row. This amortizes
2995  // the cost of getting the view over all the entries of inds.
2996  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2997 
2998  for (LO j = 0; j < numElts; ++j) {
2999  const GO gblColInd = colMap.getGlobalElement (inds[j]);
3000  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
3001  const size_t offset =
3002  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3003  gblColInd, hint, sorted);
3004  if (offset != rowInfo.numEntries) {
3005  if (atomic) {
3006  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3007  }
3008  else {
3009  rowVals[offset] += newVals[j];
3010  }
3011  hint = offset + 1;
3012  ++numValid;
3013  }
3014  }
3015  }
3016  }
3017  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
3018  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
3019  // to be neither locally nor globally indexed on a process.
3020  // This means that the graph or matrix has no entries on that
3021  // process. Epetra also works like this. It's related to lazy
3022  // allocation (on first insertion, not at graph / matrix
3023  // construction). Lazy allocation will go away because it is
3024  // not thread scalable.
3025 
3026  return numValid;
3027  }
3028 
3029  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3030  LocalOrdinal
3032  sumIntoLocalValues (const LocalOrdinal localRow,
3033  const Teuchos::ArrayView<const LocalOrdinal>& indices,
3034  const Teuchos::ArrayView<const Scalar>& values,
3035  const bool atomic)
3036  {
3037  using LO = local_ordinal_type;
3038  const LO numInputEnt = static_cast<LO>(indices.size());
3039  if (static_cast<LO>(values.size()) != numInputEnt) {
3040  return Teuchos::OrdinalTraits<LO>::invalid();
3041  }
3042  const LO* const inputInds = indices.getRawPtr();
3043  const scalar_type* const inputVals = values.getRawPtr();
3044  return this->sumIntoLocalValues(localRow, numInputEnt,
3045  inputVals, inputInds, atomic);
3046  }
3047 
3048  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3050  local_ordinal_type
3053  const local_ordinal_type localRow,
3054  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
3055  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
3056  const bool atomic)
3057  {
3058  using LO = local_ordinal_type;
3059  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
3060  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
3061  return Teuchos::OrdinalTraits<LO>::invalid();
3062  }
3063  const scalar_type* inVals =
3064  reinterpret_cast<const scalar_type*>(inputVals.data());
3065  return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
3066  inputInds.data(), atomic);
3067  }
3068 
3069  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3070  LocalOrdinal
3072  sumIntoLocalValues (const LocalOrdinal localRow,
3073  const LocalOrdinal numEnt,
3074  const Scalar vals[],
3075  const LocalOrdinal cols[],
3076  const bool atomic)
3077  {
3078  typedef impl_scalar_type IST;
3079  typedef LocalOrdinal LO;
3080 
3081  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3082  // Fill must be active and the "nonconst" graph must exist.
3083  return Teuchos::OrdinalTraits<LO>::invalid ();
3084  }
3085  const crs_graph_type& graph = * (this->staticGraph_);
3086  const RowInfo rowInfo = graph.getRowInfo (localRow);
3087 
3088  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3089  // The calling process does not own this row, so it is not
3090  // allowed to modify its values.
3091  return static_cast<LO> (0);
3092  }
3093  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
3094  const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3095  return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3096  cols, inputVals, numEnt, atomic);
3097  }
3098 
3099  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3101  values_dualv_type::t_host::const_type
3103  getValuesViewHost (const RowInfo& rowinfo) const
3104  {
3105  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3106  return typename values_dualv_type::t_host::const_type ();
3107  else
3108  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3109  rowinfo.allocSize,
3110  Access::ReadOnly);
3111  }
3112 
3113  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3115  values_dualv_type::t_host
3118  {
3119  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3120  return typename values_dualv_type::t_host ();
3121  else
3122  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3123  rowinfo.allocSize,
3124  Access::ReadWrite);
3125  }
3126 
3127  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3129  values_dualv_type::t_dev::const_type
3131  getValuesViewDevice (const RowInfo& rowinfo) const
3132  {
3133  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3134  return typename values_dualv_type::t_dev::const_type ();
3135  else
3136  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3137  rowinfo.allocSize,
3138  Access::ReadOnly);
3139  }
3140 
3141  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3143  values_dualv_type::t_dev
3146  {
3147  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3148  return typename values_dualv_type::t_dev ();
3149  else
3150  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3151  rowinfo.allocSize,
3152  Access::ReadWrite);
3153  }
3154 
3155 
3156  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3157  void
3160  nonconst_local_inds_host_view_type &indices,
3161  nonconst_values_host_view_type &values,
3162  size_t& numEntries) const
3163  {
3164  using Teuchos::ArrayView;
3165  using Teuchos::av_reinterpret_cast;
3166  const char tfecfFuncName[] = "getLocalRowCopy: ";
3167 
3168  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3169  (! this->hasColMap (), std::runtime_error,
3170  "The matrix does not have a column Map yet. This means we don't have "
3171  "local indices for columns yet, so it doesn't make sense to call this "
3172  "method. If the matrix doesn't have a column Map yet, you should call "
3173  "fillComplete on it first.");
3174 
3175  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3176  const size_t theNumEntries = rowinfo.numEntries;
3177  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3178  (static_cast<size_t> (indices.size ()) < theNumEntries ||
3179  static_cast<size_t> (values.size ()) < theNumEntries,
3180  std::runtime_error, "Row with local index " << localRow << " has " <<
3181  theNumEntries << " entry/ies, but indices.size() = " <<
3182  indices.size () << " and values.size() = " << values.size () << ".");
3183  numEntries = theNumEntries; // first side effect
3184 
3185  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3186  if (staticGraph_->isLocallyIndexed ()) {
3187  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3188  auto curVals = getValuesViewHost(rowinfo);
3189 
3190  for (size_t j = 0; j < theNumEntries; ++j) {
3191  values[j] = curVals[j];
3192  indices[j] = curLclInds(j);
3193  }
3194  }
3195  else if (staticGraph_->isGloballyIndexed ()) {
3196  // Don't call getColMap(), because it touches RCP's reference count.
3197  const map_type& colMap = * (staticGraph_->colMap_);
3198  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3199  auto curVals = getValuesViewHost(rowinfo);
3200 
3201  for (size_t j = 0; j < theNumEntries; ++j) {
3202  values[j] = curVals[j];
3203  indices[j] = colMap.getLocalElement (curGblInds(j));
3204  }
3205  }
3206  }
3207  }
3208 
3209 
3210 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3211 void
3214  nonconst_global_inds_host_view_type &indices,
3215  nonconst_values_host_view_type &values,
3216  size_t& numEntries) const
3217  {
3218  using Teuchos::ArrayView;
3219  using Teuchos::av_reinterpret_cast;
3220  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3221 
3222  const RowInfo rowinfo =
3223  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3224  const size_t theNumEntries = rowinfo.numEntries;
3225  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3226  static_cast<size_t> (indices.size ()) < theNumEntries ||
3227  static_cast<size_t> (values.size ()) < theNumEntries,
3228  std::runtime_error, "Row with global index " << globalRow << " has "
3229  << theNumEntries << " entry/ies, but indices.size() = " <<
3230  indices.size () << " and values.size() = " << values.size () << ".");
3231  numEntries = theNumEntries; // first side effect
3232 
3233  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3234  if (staticGraph_->isLocallyIndexed ()) {
3235  const map_type& colMap = * (staticGraph_->colMap_);
3236  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3237  auto curVals = getValuesViewHost(rowinfo);
3238 
3239  for (size_t j = 0; j < theNumEntries; ++j) {
3240  values[j] = curVals[j];
3241  indices[j] = colMap.getGlobalElement (curLclInds(j));
3242  }
3243  }
3244  else if (staticGraph_->isGloballyIndexed ()) {
3245  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3246  auto curVals = getValuesViewHost(rowinfo);
3247 
3248  for (size_t j = 0; j < theNumEntries; ++j) {
3249  values[j] = curVals[j];
3250  indices[j] = curGblInds(j);
3251  }
3252  }
3253  }
3254  }
3255 
3256 
3257  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3258  void
3260  getLocalRowView(LocalOrdinal localRow,
3261  local_inds_host_view_type &indices,
3262  values_host_view_type &values) const
3263  {
3264  const char tfecfFuncName[] = "getLocalRowView: ";
3265 
3266  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3267  isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3268  "its indices as global indices, so you cannot get a view with local "
3269  "column indices. If the matrix has a column Map, you may call "
3270  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3271  "a view with global column indices by calling getGlobalRowCopy().");
3272 
3273  const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
3274  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3275  rowInfo.numEntries > 0) {
3276  indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3277  rowInfo.offset1D,
3278  rowInfo.numEntries,
3279  Access::ReadOnly);
3280  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3281  rowInfo.numEntries,
3282  Access::ReadOnly);
3283  }
3284  else {
3285  // This does the right thing (reports an empty row) if the input
3286  // row is invalid.
3287  indices = local_inds_host_view_type();
3288  values = values_host_view_type();
3289  }
3290 
3291 #ifdef HAVE_TPETRA_DEBUG
3292  const char suffix[] = ". This should never happen. Please report this "
3293  "bug to the Tpetra developers.";
3294  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3295  (static_cast<size_t> (indices.size ()) !=
3296  static_cast<size_t> (values.size ()), std::logic_error,
3297  "At the end of this method, for local row " << localRow << ", "
3298  "indices.size() = " << indices.size () << " != values.size () = "
3299  << values.size () << suffix);
3300  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3301  (static_cast<size_t> (indices.size ()) !=
3302  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3303  "At the end of this method, for local row " << localRow << ", "
3304  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3305  << rowInfo.numEntries << suffix);
3306  const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3307  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3308  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3309  "of this method, for local row " << localRow << ", rowInfo.numEntries = "
3310  << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3311  expectedNumEntries << suffix);
3312 #endif // HAVE_TPETRA_DEBUG
3313  }
3314 
3315 
3316  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3317  void
3319  getGlobalRowView (GlobalOrdinal globalRow,
3320  global_inds_host_view_type &indices,
3321  values_host_view_type &values) const
3322  {
3323  const char tfecfFuncName[] = "getGlobalRowView: ";
3324 
3325  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3326  isLocallyIndexed (), std::runtime_error,
3327  "The matrix is locally indexed, so we cannot return a view of the row "
3328  "with global column indices. Use getGlobalRowCopy() instead.");
3329 
3330  // This does the right thing (reports an empty row) if the input
3331  // row is invalid.
3332  const RowInfo rowInfo =
3333  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3334  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3335  rowInfo.numEntries > 0) {
3336  indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3337  rowInfo.numEntries,
3338  Access::ReadOnly);
3339  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3340  rowInfo.numEntries,
3341  Access::ReadOnly);
3342  }
3343  else {
3344  indices = global_inds_host_view_type();
3345  values = values_host_view_type();
3346  }
3347 
3348 #ifdef HAVE_TPETRA_DEBUG
3349  const char suffix[] = ". This should never happen. Please report this "
3350  "bug to the Tpetra developers.";
3351  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3352  (static_cast<size_t> (indices.size ()) !=
3353  static_cast<size_t> (values.size ()), std::logic_error,
3354  "At the end of this method, for global row " << globalRow << ", "
3355  "indices.size() = " << indices.size () << " != values.size () = "
3356  << values.size () << suffix);
3357  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3358  (static_cast<size_t> (indices.size ()) !=
3359  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3360  "At the end of this method, for global row " << globalRow << ", "
3361  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3362  << rowInfo.numEntries << suffix);
3363  const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3364  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3365  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3366  "of this method, for global row " << globalRow << ", rowInfo.numEntries "
3367  "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3368  " " << expectedNumEntries << suffix);
3369 #endif // HAVE_TPETRA_DEBUG
3370  }
3371 
3372 
3373  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3374  void
3376  scale (const Scalar& alpha)
3377  {
3378  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3379 
3380  const size_t nlrs = staticGraph_->getLocalNumRows ();
3381  const size_t numEntries = staticGraph_->getLocalNumEntries ();
3382  if (! staticGraph_->indicesAreAllocated () ||
3383  nlrs == 0 || numEntries == 0) {
3384  // do nothing
3385  }
3386  else {
3387 
3388  auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3389  KokkosBlas::scal(vals, theAlpha, vals);
3390 
3391  }
3392  }
3393 
3394  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3395  void
3397  setAllToScalar (const Scalar& alpha)
3398  {
3399  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3400 
3401  // replace all values in the matrix
3402  // it is easiest to replace all allocated values, instead of replacing only the ones with valid entries
3403  // however, if there are no valid entries, we can short-circuit
3404  // furthermore, if the values aren't allocated, we can short-circuit (no entry have been inserted so far)
3405  const size_t numEntries = staticGraph_->getLocalNumEntries();
3406  if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
3407  // do nothing
3408  }
3409  else {
3410  // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3411  Kokkos::deep_copy (execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3412  theAlpha);
3413  // CAG: This fence was found to be required on Cuda with UVM=on.
3414  Kokkos::fence("CrsMatrix::setAllToScalar");
3415  }
3416  }
3417 
3418  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3419  void
3421  setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
3422  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3423  const typename local_matrix_device_type::values_type& values)
3424  {
3425  using ProfilingRegion=Details::ProfilingRegion;
3426  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
3427  const char tfecfFuncName[] = "setAllValues: ";
3428  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3429  (columnIndices.size () != values.size (), std::invalid_argument,
3430  "columnIndices.size() = " << columnIndices.size () << " != values.size()"
3431  " = " << values.size () << ".");
3432  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3433  (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
3434 
3435  try {
3436  myGraph_->setAllIndices (rowPointers, columnIndices);
3437  }
3438  catch (std::exception &e) {
3439  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3440  (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
3441  "exception: " << e.what ());
3442  }
3443 
3444  // Make sure that myGraph_ now has a local graph. It may not be
3445  // fillComplete yet, so it's important to check. We don't care
3446  // whether setAllIndices() did a shallow copy or a deep copy, so a
3447  // good way to check is to compare dimensions.
3448  auto lclGraph = myGraph_->getLocalGraphDevice ();
3449  const size_t numEnt = lclGraph.entries.extent (0);
3450  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3451  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
3452  numEnt != static_cast<size_t> (columnIndices.extent (0)),
3453  std::logic_error, "myGraph_->setAllIndices() did not correctly create "
3454  "local graph. Please report this bug to the Tpetra developers.");
3455 
3456  valuesPacked_wdv = values_wdv_type(values);
3457  valuesUnpacked_wdv = valuesPacked_wdv;
3458 
3459  // Storage MUST be packed, since the interface doesn't give any
3460  // way to indicate any extra space at the end of each row.
3461  this->storageStatus_ = Details::STORAGE_1D_PACKED;
3462 
3463  checkInternalState ();
3464  }
3465 
3466  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3467  void
3469  setAllValues ( const local_matrix_device_type& localDeviceMatrix)
3470  {
3471  using ProfilingRegion=Details::ProfilingRegion;
3472  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3473 
3474  auto graph = localDeviceMatrix.graph;
3475  //FIXME how to check whether graph is allocated
3476 
3477  auto rows = graph.row_map;
3478  auto columns = graph.entries;
3479  auto values = localDeviceMatrix.values;
3480 
3481  setAllValues(rows,columns,values);
3482  }
3483 
3484  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3485  void
3487  setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
3488  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3489  const Teuchos::ArrayRCP<Scalar>& val)
3490  {
3491  using Kokkos::Compat::getKokkosViewDeepCopy;
3492  using Teuchos::ArrayRCP;
3493  using Teuchos::av_reinterpret_cast;
3494  typedef device_type DT;
3495  typedef impl_scalar_type IST;
3496  typedef typename local_graph_device_type::row_map_type row_map_type;
3497  //typedef typename row_map_type::non_const_value_type row_offset_type;
3498  const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3499 
3500  // The row offset type may depend on the execution space. It may
3501  // not necessarily be size_t. If it's not, we need to make a deep
3502  // copy. We need to make a deep copy anyway so that Kokkos can
3503  // own the memory. Regardless, ptrIn gets the copy.
3504  typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
3505  Kokkos::View<const size_t*,
3506  typename row_map_type::array_layout,
3507  Kokkos::HostSpace,
3508  Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
3509  ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
3510 
3511  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3512  (ptrNative.extent (0) != ptrSizeT.extent (0),
3513  std::logic_error, "ptrNative.extent(0) = " <<
3514  ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
3515  << ptrSizeT.extent (0) << ". Please report this bug to the "
3516  "Tpetra developers.");
3517 
3518  auto indIn = getKokkosViewDeepCopy<DT> (ind ());
3519  auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
3520  this->setAllValues (ptrNative, indIn, valIn);
3521  }
3522 
3523  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3524  void
3526  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
3527  {
3528  const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3529  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3530  (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
3531 
3532  // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3533  // this method in CrsGraph too, so don't call it (otherwise build
3534  // warnings will show up and annoy users). Instead, copy results
3535  // in and out, if the memory space requires it.
3536 
3537  const size_t lclNumRows = staticGraph_->getLocalNumRows ();
3538  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
3539  offsets.resize (lclNumRows);
3540  }
3541 
3542  // The input ArrayRCP must always be a host pointer. Thus, if
3543  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3544  // to write to that allocation directly as a Kokkos::View.
3545  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3546  // It is always syntactically correct to assign a raw host
3547  // pointer to a device View, so this code will compile correctly
3548  // even if this branch never runs.
3549  typedef Kokkos::View<size_t*, device_type,
3550  Kokkos::MemoryUnmanaged> output_type;
3551  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3552  staticGraph_->getLocalDiagOffsets (offsetsOut);
3553  }
3554  else {
3555  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
3556  staticGraph_->getLocalDiagOffsets (offsetsTmp);
3557  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3558  Kokkos::MemoryUnmanaged> output_type;
3559  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3560  // DEEP_COPY REVIEW - DEVICE-TO-HOST
3561  Kokkos::deep_copy (execution_space(), offsetsOut, offsetsTmp);
3562  }
3563  }
3564 
3565  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3566  void
3569  {
3570  using Teuchos::ArrayRCP;
3571  using Teuchos::ArrayView;
3572  using Teuchos::av_reinterpret_cast;
3573  const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3574  typedef local_ordinal_type LO;
3575 
3576 
3577  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3578  staticGraph_.is_null (), std::runtime_error,
3579  "This method requires that the matrix have a graph.");
3580  auto rowMapPtr = this->getRowMap ();
3581  if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
3582  // Processes on which the row Map or its communicator is null
3583  // don't participate. Users shouldn't even call this method on
3584  // those processes.
3585  return;
3586  }
3587  auto colMapPtr = this->getColMap ();
3588  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3589  (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
3590  "This method requires that the matrix have a column Map.");
3591  const map_type& rowMap = * rowMapPtr;
3592  const map_type& colMap = * colMapPtr;
3593  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3594 
3595 #ifdef HAVE_TPETRA_DEBUG
3596  // isCompatible() requires an all-reduce, and thus this check
3597  // should only be done in debug mode.
3598  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3599  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3600  "The input Vector's Map must be compatible with the CrsMatrix's row "
3601  "Map. You may check this by using Map's isCompatible method: "
3602  "diag.getMap ()->isCompatible (A.getRowMap ());");
3603 #endif // HAVE_TPETRA_DEBUG
3604 
3605  if (this->isFillComplete ()) {
3606  const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3607  // 1-D subview of the first (and only) column of D_lcl.
3608  const auto D_lcl_1d =
3609  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3610 
3611  const auto lclRowMap = rowMap.getLocalMap ();
3612  const auto lclColMap = colMap.getLocalMap ();
3614  (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
3615  lclColMap,
3616  getLocalMatrixDevice ());
3617  }
3618  else {
3620  (void) getLocalDiagCopyWithoutOffsetsNotFillComplete (diag, *this);
3621  }
3622  }
3623 
3624  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3625  void
3628  const Kokkos::View<const size_t*, device_type,
3629  Kokkos::MemoryUnmanaged>& offsets) const
3630  {
3631  typedef LocalOrdinal LO;
3632 
3633 #ifdef HAVE_TPETRA_DEBUG
3634  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3635  const map_type& rowMap = * (this->getRowMap ());
3636  // isCompatible() requires an all-reduce, and thus this check
3637  // should only be done in debug mode.
3638  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3639  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3640  "The input Vector's Map must be compatible with (in the sense of Map::"
3641  "isCompatible) the CrsMatrix's row Map.");
3642 #endif // HAVE_TPETRA_DEBUG
3643 
3644  // For now, we fill the Vector on the host and sync to device.
3645  // Later, we may write a parallel kernel that works entirely on
3646  // device.
3647  //
3648  // NOTE (mfh 21 Jan 2016): The host kernel here assumes UVM. Once
3649  // we write a device kernel, it will not need to assume UVM.
3650 
3651  auto D_lcl = diag.getLocalViewDevice (Access::OverwriteAll);
3652  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3653  // Get 1-D subview of the first (and only) column of D_lcl.
3654  auto D_lcl_1d =
3655  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3656 
3657  KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
3658  getLocalMatrixDevice ());
3659  }
3660 
3661  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3662  void
3665  const Teuchos::ArrayView<const size_t>& offsets) const
3666  {
3667  using LO = LocalOrdinal;
3668  using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3669  using IST = impl_scalar_type;
3670 
3671 #ifdef HAVE_TPETRA_DEBUG
3672  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3673  const map_type& rowMap = * (this->getRowMap ());
3674  // isCompatible() requires an all-reduce, and thus this check
3675  // should only be done in debug mode.
3676  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3677  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3678  "The input Vector's Map must be compatible with (in the sense of Map::"
3679  "isCompatible) the CrsMatrix's row Map.");
3680 #endif // HAVE_TPETRA_DEBUG
3681 
3682  // See #1510. In case diag has already been marked modified on
3683  // device, we need to clear that flag, since the code below works
3684  // on host.
3685  //diag.clear_sync_state ();
3686 
3687  // For now, we fill the Vector on the host and sync to device.
3688  // Later, we may write a parallel kernel that works entirely on
3689  // device.
3690  auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3691  // 1-D subview of the first (and only) column of lclVecHost.
3692  auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
3693 
3694  using host_offsets_view_type =
3695  Kokkos::View<const size_t*, Kokkos::HostSpace,
3696  Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
3697  host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
3698  // Find the diagonal entries and put them in lclVecHost1d.
3699  using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3700  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3701  const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
3702 
3703  auto rowPtrsPackedHost = staticGraph_->getRowPtrsPackedHost();
3704  auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
3705  Kokkos::parallel_for
3706  ("Tpetra::CrsMatrix::getLocalDiagCopy",
3707  range_type (0, myNumRows),
3708  [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3709  lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
3710  if (h_offsets[lclRow] != INV) {
3711  auto curRowOffset = rowPtrsPackedHost (lclRow);
3712  lclVecHost1d(lclRow) =
3713  static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
3714  }
3715  });
3716  //diag.sync_device ();
3717  }
3718 
3719 
3720  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3721  void
3724  {
3725  using ::Tpetra::Details::ProfilingRegion;
3726  using Teuchos::ArrayRCP;
3727  using Teuchos::ArrayView;
3728  using Teuchos::null;
3729  using Teuchos::RCP;
3730  using Teuchos::rcp;
3731  using Teuchos::rcpFromRef;
3733  const char tfecfFuncName[] = "leftScale: ";
3734 
3735  ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
3736 
3737  RCP<const vec_type> xp;
3738  if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
3739  // Take from Epetra: If we have a non-trivial exporter, we must
3740  // import elements that are permuted or are on other processors.
3741  auto exporter = this->getCrsGraphRef ().getExporter ();
3742  if (exporter.get () != nullptr) {
3743  RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
3744  tempVec->doImport (x, *exporter, REPLACE); // reverse mode
3745  xp = tempVec;
3746  }
3747  else {
3748  xp = rcpFromRef (x);
3749  }
3750  }
3751  else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
3752  xp = rcpFromRef (x);
3753  }
3754  else {
3755  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3756  (true, std::invalid_argument, "x's Map must be the same as "
3757  "either the row Map or the range Map of the CrsMatrix.");
3758  }
3759 
3760  if (this->isFillComplete()) {
3761  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3762  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3764  leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
3765  x_lcl_1d, false, false);
3766  }
3767  else {
3768  // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3769  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3770  (true, std::runtime_error, "CrsMatrix::leftScale requires matrix to be"
3771  " fillComplete");
3772  }
3773  }
3774 
3775  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3776  void
3779  {
3780  using ::Tpetra::Details::ProfilingRegion;
3781  using Teuchos::ArrayRCP;
3782  using Teuchos::ArrayView;
3783  using Teuchos::null;
3784  using Teuchos::RCP;
3785  using Teuchos::rcp;
3786  using Teuchos::rcpFromRef;
3788  const char tfecfFuncName[] = "rightScale: ";
3789 
3790  ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
3791 
3792  RCP<const vec_type> xp;
3793  if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
3794  // Take from Epetra: If we have a non-trivial exporter, we must
3795  // import elements that are permuted or are on other processors.
3796  auto importer = this->getCrsGraphRef ().getImporter ();
3797  if (importer.get () != nullptr) {
3798  RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
3799  tempVec->doImport (x, *importer, REPLACE);
3800  xp = tempVec;
3801  }
3802  else {
3803  xp = rcpFromRef (x);
3804  }
3805  }
3806  else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
3807  xp = rcpFromRef (x);
3808  } else {
3809  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3810  (true, std::runtime_error, "x's Map must be the same as "
3811  "either the domain Map or the column Map of the CrsMatrix.");
3812  }
3813 
3814  if (this->isFillComplete()) {
3815  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3816  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
3818  rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
3819  x_lcl_1d, false, false);
3820  }
3821  else {
3822  // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3823  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3824  (true, std::runtime_error, "CrsMatrix::rightScale requires matrix to be"
3825  " fillComplete");
3826  }
3827  }
3828 
3829  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3833  {
3834  using Teuchos::ArrayView;
3835  using Teuchos::outArg;
3836  using Teuchos::REDUCE_SUM;
3837  using Teuchos::reduceAll;
3838 
3839  // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3840  // local part of this computation. It could make sense to put
3841  // this operation in the Kokkos::CrsMatrix.
3842 
3843  // check the cache first
3844  mag_type mySum = STM::zero ();
3845  if (getLocalNumEntries() > 0) {
3846  if (isStorageOptimized ()) {
3847  // "Optimized" storage is packed storage. That means we can
3848  // iterate in one pass through the 1-D values array.
3849  const size_t numEntries = getLocalNumEntries ();
3850  auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3851  for (size_t k = 0; k < numEntries; ++k) {
3852  auto val = values[k];
3853  // Note (etp 06 Jan 2015) We need abs() here for composite types
3854  // (in general, if mag_type is on the left-hand-side, we need
3855  // abs() on the right-hand-side)
3856  const mag_type val_abs = STS::abs (val);
3857  mySum += val_abs * val_abs;
3858  }
3859  }
3860  else {
3861  const LocalOrdinal numRows =
3862  static_cast<LocalOrdinal> (this->getLocalNumRows ());
3863  for (LocalOrdinal r = 0; r < numRows; ++r) {
3864  const RowInfo rowInfo = myGraph_->getRowInfo (r);
3865  const size_t numEntries = rowInfo.numEntries;
3866  auto A_r = this->getValuesViewHost(rowInfo);
3867  for (size_t k = 0; k < numEntries; ++k) {
3868  const impl_scalar_type val = A_r[k];
3869  const mag_type val_abs = STS::abs (val);
3870  mySum += val_abs * val_abs;
3871  }
3872  }
3873  }
3874  }
3875  mag_type totalSum = STM::zero ();
3876  reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
3877  mySum, outArg (totalSum));
3878  return STM::sqrt (totalSum);
3879  }
3880 
3881  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3882  void
3884  replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
3885  {
3886  const char tfecfFuncName[] = "replaceColMap: ";
3887  // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3888  // Then replacing the column Map might mean that we need to
3889  // reindex the column indices.
3890  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3891  myGraph_.is_null (), std::runtime_error,
3892  "This method does not work if the matrix has a const graph. The whole "
3893  "idea of a const graph is that you are not allowed to change it, but "
3894  "this method necessarily must modify the graph, since the graph owns "
3895  "the matrix's column Map.");
3896  myGraph_->replaceColMap (newColMap);
3897  }
3898 
3899  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3900  void
3903  const Teuchos::RCP<const map_type>& newColMap,
3904  const Teuchos::RCP<const import_type>& newImport,
3905  const bool sortEachRow)
3906  {
3907  const char tfecfFuncName[] = "reindexColumns: ";
3908  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3909  graph == nullptr && myGraph_.is_null (), std::invalid_argument,
3910  "The input graph is null, but the matrix does not own its graph.");
3911 
3912  crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3913  const bool sortGraph = false; // we'll sort graph & matrix together below
3914 
3915  theGraph.reindexColumns (newColMap, newImport, sortGraph);
3916 
3917  if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
3918  const LocalOrdinal lclNumRows =
3919  static_cast<LocalOrdinal> (theGraph.getLocalNumRows ());
3920 
3921  for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3922 
3923  const RowInfo rowInfo = theGraph.getRowInfo (row);
3924  auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
3925  auto vals = this->getValuesViewHostNonConst (rowInfo);
3926 
3927  sort2 (lclColInds.data (),
3928  lclColInds.data () + rowInfo.numEntries,
3929  vals.data ());
3930  }
3931  theGraph.indicesAreSorted_ = true;
3932  }
3933  }
3934 
3935  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3936  void
3938  replaceDomainMap (const Teuchos::RCP<const map_type>& newDomainMap)
3939  {
3940  const char tfecfFuncName[] = "replaceDomainMap: ";
3941  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3942  myGraph_.is_null (), std::runtime_error,
3943  "This method does not work if the matrix has a const graph. The whole "
3944  "idea of a const graph is that you are not allowed to change it, but this"
3945  " method necessarily must modify the graph, since the graph owns the "
3946  "matrix's domain Map and Import objects.");
3947  myGraph_->replaceDomainMap (newDomainMap);
3948  }
3949 
3950  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3951  void
3953  replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
3954  Teuchos::RCP<const import_type>& newImporter)
3955  {
3956  const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3957  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3958  myGraph_.is_null (), std::runtime_error,
3959  "This method does not work if the matrix has a const graph. The whole "
3960  "idea of a const graph is that you are not allowed to change it, but this"
3961  " method necessarily must modify the graph, since the graph owns the "
3962  "matrix's domain Map and Import objects.");
3963  myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
3964  }
3965 
3966  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3967  void
3969  replaceRangeMap (const Teuchos::RCP<const map_type>& newRangeMap)
3970  {
3971  const char tfecfFuncName[] = "replaceRangeMap: ";
3972  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3973  myGraph_.is_null (), std::runtime_error,
3974  "This method does not work if the matrix has a const graph. The whole "
3975  "idea of a const graph is that you are not allowed to change it, but this"
3976  " method necessarily must modify the graph, since the graph owns the "
3977  "matrix's domain Map and Import objects.");
3978  myGraph_->replaceRangeMap (newRangeMap);
3979  }
3980 
3981  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3982  void
3984  replaceRangeMapAndExporter (const Teuchos::RCP<const map_type>& newRangeMap,
3985  Teuchos::RCP<const export_type>& newExporter)
3986  {
3987  const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
3988  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3989  myGraph_.is_null (), std::runtime_error,
3990  "This method does not work if the matrix has a const graph. The whole "
3991  "idea of a const graph is that you are not allowed to change it, but this"
3992  " method necessarily must modify the graph, since the graph owns the "
3993  "matrix's domain Map and Import objects.");
3994  myGraph_->replaceRangeMapAndExporter (newRangeMap, newExporter);
3995  }
3996 
3997  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3998  void
4000  insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4001  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4002  const Teuchos::ArrayView<const Scalar>& values)
4003  {
4004  using Teuchos::Array;
4005  typedef GlobalOrdinal GO;
4006  typedef typename Array<GO>::size_type size_type;
4007 
4008  const size_type numToInsert = indices.size ();
4009  // Add the new data to the list of nonlocals.
4010  // This creates the arrays if they don't exist yet.
4011  std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4012  Array<GO>& curRowInds = curRow.first;
4013  Array<Scalar>& curRowVals = curRow.second;
4014  const size_type newCapacity = curRowInds.size () + numToInsert;
4015  curRowInds.reserve (newCapacity);
4016  curRowVals.reserve (newCapacity);
4017  for (size_type k = 0; k < numToInsert; ++k) {
4018  curRowInds.push_back (indices[k]);
4019  curRowVals.push_back (values[k]);
4020  }
4021  }
4022 
4023  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4024  void
4027  {
4028  using Details::Behavior;
4030  using Teuchos::Comm;
4031  using Teuchos::outArg;
4032  using Teuchos::RCP;
4033  using Teuchos::rcp;
4034  using Teuchos::REDUCE_MAX;
4035  using Teuchos::REDUCE_MIN;
4036  using Teuchos::reduceAll;
4037  using std::endl;
4039  //typedef LocalOrdinal LO;
4040  typedef GlobalOrdinal GO;
4041  typedef typename Teuchos::Array<GO>::size_type size_type;
4042  const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4043  ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4044 
4045  const bool verbose = Behavior::verbose("CrsMatrix");
4046  std::unique_ptr<std::string> prefix;
4047  if (verbose) {
4048  prefix = this->createPrefix("CrsMatrix", "globalAssemble");
4049  std::ostringstream os;
4050  os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
4051  << endl;
4052  std::cerr << os.str();
4053  }
4054  RCP<const Comm<int> > comm = getComm ();
4055 
4056  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4057  (! isFillActive (), std::runtime_error, "Fill must be active before "
4058  "you may call this method.");
4059 
4060  const size_t myNumNonlocalRows = nonlocals_.size ();
4061 
4062  // If no processes have nonlocal rows, then we don't have to do
4063  // anything. Checking this is probably cheaper than constructing
4064  // the Map of nonlocal rows (see below) and noticing that it has
4065  // zero global entries.
4066  {
4067  const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4068  int someoneHasNonlocalRows = 0;
4069  reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4070  outArg (someoneHasNonlocalRows));
4071  if (someoneHasNonlocalRows == 0) {
4072  return; // no process has nonlocal rows, so nothing to do
4073  }
4074  }
4075 
4076  // 1. Create a list of the "nonlocal" rows on each process. this
4077  // requires iterating over nonlocals_, so while we do this,
4078  // deduplicate the entries and get a count for each nonlocal
4079  // row on this process.
4080  // 2. Construct a new row Map corresponding to those rows. This
4081  // Map is likely overlapping. We know that the Map is not
4082  // empty on all processes, because the above all-reduce and
4083  // return exclude that case.
4084 
4085  RCP<const map_type> nonlocalRowMap;
4086  Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4087  {
4088  Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4089  size_type curPos = 0;
4090  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4091  ++mapIter, ++curPos) {
4092  myNonlocalGblRows[curPos] = mapIter->first;
4093  // Get the values and column indices by reference, since we
4094  // intend to change them in place (that's what "erase" does).
4095  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4096  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4097 
4098  // Sort both arrays jointly, using the column indices as keys,
4099  // then merge them jointly. "Merge" here adds values
4100  // corresponding to the same column indices. The first 2 args
4101  // of merge2 are output arguments that work just like the
4102  // return value of std::unique.
4103  sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4104  typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4105  typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4106  merge2 (gblCols_newEnd, vals_newEnd,
4107  gblCols.begin (), gblCols.end (),
4108  vals.begin (), vals.end ());
4109  gblCols.erase (gblCols_newEnd, gblCols.end ());
4110  vals.erase (vals_newEnd, vals.end ());
4111  numEntPerNonlocalRow[curPos] = gblCols.size ();
4112  }
4113 
4114  // Currently, Map requires that its indexBase be the global min
4115  // of all its global indices. Map won't compute this for us, so
4116  // we must do it. If our process has no nonlocal rows, set the
4117  // "min" to the max possible GO value. This ensures that if
4118  // some process has at least one nonlocal row, then it will pick
4119  // that up as the min. We know that at least one process has a
4120  // nonlocal row, since the all-reduce and return at the top of
4121  // this method excluded that case.
4122  GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4123  {
4124  auto iter = std::min_element (myNonlocalGblRows.begin (),
4125  myNonlocalGblRows.end ());
4126  if (iter != myNonlocalGblRows.end ()) {
4127  myMinNonlocalGblRow = *iter;
4128  }
4129  }
4130  GO gblMinNonlocalGblRow = 0;
4131  reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4132  outArg (gblMinNonlocalGblRow));
4133  const GO indexBase = gblMinNonlocalGblRow;
4134  const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4135  nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4136  }
4137 
4138  // 3. Use the values and column indices for each nonlocal row, as
4139  // stored in nonlocals_, to construct a CrsMatrix corresponding
4140  // to nonlocal rows. We have
4141  // exact counts of the number of entries in each nonlocal row.
4142 
4143  if (verbose) {
4144  std::ostringstream os;
4145  os << *prefix << "Create nonlocal matrix" << endl;
4146  std::cerr << os.str();
4147  }
4148  RCP<crs_matrix_type> nonlocalMatrix =
4149  rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow ()));
4150  {
4151  size_type curPos = 0;
4152  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4153  ++mapIter, ++curPos) {
4154  const GO gblRow = mapIter->first;
4155  // Get values & column indices by ref, just to avoid copy.
4156  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4157  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4158  //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4159  nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4160  }
4161  }
4162  // There's no need to fill-complete the nonlocals matrix.
4163  // We just use it as a temporary container for the Export.
4164 
4165  // 4. If the original row Map is one to one, then we can Export
4166  // directly from nonlocalMatrix into this. Otherwise, we have
4167  // to create a temporary matrix with a one-to-one row Map,
4168  // Export into that, then Import from the temporary matrix into
4169  // *this.
4170 
4171  auto origRowMap = this->getRowMap ();
4172  const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4173 
4174  int isLocallyComplete = 1; // true by default
4175 
4176  if (origRowMapIsOneToOne) {
4177  if (verbose) {
4178  std::ostringstream os;
4179  os << *prefix << "Original row Map is 1-to-1" << endl;
4180  std::cerr << os.str();
4181  }
4182  export_type exportToOrig (nonlocalRowMap, origRowMap);
4183  if (! exportToOrig.isLocallyComplete ()) {
4184  isLocallyComplete = 0;
4185  }
4186  if (verbose) {
4187  std::ostringstream os;
4188  os << *prefix << "doExport from nonlocalMatrix" << endl;
4189  std::cerr << os.str();
4190  }
4191  this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4192  // We're done at this point!
4193  }
4194  else {
4195  if (verbose) {
4196  std::ostringstream os;
4197  os << *prefix << "Original row Map is NOT 1-to-1" << endl;
4198  std::cerr << os.str();
4199  }
4200  // If you ask a Map whether it is one to one, it does some
4201  // communication and stashes intermediate results for later use
4202  // by createOneToOne. Thus, calling createOneToOne doesn't cost
4203  // much more then the original cost of calling isOneToOne.
4204  auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4205  export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4206  if (! exportToOneToOne.isLocallyComplete ()) {
4207  isLocallyComplete = 0;
4208  }
4209 
4210  // Create a temporary matrix with the one-to-one row Map.
4211  //
4212  // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4213  // each row, to avoid reallocation during the Export operation.
4214  if (verbose) {
4215  std::ostringstream os;
4216  os << *prefix << "Create & doExport into 1-to-1 matrix"
4217  << endl;
4218  std::cerr << os.str();
4219  }
4220  crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4221  // Export from matrix of nonlocals into the temp one-to-one matrix.
4222  oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4223  Tpetra::ADD);
4224 
4225  // We don't need the matrix of nonlocals anymore, so get rid of
4226  // it, to keep the memory high-water mark down.
4227  if (verbose) {
4228  std::ostringstream os;
4229  os << *prefix << "Free nonlocalMatrix" << endl;
4230  std::cerr << os.str();
4231  }
4232  nonlocalMatrix = Teuchos::null;
4233 
4234  // Import from the one-to-one matrix to the original matrix.
4235  if (verbose) {
4236  std::ostringstream os;
4237  os << *prefix << "doImport from 1-to-1 matrix" << endl;
4238  std::cerr << os.str();
4239  }
4240  import_type importToOrig (oneToOneRowMap, origRowMap);
4241  this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4242  }
4243 
4244  // It's safe now to clear out nonlocals_, since we've already
4245  // committed side effects to *this. The standard idiom for
4246  // clearing a Container like std::map, is to swap it with an empty
4247  // Container and let the swapped Container fall out of scope.
4248  if (verbose) {
4249  std::ostringstream os;
4250  os << *prefix << "Free nonlocals_ (std::map)" << endl;
4251  std::cerr << os.str();
4252  }
4253  decltype (nonlocals_) newNonlocals;
4254  std::swap (nonlocals_, newNonlocals);
4255 
4256  // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4257  // don't like throwing an exception here. A local return value
4258  // would likely be more useful to users. However, if users find
4259  // themselves exercising nonlocal inserts often, then they are
4260  // probably novice users who need the help. See Gibhub Issues
4261  // #603 and #601 (esp. the latter) for discussion.
4262 
4263  int isGloballyComplete = 0; // output argument of reduceAll
4264  reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4265  outArg (isGloballyComplete));
4266  TEUCHOS_TEST_FOR_EXCEPTION
4267  (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4268  "you called insertGlobalValues with a global row index which is not in "
4269  "the matrix's row Map on any process in its communicator.");
4270  }
4271 
4272  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4273  void
4275  resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4276  {
4277  if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4278  myGraph_->resumeFill (params);
4279  }
4280 #if KOKKOSKERNELS_VERSION >= 40299
4281  // Delete the apply helper (if it exists)
4282  applyHelper.reset();
4283 #endif
4284  fillComplete_ = false;
4285  }
4286 
4287  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4288  bool
4291  return getCrsGraphRef ().haveGlobalConstants ();
4292  }
4293 
4294  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4295  void
4297  fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4298  {
4299  const char tfecfFuncName[] = "fillComplete(params): ";
4300 
4301  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4302  (this->getCrsGraph ().is_null (), std::logic_error,
4303  "getCrsGraph() returns null. This should not happen at this point. "
4304  "Please report this bug to the Tpetra developers.");
4305 
4306  const crs_graph_type& graph = this->getCrsGraphRef ();
4307  if (this->isStaticGraph () && graph.isFillComplete ()) {
4308  // If this matrix's graph is fill complete and the user did not
4309  // supply a domain or range Map, use the graph's domain and
4310  // range Maps.
4311  this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4312  }
4313  else { // assume that user's row Map is the domain and range Map
4314  Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4315  Teuchos::RCP<const map_type> domainMap = rangeMap;
4316  this->fillComplete (domainMap, rangeMap, params);
4317  }
4318  }
4319 
4320  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4321  void
4323  fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4324  const Teuchos::RCP<const map_type>& rangeMap,
4325  const Teuchos::RCP<Teuchos::ParameterList>& params)
4326  {
4327  using Details::Behavior;
4329  using Teuchos::ArrayRCP;
4330  using Teuchos::RCP;
4331  using Teuchos::rcp;
4332  using std::endl;
4333  const char tfecfFuncName[] = "fillComplete: ";
4334  ProfilingRegion regionFillComplete
4335  ("Tpetra::CrsMatrix::fillComplete");
4336  const bool verbose = Behavior::verbose("CrsMatrix");
4337  std::unique_ptr<std::string> prefix;
4338  if (verbose) {
4339  prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4340  std::ostringstream os;
4341  os << *prefix << endl;
4342  std::cerr << os.str ();
4343  }
4344  Details::ProfilingRegion region(
4345  "Tpetra::CrsMatrix::fillCompete",
4346  "fillCompete");
4347 
4348  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4349  (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4350  "Matrix fill state must be active (isFillActive() "
4351  "must be true) before you may call fillComplete().");
4352  const int numProcs = this->getComm ()->getSize ();
4353 
4354  //
4355  // Read parameters from the input ParameterList.
4356  //
4357  {
4358  Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4359 
4360  // If true, the caller promises that no process did nonlocal
4361  // changes since the last call to fillComplete.
4362  bool assertNoNonlocalInserts = false;
4363  // If true, makeColMap sorts remote GIDs (within each remote
4364  // process' group).
4365  bool sortGhosts = true;
4366 
4367  if (! params.is_null ()) {
4368  assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
4369  assertNoNonlocalInserts);
4370  if (params->isParameter ("sort column map ghost gids")) {
4371  sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
4372  }
4373  else if (params->isParameter ("Sort column Map ghost GIDs")) {
4374  sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
4375  }
4376  }
4377  // We also don't need to do global assembly if there is only one
4378  // process in the communicator.
4379  const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
4380  // This parameter only matters if this matrix owns its graph.
4381  if (! this->myGraph_.is_null ()) {
4382  this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4383  }
4384 
4385  if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
4386  if (this->hasColMap ()) { // use local indices
4387  allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4388  }
4389  else { // no column Map, so use global indices
4390  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4391  }
4392  }
4393  // Global assemble, if we need to. This call only costs a single
4394  // all-reduce if we didn't need global assembly after all.
4395  if (needGlobalAssemble) {
4396  this->globalAssemble ();
4397  }
4398  else {
4399  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4400  (numProcs == 1 && nonlocals_.size() > 0,
4401  std::runtime_error, "Cannot have nonlocal entries on a serial run. "
4402  "An invalid entry (i.e., with row index not in the row Map) must have "
4403  "been submitted to the CrsMatrix.");
4404  }
4405  }
4406  if (this->isStaticGraph ()) {
4407  Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4408  // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4409  // checks below only in debug mode. It would be nicer to do a
4410  // local check, then propagate the error state in a deferred
4411  // way, whenever communication happens. That would reduce the
4412  // cost of checking, to the point where it may make sense to
4413  // enable it even in release mode.
4414 #ifdef HAVE_TPETRA_DEBUG
4415  // FIXME (mfh 18 Jun 2014) This check for correctness of the
4416  // input Maps incurs a penalty of two all-reduces for the
4417  // otherwise optimal const graph case.
4418  //
4419  // We could turn these (max) 2 all-reduces into (max) 1, by
4420  // fusing them. We could do this by adding a "locallySameAs"
4421  // method to Map, which would return one of four states:
4422  //
4423  // a. Certainly globally the same
4424  // b. Certainly globally not the same
4425  // c. Locally the same
4426  // d. Locally not the same
4427  //
4428  // The first two states don't require further communication.
4429  // The latter two states require an all-reduce to communicate
4430  // globally, but we only need one all-reduce, since we only need
4431  // to check whether at least one of the Maps is wrong.
4432  const bool domainMapsMatch =
4433  this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
4434  const bool rangeMapsMatch =
4435  this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
4436 
4437  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4438  (! domainMapsMatch, std::runtime_error,
4439  "The CrsMatrix's domain Map does not match the graph's domain Map. "
4440  "The graph cannot be changed because it was given to the CrsMatrix "
4441  "constructor as const. You can fix this by passing in the graph's "
4442  "domain Map and range Map to the matrix's fillComplete call.");
4443 
4444  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4445  (! rangeMapsMatch, std::runtime_error,
4446  "The CrsMatrix's range Map does not match the graph's range Map. "
4447  "The graph cannot be changed because it was given to the CrsMatrix "
4448  "constructor as const. You can fix this by passing in the graph's "
4449  "domain Map and range Map to the matrix's fillComplete call.");
4450 #endif // HAVE_TPETRA_DEBUG
4451 
4452  // The matrix does _not_ own the graph, and the graph's
4453  // structure is already fixed, so just fill the local matrix.
4454  this->fillLocalMatrix (params);
4455  }
4456  else {
4457  Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4458  // Set the graph's domain and range Maps. This will clear the
4459  // Import if the domain Map has changed (is a different
4460  // pointer), and the Export if the range Map has changed (is a
4461  // different pointer).
4462  this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
4463 
4464  // Make the graph's column Map, if necessary.
4465  Teuchos::Array<int> remotePIDs (0);
4466  const bool mustBuildColMap = ! this->hasColMap ();
4467  if (mustBuildColMap) {
4468  this->myGraph_->makeColMap (remotePIDs);
4469  }
4470 
4471  // Make indices local, if necessary. The method won't do
4472  // anything if the graph is already locally indexed.
4473  const std::pair<size_t, std::string> makeIndicesLocalResult =
4474  this->myGraph_->makeIndicesLocal(verbose);
4475  // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4476  // the error state to makeImportExport
4477  // which may do all-reduces and thus may
4478  // have the opportunity to communicate that error state.
4479  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4480  (makeIndicesLocalResult.first != 0, std::runtime_error,
4481  makeIndicesLocalResult.second);
4482 
4483  const bool sorted = this->myGraph_->isSorted ();
4484  const bool merged = this->myGraph_->isMerged ();
4485  this->sortAndMergeIndicesAndValues (sorted, merged);
4486 
4487  // Make Import and Export objects, if they haven't been made
4488  // already. If we made a column Map above, reuse information
4489  // from that process to avoid communiation in the Import setup.
4490  this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
4491 
4492  // The matrix _does_ own the graph, so fill the local graph at
4493  // the same time as the local matrix.
4494  this->fillLocalGraphAndMatrix (params);
4495 
4496  const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
4497  params->get ("compute global constants", true);
4498  if (callGraphComputeGlobalConstants) {
4499  this->myGraph_->computeGlobalConstants ();
4500  }
4501  else {
4502  this->myGraph_->computeLocalConstants ();
4503  }
4504  this->myGraph_->fillComplete_ = true;
4505  this->myGraph_->checkInternalState ();
4506  }
4507 
4508  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4509 
4510  this->fillComplete_ = true; // Now we're fill complete!
4511  {
4512  Details::ProfilingRegion region_cis(
4513  "Tpetra::CrsMatrix::fillCompete", "checkInternalState"
4514  );
4515  this->checkInternalState ();
4516  }
4517  } //fillComplete(domainMap, rangeMap, params)
4518 
4519  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4520  void
4522  expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
4523  const Teuchos::RCP<const map_type> & rangeMap,
4524  const Teuchos::RCP<const import_type>& importer,
4525  const Teuchos::RCP<const export_type>& exporter,
4526  const Teuchos::RCP<Teuchos::ParameterList> &params)
4527  {
4528 #ifdef HAVE_TPETRA_MMM_TIMINGS
4529  std::string label;
4530  if(!params.is_null())
4531  label = params->get("Timer Label",label);
4532  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
4533  using Teuchos::TimeMonitor;
4534 
4535  Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4536 #endif
4537 
4538  const char tfecfFuncName[] = "expertStaticFillComplete: ";
4539  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
4540  std::runtime_error, "Matrix fill state must be active (isFillActive() "
4541  "must be true) before calling fillComplete().");
4542  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4543  myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
4544 
4545  {
4546 #ifdef HAVE_TPETRA_MMM_TIMINGS
4547  Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4548 #endif
4549  // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4550  myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
4551  }
4552 
4553  {
4554 #ifdef HAVE_TPETRA_MMM_TIMINGS
4555  TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4556 #endif
4557  // Fill the local graph and matrix
4558  fillLocalGraphAndMatrix (params);
4559  }
4560  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4561 
4562  // Now we're fill complete!
4563  fillComplete_ = true;
4564 
4565  // Sanity checks at the end.
4566 #ifdef HAVE_TPETRA_DEBUG
4567  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4568  ": We're at the end of fillComplete(), but isFillActive() is true. "
4569  "Please report this bug to the Tpetra developers.");
4570  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
4571  ": We're at the end of fillComplete(), but isFillActive() is true. "
4572  "Please report this bug to the Tpetra developers.");
4573 #endif // HAVE_TPETRA_DEBUG
4574  {
4575 #ifdef HAVE_TPETRA_MMM_TIMINGS
4576  Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4577 #endif
4578 
4579  checkInternalState();
4580  }
4581  }
4582 
4583  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4585  mergeRowIndicesAndValues (size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals)
4586  {
4587  impl_scalar_type* rowValueIter = vals;
4588  // beg,end define a half-exclusive interval over which to iterate.
4589  LocalOrdinal* beg = cols;
4590  LocalOrdinal* end = cols + rowLen;
4591  LocalOrdinal* newend = beg;
4592  if (beg != end) {
4593  LocalOrdinal* cur = beg + 1;
4594  impl_scalar_type* vcur = rowValueIter + 1;
4595  impl_scalar_type* vend = rowValueIter;
4596  cur = beg+1;
4597  while (cur != end) {
4598  if (*cur != *newend) {
4599  // new entry; save it
4600  ++newend;
4601  ++vend;
4602  (*newend) = (*cur);
4603  (*vend) = (*vcur);
4604  }
4605  else {
4606  // old entry; merge it
4607  //(*vend) = f (*vend, *vcur);
4608  (*vend) += *vcur;
4609  }
4610  ++cur;
4611  ++vcur;
4612  }
4613  ++newend; // one past the last entry, per typical [beg,end) semantics
4614  }
4615  return newend - beg;
4616  }
4617 
4618  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4619  void
4621  sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
4622  {
4623  using ::Tpetra::Details::ProfilingRegion;
4624  typedef LocalOrdinal LO;
4625  typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
4626  host_execution_space;
4627  typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4628  const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4629  ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4630 
4631  if (! sorted || ! merged) {
4632  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4633  (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
4634  "\"static\" (const) graph, since the matrix does not own the graph.");
4635  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4636  (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
4637  "this matrix claims ! isStaticGraph(). "
4638  "Please report this bug to the Tpetra developers.");
4639  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4640  (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
4641  "this method if the graph's storage has already been optimized. "
4642  "Please report this bug to the Tpetra developers.");
4643 
4644  crs_graph_type& graph = * (this->myGraph_);
4645  const LO lclNumRows = static_cast<LO> (this->getLocalNumRows ());
4646  size_t totalNumDups = 0;
4647  {
4648  //Accessing host unpacked (4-array CRS) local matrix.
4649  auto rowBegins_ = graph.getRowPtrsUnpackedHost();
4650  auto rowLengths_ = graph.k_numRowEntries_;
4651  auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4652  auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
4653  Kokkos::parallel_reduce ("sortAndMergeIndicesAndValues", range_type (0, lclNumRows),
4654  [=] (const LO lclRow, size_t& numDups) {
4655  size_t rowBegin = rowBegins_(lclRow);
4656  size_t rowLen = rowLengths_(lclRow);
4657  LO* cols = cols_.data() + rowBegin;
4658  impl_scalar_type* vals = vals_.data() + rowBegin;
4659  if (! sorted) {
4660  sort2 (cols, cols + rowLen, vals);
4661  }
4662  if (! merged) {
4663  size_t newRowLength = mergeRowIndicesAndValues (rowLen, cols, vals);
4664  rowLengths_(lclRow) = newRowLength;
4665  numDups += rowLen - newRowLength;
4666  }
4667  }, totalNumDups);
4668  }
4669  if (! sorted) {
4670  graph.indicesAreSorted_ = true; // we just sorted every row
4671  }
4672  if (! merged) {
4673  graph.noRedundancies_ = true; // we just merged every row
4674  }
4675  }
4676  }
4677 
4678  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4679  void
4683  Scalar alpha,
4684  Scalar beta) const
4685  {
4687  using Teuchos::RCP;
4688  using Teuchos::rcp;
4689  using Teuchos::rcp_const_cast;
4690  using Teuchos::rcpFromRef;
4691  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4692  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
4693 
4694  // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4695  // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4696  // failing only for the Kokkos refactor version of Tpetra. It's a
4697  // good idea regardless to have the bypass.
4698  if (alpha == ZERO) {
4699  if (beta == ZERO) {
4700  Y_in.putScalar (ZERO);
4701  } else if (beta != ONE) {
4702  Y_in.scale (beta);
4703  }
4704  return;
4705  }
4706 
4707  // It's possible that X is a view of Y or vice versa. We don't
4708  // allow this (apply() requires that X and Y not alias one
4709  // another), but it's helpful to detect and work around this case.
4710  // We don't try to to detect the more subtle cases (e.g., one is a
4711  // subview of the other, but their initial pointers differ). We
4712  // only need to do this if this matrix's Import is trivial;
4713  // otherwise, we don't actually apply the operator from X into Y.
4714 
4715  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4716  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4717 
4718  // If beta == 0, then the output MV will be overwritten; none of
4719  // its entries should be read. (Sparse BLAS semantics say that we
4720  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4721  // This matters if we need to do an Export operation; see below.
4722  const bool Y_is_overwritten = (beta == ZERO);
4723 
4724  // We treat the case of a replicated MV output specially.
4725  const bool Y_is_replicated =
4726  (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4727 
4728  // This is part of the special case for replicated MV output.
4729  // We'll let each process do its thing, but do an all-reduce at
4730  // the end to sum up the results. Setting beta=0 on all processes
4731  // but Proc 0 makes the math work out for the all-reduce. (This
4732  // assumes that the replicated data is correctly replicated, so
4733  // that the data are the same on all processes.)
4734  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4735  beta = ZERO;
4736  }
4737 
4738  // Temporary MV for Import operation. After the block of code
4739  // below, this will be an (Imported if necessary) column Map MV
4740  // ready to give to localApply(...).
4741  RCP<const MV> X_colMap;
4742  if (importer.is_null ()) {
4743  if (! X_in.isConstantStride ()) {
4744  // Not all sparse mat-vec kernels can handle an input MV with
4745  // nonconstant stride correctly, so we have to copy it in that
4746  // case into a constant stride MV. To make a constant stride
4747  // copy of X_in, we force creation of the column (== domain)
4748  // Map MV (if it hasn't already been created, else fetch the
4749  // cached copy). This avoids creating a new MV each time.
4750  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
4751  Tpetra::deep_copy (*X_colMapNonConst, X_in);
4752  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4753  }
4754  else {
4755  // The domain and column Maps are the same, so do the local
4756  // multiply using the domain Map input MV X_in.
4757  X_colMap = rcpFromRef (X_in);
4758  }
4759  }
4760  else { // need to Import source (multi)vector
4761  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
4762 
4763  // We're doing an Import anyway, which will copy the relevant
4764  // elements of the domain Map MV X_in into a separate column Map
4765  // MV. Thus, we don't have to worry whether X_in is constant
4766  // stride.
4767  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
4768 
4769  // Import from the domain Map MV to the column Map MV.
4770  X_colMapNonConst->doImport (X_in, *importer, INSERT);
4771  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4772  }
4773 
4774  // Temporary MV for doExport (if needed), or for copying a
4775  // nonconstant stride output MV into a constant stride MV. This
4776  // is null if we don't need the temporary MV, that is, if the
4777  // Export is trivial (null).
4778  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
4779 
4780  // If we have a nontrivial Export object, we must perform an
4781  // Export. In that case, the local multiply result will go into
4782  // the row Map multivector. We don't have to make a
4783  // constant-stride version of Y_in in this case, because we had to
4784  // make a constant stride Y_rowMap MV and do an Export anyway.
4785  if (! exporter.is_null ()) {
4786  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4787  {
4788  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
4789 
4790  // If we're overwriting the output MV Y_in completely (beta ==
4791  // 0), then make sure that it is filled with zeros before we
4792  // do the Export. Otherwise, the ADD combine mode will use
4793  // data in Y_in, which is supposed to be zero.
4794  if (Y_is_overwritten) {
4795  Y_in.putScalar (ZERO);
4796  }
4797  else {
4798  // Scale output MV by beta, so that doExport sums in the
4799  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4800  Y_in.scale (beta);
4801  }
4802  // Do the Export operation.
4803  Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
4804  }
4805  }
4806  else { // Don't do an Export: row Map and range Map are the same.
4807  //
4808  // If Y_in does not have constant stride, or if the column Map
4809  // MV aliases Y_in, then we can't let the kernel write directly
4810  // to Y_in. Instead, we have to use the cached row (== range)
4811  // Map MV as temporary storage.
4812  //
4813  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4814  // the user passed in the same MultiVector for both X and Y. It
4815  // won't detect whether one MultiVector views the other. We
4816  // should also check the MultiVectors' raw data pointers.
4817  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
4818  // Force creating the MV if it hasn't been created already.
4819  // This will reuse a previously created cached MV.
4820  Y_rowMap = getRowMapMultiVector (Y_in, true);
4821 
4822  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4823  // since we're overwriting it anyway.
4824  if (beta != ZERO) {
4825  Tpetra::deep_copy (*Y_rowMap, Y_in);
4826  }
4827  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
4828  Tpetra::deep_copy (Y_in, *Y_rowMap);
4829  }
4830  else {
4831  this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4832  }
4833  }
4834 
4835  // If the range Map is a locally replicated Map, sum up
4836  // contributions from each process. We set beta = 0 on all
4837  // processes but Proc 0 initially, so this will handle the scaling
4838  // factor beta correctly.
4839  if (Y_is_replicated) {
4840  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
4841  Y_in.reduce ();
4842  }
4843  }
4844 
4845  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4846  void
4850  const Teuchos::ETransp mode,
4851  Scalar alpha,
4852  Scalar beta) const
4853  {
4855  using Teuchos::null;
4856  using Teuchos::RCP;
4857  using Teuchos::rcp;
4858  using Teuchos::rcp_const_cast;
4859  using Teuchos::rcpFromRef;
4860  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4861 
4862  // Take shortcuts for alpha == 0.
4863  if (alpha == ZERO) {
4864  // Follow the Sparse BLAS convention by ignoring both the matrix
4865  // and X_in, in this case.
4866  if (beta == ZERO) {
4867  // Follow the Sparse BLAS convention by overwriting any Inf or
4868  // NaN values in Y_in, in this case.
4869  Y_in.putScalar (ZERO);
4870  }
4871  else {
4872  Y_in.scale (beta);
4873  }
4874  return;
4875  }
4876  else if (beta == ZERO) {
4877  //Thyra was implicitly assuming that Y gets set to zero / or is overwritten
4878  //when bets==0. This was not the case with transpose in a multithreaded
4879  //environment where a multiplication with subsequent atomic_adds is used
4880  //since 0 is effectively not special cased. Doing the explicit set to zero here
4881  //This catches cases where Y is nan or inf.
4882  Y_in.putScalar (ZERO);
4883  }
4884 
4885  const size_t numVectors = X_in.getNumVectors ();
4886 
4887  // We don't allow X_in and Y_in to alias one another. It's hard
4888  // to check this, because advanced users could create views from
4889  // raw pointers. However, if X_in and Y_in reference the same
4890  // object, we will do the user a favor by copying X into new
4891  // storage (with a warning). We only need to do this if we have
4892  // trivial importers; otherwise, we don't actually apply the
4893  // operator from X into Y.
4894  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4895  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4896  // access X indirectly, in case we need to create temporary storage
4897  RCP<const MV> X;
4898 
4899  // some parameters for below
4900  const bool Y_is_replicated = (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4901  const bool Y_is_overwritten = (beta == ZERO);
4902  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4903  beta = ZERO;
4904  }
4905 
4906  // The kernels do not allow input or output with nonconstant stride.
4907  if (! X_in.isConstantStride () && importer.is_null ()) {
4908  X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4909  } else {
4910  X = rcpFromRef (X_in); // Reference to X_in
4911  }
4912 
4913  // Set up temporary multivectors for Import and/or Export.
4914  if (importer != Teuchos::null) {
4915  if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4916  importMV_ = null;
4917  }
4918  if (importMV_ == null) {
4919  importMV_ = rcp (new MV (this->getColMap (), numVectors));
4920  }
4921  }
4922  if (exporter != Teuchos::null) {
4923  if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4924  exportMV_ = null;
4925  }
4926  if (exportMV_ == null) {
4927  exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
4928  }
4929  }
4930 
4931  // If we have a non-trivial exporter, we must import elements that
4932  // are permuted or are on other processors.
4933  if (! exporter.is_null ()) {
4934  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
4935  exportMV_->doImport (X_in, *exporter, INSERT);
4936  X = exportMV_; // multiply out of exportMV_
4937  }
4938 
4939  // If we have a non-trivial importer, we must export elements that
4940  // are permuted or belong to other processors. We will compute
4941  // solution into the to-be-exported MV; get a view.
4942  if (importer != Teuchos::null) {
4943  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
4944 
4945  // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4946  // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4947  // back and figure out why this helps. importMV_ SHOULD be
4948  // completely overwritten in the localApply(...) call
4949  // below, because beta == ZERO there.
4950  importMV_->putScalar (ZERO);
4951  // Do the local computation.
4952  this->localApply (*X, *importMV_, mode, alpha, ZERO);
4953 
4954  if (Y_is_overwritten) {
4955  Y_in.putScalar (ZERO);
4956  } else {
4957  Y_in.scale (beta);
4958  }
4959  Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
4960  }
4961  // otherwise, multiply into Y
4962  else {
4963  // can't multiply in-situ; can't multiply into non-strided multivector
4964  //
4965  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4966  // the user passed in the same MultiVector for both X and Y. It
4967  // won't detect whether one MultiVector views the other. We
4968  // should also check the MultiVectors' raw data pointers.
4969  if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
4970  // Make a deep copy of Y_in, into which to write the multiply result.
4971  MV Y (Y_in, Teuchos::Copy);
4972  this->localApply (*X, Y, mode, alpha, beta);
4973  Tpetra::deep_copy (Y_in, Y);
4974  } else {
4975  this->localApply (*X, Y_in, mode, alpha, beta);
4976  }
4977  }
4978 
4979  // If the range Map is a locally replicated map, sum the
4980  // contributions from each process. (That's why we set beta=0
4981  // above for all processes but Proc 0.)
4982  if (Y_is_replicated) {
4983  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
4984  Y_in.reduce ();
4985  }
4986  }
4987 
4988  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4989  void
4993  const Teuchos::ETransp mode,
4994  const Scalar& alpha,
4995  const Scalar& beta) const
4996  {
4998  using Teuchos::NO_TRANS;
4999  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
5000 
5001  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
5002  auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
5003 
5004  const bool debug = ::Tpetra::Details::Behavior::debug ();
5005  if (debug) {
5006  const char tfecfFuncName[] = "localApply: ";
5007  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5008  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5009  "X.getNumVectors() = " << X.getNumVectors () << " != "
5010  "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5011  const bool transpose = (mode != Teuchos::NO_TRANS);
5012  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5013  (! transpose && X.getLocalLength () !=
5014  getColMap ()->getLocalNumElements (), std::runtime_error,
5015  "NO_TRANS case: X has the wrong number of local rows. "
5016  "X.getLocalLength() = " << X.getLocalLength () << " != "
5017  "getColMap()->getLocalNumElements() = " <<
5018  getColMap ()->getLocalNumElements () << ".");
5019  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5020  (! transpose && Y.getLocalLength () !=
5021  getRowMap ()->getLocalNumElements (), std::runtime_error,
5022  "NO_TRANS case: Y has the wrong number of local rows. "
5023  "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5024  "getRowMap()->getLocalNumElements() = " <<
5025  getRowMap ()->getLocalNumElements () << ".");
5026  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5027  (transpose && X.getLocalLength () !=
5028  getRowMap ()->getLocalNumElements (), std::runtime_error,
5029  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5030  "rows. X.getLocalLength() = " << X.getLocalLength ()
5031  << " != getRowMap()->getLocalNumElements() = "
5032  << getRowMap ()->getLocalNumElements () << ".");
5033  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5034  (transpose && Y.getLocalLength () !=
5035  getColMap ()->getLocalNumElements (), std::runtime_error,
5036  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5037  "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5038  << " != getColMap()->getLocalNumElements() = "
5039  << getColMap ()->getLocalNumElements () << ".");
5040  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5041  (! isFillComplete (), std::runtime_error, "The matrix is not "
5042  "fill complete. You must call fillComplete() (possibly with "
5043  "domain and range Map arguments) without an intervening "
5044  "resumeFill() call before you may call this method.");
5045  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5046  (! X.isConstantStride () || ! Y.isConstantStride (),
5047  std::runtime_error, "X and Y must be constant stride.");
5048  // If the two pointers are null, then they don't alias one
5049  // another, even though they are equal.
5050  // Kokkos does not guarantee that zero row-extent vectors
5051  // point to different places, so we have to check that too.
5052  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5053  (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr
5054  && X_lcl.extent(0) != 0,
5055  std::runtime_error, "X and Y may not alias one another.");
5056  }
5057 
5058 #if KOKKOSKERNELS_VERSION >= 40299
5059  auto A_lcl = getLocalMatrixDevice();
5060 
5061  if(!applyHelper.get()) {
5062  // The apply helper does not exist, so create it.
5063  // Decide now whether to use the imbalanced row path, or the default.
5064  bool useMergePath = false;
5065 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
5066  //TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed and,
5067  //we can use SPMV_MERGE_PATH for the native spmv as well.
5068  //Take out this ifdef to enable that.
5069  //
5070  //Until then, only use SPMV_MERGE_PATH when calling cuSPARSE.
5071  if constexpr(std::is_same_v<execution_space, Kokkos::Cuda>) {
5072  LocalOrdinal nrows = getLocalNumRows();
5073  LocalOrdinal maxRowImbalance = 0;
5074  if(nrows != 0)
5075  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5076 
5077  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5078  useMergePath = true;
5079  }
5080 #endif
5081  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map,
5082  useMergePath ? KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT);
5083  }
5084 
5085  // Translate mode (Teuchos enum) to KokkosKernels (1-character string)
5086  const char* modeKK = nullptr;
5087  switch(mode)
5088  {
5089  case Teuchos::NO_TRANS:
5090  modeKK = KokkosSparse::NoTranspose; break;
5091  case Teuchos::TRANS:
5092  modeKK = KokkosSparse::Transpose; break;
5093  case Teuchos::CONJ_TRANS:
5094  modeKK = KokkosSparse::ConjugateTranspose; break;
5095  default:
5096  throw std::invalid_argument("Tpetra::CrsMatrix::localApply: invalid mode");
5097  }
5098 
5099  if(applyHelper->shouldUseIntRowptrs())
5100  {
5101  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
5102  KokkosSparse::spmv(
5103  &applyHelper->handle_int, modeKK,
5104  impl_scalar_type(alpha), A_lcl_int_rowptrs, X_lcl, impl_scalar_type(beta), Y_lcl);
5105  }
5106  else
5107  {
5108  KokkosSparse::spmv(
5109  &applyHelper->handle, modeKK,
5110  impl_scalar_type(alpha), A_lcl, X_lcl, impl_scalar_type(beta), Y_lcl);
5111  }
5112 #else
5113  LocalOrdinal nrows = getLocalNumRows();
5114  LocalOrdinal maxRowImbalance = 0;
5115  if(nrows != 0)
5116  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5117 
5118  auto matrix_lcl = getLocalMultiplyOperator();
5119  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5120  matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
5121  else
5122  matrix_lcl->apply (X_lcl, Y_lcl, mode, alpha, beta);
5123 #endif
5124  }
5125 
5126  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5127  void
5131  Teuchos::ETransp mode,
5132  Scalar alpha,
5133  Scalar beta) const
5134  {
5136  const char fnName[] = "Tpetra::CrsMatrix::apply";
5137 
5138  TEUCHOS_TEST_FOR_EXCEPTION
5139  (! isFillComplete (), std::runtime_error,
5140  fnName << ": Cannot call apply() until fillComplete() "
5141  "has been called.");
5142 
5143  if (mode == Teuchos::NO_TRANS) {
5144  ProfilingRegion regionNonTranspose (fnName);
5145  this->applyNonTranspose (X, Y, alpha, beta);
5146  }
5147  else {
5148  ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5149  this->applyTranspose (X, Y, mode, alpha, beta);
5150  }
5151  }
5152 
5153 
5154  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5155  template<class T>
5156  Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
5158  convert () const
5159  {
5160  using Teuchos::RCP;
5161  typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
5162  const char tfecfFuncName[] = "convert: ";
5163 
5164  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5165  (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
5166  "of the conversion) is not fill complete. You must first call "
5167  "fillComplete() (possibly with the domain and range Map) without an "
5168  "intervening call to resumeFill(), before you may call this method.");
5169 
5170  RCP<output_matrix_type> newMatrix
5171  (new output_matrix_type (this->getCrsGraph ()));
5172  // Copy old values into new values. impl_scalar_type and T may
5173  // differ, so we can't use Kokkos::deep_copy.
5175  copyConvert (newMatrix->getLocalMatrixDevice ().values,
5176  this->getLocalMatrixDevice ().values);
5177  // Since newmat has a static (const) graph, the graph already has
5178  // a column Map, and Import and Export objects already exist (if
5179  // applicable). Thus, calling fillComplete is cheap.
5180  newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
5181 
5182  return newMatrix;
5183  }
5184 
5185 
5186  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5187  void
5190  {
5191  const bool debug = ::Tpetra::Details::Behavior::debug ("CrsGraph");
5192  if (debug) {
5193  const char tfecfFuncName[] = "checkInternalState: ";
5194  const char err[] = "Internal state is not consistent. "
5195  "Please report this bug to the Tpetra developers.";
5196 
5197  // This version of the graph (RCP<const crs_graph_type>) must
5198  // always be nonnull.
5199  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5200  (staticGraph_.is_null (), std::logic_error, err);
5201  // myGraph == null means that the matrix has a const ("static")
5202  // graph. Otherwise, the matrix has a dynamic graph (it owns its
5203  // graph).
5204  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5205  (! myGraph_.is_null () && myGraph_ != staticGraph_,
5206  std::logic_error, err);
5207  // if matrix is fill complete, then graph must be fill complete
5208  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5209  (isFillComplete () && ! staticGraph_->isFillComplete (),
5210  std::logic_error, err << " Specifically, the matrix is fill complete, "
5211  "but its graph is NOT fill complete.");
5212  // if values are allocated and they are non-zero in number, then
5213  // one of the allocations should be present
5214  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5215  (staticGraph_->indicesAreAllocated () &&
5216  staticGraph_->getLocalAllocationSize() > 0 &&
5217  staticGraph_->getLocalNumRows() > 0 &&
5218  valuesUnpacked_wdv.extent (0) == 0,
5219  std::logic_error, err);
5220  }
5221  }
5222 
5223  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5224  std::string
5227  {
5228  std::ostringstream os;
5229 
5230  os << "Tpetra::CrsMatrix (Kokkos refactor): {";
5231  if (this->getObjectLabel () != "") {
5232  os << "Label: \"" << this->getObjectLabel () << "\", ";
5233  }
5234  if (isFillComplete ()) {
5235  os << "isFillComplete: true"
5236  << ", global dimensions: [" << getGlobalNumRows () << ", "
5237  << getGlobalNumCols () << "]"
5238  << ", global number of entries: " << getGlobalNumEntries ()
5239  << "}";
5240  }
5241  else {
5242  os << "isFillComplete: false"
5243  << ", global dimensions: [" << getGlobalNumRows () << ", "
5244  << getGlobalNumCols () << "]}";
5245  }
5246  return os.str ();
5247  }
5248 
5249  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5250  void
5252  describe (Teuchos::FancyOStream &out,
5253  const Teuchos::EVerbosityLevel verbLevel) const
5254  {
5255  using std::endl;
5256  using std::setw;
5257  using Teuchos::ArrayView;
5258  using Teuchos::Comm;
5259  using Teuchos::RCP;
5260  using Teuchos::TypeNameTraits;
5261  using Teuchos::VERB_DEFAULT;
5262  using Teuchos::VERB_NONE;
5263  using Teuchos::VERB_LOW;
5264  using Teuchos::VERB_MEDIUM;
5265  using Teuchos::VERB_HIGH;
5266  using Teuchos::VERB_EXTREME;
5267 
5268  const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
5269 
5270  if (vl == VERB_NONE) {
5271  return; // Don't print anything at all
5272  }
5273 
5274  // By convention, describe() always begins with a tab.
5275  Teuchos::OSTab tab0 (out);
5276 
5277  RCP<const Comm<int> > comm = this->getComm();
5278  const int myRank = comm->getRank();
5279  const int numProcs = comm->getSize();
5280  size_t width = 1;
5281  for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
5282  ++width;
5283  }
5284  width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
5285 
5286  // none: print nothing
5287  // low: print O(1) info from node 0
5288  // medium: print O(P) info, num entries per process
5289  // high: print O(N) info, num entries per row
5290  // extreme: print O(NNZ) info: print indices and values
5291  //
5292  // for medium and higher, print constituent objects at specified verbLevel
5293  if (myRank == 0) {
5294  out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5295  }
5296  Teuchos::OSTab tab1 (out);
5297 
5298  if (myRank == 0) {
5299  if (this->getObjectLabel () != "") {
5300  out << "Label: \"" << this->getObjectLabel () << "\", ";
5301  }
5302  {
5303  out << "Template parameters:" << endl;
5304  Teuchos::OSTab tab2 (out);
5305  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
5306  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
5307  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
5308  << "Node: " << TypeNameTraits<Node>::name () << endl;
5309  }
5310  if (isFillComplete()) {
5311  out << "isFillComplete: true" << endl
5312  << "Global dimensions: [" << getGlobalNumRows () << ", "
5313  << getGlobalNumCols () << "]" << endl
5314  << "Global number of entries: " << getGlobalNumEntries () << endl
5315  << endl << "Global max number of entries in a row: "
5316  << getGlobalMaxNumRowEntries () << endl;
5317  }
5318  else {
5319  out << "isFillComplete: false" << endl
5320  << "Global dimensions: [" << getGlobalNumRows () << ", "
5321  << getGlobalNumCols () << "]" << endl;
5322  }
5323  }
5324 
5325  if (vl < VERB_MEDIUM) {
5326  return; // all done!
5327  }
5328 
5329  // Describe the row Map.
5330  if (myRank == 0) {
5331  out << endl << "Row Map:" << endl;
5332  }
5333  if (getRowMap ().is_null ()) {
5334  if (myRank == 0) {
5335  out << "null" << endl;
5336  }
5337  }
5338  else {
5339  if (myRank == 0) {
5340  out << endl;
5341  }
5342  getRowMap ()->describe (out, vl);
5343  }
5344 
5345  // Describe the column Map.
5346  if (myRank == 0) {
5347  out << "Column Map: ";
5348  }
5349  if (getColMap ().is_null ()) {
5350  if (myRank == 0) {
5351  out << "null" << endl;
5352  }
5353  } else if (getColMap () == getRowMap ()) {
5354  if (myRank == 0) {
5355  out << "same as row Map" << endl;
5356  }
5357  } else {
5358  if (myRank == 0) {
5359  out << endl;
5360  }
5361  getColMap ()->describe (out, vl);
5362  }
5363 
5364  // Describe the domain Map.
5365  if (myRank == 0) {
5366  out << "Domain Map: ";
5367  }
5368  if (getDomainMap ().is_null ()) {
5369  if (myRank == 0) {
5370  out << "null" << endl;
5371  }
5372  } else if (getDomainMap () == getRowMap ()) {
5373  if (myRank == 0) {
5374  out << "same as row Map" << endl;
5375  }
5376  } else if (getDomainMap () == getColMap ()) {
5377  if (myRank == 0) {
5378  out << "same as column Map" << endl;
5379  }
5380  } else {
5381  if (myRank == 0) {
5382  out << endl;
5383  }
5384  getDomainMap ()->describe (out, vl);
5385  }
5386 
5387  // Describe the range Map.
5388  if (myRank == 0) {
5389  out << "Range Map: ";
5390  }
5391  if (getRangeMap ().is_null ()) {
5392  if (myRank == 0) {
5393  out << "null" << endl;
5394  }
5395  } else if (getRangeMap () == getDomainMap ()) {
5396  if (myRank == 0) {
5397  out << "same as domain Map" << endl;
5398  }
5399  } else if (getRangeMap () == getRowMap ()) {
5400  if (myRank == 0) {
5401  out << "same as row Map" << endl;
5402  }
5403  } else {
5404  if (myRank == 0) {
5405  out << endl;
5406  }
5407  getRangeMap ()->describe (out, vl);
5408  }
5409 
5410  // O(P) data
5411  for (int curRank = 0; curRank < numProcs; ++curRank) {
5412  if (myRank == curRank) {
5413  out << "Process rank: " << curRank << endl;
5414  Teuchos::OSTab tab2 (out);
5415  if (! staticGraph_->indicesAreAllocated ()) {
5416  out << "Graph indices not allocated" << endl;
5417  }
5418  else {
5419  out << "Number of allocated entries: "
5420  << staticGraph_->getLocalAllocationSize () << endl;
5421  }
5422  out << "Number of entries: " << getLocalNumEntries () << endl
5423  << "Max number of entries per row: " << getLocalMaxNumRowEntries ()
5424  << endl;
5425  }
5426  // Give output time to complete by executing some barriers.
5427  comm->barrier ();
5428  comm->barrier ();
5429  comm->barrier ();
5430  }
5431 
5432  if (vl < VERB_HIGH) {
5433  return; // all done!
5434  }
5435 
5436  // O(N) and O(NNZ) data
5437  for (int curRank = 0; curRank < numProcs; ++curRank) {
5438  if (myRank == curRank) {
5439  out << std::setw(width) << "Proc Rank"
5440  << std::setw(width) << "Global Row"
5441  << std::setw(width) << "Num Entries";
5442  if (vl == VERB_EXTREME) {
5443  out << std::setw(width) << "(Index,Value)";
5444  }
5445  out << endl;
5446  for (size_t r = 0; r < getLocalNumRows (); ++r) {
5447  const size_t nE = getNumEntriesInLocalRow(r);
5448  GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5449  out << std::setw(width) << myRank
5450  << std::setw(width) << gid
5451  << std::setw(width) << nE;
5452  if (vl == VERB_EXTREME) {
5453  if (isGloballyIndexed()) {
5454  global_inds_host_view_type rowinds;
5455  values_host_view_type rowvals;
5456  getGlobalRowView (gid, rowinds, rowvals);
5457  for (size_t j = 0; j < nE; ++j) {
5458  out << " (" << rowinds[j]
5459  << ", " << rowvals[j]
5460  << ") ";
5461  }
5462  }
5463  else if (isLocallyIndexed()) {
5464  local_inds_host_view_type rowinds;
5465  values_host_view_type rowvals;
5466  getLocalRowView (r, rowinds, rowvals);
5467  for (size_t j=0; j < nE; ++j) {
5468  out << " (" << getColMap()->getGlobalElement(rowinds[j])
5469  << ", " << rowvals[j]
5470  << ") ";
5471  }
5472  } // globally or locally indexed
5473  } // vl == VERB_EXTREME
5474  out << endl;
5475  } // for each row r on this process
5476  } // if (myRank == curRank)
5477 
5478  // Give output time to complete
5479  comm->barrier ();
5480  comm->barrier ();
5481  comm->barrier ();
5482  } // for each process p
5483  }
5484 
5485  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5486  bool
5489  {
5490  // It's not clear what kind of compatibility checks on sizes can
5491  // be performed here. Epetra_CrsGraph doesn't check any sizes for
5492  // compatibility.
5493 
5494  // Currently, the source object must be a RowMatrix with the same
5495  // four template parameters as the target CrsMatrix. We might
5496  // relax this requirement later.
5497  const row_matrix_type* srcRowMat =
5498  dynamic_cast<const row_matrix_type*> (&source);
5499  return (srcRowMat != nullptr);
5500  }
5501 
5502  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5503  void
5506  const typename crs_graph_type::padding_type& padding,
5507  const bool verbose)
5508  {
5510  using Details::padCrsArrays;
5511  using std::endl;
5512  using LO = local_ordinal_type;
5513  using row_ptrs_type =
5514  typename local_graph_device_type::row_map_type::non_const_type;
5515  using range_policy =
5516  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5517  const char tfecfFuncName[] = "applyCrsPadding";
5518  const char suffix[] =
5519  ". Please report this bug to the Tpetra developers.";
5520  ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5521 
5522  std::unique_ptr<std::string> prefix;
5523  if (verbose) {
5524  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5525  std::ostringstream os;
5526  os << *prefix << "padding: ";
5527  padding.print(os);
5528  os << endl;
5529  std::cerr << os.str();
5530  }
5531  const int myRank = ! verbose ? -1 : [&] () {
5532  auto map = this->getMap();
5533  if (map.is_null()) {
5534  return -1;
5535  }
5536  auto comm = map->getComm();
5537  if (comm.is_null()) {
5538  return -1;
5539  }
5540  return comm->getRank();
5541  } ();
5542 
5543  // NOTE (mfh 29 Jan 2020) This allocates the values array.
5544  if (! myGraph_->indicesAreAllocated()) {
5545  if (verbose) {
5546  std::ostringstream os;
5547  os << *prefix << "Call allocateIndices" << endl;
5548  std::cerr << os.str();
5549  }
5550  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5551  }
5552 
5553  // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5554  // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5555  // size needs to increase. That should be the job of
5556  // padCrsArrays.
5557 
5558  // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5559  // would use it directly.
5560 
5561  if (verbose) {
5562  std::ostringstream os;
5563  os << *prefix << "Allocate row_ptrs_beg: "
5564  << myGraph_->getRowPtrsUnpackedHost().extent(0) << endl;
5565  std::cerr << os.str();
5566  }
5567  using Kokkos::view_alloc;
5568  using Kokkos::WithoutInitializing;
5569  row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5570  myGraph_->rowPtrsUnpacked_dev_.extent(0));
5571  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5572  Kokkos::deep_copy(execution_space(),row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5573 
5574  const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
5575  size_t(row_ptr_beg.extent(0) - 1);
5576  if (verbose) {
5577  std::ostringstream os;
5578  os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5579  std::cerr << os.str();
5580  }
5581  row_ptrs_type row_ptr_end(
5582  view_alloc("row_ptr_end", WithoutInitializing), N);
5583 
5584  row_ptrs_type num_row_entries_d;
5585 
5586  const bool refill_num_row_entries =
5587  myGraph_->k_numRowEntries_.extent(0) != 0;
5588 
5589  if (refill_num_row_entries) { // unpacked storage
5590  // We can't assume correct *this capture until C++17, and it's
5591  // likely more efficient just to capture what we need anyway.
5592  num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5593  myGraph_->k_numRowEntries_);
5594  Kokkos::parallel_for
5595  ("Fill end row pointers", range_policy(0, N),
5596  KOKKOS_LAMBDA (const size_t i) {
5597  row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5598  });
5599  }
5600  else {
5601  // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5602  // storage, we don't need row_ptr_end to be separate allocation;
5603  // could just have it alias row_ptr_beg+1.
5604  Kokkos::parallel_for
5605  ("Fill end row pointers", range_policy(0, N),
5606  KOKKOS_LAMBDA (const size_t i) {
5607  row_ptr_end(i) = row_ptr_beg(i+1);
5608  });
5609  }
5610 
5611  if (myGraph_->isGloballyIndexed()) {
5612  padCrsArrays(row_ptr_beg, row_ptr_end,
5613  myGraph_->gblInds_wdv,
5614  valuesUnpacked_wdv, padding, myRank, verbose);
5615  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5616  const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5617  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5618  (newValuesLen != newColIndsLen, std::logic_error,
5619  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5620  << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5621  << suffix);
5622  }
5623  else {
5624  padCrsArrays(row_ptr_beg, row_ptr_end,
5625  myGraph_->lclIndsUnpacked_wdv,
5626  valuesUnpacked_wdv, padding, myRank, verbose);
5627  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5628  const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5629  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5630  (newValuesLen != newColIndsLen, std::logic_error,
5631  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5632  << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5633  << suffix);
5634  }
5635 
5636  if (refill_num_row_entries) {
5637  Kokkos::parallel_for
5638  ("Fill num entries", range_policy(0, N),
5639  KOKKOS_LAMBDA (const size_t i) {
5640  num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5641  });
5642  Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5643  }
5644 
5645  if (verbose) {
5646  std::ostringstream os;
5647  os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5648  << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5649  << ", new size: " << row_ptr_beg.extent(0) << endl;
5650  std::cerr << os.str();
5651  TEUCHOS_ASSERT( myGraph_->getRowPtrsUnpackedHost().extent(0) ==
5652  row_ptr_beg.extent(0) );
5653  }
5654  myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5655  }
5656 
5657  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5658  void
5659  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5660  copyAndPermuteStaticGraph(
5661  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5662  const size_t numSameIDs,
5663  const LocalOrdinal permuteToLIDs[],
5664  const LocalOrdinal permuteFromLIDs[],
5665  const size_t numPermutes)
5666  {
5667  using Details::ProfilingRegion;
5668  using Teuchos::Array;
5669  using Teuchos::ArrayView;
5670  using std::endl;
5671  using LO = LocalOrdinal;
5672  using GO = GlobalOrdinal;
5673  const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5674  const char suffix[] =
5675  " Please report this bug to the Tpetra developers.";
5676  ProfilingRegion regionCAP
5677  ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5678 
5679  const bool debug = Details::Behavior::debug("CrsGraph");
5680  const bool verbose = Details::Behavior::verbose("CrsGraph");
5681  std::unique_ptr<std::string> prefix;
5682  if (verbose) {
5683  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5684  std::ostringstream os;
5685  os << *prefix << "Start" << endl;
5686  }
5687  const char* const prefix_raw =
5688  verbose ? prefix.get()->c_str() : nullptr;
5689 
5690  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5691  //
5692  // Copy the first numSame row from source to target (this matrix).
5693  // This involves copying rows corresponding to LIDs [0, numSame-1].
5694  //
5695  const map_type& srcRowMap = * (srcMat.getRowMap ());
5696  nonconst_global_inds_host_view_type rowInds;
5697  nonconst_values_host_view_type rowVals;
5698  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5699  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5700  // Global ID for the current row index in the source matrix.
5701  // The first numSameIDs GIDs in the two input lists are the
5702  // same, so sourceGID == targetGID in this case.
5703  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5704  const GO targetGID = sourceGID;
5705 
5706  ArrayView<const GO>rowIndsConstView;
5707  ArrayView<const Scalar> rowValsConstView;
5708 
5709  if (sourceIsLocallyIndexed) {
5710  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5711  if (rowLength > static_cast<size_t> (rowInds.size())) {
5712  Kokkos::resize(rowInds,rowLength);
5713  Kokkos::resize(rowVals,rowLength);
5714  }
5715  // Resizing invalidates an Array's views, so we must make new
5716  // ones, even if rowLength hasn't changed.
5717  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5718  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5719 
5720  // The source matrix is locally indexed, so we have to get a
5721  // copy. Really it's the GIDs that have to be copied (because
5722  // they have to be converted from LIDs).
5723  size_t checkRowLength = 0;
5724  srcMat.getGlobalRowCopy (sourceGID, rowIndsView,
5725  rowValsView, checkRowLength);
5726  if (debug) {
5727  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5728  (rowLength != checkRowLength, std::logic_error, "For "
5729  "global row index " << sourceGID << ", the source "
5730  "matrix's getNumEntriesInGlobalRow returns a row length "
5731  "of " << rowLength << ", but getGlobalRowCopy reports "
5732  "a row length of " << checkRowLength << "." << suffix);
5733  }
5734 
5735  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5736  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5737  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5738  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5739  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5740  rowIndsView.data(), rowIndsView.extent(0),
5741  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5742  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5743  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5744  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5745  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5746  // KDDKDD UVM TEMPORARY: KokkosView interface
5747  }
5748  else { // source matrix is globally indexed.
5749  global_inds_host_view_type rowIndsView;
5750  values_host_view_type rowValsView;
5751  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5752  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5753  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5754  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5755  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5756  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5757  rowIndsView.data(), rowIndsView.extent(0),
5758  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5759  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5760  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5761  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5762  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5763  // KDDKDD UVM TEMPORARY: KokkosView interface
5764 
5765  }
5766 
5767  // Applying a permutation to a matrix with a static graph
5768  // means REPLACE-ing entries.
5769  combineGlobalValues(targetGID, rowIndsConstView,
5770  rowValsConstView, REPLACE,
5771  prefix_raw, debug, verbose);
5772  }
5773 
5774  if (verbose) {
5775  std::ostringstream os;
5776  os << *prefix << "Do permutes" << endl;
5777  }
5778 
5779  const map_type& tgtRowMap = * (this->getRowMap ());
5780  for (size_t p = 0; p < numPermutes; ++p) {
5781  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5782  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5783 
5784  ArrayView<const GO> rowIndsConstView;
5785  ArrayView<const Scalar> rowValsConstView;
5786 
5787  if (sourceIsLocallyIndexed) {
5788  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5789  if (rowLength > static_cast<size_t> (rowInds.size ())) {
5790  Kokkos::resize(rowInds,rowLength);
5791  Kokkos::resize(rowVals,rowLength);
5792  }
5793  // Resizing invalidates an Array's views, so we must make new
5794  // ones, even if rowLength hasn't changed.
5795  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5796  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5797 
5798  // The source matrix is locally indexed, so we have to get a
5799  // copy. Really it's the GIDs that have to be copied (because
5800  // they have to be converted from LIDs).
5801  size_t checkRowLength = 0;
5802  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5803  rowValsView, checkRowLength);
5804  if (debug) {
5805  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5806  (rowLength != checkRowLength, std::logic_error, "For "
5807  "source matrix global row index " << sourceGID << ", "
5808  "getNumEntriesInGlobalRow returns a row length of " <<
5809  rowLength << ", but getGlobalRowCopy a row length of "
5810  << checkRowLength << "." << suffix);
5811  }
5812 
5813  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5814  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5815  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5816  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5817  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5818  rowIndsView.data(), rowIndsView.extent(0),
5819  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5820  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5821  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5822  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5823  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5824  // KDDKDD UVM TEMPORARY: KokkosView interface
5825  }
5826  else {
5827  global_inds_host_view_type rowIndsView;
5828  values_host_view_type rowValsView;
5829  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5830  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5831  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5832  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5833  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5834  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5835  rowIndsView.data(), rowIndsView.extent(0),
5836  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5837  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5838  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5839  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5840  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5841  // KDDKDD UVM TEMPORARY: KokkosView interface
5842  }
5843 
5844  combineGlobalValues(targetGID, rowIndsConstView,
5845  rowValsConstView, REPLACE,
5846  prefix_raw, debug, verbose);
5847  }
5848 
5849  if (verbose) {
5850  std::ostringstream os;
5851  os << *prefix << "Done" << endl;
5852  }
5853  }
5854 
5855  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5856  void
5857  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5858  copyAndPermuteNonStaticGraph(
5859  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5860  const size_t numSameIDs,
5861  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5862  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5863  const size_t numPermutes)
5864  {
5865  using Details::ProfilingRegion;
5866  using Teuchos::Array;
5867  using Teuchos::ArrayView;
5868  using std::endl;
5869  using LO = LocalOrdinal;
5870  using GO = GlobalOrdinal;
5871  const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5872  const char suffix[] =
5873  " Please report this bug to the Tpetra developers.";
5874  ProfilingRegion regionCAP
5875  ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5876 
5877  const bool debug = Details::Behavior::debug("CrsGraph");
5878  const bool verbose = Details::Behavior::verbose("CrsGraph");
5879  std::unique_ptr<std::string> prefix;
5880  if (verbose) {
5881  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5882  std::ostringstream os;
5883  os << *prefix << "Start" << endl;
5884  }
5885  const char* const prefix_raw =
5886  verbose ? prefix.get()->c_str() : nullptr;
5887 
5888  {
5889  using row_graph_type = RowGraph<LO, GO, Node>;
5890  const row_graph_type& srcGraph = *(srcMat.getGraph());
5891  auto padding =
5892  myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5893  permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5894  applyCrsPadding(*padding, verbose);
5895  }
5896  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5897  //
5898  // Copy the first numSame row from source to target (this matrix).
5899  // This involves copying rows corresponding to LIDs [0, numSame-1].
5900  //
5901  const map_type& srcRowMap = * (srcMat.getRowMap ());
5902  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5903  using gids_type = nonconst_global_inds_host_view_type;
5904  using vals_type = nonconst_values_host_view_type;
5905  gids_type rowInds;
5906  vals_type rowVals;
5907  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5908  // Global ID for the current row index in the source matrix.
5909  // The first numSameIDs GIDs in the two input lists are the
5910  // same, so sourceGID == targetGID in this case.
5911  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5912  const GO targetGID = sourceGID;
5913 
5914  ArrayView<const GO> rowIndsConstView;
5915  ArrayView<const Scalar> rowValsConstView;
5916 
5917  if (sourceIsLocallyIndexed) {
5918 
5919  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5920  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5921  Kokkos::resize(rowInds,rowLength);
5922  Kokkos::resize(rowVals,rowLength);
5923  }
5924  // Resizing invalidates an Array's views, so we must make new
5925  // ones, even if rowLength hasn't changed.
5926  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5927  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5928 
5929  // The source matrix is locally indexed, so we have to get a
5930  // copy. Really it's the GIDs that have to be copied (because
5931  // they have to be converted from LIDs).
5932  size_t checkRowLength = 0;
5933  srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView,
5934  checkRowLength);
5935  if (debug) {
5936  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5937  (rowLength != checkRowLength, std::logic_error, ": For "
5938  "global row index " << sourceGID << ", the source "
5939  "matrix's getNumEntriesInGlobalRow returns a row length "
5940  "of " << rowLength << ", but getGlobalRowCopy reports "
5941  "a row length of " << checkRowLength << "." << suffix);
5942  }
5943  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5944  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5945  }
5946  else { // source matrix is globally indexed.
5947  global_inds_host_view_type rowIndsView;
5948  values_host_view_type rowValsView;
5949  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5950 
5951  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5952  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5953  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5954  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5955  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5956  rowIndsView.data(), rowIndsView.extent(0),
5957  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5958  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5959  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5960  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5961  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5962  // KDDKDD UVM TEMPORARY: KokkosView interface
5963  }
5964 
5965  // Combine the data into the target matrix.
5966  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5967  rowValsConstView, prefix_raw, debug, verbose);
5968  }
5969 
5970  if (verbose) {
5971  std::ostringstream os;
5972  os << *prefix << "Do permutes" << endl;
5973  }
5974  const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
5975  const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
5976 
5977  const map_type& tgtRowMap = * (this->getRowMap ());
5978  for (size_t p = 0; p < numPermutes; ++p) {
5979  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5980  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5981 
5982  ArrayView<const GO> rowIndsConstView;
5983  ArrayView<const Scalar> rowValsConstView;
5984 
5985  if (sourceIsLocallyIndexed) {
5986  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5987  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5988  Kokkos::resize(rowInds,rowLength);
5989  Kokkos::resize(rowVals,rowLength);
5990  }
5991  // Resizing invalidates an Array's views, so we must make new
5992  // ones, even if rowLength hasn't changed.
5993  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5994  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5995 
5996  // The source matrix is locally indexed, so we have to get a
5997  // copy. Really it's the GIDs that have to be copied (because
5998  // they have to be converted from LIDs).
5999  size_t checkRowLength = 0;
6000  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
6001  rowValsView, checkRowLength);
6002  if (debug) {
6003  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6004  (rowLength != checkRowLength, std::logic_error, "For "
6005  "source matrix global row index " << sourceGID << ", "
6006  "getNumEntriesInGlobalRow returns a row length of " <<
6007  rowLength << ", but getGlobalRowCopy a row length of "
6008  << checkRowLength << "." << suffix);
6009  }
6010  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
6011  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
6012  }
6013  else {
6014  global_inds_host_view_type rowIndsView;
6015  values_host_view_type rowValsView;
6016  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6017 
6018  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6019  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6020  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6021  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6022  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6023  rowIndsView.data(), rowIndsView.extent(0),
6024  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6025  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6026  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6027  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6028  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6029  // KDDKDD UVM TEMPORARY: KokkosView interface
6030  }
6031 
6032  // Combine the data into the target matrix.
6033  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
6034  rowValsConstView, prefix_raw, debug, verbose);
6035  }
6036 
6037  if (verbose) {
6038  std::ostringstream os;
6039  os << *prefix << "Done" << endl;
6040  }
6041  }
6042 
6043  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6044  void
6047  const SrcDistObject& srcObj,
6048  const size_t numSameIDs,
6049  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6050  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
6051  const CombineMode /*CM*/)
6052  {
6053  using Details::Behavior;
6056  using std::endl;
6057 
6058  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6059  const char tfecfFuncName[] = "copyAndPermute: ";
6060  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
6061 
6062  const bool verbose = Behavior::verbose("CrsMatrix");
6063  std::unique_ptr<std::string> prefix;
6064  if (verbose) {
6065  prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
6066  std::ostringstream os;
6067  os << *prefix << endl
6068  << *prefix << " numSameIDs: " << numSameIDs << endl
6069  << *prefix << " numPermute: " << permuteToLIDs.extent(0)
6070  << endl
6071  << *prefix << " "
6072  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
6073  << endl
6074  << *prefix << " "
6075  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
6076  << endl
6077  << *prefix << " "
6078  << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
6079  << endl;
6080  std::cerr << os.str ();
6081  }
6082 
6083  const auto numPermute = permuteToLIDs.extent (0);
6084  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6085  (numPermute != permuteFromLIDs.extent (0),
6086  std::invalid_argument, "permuteToLIDs.extent(0) = "
6087  << numPermute << "!= permuteFromLIDs.extent(0) = "
6088  << permuteFromLIDs.extent (0) << ".");
6089 
6090  // This dynamic cast should succeed, because we've already tested
6091  // it in checkSizes().
6093  const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
6094  if (isStaticGraph ()) {
6095  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
6096  auto permuteToLIDs_h = permuteToLIDs.view_host ();
6097  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
6098  auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
6099 
6100  copyAndPermuteStaticGraph(srcMat, numSameIDs,
6101  permuteToLIDs_h.data(),
6102  permuteFromLIDs_h.data(),
6103  numPermute);
6104  }
6105  else {
6106  copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
6107  permuteFromLIDs, numPermute);
6108  }
6109 
6110  if (verbose) {
6111  std::ostringstream os;
6112  os << *prefix << "Done" << endl;
6113  std::cerr << os.str();
6114  }
6115  }
6116 
6117  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6118  void
6121  (const SrcDistObject& source,
6122  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6123  Kokkos::DualView<char*, buffer_device_type>& exports,
6124  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6125  size_t& constantNumPackets)
6126  {
6127  using Details::Behavior;
6130  using Teuchos::outArg;
6131  using Teuchos::REDUCE_MAX;
6132  using Teuchos::reduceAll;
6133  using std::endl;
6134  typedef LocalOrdinal LO;
6135  typedef GlobalOrdinal GO;
6136  const char tfecfFuncName[] = "packAndPrepare: ";
6137  ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
6138 
6139  const bool debug = Behavior::debug("CrsMatrix");
6140  const bool verbose = Behavior::verbose("CrsMatrix");
6141 
6142  // Processes on which the communicator is null should not participate.
6143  Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
6144  if (pComm.is_null ()) {
6145  return;
6146  }
6147  const Teuchos::Comm<int>& comm = *pComm;
6148  const int myRank = comm.getSize ();
6149 
6150  std::unique_ptr<std::string> prefix;
6151  if (verbose) {
6152  prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
6153  std::ostringstream os;
6154  os << *prefix << "Start" << endl
6155  << *prefix << " "
6156  << dualViewStatusToString (exportLIDs, "exportLIDs")
6157  << endl
6158  << *prefix << " "
6159  << dualViewStatusToString (exports, "exports")
6160  << endl
6161  << *prefix << " "
6162  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6163  << endl;
6164  std::cerr << os.str ();
6165  }
6166 
6167  // Attempt to cast the source object to CrsMatrix. If successful,
6168  // use the source object's packNew() method to pack its data for
6169  // communication. Otherwise, attempt to cast to RowMatrix; if
6170  // successful, use the source object's pack() method. Otherwise,
6171  // the source object doesn't have the right type.
6172  //
6173  // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
6174  // RowMatrix to have the same Node type. Unfortunately, we don't
6175  // have a way to ask if the RowMatrix is "a RowMatrix with any
6176  // Node type," since RowMatrix doesn't have a base class. A
6177  // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
6178  // not currently exist, would satisfy this requirement.
6179  //
6180  // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
6181  // type doesn't technically need to match the target object's
6182  // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
6183  // and GO need not be the same, as long as there is no overflow of
6184  // the indices. However, checking for index overflow is global
6185  // and therefore undesirable.
6186 
6187  std::ostringstream msg; // for collecting error messages
6188  int lclBad = 0; // to be set below
6189 
6190  using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
6191  const crs_matrix_type* srcCrsMat =
6192  dynamic_cast<const crs_matrix_type*> (&source);
6193  if (srcCrsMat != nullptr) {
6194  if (verbose) {
6195  std::ostringstream os;
6196  os << *prefix << "Source matrix same (CrsMatrix) type as target; "
6197  "calling packNew" << endl;
6198  std::cerr << os.str ();
6199  }
6200  try {
6201  srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
6202  constantNumPackets);
6203  }
6204  catch (std::exception& e) {
6205  lclBad = 1;
6206  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6207  }
6208  }
6209  else {
6210  using Kokkos::HostSpace;
6211  using Kokkos::subview;
6212  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6213  using range_type = Kokkos::pair<size_t, size_t>;
6214 
6215  if (verbose) {
6216  std::ostringstream os;
6217  os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
6218  << endl;
6219  std::cerr << os.str ();
6220  }
6221 
6222  const row_matrix_type* srcRowMat =
6223  dynamic_cast<const row_matrix_type*> (&source);
6224  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6225  (srcRowMat == nullptr, std::invalid_argument,
6226  "The source object of the Import or Export operation is neither a "
6227  "CrsMatrix (with the same template parameters as the target object), "
6228  "nor a RowMatrix (with the same first four template parameters as the "
6229  "target object).");
6230 
6231  // For the RowMatrix case, we need to convert from
6232  // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
6233  // so terribly efficient, since packing a non-CrsMatrix
6234  // RowMatrix for Import/Export into a CrsMatrix is not a
6235  // critical case. Thus, we may allocate Teuchos::Array objects
6236  // here and copy to and from Kokkos::*View.
6237 
6238  // View exportLIDs's host data as a Teuchos::ArrayView.
6239  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6240  auto exportLIDs_h = exportLIDs.view_host ();
6241  Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
6242  exportLIDs_h.size ());
6243 
6244  // pack() will allocate exports_a as needed. We'll copy back
6245  // into exports (after (re)allocating exports if needed) below.
6246  Teuchos::Array<char> exports_a;
6247 
6248  // View exportLIDs' host data as a Teuchos::ArrayView. We don't
6249  // need to sync, since we're doing write-only access, but we do
6250  // need to mark the DualView as modified on host.
6251 
6252  numPacketsPerLID.clear_sync_state (); // write-only access
6253  numPacketsPerLID.modify_host ();
6254  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6255  Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
6256  numPacketsPerLID_h.size ());
6257 
6258  // Invoke RowMatrix's legacy pack() interface, using above
6259  // Teuchos::Array* objects.
6260  try {
6261  srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
6262  constantNumPackets);
6263  }
6264  catch (std::exception& e) {
6265  lclBad = 1;
6266  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6267  }
6268 
6269  // Allocate 'exports', and copy exports_a back into it.
6270  const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
6271  if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
6272  const std::string oldLabel = exports.d_view.label ();
6273  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6274  exports = exports_type (newLabel, newAllocSize);
6275  }
6276  // It's safe to assume that we're working on host anyway, so
6277  // just keep exports sync'd to host.
6278  // ignore current device contents
6279  exports.modify_host();
6280 
6281  auto exports_h = exports.view_host ();
6282  auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
6283 
6284  // Kokkos::deep_copy needs a Kokkos::View input, so turn
6285  // exports_a into a nonowning Kokkos::View first before copying.
6286  typedef typename exports_type::t_host::execution_space HES;
6287  typedef Kokkos::Device<HES, HostSpace> host_device_type;
6288  Kokkos::View<const char*, host_device_type>
6289  exports_a_kv (exports_a.getRawPtr (), newAllocSize);
6290  // DEEP_COPY REVIEW - NOT TESTED
6291  Kokkos::deep_copy (exports_h_sub, exports_a_kv);
6292  }
6293 
6294  if (debug) {
6295  int gblBad = 0; // output argument; to be set below
6296  reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
6297  if (gblBad != 0) {
6298  Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
6299  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6300  (true, std::logic_error, "packNew() or pack() threw an exception on "
6301  "one or more participating processes.");
6302  }
6303  }
6304  else {
6305  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6306  (lclBad != 0, std::logic_error, "packNew threw an exception on one "
6307  "or more participating processes. Here is this process' error "
6308  "message: " << msg.str ());
6309  }
6310 
6311  if (verbose) {
6312  std::ostringstream os;
6313  os << *prefix << "packAndPrepare: Done!" << endl
6314  << *prefix << " "
6315  << dualViewStatusToString (exportLIDs, "exportLIDs")
6316  << endl
6317  << *prefix << " "
6318  << dualViewStatusToString (exports, "exports")
6319  << endl
6320  << *prefix << " "
6321  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6322  << endl;
6323  std::cerr << os.str ();
6324  }
6325  }
6326 
6327  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6328  size_t
6329  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6330  packRow (char exports[],
6331  const size_t offset,
6332  const size_t numEnt,
6333