Tpetra parallel linear algebra  Version of the Day
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
Tpetra_CrsMatrix_def.hpp
Go to the documentation of this file.
1 // @HEADER
2 // *****************************************************************************
3 // Tpetra: Templated Linear Algebra Services Package
4 //
5 // Copyright 2008 NTESS and the Tpetra contributors.
6 // SPDX-License-Identifier: BSD-3-Clause
7 // *****************************************************************************
8 // @HEADER
9 
10 #ifndef TPETRA_CRSMATRIX_DEF_HPP
11 #define TPETRA_CRSMATRIX_DEF_HPP
12 
20 
21 #include "Tpetra_Import_Util.hpp"
22 #include "Tpetra_Import_Util2.hpp"
23 #include "Tpetra_RowMatrix.hpp"
24 #include "Tpetra_LocalCrsMatrixOperator.hpp"
25 
32 #include "Tpetra_Details_getDiagCopyWithoutOffsets.hpp"
40 #include "Tpetra_Details_packCrsMatrix.hpp"
41 #include "Tpetra_Details_unpackCrsMatrixAndCombine.hpp"
43 #include "Teuchos_FancyOStream.hpp"
44 #include "Teuchos_RCP.hpp"
45 #include "Teuchos_DataAccess.hpp"
46 #include "Teuchos_SerialDenseMatrix.hpp" // unused here, could delete
47 #include "KokkosBlas1_scal.hpp"
48 #include "KokkosSparse_getDiagCopy.hpp"
49 #include "KokkosSparse_spmv.hpp"
50 
51 #include <memory>
52 #include <sstream>
53 #include <typeinfo>
54 #include <utility>
55 #include <vector>
56 
57 namespace Tpetra {
58 
59 namespace { // (anonymous)
60 
61  template<class T, class BinaryFunction>
62  T atomic_binary_function_update (volatile T* const dest,
63  const T& inputVal,
64  BinaryFunction f)
65  {
66  T oldVal = *dest;
67  T assume;
68 
69  // NOTE (mfh 30 Nov 2015) I do NOT need a fence here for IBM
70  // POWER architectures, because 'newval' depends on 'assume',
71  // which depends on 'oldVal', which depends on '*dest'. This
72  // sets up a chain of read dependencies that should ensure
73  // correct behavior given a sane memory model.
74  do {
75  assume = oldVal;
76  T newVal = f (assume, inputVal);
77  oldVal = Kokkos::atomic_compare_exchange (dest, assume, newVal);
78  } while (assume != oldVal);
79 
80  return oldVal;
81  }
82 } // namespace (anonymous)
83 
84 //
85 // Users must never rely on anything in the Details namespace.
86 //
87 namespace Details {
88 
98 template<class Scalar>
99 struct AbsMax {
101  Scalar operator() (const Scalar& x, const Scalar& y) {
102  typedef Teuchos::ScalarTraits<Scalar> STS;
103  return std::max (STS::magnitude (x), STS::magnitude (y));
104  }
105 };
106 
107 } // namespace Details
108 } // namespace Tpetra
109 
110 namespace Tpetra {
111 
// Constructor (rowMap, maxNumEntriesPerRow[, params]).
// Builds a new crs_graph_type from the three arguments, stores it in both
// myGraph_ and staticGraph_ (so the matrix owns the graph), and then calls
// resumeFill(params).  rowMap is also handed to the dist_object_type base.
// If the graph constructor throws a std::exception, it is re-reported as
// std::runtime_error with the caught what() text appended.
// NOTE(review): numbered line 138 of the upstream file is absent from this
// listing, so a statement may be missing before the closing brace.
112  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
113  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
114  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
115  size_t maxNumEntriesPerRow,
116  const Teuchos::RCP<Teuchos::ParameterList>& params) :
117  dist_object_type (rowMap)
118  {
119  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, size_t "
120  "[, RCP<ParameterList>]): ";
121  Teuchos::RCP<crs_graph_type> graph;
122  try {
123  graph = Teuchos::rcp (new crs_graph_type (rowMap, maxNumEntriesPerRow,
124  params));
125  }
126  catch (std::exception& e) {
127  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
128  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
129  "size_t [, RCP<ParameterList>]) threw an exception: "
130  << e.what ());
131  }
132  // myGraph_ not null means that the matrix owns the graph. That's
133  // different than the const CrsGraph constructor, where the matrix
134  // does _not_ own the graph.
135  myGraph_ = graph;
136  staticGraph_ = myGraph_;
137  resumeFill (params);
139  }
140 
// Constructor (rowMap, numEntPerRowToAlloc[, params]).
// Same flow as the size_t variant, but the graph is built from a per-row
// array of entry counts: create the graph, store it in myGraph_ and
// staticGraph_, then call resumeFill(params).
// A std::exception from the graph constructor is re-reported as
// std::runtime_error with the caught what() text appended.
// NOTE(review): the class-qualifier line (numbered 142) and numbered line 169
// are absent from this listing.
141  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
143  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
144  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
145  const Teuchos::RCP<Teuchos::ParameterList>& params) :
146  dist_object_type (rowMap)
147  {
148  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
149  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
150  Teuchos::RCP<crs_graph_type> graph;
151  try {
152  using Teuchos::rcp;
153  graph = rcp(new crs_graph_type(rowMap, numEntPerRowToAlloc,
154  params));
155  }
156  catch (std::exception& e) {
157  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
158  (true, std::runtime_error, "CrsGraph constructor "
159  "(RCP<const Map>, ArrayView<const size_t>"
160  "[, RCP<ParameterList>]) threw an exception: "
161  << e.what ());
162  }
163  // myGraph_ not null means that the matrix owns the graph. That's
164  // different than the const CrsGraph constructor, where the matrix
165  // does _not_ own the graph.
166  myGraph_ = graph;
167  staticGraph_ = graph;
168  resumeFill (params);
170  }
171 
// Constructor (rowMap, colMap, maxNumEntPerRow[, params]).
// First asserts (std::logic_error) that staticGraph_ and myGraph_ are still
// null, then builds a crs_graph_type from rowMap, colMap, maxNumEntPerRow and
// params, stores it in myGraph_ and staticGraph_, and calls
// resumeFill(params).  A std::exception from the graph constructor is
// re-reported as std::runtime_error with the caught what() text appended.
// NOTE(review): the class-qualifier line (numbered 173) and numbered line 212
// are absent from this listing.
172  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
174  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
175  const Teuchos::RCP<const map_type>& colMap,
176  const size_t maxNumEntPerRow,
177  const Teuchos::RCP<Teuchos::ParameterList>& params) :
178  dist_object_type (rowMap)
179  {
180  const char tfecfFuncName[] = "CrsMatrix(RCP<const Map>, "
181  "RCP<const Map>, size_t[, RCP<ParameterList>]): ";
182  const char suffix[] =
183  " Please report this bug to the Tpetra developers.";
184 
185  // An artifact of debugging something a while back.
186  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
187  (! staticGraph_.is_null (), std::logic_error,
188  "staticGraph_ is not null at the beginning of the constructor."
189  << suffix);
190  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
191  (! myGraph_.is_null (), std::logic_error,
192  "myGraph_ is not null at the beginning of the constructor."
193  << suffix);
194  Teuchos::RCP<crs_graph_type> graph;
195  try {
196  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
197  maxNumEntPerRow,
198  params));
199  }
200  catch (std::exception& e) {
201  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
202  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
203  "RCP<const Map>, size_t[, RCP<ParameterList>]) threw an "
204  "exception: " << e.what ());
205  }
206  // myGraph_ not null means that the matrix owns the graph. That's
207  // different than the const CrsGraph constructor, where the matrix
208  // does _not_ own the graph.
209  myGraph_ = graph;
210  staticGraph_ = myGraph_;
211  resumeFill (params);
213  }
214 
// Constructor (rowMap, colMap, numEntPerRowToAlloc[, params]).
// Builds a crs_graph_type from the row Map, column Map, per-row entry counts
// and params; stores it in myGraph_ and staticGraph_; calls
// resumeFill(params).  A std::exception from the graph constructor is
// re-reported as std::runtime_error with the caught what() text appended.
// NOTE(review): the class-qualifier line (numbered 216) and numbered line 244
// are absent from this listing.
215  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
217  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
218  const Teuchos::RCP<const map_type>& colMap,
219  const Teuchos::ArrayView<const size_t>& numEntPerRowToAlloc,
220  const Teuchos::RCP<Teuchos::ParameterList>& params) :
221  dist_object_type (rowMap)
222  {
223  const char tfecfFuncName[] =
224  "CrsMatrix(RCP<const Map>, RCP<const Map>, "
225  "ArrayView<const size_t>[, RCP<ParameterList>]): ";
226  Teuchos::RCP<crs_graph_type> graph;
227  try {
228  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
229  numEntPerRowToAlloc,
230  params));
231  }
232  catch (std::exception& e) {
233  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
234  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
235  "RCP<const Map>, ArrayView<const size_t>[, "
236  "RCP<ParameterList>]) threw an exception: " << e.what ());
237  }
238  // myGraph_ not null means that the matrix owns the graph. That's
239  // different than the const CrsGraph constructor, where the matrix
240  // does _not_ own the graph.
241  myGraph_ = graph;
242  staticGraph_ = graph;
243  resumeFill (params);
245  }
246 
247 
// Constructor (graph[, params]).
// Uses a caller-supplied graph: it is stored in staticGraph_ only (myGraph_
// is not assigned here), storageStatus_ is set to STORAGE_1D_PACKED, and a
// values array with graph->lclIndsPacked_wdv.extent(0) entries is allocated
// and shared by valuesPacked_wdv and valuesUnpacked_wdv.
// The params argument is unnamed and therefore unused.
// Requires a non-null, fill-complete graph (std::runtime_error otherwise).
// When Details::Behavior::verbose("CrsMatrix") is true, progress messages are
// written to std::cerr.
// NOTE(review): graph->getRowMap() in the initializer list dereferences
// 'graph' before the is_null() check in the body runs, so a null graph may
// fail before that check is reached -- confirm.
// NOTE(review): the class-qualifier line (numbered 249) and numbered line 297
// are absent from this listing.
248  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
250  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
251  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
252  dist_object_type (graph->getRowMap ()),
253  staticGraph_ (graph),
254  storageStatus_ (Details::STORAGE_1D_PACKED)
255  {
256  using std::endl;
257  typedef typename local_matrix_device_type::values_type values_type;
258  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>[, "
259  "RCP<ParameterList>]): ";
260  const bool verbose = Details::Behavior::verbose("CrsMatrix");
261 
// 'prefix' is only created (and only dereferenced) when verbose is true.
262  std::unique_ptr<std::string> prefix;
263  if (verbose) {
264  prefix = this->createPrefix("CrsMatrix", "CrsMatrix(graph,params)");
265  std::ostringstream os;
266  os << *prefix << "Start" << endl;
267  std::cerr << os.str ();
268  }
269 
270  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
271  (graph.is_null (), std::runtime_error, "Input graph is null.");
272  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
273  (! graph->isFillComplete (), std::runtime_error, "Input graph "
274  "is not fill complete. You must call fillComplete on the "
275  "graph before using it to construct a CrsMatrix. Note that "
276  "calling resumeFill on the graph makes it not fill complete, "
277  "even if you had previously called fillComplete. In that "
278  "case, you must call fillComplete on the graph again.");
279 
280  // The graph is fill complete, so it is locally indexed and has a
281  // fixed structure. This means we can allocate the (1-D) array of
282  // values and build the local matrix right now. Note that the
283  // local matrix's number of columns comes from the column Map, not
284  // the domain Map.
285 
286  const size_t numEnt = graph->lclIndsPacked_wdv.extent (0);
287  if (verbose) {
288  std::ostringstream os;
289  os << *prefix << "Allocate values: " << numEnt << endl;
290  std::cerr << os.str ();
291  }
292 
293  values_type val ("Tpetra::CrsMatrix::values", numEnt);
294  valuesPacked_wdv = values_wdv_type(val);
295  valuesUnpacked_wdv = valuesPacked_wdv;
296 
298 
299  if (verbose) {
300  std::ostringstream os;
301  os << *prefix << "Done" << endl;
302  std::cerr << os.str ();
303  }
304  }
305 
// Constructor taking an existing matrix ('matrix') plus a graph.
// 'graph' becomes staticGraph_, storageStatus_ is copied from 'matrix', and
// the packed/unpacked value containers are built from matrix's containers
// with arguments (0, N), where N is the extent of the graph's packed or
// unpacked local-index array (presumably an offset and a length -- confirm
// against values_wdv_type).
// Requires a non-null, fill-complete graph (std::runtime_error otherwise).
// NOTE(review): the lines numbered 307-308 (class qualifier and the parameter
// that declares 'matrix') and line 334 are absent from this listing.
// NOTE(review): tfecfFuncName repeats the text of the (graph, values)
// constructor and does not mention the matrix argument; looks like a
// copy/paste leftover -- confirm and correct upstream.
// NOTE(review): 'graph' is dereferenced in the initializer list before the
// is_null() check below can run.
306  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
309  const Teuchos::RCP<const crs_graph_type>& graph,
310  const Teuchos::RCP<Teuchos::ParameterList>& params) :
311  dist_object_type (graph->getRowMap ()),
312  staticGraph_ (graph),
313  storageStatus_ (matrix.storageStatus_)
314  {
315  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
316  "local_matrix_device_type::values_type, "
317  "[,RCP<ParameterList>]): ";
318  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
319  (graph.is_null (), std::runtime_error, "Input graph is null.");
320  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
321  (! graph->isFillComplete (), std::runtime_error, "Input graph "
322  "is not fill complete. You must call fillComplete on the "
323  "graph before using it to construct a CrsMatrix. Note that "
324  "calling resumeFill on the graph makes it not fill complete, "
325  "even if you had previously called fillComplete. In that "
326  "case, you must call fillComplete on the graph again.");
327 
328  size_t numValuesPacked = graph->lclIndsPacked_wdv.extent(0);
329  valuesPacked_wdv = values_wdv_type(matrix.valuesPacked_wdv, 0, numValuesPacked);
330 
331  size_t numValuesUnpacked = graph->lclIndsUnpacked_wdv.extent(0);
332  valuesUnpacked_wdv = values_wdv_type(matrix.valuesUnpacked_wdv, 0, numValuesUnpacked);
333 
335  }
336 
337 
// Constructor (graph, values[, params]).
// Uses the caller's graph as staticGraph_ and wraps the caller's 'values'
// view in valuesPacked_wdv; valuesUnpacked_wdv is assigned from it.
// storageStatus_ is set to STORAGE_1D_PACKED.  The params argument is unnamed
// and therefore unused.
// Requires a non-null, fill-complete graph (std::runtime_error otherwise).
// NOTE(review): no check is visible here that values.extent(0) matches the
// number of entries in the graph -- confirm whether one is needed.
// NOTE(review): 'graph' is dereferenced in the initializer list before the
// is_null() check below can run.
// NOTE(review): the class-qualifier line (numbered 339) and numbered line 375
// are absent from this listing.
338  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
340  CrsMatrix (const Teuchos::RCP<const crs_graph_type>& graph,
341  const typename local_matrix_device_type::values_type& values,
342  const Teuchos::RCP<Teuchos::ParameterList>& /* params */) :
343  dist_object_type (graph->getRowMap ()),
344  staticGraph_ (graph),
345  storageStatus_ (Details::STORAGE_1D_PACKED)
346  {
347  const char tfecfFuncName[] = "CrsMatrix(RCP<const CrsGraph>, "
348  "local_matrix_device_type::values_type, "
349  "[,RCP<ParameterList>]): ";
350  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
351  (graph.is_null (), std::runtime_error, "Input graph is null.");
352  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
353  (! graph->isFillComplete (), std::runtime_error, "Input graph "
354  "is not fill complete. You must call fillComplete on the "
355  "graph before using it to construct a CrsMatrix. Note that "
356  "calling resumeFill on the graph makes it not fill complete, "
357  "even if you had previously called fillComplete. In that "
358  "case, you must call fillComplete on the graph again.");
359 
360  // The graph is fill complete, so it is locally indexed and has a
361  // fixed structure. This means we can allocate the (1-D) array of
362  // values and build the local matrix right now. Note that the
363  // local matrix's number of columns comes from the column Map, not
364  // the domain Map.
365 
366  valuesPacked_wdv = values_wdv_type(values);
367  valuesUnpacked_wdv = valuesPacked_wdv;
368 
369  // FIXME (22 Jun 2016) I would very much like to get rid of
370  // k_values1D_ at some point. I find it confusing to have all
371  // these extra references lying around.
372  // KDDKDD ALMOST THERE, MARK!
373 // k_values1D_ = valuesUnpacked_wdv.getDeviceView(Access::ReadWrite);
374 
376  }
377 
// Constructor (rowMap, colMap, rowPointers, columnIndices, values[, params]).
// Takes the three CRS arrays as Kokkos views.  Steps, in order:
//   1. Throw std::invalid_argument unless values and columnIndices have the
//      same extent(0).
//   2. In debug mode only, and only when rowPointers is non-empty, also
//      require the last rowPointers entry to equal both extents.
//   3. Build a crs_graph_type from (rowMap, colMap, rowPointers,
//      columnIndices, params); a std::exception from it is re-reported as
//      std::runtime_error.
//   4. Verify (std::logic_error) that the new graph's local graph has the
//      same dimensions as the inputs.
//   5. Store the graph in myGraph_/staticGraph_ and wrap 'values' in
//      valuesPacked_wdv (valuesUnpacked_wdv is assigned from it).
// storageStatus_ is set to STORAGE_1D_PACKED.  Verbose mode writes "Start" and
// "Done" messages to std::cerr.
// NOTE(review): the class-qualifier line (numbered 379) and numbered line 481
// are absent from this listing.
378  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
380  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
381  const Teuchos::RCP<const map_type>& colMap,
382  const typename local_graph_device_type::row_map_type& rowPointers,
383  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
384  const typename local_matrix_device_type::values_type& values,
385  const Teuchos::RCP<Teuchos::ParameterList>& params) :
386  dist_object_type (rowMap),
387  storageStatus_ (Details::STORAGE_1D_PACKED)
388  {
389  using Details::getEntryOnHost;
390  using Teuchos::RCP;
391  using std::endl;
392  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
393  "RCP<const Map>, ptr, ind, val[, params]): ";
394  const char suffix[] =
395  ". Please report this bug to the Tpetra developers.";
396  const bool debug = Details::Behavior::debug("CrsMatrix");
397  const bool verbose = Details::Behavior::verbose("CrsMatrix");
398 
// 'prefix' is only created (and only dereferenced) when verbose is true.
399  std::unique_ptr<std::string> prefix;
400  if (verbose) {
401  prefix = this->createPrefix(
402  "CrsMatrix", "CrsMatrix(rowMap,colMap,ptr,ind,val[,params])");
403  std::ostringstream os;
404  os << *prefix << "Start" << endl;
405  std::cerr << os.str ();
406  }
407 
408  // Check the user's input. Note that this might throw only on
409  // some processes but not others, causing deadlock. We prefer
410  // deadlock due to exceptions to segfaults, because users can
411  // catch exceptions.
412  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
413  (values.extent(0) != columnIndices.extent(0),
414  std::invalid_argument, "values.extent(0)=" << values.extent(0)
415  << " != columnIndices.extent(0) = " << columnIndices.extent(0)
416  << ".");
// Reading the last row pointer goes through getEntryOnHost, so this extra
// consistency check is restricted to debug mode.
417  if (debug && rowPointers.extent(0) != 0) {
418  const size_t numEnt =
419  getEntryOnHost(rowPointers, rowPointers.extent(0) - 1);
420  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
421  (numEnt != size_t(columnIndices.extent(0)) ||
422  numEnt != size_t(values.extent(0)),
423  std::invalid_argument, "Last entry of rowPointers says that "
424  "the matrix has " << numEnt << " entr"
425  << (numEnt != 1 ? "ies" : "y") << ", but the dimensions of "
426  "columnIndices and values don't match this. "
427  "columnIndices.extent(0)=" << columnIndices.extent (0)
428  << " and values.extent(0)=" << values.extent (0) << ".");
429  }
430 
431  RCP<crs_graph_type> graph;
432  try {
433  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, rowPointers,
434  columnIndices, params));
435  }
436  catch (std::exception& e) {
437  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
438  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
439  "RCP<const Map>, ptr, ind[, params]) threw an exception: "
440  << e.what ());
441  }
442  // The newly created CrsGraph _must_ have a local graph at this
443  // point. We don't really care whether CrsGraph's constructor
444  // deep-copies or shallow-copies the input, but the dimensions
445  // have to be right. That's how we tell whether the CrsGraph has
446  // a local graph.
447  auto lclGraph = graph->getLocalGraphDevice ();
448  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
449  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
450  lclGraph.entries.extent (0) != columnIndices.extent (0),
451  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, "
452  "ind[, params]) did not set the local graph correctly." << suffix);
453  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
454  (lclGraph.entries.extent (0) != values.extent (0),
455  std::logic_error, "CrsGraph's constructor (rowMap, colMap, ptr, ind[, "
456  "params]) did not set the local graph correctly. "
457  "lclGraph.entries.extent(0) = " << lclGraph.entries.extent (0)
458  << " != values.extent(0) = " << values.extent (0) << suffix);
459 
460  // myGraph_ not null means that the matrix owns the graph. This
461  // is true because the column indices come in as nonconst,
462  // implying shared ownership.
463  myGraph_ = graph;
464  staticGraph_ = graph;
465 
466  // The graph may not be fill complete yet. However, it is locally
467  // indexed (since we have a column Map) and has a fixed structure
468  // (due to the input arrays). This means we can allocate the
469  // (1-D) array of values and build the local matrix right now.
470  // Note that the local matrix's number of columns comes from the
471  // column Map, not the domain Map.
472 
473  valuesPacked_wdv = values_wdv_type(values);
474  valuesUnpacked_wdv = valuesPacked_wdv;
475 
476  // FIXME (22 Jun 2016) I would very much like to get rid of
477  // k_values1D_ at some point. I find it confusing to have all
478  // these extra references lying around.
479 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
480 
482  if (verbose) {
483  std::ostringstream os;
484  os << *prefix << "Done" << endl;
485  std::cerr << os.str();
486  }
487  }
488 
// Constructor (rowMap, colMap, ptr, ind, val[, params]) taking
// Teuchos::ArrayRCP inputs.
// Builds a crs_graph_type from (rowMap, colMap, ptr, ind, params), stores it
// in myGraph_/staticGraph_, verifies (std::logic_error) that the graph's local
// row_map/entries extents equal ptr.size()/ind.size(), and then converts 'val'
// to a device view via getKokkosViewDeepCopy and wraps it in
// valuesPacked_wdv (valuesUnpacked_wdv is assigned from it).
// A std::exception from the graph constructor is re-reported as
// std::runtime_error.  storageStatus_ is set to STORAGE_1D_PACKED.
// NOTE(review): unlike the Kokkos-view constructor, nothing visible here
// checks that val.size() equals ind.size(); a mismatched 'val' would be
// accepted silently -- confirm whether a check should be added.
// NOTE(review): the class-qualifier line (numbered 490) and numbered line 554
// are absent from this listing.
489  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
491  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
492  const Teuchos::RCP<const map_type>& colMap,
493  const Teuchos::ArrayRCP<size_t>& ptr,
494  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
495  const Teuchos::ArrayRCP<Scalar>& val,
496  const Teuchos::RCP<Teuchos::ParameterList>& params) :
497  dist_object_type (rowMap),
498  storageStatus_ (Details::STORAGE_1D_PACKED)
499  {
500  using Kokkos::Compat::getKokkosViewDeepCopy;
501  using Teuchos::av_reinterpret_cast;
502  using Teuchos::RCP;
503  using values_type = typename local_matrix_device_type::values_type;
504  using IST = impl_scalar_type;
505  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
506  "RCP<const Map>, ptr, ind, val[, params]): ";
507 
508  RCP<crs_graph_type> graph;
509  try {
510  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap, ptr,
511  ind, params));
512  }
513  catch (std::exception& e) {
514  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
515  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
516  "RCP<const Map>, ArrayRCP<size_t>, ArrayRCP<LocalOrdinal>[, "
517  "RCP<ParameterList>]) threw an exception: " << e.what ());
518  }
519  // myGraph_ not null means that the matrix owns the graph. This
520  // is true because the column indices come in as nonconst,
521  // implying shared ownership.
522  myGraph_ = graph;
523  staticGraph_ = graph;
524 
525  // The graph may not be fill complete yet. However, it is locally
526  // indexed (since we have a column Map) and has a fixed structure
527  // (due to the input arrays). This means we can allocate the
528  // (1-D) array of values and build the local matrix right now.
529  // Note that the local matrix's number of columns comes from the
530  // column Map, not the domain Map.
531 
532  // The graph _must_ have a local graph at this point. We don't
533  // really care whether CrsGraph's constructor deep-copies or
534  // shallow-copies the input, but the dimensions have to be right.
535  // That's how we tell whether the CrsGraph has a local graph.
536  auto lclGraph = staticGraph_->getLocalGraphDevice ();
537  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
538  (size_t (lclGraph.row_map.extent (0)) != size_t (ptr.size ()) ||
539  size_t (lclGraph.entries.extent (0)) != size_t (ind.size ()),
540  std::logic_error, "CrsGraph's constructor (rowMap, colMap, "
541  "ptr, ind[, params]) did not set the local graph correctly. "
542  "Please report this bug to the Tpetra developers.");
543 
// val is viewed as impl_scalar_type (IST) before being converted to a
// device view.
544  values_type valIn =
545  getKokkosViewDeepCopy<device_type> (av_reinterpret_cast<IST> (val ()));
546  valuesPacked_wdv = values_wdv_type(valIn);
547  valuesUnpacked_wdv = valuesPacked_wdv;
548 
549  // FIXME (22 Jun 2016) I would very much like to get rid of
550  // k_values1D_ at some point. I find it confusing to have all
551  // these extra references lying around.
552 // this->k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
553 
555  }
556 
// Constructor (rowMap, colMap, lclMatrix[, params]).
// Produces a matrix that is fill complete from the start: fillComplete_ is
// initialized to true and storageStatus_ to STORAGE_1D_PACKED.  A
// crs_graph_type is built from (rowMap, colMap, lclMatrix.graph, params) and
// must itself be fill complete (std::logic_error otherwise); it is stored in
// myGraph_/staticGraph_.  lclMatrix.values is wrapped in valuesPacked_wdv and
// valuesUnpacked_wdv is assigned from it.
// A std::exception from the graph constructor is re-reported as
// std::runtime_error.  The final two checks assert the expected fill state.
// NOTE(review): the class-qualifier line (numbered 558) and numbered line 605
// are absent from this listing.
557  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
559  CrsMatrix (const Teuchos::RCP<const map_type>& rowMap,
560  const Teuchos::RCP<const map_type>& colMap,
561  const local_matrix_device_type& lclMatrix,
562  const Teuchos::RCP<Teuchos::ParameterList>& params) :
563  dist_object_type (rowMap),
564  storageStatus_ (Details::STORAGE_1D_PACKED),
565  fillComplete_ (true)
566  {
567  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
568  "RCP<const Map>, local_matrix_device_type[, RCP<ParameterList>]): ";
569  const char suffix[] =
570  " Please report this bug to the Tpetra developers.";
571 
572  Teuchos::RCP<crs_graph_type> graph;
573  try {
574  graph = Teuchos::rcp (new crs_graph_type (rowMap, colMap,
575  lclMatrix.graph, params));
576  }
577  catch (std::exception& e) {
578  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
579  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
580  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) threw an "
581  "exception: " << e.what ());
582  }
583  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
584  (!graph->isFillComplete (), std::logic_error, "CrsGraph constructor (RCP"
585  "<const Map>, RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) "
586  "did not produce a fill-complete graph. Please report this bug to the "
587  "Tpetra developers.");
588  // myGraph_ not null means that the matrix owns the graph. This
589  // is true because the column indices come in as nonconst through
590  // the matrix, implying shared ownership.
591  myGraph_ = graph;
592  staticGraph_ = graph;
593 
594  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
595  valuesUnpacked_wdv = valuesPacked_wdv;
596 
597  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
598  (isFillActive (), std::logic_error,
599  "At the end of a CrsMatrix constructor that should produce "
600  "a fillComplete matrix, isFillActive() is true." << suffix);
601  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
602  (! isFillComplete (), std::logic_error, "At the end of a "
603  "CrsMatrix constructor that should produce a fillComplete "
604  "matrix, isFillComplete() is false." << suffix);
606  }
607 
// Constructor taking a local matrix plus row, column, domain and range Maps.
// Produces a matrix that is fill complete from the start (fillComplete_ is
// initialized to true).  A crs_graph_type is built from (lclMatrix.graph,
// rowMap, colMap, domainMap, rangeMap, params) and must be fill complete
// (std::logic_error otherwise); it is stored in myGraph_/staticGraph_.
// lclMatrix.values is wrapped in valuesPacked_wdv and valuesUnpacked_wdv is
// assigned from it.  A std::exception from the graph constructor is
// re-reported as std::runtime_error.
// NOTE(review): the lines numbered 609-610 (class qualifier and the parameter
// that declares 'lclMatrix') and line 659 are absent from this listing.
// NOTE(review): the message strings list the Maps before the local graph,
// while the graph constructor call passes lclMatrix.graph first -- confirm
// which order the messages should describe.
608  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
611  const Teuchos::RCP<const map_type>& rowMap,
612  const Teuchos::RCP<const map_type>& colMap,
613  const Teuchos::RCP<const map_type>& domainMap,
614  const Teuchos::RCP<const map_type>& rangeMap,
615  const Teuchos::RCP<Teuchos::ParameterList>& params) :
616  dist_object_type (rowMap),
617  storageStatus_ (Details::STORAGE_1D_PACKED),
618  fillComplete_ (true)
619  {
620  const char tfecfFuncName[] = "Tpetra::CrsMatrix(RCP<const Map>, "
621  "RCP<const Map>, RCP<const Map>, RCP<const Map>, "
622  "local_matrix_device_type[, RCP<ParameterList>]): ";
623  const char suffix[] =
624  " Please report this bug to the Tpetra developers.";
625 
626  Teuchos::RCP<crs_graph_type> graph;
627  try {
628  graph = Teuchos::rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
629  domainMap, rangeMap, params));
630  }
631  catch (std::exception& e) {
632  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
633  (true, std::runtime_error, "CrsGraph constructor (RCP<const Map>, "
634  "RCP<const Map>, RCP<const Map>, RCP<const Map>, local_graph_device_type[, "
635  "RCP<ParameterList>]) threw an exception: " << e.what ());
636  }
637  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
638  (! graph->isFillComplete (), std::logic_error, "CrsGraph "
639  "constructor (RCP<const Map>, RCP<const Map>, RCP<const Map>, "
640  "RCP<const Map>, local_graph_device_type[, RCP<ParameterList>]) did "
641  "not produce a fillComplete graph." << suffix);
642  // myGraph_ not null means that the matrix owns the graph. This
643  // is true because the column indices come in as nonconst through
644  // the matrix, implying shared ownership.
645  myGraph_ = graph;
646  staticGraph_ = graph;
647 
648  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
649  valuesUnpacked_wdv = valuesPacked_wdv;
650 
651  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
652  (isFillActive (), std::logic_error,
653  "At the end of a CrsMatrix constructor that should produce "
654  "a fillComplete matrix, isFillActive() is true." << suffix);
655  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
656  (! isFillComplete (), std::logic_error, "At the end of a "
657  "CrsMatrix constructor that should produce a fillComplete "
658  "matrix, isFillComplete() is false." << suffix);
660  }
661 
// Constructor taking a local matrix, four Maps, and Import/Export objects.
// Produces a matrix that is fill complete from the start (fillComplete_ is
// initialized to true).  A crs_graph_type is built from (lclMatrix.graph,
// rowMap, colMap, domainMap, rangeMap, importer, exporter, params) and must be
// fill complete (std::logic_error otherwise); it is stored in
// myGraph_/staticGraph_.  lclMatrix.values is wrapped in valuesPacked_wdv and
// valuesUnpacked_wdv is assigned from it.  A std::exception from the graph
// constructor is re-reported as std::runtime_error.
// NOTE(review): the lines numbered 663-664 (class qualifier and the parameter
// that declares 'lclMatrix') and line 716 are absent from this listing.
662  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
665  const Teuchos::RCP<const map_type>& rowMap,
666  const Teuchos::RCP<const map_type>& colMap,
667  const Teuchos::RCP<const map_type>& domainMap,
668  const Teuchos::RCP<const map_type>& rangeMap,
669  const Teuchos::RCP<const import_type>& importer,
670  const Teuchos::RCP<const export_type>& exporter,
671  const Teuchos::RCP<Teuchos::ParameterList>& params) :
672  dist_object_type (rowMap),
673  storageStatus_ (Details::STORAGE_1D_PACKED),
674  fillComplete_ (true)
675  {
676  using Teuchos::rcp;
677  const char tfecfFuncName[] = "Tpetra::CrsMatrix"
678  "(lclMat,Map,Map,Map,Map,Import,Export,params): ";
679  const char suffix[] =
680  " Please report this bug to the Tpetra developers.";
681 
682  Teuchos::RCP<crs_graph_type> graph;
683  try {
684  graph = rcp (new crs_graph_type (lclMatrix.graph, rowMap, colMap,
685  domainMap, rangeMap, importer,
686  exporter, params));
687  }
688  catch (std::exception& e) {
689  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
690  (true, std::runtime_error, "CrsGraph constructor "
691  "(local_graph_device_type, Map, Map, Map, Map, Import, Export, "
692  "params) threw: " << e.what ());
693  }
694  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
695  (!graph->isFillComplete (), std::logic_error, "CrsGraph "
696  "constructor (local_graph_device_type, Map, Map, Map, Map, Import, "
697  "Export, params) did not produce a fill-complete graph. "
698  "Please report this bug to the Tpetra developers.");
699  // myGraph_ not null means that the matrix owns the graph. This
700  // is true because the column indices come in as nonconst through
701  // the matrix, implying shared ownership.
702  myGraph_ = graph;
703  staticGraph_ = graph;
704 
705  valuesPacked_wdv = values_wdv_type(lclMatrix.values);
706  valuesUnpacked_wdv = valuesPacked_wdv;
707 
708  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
709  (isFillActive (), std::logic_error,
710  "At the end of a CrsMatrix constructor that should produce "
711  "a fillComplete matrix, isFillActive() is true." << suffix);
712  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
713  (! isFillComplete (), std::logic_error, "At the end of a "
714  "CrsMatrix constructor that should produce a fillComplete "
715  "matrix, isFillComplete() is false." << suffix);
717  }
718 
// Constructor (source, copyOrView): copy or view of an existing matrix.
// The new matrix shares source's graph (staticGraph_ = source.getCrsGraph())
// and copies source's storageStatus_.  source must be fill complete
// (std::invalid_argument otherwise).
//   Teuchos::Copy: allocate a new values view of the same length without
//     initializing it, deep-copy source's values into it, and use it for
//     both value containers.
//   Teuchos::View: wrap source's packed and unpacked value containers.
//   In both cases fillComplete() is then called with source's domain and
//   range Maps.
//   Any other value: std::invalid_argument.
// NOTE(review): the lines numbered 720-721 (class qualifier and the parameter
// that declares 'source') and line 758 are absent from this listing.
// NOTE(review): the first error message says "Source graph" although the
// condition tests source.isFillComplete() on the matrix -- confirm wording.
719  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
722  const Teuchos::DataAccess copyOrView):
723  dist_object_type (source.getCrsGraph()->getRowMap ()),
724  staticGraph_ (source.getCrsGraph()),
725  storageStatus_ (source.storageStatus_)
726  {
727  const char tfecfFuncName[] = "Tpetra::CrsMatrix("
728  "const CrsMatrix&, const Teuchos::DataAccess): ";
729  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
730  (! source.isFillComplete (), std::invalid_argument,
731  "Source graph must be fillComplete().");
732 
733  if (copyOrView == Teuchos::Copy) {
734  using values_type = typename local_matrix_device_type::values_type;
735  auto vals = source.getLocalValuesDevice (Access::ReadOnly);
736  using Kokkos::view_alloc;
737  using Kokkos::WithoutInitializing;
// Skipping initialization is fine here because deep_copy overwrites every
// entry immediately below.
738  values_type newvals (view_alloc ("val", WithoutInitializing),
739  vals.extent (0));
740  // DEEP_COPY REVIEW - DEVICE-TO_DEVICE
741  Kokkos::deep_copy (newvals, vals);
742  valuesPacked_wdv = values_wdv_type(newvals);
743  valuesUnpacked_wdv = valuesPacked_wdv;
744  fillComplete (source.getDomainMap (), source.getRangeMap ());
745  }
746  else if (copyOrView == Teuchos::View) {
747  valuesPacked_wdv = values_wdv_type(source.valuesPacked_wdv);
748  valuesUnpacked_wdv = values_wdv_type(source.valuesUnpacked_wdv);
749  fillComplete (source.getDomainMap (), source.getRangeMap ());
750  }
751  else {
752  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
753  (true, std::invalid_argument, "Second argument 'copyOrView' "
754  "has an invalid value " << copyOrView << ". Valid values "
755  "include Teuchos::Copy = " << Teuchos::Copy << " and "
756  "Teuchos::View = " << Teuchos::View << ".");
757  }
759  }
760 
// Exchanges state with 'crs_matrix' member by member using std::swap:
// importMV_, exportMV_, staticGraph_, myGraph_, valuesPacked_wdv,
// valuesUnpacked_wdv, storageStatus_, fillComplete_ and nonlocals_.
// Only the members listed here are exchanged.
// NOTE(review): the lines numbered 763-764 (class qualifier and the function
// name/parameter declaring 'crs_matrix') are absent from this listing;
// presumably this is CrsMatrix::swap -- confirm against the upstream file.
761  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
762  void
765  {
766  std::swap(crs_matrix.importMV_, this->importMV_);
767  std::swap(crs_matrix.exportMV_, this->exportMV_);
768  std::swap(crs_matrix.staticGraph_, this->staticGraph_);
769  std::swap(crs_matrix.myGraph_, this->myGraph_);
770  std::swap(crs_matrix.valuesPacked_wdv, this->valuesPacked_wdv);
771  std::swap(crs_matrix.valuesUnpacked_wdv, this->valuesUnpacked_wdv);
772  std::swap(crs_matrix.storageStatus_, this->storageStatus_);
773  std::swap(crs_matrix.fillComplete_, this->fillComplete_);
774  std::swap(crs_matrix.nonlocals_, this->nonlocals_);
775  }
776 
// Returns the communicator reported by this matrix's graph
// (getCrsGraphRef().getComm()).
// NOTE(review): the class-qualifier line (numbered 779) is absent from this
// listing.
777  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
778  Teuchos::RCP<const Teuchos::Comm<int> >
780  getComm () const {
781  return getCrsGraphRef ().getComm ();
782  }
783 
// Returns the matrix's own fillComplete_ flag.
// NOTE(review): the class-qualifier line (numbered 786) is absent from this
// listing.
784  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
785  bool
787  isFillComplete () const {
788  return fillComplete_;
789  }
790 
// Fill is reported as active exactly when fillComplete_ is false, so this is
// always the negation of isFillComplete().
// NOTE(review): the class-qualifier line (numbered 793) is absent from this
// listing.
791  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
792  bool
794  isFillActive () const {
795  return ! fillComplete_;
796  }
797 
// Forwards to the graph: returns getCrsGraphRef().isStorageOptimized().
// NOTE(review): the lines numbered 800-801 (class qualifier and function
// name) are absent from this listing; presumably this is
// CrsMatrix::isStorageOptimized() const -- confirm.
798  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
799  bool
802  return this->getCrsGraphRef ().isStorageOptimized ();
803  }
804 
// Forwards to the graph: returns getCrsGraphRef().isLocallyIndexed().
// NOTE(review): the lines numbered 807-808 (class qualifier and function
// name) are absent from this listing; presumably this is
// CrsMatrix::isLocallyIndexed() const -- confirm.
805  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
806  bool
809  return getCrsGraphRef ().isLocallyIndexed ();
810  }
811 
// Forwards to the graph: returns getCrsGraphRef().isGloballyIndexed().
// NOTE(review): the lines numbered 814-815 (class qualifier and function
// name) are absent from this listing; presumably this is
// CrsMatrix::isGloballyIndexed() const -- confirm.
812  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
813  bool
816  return getCrsGraphRef ().isGloballyIndexed ();
817  }
818 
// Forwards to the graph: returns getCrsGraphRef().hasColMap().
// NOTE(review): the class-qualifier line (numbered 821) is absent from this
// listing.
819  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
820  bool
822  hasColMap () const {
823  return getCrsGraphRef ().hasColMap ();
824  }
825 
// Forwards to the graph: returns getCrsGraphRef().getGlobalNumEntries().
// NOTE(review): the lines numbered 827-829 (return type, class qualifier and
// function name) are absent from this listing; presumably this is
// CrsMatrix::getGlobalNumEntries() const -- confirm.
826  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
830  return getCrsGraphRef ().getGlobalNumEntries ();
831  }
832 
// Forwards to the graph: returns getCrsGraphRef().getLocalNumEntries().
// NOTE(review): the lines numbered 835-836 (class qualifier and function
// name) are absent from this listing; presumably this is
// CrsMatrix::getLocalNumEntries() const -- confirm.
833  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
834  size_t
837  return getCrsGraphRef ().getLocalNumEntries ();
838  }
839 
// Forwards to the graph: returns getCrsGraphRef().getGlobalNumRows().
// NOTE(review): the lines numbered 841-843 (return type, class qualifier and
// function name) are absent from this listing; presumably this is
// CrsMatrix::getGlobalNumRows() const -- confirm.
840  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
844  return getCrsGraphRef ().getGlobalNumRows ();
845  }
846 
// Forwards to the graph: returns getCrsGraphRef().getGlobalNumCols().
// NOTE(review): the lines numbered 848-850 (return type, class qualifier and
// function name) are absent from this listing; presumably this is
// CrsMatrix::getGlobalNumCols() const -- confirm.
847  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
851  return getCrsGraphRef ().getGlobalNumCols ();
852  }
853 
// Forwards to the graph: returns getCrsGraphRef().getLocalNumRows().
// NOTE(review): the lines numbered 856-857 (class qualifier and function
// name) are absent from this listing; presumably this is
// CrsMatrix::getLocalNumRows() const -- confirm.
854  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
855  size_t
858  return getCrsGraphRef ().getLocalNumRows ();
859  }
860 
861 
// Forwards to the graph: returns getCrsGraphRef().getLocalNumCols()
// as a size_t.
// NOTE(review): the class-qualifier and function-name lines are absent from
// this listing; presumably this is getLocalNumCols() const -- confirm
// against the original source file.
862  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
863  size_t
866  return getCrsGraphRef ().getLocalNumCols ();
867  }
868 
869 
// Returns the graph's entry count for the given global row index, by
// forwarding globalRow to getCrsGraphRef().getNumEntriesInGlobalRow().
870  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
871  size_t
873  getNumEntriesInGlobalRow (GlobalOrdinal globalRow) const {
874  return getCrsGraphRef ().getNumEntriesInGlobalRow (globalRow);
875  }
876 
// Returns the graph's entry count for the given local row index, by
// forwarding localRow to getCrsGraphRef().getNumEntriesInLocalRow().
877  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
878  size_t
880  getNumEntriesInLocalRow (LocalOrdinal localRow) const {
881  return getCrsGraphRef ().getNumEntriesInLocalRow (localRow);
882  }
883 
// Forwards to the graph: returns getCrsGraphRef().getGlobalMaxNumRowEntries()
// as a size_t.
// NOTE(review): the class-qualifier and function-name lines are absent from
// this listing; presumably this is getGlobalMaxNumRowEntries() const --
// confirm against the original source file.
884  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
885  size_t
888  return getCrsGraphRef ().getGlobalMaxNumRowEntries ();
889  }
890 
// Forwards to the graph: returns getCrsGraphRef().getLocalMaxNumRowEntries()
// as a size_t.
// NOTE(review): the class-qualifier and function-name lines are absent from
// this listing; presumably this is getLocalMaxNumRowEntries() const --
// confirm against the original source file.
891  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
892  size_t
895  return getCrsGraphRef ().getLocalMaxNumRowEntries ();
896  }
897 
// Returns the index base of the row Map, i.e. getRowMap()->getIndexBase().
// The row Map pointer is dereferenced without a null check.
898  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
899  GlobalOrdinal
901  getIndexBase () const {
902  return getRowMap ()->getIndexBase ();
903  }
904 
// Returns the graph's row Map, by forwarding to getCrsGraphRef().getRowMap().
905  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
906  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
908  getRowMap () const {
909  return getCrsGraphRef ().getRowMap ();
910  }
911 
// Returns the graph's column Map, by forwarding to
// getCrsGraphRef().getColMap().
912  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
913  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
915  getColMap () const {
916  return getCrsGraphRef ().getColMap ();
917  }
918 
// Returns the graph's domain Map, by forwarding to
// getCrsGraphRef().getDomainMap().
919  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
920  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
922  getDomainMap () const {
923  return getCrsGraphRef ().getDomainMap ();
924  }
925 
// Returns the graph's range Map, by forwarding to
// getCrsGraphRef().getRangeMap().
926  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
927  Teuchos::RCP<const Map<LocalOrdinal, GlobalOrdinal, Node> >
929  getRangeMap () const {
930  return getCrsGraphRef ().getRangeMap ();
931  }
932 
// Returns the matrix's graph as an RCP to const RowGraph: staticGraph_ when
// it is non-null, otherwise myGraph_ (which may itself be null; no check is
// made here).
933  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
934  Teuchos::RCP<const RowGraph<LocalOrdinal, GlobalOrdinal, Node> >
936  getGraph () const {
937  if (staticGraph_ != Teuchos::null) {
938  return staticGraph_;
939  }
940  return myGraph_;
941  }
942 
// Same selection as getGraph(), but typed as an RCP to const CrsGraph:
// staticGraph_ when it is non-null, otherwise myGraph_ (which may itself be
// null; no check is made here).
943  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
944  Teuchos::RCP<const CrsGraph<LocalOrdinal, GlobalOrdinal, Node> >
946  getCrsGraph () const {
947  if (staticGraph_ != Teuchos::null) {
948  return staticGraph_;
949  }
950  return myGraph_;
951  }
952 
// Returns a reference to the graph: *staticGraph_ when that pointer is
// non-null, otherwise *myGraph_.  Only when HAVE_TPETRA_DEBUG is defined is
// myGraph_ checked for null before it is dereferenced (reported as a
// std::logic_error through the Teuchos test macro).
// NOTE(review): the return-type and class-qualifier lines are absent from
// this listing -- confirm the exact return type against the original source.
953  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
956  getCrsGraphRef () const
957  {
     // Compile-time switch: the null check below exists only in debug builds.
958 #ifdef HAVE_TPETRA_DEBUG
959  constexpr bool debug = true;
960 #else
961  constexpr bool debug = false;
962 #endif // HAVE_TPETRA_DEBUG
963 
964  if (! this->staticGraph_.is_null ()) {
965  return * (this->staticGraph_);
966  }
967  else {
968  if (debug) {
     // tfecfFuncName is not referenced directly below; presumably the
     // *_CLASS_FUNC macro picks it up by name for its message prefix.
969  const char tfecfFuncName[] = "getCrsGraphRef: ";
970  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
971  (this->myGraph_.is_null (), std::logic_error,
972  "Both staticGraph_ and myGraph_ are null. "
973  "Please report this bug to the Tpetra developers.");
974  }
     // In non-debug builds this dereference is unchecked.
975  return * (this->myGraph_);
976  }
977  }
978 
// Constructs and returns a local_matrix_device_type from: a label, the local
// element count of staticGraph_'s column Map (used as the column count), the
// device view of valuesPacked_wdv (requested with Access::ReadWrite), and
// staticGraph_->getLocalGraphDevice().
// staticGraph_ and its column Map are dereferenced without null checks.
// NOTE(review): the class-qualifier and function-name lines are absent from
// this listing; presumably this is getLocalMatrixDevice() const -- confirm
// against the original source file.
979  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
980  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_device_type
983  {
984  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
985  return local_matrix_device_type("Tpetra::CrsMatrix::lclMatrixDevice",
986  numCols,
987  valuesPacked_wdv.getDeviceView(Access::ReadWrite),
988  staticGraph_->getLocalGraphDevice());
989  }
990 
// Host counterpart of the device accessor: constructs and returns a
// local_matrix_host_type from a label, the local element count of
// staticGraph_'s column Map (used as the column count), the host view of
// valuesPacked_wdv (requested with Access::ReadWrite), and
// staticGraph_->getLocalGraphHost().
// staticGraph_ and its column Map are dereferenced without null checks.
991  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
992  typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_matrix_host_type
994  getLocalMatrixHost () const
995  {
996  auto numCols = staticGraph_->getColMap()->getLocalNumElements();
997  return local_matrix_host_type("Tpetra::CrsMatrix::lclMatrixHost", numCols,
998  valuesPacked_wdv.getHostView(Access::ReadWrite),
999  staticGraph_->getLocalGraphHost());
1000  }
1001 
1002 #if KOKKOSKERNELS_VERSION < 40299
1003 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
// Builds and returns a shared_ptr to a local_multiply_op_type that wraps a
// shared copy of the local device matrix.  This definition is compiled only
// when KOKKOSKERNELS_VERSION < 40299.
// When one of the cuSPARSE / rocSPARSE / MKL TPL macros is defined and the
// local entry count fits in LocalOrdinal, the operator is additionally given
// ordinalRowptrs, a LocalOrdinal-typed copy of the row offsets that is
// created the first time it is needed and then reused.
// NOTE(review): the class-qualifier and function-name lines are absent from
// this listing; presumably this is getLocalMultiplyOperator() const --
// confirm against the original source file.
1004  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1005  std::shared_ptr<typename CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::local_multiply_op_type>
1008  {
1009  auto localMatrix = getLocalMatrixDevice();
1010 #if defined(KOKKOSKERNELS_ENABLE_TPL_CUSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_ROCSPARSE) || defined(KOKKOSKERNELS_ENABLE_TPL_MKL)
     // Only take this path if every row offset is representable as a
     // LocalOrdinal.
1011  if(this->getLocalNumEntries() <= size_t(Teuchos::OrdinalTraits<LocalOrdinal>::max()))
1012  {
     // A null data pointer means the converted offsets have not been built yet.
1013  if(this->ordinalRowptrs.data() == nullptr)
1014  {
1015  auto originalRowptrs = localMatrix.graph.row_map;
1016  //create LocalOrdinal-typed copy of the local graph's rowptrs.
1017  //This enables the LocalCrsMatrixOperator to use cuSPARSE SpMV.
1018  this->ordinalRowptrs = ordinal_rowptrs_type(
1019  Kokkos::ViewAllocateWithoutInitializing("CrsMatrix::ordinalRowptrs"), originalRowptrs.extent(0));
1020  auto ordinalRowptrs_ = this->ordinalRowptrs; //don't want to capture 'this'
     // Element-wise copy; the assignment converts each offset to the
     // element type of ordinalRowptrs.
1021  Kokkos::parallel_for("CrsMatrix::getLocalMultiplyOperator::convertRowptrs",
1022  Kokkos::RangePolicy<execution_space>(0, originalRowptrs.extent(0)),
1023  KOKKOS_LAMBDA(LocalOrdinal i)
1024  {
1025  ordinalRowptrs_(i) = originalRowptrs(i);
1026  });
1027  }
1028  //return local operator using ordinalRowptrs
1029  return std::make_shared<local_multiply_op_type>(
1030  std::make_shared<local_matrix_device_type>(localMatrix), this->ordinalRowptrs);
1031  }
1032 #endif
1033 // KDDKDD NOT SURE WHY THIS MUST RETURN A SHARED_PTR
     // Fallback: operator constructed from the local matrix alone.
1034  return std::make_shared<local_multiply_op_type>(
1035  std::make_shared<local_matrix_device_type>(localMatrix));
1036  }
1037 #endif
1038 
// Returns true exactly when myGraph_ is null.  Per the comments elsewhere in
// this file, a null myGraph_ means the matrix does not own its graph.
1039  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1040  bool
1042  isStaticGraph () const {
1043  return myGraph_.is_null ();
1044  }
1045 
// Unconditionally returns true.
// NOTE(review): the class-qualifier and function-name lines are absent from
// this listing, so the name of this predicate cannot be read from here;
// presumably hasTransposeApply() const -- TODO confirm against the original
// source file.
1046  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1047  bool
1050  return true;
1051  }
1052 
// Unconditionally returns true.
// NOTE(review): the class-qualifier and function-name lines are absent from
// this listing, so the name of this predicate cannot be read from here;
// presumably supportsRowViews() const -- TODO confirm against the original
// source file.
1053  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1054  bool
1057  return true;
1058  }
1059 
1060  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1061  void
1063  allocateValues (ELocalGlobal lg, GraphAllocationStatus gas,
1064  const bool verbose)
1065  {
1066  using Details::Behavior;
1068  using std::endl;
1069  const char tfecfFuncName[] = "allocateValues: ";
1070  const char suffix[] =
1071  " Please report this bug to the Tpetra developers.";
1072  ProfilingRegion region("Tpetra::CrsMatrix::allocateValues");
1073 
1074  std::unique_ptr<std::string> prefix;
1075  if (verbose) {
1076  prefix = this->createPrefix("CrsMatrix", "allocateValues");
1077  std::ostringstream os;
1078  os << *prefix << "lg: "
1079  << (lg == LocalIndices ? "Local" : "Global") << "Indices"
1080  << ", gas: Graph"
1081  << (gas == GraphAlreadyAllocated ? "Already" : "NotYet")
1082  << "Allocated" << endl;
1083  std::cerr << os.str();
1084  }
1085 
1086  const bool debug = Behavior::debug("CrsMatrix");
1087  if (debug) {
1088  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1089  (this->staticGraph_.is_null (), std::logic_error,
1090  "staticGraph_ is null." << suffix);
1091 
1092  // If the graph indices are already allocated, then gas should be
1093  // GraphAlreadyAllocated. Otherwise, gas should be
1094  // GraphNotYetAllocated.
1095  if ((gas == GraphAlreadyAllocated) !=
1096  staticGraph_->indicesAreAllocated ()) {
1097  const char err1[] = "The caller has asserted that the graph "
1098  "is ";
1099  const char err2[] = "already allocated, but the static graph "
1100  "says that its indices are ";
1101  const char err3[] = "already allocated. ";
1102  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1103  (gas == GraphAlreadyAllocated &&
1104  ! staticGraph_->indicesAreAllocated (), std::logic_error,
1105  err1 << err2 << "not " << err3 << suffix);
1106  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1107  (gas != GraphAlreadyAllocated &&
1108  staticGraph_->indicesAreAllocated (), std::logic_error,
1109  err1 << "not " << err2 << err3 << suffix);
1110  }
1111 
1112  // If the graph is unallocated, then it had better be a
1113  // matrix-owned graph. ("Matrix-owned graph" means that the
1114  // matrix gets to define the graph structure. If the CrsMatrix
1115  // constructor that takes an RCP<const CrsGraph> was used, then
1116  // the matrix does _not_ own the graph.)
1117  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1118  (! this->staticGraph_->indicesAreAllocated () &&
1119  this->myGraph_.is_null (), std::logic_error,
1120  "The static graph says that its indices are not allocated, "
1121  "but the graph is not owned by the matrix." << suffix);
1122  }
1123 
1124  if (gas == GraphNotYetAllocated) {
1125  if (debug) {
1126  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1127  (this->myGraph_.is_null (), std::logic_error,
1128  "gas = GraphNotYetAllocated, but myGraph_ is null." << suffix);
1129  }
1130  try {
1131  this->myGraph_->allocateIndices (lg, verbose);
1132  }
1133  catch (std::exception& e) {
1134  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1135  (true, std::runtime_error, "CrsGraph::allocateIndices "
1136  "threw an exception: " << e.what ());
1137  }
1138  catch (...) {
1139  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1140  (true, std::runtime_error, "CrsGraph::allocateIndices "
1141  "threw an exception not a subclass of std::exception.");
1142  }
1143  }
1144 
1145  // Allocate matrix values.
1146  const size_t lclTotalNumEntries = this->staticGraph_->getLocalAllocationSize();
1147  if (debug) {
1148  const size_t lclNumRows = this->staticGraph_->getLocalNumRows ();
1149  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1150  (this->staticGraph_->getRowPtrsUnpackedHost()(lclNumRows) != lclTotalNumEntries, std::logic_error,
1151  "length of staticGraph's lclIndsUnpacked does not match final entry of rowPtrsUnapcked_host." << suffix);
1152  }
1153 
1154  // Allocate array of (packed???) matrix values.
1155  using values_type = typename local_matrix_device_type::values_type;
1156  if (verbose) {
1157  std::ostringstream os;
1158  os << *prefix << "Allocate values_wdv: Pre "
1159  << valuesUnpacked_wdv.extent(0) << ", post "
1160  << lclTotalNumEntries << endl;
1161  std::cerr << os.str();
1162  }
1163 // this->k_values1D_ =
1164  valuesUnpacked_wdv = values_wdv_type(
1165  values_type("Tpetra::CrsMatrix::values",
1166  lclTotalNumEntries));
1167  }
1168 
1169 
1170  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1171  void
1173  fillLocalGraphAndMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1174  {
1176  using ::Tpetra::Details::getEntryOnHost;
1177  using Teuchos::arcp_const_cast;
1178  using Teuchos::Array;
1179  using Teuchos::ArrayRCP;
1180  using Teuchos::null;
1181  using Teuchos::RCP;
1182  using Teuchos::rcp;
1183  using std::endl;
1184  using row_map_type = typename local_graph_device_type::row_map_type;
1185  using lclinds_1d_type = typename Graph::local_graph_device_type::entries_type::non_const_type;
1186  using values_type = typename local_matrix_device_type::values_type;
1187  Details::ProfilingRegion regionFLGAM
1188  ("Tpetra::CrsMatrix::fillLocalGraphAndMatrix");
1189 
1190  const char tfecfFuncName[] = "fillLocalGraphAndMatrix (called from "
1191  "fillComplete or expertStaticFillComplete): ";
1192  const char suffix[] =
1193  " Please report this bug to the Tpetra developers.";
1194  const bool debug = Details::Behavior::debug("CrsMatrix");
1195  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1196 
1197  std::unique_ptr<std::string> prefix;
1198  if (verbose) {
1199  prefix = this->createPrefix("CrsMatrix", "fillLocalGraphAndMatrix");
1200  std::ostringstream os;
1201  os << *prefix << endl;
1202  std::cerr << os.str ();
1203  }
1204 
1205  if (debug) {
1206  // fillComplete() only calls fillLocalGraphAndMatrix() if the
1207  // matrix owns the graph, which means myGraph_ is not null.
1208  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1209  (myGraph_.is_null (), std::logic_error, "The nonconst graph "
1210  "(myGraph_) is null. This means that the matrix has a "
1211  "const (a.k.a. \"static\") graph. fillComplete or "
1212  "expertStaticFillComplete should never call "
1213  "fillLocalGraphAndMatrix in that case." << suffix);
1214  }
1215 
1216  const size_t lclNumRows = this->getLocalNumRows ();
1217 
1218  // This method's goal is to fill in the three arrays (compressed
1219  // sparse row format) that define the sparse graph's and matrix's
1220  // structure, and the sparse matrix's values.
1221  //
1222  // Get references to the data in myGraph_, so we can modify them
1223  // as well. Note that we only call fillLocalGraphAndMatrix() if
1224  // the matrix owns the graph, which means myGraph_ is not null.
1225 
1226  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1227  // See: https://github.com/trilinos/Trilinos/issues/12237
1228  //using row_entries_type = decltype (myGraph_->k_numRowEntries_);
1229  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1230 
1231  typename Graph::local_graph_device_type::row_map_type curRowOffsets =
1232  myGraph_->rowPtrsUnpacked_dev_;
1233 
1234  if (debug) {
1235  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1236  (curRowOffsets.extent (0) == 0, std::logic_error,
1237  "curRowOffsets.extent(0) == 0.");
1238  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1239  (curRowOffsets.extent (0) != lclNumRows + 1, std::logic_error,
1240  "curRowOffsets.extent(0) = "
1241  << curRowOffsets.extent (0) << " != lclNumRows + 1 = "
1242  << (lclNumRows + 1) << ".");
1243  const size_t numOffsets = curRowOffsets.extent (0);
1244  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1245  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1246  (numOffsets != 0 &&
1247  myGraph_->lclIndsUnpacked_wdv.extent (0) != valToCheck,
1248  std::logic_error, "numOffsets = " <<
1249  numOffsets << " != 0 and myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1250  << myGraph_->lclIndsUnpacked_wdv.extent (0) << " != curRowOffsets("
1251  << numOffsets << ") = " << valToCheck << ".");
1252  }
1253 
1254  if (myGraph_->getLocalNumEntries() !=
1255  myGraph_->getLocalAllocationSize()) {
1256 
1257  // Use the nonconst version of row_map_type for k_ptrs,
1258  // because row_map_type is const and we need to modify k_ptrs here.
1259  typename row_map_type::non_const_type k_ptrs;
1260  row_map_type k_ptrs_const;
1261  lclinds_1d_type k_inds;
1262  values_type k_vals;
1263 
1264  if (verbose) {
1265  std::ostringstream os;
1266  const auto numEnt = myGraph_->getLocalNumEntries();
1267  const auto allocSize = myGraph_->getLocalAllocationSize();
1268  os << *prefix << "Unpacked 1-D storage: numEnt=" << numEnt
1269  << ", allocSize=" << allocSize << endl;
1270  std::cerr << os.str ();
1271  }
1272  // The matrix's current 1-D storage is "unpacked." This means
1273  // the row offsets may differ from what the final row offsets
1274  // should be. This could happen, for example, if the user
1275  // set an upper
1276  // bound on the number of entries per row, but didn't fill all
1277  // those entries.
1278  if (debug && curRowOffsets.extent (0) != 0) {
1279  const size_t numOffsets =
1280  static_cast<size_t> (curRowOffsets.extent (0));
1281  const auto valToCheck = myGraph_->getRowPtrsUnpackedHost()(numOffsets - 1);
1282  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1283  (static_cast<size_t> (valToCheck) !=
1284  static_cast<size_t> (valuesUnpacked_wdv.extent (0)),
1285  std::logic_error, "(unpacked branch) Before "
1286  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1287  << ") = " << valToCheck << " != valuesUnpacked_wdv.extent(0)"
1288  " = " << valuesUnpacked_wdv.extent (0) << ".");
1289  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1290  (static_cast<size_t> (valToCheck) !=
1291  static_cast<size_t> (myGraph_->lclIndsUnpacked_wdv.extent (0)),
1292  std::logic_error, "(unpacked branch) Before "
1293  "allocating or packing, curRowOffsets(" << (numOffsets-1)
1294  << ") = " << valToCheck
1295  << " != myGraph_->lclIndsUnpacked_wdv.extent(0) = "
1296  << myGraph_->lclIndsUnpacked_wdv.extent (0) << ".");
1297  }
1298  // Pack the row offsets into k_ptrs, by doing a sum-scan of
1299  // the array of valid entry counts per row.
1300 
1301  // Total number of entries in the matrix on the calling
1302  // process. We will compute this in the loop below. It's
1303  // cheap to compute and useful as a sanity check.
1304  size_t lclTotalNumEntries = 0;
1305  {
1306  // Allocate the packed row offsets array. We use a nonconst
1307  // temporary (packedRowOffsets) here, because k_ptrs is
1308  // const. We will assign packedRowOffsets to k_ptrs below.
1309  if (verbose) {
1310  std::ostringstream os;
1311  os << *prefix << "Allocate packed row offsets: "
1312  << (lclNumRows+1) << endl;
1313  std::cerr << os.str ();
1314  }
1315  typename row_map_type::non_const_type
1316  packedRowOffsets ("Tpetra::CrsGraph::ptr", lclNumRows + 1);
1317  typename row_entries_type::const_type numRowEnt_h =
1318  myGraph_->k_numRowEntries_;
1319  // We're computing offsets on device. This function can
1320  // handle numRowEnt_h being a host View.
1321  lclTotalNumEntries =
1322  computeOffsetsFromCounts (packedRowOffsets, numRowEnt_h);
1323  // packedRowOffsets is modifiable; k_ptrs isn't, so we have
1324  // to use packedRowOffsets in the loop above and assign here.
1325  k_ptrs = packedRowOffsets;
1326  k_ptrs_const = k_ptrs;
1327  }
1328 
1329  if (debug) {
1330  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1331  (static_cast<size_t> (k_ptrs.extent (0)) != lclNumRows + 1,
1332  std::logic_error,
1333  "(unpacked branch) After packing k_ptrs, "
1334  "k_ptrs.extent(0) = " << k_ptrs.extent (0) << " != "
1335  "lclNumRows+1 = " << (lclNumRows+1) << ".");
1336  const auto valToCheck = getEntryOnHost (k_ptrs, lclNumRows);
1337  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1338  (valToCheck != lclTotalNumEntries, std::logic_error,
1339  "(unpacked branch) After filling k_ptrs, "
1340  "k_ptrs(lclNumRows=" << lclNumRows << ") = " << valToCheck
1341  << " != total number of entries on the calling process = "
1342  << lclTotalNumEntries << ".");
1343  }
1344 
1345  // Allocate the arrays of packed column indices and values.
1346  if (verbose) {
1347  std::ostringstream os;
1348  os << *prefix << "Allocate packed local column indices: "
1349  << lclTotalNumEntries << endl;
1350  std::cerr << os.str ();
1351  }
1352  k_inds = lclinds_1d_type ("Tpetra::CrsGraph::lclInds", lclTotalNumEntries);
1353  if (verbose) {
1354  std::ostringstream os;
1355  os << *prefix << "Allocate packed values: "
1356  << lclTotalNumEntries << endl;
1357  std::cerr << os.str ();
1358  }
1359  k_vals = values_type ("Tpetra::CrsMatrix::values", lclTotalNumEntries);
1360 
1361  // curRowOffsets (myGraph_->rowPtrsUnpacked_) (???), lclIndsUnpacked_wdv,
1362  // and valuesUnpacked_wdv are currently unpacked. Pack them, using
1363  // the packed row offsets array k_ptrs that we created above.
1364  //
1365  // FIXME (mfh 06 Aug 2014) If "Optimize Storage" is false, we
1366  // need to keep around the unpacked row offsets, column
1367  // indices, and values arrays.
1368 
1369  // Pack the column indices from unpacked lclIndsUnpacked_wdv into
1370  // packed k_inds. We will replace lclIndsUnpacked_wdv below.
1371  using inds_packer_type = pack_functor<
1372  typename Graph::local_graph_device_type::entries_type::non_const_type,
1373  typename Graph::local_inds_dualv_type::t_dev::const_type,
1374  typename Graph::local_graph_device_type::row_map_type::non_const_type,
1375  typename Graph::local_graph_device_type::row_map_type>;
1376  inds_packer_type indsPacker (
1377  k_inds,
1378  myGraph_->lclIndsUnpacked_wdv.getDeviceView(Access::ReadOnly),
1379  k_ptrs, curRowOffsets);
1380  using exec_space = typename decltype (k_inds)::execution_space;
1381  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1382  Kokkos::parallel_for
1383  ("Tpetra::CrsMatrix pack column indices",
1384  range_type (0, lclNumRows), indsPacker);
1385 
1386  // Pack the values from unpacked valuesUnpacked_wdv into packed
1387  // k_vals. We will replace valuesPacked_wdv below.
1388  using vals_packer_type = pack_functor<
1389  typename values_type::non_const_type,
1390  typename values_type::const_type,
1391  typename row_map_type::non_const_type,
1392  typename row_map_type::const_type>;
1393  vals_packer_type valsPacker (
1394  k_vals,
1395  this->valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1396  k_ptrs, curRowOffsets);
1397  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1398  range_type (0, lclNumRows), valsPacker);
1399 
1400  if (debug) {
1401  const char myPrefix[] = "(\"Optimize Storage\""
1402  "=true branch) After packing, ";
1403  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1404  (k_ptrs.extent (0) == 0, std::logic_error, myPrefix
1405  << "k_ptrs.extent(0) = 0. This probably means that "
1406  "rowPtrsUnpacked_ was never allocated.");
1407  if (k_ptrs.extent (0) != 0) {
1408  const size_t numOffsets (k_ptrs.extent (0));
1409  const auto valToCheck =
1410  getEntryOnHost (k_ptrs, numOffsets - 1);
1411  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1412  (size_t (valToCheck) != k_vals.extent (0),
1413  std::logic_error, myPrefix <<
1414  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1415  " != k_vals.extent(0) = " << k_vals.extent (0) << ".");
1416  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1417  (size_t (valToCheck) != k_inds.extent (0),
1418  std::logic_error, myPrefix <<
1419  "k_ptrs(" << (numOffsets-1) << ") = " << valToCheck <<
1420  " != k_inds.extent(0) = " << k_inds.extent (0) << ".");
1421  }
1422  }
1423  // Build the local graph.
1424  myGraph_->setRowPtrsPacked(k_ptrs_const);
1425  myGraph_->lclIndsPacked_wdv =
1426  typename crs_graph_type::local_inds_wdv_type(k_inds);
1427  valuesPacked_wdv = values_wdv_type(k_vals);
1428  }
1429  else { // We don't have to pack, so just set the pointers.
1430  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1431  // FIXME? This is already done in the graph fill call - need to avoid the memcpy to host
1432  myGraph_->rowPtrsPacked_dev_ = myGraph_->rowPtrsUnpacked_dev_;
1433  myGraph_->rowPtrsPacked_host_ = myGraph_->rowPtrsUnpacked_host_;
1434  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1435  myGraph_->lclIndsPacked_wdv = myGraph_->lclIndsUnpacked_wdv;
1436  valuesPacked_wdv = valuesUnpacked_wdv;
1437 
1438  if (verbose) {
1439  std::ostringstream os;
1440  os << *prefix << "Storage already packed: rowPtrsUnpacked_: "
1441  << myGraph_->getRowPtrsUnpackedHost().extent(0) << ", lclIndsUnpacked_wdv: "
1442  << myGraph_->lclIndsUnpacked_wdv.extent(0) << ", valuesUnpacked_wdv: "
1443  << valuesUnpacked_wdv.extent(0) << endl;
1444  std::cerr << os.str();
1445  }
1446 
1447  if (debug) {
1448  const char myPrefix[] =
1449  "(\"Optimize Storage\"=false branch) ";
1450  auto rowPtrsUnpackedHost = myGraph_->getRowPtrsUnpackedHost();
1451  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1452  (myGraph_->rowPtrsUnpacked_dev_.extent (0) == 0, std::logic_error, myPrefix
1453  << "myGraph->rowPtrsUnpacked_dev_.extent(0) = 0. This probably means "
1454  "that rowPtrsUnpacked_ was never allocated.");
1455  if (myGraph_->rowPtrsUnpacked_dev_.extent (0) != 0) {
1456  const size_t numOffsets = rowPtrsUnpackedHost.extent (0);
1457  const auto valToCheck = rowPtrsUnpackedHost(numOffsets - 1);
1458  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1459  (size_t (valToCheck) != valuesPacked_wdv.extent (0),
1460  std::logic_error, myPrefix <<
1461  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1462  << " != valuesPacked_wdv.extent(0) = "
1463  << valuesPacked_wdv.extent (0) << ".");
1464  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1465  (size_t (valToCheck) != myGraph_->lclIndsPacked_wdv.extent (0),
1466  std::logic_error, myPrefix <<
1467  "k_ptrs_const(" << (numOffsets-1) << ") = " << valToCheck
1468  << " != myGraph_->lclIndsPacked.extent(0) = "
1469  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1470  }
1471  }
1472  }
1473 
1474  if (debug) {
1475  const char myPrefix[] = "After packing, ";
1476  auto rowPtrsPackedHost = myGraph_->getRowPtrsPackedHost();
1477  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1478  (size_t (rowPtrsPackedHost.extent (0)) != size_t (lclNumRows + 1),
1479  std::logic_error, myPrefix << "myGraph_->rowPtrsPacked_host_.extent(0) = "
1480  << rowPtrsPackedHost.extent (0) << " != lclNumRows+1 = " <<
1481  (lclNumRows+1) << ".");
1482  if (rowPtrsPackedHost.extent (0) != 0) {
1483  const size_t numOffsets (rowPtrsPackedHost.extent (0));
1484  const size_t valToCheck = rowPtrsPackedHost(numOffsets-1);
1485  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1486  (valToCheck != size_t (valuesPacked_wdv.extent (0)),
1487  std::logic_error, myPrefix << "k_ptrs_const(" <<
1488  (numOffsets-1) << ") = " << valToCheck
1489  << " != valuesPacked_wdv.extent(0) = "
1490  << valuesPacked_wdv.extent (0) << ".");
1491  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1492  (valToCheck != size_t (myGraph_->lclIndsPacked_wdv.extent (0)),
1493  std::logic_error, myPrefix << "k_ptrs_const(" <<
1494  (numOffsets-1) << ") = " << valToCheck
1495  << " != myGraph_->lclIndsPacked_wdvk_inds.extent(0) = "
1496  << myGraph_->lclIndsPacked_wdv.extent (0) << ".");
1497  }
1498  }
1499 
1500  // May we ditch the old allocations for the packed (and otherwise
1501  // "optimized") allocations, later in this routine? Optimize
1502  // storage if the graph is not static, or if the graph already has
1503  // optimized storage.
1504  const bool defaultOptStorage =
1505  ! isStaticGraph () || staticGraph_->isStorageOptimized ();
1506  const bool requestOptimizedStorage =
1507  (! params.is_null () &&
1508  params->get ("Optimize Storage", defaultOptStorage)) ||
1509  (params.is_null () && defaultOptStorage);
1510 
1511  // The graph has optimized storage when indices are allocated,
1512  // myGraph_->k_numRowEntries_ is empty, and there are more than
1513  // zero rows on this process.
1514  if (requestOptimizedStorage) {
1515  // Free the old, unpacked, unoptimized allocations.
1516  // Free graph data structures that are only needed for
1517  // unpacked 1-D storage.
1518  if (verbose) {
1519  std::ostringstream os;
1520  os << *prefix << "Optimizing storage: free k_numRowEntries_: "
1521  << myGraph_->k_numRowEntries_.extent(0) << endl;
1522  std::cerr << os.str();
1523  }
1524 
1525  myGraph_->k_numRowEntries_ = row_entries_type ();
1526 
1527  // Keep the new 1-D packed allocations.
1528  // FIXME KDDKDD https://github.com/trilinos/Trilinos/issues/9657
1529  // We directly set the memory spaces to avoid a memcpy from device to host
1530  myGraph_->rowPtrsUnpacked_dev_ = myGraph_->rowPtrsPacked_dev_;
1531  myGraph_->rowPtrsUnpacked_host_ = myGraph_->rowPtrsPacked_host_;
1532  myGraph_->packedUnpackedRowPtrsMatch_ = true;
1533  myGraph_->lclIndsUnpacked_wdv = myGraph_->lclIndsPacked_wdv;
1534  valuesUnpacked_wdv = valuesPacked_wdv;
1535 
1536  myGraph_->storageStatus_ = Details::STORAGE_1D_PACKED;
1537  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1538  }
1539  else {
1540  if (verbose) {
1541  std::ostringstream os;
1542  os << *prefix << "User requested NOT to optimize storage"
1543  << endl;
1544  std::cerr << os.str();
1545  }
1546  }
1547  }
1548 
1549  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1550  void
1552  fillLocalMatrix (const Teuchos::RCP<Teuchos::ParameterList>& params)
1553  {
1554  using ::Tpetra::Details::ProfilingRegion;
1555  using Teuchos::ArrayRCP;
1556  using Teuchos::Array;
1557  using Teuchos::null;
1558  using Teuchos::RCP;
1559  using Teuchos::rcp;
1560  using std::endl;
1561  using row_map_type = typename Graph::local_graph_device_type::row_map_type;
1562  using non_const_row_map_type = typename row_map_type::non_const_type;
1563  using values_type = typename local_matrix_device_type::values_type;
1564  ProfilingRegion regionFLM("Tpetra::CrsMatrix::fillLocalMatrix");
1565  const size_t lclNumRows = getLocalNumRows();
1566 
1567  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1568  std::unique_ptr<std::string> prefix;
1569  if (verbose) {
1570  prefix = this->createPrefix("CrsMatrix", "fillLocalMatrix");
1571  std::ostringstream os;
1572  os << *prefix << "lclNumRows: " << lclNumRows << endl;
1573  std::cerr << os.str ();
1574  }
1575 
1576  // The goals of this routine are first, to allocate and fill
1577  // packed 1-D storage (see below for an explanation) in the vals
1578  // array, and second, to give vals to the local matrix and
1579  // finalize the local matrix. We only need k_ptrs, the packed 1-D
1580  // row offsets, within the scope of this routine, since we're only
1581  // filling the local matrix here (use fillLocalGraphAndMatrix() to
1582  // fill both the graph and the matrix at the same time).
1583 
1584  // get data from staticGraph_
1585  size_t nodeNumEntries = staticGraph_->getLocalNumEntries ();
1586  size_t nodeNumAllocated = staticGraph_->getLocalAllocationSize ();
1587  row_map_type k_rowPtrs = staticGraph_->rowPtrsPacked_dev_;
1588 
1589  row_map_type k_ptrs; // "packed" row offsets array
1590  values_type k_vals; // "packed" values array
1591 
1592  // May we ditch the old allocations for the packed (and otherwise
1593  // "optimized") allocations, later in this routine? Request
1594  // optimized storage by default.
1595  bool requestOptimizedStorage = true;
1596  const bool default_OptimizeStorage =
1597  ! isStaticGraph() || staticGraph_->isStorageOptimized();
1598  if (! params.is_null() &&
1599  ! params->get("Optimize Storage", default_OptimizeStorage)) {
1600  requestOptimizedStorage = false;
1601  }
1602  // If we're not allowed to change a static graph, then we can't
1603  // change the storage of the matrix, either. This means that if
1604  // the graph's storage isn't already optimized, we can't optimize
1605  // the matrix's storage either. Check and give warning, as
1606  // appropriate.
1607  if (! staticGraph_->isStorageOptimized () &&
1608  requestOptimizedStorage) {
1610  (true, std::runtime_error, "You requested optimized storage "
1611  "by setting the \"Optimize Storage\" flag to \"true\" in "
1612  "the ParameterList, or by virtue of default behavior. "
1613  "However, the associated CrsGraph was filled separately and "
1614  "requested not to optimize storage. Therefore, the "
1615  "CrsMatrix cannot optimize storage.");
1616  requestOptimizedStorage = false;
1617  }
1618 
1619  // NOTE: This does not work correctly w/ GCC 12.3 + CUDA due to a compiler bug.
1620  // See: https://github.com/trilinos/Trilinos/issues/12237
1621  //using row_entries_type = decltype (staticGraph_->k_numRowEntries_);
1622  using row_entries_type = typename crs_graph_type::num_row_entries_type;
1623 
1624  // The matrix's values are currently
1625  // stored in a 1-D format. However, this format is "unpacked";
1626  // it doesn't necessarily have the same row offsets as indicated
1627  // by the ptrs array returned by allocRowPtrs. This could
1628  // happen, for example, if the user
1629  // fixed the number of matrix entries in
1630  // each row, but didn't fill all those entries.
1631  //
1632  // As above, we don't need to keep the "packed" row offsets
1633  // array ptrs here, but we do need it here temporarily, so we
1634  // have to allocate it. We'll free ptrs later in this method.
1635  //
1636  // Note that this routine checks whether storage has already
1637  // been packed. This is a common case for solution of nonlinear
1638  // PDEs using the finite element method, as long as the
1639  // structure of the sparse matrix does not change between linear
1640  // solves.
1641  if (nodeNumEntries != nodeNumAllocated) {
1642  if (verbose) {
1643  std::ostringstream os;
1644  os << *prefix << "Unpacked 1-D storage: numEnt="
1645  << nodeNumEntries << ", allocSize=" << nodeNumAllocated
1646  << endl;
1647  std::cerr << os.str();
1648  }
1649  // We have to pack the 1-D storage, since the user didn't fill
1650  // up all requested storage.
1651  if (verbose) {
1652  std::ostringstream os;
1653  os << *prefix << "Allocate packed row offsets: "
1654  << (lclNumRows+1) << endl;
1655  std::cerr << os.str();
1656  }
1657  non_const_row_map_type tmpk_ptrs ("Tpetra::CrsGraph::ptr",
1658  lclNumRows+1);
1659  // Total number of entries in the matrix on the calling
1660  // process. We will compute this in the loop below. It's
1661  // cheap to compute and useful as a sanity check.
1662  size_t lclTotalNumEntries = 0;
1663  k_ptrs = tmpk_ptrs;
1664  {
1665  typename row_entries_type::const_type numRowEnt_h =
1666  staticGraph_->k_numRowEntries_;
1667  // This function can handle the counts being a host View.
1668  lclTotalNumEntries =
1669  Details::computeOffsetsFromCounts (tmpk_ptrs, numRowEnt_h);
1670  }
1671 
1672  // Allocate the "packed" values array.
1673  // It has exactly the right number of entries.
1674  if (verbose) {
1675  std::ostringstream os;
1676  os << *prefix << "Allocate packed values: "
1677  << lclTotalNumEntries << endl;
1678  std::cerr << os.str ();
1679  }
1680  k_vals = values_type ("Tpetra::CrsMatrix::val", lclTotalNumEntries);
1681 
1682  // Pack values_wdv into k_vals. We will replace values_wdv below.
1683  pack_functor<
1684  typename values_type::non_const_type,
1685  typename values_type::const_type,
1686  typename row_map_type::non_const_type,
1687  typename row_map_type::const_type> valsPacker
1688  (k_vals, valuesUnpacked_wdv.getDeviceView(Access::ReadOnly),
1689  tmpk_ptrs, k_rowPtrs);
1690 
1691  using exec_space = typename decltype (k_vals)::execution_space;
1692  using range_type = Kokkos::RangePolicy<exec_space, LocalOrdinal>;
1693  Kokkos::parallel_for ("Tpetra::CrsMatrix pack values",
1694  range_type (0, lclNumRows), valsPacker);
1695  valuesPacked_wdv = values_wdv_type(k_vals);
1696  }
1697  else { // We don't have to pack, so just set the pointer.
1698  valuesPacked_wdv = valuesUnpacked_wdv;
1699  if (verbose) {
1700  std::ostringstream os;
1701  os << *prefix << "Storage already packed: "
1702  << "valuesUnpacked_wdv: " << valuesUnpacked_wdv.extent(0) << endl;
1703  std::cerr << os.str();
1704  }
1705  }
1706 
1707  // May we ditch the old allocations for the packed one?
1708  if (requestOptimizedStorage) {
1709  // The user requested optimized storage, so we can dump the
1710  // unpacked 1-D storage, and keep the packed storage.
1711  valuesUnpacked_wdv = valuesPacked_wdv;
1712 // k_values1D_ = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
1713  this->storageStatus_ = Details::STORAGE_1D_PACKED;
1714  }
1715  }
1716 
1717  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1718  void
1720  insertIndicesAndValues (crs_graph_type& graph,
1721  RowInfo& rowInfo,
1722  const typename crs_graph_type::SLocalGlobalViews& newInds,
1723  const Teuchos::ArrayView<impl_scalar_type>& oldRowVals,
1724  const Teuchos::ArrayView<const impl_scalar_type>& newRowVals,
1725  const ELocalGlobal lg,
1726  const ELocalGlobal I)
1727  {
1728  const size_t oldNumEnt = rowInfo.numEntries;
1729  const size_t numInserted = graph.insertIndices (rowInfo, newInds, lg, I);
1730 
1731  // Use of memcpy here works around an issue with GCC >= 4.9.0,
1732  // that probably relates to scalar_type vs. impl_scalar_type
1733  // aliasing. See history of Tpetra_CrsGraph_def.hpp for
1734  // details; look for GCC_WORKAROUND macro definition.
1735  if (numInserted > 0) {
1736  const size_t startOffset = oldNumEnt;
1737  memcpy ((void*) &oldRowVals[startOffset], &newRowVals[0],
1738  numInserted * sizeof (impl_scalar_type));
1739  }
1740  }
1741 
1742  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1743  void
1745  insertLocalValues (const LocalOrdinal lclRow,
1746  const Teuchos::ArrayView<const LocalOrdinal>& indices,
1747  const Teuchos::ArrayView<const Scalar>& values,
1748  const CombineMode CM)
1749  {
1750  using std::endl;
1751  const char tfecfFuncName[] = "insertLocalValues: ";
1752 
1753  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1754  (! this->isFillActive (), std::runtime_error,
1755  "Fill is not active. After calling fillComplete, you must call "
1756  "resumeFill before you may insert entries into the matrix again.");
1757  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1758  (this->isStaticGraph (), std::runtime_error,
1759  "Cannot insert indices with static graph; use replaceLocalValues() "
1760  "instead.");
1761  // At this point, we know that myGraph_ is nonnull.
1762  crs_graph_type& graph = * (this->myGraph_);
1763  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1764  (graph.colMap_.is_null (), std::runtime_error,
1765  "Cannot insert local indices without a column map.");
1766  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1767  (graph.isGloballyIndexed (),
1768  std::runtime_error, "Graph indices are global; use "
1769  "insertGlobalValues().");
1770  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1771  (values.size () != indices.size (), std::runtime_error,
1772  "values.size() = " << values.size ()
1773  << " != indices.size() = " << indices.size () << ".");
1774  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
1775  ! graph.rowMap_->isNodeLocalElement (lclRow), std::runtime_error,
1776  "Local row index " << lclRow << " does not belong to this process.");
1777 
1778  if (! graph.indicesAreAllocated ()) {
1779  // We only allocate values at most once per process, so it's OK
1780  // to check TPETRA_VERBOSE here.
1781  const bool verbose = Details::Behavior::verbose("CrsMatrix");
1782  this->allocateValues (LocalIndices, GraphNotYetAllocated, verbose);
1783  }
1784 
1785 #ifdef HAVE_TPETRA_DEBUG
1786  const size_t numEntriesToAdd = static_cast<size_t> (indices.size ());
1787  // In a debug build, test whether any of the given column indices
1788  // are not in the column Map. Keep track of the invalid column
1789  // indices so we can tell the user about them.
1790  {
1791  using Teuchos::toString;
1792 
1793  const map_type& colMap = * (graph.colMap_);
1794  Teuchos::Array<LocalOrdinal> badColInds;
1795  bool allInColMap = true;
1796  for (size_t k = 0; k < numEntriesToAdd; ++k) {
1797  if (! colMap.isNodeLocalElement (indices[k])) {
1798  allInColMap = false;
1799  badColInds.push_back (indices[k]);
1800  }
1801  }
1802  if (! allInColMap) {
1803  std::ostringstream os;
1804  os << "You attempted to insert entries in owned row " << lclRow
1805  << ", at the following column indices: " << toString (indices)
1806  << "." << endl;
1807  os << "Of those, the following indices are not in the column Map on "
1808  "this process: " << toString (badColInds) << "." << endl << "Since "
1809  "the matrix has a column Map already, it is invalid to insert "
1810  "entries at those locations.";
1811  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1812  (true, std::invalid_argument, os.str ());
1813  }
1814  }
1815 #endif // HAVE_TPETRA_DEBUG
1816 
1817  RowInfo rowInfo = graph.getRowInfo (lclRow);
1818 
1819  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1820  if (CM == ADD) {
1821  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1822  valsView[offset] += values[k]; };
1823  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1824  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1825  } else if (CM == INSERT) {
1826  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset) {
1827  valsView[offset] = values[k]; };
1828  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1829  graph.insertLocalIndicesImpl(lclRow, indices, cb);
1830  } else {
1831  std::ostringstream os;
1832  os << "You attempted to use insertLocalValues with CombineMode " << combineModeToString(CM)
1833  << "but this has not been implemented." << endl;
1834  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1835  (true, std::invalid_argument, os.str ());
1836  }
1837  }
1838 
1839  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1840  void
1842  insertLocalValues (const LocalOrdinal localRow,
1843  const LocalOrdinal numEnt,
1844  const Scalar vals[],
1845  const LocalOrdinal cols[],
1846  const CombineMode CM)
1847  {
1848  Teuchos::ArrayView<const LocalOrdinal> colsT (cols, numEnt);
1849  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
1850  this->insertLocalValues (localRow, colsT, valsT, CM);
1851  }
1852 
1853  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1854  void
1857  RowInfo& rowInfo,
1858  const GlobalOrdinal gblColInds[],
1859  const impl_scalar_type vals[],
1860  const size_t numInputEnt)
1861  {
1862 #ifdef HAVE_TPETRA_DEBUG
1863  const char tfecfFuncName[] = "insertGlobalValuesImpl: ";
1864  const size_t origNumEnt = graph.getNumEntriesInLocalRow (rowInfo.localRow);
1865  const size_t curNumEnt = rowInfo.numEntries;
1866 #endif // HAVE_TPETRA_DEBUG
1867 
1868  if (! graph.indicesAreAllocated ()) {
1869  // We only allocate values at most once per process, so it's OK
1870  // to check TPETRA_VERBOSE here.
1871  using ::Tpetra::Details::Behavior;
1872  const bool verbose = Behavior::verbose("CrsMatrix");
1873  this->allocateValues (GlobalIndices, GraphNotYetAllocated, verbose);
1874  // mfh 23 Jul 2017: allocateValues invalidates existing
1875  // getRowInfo results. Once we get rid of lazy graph
1876  // allocation, we'll be able to move the getRowInfo call outside
1877  // of this method.
1878  rowInfo = graph.getRowInfo (rowInfo.localRow);
1879  }
1880 
1881  auto valsView = this->getValuesViewHostNonConst(rowInfo);
1882  auto fun = [&](size_t const k, size_t const /*start*/, size_t const offset){
1883  valsView[offset] += vals[k];
1884  };
1885  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
1886 #ifdef HAVE_TPETRA_DEBUG
1887  //numInserted is only used inside the debug code below.
1888  auto numInserted =
1889 #endif
1890  graph.insertGlobalIndicesImpl(rowInfo, gblColInds, numInputEnt, cb);
1891 
1892 #ifdef HAVE_TPETRA_DEBUG
1893  size_t newNumEnt = curNumEnt + numInserted;
1894  const size_t chkNewNumEnt =
1895  graph.getNumEntriesInLocalRow (rowInfo.localRow);
1896  if (chkNewNumEnt != newNumEnt) {
1897  std::ostringstream os;
1898  os << std::endl << "newNumEnt = " << newNumEnt
1899  << " != graph.getNumEntriesInLocalRow(" << rowInfo.localRow
1900  << ") = " << chkNewNumEnt << "." << std::endl
1901  << "\torigNumEnt: " << origNumEnt << std::endl
1902  << "\tnumInputEnt: " << numInputEnt << std::endl
1903  << "\tgblColInds: [";
1904  for (size_t k = 0; k < numInputEnt; ++k) {
1905  os << gblColInds[k];
1906  if (k + size_t (1) < numInputEnt) {
1907  os << ",";
1908  }
1909  }
1910  os << "]" << std::endl
1911  << "\tvals: [";
1912  for (size_t k = 0; k < numInputEnt; ++k) {
1913  os << vals[k];
1914  if (k + size_t (1) < numInputEnt) {
1915  os << ",";
1916  }
1917  }
1918  os << "]" << std::endl;
1919 
1920  if (this->supportsRowViews ()) {
1921  values_host_view_type vals2;
1922  if (this->isGloballyIndexed ()) {
1923  global_inds_host_view_type gblColInds2;
1924  const GlobalOrdinal gblRow =
1925  graph.rowMap_->getGlobalElement (rowInfo.localRow);
1926  if (gblRow ==
1927  Tpetra::Details::OrdinalTraits<GlobalOrdinal>::invalid ()) {
1928  os << "Local row index " << rowInfo.localRow << " is invalid!"
1929  << std::endl;
1930  }
1931  else {
1932  bool getViewThrew = false;
1933  try {
1934  this->getGlobalRowView (gblRow, gblColInds2, vals2);
1935  }
1936  catch (std::exception& e) {
1937  getViewThrew = true;
1938  os << "getGlobalRowView threw exception:" << std::endl
1939  << e.what () << std::endl;
1940  }
1941  if (! getViewThrew) {
1942  os << "\tNew global column indices: ";
1943  for (size_t jjj = 0; jjj < gblColInds2.extent(0); jjj++)
1944  os << gblColInds2[jjj] << " ";
1945  os << std::endl;
1946  os << "\tNew values: ";
1947  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1948  os << vals2[jjj] << " ";
1949  os << std::endl;
1950  }
1951  }
1952  }
1953  else if (this->isLocallyIndexed ()) {
1954  local_inds_host_view_type lclColInds2;
1955  this->getLocalRowView (rowInfo.localRow, lclColInds2, vals2);
1956  os << "\tNew local column indices: ";
1957  for (size_t jjj = 0; jjj < lclColInds2.extent(0); jjj++)
1958  os << lclColInds2[jjj] << " ";
1959  os << std::endl;
1960  os << "\tNew values: ";
1961  for (size_t jjj = 0; jjj < vals2.extent(0); jjj++)
1962  os << vals2[jjj] << " ";
1963  os << std::endl;
1964  }
1965  }
1966 
1967  os << "Please report this bug to the Tpetra developers.";
1968  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1969  (true, std::logic_error, os.str ());
1970  }
1971 #endif // HAVE_TPETRA_DEBUG
1972  }
1973 
1974  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
1975  void
1977  insertGlobalValues (const GlobalOrdinal gblRow,
1978  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
1979  const Teuchos::ArrayView<const Scalar>& values)
1980  {
1981  using Teuchos::toString;
1982  using std::endl;
1983  typedef impl_scalar_type IST;
1984  typedef LocalOrdinal LO;
1985  typedef GlobalOrdinal GO;
1986  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
1987  typedef typename Teuchos::ArrayView<const GO>::size_type size_type;
1988  const char tfecfFuncName[] = "insertGlobalValues: ";
1989 
1990 #ifdef HAVE_TPETRA_DEBUG
1991  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
1992  (values.size () != indices.size (), std::runtime_error,
1993  "values.size() = " << values.size () << " != indices.size() = "
1994  << indices.size () << ".");
1995 #endif // HAVE_TPETRA_DEBUG
1996 
1997  // getRowMap() is not thread safe, because it increments RCP's
1998  // reference count. getCrsGraphRef() is thread safe.
1999  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2000  const LO lclRow = rowMap.getLocalElement (gblRow);
2001 
2002  if (lclRow == OTLO::invalid ()) {
2003  // Input row is _not_ owned by the calling process.
2004  //
2005  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2006  // is not in the row Map, it doesn't matter whether or not the
2007  // graph is static; the data just get stashed for later use by
2008  // globalAssemble().
2009  this->insertNonownedGlobalValues (gblRow, indices, values);
2010  }
2011  else { // Input row _is_ owned by the calling process
2012  if (this->isStaticGraph ()) {
2013  // Uh oh! Not allowed to insert into owned rows in that case.
2014  const int myRank = rowMap.getComm ()->getRank ();
2015  const int numProcs = rowMap.getComm ()->getSize ();
2016  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2017  (true, std::runtime_error,
2018  "The matrix was constructed with a constant (\"static\") graph, "
2019  "yet the given global row index " << gblRow << " is in the row "
2020  "Map on the calling process (with rank " << myRank << ", of " <<
2021  numProcs << " process(es)). In this case, you may not insert "
2022  "new entries into rows owned by the calling process.");
2023  }
2024 
2025  crs_graph_type& graph = * (this->myGraph_);
2026  const IST* const inputVals =
2027  reinterpret_cast<const IST*> (values.getRawPtr ());
2028  const GO* const inputGblColInds = indices.getRawPtr ();
2029  const size_t numInputEnt = indices.size ();
2030  RowInfo rowInfo = graph.getRowInfo (lclRow);
2031 
2032  // If the matrix has a column Map, check at this point whether
2033  // the column indices belong to the column Map.
2034  //
2035  // FIXME (mfh 16 May 2013) We may want to consider deferring the
2036  // test to the CrsGraph method, since it may have to do this
2037  // anyway.
2038  if (! graph.colMap_.is_null ()) {
2039  const map_type& colMap = * (graph.colMap_);
2040  // In a debug build, keep track of the nonowned ("bad") column
2041  // indices, so that we can display them in the exception
2042  // message. In a release build, just ditch the loop early if
2043  // we encounter a nonowned column index.
2044 #ifdef HAVE_TPETRA_DEBUG
2045  Teuchos::Array<GO> badColInds;
2046 #endif // HAVE_TPETRA_DEBUG
2047  const size_type numEntriesToInsert = indices.size ();
2048  bool allInColMap = true;
2049  for (size_type k = 0; k < numEntriesToInsert; ++k) {
2050  if (! colMap.isNodeGlobalElement (indices[k])) {
2051  allInColMap = false;
2052 #ifdef HAVE_TPETRA_DEBUG
2053  badColInds.push_back (indices[k]);
2054 #else
2055  break;
2056 #endif // HAVE_TPETRA_DEBUG
2057  }
2058  }
2059  if (! allInColMap) {
2060  std::ostringstream os;
2061  os << "You attempted to insert entries in owned row " << gblRow
2062  << ", at the following column indices: " << toString (indices)
2063  << "." << endl;
2064 #ifdef HAVE_TPETRA_DEBUG
2065  os << "Of those, the following indices are not in the column Map "
2066  "on this process: " << toString (badColInds) << "." << endl
2067  << "Since the matrix has a column Map already, it is invalid "
2068  "to insert entries at those locations.";
2069 #else
2070  os << "At least one of those indices is not in the column Map "
2071  "on this process." << endl << "It is invalid to insert into "
2072  "columns not in the column Map on the process that owns the "
2073  "row.";
2074 #endif // HAVE_TPETRA_DEBUG
2075  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2076  (true, std::invalid_argument, os.str ());
2077  }
2078  }
2079 
2080  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2081  inputVals, numInputEnt);
2082  }
2083  }
2084 
2085 
2086  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2087  void
2089  insertGlobalValues (const GlobalOrdinal globalRow,
2090  const LocalOrdinal numEnt,
2091  const Scalar vals[],
2092  const GlobalOrdinal inds[])
2093  {
2094  Teuchos::ArrayView<const GlobalOrdinal> indsT (inds, numEnt);
2095  Teuchos::ArrayView<const Scalar> valsT (vals, numEnt);
2096  this->insertGlobalValues (globalRow, indsT, valsT);
2097  }
2098 
2099 
2100  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2101  void
2104  const GlobalOrdinal gblRow,
2105  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2106  const Teuchos::ArrayView<const Scalar>& values,
2107  const bool debug)
2108  {
2109  typedef impl_scalar_type IST;
2110  typedef LocalOrdinal LO;
2111  typedef GlobalOrdinal GO;
2112  typedef Tpetra::Details::OrdinalTraits<LO> OTLO;
2113  const char tfecfFuncName[] = "insertGlobalValuesFiltered: ";
2114 
2115  if (debug) {
2116  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2117  (values.size () != indices.size (), std::runtime_error,
2118  "values.size() = " << values.size () << " != indices.size() = "
2119  << indices.size () << ".");
2120  }
2121 
2122  // getRowMap() is not thread safe, because it increments RCP's
2123  // reference count. getCrsGraphRef() is thread safe.
2124  const map_type& rowMap = * (this->getCrsGraphRef ().rowMap_);
2125  const LO lclRow = rowMap.getLocalElement (gblRow);
2126  if (lclRow == OTLO::invalid ()) {
2127  // Input row is _not_ owned by the calling process.
2128  //
2129  // See a note (now deleted) from mfh 14 Dec 2012: If input row
2130  // is not in the row Map, it doesn't matter whether or not the
2131  // graph is static; the data just get stashed for later use by
2132  // globalAssemble().
2133  this->insertNonownedGlobalValues (gblRow, indices, values);
2134  }
2135  else { // Input row _is_ owned by the calling process
2136  if (this->isStaticGraph ()) {
2137  // Uh oh! Not allowed to insert into owned rows in that case.
2138  const int myRank = rowMap.getComm ()->getRank ();
2139  const int numProcs = rowMap.getComm ()->getSize ();
2140  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2141  (true, std::runtime_error,
2142  "The matrix was constructed with a constant (\"static\") graph, "
2143  "yet the given global row index " << gblRow << " is in the row "
2144  "Map on the calling process (with rank " << myRank << ", of " <<
2145  numProcs << " process(es)). In this case, you may not insert "
2146  "new entries into rows owned by the calling process.");
2147  }
2148 
2149  crs_graph_type& graph = * (this->myGraph_);
2150  const IST* const inputVals =
2151  reinterpret_cast<const IST*> (values.getRawPtr ());
2152  const GO* const inputGblColInds = indices.getRawPtr ();
2153  const size_t numInputEnt = indices.size ();
2154  RowInfo rowInfo = graph.getRowInfo (lclRow);
2155 
2156  if (!graph.colMap_.is_null() && graph.isLocallyIndexed()) {
2157  // This branch is similar in function to the following branch, but for
2158  // the special case that the target graph is locally indexed.
2159  // In this case, we cannot simply filter
2160  // out global indices that don't exist on the receiving process and
2161  // insert the remaining (global) indices, but we must convert them (the
2162  // remaining global indices) to local and call `insertLocalValues`.
2163  const map_type& colMap = * (graph.colMap_);
2164  size_t curOffset = 0;
2165  while (curOffset < numInputEnt) {
2166  // Find a sequence of input indices that are in the column Map on the
2167  // calling process. Doing a sequence at a time, instead of one at a
2168  // time, amortizes some overhead.
2169  Teuchos::Array<LO> lclIndices;
2170  size_t endOffset = curOffset;
2171  for ( ; endOffset < numInputEnt; ++endOffset) {
2172  auto lclIndex = colMap.getLocalElement(inputGblColInds[endOffset]);
2173  if (lclIndex != OTLO::invalid())
2174  lclIndices.push_back(lclIndex);
2175  else
2176  break;
2177  }
2178  // curOffset, endOffset: half-exclusive range of indices in the column
2179  // Map on the calling process. If endOffset == curOffset, the range is
2180  // empty.
2181  const LO numIndInSeq = (endOffset - curOffset);
2182  if (numIndInSeq != 0) {
2183  this->insertLocalValues(lclRow, lclIndices(), values(curOffset, numIndInSeq));
2184  }
2185  // Invariant before the increment line: Either endOffset ==
2186  // numInputEnt, or inputGblColInds[endOffset] is not in the column Map
2187  // on the calling process.
2188  if (debug) {
2189  const bool invariant = endOffset == numInputEnt ||
2190  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2191  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2192  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2193  }
2194  curOffset = endOffset + 1;
2195  }
2196  }
2197  else if (! graph.colMap_.is_null ()) { // We have a column Map.
2198  const map_type& colMap = * (graph.colMap_);
2199  size_t curOffset = 0;
2200  while (curOffset < numInputEnt) {
2201  // Find a sequence of input indices that are in the column
2202  // Map on the calling process. Doing a sequence at a time,
2203  // instead of one at a time, amortizes some overhead.
2204  size_t endOffset = curOffset;
2205  for ( ; endOffset < numInputEnt &&
2206  colMap.getLocalElement (inputGblColInds[endOffset]) != OTLO::invalid ();
2207  ++endOffset)
2208  {}
2209  // curOffset, endOffset: half-exclusive range of indices in
2210  // the column Map on the calling process. If endOffset ==
2211  // curOffset, the range is empty.
2212  const LO numIndInSeq = (endOffset - curOffset);
2213  if (numIndInSeq != 0) {
2214  rowInfo = graph.getRowInfo(lclRow); // KDD 5/19 Need fresh RowInfo in each loop iteration
2215  this->insertGlobalValuesImpl (graph, rowInfo,
2216  inputGblColInds + curOffset,
2217  inputVals + curOffset,
2218  numIndInSeq);
2219  }
2220  // Invariant before the increment line: Either endOffset ==
2221  // numInputEnt, or inputGblColInds[endOffset] is not in the
2222  // column Map on the calling process.
2223  if (debug) {
2224  const bool invariant = endOffset == numInputEnt ||
2225  colMap.getLocalElement (inputGblColInds[endOffset]) == OTLO::invalid ();
2226  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
2227  (! invariant, std::logic_error, std::endl << "Invariant failed!");
2228  }
2229  curOffset = endOffset + 1;
2230  }
2231  }
2232  else { // we don't have a column Map.
2233  this->insertGlobalValuesImpl (graph, rowInfo, inputGblColInds,
2234  inputVals, numInputEnt);
2235  }
2236  }
2237  }
2238 
2239  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2240  void
2241  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2242  insertGlobalValuesFilteredChecked(
2243  const GlobalOrdinal gblRow,
2244  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
2245  const Teuchos::ArrayView<const Scalar>& values,
2246  const char* const prefix,
2247  const bool debug,
2248  const bool verbose)
2249  {
2251  using std::endl;
2252 
2253  try {
2254  insertGlobalValuesFiltered(gblRow, indices, values, debug);
2255  }
2256  catch(std::exception& e) {
2257  std::ostringstream os;
2258  if (verbose) {
2259  const size_t maxNumToPrint =
2261  os << *prefix << ": insertGlobalValuesFiltered threw an "
2262  "exception: " << e.what() << endl
2263  << "Global row index: " << gblRow << endl;
2264  verbosePrintArray(os, indices, "Global column indices",
2265  maxNumToPrint);
2266  os << endl;
2267  verbosePrintArray(os, values, "Values", maxNumToPrint);
2268  os << endl;
2269  }
2270  else {
2271  os << ": insertGlobalValuesFiltered threw an exception: "
2272  << e.what();
2273  }
2274  TEUCHOS_TEST_FOR_EXCEPTION(true, std::runtime_error, os.str());
2275  }
2276  }
2277 
2278  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2279  LocalOrdinal
2282  const crs_graph_type& graph,
2283  const RowInfo& rowInfo,
2284  const LocalOrdinal inds[],
2285  const impl_scalar_type newVals[],
2286  const LocalOrdinal numElts)
2287  {
2288  typedef LocalOrdinal LO;
2289  typedef GlobalOrdinal GO;
2290  const bool sorted = graph.isSorted ();
2291 
2292  size_t hint = 0; // Guess for the current index k into rowVals
2293  LO numValid = 0; // number of valid local column indices
2294 
2295  if (graph.isLocallyIndexed ()) {
2296  // Get a view of the column indices in the row. This amortizes
2297  // the cost of getting the view over all the entries of inds.
2298  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2299 
2300  for (LO j = 0; j < numElts; ++j) {
2301  const LO lclColInd = inds[j];
2302  const size_t offset =
2303  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2304  lclColInd, hint, sorted);
2305  if (offset != rowInfo.numEntries) {
2306  rowVals[offset] = newVals[j];
2307  hint = offset + 1;
2308  ++numValid;
2309  }
2310  }
2311  }
2312  else if (graph.isGloballyIndexed ()) {
2313  if (graph.colMap_.is_null ()) {
2314  return Teuchos::OrdinalTraits<LO>::invalid ();
2315  }
2316  const map_type colMap = * (graph.colMap_);
2317 
2318  // Get a view of the column indices in the row. This amortizes
2319  // the cost of getting the view over all the entries of inds.
2320  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2321 
2322  for (LO j = 0; j < numElts; ++j) {
2323  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2324  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
2325  const size_t offset =
2326  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2327  gblColInd, hint, sorted);
2328  if (offset != rowInfo.numEntries) {
2329  rowVals[offset] = newVals[j];
2330  hint = offset + 1;
2331  ++numValid;
2332  }
2333  }
2334  }
2335  }
2336  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
2337  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
2338  // to be neither locally nor globally indexed on a process.
2339  // This means that the graph or matrix has no entries on that
2340  // process. Epetra also works like this. It's related to lazy
2341  // allocation (on first insertion, not at graph / matrix
2342  // construction). Lazy allocation will go away because it is
2343  // not thread scalable.
2344 
2345  return numValid;
2346  }
2347 
2348  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2349  LocalOrdinal
2351  replaceLocalValues (const LocalOrdinal localRow,
2352  const Teuchos::ArrayView<const LocalOrdinal>& lclCols,
2353  const Teuchos::ArrayView<const Scalar>& vals)
2354  {
2355  typedef LocalOrdinal LO;
2356 
2357  const LO numInputEnt = static_cast<LO> (lclCols.size ());
2358  if (static_cast<LO> (vals.size ()) != numInputEnt) {
2359  return Teuchos::OrdinalTraits<LO>::invalid ();
2360  }
2361  const LO* const inputInds = lclCols.getRawPtr ();
2362  const Scalar* const inputVals = vals.getRawPtr ();
2363  return this->replaceLocalValues (localRow, numInputEnt,
2364  inputVals, inputInds);
2365  }
2366 
2367  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2369  local_ordinal_type
2372  const local_ordinal_type localRow,
2373  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2374  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2375  {
2376  using LO = local_ordinal_type;
2377  const LO numInputEnt = inputInds.extent(0);
2378  if (numInputEnt != static_cast<LO>(inputVals.extent(0))) {
2379  return Teuchos::OrdinalTraits<LO>::invalid();
2380  }
2381  const Scalar* const inVals =
2382  reinterpret_cast<const Scalar*>(inputVals.data());
2383  return this->replaceLocalValues(localRow, numInputEnt,
2384  inVals, inputInds.data());
2385  }
2386 
2387  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2388  LocalOrdinal
2390  replaceLocalValues (const LocalOrdinal localRow,
2391  const LocalOrdinal numEnt,
2392  const Scalar inputVals[],
2393  const LocalOrdinal inputCols[])
2394  {
2395  typedef impl_scalar_type IST;
2396  typedef LocalOrdinal LO;
2397 
2398  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2399  // Fill must be active and the "nonconst" graph must exist.
2400  return Teuchos::OrdinalTraits<LO>::invalid ();
2401  }
2402  const crs_graph_type& graph = * (this->staticGraph_);
2403  const RowInfo rowInfo = graph.getRowInfo (localRow);
2404 
2405  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2406  // The calling process does not own this row, so it is not
2407  // allowed to modify its values.
2408  return static_cast<LO> (0);
2409  }
2410  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2411  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2412  return this->replaceLocalValuesImpl (curRowVals.data (), graph, rowInfo,
2413  inputCols, inVals, numEnt);
2414  }
2415 
2416  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2417  LocalOrdinal
2420  const crs_graph_type& graph,
2421  const RowInfo& rowInfo,
2422  const GlobalOrdinal inds[],
2423  const impl_scalar_type newVals[],
2424  const LocalOrdinal numElts)
2425  {
2426  Teuchos::ArrayView<const GlobalOrdinal> indsT(inds, numElts);
2427  auto fun =
2428  [&](size_t const k, size_t const /*start*/, size_t const offset) {
2429  rowVals[offset] = newVals[k];
2430  };
2431  std::function<void(size_t const, size_t const, size_t const)> cb(std::ref(fun));
2432  return graph.findGlobalIndices(rowInfo, indsT, cb);
2433  }
2434 
2435  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2436  LocalOrdinal
2438  replaceGlobalValues (const GlobalOrdinal globalRow,
2439  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2440  const Teuchos::ArrayView<const Scalar>& inputVals)
2441  {
2442  typedef LocalOrdinal LO;
2443 
2444  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2445  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2446  return Teuchos::OrdinalTraits<LO>::invalid ();
2447  }
2448  return this->replaceGlobalValues (globalRow, numInputEnt,
2449  inputVals.getRawPtr (),
2450  inputGblColInds.getRawPtr ());
2451  }
2452 
2453  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2454  LocalOrdinal
2456  replaceGlobalValues (const GlobalOrdinal globalRow,
2457  const LocalOrdinal numEnt,
2458  const Scalar inputVals[],
2459  const GlobalOrdinal inputGblColInds[])
2460  {
2461  typedef impl_scalar_type IST;
2462  typedef LocalOrdinal LO;
2463 
2464  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2465  // Fill must be active and the "nonconst" graph must exist.
2466  return Teuchos::OrdinalTraits<LO>::invalid ();
2467  }
2468  const crs_graph_type& graph = * (this->staticGraph_);
2469 
2470  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (globalRow);
2471  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2472  // The input local row is invalid on the calling process,
2473  // which means that the calling process summed 0 entries.
2474  return static_cast<LO> (0);
2475  }
2476 
2477  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2478  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2479  return this->replaceGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2480  inputGblColInds, inVals, numEnt);
2481  }
2482 
// Kokkos::View overload: verifies that inputInds and inputVals have the same
// extent, then forwards raw pointers to this->replaceGlobalValues.
//
// Returns Teuchos::OrdinalTraits<local_ordinal_type>::invalid() when the two
// extents differ; otherwise whatever the forwarded call returns.
//
// NOTE(review): the listing lines that carry the class qualifier and the
// function name are not visible here; presumably this is the Kokkos::View
// overload of replaceGlobalValues -- confirm against the class declaration.
2483  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2485  local_ordinal_type
2488  const global_ordinal_type globalRow,
2489  const Kokkos::View<const global_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
2490  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals)
2491  {
2492  // We use static_assert here to check the template parameters,
2493  // rather than std::enable_if (e.g., on the return value, to
2494  // enable compilation only if the template parameters match the
2495  // desired attributes). This turns obscure link errors into
2496  // clear compilation errors. It also makes the return value a
2497  // lot easier to see.
  // NOTE(review): no static_assert appears in the visible body, so the
  // remark above looks stale.
2498  using LO = local_ordinal_type;
2499  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
2500  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
2501  return Teuchos::OrdinalTraits<LO>::invalid();
2502  }
  // The forwarded overload takes Scalar, while the view stores
  // impl_scalar_type, hence the pointer cast.
2503  const Scalar* const inVals =
2504  reinterpret_cast<const Scalar*>(inputVals.data());
2505  return this->replaceGlobalValues(globalRow, numInputEnt, inVals,
2506  inputInds.data());
2507  }
2508 
2509  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2510  LocalOrdinal
2513  const crs_graph_type& graph,
2514  const RowInfo& rowInfo,
2515  const GlobalOrdinal inds[],
2516  const impl_scalar_type newVals[],
2517  const LocalOrdinal numElts,
2518  const bool atomic)
2519  {
2520  typedef LocalOrdinal LO;
2521  typedef GlobalOrdinal GO;
2522 
2523  const bool sorted = graph.isSorted ();
2524 
2525  size_t hint = 0; // guess at the index's relative offset in the row
2526  LO numValid = 0; // number of valid input column indices
2527 
2528  if (graph.isLocallyIndexed ()) {
2529  // NOTE (mfh 04 Nov 2015) Dereferencing an RCP or reading its
2530  // pointer does NOT change its reference count. Thus, this
2531  // code is still thread safe.
2532  if (graph.colMap_.is_null ()) {
2533  // NO input column indices are valid in this case, since if
2534  // the column Map is null on the calling process, then the
2535  // calling process owns no graph entries.
2536  return numValid;
2537  }
2538  const map_type& colMap = * (graph.colMap_);
2539 
2540  // Get a view of the column indices in the row. This amortizes
2541  // the cost of getting the view over all the entries of inds.
2542  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2543  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2544 
2545  for (LO j = 0; j < numElts; ++j) {
2546  const LO lclColInd = colMap.getLocalElement (inds[j]);
2547  if (lclColInd != LINV) {
2548  const size_t offset =
2549  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2550  lclColInd, hint, sorted);
2551  if (offset != rowInfo.numEntries) {
2552  if (atomic) {
2553  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2554  }
2555  else {
2556  rowVals[offset] += newVals[j];
2557  }
2558  hint = offset + 1;
2559  numValid++;
2560  }
2561  }
2562  }
2563  }
2564  else if (graph.isGloballyIndexed ()) {
2565  // Get a view of the column indices in the row. This amortizes
2566  // the cost of getting the view over all the entries of inds.
2567  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2568 
2569  for (LO j = 0; j < numElts; ++j) {
2570  const GO gblColInd = inds[j];
2571  const size_t offset =
2572  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2573  gblColInd, hint, sorted);
2574  if (offset != rowInfo.numEntries) {
2575  if (atomic) {
2576  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2577  }
2578  else {
2579  rowVals[offset] += newVals[j];
2580  }
2581  hint = offset + 1;
2582  numValid++;
2583  }
2584  }
2585  }
2586  // If the graph is neither locally nor globally indexed on the
2587  // calling process, that means the calling process has no graph
2588  // entries. Thus, none of the input column indices are valid.
2589 
2590  return numValid;
2591  }
2592 
2593  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2594  LocalOrdinal
2596  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2597  const Teuchos::ArrayView<const GlobalOrdinal>& inputGblColInds,
2598  const Teuchos::ArrayView<const Scalar>& inputVals,
2599  const bool atomic)
2600  {
2601  typedef LocalOrdinal LO;
2602 
2603  const LO numInputEnt = static_cast<LO> (inputGblColInds.size ());
2604  if (static_cast<LO> (inputVals.size ()) != numInputEnt) {
2605  return Teuchos::OrdinalTraits<LO>::invalid ();
2606  }
2607  return this->sumIntoGlobalValues (gblRow, numInputEnt,
2608  inputVals.getRawPtr (),
2609  inputGblColInds.getRawPtr (),
2610  atomic);
2611  }
2612 
2613  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2614  LocalOrdinal
2616  sumIntoGlobalValues (const GlobalOrdinal gblRow,
2617  const LocalOrdinal numInputEnt,
2618  const Scalar inputVals[],
2619  const GlobalOrdinal inputGblColInds[],
2620  const bool atomic)
2621  {
2622  typedef impl_scalar_type IST;
2623  typedef LocalOrdinal LO;
2624  typedef GlobalOrdinal GO;
2625 
2626  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2627  // Fill must be active and the "nonconst" graph must exist.
2628  return Teuchos::OrdinalTraits<LO>::invalid ();
2629  }
2630  const crs_graph_type& graph = * (this->staticGraph_);
2631 
2632  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2633  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
2634  // mfh 23 Mar 2017, 26 Jul 2017: This branch may not be not
2635  // thread safe in a debug build, in part because it uses
2636  // Teuchos::ArrayView, and in part because of the data structure
2637  // used to stash outgoing entries.
2638  using Teuchos::ArrayView;
2639  ArrayView<const GO> inputGblColInds_av(
2640  numInputEnt == 0 ? nullptr : inputGblColInds,
2641  numInputEnt);
2642  ArrayView<const Scalar> inputVals_av(
2643  numInputEnt == 0 ? nullptr :
2644  inputVals, numInputEnt);
2645  // gblRow is not in the row Map on the calling process, so stash
2646  // the given entries away in a separate data structure.
2647  // globalAssemble() (called during fillComplete()) will exchange
2648  // that data and sum it in using sumIntoGlobalValues().
2649  this->insertNonownedGlobalValues (gblRow, inputGblColInds_av,
2650  inputVals_av);
2651  // FIXME (mfh 08 Jul 2014) It's not clear what to return here,
2652  // since we won't know whether the given indices were valid
2653  // until globalAssemble (called in fillComplete) is called.
2654  // That's why insertNonownedGlobalValues doesn't return
2655  // anything. Just for consistency, I'll return the number of
2656  // entries that the user gave us.
2657  return numInputEnt;
2658  }
2659  else { // input row is in the row Map on the calling process
2660  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2661  const IST* const inVals = reinterpret_cast<const IST*> (inputVals);
2662  return this->sumIntoGlobalValuesImpl (curRowVals.data (), graph, rowInfo,
2663  inputGblColInds, inVals,
2664  numInputEnt, atomic);
2665  }
2666  }
2667 
2668  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2669  LocalOrdinal
2671  transformLocalValues (const LocalOrdinal lclRow,
2672  const LocalOrdinal numInputEnt,
2673  const impl_scalar_type inputVals[],
2674  const LocalOrdinal inputCols[],
2675  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2676  const bool atomic)
2677  {
2678  using Tpetra::Details::OrdinalTraits;
2679  typedef LocalOrdinal LO;
2680 
2681  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2682  // Fill must be active and the "nonconst" graph must exist.
2683  return Teuchos::OrdinalTraits<LO>::invalid ();
2684  }
2685  const crs_graph_type& graph = * (this->staticGraph_);
2686  const RowInfo rowInfo = graph.getRowInfo (lclRow);
2687 
2688  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2689  // The calling process does not own this row, so it is not
2690  // allowed to modify its values.
2691  return static_cast<LO> (0);
2692  }
2693  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2694  return this->transformLocalValues (curRowVals.data (), graph,
2695  rowInfo, inputCols, inputVals,
2696  numInputEnt, f, atomic);
2697  }
2698 
2699  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2700  LocalOrdinal
2701  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2702  transformGlobalValues (const GlobalOrdinal gblRow,
2703  const LocalOrdinal numInputEnt,
2704  const impl_scalar_type inputVals[],
2705  const GlobalOrdinal inputCols[],
2706  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2707  const bool atomic)
2708  {
2709  using Tpetra::Details::OrdinalTraits;
2710  typedef LocalOrdinal LO;
2711 
2712  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
2713  // Fill must be active and the "nonconst" graph must exist.
2714  return OrdinalTraits<LO>::invalid ();
2715  }
2716  const crs_graph_type& graph = * (this->staticGraph_);
2717  const RowInfo rowInfo = graph.getRowInfoFromGlobalRowIndex (gblRow);
2718 
2719  if (rowInfo.localRow == OrdinalTraits<size_t>::invalid ()) {
2720  // The calling process does not own this row, so it is not
2721  // allowed to modify its values.
2722  return static_cast<LO> (0);
2723  }
2724  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
2725  return this->transformGlobalValues (curRowVals.data (), graph,
2726  rowInfo, inputCols, inputVals,
2727  numInputEnt, f, atomic);
2728  }
2729 
2730  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2731  LocalOrdinal
2732  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2733  transformLocalValues (impl_scalar_type rowVals[],
2734  const crs_graph_type& graph,
2735  const RowInfo& rowInfo,
2736  const LocalOrdinal inds[],
2737  const impl_scalar_type newVals[],
2738  const LocalOrdinal numElts,
2739  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2740  const bool atomic)
2741  {
2742  typedef impl_scalar_type ST;
2743  typedef LocalOrdinal LO;
2744  typedef GlobalOrdinal GO;
2745 
2746  //if (newVals.extent (0) != inds.extent (0)) {
2747  // The sizes of the input arrays must match.
2748  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2749  //}
2750  //const LO numElts = static_cast<LO> (inds.extent (0));
2751  const bool sorted = graph.isSorted ();
2752 
2753  LO numValid = 0; // number of valid input column indices
2754  size_t hint = 0; // Guess for the current index k into rowVals
2755 
2756  if (graph.isLocallyIndexed ()) {
2757  // Get a view of the column indices in the row. This amortizes
2758  // the cost of getting the view over all the entries of inds.
2759  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2760 
2761  for (LO j = 0; j < numElts; ++j) {
2762  const LO lclColInd = inds[j];
2763  const size_t offset =
2764  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2765  lclColInd, hint, sorted);
2766  if (offset != rowInfo.numEntries) {
2767  if (atomic) {
2768  // NOTE (mfh 30 Nov 2015) The commented-out code is
2769  // wrong because another thread may have changed
2770  // rowVals[offset] between those two lines of code.
2771  //
2772  //const ST newVal = f (rowVals[offset], newVals[j]);
2773  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2774 
2775  volatile ST* const dest = &rowVals[offset];
2776  (void) atomic_binary_function_update (dest, newVals[j], f);
2777  }
2778  else {
2779  // use binary function f
2780  rowVals[offset] = f (rowVals[offset], newVals[j]);
2781  }
2782  hint = offset + 1;
2783  ++numValid;
2784  }
2785  }
2786  }
2787  else if (graph.isGloballyIndexed ()) {
2788  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2789  // pointer does NOT change its reference count. Thus, this
2790  // code is still thread safe.
2791  if (graph.colMap_.is_null ()) {
2792  // NO input column indices are valid in this case. Either
2793  // the column Map hasn't been set yet (so local indices
2794  // don't exist yet), or the calling process owns no graph
2795  // entries.
2796  return numValid;
2797  }
2798  const map_type& colMap = * (graph.colMap_);
2799  // Get a view of the column indices in the row. This amortizes
2800  // the cost of getting the view over all the entries of inds.
2801  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2802 
2803  const GO GINV = Teuchos::OrdinalTraits<GO>::invalid ();
2804  for (LO j = 0; j < numElts; ++j) {
2805  const GO gblColInd = colMap.getGlobalElement (inds[j]);
2806  if (gblColInd != GINV) {
2807  const size_t offset =
2808  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2809  gblColInd, hint, sorted);
2810  if (offset != rowInfo.numEntries) {
2811  if (atomic) {
2812  // NOTE (mfh 30 Nov 2015) The commented-out code is
2813  // wrong because another thread may have changed
2814  // rowVals[offset] between those two lines of code.
2815  //
2816  //const ST newVal = f (rowVals[offset], newVals[j]);
2817  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2818 
2819  volatile ST* const dest = &rowVals[offset];
2820  (void) atomic_binary_function_update (dest, newVals[j], f);
2821  }
2822  else {
2823  // use binary function f
2824  rowVals[offset] = f (rowVals[offset], newVals[j]);
2825  }
2826  hint = offset + 1;
2827  numValid++;
2828  }
2829  }
2830  }
2831  }
2832  // If the graph is neither locally nor globally indexed on the
2833  // calling process, that means the calling process has no graph
2834  // entries. Thus, none of the input column indices are valid.
2835 
2836  return numValid;
2837  }
2838 
2839  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2840  LocalOrdinal
2841  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
2842  transformGlobalValues (impl_scalar_type rowVals[],
2843  const crs_graph_type& graph,
2844  const RowInfo& rowInfo,
2845  const GlobalOrdinal inds[],
2846  const impl_scalar_type newVals[],
2847  const LocalOrdinal numElts,
2848  std::function<impl_scalar_type (const impl_scalar_type&, const impl_scalar_type&) > f,
2849  const bool atomic)
2850  {
2851  typedef impl_scalar_type ST;
2852  typedef LocalOrdinal LO;
2853  typedef GlobalOrdinal GO;
2854 
2855  //if (newVals.extent (0) != inds.extent (0)) {
2856  // The sizes of the input arrays must match.
2857  //return Tpetra::Details::OrdinalTraits<LO>::invalid ();
2858  //}
2859  //const LO numElts = static_cast<LO> (inds.extent (0));
2860  const bool sorted = graph.isSorted ();
2861 
2862  LO numValid = 0; // number of valid input column indices
2863  size_t hint = 0; // Guess for the current index k into rowVals
2864 
2865  if (graph.isGloballyIndexed ()) {
2866  // Get a view of the column indices in the row. This amortizes
2867  // the cost of getting the view over all the entries of inds.
2868  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2869 
2870  for (LO j = 0; j < numElts; ++j) {
2871  const GO gblColInd = inds[j];
2872  const size_t offset =
2873  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2874  gblColInd, hint, sorted);
2875  if (offset != rowInfo.numEntries) {
2876  if (atomic) {
2877  // NOTE (mfh 30 Nov 2015) The commented-out code is
2878  // wrong because another thread may have changed
2879  // rowVals[offset] between those two lines of code.
2880  //
2881  //const ST newVal = f (rowVals[offset], newVals[j]);
2882  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2883 
2884  volatile ST* const dest = &rowVals[offset];
2885  (void) atomic_binary_function_update (dest, newVals[j], f);
2886  }
2887  else {
2888  // use binary function f
2889  rowVals[offset] = f (rowVals[offset], newVals[j]);
2890  }
2891  hint = offset + 1;
2892  ++numValid;
2893  }
2894  }
2895  }
2896  else if (graph.isLocallyIndexed ()) {
2897  // NOTE (mfh 26 Nov 2015) Dereferencing an RCP or reading its
2898  // pointer does NOT change its reference count. Thus, this
2899  // code is still thread safe.
2900  if (graph.colMap_.is_null ()) {
2901  // NO input column indices are valid in this case. Either the
2902  // column Map hasn't been set yet (so local indices don't
2903  // exist yet), or the calling process owns no graph entries.
2904  return numValid;
2905  }
2906  const map_type& colMap = * (graph.colMap_);
2907  // Get a view of the column indices in the row. This amortizes
2908  // the cost of getting the view over all the entries of inds.
2909  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2910 
2911  const LO LINV = Teuchos::OrdinalTraits<LO>::invalid ();
2912  for (LO j = 0; j < numElts; ++j) {
2913  const LO lclColInd = colMap.getLocalElement (inds[j]);
2914  if (lclColInd != LINV) {
2915  const size_t offset =
2916  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2917  lclColInd, hint, sorted);
2918  if (offset != rowInfo.numEntries) {
2919  if (atomic) {
2920  // NOTE (mfh 30 Nov 2015) The commented-out code is
2921  // wrong because another thread may have changed
2922  // rowVals[offset] between those two lines of code.
2923  //
2924  //const ST newVal = f (rowVals[offset], newVals[j]);
2925  //Kokkos::atomic_assign (&rowVals[offset], newVal);
2926 
2927  volatile ST* const dest = &rowVals[offset];
2928  (void) atomic_binary_function_update (dest, newVals[j], f);
2929  }
2930  else {
2931  // use binary function f
2932  rowVals[offset] = f (rowVals[offset], newVals[j]);
2933  }
2934  hint = offset + 1;
2935  numValid++;
2936  }
2937  }
2938  }
2939  }
2940  // If the graph is neither locally nor globally indexed on the
2941  // calling process, that means the calling process has no graph
2942  // entries. Thus, none of the input column indices are valid.
2943 
2944  return numValid;
2945  }
2946 
2947  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
2948  LocalOrdinal
2951  const crs_graph_type& graph,
2952  const RowInfo& rowInfo,
2953  const LocalOrdinal inds[],
2954  const impl_scalar_type newVals[],
2955  const LocalOrdinal numElts,
2956  const bool atomic)
2957  {
2958  typedef LocalOrdinal LO;
2959  typedef GlobalOrdinal GO;
2960 
2961  const bool sorted = graph.isSorted ();
2962 
2963  size_t hint = 0; // Guess for the current index k into rowVals
2964  LO numValid = 0; // number of valid local column indices
2965 
2966  if (graph.isLocallyIndexed ()) {
2967  // Get a view of the column indices in the row. This amortizes
2968  // the cost of getting the view over all the entries of inds.
2969  auto colInds = graph.getLocalIndsViewHost (rowInfo);
2970 
2971  for (LO j = 0; j < numElts; ++j) {
2972  const LO lclColInd = inds[j];
2973  const size_t offset =
2974  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
2975  lclColInd, hint, sorted);
2976  if (offset != rowInfo.numEntries) {
2977  if (atomic) {
2978  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
2979  }
2980  else {
2981  rowVals[offset] += newVals[j];
2982  }
2983  hint = offset + 1;
2984  ++numValid;
2985  }
2986  }
2987  }
2988  else if (graph.isGloballyIndexed ()) {
2989  if (graph.colMap_.is_null ()) {
2990  return Teuchos::OrdinalTraits<LO>::invalid ();
2991  }
2992  const map_type colMap = * (graph.colMap_);
2993 
2994  // Get a view of the column indices in the row. This amortizes
2995  // the cost of getting the view over all the entries of inds.
2996  auto colInds = graph.getGlobalIndsViewHost (rowInfo);
2997 
2998  for (LO j = 0; j < numElts; ++j) {
2999  const GO gblColInd = colMap.getGlobalElement (inds[j]);
3000  if (gblColInd != Teuchos::OrdinalTraits<GO>::invalid ()) {
3001  const size_t offset =
3002  KokkosSparse::findRelOffset (colInds, rowInfo.numEntries,
3003  gblColInd, hint, sorted);
3004  if (offset != rowInfo.numEntries) {
3005  if (atomic) {
3006  Kokkos::atomic_add (&rowVals[offset], newVals[j]);
3007  }
3008  else {
3009  rowVals[offset] += newVals[j];
3010  }
3011  hint = offset + 1;
3012  ++numValid;
3013  }
3014  }
3015  }
3016  }
3017  // NOTE (mfh 26 Jun 2014, 26 Nov 2015) In the current version of
3018  // CrsGraph and CrsMatrix, it's possible for a matrix (or graph)
3019  // to be neither locally nor globally indexed on a process.
3020  // This means that the graph or matrix has no entries on that
3021  // process. Epetra also works like this. It's related to lazy
3022  // allocation (on first insertion, not at graph / matrix
3023  // construction). Lazy allocation will go away because it is
3024  // not thread scalable.
3025 
3026  return numValid;
3027  }
3028 
3029  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3030  LocalOrdinal
3032  sumIntoLocalValues (const LocalOrdinal localRow,
3033  const Teuchos::ArrayView<const LocalOrdinal>& indices,
3034  const Teuchos::ArrayView<const Scalar>& values,
3035  const bool atomic)
3036  {
3037  using LO = local_ordinal_type;
3038  const LO numInputEnt = static_cast<LO>(indices.size());
3039  if (static_cast<LO>(values.size()) != numInputEnt) {
3040  return Teuchos::OrdinalTraits<LO>::invalid();
3041  }
3042  const LO* const inputInds = indices.getRawPtr();
3043  const scalar_type* const inputVals = values.getRawPtr();
3044  return this->sumIntoLocalValues(localRow, numInputEnt,
3045  inputVals, inputInds, atomic);
3046  }
3047 
// Kokkos::View overload: verifies that inputInds and inputVals have the same
// extent, then forwards raw pointers (and the atomic flag) to
// this->sumIntoLocalValues.
//
// Returns Teuchos::OrdinalTraits<local_ordinal_type>::invalid() when the two
// extents differ; otherwise whatever the forwarded call returns.
//
// NOTE(review): the listing lines that carry the class qualifier and the
// function name are not visible here; presumably this is the Kokkos::View
// overload of sumIntoLocalValues -- confirm against the class declaration.
3048  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3050  local_ordinal_type
3053  const local_ordinal_type localRow,
3054  const Kokkos::View<const local_ordinal_type*, Kokkos::AnonymousSpace>& inputInds,
3055  const Kokkos::View<const impl_scalar_type*, Kokkos::AnonymousSpace>& inputVals,
3056  const bool atomic)
3057  {
3058  using LO = local_ordinal_type;
3059  const LO numInputEnt = static_cast<LO>(inputInds.extent(0));
3060  if (static_cast<LO>(inputVals.extent(0)) != numInputEnt) {
  // Mismatched input lengths: report failure without touching the matrix.
3061  return Teuchos::OrdinalTraits<LO>::invalid();
3062  }
  // The forwarded overload takes scalar_type, while the view stores
  // impl_scalar_type, hence the pointer cast.
3063  const scalar_type* inVals =
3064  reinterpret_cast<const scalar_type*>(inputVals.data());
3065  return this->sumIntoLocalValues(localRow, numInputEnt, inVals,
3066  inputInds.data(), atomic);
3067  }
3068 
3069  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3070  LocalOrdinal
3072  sumIntoLocalValues (const LocalOrdinal localRow,
3073  const LocalOrdinal numEnt,
3074  const Scalar vals[],
3075  const LocalOrdinal cols[],
3076  const bool atomic)
3077  {
3078  typedef impl_scalar_type IST;
3079  typedef LocalOrdinal LO;
3080 
3081  if (! this->isFillActive () || this->staticGraph_.is_null ()) {
3082  // Fill must be active and the "nonconst" graph must exist.
3083  return Teuchos::OrdinalTraits<LO>::invalid ();
3084  }
3085  const crs_graph_type& graph = * (this->staticGraph_);
3086  const RowInfo rowInfo = graph.getRowInfo (localRow);
3087 
3088  if (rowInfo.localRow == Teuchos::OrdinalTraits<size_t>::invalid ()) {
3089  // The calling process does not own this row, so it is not
3090  // allowed to modify its values.
3091  return static_cast<LO> (0);
3092  }
3093  auto curRowVals = this->getValuesViewHostNonConst (rowInfo);
3094  const IST* const inputVals = reinterpret_cast<const IST*> (vals);
3095  return this->sumIntoLocalValuesImpl (curRowVals.data (), graph, rowInfo,
3096  cols, inputVals, numEnt, atomic);
3097  }
3098 
3099  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3101  values_dualv_type::t_host::const_type
3103  getValuesViewHost (const RowInfo& rowinfo) const
3104  {
3105  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3106  return typename values_dualv_type::t_host::const_type ();
3107  else
3108  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3109  rowinfo.allocSize,
3110  Access::ReadOnly);
3111  }
3112 
// Writable host view of the values stored for one row: an empty view if the
// row has no allocation or valuesUnpacked_wdv has extent 0, otherwise the
// host subview [offset1D, offset1D + allocSize) requested with
// Access::ReadWrite.
//
// NOTE(review): the declarator lines (class qualifier, function name,
// parameter list, cv-qualifier) are not visible in this listing; callers in
// this file use the name getValuesViewHostNonConst with a RowInfo argument,
// which presumably refers to this definition -- confirm against the class
// declaration.
3113  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3115  values_dualv_type::t_host
3118  {
3119  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3120  return typename values_dualv_type::t_host ();
3121  else
3122  return valuesUnpacked_wdv.getHostSubview(rowinfo.offset1D,
3123  rowinfo.allocSize,
3124  Access::ReadWrite);
3125  }
3126 
3127  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3129  values_dualv_type::t_dev::const_type
3131  getValuesViewDevice (const RowInfo& rowinfo) const
3132  {
3133  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3134  return typename values_dualv_type::t_dev::const_type ();
3135  else
3136  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3137  rowinfo.allocSize,
3138  Access::ReadOnly);
3139  }
3140 
// Writable device view of the values stored for one row: an empty view if
// the row has no allocation or valuesUnpacked_wdv has extent 0, otherwise
// the device subview [offset1D, offset1D + allocSize) requested with
// Access::ReadWrite.
//
// NOTE(review): the declarator lines (class qualifier, function name,
// parameter list, cv-qualifier) are not visible in this listing; by analogy
// with the host variants this is presumably the non-const device getter
// taking a RowInfo named rowinfo -- confirm against the class declaration.
3141  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3143  values_dualv_type::t_dev
3146  {
3147  if (rowinfo.allocSize == 0 || valuesUnpacked_wdv.extent(0) == 0)
3148  return typename values_dualv_type::t_dev ();
3149  else
3150  return valuesUnpacked_wdv.getDeviceSubview(rowinfo.offset1D,
3151  rowinfo.allocSize,
3152  Access::ReadWrite);
3153  }
3154 
3155 
// Copy the column indices (as local indices) and values of one local row
// into caller-provided views.
//
// indices     [out] Receives the row's local column indices.
// values      [out] Receives the row's values.
// numEntries  [out] Set to the row's entry count (rowinfo.numEntries).
//
// Error checks (std::runtime_error, raised through
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC): the matrix has no column Map, or
// either output view is shorter than the row's entry count.
//
// NOTE(review): the declarator lines (class qualifier, function name, first
// parameter) are not visible in this listing; the name is taken from the
// tfecfFuncName literal below and the body reads a row index named localRow.
3156  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3157  void
3160  nonconst_local_inds_host_view_type &indices,
3161  nonconst_values_host_view_type &values,
3162  size_t& numEntries) const
3163  {
  // NOTE(review): neither ArrayView nor av_reinterpret_cast is referenced in
  // the visible body; these using-declarations look unused.
3164  using Teuchos::ArrayView;
3165  using Teuchos::av_reinterpret_cast;
3166  const char tfecfFuncName[] = "getLocalRowCopy: ";
3167 
3168  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3169  (! this->hasColMap (), std::runtime_error,
3170  "The matrix does not have a column Map yet. This means we don't have "
3171  "local indices for columns yet, so it doesn't make sense to call this "
3172  "method. If the matrix doesn't have a column Map yet, you should call "
3173  "fillComplete on it first.");
3174 
3175  const RowInfo rowinfo = staticGraph_->getRowInfo (localRow);
3176  const size_t theNumEntries = rowinfo.numEntries;
  // The output views must be able to hold every entry of the row.
3177  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3178  (static_cast<size_t> (indices.size ()) < theNumEntries ||
3179  static_cast<size_t> (values.size ()) < theNumEntries,
3180  std::runtime_error, "Row with local index " << localRow << " has " <<
3181  theNumEntries << " entry/ies, but indices.size() = " <<
3182  indices.size () << " and values.size() = " << values.size () << ".");
3183  numEntries = theNumEntries; // first side effect
3184 
  // Nothing is copied when the graph reports an invalid local row.
3185  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3186  if (staticGraph_->isLocallyIndexed ()) {
  // Indices are already local: copy indices and values straight across.
3187  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3188  auto curVals = getValuesViewHost(rowinfo);
3189 
3190  for (size_t j = 0; j < theNumEntries; ++j) {
3191  values[j] = curVals[j];
3192  indices[j] = curLclInds(j);
3193  }
3194  }
3195  else if (staticGraph_->isGloballyIndexed ()) {
  // Indices are stored as global: translate each one through the column Map.
3196  // Don't call getColMap(), because it touches RCP's reference count.
3197  const map_type& colMap = * (staticGraph_->colMap_);
3198  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3199  auto curVals = getValuesViewHost(rowinfo);
3200 
3201  for (size_t j = 0; j < theNumEntries; ++j) {
3202  values[j] = curVals[j];
3203  indices[j] = colMap.getLocalElement (curGblInds(j));
3204  }
3205  }
3206  }
3207  }
3208 
3209 
// Copy the column indices (as global indices) and values of one row, given
// by global row index, into caller-provided views.
//
// indices     [out] Receives the row's global column indices.
// values      [out] Receives the row's values.
// numEntries  [out] Set to the row's entry count (rowinfo.numEntries).
//
// Error check (std::runtime_error, raised through
// TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC): either output view is shorter than
// the row's entry count.
//
// NOTE(review): the declarator lines (class qualifier, function name, first
// parameter) are not visible in this listing; the name is taken from the
// tfecfFuncName literal below and the body reads a row index named globalRow.
3210 template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3211 void
3214  nonconst_global_inds_host_view_type &indices,
3215  nonconst_values_host_view_type &values,
3216  size_t& numEntries) const
3217  {
  // NOTE(review): neither ArrayView nor av_reinterpret_cast is referenced in
  // the visible body; these using-declarations look unused.
3218  using Teuchos::ArrayView;
3219  using Teuchos::av_reinterpret_cast;
3220  const char tfecfFuncName[] = "getGlobalRowCopy: ";
3221 
3222  const RowInfo rowinfo =
3223  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3224  const size_t theNumEntries = rowinfo.numEntries;
  // The output views must be able to hold every entry of the row.
3225  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3226  static_cast<size_t> (indices.size ()) < theNumEntries ||
3227  static_cast<size_t> (values.size ()) < theNumEntries,
3228  std::runtime_error, "Row with global index " << globalRow << " has "
3229  << theNumEntries << " entry/ies, but indices.size() = " <<
3230  indices.size () << " and values.size() = " << values.size () << ".");
3231  numEntries = theNumEntries; // first side effect
3232 
  // Nothing is copied when the graph reports an invalid local row.
3233  if (rowinfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid ()) {
3234  if (staticGraph_->isLocallyIndexed ()) {
  // Indices are stored as local: translate each one through the column Map.
3235  const map_type& colMap = * (staticGraph_->colMap_);
3236  auto curLclInds = staticGraph_->getLocalIndsViewHost(rowinfo);
3237  auto curVals = getValuesViewHost(rowinfo);
3238 
3239  for (size_t j = 0; j < theNumEntries; ++j) {
3240  values[j] = curVals[j];
3241  indices[j] = colMap.getGlobalElement (curLclInds(j));
3242  }
3243  }
3244  else if (staticGraph_->isGloballyIndexed ()) {
  // Indices are already global: copy indices and values straight across.
3245  auto curGblInds = staticGraph_->getGlobalIndsViewHost(rowinfo);
3246  auto curVals = getValuesViewHost(rowinfo);
3247 
3248  for (size_t j = 0; j < theNumEntries; ++j) {
3249  values[j] = curVals[j];
3250  indices[j] = curGblInds(j);
3251  }
3252  }
3253  }
3254  }
3255 
3256 
// Get a read-only host view of the local column indices and values
// stored in local row `localRow`.
//
// Throws std::runtime_error if the matrix is globally indexed.
//
// On return, `indices` and `values` are host subviews of length
// rowInfo.numEntries starting at rowInfo.offset1D, taken from the
// graph's lclIndsUnpacked_wdv and this matrix's valuesUnpacked_wdv.
// If the graph reports an invalid local row, or the row has no entries,
// both outputs are set to default-constructed (empty) views.
3257  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3258  void
3260  getLocalRowView(LocalOrdinal localRow,
3261  local_inds_host_view_type &indices,
3262  values_host_view_type &values) const
3263  {
3264  const char tfecfFuncName[] = "getLocalRowView: ";
3265
3266  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3267  isGloballyIndexed (), std::runtime_error, "The matrix currently stores "
3268  "its indices as global indices, so you cannot get a view with local "
3269  "column indices. If the matrix has a column Map, you may call "
3270  "getLocalRowCopy() to get local column indices; otherwise, you may get "
3271  "a view with global column indices by calling getGlobalRowCopy().");
3272
      // Indices and values share the same offset and length within their
      // respective unpacked arrays.
3273  const RowInfo rowInfo = staticGraph_->getRowInfo (localRow);
3274  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3275  rowInfo.numEntries > 0) {
3276  indices = staticGraph_->lclIndsUnpacked_wdv.getHostSubview(
3277  rowInfo.offset1D,
3278  rowInfo.numEntries,
3279  Access::ReadOnly);
3280  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3281  rowInfo.numEntries,
3282  Access::ReadOnly);
3283  }
3284  else {
3285  // This does the right thing (reports an empty row) if the input
3286  // row is invalid.
3287  indices = local_inds_host_view_type();
3288  values = values_host_view_type();
3289  }
3290
      // Debug-only consistency checks: both views must have the same
      // length, that length must equal rowInfo.numEntries, and
      // rowInfo.numEntries must equal getNumEntriesInLocalRow(localRow).
3291 #ifdef HAVE_TPETRA_DEBUG
3292  const char suffix[] = ". This should never happen. Please report this "
3293  "bug to the Tpetra developers.";
3294  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3295  (static_cast<size_t> (indices.size ()) !=
3296  static_cast<size_t> (values.size ()), std::logic_error,
3297  "At the end of this method, for local row " << localRow << ", "
3298  "indices.size() = " << indices.size () << " != values.size () = "
3299  << values.size () << suffix);
3300  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3301  (static_cast<size_t> (indices.size ()) !=
3302  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3303  "At the end of this method, for local row " << localRow << ", "
3304  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3305  << rowInfo.numEntries << suffix);
3306  const size_t expectedNumEntries = getNumEntriesInLocalRow (localRow);
3307  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3308  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3309  "of this method, for local row " << localRow << ", rowInfo.numEntries = "
3310  << rowInfo.numEntries << " != getNumEntriesInLocalRow(localRow) = " <<
3311  expectedNumEntries << suffix);
3312 #endif // HAVE_TPETRA_DEBUG
3313  }
3314 
3315 
// Get a read-only host view of the global column indices and values
// stored in global row `globalRow`.
//
// Throws std::runtime_error if the matrix is locally indexed.
//
// On return, `indices` and `values` are host subviews of length
// rowInfo.numEntries starting at rowInfo.offset1D, taken from the
// graph's gblInds_wdv and this matrix's valuesUnpacked_wdv.  If the
// graph reports an invalid local row for `globalRow`, or the row has no
// entries, both outputs are set to default-constructed (empty) views.
3316  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3317  void
3319  getGlobalRowView (GlobalOrdinal globalRow,
3320  global_inds_host_view_type &indices,
3321  values_host_view_type &values) const
3322  {
3323  const char tfecfFuncName[] = "getGlobalRowView: ";
3324
3325  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3326  isLocallyIndexed (), std::runtime_error,
3327  "The matrix is locally indexed, so we cannot return a view of the row "
3328  "with global column indices. Use getGlobalRowCopy() instead.");
3329
3330  // This does the right thing (reports an empty row) if the input
3331  // row is invalid.
3332  const RowInfo rowInfo =
3333  staticGraph_->getRowInfoFromGlobalRowIndex (globalRow);
3334  if (rowInfo.localRow != Teuchos::OrdinalTraits<size_t>::invalid () &&
3335  rowInfo.numEntries > 0) {
3336  indices = staticGraph_->gblInds_wdv.getHostSubview(rowInfo.offset1D,
3337  rowInfo.numEntries,
3338  Access::ReadOnly);
3339  values = valuesUnpacked_wdv.getHostSubview(rowInfo.offset1D,
3340  rowInfo.numEntries,
3341  Access::ReadOnly);
3342  }
3343  else {
3344  indices = global_inds_host_view_type();
3345  values = values_host_view_type();
3346  }
3347
      // Debug-only consistency checks: both views must have the same
      // length, that length must equal rowInfo.numEntries, and
      // rowInfo.numEntries must equal getNumEntriesInGlobalRow(globalRow).
3348 #ifdef HAVE_TPETRA_DEBUG
3349  const char suffix[] = ". This should never happen. Please report this "
3350  "bug to the Tpetra developers.";
3351  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3352  (static_cast<size_t> (indices.size ()) !=
3353  static_cast<size_t> (values.size ()), std::logic_error,
3354  "At the end of this method, for global row " << globalRow << ", "
3355  "indices.size() = " << indices.size () << " != values.size () = "
3356  << values.size () << suffix);
3357  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3358  (static_cast<size_t> (indices.size ()) !=
3359  static_cast<size_t> (rowInfo.numEntries), std::logic_error,
3360  "At the end of this method, for global row " << globalRow << ", "
3361  "indices.size() = " << indices.size () << " != rowInfo.numEntries = "
3362  << rowInfo.numEntries << suffix);
3363  const size_t expectedNumEntries = getNumEntriesInGlobalRow (globalRow);
3364  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3365  (rowInfo.numEntries != expectedNumEntries, std::logic_error, "At the end "
3366  "of this method, for global row " << globalRow << ", rowInfo.numEntries "
3367  "= " << rowInfo.numEntries << " != getNumEntriesInGlobalRow(globalRow) ="
3368  " " << expectedNumEntries << suffix);
3369 #endif // HAVE_TPETRA_DEBUG
3370  }
3371 
3372 
// Scale the matrix's stored values in place by `alpha`.
//
// `alpha` is converted to impl_scalar_type and applied to the device
// view of valuesPacked_wdv with KokkosBlas::scal (input and output are
// the same view).  Nothing is done if the graph's indices are not
// allocated, or if this process has no local rows or no local entries.
3373  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3374  void
3376  scale (const Scalar& alpha)
3377  {
3378  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3379
3380  const size_t nlrs = staticGraph_->getLocalNumRows ();
3381  const size_t numEntries = staticGraph_->getLocalNumEntries ();
3382  if (! staticGraph_->indicesAreAllocated () ||
3383  nlrs == 0 || numEntries == 0) {
3384  // do nothing
3385  }
3386  else {
3387
      // ReadWrite access: the same view is both the input and the output
      // of the scaling.
3388  auto vals = valuesPacked_wdv.getDeviceView(Access::ReadWrite);
3389  KokkosBlas::scal(vals, theAlpha, vals);
3390
3391  }
3392  }
3393 
// Set every allocated value of the matrix to `alpha`.
//
// `alpha` is converted to impl_scalar_type and written into the whole
// device view of valuesUnpacked_wdv with Kokkos::deep_copy, followed by
// a global Kokkos::fence.  Nothing is done if the graph's indices are
// not allocated or if this process has no local entries.
3394  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3395  void
3397  setAllToScalar (const Scalar& alpha)
3398  {
3399  const impl_scalar_type theAlpha = static_cast<impl_scalar_type> (alpha);
3400
3401  // Replace all values in the matrix.
3402  // It is easiest to replace all allocated values, instead of replacing only the ones with valid entries.
3403  // However, if there are no valid entries, we can short-circuit.
3404  // Furthermore, if the values aren't allocated, we can short-circuit (no entries have been inserted so far).
3405  const size_t numEntries = staticGraph_->getLocalNumEntries();
3406  if (! staticGraph_->indicesAreAllocated () || numEntries == 0) {
3407  // do nothing
3408  }
3409  else {
3410  // DEEP_COPY REVIEW - VALUE-TO-DEVICE
3411  Kokkos::deep_copy (execution_space(), valuesUnpacked_wdv.getDeviceView(Access::OverwriteAll),
3412  theAlpha);
3413  // CAG: This fence was found to be required on Cuda with UVM=on.
3414  Kokkos::fence("CrsMatrix::setAllToScalar");
3415  }
3416  }
3417 
// Set the matrix's graph structure and values from three device Views
// (row offsets, column indices, values).
//
// Throws std::invalid_argument if columnIndices and values differ in
// size, std::runtime_error if myGraph_ is null or if
// myGraph_->setAllIndices() throws a std::exception, and
// std::logic_error if the resulting local graph's dimensions do not
// match the inputs.
//
// On success, `values` is wrapped as the packed values, the unpacked
// values are assigned from the packed ones, the storage status is set
// to Details::STORAGE_1D_PACKED, and checkInternalState() is called.
3418  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3419  void
3421  setAllValues (const typename local_graph_device_type::row_map_type& rowPointers,
3422  const typename local_graph_device_type::entries_type::non_const_type& columnIndices,
3423  const typename local_matrix_device_type::values_type& values)
3424  {
3425  using ProfilingRegion=Details::ProfilingRegion;
3426  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues");
3427  const char tfecfFuncName[] = "setAllValues: ";
3428  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3429  (columnIndices.size () != values.size (), std::invalid_argument,
3430  "columnIndices.size() = " << columnIndices.size () << " != values.size()"
3431  " = " << values.size () << ".");
3432  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3433  (myGraph_.is_null (), std::runtime_error, "myGraph_ must not be null.");
3434
      // Hand the structure to the graph.  Any std::exception it throws is
      // rethrown as std::runtime_error carrying the original what() text.
3435  try {
3436  myGraph_->setAllIndices (rowPointers, columnIndices);
3437  }
3438  catch (std::exception &e) {
3439  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3440  (true, std::runtime_error, "myGraph_->setAllIndices() threw an "
3441  "exception: " << e.what ());
3442  }
3443
3444  // Make sure that myGraph_ now has a local graph. It may not be
3445  // fillComplete yet, so it's important to check. We don't care
3446  // whether setAllIndices() did a shallow copy or a deep copy, so a
3447  // good way to check is to compare dimensions.
3448  auto lclGraph = myGraph_->getLocalGraphDevice ();
3449  const size_t numEnt = lclGraph.entries.extent (0);
3450  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3451  (lclGraph.row_map.extent (0) != rowPointers.extent (0) ||
3452  numEnt != static_cast<size_t> (columnIndices.extent (0)),
3453  std::logic_error, "myGraph_->setAllIndices() did not correctly create "
3454  "local graph. Please report this bug to the Tpetra developers.");
3455
      // Packed and unpacked values are set from the same input View.
3456  valuesPacked_wdv = values_wdv_type(values);
3457  valuesUnpacked_wdv = valuesPacked_wdv;
3458
3459  // Storage MUST be packed, since the interface doesn't give any
3460  // way to indicate any extra space at the end of each row.
3461  this->storageStatus_ = Details::STORAGE_1D_PACKED;
3462
3463  checkInternalState ();
3464  }
3465 
// Set the matrix's graph structure and values from a local device
// matrix.
//
// Extracts graph.row_map, graph.entries and values from
// `localDeviceMatrix` and forwards them to the three-argument
// setAllValues() overload; all validation happens there.
3466  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3467  void
3469  setAllValues ( const local_matrix_device_type& localDeviceMatrix)
3470  {
3471  using ProfilingRegion=Details::ProfilingRegion;
3472  ProfilingRegion region ("Tpetra::CrsMatrix::setAllValues from KokkosSparse::CrsMatrix");
3473
3474  auto graph = localDeviceMatrix.graph;
3475  //FIXME how to check whether graph is allocated
3476
3477  auto rows = graph.row_map;
3478  auto columns = graph.entries;
3479  auto values = localDeviceMatrix.values;
3480
3481  setAllValues(rows,columns,values);
3482  }
3483 
// Set the matrix's graph structure and values from Teuchos::ArrayRCP
// inputs (row offsets `ptr`, local column indices `ind`, values `val`).
//
// The row offsets are copied into a newly allocated View of the local
// graph's row-offset type; the indices and values are converted with
// getKokkosViewDeepCopy (presumably a deep copy into device_type
// memory, per the helper's name).  The results are forwarded to the
// Kokkos::View overload of setAllValues(), which does the validation.
3484  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3485  void
3487  setAllValues (const Teuchos::ArrayRCP<size_t>& ptr,
3488  const Teuchos::ArrayRCP<LocalOrdinal>& ind,
3489  const Teuchos::ArrayRCP<Scalar>& val)
3490  {
3491  using Kokkos::Compat::getKokkosViewDeepCopy;
3492  using Teuchos::ArrayRCP;
3493  using Teuchos::av_reinterpret_cast;
3494  typedef device_type DT;
3495  typedef impl_scalar_type IST;
3496  typedef typename local_graph_device_type::row_map_type row_map_type;
3497  //typedef typename row_map_type::non_const_value_type row_offset_type;
3498  const char tfecfFuncName[] = "setAllValues(ArrayRCP<size_t>, ArrayRCP<LO>, ArrayRCP<Scalar>): ";
3499
3500  // The row offset type may depend on the execution space. It may
3501  // not necessarily be size_t. If it's not, we need to make a deep
3502  // copy. We need to make a deep copy anyway so that Kokkos can
3503  // own the memory. Regardless, ptrNative gets the copy.
3504  typename row_map_type::non_const_type ptrNative ("ptr", ptr.size ());
      // Unmanaged host View wrapping the caller's size_t offsets; it does
      // not own the memory.
3505  Kokkos::View<const size_t*,
3506  typename row_map_type::array_layout,
3507  Kokkos::HostSpace,
3508  Kokkos::MemoryUnmanaged> ptrSizeT (ptr.getRawPtr (), ptr.size ());
3509  ::Tpetra::Details::copyOffsets (ptrNative, ptrSizeT);
3510
3511  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3512  (ptrNative.extent (0) != ptrSizeT.extent (0),
3513  std::logic_error, "ptrNative.extent(0) = " <<
3514  ptrNative.extent (0) << " != ptrSizeT.extent(0) = "
3515  << ptrSizeT.extent (0) << ". Please report this bug to the "
3516  "Tpetra developers.");
3517
      // The values are reinterpreted from Scalar to impl_scalar_type
      // before being copied.
3518  auto indIn = getKokkosViewDeepCopy<DT> (ind ());
3519  auto valIn = getKokkosViewDeepCopy<DT> (av_reinterpret_cast<IST> (val ()));
3520  this->setAllValues (ptrNative, indIn, valIn);
3521  }
3522 
// Fill `offsets` with the graph's local diagonal offsets, one per local
// row.
//
// Throws std::runtime_error if the matrix has no graph.  `offsets` is
// resized to the local number of rows if it is shorter than that; it is
// never shrunk.  The actual computation is delegated to
// staticGraph_->getLocalDiagOffsets().
3523  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3524  void
3526  getLocalDiagOffsets (Teuchos::ArrayRCP<size_t>& offsets) const
3527  {
3528  const char tfecfFuncName[] = "getLocalDiagOffsets: ";
3529  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3530  (staticGraph_.is_null (), std::runtime_error, "The matrix has no graph.");
3531
3532  // mfh 11 May 2016: We plan to deprecate the ArrayRCP version of
3533  // this method in CrsGraph too, so don't call it (otherwise build
3534  // warnings will show up and annoy users). Instead, copy results
3535  // in and out, if the memory space requires it.
3536
3537  const size_t lclNumRows = staticGraph_->getLocalNumRows ();
3538  if (static_cast<size_t> (offsets.size ()) < lclNumRows) {
3539  offsets.resize (lclNumRows);
3540  }
3541
3542  // The input ArrayRCP must always be a host pointer. Thus, if
3543  // device_type::memory_space is Kokkos::HostSpace, it's OK for us
3544  // to write to that allocation directly as a Kokkos::View.
3545  if (std::is_same<memory_space, Kokkos::HostSpace>::value) {
3546  // It is always syntactically correct to assign a raw host
3547  // pointer to a device View, so this code will compile correctly
3548  // even if this branch never runs.
3549  typedef Kokkos::View<size_t*, device_type,
3550  Kokkos::MemoryUnmanaged> output_type;
3551  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3552  staticGraph_->getLocalDiagOffsets (offsetsOut);
3553  }
3554  else {
      // Compute into a temporary device View, then copy the result into
      // an unmanaged host View that wraps the caller's array.
3555  Kokkos::View<size_t*, device_type> offsetsTmp ("diagOffsets", lclNumRows);
3556  staticGraph_->getLocalDiagOffsets (offsetsTmp);
3557  typedef Kokkos::View<size_t*, Kokkos::HostSpace,
3558  Kokkos::MemoryUnmanaged> output_type;
3559  output_type offsetsOut (offsets.getRawPtr (), lclNumRows);
3560  // DEEP_COPY REVIEW - DEVICE-TO-HOST
      // NOTE(review): this deep_copy takes an execution space instance
      // and is not followed by a fence, whereas setAllToScalar() fences
      // after its deep_copy.  Confirm whether a fence is needed before
      // the caller reads `offsets` on host.
3561  Kokkos::deep_copy (execution_space(), offsetsOut, offsetsTmp);
3562  }
3563  }
3564 
// One-argument getLocalDiagCopy: write the matrix's local diagonal
// entries into `diag` without using precomputed diagonal offsets.
//
// NOTE(review): the declarator lines (class qualifier, function name,
// parameter list) are absent from this listing; the name comes from the
// tfecfFuncName string below and `diag` from its uses in the body.
//
// Throws std::runtime_error if the matrix has no graph or no column
// Map.  Returns without doing anything on processes whose row Map or
// row Map communicator is null.
3565  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3566  void
3569  {
3570  using Teuchos::ArrayRCP;
3571  using Teuchos::ArrayView;
3572  using Teuchos::av_reinterpret_cast;
3573  const char tfecfFuncName[] = "getLocalDiagCopy (1-arg): ";
3574  typedef local_ordinal_type LO;
3575
3576
3577  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3578  staticGraph_.is_null (), std::runtime_error,
3579  "This method requires that the matrix have a graph.");
3580  auto rowMapPtr = this->getRowMap ();
3581  if (rowMapPtr.is_null () || rowMapPtr->getComm ().is_null ()) {
3582  // Processes on which the row Map or its communicator is null
3583  // don't participate. Users shouldn't even call this method on
3584  // those processes.
3585  return;
3586  }
3587  auto colMapPtr = this->getColMap ();
3588  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3589  (! this->hasColMap () || colMapPtr.is_null (), std::runtime_error,
3590  "This method requires that the matrix have a column Map.");
3591  const map_type& rowMap = * rowMapPtr;
3592  const map_type& colMap = * colMapPtr;
3593  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3594
3595 #ifdef HAVE_TPETRA_DEBUG
3596  // isCompatible() requires an all-reduce, and thus this check
3597  // should only be done in debug mode.
3598  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3599  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3600  "The input Vector's Map must be compatible with the CrsMatrix's row "
3601  "Map. You may check this by using Map's isCompatible method: "
3602  "diag.getMap ()->isCompatible (A.getRowMap ());");
3603 #endif // HAVE_TPETRA_DEBUG
3604
      // Fill-complete matrices use the device kernel on the local matrix;
      // otherwise a separate helper that takes the whole CrsMatrix is
      // used.  Both return values are deliberately discarded.
3605  if (this->isFillComplete ()) {
3606  const auto D_lcl = diag.getLocalViewDevice(Access::OverwriteAll);
3607  // 1-D subview of the first (and only) column of D_lcl.
3608  const auto D_lcl_1d =
3609  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3610
3611  const auto lclRowMap = rowMap.getLocalMap ();
3612  const auto lclColMap = colMap.getLocalMap ();
3614  (void) getDiagCopyWithoutOffsets (D_lcl_1d, lclRowMap,
3615  lclColMap,
3616  getLocalMatrixDevice ());
3617  }
3618  else {
3620  (void) getLocalDiagCopyWithoutOffsetsNotFillComplete (diag, *this);
3621  }
3622  }
3623 
// getLocalDiagCopy overload taking precomputed diagonal offsets as an
// unmanaged device Kokkos::View: write the diagonal into `diag` by
// calling KokkosSparse::getDiagCopy on the local device matrix.
//
// NOTE(review): the first declarator lines (class qualifier, function
// name, the `diag` parameter) are absent from this listing.
//
// In debug builds, throws std::runtime_error if diag's Map is not
// compatible with the matrix's row Map.
3624  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3625  void
3628  const Kokkos::View<const size_t*, device_type,
3629  Kokkos::MemoryUnmanaged>& offsets) const
3630  {
3631  typedef LocalOrdinal LO;
3632
3633 #ifdef HAVE_TPETRA_DEBUG
3634  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3635  const map_type& rowMap = * (this->getRowMap ());
3636  // isCompatible() requires an all-reduce, and thus this check
3637  // should only be done in debug mode.
3638  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3639  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3640  "The input Vector's Map must be compatible with (in the sense of Map::"
3641  "isCompatible) the CrsMatrix's row Map.");
3642 #endif // HAVE_TPETRA_DEBUG
3643
3644  // For now, we fill the Vector on the host and sync to device.
3645  // Later, we may write a parallel kernel that works entirely on
3646  // device.
3647  //
3648  // NOTE (mfh 21 Jan 2016): The host kernel here assumes UVM. Once
3649  // we write a device kernel, it will not need to assume UVM.
      //
      // NOTE(review): the code below takes diag's device view and calls
      // KokkosSparse::getDiagCopy, so the remarks above about a host
      // fill look stale -- confirm and update.
3650
3651  auto D_lcl = diag.getLocalViewDevice (Access::OverwriteAll);
3652  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3653  // Get 1-D subview of the first (and only) column of D_lcl.
3654  auto D_lcl_1d =
3655  Kokkos::subview (D_lcl, Kokkos::make_pair (LO (0), myNumRows), 0);
3656
3657  KokkosSparse::getDiagCopy (D_lcl_1d, offsets,
3658  getLocalMatrixDevice ());
3659  }
3660 
// getLocalDiagCopy overload taking precomputed diagonal offsets as a
// Teuchos::ArrayView: fill `diag` on host.
//
// NOTE(review): the first declarator lines (class qualifier, function
// name, the `diag` parameter) are absent from this listing.
//
// For each local row, diag's entry is first set to zero; if the row's
// offset is not the invalid sentinel, it is overwritten with the packed
// value at (row's packed row pointer + offset).  In debug builds,
// throws std::runtime_error if diag's Map is not compatible with the
// matrix's row Map.
3661  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3662  void
3665  const Teuchos::ArrayView<const size_t>& offsets) const
3666  {
3667  using LO = LocalOrdinal;
3668  using host_execution_space = Kokkos::DefaultHostExecutionSpace;
3669  using IST = impl_scalar_type;
3670
3671 #ifdef HAVE_TPETRA_DEBUG
3672  const char tfecfFuncName[] = "getLocalDiagCopy: ";
3673  const map_type& rowMap = * (this->getRowMap ());
3674  // isCompatible() requires an all-reduce, and thus this check
3675  // should only be done in debug mode.
3676  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3677  ! diag.getMap ()->isCompatible (rowMap), std::runtime_error,
3678  "The input Vector's Map must be compatible with (in the sense of Map::"
3679  "isCompatible) the CrsMatrix's row Map.");
3680 #endif // HAVE_TPETRA_DEBUG
3681
3682  // See #1510. In case diag has already been marked modified on
3683  // device, we need to clear that flag, since the code below works
3684  // on host.
3685  //diag.clear_sync_state ();
3686
3687  // For now, we fill the Vector on the host and sync to device.
3688  // Later, we may write a parallel kernel that works entirely on
3689  // device.
3690  auto lclVecHost = diag.getLocalViewHost(Access::OverwriteAll);
3691  // 1-D subview of the first (and only) column of lclVecHost.
3692  auto lclVecHost1d = Kokkos::subview (lclVecHost, Kokkos::ALL (), 0);
3693
      // Wrap the caller's offsets in an unmanaged host View so the lambda
      // below can capture it by value.
3694  using host_offsets_view_type =
3695  Kokkos::View<const size_t*, Kokkos::HostSpace,
3696  Kokkos::MemoryTraits<Kokkos::Unmanaged> >;
3697  host_offsets_view_type h_offsets (offsets.getRawPtr (), offsets.size ());
3698  // Find the diagonal entries and put them in lclVecHost1d.
3699  using range_type = Kokkos::RangePolicy<host_execution_space, LO>;
3700  const LO myNumRows = static_cast<LO> (this->getLocalNumRows ());
3701  const size_t INV = Tpetra::Details::OrdinalTraits<size_t>::invalid ();
3702
3703  auto rowPtrsPackedHost = staticGraph_->getRowPtrsPackedHost();
3704  auto valuesPackedHost = valuesPacked_wdv.getHostView(Access::ReadOnly);
      // NOTE(review): h_offsets is indexed by every local row with no
      // length check here; presumably the caller supplies at least
      // getLocalNumRows() offsets -- confirm.
3705  Kokkos::parallel_for
3706  ("Tpetra::CrsMatrix::getLocalDiagCopy",
3707  range_type (0, myNumRows),
3708  [&, INV, h_offsets] (const LO lclRow) { // Value capture is a workaround for cuda + gcc-7.2 compiler bug w/c++14
3709  lclVecHost1d(lclRow) = STS::zero (); // default value if no diag entry
3710  if (h_offsets[lclRow] != INV) {
3711  auto curRowOffset = rowPtrsPackedHost (lclRow);
3712  lclVecHost1d(lclRow) =
3713  static_cast<IST> (valuesPackedHost(curRowOffset+h_offsets[lclRow]));
3714  }
3715  });
3716  //diag.sync_device ();
3717  }
3718 
3719 
// leftScale: scale the matrix on the left by the entries of the input
// vector `x`.
//
// NOTE(review): the declarator lines (class qualifier, function name,
// parameter list) are absent from this listing; the name comes from the
// tfecfFuncName string below and `x` from its uses in the body.
//
// `x` must have either the matrix's range Map or its row Map; otherwise
// std::invalid_argument is thrown.  If `x` has the range Map and the
// graph has an exporter, `x` is first redistributed into a temporary
// row-Map vector.  The matrix must be fill complete, otherwise
// std::runtime_error is thrown; note that this check runs after the Map
// checks and any redistribution.
3720  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3721  void
3724  {
3725  using ::Tpetra::Details::ProfilingRegion;
3726  using Teuchos::ArrayRCP;
3727  using Teuchos::ArrayView;
3728  using Teuchos::null;
3729  using Teuchos::RCP;
3730  using Teuchos::rcp;
3731  using Teuchos::rcpFromRef;
3733  const char tfecfFuncName[] = "leftScale: ";
3734
3735  ProfilingRegion region ("Tpetra::CrsMatrix::leftScale");
3736
      // xp ends up pointing either at x itself (non-owning, via
      // rcpFromRef) or at a temporary row-Map copy of x.
3737  RCP<const vec_type> xp;
3738  if (this->getRangeMap ()->isSameAs (* (x.getMap ()))) {
3739  // Take from Epetra: If we have a non-trivial exporter, we must
3740  // import elements that are permuted or are on other processors.
3741  auto exporter = this->getCrsGraphRef ().getExporter ();
3742  if (exporter.get () != nullptr) {
3743  RCP<vec_type> tempVec (new vec_type (this->getRowMap ()));
3744  tempVec->doImport (x, *exporter, REPLACE); // reverse mode
3745  xp = tempVec;
3746  }
3747  else {
3748  xp = rcpFromRef (x);
3749  }
3750  }
3751  else if (this->getRowMap ()->isSameAs (* (x.getMap ()))) {
3752  xp = rcpFromRef (x);
3753  }
3754  else {
3755  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3756  (true, std::invalid_argument, "x's Map must be the same as "
3757  "either the row Map or the range Map of the CrsMatrix.");
3758  }
3759
3760  if (this->isFillComplete()) {
3761  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3762  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
      // NOTE(review): the meaning of the two `false` flags is not visible
      // in this file -- check leftScaleLocalCrsMatrix's declaration.
3764  leftScaleLocalCrsMatrix (getLocalMatrixDevice (),
3765  x_lcl_1d, false, false);
3766  }
3767  else {
3768  // 6/2020 Disallow leftScale of non-fillComplete matrices #7446
3769  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3770  (true, std::runtime_error, "CrsMatrix::leftScale requires matrix to be"
3771  " fillComplete");
3772  }
3773  }
3774 
// rightScale: scale the matrix on the right by the entries of the input
// vector `x`.
//
// NOTE(review): the declarator lines (class qualifier, function name,
// parameter list) are absent from this listing; the name comes from the
// tfecfFuncName string below and `x` from its uses in the body.
//
// `x` must have either the matrix's domain Map or its column Map;
// otherwise std::runtime_error is thrown (leftScale throws
// std::invalid_argument for the analogous error).  If `x` has the
// domain Map and the graph has an importer, `x` is first imported into
// a temporary column-Map vector.  The matrix must be fill complete,
// otherwise std::runtime_error is thrown; note that this check runs
// after the Map checks and any import.
3775  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3776  void
3779  {
3780  using ::Tpetra::Details::ProfilingRegion;
3781  using Teuchos::ArrayRCP;
3782  using Teuchos::ArrayView;
3783  using Teuchos::null;
3784  using Teuchos::RCP;
3785  using Teuchos::rcp;
3786  using Teuchos::rcpFromRef;
3788  const char tfecfFuncName[] = "rightScale: ";
3789
3790  ProfilingRegion region ("Tpetra::CrsMatrix::rightScale");
3791
      // xp ends up pointing either at x itself (non-owning, via
      // rcpFromRef) or at a temporary column-Map copy of x.
3792  RCP<const vec_type> xp;
3793  if (this->getDomainMap ()->isSameAs (* (x.getMap ()))) {
3794  // Take from Epetra: If we have a non-trivial importer, we must
3795  // import elements that are permuted or are on other processors.
3796  auto importer = this->getCrsGraphRef ().getImporter ();
3797  if (importer.get () != nullptr) {
3798  RCP<vec_type> tempVec (new vec_type (this->getColMap ()));
3799  tempVec->doImport (x, *importer, REPLACE);
3800  xp = tempVec;
3801  }
3802  else {
3803  xp = rcpFromRef (x);
3804  }
3805  }
3806  else if (this->getColMap ()->isSameAs (* (x.getMap ()))) {
3807  xp = rcpFromRef (x);
3808  } else {
3809  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3810  (true, std::runtime_error, "x's Map must be the same as "
3811  "either the domain Map or the column Map of the CrsMatrix.");
3812  }
3813
3814  if (this->isFillComplete()) {
3815  auto x_lcl = xp->getLocalViewDevice (Access::ReadOnly);
3816  auto x_lcl_1d = Kokkos::subview (x_lcl, Kokkos::ALL (), 0);
      // NOTE(review): the meaning of the two `false` flags is not visible
      // in this file -- check rightScaleLocalCrsMatrix's declaration.
3818  rightScaleLocalCrsMatrix (getLocalMatrixDevice (),
3819  x_lcl_1d, false, false);
3820  }
3821  else {
3822  // 6/2020 Disallow rightScale of non-fillComplete matrices #7446
3823  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
3824  (true, std::runtime_error, "CrsMatrix::rightScale requires matrix to be"
3825  " fillComplete");
3826  }
3827  }
3828 
// Compute the Frobenius norm of the matrix: the square root of the sum,
// over all processes, of the squared magnitudes of the stored entries.
//
// NOTE(review): the declarator lines (return type, class qualifier,
// function name) are absent from this listing; the function returns
// STM::sqrt of a mag_type.
//
// Each process sums |a_ij|^2 over its local entries on host, then a
// sum all-reduce over the matrix's communicator combines the partial
// sums.  Every process in the communicator takes part in the
// all-reduce, including those with no local entries.
3829  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3833  {
3834  using Teuchos::ArrayView;
3835  using Teuchos::outArg;
3836  using Teuchos::REDUCE_SUM;
3837  using Teuchos::reduceAll;
3838
3839  // FIXME (mfh 05 Aug 2014) Write a thread-parallel kernel for the
3840  // local part of this computation. It could make sense to put
3841  // this operation in the Kokkos::CrsMatrix.
3842
3843  // Accumulate this process' sum of squared magnitudes.
3844  mag_type mySum = STM::zero ();
3845  if (getLocalNumEntries() > 0) {
3846  if (isStorageOptimized ()) {
3847  // "Optimized" storage is packed storage. That means we can
3848  // iterate in one pass through the 1-D values array.
3849  const size_t numEntries = getLocalNumEntries ();
3850  auto values = valuesPacked_wdv.getHostView(Access::ReadOnly);
3851  for (size_t k = 0; k < numEntries; ++k) {
3852  auto val = values[k];
3853  // Note (etp 06 Jan 2015) We need abs() here for composite types
3854  // (in general, if mag_type is on the left-hand-side, we need
3855  // abs() on the right-hand-side)
3856  const mag_type val_abs = STS::abs (val);
3857  mySum += val_abs * val_abs;
3858  }
3859  }
3860  else {
3861  const LocalOrdinal numRows =
3862  static_cast<LocalOrdinal> (this->getLocalNumRows ());
3863  for (LocalOrdinal r = 0; r < numRows; ++r) {
      // Use staticGraph_, not myGraph_: myGraph_ is null when the matrix
      // was built on a const graph, and this read-only query does not
      // need a modifiable graph.
3864  const RowInfo rowInfo = staticGraph_->getRowInfo (r);
3865  const size_t numEntries = rowInfo.numEntries;
3866  auto A_r = this->getValuesViewHost(rowInfo);
3867  for (size_t k = 0; k < numEntries; ++k) {
3868  const impl_scalar_type val = A_r[k];
3869  const mag_type val_abs = STS::abs (val);
3870  mySum += val_abs * val_abs;
3871  }
3872  }
3873  }
3874  }
3875  mag_type totalSum = STM::zero ();
3876  reduceAll<int, mag_type> (* (getComm ()), REDUCE_SUM,
3877  mySum, outArg (totalSum));
3878  return STM::sqrt (totalSum);
3879  }
3880 
// Replace the matrix's column Map with `newColMap` by forwarding to the
// owned graph's replaceColMap().
//
// Throws std::runtime_error if the matrix does not own a modifiable
// graph (myGraph_ is null).
3881  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3882  void
3884  replaceColMap (const Teuchos::RCP<const map_type>& newColMap)
3885  {
3886  const char tfecfFuncName[] = "replaceColMap: ";
3887  // FIXME (mfh 06 Aug 2014) What if the graph is locally indexed?
3888  // Then replacing the column Map might mean that we need to
3889  // reindex the column indices.
3890  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3891  myGraph_.is_null (), std::runtime_error,
3892  "This method does not work if the matrix has a const graph. The whole "
3893  "idea of a const graph is that you are not allowed to change it, but "
3894  "this method necessarily must modify the graph, since the graph owns "
3895  "the matrix's column Map.");
3896  myGraph_->replaceColMap (newColMap);
3897  }
3898 
// reindexColumns: reindex the graph's column indices against
// `newColMap` / `newImport`, optionally re-sorting each row's indices
// together with the matching values.
//
// NOTE(review): the first declarator lines (class qualifier, function
// name, the `graph` parameter) are absent from this listing.
//
// If `graph` is null the matrix's own graph (myGraph_) is used; if that
// is also null, std::invalid_argument is thrown.  The graph is asked
// not to sort, so that indices and values can be sorted jointly here.
3899  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3900  void
3903  const Teuchos::RCP<const map_type>& newColMap,
3904  const Teuchos::RCP<const import_type>& newImport,
3905  const bool sortEachRow)
3906  {
3907  const char tfecfFuncName[] = "reindexColumns: ";
3908  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3909  graph == nullptr && myGraph_.is_null (), std::invalid_argument,
3910  "The input graph is null, but the matrix does not own its graph.");
3911
3912  crs_graph_type& theGraph = (graph == nullptr) ? *myGraph_ : *graph;
3913  const bool sortGraph = false; // we'll sort graph & matrix together below
3914
3915  theGraph.reindexColumns (newColMap, newImport, sortGraph);
3916
      // Sort only when requested, and only if the graph is locally
      // indexed and not already sorted.
3917  if (sortEachRow && theGraph.isLocallyIndexed () && ! theGraph.isSorted ()) {
3918  const LocalOrdinal lclNumRows =
3919  static_cast<LocalOrdinal> (theGraph.getLocalNumRows ());
3920
3921  for (LocalOrdinal row = 0; row < lclNumRows; ++row) {
3922
3923  const RowInfo rowInfo = theGraph.getRowInfo (row);
3924  auto lclColInds = theGraph.getLocalIndsViewHostNonConst (rowInfo);
3925  auto vals = this->getValuesViewHostNonConst (rowInfo);
3926
      // Sort the first rowInfo.numEntries column indices of this row,
      // passing the values along so they are reordered with them.
3927  sort2 (lclColInds.data (),
3928  lclColInds.data () + rowInfo.numEntries,
3929  vals.data ());
3930  }
      // Record on the graph that its indices are now sorted.
3931  theGraph.indicesAreSorted_ = true;
3932  }
3933  }
3934 
// Replace the matrix's domain Map with `newDomainMap` by forwarding to
// the owned graph's replaceDomainMap().
//
// Throws std::runtime_error if the matrix does not own a modifiable
// graph (myGraph_ is null).
3935  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3936  void
3938  replaceDomainMap (const Teuchos::RCP<const map_type>& newDomainMap)
3939  {
3940  const char tfecfFuncName[] = "replaceDomainMap: ";
3941  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3942  myGraph_.is_null (), std::runtime_error,
3943  "This method does not work if the matrix has a const graph. The whole "
3944  "idea of a const graph is that you are not allowed to change it, but this"
3945  " method necessarily must modify the graph, since the graph owns the "
3946  "matrix's domain Map and Import objects.");
3947  myGraph_->replaceDomainMap (newDomainMap);
3948  }
3949 
// Replace the matrix's domain Map and Import object with `newDomainMap`
// and `newImporter` by forwarding to the owned graph's
// replaceDomainMapAndImporter().
//
// Throws std::runtime_error if the matrix does not own a modifiable
// graph (myGraph_ is null).
3950  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3951  void
3953  replaceDomainMapAndImporter (const Teuchos::RCP<const map_type>& newDomainMap,
3954  Teuchos::RCP<const import_type>& newImporter)
3955  {
3956  const char tfecfFuncName[] = "replaceDomainMapAndImporter: ";
3957  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3958  myGraph_.is_null (), std::runtime_error,
3959  "This method does not work if the matrix has a const graph. The whole "
3960  "idea of a const graph is that you are not allowed to change it, but this"
3961  " method necessarily must modify the graph, since the graph owns the "
3962  "matrix's domain Map and Import objects.");
3963  myGraph_->replaceDomainMapAndImporter (newDomainMap, newImporter);
3964  }
3965 
// Replace the matrix's range Map with `newRangeMap` by forwarding to
// the owned graph's replaceRangeMap().
//
// Throws std::runtime_error if the matrix does not own a modifiable
// graph (myGraph_ is null).
3966  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3967  void
3969  replaceRangeMap (const Teuchos::RCP<const map_type>& newRangeMap)
3970  {
3971  const char tfecfFuncName[] = "replaceRangeMap: ";
3972  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3973  myGraph_.is_null (), std::runtime_error,
3974  "This method does not work if the matrix has a const graph. The whole "
3975  "idea of a const graph is that you are not allowed to change it, but this"
3976  " method necessarily must modify the graph, since the graph owns the "
3977  "matrix's range Map and Export objects.");
3978  myGraph_->replaceRangeMap (newRangeMap);
3979  }
3980 
// Replace the matrix's range Map and Export object with `newRangeMap`
// and `newExporter` by forwarding to the owned graph's
// replaceRangeMapAndExporter().
//
// Throws std::runtime_error if the matrix does not own a modifiable
// graph (myGraph_ is null).
3981  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3982  void
3984  replaceRangeMapAndExporter (const Teuchos::RCP<const map_type>& newRangeMap,
3985  Teuchos::RCP<const export_type>& newExporter)
3986  {
3987  const char tfecfFuncName[] = "replaceRangeMapAndExporter: ";
3988  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
3989  myGraph_.is_null (), std::runtime_error,
3990  "This method does not work if the matrix has a const graph. The whole "
3991  "idea of a const graph is that you are not allowed to change it, but this"
3992  " method necessarily must modify the graph, since the graph owns the "
3993  "matrix's range Map and Export objects.");
3994  myGraph_->replaceRangeMapAndExporter (newRangeMap, newExporter);
3995  }
3996 
// Append (column index, value) pairs for global row `globalRow` to the
// nonlocals_ store.
//
// The entry for `globalRow` is created if it does not exist yet.  The
// new pairs are appended after any already stored for that row; no
// sorting or merging of duplicate column indices happens here.
//
// NOTE(review): the loop reads values[k] for every k < indices.size()
// and does not check values.size() -- presumably callers guarantee that
// the two arrays have equal length; confirm.
3997  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
3998  void
4000  insertNonownedGlobalValues (const GlobalOrdinal globalRow,
4001  const Teuchos::ArrayView<const GlobalOrdinal>& indices,
4002  const Teuchos::ArrayView<const Scalar>& values)
4003  {
4004  using Teuchos::Array;
4005  typedef GlobalOrdinal GO;
4006  typedef typename Array<GO>::size_type size_type;
4007
4008  const size_type numToInsert = indices.size ();
4009  // Add the new data to the list of nonlocals.
4010  // This creates the arrays if they don't exist yet.
4011  std::pair<Array<GO>, Array<Scalar> >& curRow = nonlocals_[globalRow];
4012  Array<GO>& curRowInds = curRow.first;
4013  Array<Scalar>& curRowVals = curRow.second;
      // Reserve once up front so the push_back calls below do not
      // reallocate repeatedly.
4014  const size_type newCapacity = curRowInds.size () + numToInsert;
4015  curRowInds.reserve (newCapacity);
4016  curRowVals.reserve (newCapacity);
4017  for (size_type k = 0; k < numToInsert; ++k) {
4018  curRowInds.push_back (indices[k]);
4019  curRowVals.push_back (values[k]);
4020  }
4021  }
4022 
4023  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4024  void
4027  {
4028  using Details::Behavior;
4030  using Teuchos::Comm;
4031  using Teuchos::outArg;
4032  using Teuchos::RCP;
4033  using Teuchos::rcp;
4034  using Teuchos::REDUCE_MAX;
4035  using Teuchos::REDUCE_MIN;
4036  using Teuchos::reduceAll;
4037  using std::endl;
4039  //typedef LocalOrdinal LO;
4040  typedef GlobalOrdinal GO;
4041  typedef typename Teuchos::Array<GO>::size_type size_type;
4042  const char tfecfFuncName[] = "globalAssemble: "; // for exception macro
4043  ProfilingRegion regionGlobalAssemble ("Tpetra::CrsMatrix::globalAssemble");
4044 
4045  const bool verbose = Behavior::verbose("CrsMatrix");
4046  std::unique_ptr<std::string> prefix;
4047  if (verbose) {
4048  prefix = this->createPrefix("CrsMatrix", "globalAssemble");
4049  std::ostringstream os;
4050  os << *prefix << "nonlocals_.size()=" << nonlocals_.size()
4051  << endl;
4052  std::cerr << os.str();
4053  }
4054  RCP<const Comm<int> > comm = getComm ();
4055 
4056  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4057  (! isFillActive (), std::runtime_error, "Fill must be active before "
4058  "you may call this method.");
4059 
4060  const size_t myNumNonlocalRows = nonlocals_.size ();
4061 
4062  // If no processes have nonlocal rows, then we don't have to do
4063  // anything. Checking this is probably cheaper than constructing
4064  // the Map of nonlocal rows (see below) and noticing that it has
4065  // zero global entries.
4066  {
4067  const int iHaveNonlocalRows = (myNumNonlocalRows == 0) ? 0 : 1;
4068  int someoneHasNonlocalRows = 0;
4069  reduceAll<int, int> (*comm, REDUCE_MAX, iHaveNonlocalRows,
4070  outArg (someoneHasNonlocalRows));
4071  if (someoneHasNonlocalRows == 0) {
4072  return; // no process has nonlocal rows, so nothing to do
4073  }
4074  }
4075 
4076  // 1. Create a list of the "nonlocal" rows on each process. this
4077  // requires iterating over nonlocals_, so while we do this,
4078  // deduplicate the entries and get a count for each nonlocal
4079  // row on this process.
4080  // 2. Construct a new row Map corresponding to those rows. This
4081  // Map is likely overlapping. We know that the Map is not
4082  // empty on all processes, because the above all-reduce and
4083  // return exclude that case.
4084 
4085  RCP<const map_type> nonlocalRowMap;
4086  Teuchos::Array<size_t> numEntPerNonlocalRow (myNumNonlocalRows);
4087  {
4088  Teuchos::Array<GO> myNonlocalGblRows (myNumNonlocalRows);
4089  size_type curPos = 0;
4090  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4091  ++mapIter, ++curPos) {
4092  myNonlocalGblRows[curPos] = mapIter->first;
4093  // Get the values and column indices by reference, since we
4094  // intend to change them in place (that's what "erase" does).
4095  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4096  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4097 
4098  // Sort both arrays jointly, using the column indices as keys,
4099  // then merge them jointly. "Merge" here adds values
4100  // corresponding to the same column indices. The first 2 args
4101  // of merge2 are output arguments that work just like the
4102  // return value of std::unique.
4103  sort2 (gblCols.begin (), gblCols.end (), vals.begin ());
4104  typename Teuchos::Array<GO>::iterator gblCols_newEnd;
4105  typename Teuchos::Array<Scalar>::iterator vals_newEnd;
4106  merge2 (gblCols_newEnd, vals_newEnd,
4107  gblCols.begin (), gblCols.end (),
4108  vals.begin (), vals.end ());
4109  gblCols.erase (gblCols_newEnd, gblCols.end ());
4110  vals.erase (vals_newEnd, vals.end ());
4111  numEntPerNonlocalRow[curPos] = gblCols.size ();
4112  }
4113 
4114  // Currently, Map requires that its indexBase be the global min
4115  // of all its global indices. Map won't compute this for us, so
4116  // we must do it. If our process has no nonlocal rows, set the
4117  // "min" to the max possible GO value. This ensures that if
4118  // some process has at least one nonlocal row, then it will pick
4119  // that up as the min. We know that at least one process has a
4120  // nonlocal row, since the all-reduce and return at the top of
4121  // this method excluded that case.
4122  GO myMinNonlocalGblRow = std::numeric_limits<GO>::max ();
4123  {
4124  auto iter = std::min_element (myNonlocalGblRows.begin (),
4125  myNonlocalGblRows.end ());
4126  if (iter != myNonlocalGblRows.end ()) {
4127  myMinNonlocalGblRow = *iter;
4128  }
4129  }
4130  GO gblMinNonlocalGblRow = 0;
4131  reduceAll<int, GO> (*comm, REDUCE_MIN, myMinNonlocalGblRow,
4132  outArg (gblMinNonlocalGblRow));
4133  const GO indexBase = gblMinNonlocalGblRow;
4134  const global_size_t INV = Teuchos::OrdinalTraits<global_size_t>::invalid ();
4135  nonlocalRowMap = rcp (new map_type (INV, myNonlocalGblRows (), indexBase, comm));
4136  }
4137 
4138  // 3. Use the values and column indices for each nonlocal row, as
4139  // stored in nonlocals_, to construct a CrsMatrix corresponding
4140  // to nonlocal rows. We have
4141  // exact counts of the number of entries in each nonlocal row.
4142 
4143  if (verbose) {
4144  std::ostringstream os;
4145  os << *prefix << "Create nonlocal matrix" << endl;
4146  std::cerr << os.str();
4147  }
4148  RCP<crs_matrix_type> nonlocalMatrix =
4149  rcp (new crs_matrix_type (nonlocalRowMap, numEntPerNonlocalRow ()));
4150  {
4151  size_type curPos = 0;
4152  for (auto mapIter = nonlocals_.begin (); mapIter != nonlocals_.end ();
4153  ++mapIter, ++curPos) {
4154  const GO gblRow = mapIter->first;
4155  // Get values & column indices by ref, just to avoid copy.
4156  Teuchos::Array<GO>& gblCols = (mapIter->second).first;
4157  Teuchos::Array<Scalar>& vals = (mapIter->second).second;
4158  //const LO numEnt = static_cast<LO> (numEntPerNonlocalRow[curPos]);
4159  nonlocalMatrix->insertGlobalValues (gblRow, gblCols (), vals ());
4160  }
4161  }
4162  // There's no need to fill-complete the nonlocals matrix.
4163  // We just use it as a temporary container for the Export.
4164 
4165  // 4. If the original row Map is one to one, then we can Export
4166  // directly from nonlocalMatrix into this. Otherwise, we have
4167  // to create a temporary matrix with a one-to-one row Map,
4168  // Export into that, then Import from the temporary matrix into
4169  // *this.
4170 
4171  auto origRowMap = this->getRowMap ();
4172  const bool origRowMapIsOneToOne = origRowMap->isOneToOne ();
4173 
4174  int isLocallyComplete = 1; // true by default
4175 
4176  if (origRowMapIsOneToOne) {
4177  if (verbose) {
4178  std::ostringstream os;
4179  os << *prefix << "Original row Map is 1-to-1" << endl;
4180  std::cerr << os.str();
4181  }
4182  export_type exportToOrig (nonlocalRowMap, origRowMap);
4183  if (! exportToOrig.isLocallyComplete ()) {
4184  isLocallyComplete = 0;
4185  }
4186  if (verbose) {
4187  std::ostringstream os;
4188  os << *prefix << "doExport from nonlocalMatrix" << endl;
4189  std::cerr << os.str();
4190  }
4191  this->doExport (*nonlocalMatrix, exportToOrig, Tpetra::ADD);
4192  // We're done at this point!
4193  }
4194  else {
4195  if (verbose) {
4196  std::ostringstream os;
4197  os << *prefix << "Original row Map is NOT 1-to-1" << endl;
4198  std::cerr << os.str();
4199  }
4200  // If you ask a Map whether it is one to one, it does some
4201  // communication and stashes intermediate results for later use
4202  // by createOneToOne. Thus, calling createOneToOne doesn't cost
4203  // much more than the original cost of calling isOneToOne.
4204  auto oneToOneRowMap = Tpetra::createOneToOne (origRowMap);
4205  export_type exportToOneToOne (nonlocalRowMap, oneToOneRowMap);
4206  if (! exportToOneToOne.isLocallyComplete ()) {
4207  isLocallyComplete = 0;
4208  }
4209 
4210  // Create a temporary matrix with the one-to-one row Map.
4211  //
4212  // TODO (mfh 09 Sep 2016, 12 Sep 2016) Estimate # entries in
4213  // each row, to avoid reallocation during the Export operation.
4214  if (verbose) {
4215  std::ostringstream os;
4216  os << *prefix << "Create & doExport into 1-to-1 matrix"
4217  << endl;
4218  std::cerr << os.str();
4219  }
4220  crs_matrix_type oneToOneMatrix (oneToOneRowMap, 0);
4221  // Export from matrix of nonlocals into the temp one-to-one matrix.
4222  oneToOneMatrix.doExport(*nonlocalMatrix, exportToOneToOne,
4223  Tpetra::ADD);
4224 
4225  // We don't need the matrix of nonlocals anymore, so get rid of
4226  // it, to keep the memory high-water mark down.
4227  if (verbose) {
4228  std::ostringstream os;
4229  os << *prefix << "Free nonlocalMatrix" << endl;
4230  std::cerr << os.str();
4231  }
4232  nonlocalMatrix = Teuchos::null;
4233 
4234  // Import from the one-to-one matrix to the original matrix.
4235  if (verbose) {
4236  std::ostringstream os;
4237  os << *prefix << "doImport from 1-to-1 matrix" << endl;
4238  std::cerr << os.str();
4239  }
4240  import_type importToOrig (oneToOneRowMap, origRowMap);
4241  this->doImport (oneToOneMatrix, importToOrig, Tpetra::ADD);
4242  }
4243 
4244  // It's safe now to clear out nonlocals_, since we've already
4245  // committed side effects to *this. The standard idiom for
4246  // clearing a Container like std::map, is to swap it with an empty
4247  // Container and let the swapped Container fall out of scope.
4248  if (verbose) {
4249  std::ostringstream os;
4250  os << *prefix << "Free nonlocals_ (std::map)" << endl;
4251  std::cerr << os.str();
4252  }
4253  decltype (nonlocals_) newNonlocals;
4254  std::swap (nonlocals_, newNonlocals);
4255 
4256  // FIXME (mfh 12 Sep 2016) I don't like this all-reduce, and I
4257  // don't like throwing an exception here. A local return value
4258  // would likely be more useful to users. However, if users find
4259  // themselves exercising nonlocal inserts often, then they are
4260  // probably novice users who need the help. See GitHub Issues
4261  // #603 and #601 (esp. the latter) for discussion.
4262 
4263  int isGloballyComplete = 0; // output argument of reduceAll
4264  reduceAll<int, int> (*comm, REDUCE_MIN, isLocallyComplete,
4265  outArg (isGloballyComplete));
4266  TEUCHOS_TEST_FOR_EXCEPTION
4267  (isGloballyComplete != 1, std::runtime_error, "On at least one process, "
4268  "you called insertGlobalValues with a global row index which is not in "
4269  "the matrix's row Map on any process in its communicator.");
4270  }
4271 
4272  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4273  void
4275  resumeFill (const Teuchos::RCP<Teuchos::ParameterList>& params)
4276  {
4277  if (! isStaticGraph ()) { // Don't resume fill of a nonowned graph.
4278  myGraph_->resumeFill (params);
4279  }
4280 #if KOKKOSKERNELS_VERSION >= 40299
4281  // Delete the apply helper (if it exists)
4282  applyHelper.reset();
4283 #endif
4284  fillComplete_ = false;
4285  }
4286 
// NOTE(review): this listing dropped the lines between the return type and
// the body (listing lines 4289-4290: presumably the class qualifier, the
// member name and the opening brace).  Judging by the body this is likely
// CrsMatrix::haveGlobalConstants() -- confirm against the real source file.
4287  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4288  bool
      // Forwards the query to this matrix's graph and returns its answer.
4291  return getCrsGraphRef ().haveGlobalConstants ();
4292  }
4293 
4294  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4295  void
4297  fillComplete (const Teuchos::RCP<Teuchos::ParameterList>& params)
4298  {
4299  const char tfecfFuncName[] = "fillComplete(params): ";
4300 
4301  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4302  (this->getCrsGraph ().is_null (), std::logic_error,
4303  "getCrsGraph() returns null. This should not happen at this point. "
4304  "Please report this bug to the Tpetra developers.");
4305 
4306  const crs_graph_type& graph = this->getCrsGraphRef ();
4307  if (this->isStaticGraph () && graph.isFillComplete ()) {
4308  // If this matrix's graph is fill complete and the user did not
4309  // supply a domain or range Map, use the graph's domain and
4310  // range Maps.
4311  this->fillComplete (graph.getDomainMap (), graph.getRangeMap (), params);
4312  }
4313  else { // assume that user's row Map is the domain and range Map
4314  Teuchos::RCP<const map_type> rangeMap = graph.getRowMap ();
4315  Teuchos::RCP<const map_type> domainMap = rangeMap;
4316  this->fillComplete (domainMap, rangeMap, params);
4317  }
4318  }
4319 
// fillComplete(domainMap, rangeMap, params): finish the fill phase.
//
// Steps, in the order performed below:
//   1. Require that fill is active and the matrix is not yet fill complete
//      (throws std::runtime_error otherwise).
//   2. Read "No Nonlocal Changes" and the column-Map ghost-sorting option
//      ("sort column map ghost gids" / "Sort column Map ghost GIDs") from
//      params, if params is non-null.
//   3. Allocate values if the graph's indices are not yet allocated.
//   4. Call globalAssemble() unless the caller asserted that there are no
//      nonlocal inserts or there is only one process; with one process,
//      leftover nonlocal entries are an error (std::runtime_error).
//   5. Static graph: (debug builds only) check that the given Maps match
//      the graph's Maps, then fill the local matrix.
//      Owned graph: set the domain/range Maps, build the column Map if
//      needed, make indices local, sort and merge, build Import/Export,
//      fill the local graph and matrix, compute global or local constants
//      (controlled by "compute global constants", default true), and mark
//      the graph fill complete.
//   6. Mark the matrix fill complete and check internal state.
//
// NOTE(review): the profiling-region labels below spell "fillCompete".
// They are runtime strings and are deliberately left untouched here.
// NOTE(review): two whole-function profiling regions are opened
// (regionFillComplete and region) -- confirm whether both are intended.
4320  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4321  void
4323  fillComplete (const Teuchos::RCP<const map_type>& domainMap,
4324  const Teuchos::RCP<const map_type>& rangeMap,
4325  const Teuchos::RCP<Teuchos::ParameterList>& params)
4326  {
4327  using Details::Behavior;
4329  using Teuchos::ArrayRCP;
4330  using Teuchos::RCP;
4331  using Teuchos::rcp;
4332  using std::endl;
4333  const char tfecfFuncName[] = "fillComplete: ";
4334  ProfilingRegion regionFillComplete
4335  ("Tpetra::CrsMatrix::fillComplete");
4336  const bool verbose = Behavior::verbose("CrsMatrix");
4337  std::unique_ptr<std::string> prefix;
4338  if (verbose) {
4339  prefix = this->createPrefix("CrsMatrix", "fillComplete(dom,ran,p)");
4340  std::ostringstream os;
4341  os << *prefix << endl;
4342  std::cerr << os.str ();
4343  }
4344  Details::ProfilingRegion region(
4345  "Tpetra::CrsMatrix::fillCompete",
4346  "fillCompete");
4347 
4348  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4349  (! this->isFillActive () || this->isFillComplete (), std::runtime_error,
4350  "Matrix fill state must be active (isFillActive() "
4351  "must be true) before you may call fillComplete().");
4352  const int numProcs = this->getComm ()->getSize ();
4353 
4354  //
4355  // Read parameters from the input ParameterList.
4356  //
4357  {
4358  Details::ProfilingRegion region_fc("Tpetra::CrsMatrix::fillCompete", "ParameterList");
4359 
4360  // If true, the caller promises that no process did nonlocal
4361  // changes since the last call to fillComplete.
4362  bool assertNoNonlocalInserts = false;
4363  // If true, makeColMap sorts remote GIDs (within each remote
4364  // process' group).
4365  bool sortGhosts = true;
4366 
4367  if (! params.is_null ()) {
4368  assertNoNonlocalInserts = params->get ("No Nonlocal Changes",
4369  assertNoNonlocalInserts);
      // The all-lowercase spelling of the option is checked first; the
      // mixed-case spelling is consulted only if the former is absent.
4370  if (params->isParameter ("sort column map ghost gids")) {
4371  sortGhosts = params->get ("sort column map ghost gids", sortGhosts);
4372  }
4373  else if (params->isParameter ("Sort column Map ghost GIDs")) {
4374  sortGhosts = params->get ("Sort column Map ghost GIDs", sortGhosts);
4375  }
4376  }
4377  // We also don't need to do global assembly if there is only one
4378  // process in the communicator.
4379  const bool needGlobalAssemble = ! assertNoNonlocalInserts && numProcs > 1;
4380  // This parameter only matters if this matrix owns its graph.
4381  if (! this->myGraph_.is_null ()) {
4382  this->myGraph_->sortGhostsAssociatedWithEachProcessor_ = sortGhosts;
4383  }
4384 
4385  if (! this->getCrsGraphRef ().indicesAreAllocated ()) {
4386  if (this->hasColMap ()) { // use local indices
4387  allocateValues(LocalIndices, GraphNotYetAllocated, verbose);
4388  }
4389  else { // no column Map, so use global indices
4390  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
4391  }
4392  }
4393  // Global assemble, if we need to. This call only costs a single
4394  // all-reduce if we didn't need global assembly after all.
4395  if (needGlobalAssemble) {
4396  this->globalAssemble ();
4397  }
4398  else {
      // Global assembly was skipped.  With a single process there is
      // nowhere to send nonlocal entries, so any that exist are an error.
4399  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4400  (numProcs == 1 && nonlocals_.size() > 0,
4401  std::runtime_error, "Cannot have nonlocal entries on a serial run. "
4402  "An invalid entry (i.e., with row index not in the row Map) must have "
4403  "been submitted to the CrsMatrix.");
4404  }
4405  }
4406  if (this->isStaticGraph ()) {
4407  Details::ProfilingRegion region_isg("Tpetra::CrsMatrix::fillCompete", "isStaticGraph");
4408  // FIXME (mfh 14 Nov 2016) In order to fix #843, I enable the
4409  // checks below only in debug mode. It would be nicer to do a
4410  // local check, then propagate the error state in a deferred
4411  // way, whenever communication happens. That would reduce the
4412  // cost of checking, to the point where it may make sense to
4413  // enable it even in release mode.
4414 #ifdef HAVE_TPETRA_DEBUG
4415  // FIXME (mfh 18 Jun 2014) This check for correctness of the
4416  // input Maps incurs a penalty of two all-reduces for the
4417  // otherwise optimal const graph case.
4418  //
4419  // We could turn these (max) 2 all-reduces into (max) 1, by
4420  // fusing them. We could do this by adding a "locallySameAs"
4421  // method to Map, which would return one of four states:
4422  //
4423  // a. Certainly globally the same
4424  // b. Certainly globally not the same
4425  // c. Locally the same
4426  // d. Locally not the same
4427  //
4428  // The first two states don't require further communication.
4429  // The latter two states require an all-reduce to communicate
4430  // globally, but we only need one all-reduce, since we only need
4431  // to check whether at least one of the Maps is wrong.
4432  const bool domainMapsMatch =
4433  this->staticGraph_->getDomainMap ()->isSameAs (*domainMap);
4434  const bool rangeMapsMatch =
4435  this->staticGraph_->getRangeMap ()->isSameAs (*rangeMap);
4436 
4437  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4438  (! domainMapsMatch, std::runtime_error,
4439  "The CrsMatrix's domain Map does not match the graph's domain Map. "
4440  "The graph cannot be changed because it was given to the CrsMatrix "
4441  "constructor as const. You can fix this by passing in the graph's "
4442  "domain Map and range Map to the matrix's fillComplete call.");
4443 
4444  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4445  (! rangeMapsMatch, std::runtime_error,
4446  "The CrsMatrix's range Map does not match the graph's range Map. "
4447  "The graph cannot be changed because it was given to the CrsMatrix "
4448  "constructor as const. You can fix this by passing in the graph's "
4449  "domain Map and range Map to the matrix's fillComplete call.");
4450 #endif // HAVE_TPETRA_DEBUG
4451 
4452  // The matrix does _not_ own the graph, and the graph's
4453  // structure is already fixed, so just fill the local matrix.
4454  this->fillLocalMatrix (params);
4455  }
4456  else {
4457  Details::ProfilingRegion region_insg("Tpetra::CrsMatrix::fillCompete", "isNotStaticGraph");
4458  // Set the graph's domain and range Maps. This will clear the
4459  // Import if the domain Map has changed (is a different
4460  // pointer), and the Export if the range Map has changed (is a
4461  // different pointer).
4462  this->myGraph_->setDomainRangeMaps (domainMap, rangeMap);
4463 
4464  // Make the graph's column Map, if necessary.
4465  Teuchos::Array<int> remotePIDs (0);
4466  const bool mustBuildColMap = ! this->hasColMap ();
4467  if (mustBuildColMap) {
4468  this->myGraph_->makeColMap (remotePIDs);
4469  }
4470 
4471  // Make indices local, if necessary. The method won't do
4472  // anything if the graph is already locally indexed.
4473  const std::pair<size_t, std::string> makeIndicesLocalResult =
4474  this->myGraph_->makeIndicesLocal(verbose);
4475  // TODO (mfh 20 Jul 2017) Instead of throwing here, pass along
4476  // the error state to makeImportExport
4477  // which may do all-reduces and thus may
4478  // have the opportunity to communicate that error state.
      // A nonzero first member signals failure; the second member is used
      // as the exception message.
4479  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4480  (makeIndicesLocalResult.first != 0, std::runtime_error,
4481  makeIndicesLocalResult.second);
4482 
4483  const bool sorted = this->myGraph_->isSorted ();
4484  const bool merged = this->myGraph_->isMerged ();
4485  this->sortAndMergeIndicesAndValues (sorted, merged);
4486 
4487  // Make Import and Export objects, if they haven't been made
4488  // already. If we made a column Map above, reuse information
4489  // from that process to avoid communication in the Import setup.
4490  this->myGraph_->makeImportExport (remotePIDs, mustBuildColMap);
4491 
4492  // The matrix _does_ own the graph, so fill the local graph at
4493  // the same time as the local matrix.
4494  this->fillLocalGraphAndMatrix (params);
4495 
      // Global constants are computed unless params is non-null and sets
      // "compute global constants" to false.
4496  const bool callGraphComputeGlobalConstants = params.get () == nullptr ||
4497  params->get ("compute global constants", true);
4498  if (callGraphComputeGlobalConstants) {
4499  this->myGraph_->computeGlobalConstants ();
4500  }
4501  else {
4502  this->myGraph_->computeLocalConstants ();
4503  }
4504  this->myGraph_->fillComplete_ = true;
4505  this->myGraph_->checkInternalState ();
4506  }
4507 
4508  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4509 
4510  this->fillComplete_ = true; // Now we're fill complete!
4511  {
4512  Details::ProfilingRegion region_cis(
4513  "Tpetra::CrsMatrix::fillCompete", "checkInternalState"
4514  );
4515  this->checkInternalState ();
4516  }
4517  } //fillComplete(domainMap, rangeMap, params)
4518 
4519  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4520  void
4522  expertStaticFillComplete (const Teuchos::RCP<const map_type> & domainMap,
4523  const Teuchos::RCP<const map_type> & rangeMap,
4524  const Teuchos::RCP<const import_type>& importer,
4525  const Teuchos::RCP<const export_type>& exporter,
4526  const Teuchos::RCP<Teuchos::ParameterList> &params)
4527  {
4528 #ifdef HAVE_TPETRA_MMM_TIMINGS
4529  std::string label;
4530  if(!params.is_null())
4531  label = params->get("Timer Label",label);
4532  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
4533  using Teuchos::TimeMonitor;
4534 
4535  Teuchos::TimeMonitor all(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-all")));
4536 #endif
4537 
4538  const char tfecfFuncName[] = "expertStaticFillComplete: ";
4539  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC( ! isFillActive() || isFillComplete(),
4540  std::runtime_error, "Matrix fill state must be active (isFillActive() "
4541  "must be true) before calling fillComplete().");
4542  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
4543  myGraph_.is_null (), std::logic_error, "myGraph_ is null. This is not allowed.");
4544 
4545  {
4546 #ifdef HAVE_TPETRA_MMM_TIMINGS
4547  Teuchos::TimeMonitor graph(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-Graph")));
4548 #endif
4549  // We will presume globalAssemble is not needed, so we do the ESFC on the graph
4550  myGraph_->expertStaticFillComplete (domainMap, rangeMap, importer, exporter,params);
4551  }
4552 
4553  {
4554 #ifdef HAVE_TPETRA_MMM_TIMINGS
4555  TimeMonitor fLGAM(*TimeMonitor::getNewTimer(prefix + std::string("eSFC-M-fLGAM")));
4556 #endif
4557  // Fill the local graph and matrix
4558  fillLocalGraphAndMatrix (params);
4559  }
4560  // FIXME (mfh 28 Aug 2014) "Preserve Local Graph" bool parameter no longer used.
4561 
4562  // Now we're fill complete!
4563  fillComplete_ = true;
4564 
4565  // Sanity checks at the end.
4566 #ifdef HAVE_TPETRA_DEBUG
4567  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(isFillActive(), std::logic_error,
4568  ": We're at the end of fillComplete(), but isFillActive() is true. "
4569  "Please report this bug to the Tpetra developers.");
4570  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(! isFillComplete(), std::logic_error,
4571  ": We're at the end of fillComplete(), but isFillActive() is true. "
4572  "Please report this bug to the Tpetra developers.");
4573 #endif // HAVE_TPETRA_DEBUG
4574  {
4575 #ifdef HAVE_TPETRA_MMM_TIMINGS
4576  Teuchos::TimeMonitor cIS(*TimeMonitor::getNewTimer(prefix + std::string("ESFC-M-cIS")));
4577 #endif
4578 
4579  checkInternalState();
4580  }
4581  }
4582 
4583  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4585  mergeRowIndicesAndValues (size_t rowLen, LocalOrdinal* cols, impl_scalar_type* vals)
4586  {
4587  impl_scalar_type* rowValueIter = vals;
4588  // beg,end define a half-exclusive interval over which to iterate.
4589  LocalOrdinal* beg = cols;
4590  LocalOrdinal* end = cols + rowLen;
4591  LocalOrdinal* newend = beg;
4592  if (beg != end) {
4593  LocalOrdinal* cur = beg + 1;
4594  impl_scalar_type* vcur = rowValueIter + 1;
4595  impl_scalar_type* vend = rowValueIter;
4596  cur = beg+1;
4597  while (cur != end) {
4598  if (*cur != *newend) {
4599  // new entry; save it
4600  ++newend;
4601  ++vend;
4602  (*newend) = (*cur);
4603  (*vend) = (*vcur);
4604  }
4605  else {
4606  // old entry; merge it
4607  //(*vend) = f (*vend, *vcur);
4608  (*vend) += *vcur;
4609  }
4610  ++cur;
4611  ++vcur;
4612  }
4613  ++newend; // one past the last entry, per typical [beg,end) semantics
4614  }
4615  return newend - beg;
4616  }
4617 
4618  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4619  void
4621  sortAndMergeIndicesAndValues (const bool sorted, const bool merged)
4622  {
4623  using ::Tpetra::Details::ProfilingRegion;
4624  typedef LocalOrdinal LO;
4625  typedef typename Kokkos::View<LO*, device_type>::HostMirror::execution_space
4626  host_execution_space;
4627  typedef Kokkos::RangePolicy<host_execution_space, LO> range_type;
4628  const char tfecfFuncName[] = "sortAndMergeIndicesAndValues: ";
4629  ProfilingRegion regionSAM ("Tpetra::CrsMatrix::sortAndMergeIndicesAndValues");
4630 
4631  if (! sorted || ! merged) {
4632  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4633  (this->isStaticGraph (), std::runtime_error, "Cannot sort or merge with "
4634  "\"static\" (const) graph, since the matrix does not own the graph.");
4635  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4636  (this->myGraph_.is_null (), std::logic_error, "myGraph_ is null, but "
4637  "this matrix claims ! isStaticGraph(). "
4638  "Please report this bug to the Tpetra developers.");
4639  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
4640  (this->isStorageOptimized (), std::logic_error, "It is invalid to call "
4641  "this method if the graph's storage has already been optimized. "
4642  "Please report this bug to the Tpetra developers.");
4643 
4644  crs_graph_type& graph = * (this->myGraph_);
4645  const LO lclNumRows = static_cast<LO> (this->getLocalNumRows ());
4646  size_t totalNumDups = 0;
4647  {
4648  //Accessing host unpacked (4-array CRS) local matrix.
4649  auto rowBegins_ = graph.getRowPtrsUnpackedHost();
4650  auto rowLengths_ = graph.k_numRowEntries_;
4651  auto vals_ = this->valuesUnpacked_wdv.getHostView(Access::ReadWrite);
4652  auto cols_ = graph.lclIndsUnpacked_wdv.getHostView(Access::ReadWrite);
4653  Kokkos::parallel_reduce ("sortAndMergeIndicesAndValues", range_type (0, lclNumRows),
4654  [=] (const LO lclRow, size_t& numDups) {
4655  size_t rowBegin = rowBegins_(lclRow);
4656  size_t rowLen = rowLengths_(lclRow);
4657  LO* cols = cols_.data() + rowBegin;
4658  impl_scalar_type* vals = vals_.data() + rowBegin;
4659  if (! sorted) {
4660  sort2 (cols, cols + rowLen, vals);
4661  }
4662  if (! merged) {
4663  size_t newRowLength = mergeRowIndicesAndValues (rowLen, cols, vals);
4664  rowLengths_(lclRow) = newRowLength;
4665  numDups += rowLen - newRowLength;
4666  }
4667  }, totalNumDups);
4668  }
4669  if (! sorted) {
4670  graph.indicesAreSorted_ = true; // we just sorted every row
4671  }
4672  if (! merged) {
4673  graph.noRedundancies_ = true; // we just merged every row
4674  }
4675  }
4676  }
4677 
// Non-transposed apply: Y_in = beta*Y_in + alpha*A*X_in, where A is this
// matrix (every local multiply below uses Teuchos::NO_TRANS).
//
// NOTE(review): this listing dropped listing lines 4680-4682 (presumably the
// class qualifier, the member name and the X_in / Y_in parameters) and line
// 4686.  The name is likely applyNonTranspose -- confirm against the real
// source file.
//
// Outline, as implemented below:
//   - alpha == 0: only zero or scale Y_in, then return.
//   - Build X_colMap: X_in itself, a constant-stride copy of it, or the
//     result of an Import when the graph has an Importer.
//   - Graph has an Exporter: multiply into the row Map MV, zero or scale
//     Y_in, then Export into Y_in.
//   - No Exporter: multiply directly into Y_in, or via the row Map MV when
//     Y_in has nonconstant stride or is the same object as X_colMap.
//   - Y_in not distributed and more than one process: finish with
//     Y_in.reduce().
4678  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4679  void
4683  Scalar alpha,
4684  Scalar beta) const
4685  {
4687  using Teuchos::RCP;
4688  using Teuchos::rcp;
4689  using Teuchos::rcp_const_cast;
4690  using Teuchos::rcpFromRef;
4691  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4692  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one ();
4693 
4694  // mfh 05 Jun 2014: Special case for alpha == 0. I added this to
4695  // fix an Ifpack2 test (RILUKSingleProcessUnitTests), which was
4696  // failing only for the Kokkos refactor version of Tpetra. It's a
4697  // good idea regardless to have the bypass.
4698  if (alpha == ZERO) {
4699  if (beta == ZERO) {
4700  Y_in.putScalar (ZERO);
4701  } else if (beta != ONE) {
4702  Y_in.scale (beta);
4703  }
4704  return;
4705  }
4706 
4707  // It's possible that X is a view of Y or vice versa. We don't
4708  // allow this (apply() requires that X and Y not alias one
4709  // another), but it's helpful to detect and work around this case.
4710  // We don't try to detect the more subtle cases (e.g., one is a
4711  // subview of the other, but their initial pointers differ). We
4712  // only need to do this if this matrix's Import is trivial;
4713  // otherwise, we don't actually apply the operator from X into Y.
4714 
4715  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4716  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4717 
4718  // If beta == 0, then the output MV will be overwritten; none of
4719  // its entries should be read. (Sparse BLAS semantics say that we
4720  // must ignore any Inf or NaN entries in Y_in, if beta is zero.)
4721  // This matters if we need to do an Export operation; see below.
      // Note: this flag is computed from the caller's beta, before the
      // replicated-output adjustment of beta further down.
4722  const bool Y_is_overwritten = (beta == ZERO);
4723 
4724  // We treat the case of a replicated MV output specially.
4725  const bool Y_is_replicated =
4726  (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4727 
4728  // This is part of the special case for replicated MV output.
4729  // We'll let each process do its thing, but do an all-reduce at
4730  // the end to sum up the results. Setting beta=0 on all processes
4731  // but Proc 0 makes the math work out for the all-reduce. (This
4732  // assumes that the replicated data is correctly replicated, so
4733  // that the data are the same on all processes.)
4734  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4735  beta = ZERO;
4736  }
4737 
4738  // Temporary MV for Import operation. After the block of code
4739  // below, this will be an (Imported if necessary) column Map MV
4740  // ready to give to localApply(...).
4741  RCP<const MV> X_colMap;
4742  if (importer.is_null ()) {
4743  if (! X_in.isConstantStride ()) {
4744  // Not all sparse mat-vec kernels can handle an input MV with
4745  // nonconstant stride correctly, so we have to copy it in that
4746  // case into a constant stride MV. To make a constant stride
4747  // copy of X_in, we force creation of the column (== domain)
4748  // Map MV (if it hasn't already been created, else fetch the
4749  // cached copy). This avoids creating a new MV each time.
4750  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in, true);
4751  Tpetra::deep_copy (*X_colMapNonConst, X_in);
4752  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4753  }
4754  else {
4755  // The domain and column Maps are the same, so do the local
4756  // multiply using the domain Map input MV X_in.
4757  X_colMap = rcpFromRef (X_in);
4758  }
4759  }
4760  else { // need to Import source (multi)vector
4761  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply: Import");
4762 
4763  // We're doing an Import anyway, which will copy the relevant
4764  // elements of the domain Map MV X_in into a separate column Map
4765  // MV. Thus, we don't have to worry whether X_in is constant
4766  // stride.
4767  RCP<MV> X_colMapNonConst = getColumnMapMultiVector (X_in);
4768 
4769  // Import from the domain Map MV to the column Map MV.
4770  X_colMapNonConst->doImport (X_in, *importer, INSERT);
4771  X_colMap = rcp_const_cast<const MV> (X_colMapNonConst);
4772  }
4773 
4774  // Temporary MV for doExport (if needed), or for copying a
4775  // nonconstant stride output MV into a constant stride MV. This
4776  // is null if we don't need the temporary MV, that is, if the
4777  // Export is trivial (null).
4778  RCP<MV> Y_rowMap = getRowMapMultiVector (Y_in);
4779 
4780  // If we have a nontrivial Export object, we must perform an
4781  // Export. In that case, the local multiply result will go into
4782  // the row Map multivector. We don't have to make a
4783  // constant-stride version of Y_in in this case, because we had to
4784  // make a constant stride Y_rowMap MV and do an Export anyway.
4785  if (! exporter.is_null ()) {
      // The local multiply uses ZERO in place of beta here: the beta
      // scaling is applied to Y_in itself just before the Export.
4786  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, ZERO);
4787  {
4788  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply: Export");
4789 
4790  // If we're overwriting the output MV Y_in completely (beta ==
4791  // 0), then make sure that it is filled with zeros before we
4792  // do the Export. Otherwise, the ADD combine mode will use
4793  // data in Y_in, which is supposed to be zero.
4794  if (Y_is_overwritten) {
4795  Y_in.putScalar (ZERO);
4796  }
4797  else {
4798  // Scale output MV by beta, so that doExport sums in the
4799  // mat-vec contribution: Y_in = beta*Y_in + alpha*A*X_in.
4800  Y_in.scale (beta);
4801  }
4802  // Do the Export operation.
4803  Y_in.doExport (*Y_rowMap, *exporter, ADD_ASSIGN);
4804  }
4805  }
4806  else { // Don't do an Export: row Map and range Map are the same.
4807  //
4808  // If Y_in does not have constant stride, or if the column Map
4809  // MV aliases Y_in, then we can't let the kernel write directly
4810  // to Y_in. Instead, we have to use the cached row (== range)
4811  // Map MV as temporary storage.
4812  //
4813  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4814  // the user passed in the same MultiVector for both X and Y. It
4815  // won't detect whether one MultiVector views the other. We
4816  // should also check the MultiVectors' raw data pointers.
4817  if (! Y_in.isConstantStride () || X_colMap.getRawPtr () == &Y_in) {
4818  // Force creating the MV if it hasn't been created already.
4819  // This will reuse a previously created cached MV.
4820  Y_rowMap = getRowMapMultiVector (Y_in, true);
4821 
4822  // If beta == 0, we don't need to copy Y_in into Y_rowMap,
4823  // since we're overwriting it anyway.
4824  if (beta != ZERO) {
4825  Tpetra::deep_copy (*Y_rowMap, Y_in);
4826  }
4827  this->localApply (*X_colMap, *Y_rowMap, Teuchos::NO_TRANS, alpha, beta);
4828  Tpetra::deep_copy (Y_in, *Y_rowMap);
4829  }
4830  else {
4831  this->localApply (*X_colMap, Y_in, Teuchos::NO_TRANS, alpha, beta);
4832  }
4833  }
4834 
4835  // If the range Map is a locally replicated Map, sum up
4836  // contributions from each process. We set beta = 0 on all
4837  // processes but Proc 0 initially, so this will handle the scaling
4838  // factor beta correctly.
4839  if (Y_is_replicated) {
4840  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply: Reduce Y");
4841  Y_in.reduce ();
4842  }
4843  }
4844 
4845  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4846  void
4850  const Teuchos::ETransp mode,
4851  Scalar alpha,
4852  Scalar beta) const
4853  {
4855  using Teuchos::null;
4856  using Teuchos::RCP;
4857  using Teuchos::rcp;
4858  using Teuchos::rcp_const_cast;
4859  using Teuchos::rcpFromRef;
4860  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero ();
4861 
4862  // Take shortcuts for alpha == 0.
4863  if (alpha == ZERO) {
4864  // Follow the Sparse BLAS convention by ignoring both the matrix
4865  // and X_in, in this case.
4866  if (beta == ZERO) {
4867  // Follow the Sparse BLAS convention by overwriting any Inf or
4868  // NaN values in Y_in, in this case.
4869  Y_in.putScalar (ZERO);
4870  }
4871  else {
4872  Y_in.scale (beta);
4873  }
4874  return;
4875  }
4876  else if (beta == ZERO) {
4877  //Thyra was implicitly assuming that Y gets set to zero / or is overwritten
4878  //when bets==0. This was not the case with transpose in a multithreaded
4879  //environment where a multiplication with subsequent atomic_adds is used
4880  //since 0 is effectively not special cased. Doing the explicit set to zero here
4881  //This catches cases where Y is nan or inf.
4882  Y_in.putScalar (ZERO);
4883  }
4884 
4885  const size_t numVectors = X_in.getNumVectors ();
4886 
4887  // We don't allow X_in and Y_in to alias one another. It's hard
4888  // to check this, because advanced users could create views from
4889  // raw pointers. However, if X_in and Y_in reference the same
4890  // object, we will do the user a favor by copying X into new
4891  // storage (with a warning). We only need to do this if we have
4892  // trivial importers; otherwise, we don't actually apply the
4893  // operator from X into Y.
4894  RCP<const import_type> importer = this->getGraph ()->getImporter ();
4895  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
4896  // access X indirectly, in case we need to create temporary storage
4897  RCP<const MV> X;
4898 
4899  // some parameters for below
4900  const bool Y_is_replicated = (! Y_in.isDistributed () && this->getComm ()->getSize () != 1);
4901  const bool Y_is_overwritten = (beta == ZERO);
4902  if (Y_is_replicated && this->getComm ()->getRank () > 0) {
4903  beta = ZERO;
4904  }
4905 
4906  // The kernels do not allow input or output with nonconstant stride.
4907  if (! X_in.isConstantStride () && importer.is_null ()) {
4908  X = rcp (new MV (X_in, Teuchos::Copy)); // Constant-stride copy of X_in
4909  } else {
4910  X = rcpFromRef (X_in); // Reference to X_in
4911  }
4912 
4913  // Set up temporary multivectors for Import and/or Export.
4914  if (importer != Teuchos::null) {
4915  if (importMV_ != Teuchos::null && importMV_->getNumVectors() != numVectors) {
4916  importMV_ = null;
4917  }
4918  if (importMV_ == null) {
4919  importMV_ = rcp (new MV (this->getColMap (), numVectors));
4920  }
4921  }
4922  if (exporter != Teuchos::null) {
4923  if (exportMV_ != Teuchos::null && exportMV_->getNumVectors() != numVectors) {
4924  exportMV_ = null;
4925  }
4926  if (exportMV_ == null) {
4927  exportMV_ = rcp (new MV (this->getRowMap (), numVectors));
4928  }
4929  }
4930 
4931  // If we have a non-trivial exporter, we must import elements that
4932  // are permuted or are on other processors.
4933  if (! exporter.is_null ()) {
4934  ProfilingRegion regionImport ("Tpetra::CrsMatrix::apply (transpose): Import");
4935  exportMV_->doImport (X_in, *exporter, INSERT);
4936  X = exportMV_; // multiply out of exportMV_
4937  }
4938 
4939  // If we have a non-trivial importer, we must export elements that
4940  // are permuted or belong to other processors. We will compute
4941  // solution into the to-be-exported MV; get a view.
4942  if (importer != Teuchos::null) {
4943  ProfilingRegion regionExport ("Tpetra::CrsMatrix::apply (transpose): Export");
4944 
4945  // FIXME (mfh 18 Apr 2015) Temporary fix suggested by Clark
4946  // Dohrmann on Fri 17 Apr 2015. At some point, we need to go
4947  // back and figure out why this helps. importMV_ SHOULD be
4948  // completely overwritten in the localApply(...) call
4949  // below, because beta == ZERO there.
4950  importMV_->putScalar (ZERO);
4951  // Do the local computation.
4952  this->localApply (*X, *importMV_, mode, alpha, ZERO);
4953 
4954  if (Y_is_overwritten) {
4955  Y_in.putScalar (ZERO);
4956  } else {
4957  Y_in.scale (beta);
4958  }
4959  Y_in.doExport (*importMV_, *importer, ADD_ASSIGN);
4960  }
4961  // otherwise, multiply into Y
4962  else {
4963  // can't multiply in-situ; can't multiply into non-strided multivector
4964  //
4965  // FIXME (mfh 05 Jun 2014) This test for aliasing only tests if
4966  // the user passed in the same MultiVector for both X and Y. It
4967  // won't detect whether one MultiVector views the other. We
4968  // should also check the MultiVectors' raw data pointers.
4969  if (! Y_in.isConstantStride () || X.getRawPtr () == &Y_in) {
4970  // Make a deep copy of Y_in, into which to write the multiply result.
4971  MV Y (Y_in, Teuchos::Copy);
4972  this->localApply (*X, Y, mode, alpha, beta);
4973  Tpetra::deep_copy (Y_in, Y);
4974  } else {
4975  this->localApply (*X, Y_in, mode, alpha, beta);
4976  }
4977  }
4978 
4979  // If the range Map is a locally replicated map, sum the
4980  // contributions from each process. (That's why we set beta=0
4981  // above for all processes but Proc 0.)
4982  if (Y_is_replicated) {
4983  ProfilingRegion regionReduce ("Tpetra::CrsMatrix::apply (transpose): Reduce Y");
4984  Y_in.reduce ();
4985  }
4986  }
4987 
4988  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
4989  void
4993  const Teuchos::ETransp mode,
4994  const Scalar& alpha,
4995  const Scalar& beta) const
4996  {
4998  using Teuchos::NO_TRANS;
4999  ProfilingRegion regionLocalApply ("Tpetra::CrsMatrix::localApply");
5000 
5001  auto X_lcl = X.getLocalViewDevice(Access::ReadOnly);
5002  auto Y_lcl = Y.getLocalViewDevice(Access::ReadWrite);
5003 
5004  const bool debug = ::Tpetra::Details::Behavior::debug ();
5005  if (debug) {
5006  const char tfecfFuncName[] = "localApply: ";
5007  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5008  (X.getNumVectors () != Y.getNumVectors (), std::runtime_error,
5009  "X.getNumVectors() = " << X.getNumVectors () << " != "
5010  "Y.getNumVectors() = " << Y.getNumVectors () << ".");
5011  const bool transpose = (mode != Teuchos::NO_TRANS);
5012  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5013  (! transpose && X.getLocalLength () !=
5014  getColMap ()->getLocalNumElements (), std::runtime_error,
5015  "NO_TRANS case: X has the wrong number of local rows. "
5016  "X.getLocalLength() = " << X.getLocalLength () << " != "
5017  "getColMap()->getLocalNumElements() = " <<
5018  getColMap ()->getLocalNumElements () << ".");
5019  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5020  (! transpose && Y.getLocalLength () !=
5021  getRowMap ()->getLocalNumElements (), std::runtime_error,
5022  "NO_TRANS case: Y has the wrong number of local rows. "
5023  "Y.getLocalLength() = " << Y.getLocalLength () << " != "
5024  "getRowMap()->getLocalNumElements() = " <<
5025  getRowMap ()->getLocalNumElements () << ".");
5026  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5027  (transpose && X.getLocalLength () !=
5028  getRowMap ()->getLocalNumElements (), std::runtime_error,
5029  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5030  "rows. X.getLocalLength() = " << X.getLocalLength ()
5031  << " != getRowMap()->getLocalNumElements() = "
5032  << getRowMap ()->getLocalNumElements () << ".");
5033  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5034  (transpose && Y.getLocalLength () !=
5035  getColMap ()->getLocalNumElements (), std::runtime_error,
5036  "TRANS or CONJ_TRANS case: X has the wrong number of local "
5037  "rows. Y.getLocalLength() = " << Y.getLocalLength ()
5038  << " != getColMap()->getLocalNumElements() = "
5039  << getColMap ()->getLocalNumElements () << ".");
5040  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5041  (! isFillComplete (), std::runtime_error, "The matrix is not "
5042  "fill complete. You must call fillComplete() (possibly with "
5043  "domain and range Map arguments) without an intervening "
5044  "resumeFill() call before you may call this method.");
5045  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5046  (! X.isConstantStride () || ! Y.isConstantStride (),
5047  std::runtime_error, "X and Y must be constant stride.");
5048  // If the two pointers are null, then they don't alias one
5049  // another, even though they are equal.
5050  // Kokkos does not guarantee that zero row-extent vectors
5051  // point to different places, so we have to check that too.
5052  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5053  (X_lcl.data () == Y_lcl.data () && X_lcl.data () != nullptr
5054  && X_lcl.extent(0) != 0,
5055  std::runtime_error, "X and Y may not alias one another.");
5056  }
5057 
5058 #if KOKKOSKERNELS_VERSION >= 40299
5059  auto A_lcl = getLocalMatrixDevice();
5060 
5061  if(!applyHelper.get()) {
5062  // The apply helper does not exist, so create it.
5063  // Decide now whether to use the imbalanced row path, or the default.
5064  bool useMergePath = false;
5065 #ifdef KOKKOSKERNELS_ENABLE_TPL_CUSPARSE
5066  //TODO: when https://github.com/kokkos/kokkos-kernels/issues/2166 is fixed and,
5067  //we can use SPMV_MERGE_PATH for the native spmv as well.
5068  //Take out this ifdef to enable that.
5069  //
5070  //Until then, only use SPMV_MERGE_PATH when calling cuSPARSE.
5071  if constexpr(std::is_same_v<execution_space, Kokkos::Cuda>) {
5072  LocalOrdinal nrows = getLocalNumRows();
5073  LocalOrdinal maxRowImbalance = 0;
5074  if(nrows != 0)
5075  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5076 
5077  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5078  useMergePath = true;
5079  }
5080 #endif
5081  applyHelper = std::make_shared<ApplyHelper>(A_lcl.nnz(), A_lcl.graph.row_map,
5082  useMergePath ? KokkosSparse::SPMV_MERGE_PATH : KokkosSparse::SPMV_DEFAULT);
5083  }
5084 
5085  // Translate mode (Teuchos enum) to KokkosKernels (1-character string)
5086  const char* modeKK = nullptr;
5087  switch(mode)
5088  {
5089  case Teuchos::NO_TRANS:
5090  modeKK = KokkosSparse::NoTranspose; break;
5091  case Teuchos::TRANS:
5092  modeKK = KokkosSparse::Transpose; break;
5093  case Teuchos::CONJ_TRANS:
5094  modeKK = KokkosSparse::ConjugateTranspose; break;
5095  default:
5096  throw std::invalid_argument("Tpetra::CrsMatrix::localApply: invalid mode");
5097  }
5098 
5099  if(applyHelper->shouldUseIntRowptrs())
5100  {
5101  auto A_lcl_int_rowptrs = applyHelper->getIntRowptrMatrix(A_lcl);
5102  KokkosSparse::spmv(
5103  &applyHelper->handle_int, modeKK,
5104  impl_scalar_type(alpha), A_lcl_int_rowptrs, X_lcl, impl_scalar_type(beta), Y_lcl);
5105  }
5106  else
5107  {
5108  KokkosSparse::spmv(
5109  &applyHelper->handle, modeKK,
5110  impl_scalar_type(alpha), A_lcl, X_lcl, impl_scalar_type(beta), Y_lcl);
5111  }
5112 #else
5113  LocalOrdinal nrows = getLocalNumRows();
5114  LocalOrdinal maxRowImbalance = 0;
5115  if(nrows != 0)
5116  maxRowImbalance = getLocalMaxNumRowEntries() - (getLocalNumEntries() / nrows);
5117 
5118  auto matrix_lcl = getLocalMultiplyOperator();
5119  if(size_t(maxRowImbalance) >= Tpetra::Details::Behavior::rowImbalanceThreshold())
5120  matrix_lcl->applyImbalancedRows (X_lcl, Y_lcl, mode, alpha, beta);
5121  else
5122  matrix_lcl->apply (X_lcl, Y_lcl, mode, alpha, beta);
5123 #endif
5124  }
5125 
5126  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5127  void
5131  Teuchos::ETransp mode,
5132  Scalar alpha,
5133  Scalar beta) const
5134  {
      // Entry point for the matrix apply. Requires a fill-complete
      // matrix (std::runtime_error is the exception type named in
      // the check), then dispatches on mode: NO_TRANS goes to
      // applyNonTranspose, any other mode goes to applyTranspose.
5136  const char fnName[] = "Tpetra::CrsMatrix::apply";
5137 
5138  TEUCHOS_TEST_FOR_EXCEPTION
5139  (! isFillComplete (), std::runtime_error,
5140  fnName << ": Cannot call apply() until fillComplete() "
5141  "has been called.");
5142 
5143  if (mode == Teuchos::NO_TRANS) {
5144  ProfilingRegion regionNonTranspose (fnName);
5145  this->applyNonTranspose (X, Y, alpha, beta);
5146  }
5147  else {
      // Covers every mode other than NO_TRANS; the mode itself is
      // forwarded to applyTranspose.
5148  ProfilingRegion regionTranspose ("Tpetra::CrsMatrix::apply (transpose)");
5149  this->applyTranspose (X, Y, mode, alpha, beta);
5150  }
5151  }
5152 
5153 
5154  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5155  template<class T>
5156  Teuchos::RCP<CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> >
5158  convert () const
5159  {
      // Build a matrix with scalar type T from this matrix's graph
      // (getCrsGraph()), copy the values over with copyConvert, and
      // fillComplete it with this matrix's domain and range Maps.
      // This matrix must be fill complete; the check below names
      // std::runtime_error as its exception type.
5160  using Teuchos::RCP;
5161  typedef CrsMatrix<T, LocalOrdinal, GlobalOrdinal, Node> output_matrix_type;
5162  const char tfecfFuncName[] = "convert: ";
5163 
5164  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5165  (! this->isFillComplete (), std::runtime_error, "This matrix (the source "
5166  "of the conversion) is not fill complete. You must first call "
5167  "fillComplete() (possibly with the domain and range Map) without an "
5168  "intervening call to resumeFill(), before you may call this method.");
5169 
5170  RCP<output_matrix_type> newMatrix
5171  (new output_matrix_type (this->getCrsGraph ()));
5172  // Copy old values into new values. impl_scalar_type and T may
5173  // differ, so we can't use Kokkos::deep_copy.
5175  copyConvert (newMatrix->getLocalMatrixDevice ().values,
5176  this->getLocalMatrixDevice ().values);
5177  // Since newMatrix has a static (const) graph, the graph already has
5178  // a column Map, and Import and Export objects already exist (if
5179  // applicable). Thus, calling fillComplete is cheap.
5180  newMatrix->fillComplete (this->getDomainMap (), this->getRangeMap ());
5181 
5182  return newMatrix;
5183  }
5184 
5185 
5186  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5187  void
5190  {
      // Consistency checks on the matrix/graph pairing. They run only
      // when Behavior::debug("CrsGraph") is true; otherwise this
      // method does nothing. Each check names std::logic_error as
      // its exception type.
5191  const bool debug = ::Tpetra::Details::Behavior::debug ("CrsGraph");
5192  if (debug) {
5193  const char tfecfFuncName[] = "checkInternalState: ";
5194  const char err[] = "Internal state is not consistent. "
5195  "Please report this bug to the Tpetra developers.";
5196 
5197  // This version of the graph (RCP<const crs_graph_type>) must
5198  // always be nonnull.
5199  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5200  (staticGraph_.is_null (), std::logic_error, err);
5201  // myGraph == null means that the matrix has a const ("static")
5202  // graph. Otherwise, the matrix has a dynamic graph (it owns its
5203  // graph).
      // When myGraph_ is set, it must be the same graph as staticGraph_.
5204  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5205  (! myGraph_.is_null () && myGraph_ != staticGraph_,
5206  std::logic_error, err);
5207  // if matrix is fill complete, then graph must be fill complete
5208  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5209  (isFillComplete () && ! staticGraph_->isFillComplete (),
5210  std::logic_error, err << " Specifically, the matrix is fill complete, "
5211  "but its graph is NOT fill complete.");
5212  // if values are allocated and they are non-zero in number, then
5213  // one of the allocations should be present
      // Concretely: allocated indices, a nonzero allocation size, and
      // at least one local row must not coexist with an empty
      // valuesUnpacked_wdv.
5214  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5215  (staticGraph_->indicesAreAllocated () &&
5216  staticGraph_->getLocalAllocationSize() > 0 &&
5217  staticGraph_->getLocalNumRows() > 0 &&
5218  valuesUnpacked_wdv.extent (0) == 0,
5219  std::logic_error, err);
5220  }
5221  }
5222 
5223  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5224  std::string
5227  {
      // One-line summary: optional object label, fill-complete
      // status, global dimensions, and (only when fill complete) the
      // global number of entries.
5228  std::ostringstream os;
5229 
5230  os << "Tpetra::CrsMatrix (Kokkos refactor): {";
      // The label is printed only when it is nonempty.
5231  if (this->getObjectLabel () != "") {
5232  os << "Label: \"" << this->getObjectLabel () << "\", ";
5233  }
5234  if (isFillComplete ()) {
5235  os << "isFillComplete: true"
5236  << ", global dimensions: [" << getGlobalNumRows () << ", "
5237  << getGlobalNumCols () << "]"
5238  << ", global number of entries: " << getGlobalNumEntries ()
5239  << "}";
5240  }
5241  else {
5242  os << "isFillComplete: false"
5243  << ", global dimensions: [" << getGlobalNumRows () << ", "
5244  << getGlobalNumCols () << "]}";
5245  }
5246  return os.str ();
5247  }
5248 
5249  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5250  void
5252  describe (Teuchos::FancyOStream &out,
5253  const Teuchos::EVerbosityLevel verbLevel) const
5254  {
      // Print the matrix to out at the requested verbosity level.
      // VERB_DEFAULT is treated as VERB_LOW; VERB_NONE prints nothing.
      // Per-process sections are printed in rank order, separated by
      // barriers on the matrix's communicator.
5255  using std::endl;
5256  using std::setw;
5257  using Teuchos::ArrayView;
5258  using Teuchos::Comm;
5259  using Teuchos::RCP;
5260  using Teuchos::TypeNameTraits;
5261  using Teuchos::VERB_DEFAULT;
5262  using Teuchos::VERB_NONE;
5263  using Teuchos::VERB_LOW;
5264  using Teuchos::VERB_MEDIUM;
5265  using Teuchos::VERB_HIGH;
5266  using Teuchos::VERB_EXTREME;
5267 
5268  const Teuchos::EVerbosityLevel vl = (verbLevel == VERB_DEFAULT) ? VERB_LOW : verbLevel;
5269 
5270  if (vl == VERB_NONE) {
5271  return; // Don't print anything at all
5272  }
5273 
5274  // By convention, describe() always begins with a tab.
5275  Teuchos::OSTab tab0 (out);
5276 
5277  RCP<const Comm<int> > comm = this->getComm();
5278  const int myRank = comm->getRank();
5279  const int numProcs = comm->getSize();
      // Column width for the per-row table: the number of decimal
      // digits of the global row count, but at least 11, plus 2.
5280  size_t width = 1;
5281  for (size_t dec=10; dec<getGlobalNumRows(); dec *= 10) {
5282  ++width;
5283  }
5284  width = std::max<size_t> (width, static_cast<size_t> (11)) + 2;
5285 
5286  // none: print nothing
5287  // low: print O(1) info from node 0
5288  // medium: print O(P) info, num entries per process
5289  // high: print O(N) info, num entries per row
5290  // extreme: print O(NNZ) info: print indices and values
5291  //
5292  // for medium and higher, print constituent objects at specified verbLevel
5293  if (myRank == 0) {
5294  out << "Tpetra::CrsMatrix (Kokkos refactor):" << endl;
5295  }
5296  Teuchos::OSTab tab1 (out);
5297 
      // Header information is printed by rank 0 only.
5298  if (myRank == 0) {
5299  if (this->getObjectLabel () != "") {
5300  out << "Label: \"" << this->getObjectLabel () << "\", ";
5301  }
5302  {
5303  out << "Template parameters:" << endl;
5304  Teuchos::OSTab tab2 (out);
5305  out << "Scalar: " << TypeNameTraits<Scalar>::name () << endl
5306  << "LocalOrdinal: " << TypeNameTraits<LocalOrdinal>::name () << endl
5307  << "GlobalOrdinal: " << TypeNameTraits<GlobalOrdinal>::name () << endl
5308  << "Node: " << TypeNameTraits<Node>::name () << endl;
5309  }
5310  if (isFillComplete()) {
5311  out << "isFillComplete: true" << endl
5312  << "Global dimensions: [" << getGlobalNumRows () << ", "
5313  << getGlobalNumCols () << "]" << endl
5314  << "Global number of entries: " << getGlobalNumEntries () << endl
5315  << endl << "Global max number of entries in a row: "
5316  << getGlobalMaxNumRowEntries () << endl;
5317  }
5318  else {
5319  out << "isFillComplete: false" << endl
5320  << "Global dimensions: [" << getGlobalNumRows () << ", "
5321  << getGlobalNumCols () << "]" << endl;
5322  }
5323  }
5324 
5325  if (vl < VERB_MEDIUM) {
5326  return; // all done!
5327  }
5328 
      // In the Map sections below, rank 0 prints the headings, while
      // the Maps' describe() calls are made on every rank.
5329  // Describe the row Map.
5330  if (myRank == 0) {
5331  out << endl << "Row Map:" << endl;
5332  }
5333  if (getRowMap ().is_null ()) {
5334  if (myRank == 0) {
5335  out << "null" << endl;
5336  }
5337  }
5338  else {
5339  if (myRank == 0) {
5340  out << endl;
5341  }
5342  getRowMap ()->describe (out, vl);
5343  }
5344 
5345  // Describe the column Map.
      // A Map that compares equal (as an RCP) to one already handled
      // is reported as "same as ..." instead of being described again.
5346  if (myRank == 0) {
5347  out << "Column Map: ";
5348  }
5349  if (getColMap ().is_null ()) {
5350  if (myRank == 0) {
5351  out << "null" << endl;
5352  }
5353  } else if (getColMap () == getRowMap ()) {
5354  if (myRank == 0) {
5355  out << "same as row Map" << endl;
5356  }
5357  } else {
5358  if (myRank == 0) {
5359  out << endl;
5360  }
5361  getColMap ()->describe (out, vl);
5362  }
5363 
5364  // Describe the domain Map.
5365  if (myRank == 0) {
5366  out << "Domain Map: ";
5367  }
5368  if (getDomainMap ().is_null ()) {
5369  if (myRank == 0) {
5370  out << "null" << endl;
5371  }
5372  } else if (getDomainMap () == getRowMap ()) {
5373  if (myRank == 0) {
5374  out << "same as row Map" << endl;
5375  }
5376  } else if (getDomainMap () == getColMap ()) {
5377  if (myRank == 0) {
5378  out << "same as column Map" << endl;
5379  }
5380  } else {
5381  if (myRank == 0) {
5382  out << endl;
5383  }
5384  getDomainMap ()->describe (out, vl);
5385  }
5386 
5387  // Describe the range Map.
5388  if (myRank == 0) {
5389  out << "Range Map: ";
5390  }
5391  if (getRangeMap ().is_null ()) {
5392  if (myRank == 0) {
5393  out << "null" << endl;
5394  }
5395  } else if (getRangeMap () == getDomainMap ()) {
5396  if (myRank == 0) {
5397  out << "same as domain Map" << endl;
5398  }
5399  } else if (getRangeMap () == getRowMap ()) {
5400  if (myRank == 0) {
5401  out << "same as row Map" << endl;
5402  }
5403  } else {
5404  if (myRank == 0) {
5405  out << endl;
5406  }
5407  getRangeMap ()->describe (out, vl);
5408  }
5409 
5410  // O(P) data
      // Each rank prints its own section when the loop reaches its
      // rank; the barriers sit outside the rank test, so every rank
      // executes them on every iteration.
5411  for (int curRank = 0; curRank < numProcs; ++curRank) {
5412  if (myRank == curRank) {
5413  out << "Process rank: " << curRank << endl;
5414  Teuchos::OSTab tab2 (out);
5415  if (! staticGraph_->indicesAreAllocated ()) {
5416  out << "Graph indices not allocated" << endl;
5417  }
5418  else {
5419  out << "Number of allocated entries: "
5420  << staticGraph_->getLocalAllocationSize () << endl;
5421  }
5422  out << "Number of entries: " << getLocalNumEntries () << endl
5423  << "Max number of entries per row: " << getLocalMaxNumRowEntries ()
5424  << endl;
5425  }
5426  // Give output time to complete by executing some barriers.
5427  comm->barrier ();
5428  comm->barrier ();
5429  comm->barrier ();
5430  }
5431 
5432  if (vl < VERB_HIGH) {
5433  return; // all done!
5434  }
5435 
5436  // O(N) and O(NNZ) data
      // One table row per local row; at VERB_EXTREME the row's
      // (index, value) pairs are appended.
5437  for (int curRank = 0; curRank < numProcs; ++curRank) {
5438  if (myRank == curRank) {
5439  out << std::setw(width) << "Proc Rank"
5440  << std::setw(width) << "Global Row"
5441  << std::setw(width) << "Num Entries";
5442  if (vl == VERB_EXTREME) {
5443  out << std::setw(width) << "(Index,Value)";
5444  }
5445  out << endl;
5446  for (size_t r = 0; r < getLocalNumRows (); ++r) {
5447  const size_t nE = getNumEntriesInLocalRow(r);
5448  GlobalOrdinal gid = getRowMap()->getGlobalElement(r);
5449  out << std::setw(width) << myRank
5450  << std::setw(width) << gid
5451  << std::setw(width) << nE;
5452  if (vl == VERB_EXTREME) {
5453  if (isGloballyIndexed()) {
5454  global_inds_host_view_type rowinds;
5455  values_host_view_type rowvals;
5456  getGlobalRowView (gid, rowinds, rowvals);
5457  for (size_t j = 0; j < nE; ++j) {
5458  out << " (" << rowinds[j]
5459  << ", " << rowvals[j]
5460  << ") ";
5461  }
5462  }
5463  else if (isLocallyIndexed()) {
      // Local column indices are converted to global indices through
      // the column Map before printing.
5464  local_inds_host_view_type rowinds;
5465  values_host_view_type rowvals;
5466  getLocalRowView (r, rowinds, rowvals);
5467  for (size_t j=0; j < nE; ++j) {
5468  out << " (" << getColMap()->getGlobalElement(rowinds[j])
5469  << ", " << rowvals[j]
5470  << ") ";
5471  }
5472  } // globally or locally indexed
5473  } // vl == VERB_EXTREME
5474  out << endl;
5475  } // for each row r on this process
5476  } // if (myRank == curRank)
5477 
5478  // Give output time to complete
5479  comm->barrier ();
5480  comm->barrier ();
5481  comm->barrier ();
5482  } // for each process p
5483  }
5484 
5485  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5486  bool
5489  {
      // Accept the source object only if it is a row_matrix_type
      // (checked with dynamic_cast); no size checks are performed.
5490  // It's not clear what kind of compatibility checks on sizes can
5491  // be performed here. Epetra_CrsGraph doesn't check any sizes for
5492  // compatibility.
5493 
5494  // Currently, the source object must be a RowMatrix with the same
5495  // four template parameters as the target CrsMatrix. We might
5496  // relax this requirement later.
5497  const row_matrix_type* srcRowMat =
5498  dynamic_cast<const row_matrix_type*> (&source);
5499  return (srcRowMat != nullptr);
5500  }
5501 
5502  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5503  void
5506  const typename crs_graph_type::padding_type& padding,
5507  const bool verbose)
5508  {
      // Apply the given padding to the unpacked CRS storage: copy the
      // graph's unpacked row pointers, build matching end-of-row
      // pointers, pass both to padCrsArrays together with the
      // column-index and value arrays, then store the row pointers
      // (and the per-row entry counts, if the graph keeps them) back
      // in myGraph_. With verbose set, progress goes to std::cerr.
5510  using Details::padCrsArrays;
5511  using std::endl;
5512  using LO = local_ordinal_type;
5513  using row_ptrs_type =
5514  typename local_graph_device_type::row_map_type::non_const_type;
5515  using range_policy =
5516  Kokkos::RangePolicy<execution_space, Kokkos::IndexType<LO>>;
5517  const char tfecfFuncName[] = "applyCrsPadding";
5518  const char suffix[] =
5519  ". Please report this bug to the Tpetra developers.";
5520  ProfilingRegion regionCAP("Tpetra::CrsMatrix::applyCrsPadding");
5521 
      // prefix is created only in verbose mode; every later use of
      // *prefix is inside an if (verbose) block.
5522  std::unique_ptr<std::string> prefix;
5523  if (verbose) {
5524  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
5525  std::ostringstream os;
5526  os << *prefix << "padding: ";
5527  padding.print(os);
5528  os << endl;
5529  std::cerr << os.str();
5530  }
      // The rank is looked up only in verbose mode; -1 stands for
      // "not verbose" or "no Map / communicator available".
5531  const int myRank = ! verbose ? -1 : [&] () {
5532  auto map = this->getMap();
5533  if (map.is_null()) {
5534  return -1;
5535  }
5536  auto comm = map->getComm();
5537  if (comm.is_null()) {
5538  return -1;
5539  }
5540  return comm->getRank();
5541  } ();
5542 
5543  // NOTE (mfh 29 Jan 2020) This allocates the values array.
5544  if (! myGraph_->indicesAreAllocated()) {
5545  if (verbose) {
      // NOTE(review): the message says allocateIndices, but the call
      // made below is allocateValues.
5546  std::ostringstream os;
5547  os << *prefix << "Call allocateIndices" << endl;
5548  std::cerr << os.str();
5549  }
5550  allocateValues(GlobalIndices, GraphNotYetAllocated, verbose);
5551  }
5552 
5553  // FIXME (mfh 10 Feb 2020) We shouldn't actually reallocate
5554  // row_ptrs_beg or allocate row_ptrs_end unless the allocation
5555  // size needs to increase. That should be the job of
5556  // padCrsArrays.
5557 
5558  // Making copies here because rowPtrsUnpacked_ has a const type. Otherwise, we
5559  // would use it directly.
5560 
5561  if (verbose) {
5562  std::ostringstream os;
5563  os << *prefix << "Allocate row_ptrs_beg: "
5564  << myGraph_->getRowPtrsUnpackedHost().extent(0) << endl;
5565  std::cerr << os.str();
5566  }
5567  using Kokkos::view_alloc;
5568  using Kokkos::WithoutInitializing;
5569  row_ptrs_type row_ptr_beg(view_alloc("row_ptr_beg", WithoutInitializing),
5570  myGraph_->rowPtrsUnpacked_dev_.extent(0));
5571  // DEEP_COPY REVIEW - DEVICE-TO-DEVICE
5572  Kokkos::deep_copy(execution_space(),row_ptr_beg, myGraph_->rowPtrsUnpacked_dev_);
5573 
      // N is one less than the length of row_ptr_beg, or zero when
      // row_ptr_beg is empty.
5574  const size_t N = row_ptr_beg.extent(0) == 0 ? size_t(0) :
5575  size_t(row_ptr_beg.extent(0) - 1);
5576  if (verbose) {
5577  std::ostringstream os;
5578  os << *prefix << "Allocate row_ptrs_end: " << N << endl;
5579  std::cerr << os.str();
5580  }
5581  row_ptrs_type row_ptr_end(
5582  view_alloc("row_ptr_end", WithoutInitializing), N);
5583 
5584  row_ptrs_type num_row_entries_d;
5585 
      // A nonempty k_numRowEntries_ is treated as unpacked storage,
      // in which each row's end is its begin plus its entry count.
5586  const bool refill_num_row_entries =
5587  myGraph_->k_numRowEntries_.extent(0) != 0;
5588 
5589  if (refill_num_row_entries) { // unpacked storage
5590  // We can't assume correct *this capture until C++17, and it's
5591  // likely more efficient just to capture what we need anyway.
5592  num_row_entries_d = create_mirror_view_and_copy(memory_space(),
5593  myGraph_->k_numRowEntries_);
5594  Kokkos::parallel_for
5595  ("Fill end row pointers", range_policy(0, N),
5596  KOKKOS_LAMBDA (const size_t i) {
5597  row_ptr_end(i) = row_ptr_beg(i) + num_row_entries_d(i);
5598  });
5599  }
5600  else {
5601  // FIXME (mfh 04 Feb 2020) Fix padCrsArrays so that if packed
5602  // storage, we don't need row_ptr_end to be separate allocation;
5603  // could just have it alias row_ptr_beg+1.
5604  Kokkos::parallel_for
5605  ("Fill end row pointers", range_policy(0, N),
5606  KOKKOS_LAMBDA (const size_t i) {
5607  row_ptr_end(i) = row_ptr_beg(i+1);
5608  });
5609  }
5610 
      // Pad whichever column-index array the graph currently uses
      // (global or local). Afterwards the value and column-index
      // arrays must have equal length; std::logic_error is the
      // exception type named in that check.
5611  if (myGraph_->isGloballyIndexed()) {
5612  padCrsArrays(row_ptr_beg, row_ptr_end,
5613  myGraph_->gblInds_wdv,
5614  valuesUnpacked_wdv, padding, myRank, verbose);
5615  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5616  const auto newColIndsLen = myGraph_->gblInds_wdv.extent(0);
5617  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5618  (newValuesLen != newColIndsLen, std::logic_error,
5619  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5620  << " != myGraph_->gblInds_wdv.extent(0)=" << newColIndsLen
5621  << suffix);
5622  }
5623  else {
5624  padCrsArrays(row_ptr_beg, row_ptr_end,
5625  myGraph_->lclIndsUnpacked_wdv,
5626  valuesUnpacked_wdv, padding, myRank, verbose);
5627  const auto newValuesLen = valuesUnpacked_wdv.extent(0);
5628  const auto newColIndsLen = myGraph_->lclIndsUnpacked_wdv.extent(0);
5629  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5630  (newValuesLen != newColIndsLen, std::logic_error,
5631  ": After padding, valuesUnpacked_wdv.extent(0)=" << newValuesLen
5632  << " != myGraph_->lclIndsUnpacked_wdv.extent(0)=" << newColIndsLen
5633  << suffix);
5634  }
5635 
      // Recompute the per-row entry counts from the begin/end
      // pointers and copy them back into the graph.
5636  if (refill_num_row_entries) {
5637  Kokkos::parallel_for
5638  ("Fill num entries", range_policy(0, N),
5639  KOKKOS_LAMBDA (const size_t i) {
5640  num_row_entries_d(i) = row_ptr_end(i) - row_ptr_beg(i);
5641  });
5642  Kokkos::deep_copy(myGraph_->k_numRowEntries_, num_row_entries_d);
5643  }
5644 
5645  if (verbose) {
5646  std::ostringstream os;
5647  os << *prefix << "Assign myGraph_->rowPtrsUnpacked_; "
5648  << "old size: " << myGraph_->rowPtrsUnpacked_host_.extent(0)
5649  << ", new size: " << row_ptr_beg.extent(0) << endl;
5650  std::cerr << os.str();
      // NOTE(review): this size assertion runs only in verbose mode.
5651  TEUCHOS_ASSERT( myGraph_->getRowPtrsUnpackedHost().extent(0) ==
5652  row_ptr_beg.extent(0) );
5653  }
5654  myGraph_->setRowPtrsUnpacked(row_ptr_beg);
5655  }
5656 
5657  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5658  void
5659  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5660  copyAndPermuteStaticGraph(
5661  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5662  const size_t numSameIDs,
5663  const LocalOrdinal permuteToLIDs[],
5664  const LocalOrdinal permuteFromLIDs[],
5665  const size_t numPermutes)
5666  {
5667  using Details::ProfilingRegion;
5668  using Teuchos::Array;
5669  using Teuchos::ArrayView;
5670  using std::endl;
5671  using LO = LocalOrdinal;
5672  using GO = GlobalOrdinal;
5673  const char tfecfFuncName[] = "copyAndPermuteStaticGraph";
5674  const char suffix[] =
5675  " Please report this bug to the Tpetra developers.";
5676  ProfilingRegion regionCAP
5677  ("Tpetra::CrsMatrix::copyAndPermuteStaticGraph");
5678 
5679  const bool debug = Details::Behavior::debug("CrsGraph");
5680  const bool verbose = Details::Behavior::verbose("CrsGraph");
5681  std::unique_ptr<std::string> prefix;
5682  if (verbose) {
5683  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5684  std::ostringstream os;
5685  os << *prefix << "Start" << endl;
5686  }
5687  const char* const prefix_raw =
5688  verbose ? prefix.get()->c_str() : nullptr;
5689 
5690  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5691  //
5692  // Copy the first numSame row from source to target (this matrix).
5693  // This involves copying rows corresponding to LIDs [0, numSame-1].
5694  //
5695  const map_type& srcRowMap = * (srcMat.getRowMap ());
5696  nonconst_global_inds_host_view_type rowInds;
5697  nonconst_values_host_view_type rowVals;
5698  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5699  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5700  // Global ID for the current row index in the source matrix.
5701  // The first numSameIDs GIDs in the two input lists are the
5702  // same, so sourceGID == targetGID in this case.
5703  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5704  const GO targetGID = sourceGID;
5705 
5706  ArrayView<const GO>rowIndsConstView;
5707  ArrayView<const Scalar> rowValsConstView;
5708 
5709  if (sourceIsLocallyIndexed) {
5710  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5711  if (rowLength > static_cast<size_t> (rowInds.size())) {
5712  Kokkos::resize(rowInds,rowLength);
5713  Kokkos::resize(rowVals,rowLength);
5714  }
5715  // Resizing invalidates an Array's views, so we must make new
5716  // ones, even if rowLength hasn't changed.
5717  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5718  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5719 
5720  // The source matrix is locally indexed, so we have to get a
5721  // copy. Really it's the GIDs that have to be copied (because
5722  // they have to be converted from LIDs).
5723  size_t checkRowLength = 0;
5724  srcMat.getGlobalRowCopy (sourceGID, rowIndsView,
5725  rowValsView, checkRowLength);
5726  if (debug) {
5727  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5728  (rowLength != checkRowLength, std::logic_error, "For "
5729  "global row index " << sourceGID << ", the source "
5730  "matrix's getNumEntriesInGlobalRow returns a row length "
5731  "of " << rowLength << ", but getGlobalRowCopy reports "
5732  "a row length of " << checkRowLength << "." << suffix);
5733  }
5734 
5735  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5736  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5737  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5738  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5739  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5740  rowIndsView.data(), rowIndsView.extent(0),
5741  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5742  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5743  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5744  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5745  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5746  // KDDKDD UVM TEMPORARY: KokkosView interface
5747  }
5748  else { // source matrix is globally indexed.
5749  global_inds_host_view_type rowIndsView;
5750  values_host_view_type rowValsView;
5751  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5752  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5753  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5754  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5755  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5756  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5757  rowIndsView.data(), rowIndsView.extent(0),
5758  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5759  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5760  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5761  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5762  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5763  // KDDKDD UVM TEMPORARY: KokkosView interface
5764 
5765  }
5766 
5767  // Applying a permutation to a matrix with a static graph
5768  // means REPLACE-ing entries.
5769  combineGlobalValues(targetGID, rowIndsConstView,
5770  rowValsConstView, REPLACE,
5771  prefix_raw, debug, verbose);
5772  }
5773 
5774  if (verbose) {
5775  std::ostringstream os;
5776  os << *prefix << "Do permutes" << endl;
5777  }
5778 
5779  const map_type& tgtRowMap = * (this->getRowMap ());
5780  for (size_t p = 0; p < numPermutes; ++p) {
5781  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5782  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5783 
5784  ArrayView<const GO> rowIndsConstView;
5785  ArrayView<const Scalar> rowValsConstView;
5786 
5787  if (sourceIsLocallyIndexed) {
5788  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5789  if (rowLength > static_cast<size_t> (rowInds.size ())) {
5790  Kokkos::resize(rowInds,rowLength);
5791  Kokkos::resize(rowVals,rowLength);
5792  }
5793  // Resizing invalidates an Array's views, so we must make new
5794  // ones, even if rowLength hasn't changed.
5795  nonconst_global_inds_host_view_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5796  nonconst_values_host_view_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5797 
5798  // The source matrix is locally indexed, so we have to get a
5799  // copy. Really it's the GIDs that have to be copied (because
5800  // they have to be converted from LIDs).
5801  size_t checkRowLength = 0;
5802  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
5803  rowValsView, checkRowLength);
5804  if (debug) {
5805  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5806  (rowLength != checkRowLength, std::logic_error, "For "
5807  "source matrix global row index " << sourceGID << ", "
5808  "getNumEntriesInGlobalRow returns a row length of " <<
5809  rowLength << ", but getGlobalRowCopy a row length of "
5810  << checkRowLength << "." << suffix);
5811  }
5812 
5813  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5814  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5815  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5816  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5817  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5818  rowIndsView.data(), rowIndsView.extent(0),
5819  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5820  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5821  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5822  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5823  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5824  // KDDKDD UVM TEMPORARY: KokkosView interface
5825  }
5826  else {
5827  global_inds_host_view_type rowIndsView;
5828  values_host_view_type rowValsView;
5829  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5830  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5831  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5832  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5833  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5834  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5835  rowIndsView.data(), rowIndsView.extent(0),
5836  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5837  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5838  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5839  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5840  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5841  // KDDKDD UVM TEMPORARY: KokkosView interface
5842  }
5843 
5844  combineGlobalValues(targetGID, rowIndsConstView,
5845  rowValsConstView, REPLACE,
5846  prefix_raw, debug, verbose);
5847  }
5848 
5849  if (verbose) {
5850  std::ostringstream os;
5851  os << *prefix << "Done" << endl;
5852  }
5853  }
5854 
5855  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
5856  void
5857  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
5858  copyAndPermuteNonStaticGraph(
5859  const RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>& srcMat,
5860  const size_t numSameIDs,
5861  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs_dv,
5862  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs_dv,
5863  const size_t numPermutes)
5864  {
5865  using Details::ProfilingRegion;
5866  using Teuchos::Array;
5867  using Teuchos::ArrayView;
5868  using std::endl;
5869  using LO = LocalOrdinal;
5870  using GO = GlobalOrdinal;
5871  const char tfecfFuncName[] = "copyAndPermuteNonStaticGraph";
5872  const char suffix[] =
5873  " Please report this bug to the Tpetra developers.";
5874  ProfilingRegion regionCAP
5875  ("Tpetra::CrsMatrix::copyAndPermuteNonStaticGraph");
5876 
5877  const bool debug = Details::Behavior::debug("CrsGraph");
5878  const bool verbose = Details::Behavior::verbose("CrsGraph");
5879  std::unique_ptr<std::string> prefix;
5880  if (verbose) {
5881  prefix = this->createPrefix("CrsGraph", tfecfFuncName);
5882  std::ostringstream os;
5883  os << *prefix << "Start" << endl;
5884  }
5885  const char* const prefix_raw =
5886  verbose ? prefix.get()->c_str() : nullptr;
5887 
5888  {
5889  using row_graph_type = RowGraph<LO, GO, Node>;
5890  const row_graph_type& srcGraph = *(srcMat.getGraph());
5891  auto padding =
5892  myGraph_->computeCrsPadding(srcGraph, numSameIDs,
5893  permuteToLIDs_dv, permuteFromLIDs_dv, verbose);
5894  applyCrsPadding(*padding, verbose);
5895  }
5896  const bool sourceIsLocallyIndexed = srcMat.isLocallyIndexed ();
5897  //
5898  // Copy the first numSame row from source to target (this matrix).
5899  // This involves copying rows corresponding to LIDs [0, numSame-1].
5900  //
5901  const map_type& srcRowMap = * (srcMat.getRowMap ());
5902  const LO numSameIDs_as_LID = static_cast<LO> (numSameIDs);
5903  using gids_type = nonconst_global_inds_host_view_type;
5904  using vals_type = nonconst_values_host_view_type;
5905  gids_type rowInds;
5906  vals_type rowVals;
5907  for (LO sourceLID = 0; sourceLID < numSameIDs_as_LID; ++sourceLID) {
5908  // Global ID for the current row index in the source matrix.
5909  // The first numSameIDs GIDs in the two input lists are the
5910  // same, so sourceGID == targetGID in this case.
5911  const GO sourceGID = srcRowMap.getGlobalElement (sourceLID);
5912  const GO targetGID = sourceGID;
5913 
5914  ArrayView<const GO> rowIndsConstView;
5915  ArrayView<const Scalar> rowValsConstView;
5916 
5917  if (sourceIsLocallyIndexed) {
5918 
5919  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5920  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5921  Kokkos::resize(rowInds,rowLength);
5922  Kokkos::resize(rowVals,rowLength);
5923  }
5924  // Resizing invalidates an Array's views, so we must make new
5925  // ones, even if rowLength hasn't changed.
5926  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5927  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5928 
5929  // The source matrix is locally indexed, so we have to get a
5930  // copy. Really it's the GIDs that have to be copied (because
5931  // they have to be converted from LIDs).
5932  size_t checkRowLength = 0;
5933  srcMat.getGlobalRowCopy (sourceGID, rowIndsView, rowValsView,
5934  checkRowLength);
5935  if (debug) {
5936  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
5937  (rowLength != checkRowLength, std::logic_error, ": For "
5938  "global row index " << sourceGID << ", the source "
5939  "matrix's getNumEntriesInGlobalRow returns a row length "
5940  "of " << rowLength << ", but getGlobalRowCopy reports "
5941  "a row length of " << checkRowLength << "." << suffix);
5942  }
5943  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
5944  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
5945  }
5946  else { // source matrix is globally indexed.
5947  global_inds_host_view_type rowIndsView;
5948  values_host_view_type rowValsView;
5949  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
5950 
5951  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
5952  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
5953  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
5954  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
5955  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
5956  rowIndsView.data(), rowIndsView.extent(0),
5957  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5958  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
5959  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
5960  Teuchos::RCP_DISABLE_NODE_LOOKUP);
5961  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
5962  // KDDKDD UVM TEMPORARY: KokkosView interface
5963  }
5964 
5965  // Combine the data into the target matrix.
5966  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
5967  rowValsConstView, prefix_raw, debug, verbose);
5968  }
5969 
5970  if (verbose) {
5971  std::ostringstream os;
5972  os << *prefix << "Do permutes" << endl;
5973  }
5974  const LO* const permuteFromLIDs = permuteFromLIDs_dv.view_host().data();
5975  const LO* const permuteToLIDs = permuteToLIDs_dv.view_host().data();
5976 
5977  const map_type& tgtRowMap = * (this->getRowMap ());
5978  for (size_t p = 0; p < numPermutes; ++p) {
5979  const GO sourceGID = srcRowMap.getGlobalElement (permuteFromLIDs[p]);
5980  const GO targetGID = tgtRowMap.getGlobalElement (permuteToLIDs[p]);
5981 
5982  ArrayView<const GO> rowIndsConstView;
5983  ArrayView<const Scalar> rowValsConstView;
5984 
5985  if (sourceIsLocallyIndexed) {
5986  const size_t rowLength = srcMat.getNumEntriesInGlobalRow (sourceGID);
5987  if (rowLength > static_cast<size_t> (rowInds.extent(0))) {
5988  Kokkos::resize(rowInds,rowLength);
5989  Kokkos::resize(rowVals,rowLength);
5990  }
5991  // Resizing invalidates an Array's views, so we must make new
5992  // ones, even if rowLength hasn't changed.
5993  gids_type rowIndsView = Kokkos::subview(rowInds,std::make_pair((size_t)0, rowLength));
5994  vals_type rowValsView = Kokkos::subview(rowVals,std::make_pair((size_t)0, rowLength));
5995 
5996  // The source matrix is locally indexed, so we have to get a
5997  // copy. Really it's the GIDs that have to be copied (because
5998  // they have to be converted from LIDs).
5999  size_t checkRowLength = 0;
6000  srcMat.getGlobalRowCopy(sourceGID, rowIndsView,
6001  rowValsView, checkRowLength);
6002  if (debug) {
6003  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6004  (rowLength != checkRowLength, std::logic_error, "For "
6005  "source matrix global row index " << sourceGID << ", "
6006  "getNumEntriesInGlobalRow returns a row length of " <<
6007  rowLength << ", but getGlobalRowCopy a row length of "
6008  << checkRowLength << "." << suffix);
6009  }
6010  rowIndsConstView = Teuchos::ArrayView<const GO>(rowIndsView.data(), rowLength);
6011  rowValsConstView = Teuchos::ArrayView<const Scalar>(reinterpret_cast<Scalar *>(rowValsView.data()), rowLength);
6012  }
6013  else {
6014  global_inds_host_view_type rowIndsView;
6015  values_host_view_type rowValsView;
6016  srcMat.getGlobalRowView(sourceGID, rowIndsView, rowValsView);
6017 
6018  // KDDKDD UVM TEMPORARY: refactor combineGlobalValues to take
6019  // KDDKDD UVM TEMPORARY: Kokkos::View instead of ArrayView
6020  // KDDKDD UVM TEMPORARY: For now, wrap the view in ArrayViews
6021  // KDDKDD UVM TEMPORARY: Should be safe because we hold the KokkosViews
6022  rowIndsConstView = Teuchos::ArrayView<const GO> ( // BAD BAD BAD
6023  rowIndsView.data(), rowIndsView.extent(0),
6024  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6025  rowValsConstView = Teuchos::ArrayView<const Scalar> ( // BAD BAD BAD
6026  reinterpret_cast<const Scalar*>(rowValsView.data()), rowValsView.extent(0),
6027  Teuchos::RCP_DISABLE_NODE_LOOKUP);
6028  // KDDKDD UVM TEMPORARY: Add replace, sum, transform methods with
6029  // KDDKDD UVM TEMPORARY: KokkosView interface
6030  }
6031 
6032  // Combine the data into the target matrix.
6033  insertGlobalValuesFilteredChecked(targetGID, rowIndsConstView,
6034  rowValsConstView, prefix_raw, debug, verbose);
6035  }
6036 
6037  if (verbose) {
6038  std::ostringstream os;
6039  os << *prefix << "Done" << endl;
6040  }
6041  }
6042 
6043  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6044  void
6047  const SrcDistObject& srcObj,
6048  const size_t numSameIDs,
6049  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteToLIDs,
6050  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& permuteFromLIDs,
6051  const CombineMode /*CM*/)
6052  {
6053  using Details::Behavior;
6056  using std::endl;
6057 
6058  // Method name string for TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC.
6059  const char tfecfFuncName[] = "copyAndPermute: ";
6060  ProfilingRegion regionCAP("Tpetra::CrsMatrix::copyAndPermute");
6061 
6062  const bool verbose = Behavior::verbose("CrsMatrix");
6063  std::unique_ptr<std::string> prefix;
6064  if (verbose) {
6065  prefix = this->createPrefix("CrsMatrix", "copyAndPermute");
6066  std::ostringstream os;
6067  os << *prefix << endl
6068  << *prefix << " numSameIDs: " << numSameIDs << endl
6069  << *prefix << " numPermute: " << permuteToLIDs.extent(0)
6070  << endl
6071  << *prefix << " "
6072  << dualViewStatusToString (permuteToLIDs, "permuteToLIDs")
6073  << endl
6074  << *prefix << " "
6075  << dualViewStatusToString (permuteFromLIDs, "permuteFromLIDs")
6076  << endl
6077  << *prefix << " "
6078  << "isStaticGraph: " << (isStaticGraph() ? "true" : "false")
6079  << endl;
6080  std::cerr << os.str ();
6081  }
6082 
6083  const auto numPermute = permuteToLIDs.extent (0);
6084  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6085  (numPermute != permuteFromLIDs.extent (0),
6086  std::invalid_argument, "permuteToLIDs.extent(0) = "
6087  << numPermute << "!= permuteFromLIDs.extent(0) = "
6088  << permuteFromLIDs.extent (0) << ".");
6089 
6090  // This dynamic cast should succeed, because we've already tested
6091  // it in checkSizes().
6093  const RMT& srcMat = dynamic_cast<const RMT&> (srcObj);
6094  if (isStaticGraph ()) {
6095  TEUCHOS_ASSERT( ! permuteToLIDs.need_sync_host () );
6096  auto permuteToLIDs_h = permuteToLIDs.view_host ();
6097  TEUCHOS_ASSERT( ! permuteFromLIDs.need_sync_host () );
6098  auto permuteFromLIDs_h = permuteFromLIDs.view_host ();
6099 
6100  copyAndPermuteStaticGraph(srcMat, numSameIDs,
6101  permuteToLIDs_h.data(),
6102  permuteFromLIDs_h.data(),
6103  numPermute);
6104  }
6105  else {
6106  copyAndPermuteNonStaticGraph(srcMat, numSameIDs, permuteToLIDs,
6107  permuteFromLIDs, numPermute);
6108  }
6109 
6110  if (verbose) {
6111  std::ostringstream os;
6112  os << *prefix << "Done" << endl;
6113  std::cerr << os.str();
6114  }
6115  }
6116 
6117  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6118  void
6121  (const SrcDistObject& source,
6122  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6123  Kokkos::DualView<char*, buffer_device_type>& exports,
6124  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6125  size_t& constantNumPackets)
6126  {
6127  using Details::Behavior;
6130  using Teuchos::outArg;
6131  using Teuchos::REDUCE_MAX;
6132  using Teuchos::reduceAll;
6133  using std::endl;
6134  typedef LocalOrdinal LO;
6135  typedef GlobalOrdinal GO;
6136  const char tfecfFuncName[] = "packAndPrepare: ";
6137  ProfilingRegion regionPAP ("Tpetra::CrsMatrix::packAndPrepare");
6138 
6139  const bool debug = Behavior::debug("CrsMatrix");
6140  const bool verbose = Behavior::verbose("CrsMatrix");
6141 
6142  // Processes on which the communicator is null should not participate.
6143  Teuchos::RCP<const Teuchos::Comm<int> > pComm = this->getComm ();
6144  if (pComm.is_null ()) {
6145  return;
6146  }
6147  const Teuchos::Comm<int>& comm = *pComm;
6148  const int myRank = comm.getSize ();
6149 
6150  std::unique_ptr<std::string> prefix;
6151  if (verbose) {
6152  prefix = this->createPrefix("CrsMatrix", "packAndPrepare");
6153  std::ostringstream os;
6154  os << *prefix << "Start" << endl
6155  << *prefix << " "
6156  << dualViewStatusToString (exportLIDs, "exportLIDs")
6157  << endl
6158  << *prefix << " "
6159  << dualViewStatusToString (exports, "exports")
6160  << endl
6161  << *prefix << " "
6162  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6163  << endl;
6164  std::cerr << os.str ();
6165  }
6166 
6167  // Attempt to cast the source object to CrsMatrix. If successful,
6168  // use the source object's packNew() method to pack its data for
6169  // communication. Otherwise, attempt to cast to RowMatrix; if
6170  // successful, use the source object's pack() method. Otherwise,
6171  // the source object doesn't have the right type.
6172  //
6173  // FIXME (mfh 30 Jun 2013, 11 Sep 2017) We don't even need the
6174  // RowMatrix to have the same Node type. Unfortunately, we don't
6175  // have a way to ask if the RowMatrix is "a RowMatrix with any
6176  // Node type," since RowMatrix doesn't have a base class. A
6177  // hypothetical RowMatrixBase<Scalar, LO, GO> class, which does
6178  // not currently exist, would satisfy this requirement.
6179  //
6180  // Why RowMatrixBase<Scalar, LO, GO>? The source object's Scalar
6181  // type doesn't technically need to match the target object's
6182  // Scalar type, so we could just have RowMatrixBase<LO, GO>. LO
6183  // and GO need not be the same, as long as there is no overflow of
6184  // the indices. However, checking for index overflow is global
6185  // and therefore undesirable.
6186 
6187  std::ostringstream msg; // for collecting error messages
6188  int lclBad = 0; // to be set below
6189 
6190  using crs_matrix_type = CrsMatrix<Scalar, LO, GO, Node>;
6191  const crs_matrix_type* srcCrsMat =
6192  dynamic_cast<const crs_matrix_type*> (&source);
6193  if (srcCrsMat != nullptr) {
6194  if (verbose) {
6195  std::ostringstream os;
6196  os << *prefix << "Source matrix same (CrsMatrix) type as target; "
6197  "calling packNew" << endl;
6198  std::cerr << os.str ();
6199  }
6200  try {
6201  srcCrsMat->packNew (exportLIDs, exports, numPacketsPerLID,
6202  constantNumPackets);
6203  }
6204  catch (std::exception& e) {
6205  lclBad = 1;
6206  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6207  }
6208  }
6209  else {
6210  using Kokkos::HostSpace;
6211  using Kokkos::subview;
6212  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6213  using range_type = Kokkos::pair<size_t, size_t>;
6214 
6215  if (verbose) {
6216  std::ostringstream os;
6217  os << *prefix << "Source matrix NOT same (CrsMatrix) type as target"
6218  << endl;
6219  std::cerr << os.str ();
6220  }
6221 
6222  const row_matrix_type* srcRowMat =
6223  dynamic_cast<const row_matrix_type*> (&source);
6224  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6225  (srcRowMat == nullptr, std::invalid_argument,
6226  "The source object of the Import or Export operation is neither a "
6227  "CrsMatrix (with the same template parameters as the target object), "
6228  "nor a RowMatrix (with the same first four template parameters as the "
6229  "target object).");
6230 
6231  // For the RowMatrix case, we need to convert from
6232  // Kokkos::DualView to Teuchos::Array*. This doesn't need to be
6233  // so terribly efficient, since packing a non-CrsMatrix
6234  // RowMatrix for Import/Export into a CrsMatrix is not a
6235  // critical case. Thus, we may allocate Teuchos::Array objects
6236  // here and copy to and from Kokkos::*View.
6237 
6238  // View exportLIDs's host data as a Teuchos::ArrayView.
6239  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6240  auto exportLIDs_h = exportLIDs.view_host ();
6241  Teuchos::ArrayView<const LO> exportLIDs_av (exportLIDs_h.data (),
6242  exportLIDs_h.size ());
6243 
6244  // pack() will allocate exports_a as needed. We'll copy back
6245  // into exports (after (re)allocating exports if needed) below.
6246  Teuchos::Array<char> exports_a;
6247 
6248  // View exportLIDs' host data as a Teuchos::ArrayView. We don't
6249  // need to sync, since we're doing write-only access, but we do
6250  // need to mark the DualView as modified on host.
6251 
6252  numPacketsPerLID.clear_sync_state (); // write-only access
6253  numPacketsPerLID.modify_host ();
6254  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6255  Teuchos::ArrayView<size_t> numPacketsPerLID_av (numPacketsPerLID_h.data (),
6256  numPacketsPerLID_h.size ());
6257 
6258  // Invoke RowMatrix's legacy pack() interface, using above
6259  // Teuchos::Array* objects.
6260  try {
6261  srcRowMat->pack (exportLIDs_av, exports_a, numPacketsPerLID_av,
6262  constantNumPackets);
6263  }
6264  catch (std::exception& e) {
6265  lclBad = 1;
6266  msg << "Proc " << myRank << ": " << e.what () << std::endl;
6267  }
6268 
6269  // Allocate 'exports', and copy exports_a back into it.
6270  const size_t newAllocSize = static_cast<size_t> (exports_a.size ());
6271  if (static_cast<size_t> (exports.extent (0)) < newAllocSize) {
6272  const std::string oldLabel = exports.d_view.label ();
6273  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6274  exports = exports_type (newLabel, newAllocSize);
6275  }
6276  // It's safe to assume that we're working on host anyway, so
6277  // just keep exports sync'd to host.
6278  // ignore current device contents
6279  exports.modify_host();
6280 
6281  auto exports_h = exports.view_host ();
6282  auto exports_h_sub = subview (exports_h, range_type (0, newAllocSize));
6283 
6284  // Kokkos::deep_copy needs a Kokkos::View input, so turn
6285  // exports_a into a nonowning Kokkos::View first before copying.
6286  typedef typename exports_type::t_host::execution_space HES;
6287  typedef Kokkos::Device<HES, HostSpace> host_device_type;
6288  Kokkos::View<const char*, host_device_type>
6289  exports_a_kv (exports_a.getRawPtr (), newAllocSize);
6290  // DEEP_COPY REVIEW - NOT TESTED
6291  Kokkos::deep_copy (exports_h_sub, exports_a_kv);
6292  }
6293 
6294  if (debug) {
6295  int gblBad = 0; // output argument; to be set below
6296  reduceAll<int, int> (comm, REDUCE_MAX, lclBad, outArg (gblBad));
6297  if (gblBad != 0) {
6298  Tpetra::Details::gathervPrint (std::cerr, msg.str (), comm);
6299  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6300  (true, std::logic_error, "packNew() or pack() threw an exception on "
6301  "one or more participating processes.");
6302  }
6303  }
6304  else {
6305  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6306  (lclBad != 0, std::logic_error, "packNew threw an exception on one "
6307  "or more participating processes. Here is this process' error "
6308  "message: " << msg.str ());
6309  }
6310 
6311  if (verbose) {
6312  std::ostringstream os;
6313  os << *prefix << "packAndPrepare: Done!" << endl
6314  << *prefix << " "
6315  << dualViewStatusToString (exportLIDs, "exportLIDs")
6316  << endl
6317  << *prefix << " "
6318  << dualViewStatusToString (exports, "exports")
6319  << endl
6320  << *prefix << " "
6321  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6322  << endl;
6323  std::cerr << os.str ();
6324  }
6325  }
6326 
6327  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6328  size_t
6329  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6330  packRow (char exports[],
6331  const size_t offset,
6332  const size_t numEnt,
6333  const GlobalOrdinal gidsIn[],
6334  const impl_scalar_type valsIn[],
6335  const size_t numBytesPerValue) const
6336  {
6337  using Kokkos::View;
6338  using Kokkos::subview;
6340  typedef LocalOrdinal LO;
6341  typedef GlobalOrdinal GO;
6342  typedef impl_scalar_type ST;
6343 
6344  if (numEnt == 0) {
6345  // Empty rows always take zero bytes, to ensure sparsity.
6346  return 0;
6347  }
6348 
6349  const GO gid = 0; // packValueCount wants this
6350  const LO numEntLO = static_cast<size_t> (numEnt);
6351 
6352  const size_t numEntBeg = offset;
6353  const size_t numEntLen = PackTraits<LO>::packValueCount (numEntLO);
6354  const size_t gidsBeg = numEntBeg + numEntLen;
6355  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6356  const size_t valsBeg = gidsBeg + gidsLen;
6357  const size_t valsLen = numEnt * numBytesPerValue;
6358 
6359  char* const numEntOut = exports + numEntBeg;
6360  char* const gidsOut = exports + gidsBeg;
6361  char* const valsOut = exports + valsBeg;
6362 
6363  size_t numBytesOut = 0;
6364  int errorCode = 0;
6365  numBytesOut += PackTraits<LO>::packValue (numEntOut, numEntLO);
6366 
6367  {
6368  Kokkos::pair<int, size_t> p;
6369  p = PackTraits<GO>::packArray (gidsOut, gidsIn, numEnt);
6370  errorCode += p.first;
6371  numBytesOut += p.second;
6372 
6373  p = PackTraits<ST>::packArray (valsOut, valsIn, numEnt);
6374  errorCode += p.first;
6375  numBytesOut += p.second;
6376  }
6377 
6378  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6379  TEUCHOS_TEST_FOR_EXCEPTION
6380  (numBytesOut != expectedNumBytes, std::logic_error, "packRow: "
6381  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6382  << expectedNumBytes << ".");
6383  TEUCHOS_TEST_FOR_EXCEPTION
6384  (errorCode != 0, std::runtime_error, "packRow: "
6385  "PackTraits::packArray returned a nonzero error code");
6386 
6387  return numBytesOut;
6388  }
6389 
6390  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6391  size_t
6392  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6393  unpackRow (GlobalOrdinal gidsOut[],
6394  impl_scalar_type valsOut[],
6395  const char imports[],
6396  const size_t offset,
6397  const size_t numBytes,
6398  const size_t numEnt,
6399  const size_t numBytesPerValue)
6400  {
6401  using Kokkos::View;
6402  using Kokkos::subview;
6404  typedef LocalOrdinal LO;
6405  typedef GlobalOrdinal GO;
6406  typedef impl_scalar_type ST;
6407 
6408  Details::ProfilingRegion region_upack_row(
6409  "Tpetra::CrsMatrix::unpackRow",
6410  "Import/Export"
6411  );
6412 
6413  if (numBytes == 0) {
6414  // Rows with zero bytes should always have zero entries.
6415  if (numEnt != 0) {
6416  const int myRank = this->getMap ()->getComm ()->getRank ();
6417  TEUCHOS_TEST_FOR_EXCEPTION
6418  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6419  "unpackRow: The number of bytes to unpack numBytes=0, but the "
6420  "number of entries to unpack (as reported by numPacketsPerLID) "
6421  "for this row numEnt=" << numEnt << " != 0.");
6422  }
6423  return 0;
6424  }
6425 
6426  if (numEnt == 0 && numBytes != 0) {
6427  const int myRank = this->getMap ()->getComm ()->getRank ();
6428  TEUCHOS_TEST_FOR_EXCEPTION
6429  (true, std::logic_error, "(Proc " << myRank << ") CrsMatrix::"
6430  "unpackRow: The number of entries to unpack (as reported by "
6431  "numPacketsPerLID) numEnt=0, but the number of bytes to unpack "
6432  "numBytes=" << numBytes << " != 0.");
6433  }
6434 
6435  const GO gid = 0; // packValueCount wants this
6436  const LO lid = 0; // packValueCount wants this
6437 
6438  const size_t numEntBeg = offset;
6439  const size_t numEntLen = PackTraits<LO>::packValueCount (lid);
6440  const size_t gidsBeg = numEntBeg + numEntLen;
6441  const size_t gidsLen = numEnt * PackTraits<GO>::packValueCount (gid);
6442  const size_t valsBeg = gidsBeg + gidsLen;
6443  const size_t valsLen = numEnt * numBytesPerValue;
6444 
6445  const char* const numEntIn = imports + numEntBeg;
6446  const char* const gidsIn = imports + gidsBeg;
6447  const char* const valsIn = imports + valsBeg;
6448 
6449  size_t numBytesOut = 0;
6450  int errorCode = 0;
6451  LO numEntOut;
6452  numBytesOut += PackTraits<LO>::unpackValue (numEntOut, numEntIn);
6453  if (static_cast<size_t> (numEntOut) != numEnt ||
6454  numEntOut == static_cast<LO> (0)) {
6455  const int myRank = this->getMap ()->getComm ()->getRank ();
6456  std::ostringstream os;
6457  os << "(Proc " << myRank << ") CrsMatrix::unpackRow: ";
6458  bool firstErrorCondition = false;
6459  if (static_cast<size_t> (numEntOut) != numEnt) {
6460  os << "Number of entries from numPacketsPerLID numEnt=" << numEnt
6461  << " does not equal number of entries unpacked from imports "
6462  "buffer numEntOut=" << numEntOut << ".";
6463  firstErrorCondition = true;
6464  }
6465  if (numEntOut == static_cast<LO> (0)) {
6466  if (firstErrorCondition) {
6467  os << " Also, ";
6468  }
6469  os << "Number of entries unpacked from imports buffer numEntOut=0, "
6470  "but number of bytes to unpack for this row numBytes=" << numBytes
6471  << " != 0. This should never happen, since packRow should only "
6472  "ever pack rows with a nonzero number of entries. In this case, "
6473  "the number of entries from numPacketsPerLID is numEnt=" << numEnt
6474  << ".";
6475  }
6476  TEUCHOS_TEST_FOR_EXCEPTION(true, std::logic_error, os.str ());
6477  }
6478 
6479  {
6480  Kokkos::pair<int, size_t> p;
6481  p = PackTraits<GO>::unpackArray (gidsOut, gidsIn, numEnt);
6482  errorCode += p.first;
6483  numBytesOut += p.second;
6484 
6485  p = PackTraits<ST>::unpackArray (valsOut, valsIn, numEnt);
6486  errorCode += p.first;
6487  numBytesOut += p.second;
6488  }
6489 
6490  TEUCHOS_TEST_FOR_EXCEPTION
6491  (numBytesOut != numBytes, std::logic_error, "unpackRow: numBytesOut = "
6492  << numBytesOut << " != numBytes = " << numBytes << ".");
6493 
6494  const size_t expectedNumBytes = numEntLen + gidsLen + valsLen;
6495  TEUCHOS_TEST_FOR_EXCEPTION
6496  (numBytesOut != expectedNumBytes, std::logic_error, "unpackRow: "
6497  "numBytesOut = " << numBytesOut << " != expectedNumBytes = "
6498  << expectedNumBytes << ".");
6499 
6500  TEUCHOS_TEST_FOR_EXCEPTION
6501  (errorCode != 0, std::runtime_error, "unpackRow: "
6502  "PackTraits::unpackArray returned a nonzero error code");
6503 
6504  return numBytesOut;
6505  }
6506 
// Count-and-allocate step of packing on host: totals the number of matrix
// entries in the rows listed in exportLIDs and makes sure the byte buffer
// 'exports' is at least large enough to hold their packed form.
//
// exports         [in/out] Pack buffer. It is replaced by a newly constructed
//                 DualView only when its current extent is smaller than the
//                 computed size; the old device-view label is reused, or
//                 "exports" if that label was empty.
// totalNumEntries [out] Sum over exportLIDs of getNumEntriesInLocalRow(),
//                 where a row reporting the invalid sentinel counts as zero.
// exportLIDs      [in] Local row indices to pack. Must not need a host sync
//                 (checked with TEUCHOS_ASSERT).
6507  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6508  void
6509  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6510  allocatePackSpaceNew (Kokkos::DualView<char*, buffer_device_type>& exports,
6511  size_t& totalNumEntries,
6512  const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs) const
6513  {
6514  using Details::Behavior;
      // NOTE(review): this listing skips line 6515; dualViewStatusToString,
      // used below, is presumably brought into scope there -- confirm
      // against the original file.
6516  using std::endl;
6517  typedef impl_scalar_type IST;
6518  typedef LocalOrdinal LO;
6519  typedef GlobalOrdinal GO;
6520  //const char tfecfFuncName[] = "allocatePackSpaceNew: ";
6521 
6522  // mfh 18 Oct 2017: Set TPETRA_VERBOSE to true for copious debug
6523  // output to std::cerr on every MPI process. This is unwise for
6524  // runs with large numbers of MPI processes.
6525  const bool verbose = Behavior::verbose("CrsMatrix");
6526  std::unique_ptr<std::string> prefix;
6527  if (verbose) {
6528  prefix = this->createPrefix("CrsMatrix", "allocatePackSpaceNew");
6529  std::ostringstream os;
6530  os << *prefix << "Before:"
6531  << endl
6532  << *prefix << " "
6533  << dualViewStatusToString (exports, "exports")
6534  << endl
6535  << *prefix << " "
6536  << dualViewStatusToString (exportLIDs, "exportLIDs")
6537  << endl;
      // Build the whole message first and emit it with a single write.
6538  std::cerr << os.str ();
6539  }
6540 
6541  // The number of export LIDs must fit in LocalOrdinal, assuming
6542  // that the LIDs are distinct and valid on the calling process.
6543  const LO numExportLIDs = static_cast<LO> (exportLIDs.extent (0));
6544 
      // The LIDs are read through the host view below, so the host copy
      // must already be current.
6545  TEUCHOS_ASSERT( ! exportLIDs.need_sync_host () );
6546  auto exportLIDs_h = exportLIDs.view_host ();
6547 
6548  // Count the total number of matrix entries to send.
6549  totalNumEntries = 0;
6550  for (LO i = 0; i < numExportLIDs; ++i) {
6551  const LO lclRow = exportLIDs_h[i];
6552  size_t curNumEntries = this->getNumEntriesInLocalRow (lclRow);
6553  // FIXME (mfh 25 Jan 2015) We should actually report invalid row
6554  // indices as an error. Just consider them nonowned for now.
6555  if (curNumEntries == Teuchos::OrdinalTraits<size_t>::invalid ()) {
6556  curNumEntries = 0;
6557  }
6558  totalNumEntries += curNumEntries;
6559  }
6560 
6561  // FIXME (mfh 24 Feb 2013, 24 Mar 2017) This code is only correct
6562  // if sizeof(IST) is a meaningful representation of the amount of
6563  // data in a Scalar instance. (LO and GO are always built-in
6564  // integer types.)
6565  //
6566  // Allocate the exports array. It does NOT need padding for
6567  // alignment, since we use memcpy to write to / read from send /
6568  // receive buffers.
      //
      // Size in bytes: one LO per exported row (presumably the per-row
      // entry count that unpackRow reads back first -- confirm against
      // packRow), plus one GO and one IST per matrix entry.
6569  const size_t allocSize =
6570  static_cast<size_t> (numExportLIDs) * sizeof (LO) +
6571  totalNumEntries * (sizeof (IST) + sizeof (GO));
      // Only grow the buffer; an existing buffer that is already large
      // enough is left untouched.
6572  if (static_cast<size_t> (exports.extent (0)) < allocSize) {
6573  using exports_type = Kokkos::DualView<char*, buffer_device_type>;
6574 
6575  const std::string oldLabel = exports.d_view.label ();
6576  const std::string newLabel = (oldLabel == "") ? "exports" : oldLabel;
6577  exports = exports_type (newLabel, allocSize);
6578  }
6579 
6580  if (verbose) {
6581  std::ostringstream os;
6582  os << *prefix << "After:"
6583  << endl
6584  << *prefix << " "
6585  << dualViewStatusToString (exports, "exports")
6586  << endl
6587  << *prefix << " "
6588  << dualViewStatusToString (exportLIDs, "exportLIDs")
6589  << endl;
6590  std::cerr << os.str ();
6591  }
6592  }
6593 
// Pack the rows listed in exportLIDs into the byte buffer 'exports',
// choosing the implementation by whether the matrix has a static graph:
// a static graph goes through packCrsMatrixNew, anything else through
// this->packNonStaticNew.  All four arguments are forwarded unchanged
// (packCrsMatrixNew takes them in a different order).
//
// NOTE(review): this listing skips lines 6596 and 6605 -- presumably the
// CrsMatrix<...>:: qualifier and a using-declaration for packCrsMatrixNew;
// confirm against the original file.
6594  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6595  void
6597  packNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6598  Kokkos::DualView<char*, buffer_device_type>& exports,
6599  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6600  size_t& constantNumPackets) const
6601  {
6602  // The call to packNew in packAndPrepare catches and handles any exceptions.
6603  Details::ProfilingRegion region_pack_new("Tpetra::CrsMatrix::packNew", "Import/Export");
6604  if (this->isStaticGraph ()) {
6606  packCrsMatrixNew (*this, exports, numPacketsPerLID, exportLIDs,
6607  constantNumPackets);
6608  }
6609  else {
6610  this->packNonStaticNew (exportLIDs, exports, numPacketsPerLID,
6611  constantNumPackets);
6612  }
6613  }
6614 
6615  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6616  void
6618  packNonStaticNew (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& exportLIDs,
6619  Kokkos::DualView<char*, buffer_device_type>& exports,
6620  const Kokkos::DualView<size_t*, buffer_device_type>& numPacketsPerLID,
6621  size_t& constantNumPackets) const
6622  {
6623  using Details::Behavior;
6625  using Details::PackTraits;
6627  using Kokkos::View;
6628  using std::endl;
6629  using LO = LocalOrdinal;
6630  using GO = GlobalOrdinal;
6631  using ST = impl_scalar_type;
6632  const char tfecfFuncName[] = "packNonStaticNew: ";
6633 
6634  const bool verbose = Behavior::verbose("CrsMatrix");
6635  std::unique_ptr<std::string> prefix;
6636  if (verbose) {
6637  prefix = this->createPrefix("CrsMatrix", "packNonStaticNew");
6638  std::ostringstream os;
6639  os << *prefix << "Start" << endl;
6640  std::cerr << os.str ();
6641  }
6642 
6643  const size_t numExportLIDs = static_cast<size_t> (exportLIDs.extent (0));
6644  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6645  (numExportLIDs != static_cast<size_t> (numPacketsPerLID.extent (0)),
6646  std::invalid_argument, "exportLIDs.size() = " << numExportLIDs
6647  << " != numPacketsPerLID.size() = " << numPacketsPerLID.extent (0)
6648  << ".");
6649 
6650  // Setting this to zero tells the caller to expect a possibly
6651  // different ("nonconstant") number of packets per local index
6652  // (i.e., a possibly different number of entries per row).
6653  constantNumPackets = 0;
6654 
6655  // The pack buffer 'exports' enters this method possibly
6656  // unallocated. Do the first two parts of "Count, allocate, fill,
6657  // compute."
6658  size_t totalNumEntries = 0;
6659  this->allocatePackSpaceNew (exports, totalNumEntries, exportLIDs);
6660  const size_t bufSize = static_cast<size_t> (exports.extent (0));
6661 
6662  // Write-only host access
6663  exports.clear_sync_state();
6664  exports.modify_host();
6665  auto exports_h = exports.view_host ();
6666  if (verbose) {
6667  std::ostringstream os;
6668  os << *prefix << "After marking exports as modified on host, "
6669  << dualViewStatusToString (exports, "exports") << endl;
6670  std::cerr << os.str ();
6671  }
6672 
6673  // Read-only host access
6674  auto exportLIDs_h = exportLIDs.view_host ();
6675 
6676  // Write-only host access
6677  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->clear_sync_state();
6678  const_cast<Kokkos::DualView<size_t*, buffer_device_type>*>(&numPacketsPerLID)->modify_host();
6679  auto numPacketsPerLID_h = numPacketsPerLID.view_host ();
6680 
6681  // Compute the number of "packets" (in this case, bytes) per
6682  // export LID (in this case, local index of the row to send), and
6683  // actually pack the data.
6684  auto maxRowNumEnt = this->getLocalMaxNumRowEntries();
6685 
6686 
6687  // Temporary buffer for global column indices.
6688  typename global_inds_host_view_type::non_const_type gidsIn_k;
6689  if (this->isLocallyIndexed()) { // Need storage for Global IDs
6690  gidsIn_k =
6691  typename global_inds_host_view_type::non_const_type("packGids",
6692  maxRowNumEnt);
6693  }
6694 
6695  size_t offset = 0; // current index into 'exports' array.
6696  for (size_t i = 0; i < numExportLIDs; ++i) {
6697  const LO lclRow = exportLIDs_h[i];
6698 
6699  size_t numBytes = 0;
6700  size_t numEnt = this->getNumEntriesInLocalRow (lclRow);
6701 
6702  // Only pack this row's data if it has a nonzero number of
6703  // entries. We can do this because receiving processes get the
6704  // number of packets, and will know that zero packets means zero
6705  // entries.
6706  if (numEnt == 0) {
6707  numPacketsPerLID_h[i] = 0;
6708  continue;
6709  }
6710 
6711  if (this->isLocallyIndexed ()) {
6712  typename global_inds_host_view_type::non_const_type gidsIn;
6713  values_host_view_type valsIn;
6714  // If the matrix is locally indexed on the calling process, we
6715  // have to use its column Map (which it _must_ have in this
6716  // case) to convert to global indices.
6717  local_inds_host_view_type lidsIn;
6718  this->getLocalRowView (lclRow, lidsIn, valsIn);
6719  const map_type& colMap = * (this->getColMap ());
6720  for (size_t k = 0; k < numEnt; ++k) {
6721  gidsIn_k[k] = colMap.getGlobalElement (lidsIn[k]);
6722  }
6723  gidsIn = Kokkos::subview(gidsIn_k, Kokkos::make_pair(GO(0),GO(numEnt)));
6724 
6725  const size_t numBytesPerValue =
6726  PackTraits<ST>::packValueCount (valsIn[0]);
6727  numBytes = this->packRow (exports_h.data (), offset, numEnt,
6728  gidsIn.data (), valsIn.data (),
6729  numBytesPerValue);
6730  }
6731  else if (this->isGloballyIndexed ()) {
6732  global_inds_host_view_type gidsIn;
6733  values_host_view_type valsIn;
6734  // If the matrix is globally indexed on the calling process,
6735  // then we can use the column indices directly. However, we
6736  // have to get the global row index. The calling process must
6737  // have a row Map, since otherwise it shouldn't be participating
6738  // in packing operations.
6739  const map_type& rowMap = * (this->getRowMap ());
6740  const GO gblRow = rowMap.getGlobalElement (lclRow);
6741  this->getGlobalRowView (gblRow, gidsIn, valsIn);
6742 
6743  const size_t numBytesPerValue =
6744  PackTraits<ST>::packValueCount (valsIn[0]);
6745  numBytes = this->packRow (exports_h.data (), offset, numEnt,
6746  gidsIn.data (), valsIn.data (),
6747  numBytesPerValue);
6748  }
6749  // mfh 11 Sep 2017: Currently, if the matrix is neither globally
6750  // nor locally indexed, then it has no entries. Therefore,
6751  // there is nothing to pack. No worries!
6752 
6753  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6754  (offset > bufSize || offset + numBytes > bufSize, std::logic_error,
6755  "First invalid offset into 'exports' pack buffer at index i = " << i
6756  << ". exportLIDs_h[i]: " << exportLIDs_h[i] << ", bufSize: " <<
6757  bufSize << ", offset: " << offset << ", numBytes: " << numBytes <<
6758  ".");
6759  // numPacketsPerLID_h[i] is the number of "packets" in the
6760  // current local row i. Packet=char (really "byte") so use the
6761  // number of bytes of the packed data for that row.
6762  numPacketsPerLID_h[i] = numBytes;
6763  offset += numBytes;
6764  }
6765 
6766  if (verbose) {
6767  std::ostringstream os;
6768  os << *prefix << "Tpetra::CrsMatrix::packNonStaticNew: After:" << endl
6769  << *prefix << " "
6770  << dualViewStatusToString (exports, "exports")
6771  << endl
6772  << *prefix << " "
6773  << dualViewStatusToString (exportLIDs, "exportLIDs")
6774  << endl;
6775  std::cerr << os.str ();
6776  }
6777  }
6778 
// Raw-pointer front end to combineGlobalValues: converts the local row
// index to a global one through the graph's row Map, wraps the raw
// 'cols' / 'vals' arrays in Teuchos::ArrayView objects of length numEnt,
// and forwards everything else unchanged.
//
// Returns numEnt.  Failures are reported by exceptions thrown from
// combineGlobalValues, not by the return value.
6779  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6780  LocalOrdinal
6781  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6782  combineGlobalValuesRaw(const LocalOrdinal lclRow,
6783  const LocalOrdinal numEnt,
6784  const impl_scalar_type vals[],
6785  const GlobalOrdinal cols[],
6786  const Tpetra::CombineMode combMode,
6787  const char* const prefix,
6788  const bool debug,
6789  const bool verbose)
6790  {
6791  using GO = GlobalOrdinal;
6792 
6793  // mfh 23 Mar 2017: This code path is not thread safe in a debug
6794  // build, due to use of Teuchos::ArrayView; see #229.
6795  const GO gblRow = myGraph_->rowMap_->getGlobalElement(lclRow);
      // An empty row is wrapped as (nullptr, 0) instead of passing along
      // whatever pointer the caller supplied.
6796  Teuchos::ArrayView<const GO> cols_av
6797  (numEnt == 0 ? nullptr : cols, numEnt);
      // The values arrive as impl_scalar_type but combineGlobalValues
      // takes Scalar, hence the reinterpret_cast.
6798  Teuchos::ArrayView<const Scalar> vals_av
6799  (numEnt == 0 ? nullptr : reinterpret_cast<const Scalar*> (vals), numEnt);
6800 
6801  // FIXME (mfh 23 Mar 2017) This is a work-around for less common
6802  // combine modes. combineGlobalValues throws on error; it does
6803  // not return an error code. Thus, if it returns, it succeeded.
6804  combineGlobalValues(gblRow, cols_av, vals_av, combMode,
6805  prefix, debug, verbose);
6806  return numEnt;
6807  }
6808 
// Combine one row of incoming (column index, value) pairs into this
// matrix according to combineMode.
//
// Static graph:  ADD     -> sumIntoGlobalValues
//                REPLACE -> replaceGlobalValues
//                ABSMAX  -> transformGlobalValues with Details::AbsMax
//                INSERT  -> throws std::invalid_argument
//                other   -> throws std::logic_error
// Dynamic graph: ADD or INSERT -> insertGlobalValuesFilteredChecked
//                ABSMAX or REPLACE -> throws std::logic_error (not
//                                     implemented)
//                other   -> throws std::logic_error
//
// prefix, debug and verbose are only used on the dynamic-graph insert
// path, where they are forwarded to insertGlobalValuesFilteredChecked.
6809  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6810  void
6811  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
6812  combineGlobalValues(
6813  const GlobalOrdinal globalRowIndex,
6814  const Teuchos::ArrayView<const GlobalOrdinal>& columnIndices,
6815  const Teuchos::ArrayView<const Scalar>& values,
6816  const Tpetra::CombineMode combineMode,
6817  const char* const prefix,
6818  const bool debug,
6819  const bool verbose)
6820  {
6821  const char tfecfFuncName[] = "combineGlobalValues: ";
6822 
6823  if (isStaticGraph ()) {
6824  // INSERT doesn't make sense for a static graph, since you
6825  // aren't allowed to change the structure of the graph.
6826  // However, all the other combine modes work.
6827  if (combineMode == ADD) {
6828  sumIntoGlobalValues (globalRowIndex, columnIndices, values);
6829  }
6830  else if (combineMode == REPLACE) {
6831  replaceGlobalValues (globalRowIndex, columnIndices, values);
6832  }
6833  else if (combineMode == ABSMAX) {
6834  using ::Tpetra::Details::AbsMax;
6835  AbsMax<Scalar> f;
6836  this->template transformGlobalValues<AbsMax<Scalar> > (globalRowIndex,
6837  columnIndices,
6838  values, f);
6839  }
6840  else if (combineMode == INSERT) {
      // The test condition is always true on this branch, so this is an
      // unconditional throw.
6841  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6842  (isStaticGraph() && combineMode == INSERT,
6843  std::invalid_argument, "INSERT combine mode is forbidden "
6844  "if the matrix has a static (const) graph (i.e., was "
6845  "constructed with the CrsMatrix constructor that takes a "
6846  "const CrsGraph pointer).");
6847  }
6848  else {
6849  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6850  (true, std::logic_error, "Invalid combine mode; should "
6851  "never get here! "
6852  "Please report this bug to the Tpetra developers.");
6853  }
6854  }
6855  else { // The matrix has a dynamic graph.
6856  if (combineMode == ADD || combineMode == INSERT) {
6857  // For a dynamic graph, all incoming column indices are
6858  // inserted into the target graph. Duplicate indices will
6859  // have their values summed. In this context, ADD and INSERT
6860  // are equivalent. We need to insert the values anyway if
6861  // the column indices don't yet exist in this row, so we just
6862  // call insertGlobalValuesFilteredChecked() for both cases.
6863  insertGlobalValuesFilteredChecked(globalRowIndex,
6864  columnIndices, values, prefix, debug, verbose);
6865  }
6866  // FIXME (mfh 14 Mar 2012):
6867  //
6868  // Implementing ABSMAX or REPLACE for a dynamic graph would
6869  // require modifying assembly to attach a possibly different
6870  // combine mode to each inserted (i, j, A_ij) entry. For
6871  // example, consider two different Export operations to the same
6872  // target CrsMatrix, the first with ABSMAX combine mode and the
6873  // second with REPLACE. This isn't a common use case, so we
6874  // won't mess with it for now.
6875  else if (combineMode == ABSMAX) {
      // As above: the condition is always true here, so this always throws.
6876  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6877  ! isStaticGraph () && combineMode == ABSMAX, std::logic_error,
6878  "ABSMAX combine mode when the matrix has a dynamic graph is not yet "
6879  "implemented.");
6880  }
6881  else if (combineMode == REPLACE) {
6882  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6883  ! isStaticGraph () && combineMode == REPLACE, std::logic_error,
6884  "REPLACE combine mode when the matrix has a dynamic graph is not yet "
6885  "implemented.");
6886  }
6887  else {
6888  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC(
6889  true, std::logic_error, "Should never get here! Please report this "
6890  "bug to the Tpetra developers.");
6891  }
6892  }
6893  }
6894 
// Entry point for unpacking received row data and combining it into this
// matrix (named "unpackAndCombine" by tfecfFuncName below).
//
// Does nothing beyond optional argument checks and logging when
// combineMode is ZERO.  Otherwise it delegates to unpackAndCombineImpl.
// In debug mode the call is wrapped so that an exception thrown on any
// process is turned into a std::logic_error thrown on every process of
// the communicator, carrying the gathered per-process messages.
//
// In debug mode it also throws std::invalid_argument if combineMode is
// not one of ADD, REPLACE, ABSMAX, INSERT, ZERO, or if importLIDs and
// numPacketsPerLID have different extents.
//
// NOTE(review): this listing skips lines 6897-6898 and 6906-6907 --
// presumably the CrsMatrix<...>:: qualifier, the function name, and
// using-declarations for dualViewStatusToString and ProfilingRegion;
// confirm against the original file.
6895  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
6896  void
6899  (const Kokkos::DualView<const local_ordinal_type*, buffer_device_type>& importLIDs,
6900  Kokkos::DualView<char*, buffer_device_type> imports,
6901  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
6902  const size_t constantNumPackets,
6903  const CombineMode combineMode)
6904  {
6905  using Details::Behavior;
6908  using std::endl;
6909  const char tfecfFuncName[] = "unpackAndCombine: ";
6910  ProfilingRegion regionUAC ("Tpetra::CrsMatrix::unpackAndCombine");
6911 
6912  const bool debug = Behavior::debug("CrsMatrix");
6913  const bool verbose = Behavior::verbose("CrsMatrix");
      // Accepted combine modes and their printable names, index-aligned.
6914  constexpr int numValidModes = 5;
6915  const CombineMode validModes[numValidModes] =
6916  {ADD, REPLACE, ABSMAX, INSERT, ZERO};
6917  const char* validModeNames[numValidModes] =
6918  {"ADD", "REPLACE", "ABSMAX", "INSERT", "ZERO"};
6919 
      // 'prefix' stays null unless verbose output is enabled.
6920  std::unique_ptr<std::string> prefix;
6921  if (verbose) {
6922  prefix = this->createPrefix("CrsMatrix", "unpackAndCombine");
6923  std::ostringstream os;
6924  os << *prefix << "Start:" << endl
6925  << *prefix << " "
6926  << dualViewStatusToString (importLIDs, "importLIDs")
6927  << endl
6928  << *prefix << " "
6929  << dualViewStatusToString (imports, "imports")
6930  << endl
6931  << *prefix << " "
6932  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
6933  << endl
6934  << *prefix << " constantNumPackets: " << constantNumPackets
6935  << endl
6936  << *prefix << " combineMode: " << combineModeToString (combineMode)
6937  << endl;
6938  std::cerr << os.str ();
6939  }
6940 
      // Argument validation is only done in debug mode.
6941  if (debug) {
6942  if (std::find (validModes, validModes+numValidModes, combineMode) ==
6943  validModes+numValidModes) {
6944  std::ostringstream os;
6945  os << "Invalid combine mode. Valid modes are {";
6946  for (int k = 0; k < numValidModes; ++k) {
6947  os << validModeNames[k];
6948  if (k < numValidModes - 1) {
6949  os << ", ";
6950  }
6951  }
6952  os << "}.";
6953  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6954  (true, std::invalid_argument, os.str ());
6955  }
6956  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6957  (importLIDs.extent(0) != numPacketsPerLID.extent(0),
6958  std::invalid_argument, "importLIDs.extent(0)="
6959  << importLIDs.extent(0)
6960  << " != numPacketsPerLID.extent(0)="
6961  << numPacketsPerLID.extent(0) << ".");
6962  }
6963 
6964  if (combineMode == ZERO) {
6965  return; // nothing to do
6966  }
6967 
6968  if (debug) {
6969  using Teuchos::reduceAll;
6970  std::unique_ptr<std::ostringstream> msg (new std::ostringstream ());
      // Catch a local failure instead of letting it propagate, so that
      // every process still reaches the reduction below.
6971  int lclBad = 0;
6972  try {
6973  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
6974  constantNumPackets, combineMode,
6975  verbose);
6976  } catch (std::exception& e) {
6977  lclBad = 1;
6978  *msg << e.what ();
6979  }
      // Max-reduce the local flags: gblBad is nonzero iff some process
      // caught an exception.
6980  int gblBad = 0;
6981  const Teuchos::Comm<int>& comm = * (this->getComm ());
6982  reduceAll<int, int> (comm, Teuchos::REDUCE_MAX,
6983  lclBad, Teuchos::outArg (gblBad));
6984  if (gblBad != 0) {
6985  // mfh 22 Oct 2017: 'prefix' might be null, since it is only
6986  // initialized when verbose output is on. Thus, we get the process
6987  // rank again here. This is an error message, so the small
6988  // run-time cost doesn't matter. See #1887.
6989  std::ostringstream os;
6990  os << "Proc " << comm.getRank () << ": " << msg->str () << endl;
      // Reuse 'msg' as the destination for the gathered output.
6991  msg = std::unique_ptr<std::ostringstream> (new std::ostringstream ());
6992  ::Tpetra::Details::gathervPrint (*msg, os.str (), comm);
6993  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
6994  (true, std::logic_error, std::endl << "unpackAndCombineImpl "
6995  "threw an exception on one or more participating processes: "
6996  << endl << msg->str ());
6997  }
6998  }
6999  else {
      // Non-debug mode: call directly; any exception propagates to the
      // caller as-is.
7000  unpackAndCombineImpl(importLIDs, imports, numPacketsPerLID,
7001  constantNumPackets, combineMode,
7002  verbose);
7003  }
7004 
7005  if (verbose) {
7006  std::ostringstream os;
7007  os << *prefix << "Done!" << endl
7008  << *prefix << " "
7009  << dualViewStatusToString (importLIDs, "importLIDs")
7010  << endl
7011  << *prefix << " "
7012  << dualViewStatusToString (imports, "imports")
7013  << endl
7014  << *prefix << " "
7015  << dualViewStatusToString (numPacketsPerLID, "numPacketsPerLID")
7016  << endl;
7017  std::cerr << os.str ();
7018  }
7019  }
7020 
// Implementation behind unpackAndCombine (named "unpackAndCombineImpl" by
// tfecfFuncName below).  Chooses the unpack path by graph kind:
//
//   - static graph: hands everything to
//     Details::unpackCrsMatrixAndCombineNew;
//   - otherwise: asks the graph for the padding the incoming data needs,
//     applies it with applyCrsPadding, and then unpacks on host with
//     unpackAndCombineImplNonStatic.
//
// An exception from computePaddingForCrsMatrixUnpack is rethrown as
// std::runtime_error with the process rank prepended (-1 if no
// communicator is available).
//
// NOTE(review): this listing skips lines 7023-7024 -- presumably the
// CrsMatrix<...>:: qualifier and the function name; confirm against the
// original file.
7021  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7022  void
7025  const Kokkos::DualView<const local_ordinal_type*,
7026  buffer_device_type>& importLIDs,
7027  Kokkos::DualView<char*, buffer_device_type> imports,
7028  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7029  const size_t constantNumPackets,
7030  const CombineMode combineMode,
7031  const bool verbose)
7032  {
7033  Details::ProfilingRegion region_unpack_and_combine_impl(
7034  "Tpetra::CrsMatrix::unpackAndCombineImpl",
7035  "Import/Export"
7036  );
7037  using std::endl;
      // Here the name is only passed to createPrefix for verbose output.
7038  const char tfecfFuncName[] = "unpackAndCombineImpl";
7039  std::unique_ptr<std::string> prefix;
7040  if (verbose) {
7041  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7042  std::ostringstream os;
7043  os << *prefix << "isStaticGraph(): "
7044  << (isStaticGraph() ? "true" : "false")
7045  << ", importLIDs.extent(0): "
7046  << importLIDs.extent(0)
7047  << ", imports.extent(0): "
7048  << imports.extent(0)
7049  << ", numPacketsPerLID.extent(0): "
7050  << numPacketsPerLID.extent(0)
7051  << endl;
7052  std::cerr << os.str();
7053  }
7054 
7055  if (isStaticGraph ()) {
7056  using Details::unpackCrsMatrixAndCombineNew;
7057  unpackCrsMatrixAndCombineNew(*this, imports, numPacketsPerLID,
7058  importLIDs, constantNumPackets,
7059  combineMode);
7060  }
7061  else {
      // Inner scope: 'padding' is released as soon as it has been applied.
7062  {
7063  using padding_type = typename crs_graph_type::padding_type;
7064  std::unique_ptr<padding_type> padding;
7065  try {
7066  padding = myGraph_->computePaddingForCrsMatrixUnpack(
7067  importLIDs, imports, numPacketsPerLID, verbose);
7068  }
7069  catch (std::exception& e) {
      // Rethrow with the rank attached; fall back to -1 when the row
      // Map or its communicator is null.
7070  const auto rowMap = getRowMap();
7071  const auto comm = rowMap.is_null() ? Teuchos::null :
7072  rowMap->getComm();
7073  const int myRank = comm.is_null() ? -1 : comm->getRank();
7074  TEUCHOS_TEST_FOR_EXCEPTION
7075  (true, std::runtime_error, "Proc " << myRank << ": "
7076  "Tpetra::CrsGraph::computePaddingForCrsMatrixUnpack "
7077  "threw an exception: " << e.what());
7078  }
7079  if (verbose) {
7080  std::ostringstream os;
7081  os << *prefix << "Call applyCrsPadding" << endl;
7082  std::cerr << os.str();
7083  }
7084  applyCrsPadding(*padding, verbose);
7085  }
7086  if (verbose) {
7087  std::ostringstream os;
7088  os << *prefix << "Call unpackAndCombineImplNonStatic" << endl;
7089  std::cerr << os.str();
7090  }
7091  unpackAndCombineImplNonStatic(importLIDs, imports,
7092  numPacketsPerLID,
7093  constantNumPackets,
7094  combineMode);
7095  }
7096 
7097  if (verbose) {
7098  std::ostringstream os;
7099  os << *prefix << "Done" << endl;
7100  std::cerr << os.str();
7101  }
7102  }
7103 
// Host-side unpack-and-combine for a matrix without a static graph.
//
// Works in two passes over importLIDs, walking the 'imports' byte buffer
// with a running offset that advances by numPacketsPerLID[i] bytes per row:
//
//   1. Read the leading LocalOrdinal of each nonempty row's packed data to
//      find the largest per-row entry count, and size the temporary
//      index/value arrays from it.
//   2. For each nonempty row, call unpackRow to extract its global column
//      indices and values, then combineGlobalValuesRaw to combine them
//      into local row importLIDs[i].
//
// Returns immediately if combineMode is ZERO or importLIDs is empty.
// 'imports' and 'numPacketsPerLID' are synced to host if needed;
// importLIDs must already be current on host (TEUCHOS_ASSERT).
// Throws std::logic_error if unpackRow consumes a different number of
// bytes than numPacketsPerLID[i]; additional consistency checks in the
// first pass run only in debug mode.
//
// NOTE(review): this listing skips lines 7119-7120, presumably further
// using-declarations; confirm against the original file.
7104  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7105  void
7106  CrsMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node>::
7107  unpackAndCombineImplNonStatic(
7108  const Kokkos::DualView<const local_ordinal_type*,
7109  buffer_device_type>& importLIDs,
7110  Kokkos::DualView<char*, buffer_device_type> imports,
7111  Kokkos::DualView<size_t*, buffer_device_type> numPacketsPerLID,
7112  const size_t constantNumPackets,
7113  const CombineMode combineMode)
7114  {
7115  using Kokkos::View;
7116  using Kokkos::subview;
7117  using Kokkos::MemoryUnmanaged;
7118  using Details::Behavior;
7121  using Details::PackTraits;
7122  using Details::ScalarViewTraits;
7123  using std::endl;
7124  using LO = LocalOrdinal;
7125  using GO = GlobalOrdinal;
7126  using ST = impl_scalar_type;
7127  using size_type = typename Teuchos::ArrayView<LO>::size_type;
7128  using HES =
7129  typename View<int*, device_type>::HostMirror::execution_space;
7130  using pair_type = std::pair<typename View<int*, HES>::size_type,
7131  typename View<int*, HES>::size_type>;
7132  using gids_out_type = View<GO*, HES, MemoryUnmanaged>;
7133  using vals_out_type = View<ST*, HES, MemoryUnmanaged>;
      // No trailing ": " here; the exception messages below start with
      // ": " themselves.
7134  const char tfecfFuncName[] = "unpackAndCombineImplNonStatic";
7135 
7136  const bool debug = Behavior::debug("CrsMatrix");
7137  const bool verbose = Behavior::verbose("CrsMatrix");
7138  std::unique_ptr<std::string> prefix;
7139  if (verbose) {
7140  prefix = this->createPrefix("CrsMatrix", tfecfFuncName);
7141  std::ostringstream os;
7142  os << *prefix << endl; // we've already printed DualViews' statuses
7143  std::cerr << os.str ();
7144  }
      // C-string form of the prefix for combineGlobalValuesRaw; null when
      // not verbose, because 'prefix' is then unset.
7145  const char* const prefix_raw =
7146  verbose ? prefix.get()->c_str() : nullptr;
7147 
7148  const size_type numImportLIDs = importLIDs.extent (0);
7149  if (combineMode == ZERO || numImportLIDs == 0) {
7150  return; // nothing to do; no need to combine entries
7151  }
7152 
7153  Details::ProfilingRegion region_unpack_and_combine_impl_non_static(
7154  "Tpetra::CrsMatrix::unpackAndCombineImplNonStatic",
7155  "Import/Export"
7156  );
7157 
7158  // We're unpacking on host. This is read-only host access.
7159  if (imports.need_sync_host()) {
7160  imports.sync_host ();
7161  }
7162  auto imports_h = imports.view_host();
7163 
7164  // Read-only host access.
7165  if (numPacketsPerLID.need_sync_host()) {
7166  numPacketsPerLID.sync_host ();
7167  }
7168  auto numPacketsPerLID_h = numPacketsPerLID.view_host();
7169 
      // importLIDs is a const reference, so it cannot be synced here; it
      // must already be current on host.
7170  TEUCHOS_ASSERT( ! importLIDs.need_sync_host() );
7171  auto importLIDs_h = importLIDs.view_host();
7172 
7173  size_t numBytesPerValue;
7174  {
7175  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7176  // with run-time size? We already assume that all entries in both the
7177  // source and target matrices have the same size. If the calling process
7178  // owns at least one entry in either matrix, we can use that entry to set
7179  // the size. However, it is possible that the calling process owns no
7180  // entries. In that case, we're in trouble. One way to fix this would be
7181  // for each row's data to contain the run-time size. This is only
7182  // necessary if the size is not a compile-time constant.
      //
      // 'val' exists only as an argument for the size query below.
7183  Scalar val;
7184  numBytesPerValue = PackTraits<ST>::packValueCount (val);
7185  }
7186 
7187  // Determine the maximum number of entries in any one row
      // (first pass over the buffer; nothing is combined yet).
7188  size_t offset = 0;
7189  size_t maxRowNumEnt = 0;
7190  for (size_type i = 0; i < numImportLIDs; ++i) {
7191  const size_t numBytes = numPacketsPerLID_h[i];
7192  if (numBytes == 0) {
7193  continue; // empty buffer for that row means that the row is empty
7194  }
7195  // We need to unpack a nonzero number of entries for this row.
7196  if (debug) {
7197  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7198  (offset + numBytes > size_t(imports_h.extent (0)),
7199  std::logic_error, ": At local row index importLIDs_h[i="
7200  << i << "]=" << importLIDs_h[i] << ", offset (=" << offset
7201  << ") + numBytes (=" << numBytes << ") > "
7202  "imports_h.extent(0)=" << imports_h.extent (0) << ".");
7203  }
7204  LO numEntLO = 0;
7205 
7206  if (debug) {
      // The row's bytes must at least hold the leading entry count.
7207  const size_t theNumBytes =
7208  PackTraits<LO>::packValueCount (numEntLO);
7209  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7210  (theNumBytes > numBytes, std::logic_error, ": theNumBytes="
7211  << theNumBytes << " > numBytes = " << numBytes << ".");
7212  }
      // The row's packed data starts with its entry count as an LO.
7213  const char* const inBuf = imports_h.data () + offset;
7214  const size_t actualNumBytes =
7215  PackTraits<LO>::unpackValue (numEntLO, inBuf);
7216 
7217  if (debug) {
7218  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7219  (actualNumBytes > numBytes, std::logic_error, ": At i=" << i
7220  << ", actualNumBytes=" << actualNumBytes
7221  << " > numBytes=" << numBytes << ".");
7222  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7223  (numEntLO == 0, std::logic_error, ": At local row index "
7224  "importLIDs_h[i=" << i << "]=" << importLIDs_h[i] << ", "
7225  "the number of entries read from the packed data is "
7226  "numEntLO=" << numEntLO << ", but numBytes=" << numBytes
7227  << " != 0.");
7228  }
7229 
7230  maxRowNumEnt = std::max(size_t(numEntLO), maxRowNumEnt);
7231  offset += numBytes;
7232  }
7233 
7234  // Temporary space to cache incoming global column indices and
7235  // values. Column indices come in as global indices, in case the
7236  // source object's column Map differs from the target object's
7237  // (this's) column Map.
7238  View<GO*, HES> gblColInds;
7239  View<LO*, HES> lclColInds;
7240  View<ST*, HES> vals;
7241  {
7242  GO gid = 0;
7243  LO lid = 0;
7244  // FIXME (mfh 17 Feb 2015, tjf 2 Aug 2017) What do I do about Scalar types
7245  // with run-time size? We already assume that all entries in both the
7246  // source and target matrices have the same size. If the calling process
7247  // owns at least one entry in either matrix, we can use that entry to set
7248  // the size. However, it is possible that the calling process owns no
7249  // entries. In that case, we're in trouble. One way to fix this would be
7250  // for each row's data to contain the run-time size. This is only
7251  // necessary if the size is not a compile-time constant.
7252  Scalar val;
      // Each array holds maxRowNumEnt elements and is reused for every
      // row in the second pass.
7253  gblColInds = ScalarViewTraits<GO, HES>::allocateArray(
7254  gid, maxRowNumEnt, "gids");
7255  lclColInds = ScalarViewTraits<LO, HES>::allocateArray(
7256  lid, maxRowNumEnt, "lids");
7257  vals = ScalarViewTraits<ST, HES>::allocateArray(
7258  val, maxRowNumEnt, "vals");
7259  }
7260 
      // Second pass: rewind and actually unpack and combine each row.
7261  offset = 0;
7262  for (size_type i = 0; i < numImportLIDs; ++i) {
7263  const size_t numBytes = numPacketsPerLID_h[i];
7264  if (numBytes == 0) {
7265  continue; // empty buffer for that row means that the row is empty
7266  }
7267  LO numEntLO = 0;
7268  const char* const inBuf = imports_h.data () + offset;
      // Only the entry count is needed here; the byte count returned by
      // unpackValue is deliberately ignored.
7269  (void) PackTraits<LO>::unpackValue (numEntLO, inBuf);
7270 
7271  const size_t numEnt = static_cast<size_t>(numEntLO);;
7272  const LO lclRow = importLIDs_h[i];
7273 
      // Unmanaged views of the first numEnt slots of the scratch arrays.
7274  gids_out_type gidsOut = subview (gblColInds, pair_type (0, numEnt));
7275  vals_out_type valsOut = subview (vals, pair_type (0, numEnt));
7276 
      // unpackRow is given the base of the buffer plus 'offset', and
      // returns the number of bytes it consumed.
7277  const size_t numBytesOut =
7278  unpackRow (gidsOut.data (), valsOut.data (), imports_h.data (),
7279  offset, numBytes, numEnt, numBytesPerValue);
7280  TEUCHOS_TEST_FOR_EXCEPTION_CLASS_FUNC
7281  (numBytes != numBytesOut, std::logic_error, ": At i=" << i
7282  << ", numBytes=" << numBytes << " != numBytesOut="
7283  << numBytesOut << ".");
7284 
7285  const ST* const valsRaw = const_cast<const ST*> (valsOut.data ());
7286  const GO* const gidsRaw = const_cast<const GO*> (gidsOut.data ());
7287  combineGlobalValuesRaw(lclRow, numEnt, valsRaw, gidsRaw,
7288  combineMode, prefix_raw, debug, verbose);
7289  // Don't update offset until current LID has succeeded.
7290  offset += numBytes;
7291  } // for each import LID i
7292 
7293  if (verbose) {
7294  std::ostringstream os;
7295  os << *prefix << "Done" << endl;
7296  std::cerr << os.str();
7297  }
7298  }
7299 
7300  template<class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7301  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7303  getColumnMapMultiVector (const MV& X_domainMap,
7304  const bool force) const
7305  {
7306  using Teuchos::null;
7307  using Teuchos::RCP;
7308  using Teuchos::rcp;
7309 
7310  TEUCHOS_TEST_FOR_EXCEPTION(
7311  ! this->hasColMap (), std::runtime_error, "Tpetra::CrsMatrix::getColumn"
7312  "MapMultiVector: You may only call this method if the matrix has a "
7313  "column Map. If the matrix does not yet have a column Map, you should "
7314  "first call fillComplete (with domain and range Map if necessary).");
7315 
7316  // If the graph is not fill complete, then the Import object (if
7317  // one should exist) hasn't been constructed yet.
7318  TEUCHOS_TEST_FOR_EXCEPTION(
7319  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7320  "CrsMatrix::getColumnMapMultiVector: You may only call this method if "
7321  "this matrix's graph is fill complete.");
7322 
7323  const size_t numVecs = X_domainMap.getNumVectors ();
7324  RCP<const import_type> importer = this->getGraph ()->getImporter ();
7325  RCP<const map_type> colMap = this->getColMap ();
7326 
7327  RCP<MV> X_colMap; // null by default
7328 
7329  // If the Import object is trivial (null), then we don't need a
7330  // separate column Map multivector. Just return null in that
7331  // case. The caller is responsible for knowing not to use the
7332  // returned null pointer.
7333  //
7334  // If the Import is nontrivial, then we do need a separate
7335  // column Map multivector for the Import operation. Check in
7336  // that case if we have to (re)create the column Map
7337  // multivector.
7338  if (! importer.is_null () || force) {
7339  if (importMV_.is_null () || importMV_->getNumVectors () != numVecs) {
7340  X_colMap = rcp (new MV (colMap, numVecs));
7341 
7342  // Cache the newly created multivector for later reuse.
7343  importMV_ = X_colMap;
7344  }
7345  else { // Yay, we can reuse the cached multivector!
7346  X_colMap = importMV_;
7347  // mfh 09 Jan 2013: We don't have to fill with zeros first,
7348  // because the Import uses INSERT combine mode, which overwrites
7349  // existing entries.
7350  //
7351  //X_colMap->putScalar (ZERO);
7352  }
7353  }
7354  return X_colMap;
7355  }
7356 
7357  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7358  Teuchos::RCP<MultiVector<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7361  const bool force) const
7362  {
7363  using Teuchos::null;
7364  using Teuchos::RCP;
7365  using Teuchos::rcp;
7366 
7367  // If the graph is not fill complete, then the Export object (if
7368  // one should exist) hasn't been constructed yet.
7369  TEUCHOS_TEST_FOR_EXCEPTION(
7370  ! this->getGraph ()->isFillComplete (), std::runtime_error, "Tpetra::"
7371  "CrsMatrix::getRowMapMultiVector: You may only call this method if this "
7372  "matrix's graph is fill complete.");
7373 
7374  const size_t numVecs = Y_rangeMap.getNumVectors ();
7375  RCP<const export_type> exporter = this->getGraph ()->getExporter ();
7376  // Every version of the constructor takes either a row Map, or a
7377  // graph (all of whose constructors take a row Map). Thus, the
7378  // matrix always has a row Map.
7379  RCP<const map_type> rowMap = this->getRowMap ();
7380 
7381  RCP<MV> Y_rowMap; // null by default
7382 
7383  // If the Export object is trivial (null), then we don't need a
7384  // separate row Map multivector. Just return null in that case.
7385  // The caller is responsible for knowing not to use the returned
7386  // null pointer.
7387  //
7388  // If the Export is nontrivial, then we do need a separate row
7389  // Map multivector for the Export operation. Check in that case
7390  // if we have to (re)create the row Map multivector.
7391  if (! exporter.is_null () || force) {
7392  if (exportMV_.is_null () || exportMV_->getNumVectors () != numVecs) {
7393  Y_rowMap = rcp (new MV (rowMap, numVecs));
7394  exportMV_ = Y_rowMap; // Cache the newly created MV for later reuse.
7395  }
7396  else { // Yay, we can reuse the cached multivector!
7397  Y_rowMap = exportMV_;
7398  }
7399  }
7400  return Y_rowMap;
7401  }
7402 
7403  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7404  void
7406  removeEmptyProcessesInPlace (const Teuchos::RCP<const map_type>& newMap)
7407  {
7408  TEUCHOS_TEST_FOR_EXCEPTION(
7409  myGraph_.is_null (), std::logic_error, "Tpetra::CrsMatrix::"
7410  "removeEmptyProcessesInPlace: This method does not work when the matrix "
7411  "was created with a constant graph (that is, when it was created using "
7412  "the version of its constructor that takes an RCP<const CrsGraph>). "
7413  "This is because the matrix is not allowed to modify the graph in that "
7414  "case, but removing empty processes requires modifying the graph.");
7415  myGraph_->removeEmptyProcessesInPlace (newMap);
7416  // Even though CrsMatrix's row Map (as returned by getRowMap())
7417  // comes from its CrsGraph, CrsMatrix still implements DistObject,
7418  // so we also have to change the DistObject's Map.
7419  this->map_ = this->getRowMap ();
7420  // In the nonconst graph case, staticGraph_ is just a const
7421  // pointer to myGraph_. This assignment is probably redundant,
7422  // but it doesn't hurt.
7423  staticGraph_ = Teuchos::rcp_const_cast<const Graph> (myGraph_);
7424  }
7425 
7426  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7427  Teuchos::RCP<RowMatrix<Scalar, LocalOrdinal, GlobalOrdinal, Node> >
7429  add (const Scalar& alpha,
7431  const Scalar& beta,
7432  const Teuchos::RCP<const map_type>& domainMap,
7433  const Teuchos::RCP<const map_type>& rangeMap,
7434  const Teuchos::RCP<Teuchos::ParameterList>& params) const
7435  {
7436  using Teuchos::Array;
7437  using Teuchos::ArrayView;
7438  using Teuchos::ParameterList;
7439  using Teuchos::RCP;
7440  using Teuchos::rcp;
7441  using Teuchos::rcp_implicit_cast;
7442  using Teuchos::sublist;
7443  using std::endl;
7444  using LO = local_ordinal_type;
7445  using GO = global_ordinal_type;
7446  using crs_matrix_type =
7448  const char errPfx[] = "Tpetra::CrsMatrix::add: ";
7449 
7450  const bool debug = Details::Behavior::debug("CrsMatrix");
7451  const bool verbose = Details::Behavior::verbose("CrsMatrix");
7452  std::unique_ptr<std::string> prefix;
7453  if (verbose) {
7454  prefix = this->createPrefix("CrsMatrix", "add");
7455  std::ostringstream os;
7456  os << *prefix << "Start" << endl;
7457  std::cerr << os.str ();
7458  }
7459 
7460  const crs_matrix_type& B = *this; // a convenient abbreviation
7461  const Scalar ZERO = Teuchos::ScalarTraits<Scalar>::zero();
7462  const Scalar ONE = Teuchos::ScalarTraits<Scalar>::one();
7463 
7464  // If the user didn't supply a domain or range Map, then try to
7465  // get one from B first (if it has them), then from A (if it has
7466  // them). If we don't have any domain or range Maps, scold the
7467  // user.
7468  RCP<const map_type> A_domainMap = A.getDomainMap ();
7469  RCP<const map_type> A_rangeMap = A.getRangeMap ();
7470  RCP<const map_type> B_domainMap = B.getDomainMap ();
7471  RCP<const map_type> B_rangeMap = B.getRangeMap ();
7472 
7473  RCP<const map_type> theDomainMap = domainMap;
7474  RCP<const map_type> theRangeMap = rangeMap;
7475 
7476  if (domainMap.is_null ()) {
7477  if (B_domainMap.is_null ()) {
7478  TEUCHOS_TEST_FOR_EXCEPTION(
7479  A_domainMap.is_null (), std::invalid_argument,
7480  "Tpetra::CrsMatrix::add: If neither A nor B have a domain Map, "
7481  "then you must supply a nonnull domain Map to this method.");
7482  theDomainMap = A_domainMap;
7483  } else {
7484  theDomainMap = B_domainMap;
7485  }
7486  }
7487  if (rangeMap.is_null ()) {
7488  if (B_rangeMap.is_null ()) {
7489  TEUCHOS_TEST_FOR_EXCEPTION(
7490  A_rangeMap.is_null (), std::invalid_argument,
7491  "Tpetra::CrsMatrix::add: If neither A nor B have a range Map, "
7492  "then you must supply a nonnull range Map to this method.");
7493  theRangeMap = A_rangeMap;
7494  } else {
7495  theRangeMap = B_rangeMap;
7496  }
7497  }
7498 
7499  if (debug) {
7500  // In debug mode, check that A and B have matching domain and
7501  // range Maps, if they have domain and range Maps at all. (If
7502  // they aren't fill complete, then they may not yet have them.)
7503  if (! A_domainMap.is_null() && ! A_rangeMap.is_null()) {
7504  if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7505  TEUCHOS_TEST_FOR_EXCEPTION
7506  (! B_domainMap->isSameAs(*A_domainMap),
7507  std::invalid_argument,
7508  errPfx << "The input RowMatrix A must have a domain Map "
7509  "which is the same as (isSameAs) this RowMatrix's "
7510  "domain Map.");
7511  TEUCHOS_TEST_FOR_EXCEPTION
7512  (! B_rangeMap->isSameAs(*A_rangeMap), std::invalid_argument,
7513  errPfx << "The input RowMatrix A must have a range Map "
7514  "which is the same as (isSameAs) this RowMatrix's range "
7515  "Map.");
7516  TEUCHOS_TEST_FOR_EXCEPTION
7517  (! domainMap.is_null() &&
7518  ! domainMap->isSameAs(*B_domainMap),
7519  std::invalid_argument,
7520  errPfx << "The input domain Map must be the same as "
7521  "(isSameAs) this RowMatrix's domain Map.");
7522  TEUCHOS_TEST_FOR_EXCEPTION
7523  (! rangeMap.is_null() &&
7524  ! rangeMap->isSameAs(*B_rangeMap),
7525  std::invalid_argument,
7526  errPfx << "The input range Map must be the same as "
7527  "(isSameAs) this RowMatrix's range Map.");
7528  }
7529  }
7530  else if (! B_domainMap.is_null() && ! B_rangeMap.is_null()) {
7531  TEUCHOS_TEST_FOR_EXCEPTION
7532  (! domainMap.is_null() &&
7533  ! domainMap->isSameAs(*B_domainMap),
7534  std::invalid_argument,
7535  errPfx << "The input domain Map must be the same as "
7536  "(isSameAs) this RowMatrix's domain Map.");
7537  TEUCHOS_TEST_FOR_EXCEPTION
7538  (! rangeMap.is_null() && ! rangeMap->isSameAs(*B_rangeMap),
7539  std::invalid_argument,
7540  errPfx << "The input range Map must be the same as "
7541  "(isSameAs) this RowMatrix's range Map.");
7542  }
7543  else {
7544  TEUCHOS_TEST_FOR_EXCEPTION
7545  (domainMap.is_null() || rangeMap.is_null(),
7546  std::invalid_argument, errPfx << "If neither A nor B "
7547  "have a domain and range Map, then you must supply a "
7548  "nonnull domain and range Map to this method.");
7549  }
7550  }
7551 
7552  // What parameters do we pass to C's constructor? Do we call
7553  // fillComplete on C after filling it? And if so, what parameters
7554  // do we pass to C's fillComplete call?
7555  bool callFillComplete = true;
7556  RCP<ParameterList> constructorSublist;
7557  RCP<ParameterList> fillCompleteSublist;
7558  if (! params.is_null()) {
7559  callFillComplete =
7560  params->get("Call fillComplete", callFillComplete);
7561  constructorSublist = sublist(params, "Constructor parameters");
7562  fillCompleteSublist = sublist(params, "fillComplete parameters");
7563  }
7564 
7565  RCP<const map_type> A_rowMap = A.getRowMap ();
7566  RCP<const map_type> B_rowMap = B.getRowMap ();
7567  RCP<const map_type> C_rowMap = B_rowMap; // see discussion in documentation
7568  RCP<crs_matrix_type> C; // The result matrix.
7569 
7570  // If A and B's row Maps are the same, we can compute an upper
7571  // bound on the number of entries in each row of C, before
7572  // actually computing the sum. A reasonable upper bound is the
7573  // sum of the two entry counts in each row.
7574  if (A_rowMap->isSameAs (*B_rowMap)) {
7575  const LO localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7576  Array<size_t> C_maxNumEntriesPerRow (localNumRows, 0);
7577 
7578  // Get the number of entries in each row of A.
7579  if (alpha != ZERO) {
7580  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7581  const size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7582  C_maxNumEntriesPerRow[localRow] += A_numEntries;
7583  }
7584  }
7585  // Get the number of entries in each row of B.
7586  if (beta != ZERO) {
7587  for (LO localRow = 0; localRow < localNumRows; ++localRow) {
7588  const size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7589  C_maxNumEntriesPerRow[localRow] += B_numEntries;
7590  }
7591  }
7592  // Construct the result matrix C.
7593  if (constructorSublist.is_null ()) {
7594  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow ()));
7595  } else {
7596  C = rcp (new crs_matrix_type (C_rowMap, C_maxNumEntriesPerRow (),
7597  constructorSublist));
7598  }
7599  // Since A and B have the same row Maps, we could add them
7600  // together all at once and merge values before we call
7601  // insertGlobalValues. However, we don't really need to, since
7602  // we've already allocated enough space in each row of C for C
7603  // to do the merge itself.
7604  }
7605  else { // the row Maps of A and B are not the same
7606  // Construct the result matrix C.
7607  // true: !A_rowMap->isSameAs (*B_rowMap)
7608  TEUCHOS_TEST_FOR_EXCEPTION
7609  (true, std::invalid_argument, errPfx << "The row maps must "
7610  "be the same for statically allocated matrices, to ensure "
7611  "that there is sufficient space to do the addition.");
7612  }
7613 
7614  TEUCHOS_TEST_FOR_EXCEPTION
7615  (C.is_null (), std::logic_error,
7616  errPfx << "C should not be null at this point. "
7617  "Please report this bug to the Tpetra developers.");
7618 
7619  if (verbose) {
7620  std::ostringstream os;
7621  os << *prefix << "Compute C = alpha*A + beta*B" << endl;
7622  std::cerr << os.str ();
7623  }
7624  using gids_type = nonconst_global_inds_host_view_type;
7625  using vals_type = nonconst_values_host_view_type;
7626  gids_type ind;
7627  vals_type val;
7628 
7629  if (alpha != ZERO) {
7630  const LO A_localNumRows = static_cast<LO> (A_rowMap->getLocalNumElements ());
7631  for (LO localRow = 0; localRow < A_localNumRows; ++localRow) {
7632  size_t A_numEntries = A.getNumEntriesInLocalRow (localRow);
7633  const GO globalRow = A_rowMap->getGlobalElement (localRow);
7634  if (A_numEntries > static_cast<size_t> (ind.size ())) {
7635  Kokkos::resize(ind,A_numEntries);
7636  Kokkos::resize(val,A_numEntries);
7637  }
7638  gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, A_numEntries));
7639  vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, A_numEntries));
7640  A.getGlobalRowCopy (globalRow, indView, valView, A_numEntries);
7641 
7642  if (alpha != ONE) {
7643  for (size_t k = 0; k < A_numEntries; ++k) {
7644  valView[k] *= alpha;
7645  }
7646  }
7647  C->insertGlobalValues (globalRow, A_numEntries,
7648  reinterpret_cast<Scalar *>(valView.data()),
7649  indView.data());
7650  }
7651  }
7652 
7653  if (beta != ZERO) {
7654  const LO B_localNumRows = static_cast<LO> (B_rowMap->getLocalNumElements ());
7655  for (LO localRow = 0; localRow < B_localNumRows; ++localRow) {
7656  size_t B_numEntries = B.getNumEntriesInLocalRow (localRow);
7657  const GO globalRow = B_rowMap->getGlobalElement (localRow);
7658  if (B_numEntries > static_cast<size_t> (ind.size ())) {
7659  Kokkos::resize(ind,B_numEntries);
7660  Kokkos::resize(val,B_numEntries);
7661  }
7662  gids_type indView = Kokkos::subview(ind,std::make_pair((size_t)0, B_numEntries));
7663  vals_type valView = Kokkos::subview(val,std::make_pair((size_t)0, B_numEntries));
7664  B.getGlobalRowCopy (globalRow, indView, valView, B_numEntries);
7665 
7666  if (beta != ONE) {
7667  for (size_t k = 0; k < B_numEntries; ++k) {
7668  valView[k] *= beta;
7669  }
7670  }
7671  C->insertGlobalValues (globalRow, B_numEntries,
7672  reinterpret_cast<Scalar *>(valView.data()),
7673  indView.data());
7674  }
7675  }
7676 
7677  if (callFillComplete) {
7678  if (verbose) {
7679  std::ostringstream os;
7680  os << *prefix << "Call fillComplete on C" << endl;
7681  std::cerr << os.str ();
7682  }
7683  if (fillCompleteSublist.is_null ()) {
7684  C->fillComplete (theDomainMap, theRangeMap);
7685  } else {
7686  C->fillComplete (theDomainMap, theRangeMap, fillCompleteSublist);
7687  }
7688  }
7689  else if (verbose) {
7690  std::ostringstream os;
7691  os << *prefix << "Do NOT call fillComplete on C" << endl;
7692  std::cerr << os.str ();
7693  }
7694 
7695  if (verbose) {
7696  std::ostringstream os;
7697  os << *prefix << "Done" << endl;
7698  std::cerr << os.str ();
7699  }
7700  return rcp_implicit_cast<row_matrix_type> (C);
7701  }
7702 
7703 
7704 
7705  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
7706  void
7709  const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node>& rowTransfer,
7710  const Teuchos::RCP<const ::Tpetra::Details::Transfer<LocalOrdinal, GlobalOrdinal, Node> > & domainTransfer,
7711  const Teuchos::RCP<const map_type>& domainMap,
7712  const Teuchos::RCP<const map_type>& rangeMap,
7713  const Teuchos::RCP<Teuchos::ParameterList>& params) const
7714  {
7715  using Details::Behavior;
7720  using Teuchos::ArrayRCP;
7721  using Teuchos::ArrayView;
7722  using Teuchos::Comm;
7723  using Teuchos::ParameterList;
7724  using Teuchos::RCP;
7725  using std::endl;
7726  typedef LocalOrdinal LO;
7727  typedef GlobalOrdinal GO;
7728  typedef node_type NT;
7729  typedef CrsMatrix<Scalar, LO, GO, NT> this_CRS_type;
7730  typedef Vector<int, LO, GO, NT> IntVectorType;
7731  using Teuchos::as;
7732 
7733  const bool debug = Behavior::debug("CrsMatrix");
7734  const bool verbose = Behavior::verbose("CrsMatrix");
7735  int MyPID = getComm ()->getRank ();
7736 
7737  std::unique_ptr<std::string> verbosePrefix;
7738  if (verbose) {
7739  verbosePrefix =
7740  this->createPrefix("CrsMatrix", "transferAndFillComplete");
7741  std::ostringstream os;
7742  os << "Start" << endl;
7743  std::cerr << os.str();
7744  }
7745 
7746  //
7747  // Get the caller's parameters
7748  //
7749  bool isMM = false; // optimize for matrix-matrix ops.
7750  bool reverseMode = false; // Are we in reverse mode?
7751  bool restrictComm = false; // Do we need to restrict the communicator?
7752 
7753  int mm_optimization_core_count =
7754  Behavior::TAFC_OptimizationCoreCount();
7755  RCP<ParameterList> matrixparams; // parameters for the destination matrix
7756  bool overrideAllreduce = false;
7757  bool useKokkosPath = false;
7758  if (! params.is_null ()) {
7759  matrixparams = sublist (params, "CrsMatrix");
7760  reverseMode = params->get ("Reverse Mode", reverseMode);
7761  useKokkosPath = params->get ("TAFC: use kokkos path", useKokkosPath);
7762  restrictComm = params->get ("Restrict Communicator", restrictComm);
7763  auto & slist = params->sublist("matrixmatrix: kernel params",false);
7764  isMM = slist.get("isMatrixMatrix_TransferAndFillComplete",false);
7765  mm_optimization_core_count = slist.get("MM_TAFC_OptimizationCoreCount",mm_optimization_core_count);
7766 
7767  overrideAllreduce = slist.get("MM_TAFC_OverrideAllreduceCheck",false);
7768  if(getComm()->getSize() < mm_optimization_core_count && isMM) isMM = false;
7769  if(reverseMode) isMM = false;
7770  }
7771 
7772  // Only used in the sparse matrix-matrix multiply (isMM) case.
7773  std::shared_ptr< ::Tpetra::Details::CommRequest> iallreduceRequest;
7774  int mismatch = 0;
7775  int reduced_mismatch = 0;
7776  if (isMM && !overrideAllreduce) {
7777 
7778  // Test for pathological matrix transfer
7779  const bool source_vals = ! getGraph ()->getImporter ().is_null();
7780  const bool target_vals = ! (rowTransfer.getExportLIDs ().size() == 0 ||
7781  rowTransfer.getRemoteLIDs ().size() == 0);
7782  mismatch = (source_vals != target_vals) ? 1 : 0;
7783  iallreduceRequest =
7784  ::Tpetra::Details::iallreduce (mismatch, reduced_mismatch,
7785  Teuchos::REDUCE_MAX, * (getComm ()));
7786  }
7787 
7788 #ifdef HAVE_TPETRA_MMM_TIMINGS
7789  using Teuchos::TimeMonitor;
7790  std::string label;
7791  if(!params.is_null())
7792  label = params->get("Timer Label",label);
7793  std::string prefix = std::string("Tpetra ")+ label + std::string(": ");
7794  std::string tlstr;
7795  {
7796  std::ostringstream os;
7797  if(isMM) os<<":MMOpt";
7798  else os<<":MMLegacy";
7799  tlstr = os.str();
7800  }
7801 
7802  Teuchos::TimeMonitor MMall(*TimeMonitor::getNewTimer(prefix + std::string("TAFC All") +tlstr ));
7803 #endif
7804 
7805  // Make sure that the input argument rowTransfer is either an
7806  // Import or an Export. Import and Export are the only two
7807  // subclasses of Transfer that we defined, but users might
7808  // (unwisely, for now at least) decide to implement their own
7809  // subclasses. Exclude this possibility.
7810  const import_type* xferAsImport = dynamic_cast<const import_type*> (&rowTransfer);
7811  const export_type* xferAsExport = dynamic_cast<const export_type*> (&rowTransfer);
7812  TEUCHOS_TEST_FOR_EXCEPTION(
7813  xferAsImport == nullptr && xferAsExport == nullptr, std::invalid_argument,
7814  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' input "
7815  "argument must be either an Import or an Export, and its template "
7816  "parameters must match the corresponding template parameters of the "
7817  "CrsMatrix.");
7818 
7819  // Make sure that the input argument domainTransfer is either an
7820  // Import or an Export. Import and Export are the only two
7821  // subclasses of Transfer that we defined, but users might
7822  // (unwisely, for now at least) decide to implement their own
7823  // subclasses. Exclude this possibility.
7824  Teuchos::RCP<const import_type> xferDomainAsImport = Teuchos::rcp_dynamic_cast<const import_type> (domainTransfer);
7825  Teuchos::RCP<const export_type> xferDomainAsExport = Teuchos::rcp_dynamic_cast<const export_type> (domainTransfer);
7826 
7827  if(! domainTransfer.is_null()) {
7828  TEUCHOS_TEST_FOR_EXCEPTION(
7829  (xferDomainAsImport.is_null() && xferDomainAsExport.is_null()), std::invalid_argument,
7830  "Tpetra::CrsMatrix::transferAndFillComplete: The 'domainTransfer' input "
7831  "argument must be either an Import or an Export, and its template "
7832  "parameters must match the corresponding template parameters of the "
7833  "CrsMatrix.");
7834 
7835  TEUCHOS_TEST_FOR_EXCEPTION(
7836  ( xferAsImport != nullptr || ! xferDomainAsImport.is_null() ) &&
7837  (( xferAsImport != nullptr && xferDomainAsImport.is_null() ) ||
7838  ( xferAsImport == nullptr && ! xferDomainAsImport.is_null() )), std::invalid_argument,
7839  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7840  "arguments must be of the same type (either Import or Export).");
7841 
7842  TEUCHOS_TEST_FOR_EXCEPTION(
7843  ( xferAsExport != nullptr || ! xferDomainAsExport.is_null() ) &&
7844  (( xferAsExport != nullptr && xferDomainAsExport.is_null() ) ||
7845  ( xferAsExport == nullptr && ! xferDomainAsExport.is_null() )), std::invalid_argument,
7846  "Tpetra::CrsMatrix::transferAndFillComplete: The 'rowTransfer' and 'domainTransfer' input "
7847  "arguments must be of the same type (either Import or Export).");
7848  } // domainTransfer != null
7849 
7850 
7851  // FIXME (mfh 15 May 2014) Wouldn't communication still be needed,
7852  // if the source Map is not distributed but the target Map is?
7853  const bool communication_needed = rowTransfer.getSourceMap ()->isDistributed ();
7854 
7855  // Get the new domain and range Maps. We need some of them for
7856  // error checking, now that we have the reverseMode parameter.
7857  RCP<const map_type> MyRowMap = reverseMode ?
7858  rowTransfer.getSourceMap () : rowTransfer.getTargetMap ();
7859  RCP<const map_type> MyColMap; // create this below
7860  RCP<const map_type> MyDomainMap = ! domainMap.is_null () ?
7861  domainMap : getDomainMap ();
7862  RCP<const map_type> MyRangeMap = ! rangeMap.is_null () ?
7863  rangeMap : getRangeMap ();
7864  RCP<const map_type> BaseRowMap = MyRowMap;
7865  RCP<const map_type> BaseDomainMap = MyDomainMap;
7866 
7867  // If the user gave us a nonnull destMat, then check whether it's
7868  // "pristine." That means that it has no entries.
7869  //
7870  // FIXME (mfh 15 May 2014) If this is not true on all processes,
7871  // then this exception test may hang. It would be better to
7872  // forward an error flag to the next communication phase.
7873  if (! destMat.is_null ()) {
7874  // FIXME (mfh 15 May 2014): The Epetra idiom for checking
7875  // whether a graph or matrix has no entries on the calling
7876  // process, is that it is neither locally nor globally indexed.
7877  // This may change eventually with the Kokkos refactor version
7878  // of Tpetra, so it would be better just to check the quantity
7879  // of interest directly. Note that with the Kokkos refactor
7880  // version of Tpetra, asking for the total number of entries in
7881  // a graph or matrix that is not fill complete might require
7882  // computation (kernel launch), since it is not thread scalable
7883  // to update a count every time an entry is inserted.
7884  const bool NewFlag = ! destMat->getGraph ()->isLocallyIndexed () &&
7885  ! destMat->getGraph ()->isGloballyIndexed ();
7886  TEUCHOS_TEST_FOR_EXCEPTION(
7887  ! NewFlag, std::invalid_argument, "Tpetra::CrsMatrix::"
7888  "transferAndFillComplete: The input argument 'destMat' is only allowed "
7889  "to be nonnull, if its graph is empty (neither locally nor globally "
7890  "indexed).");
7891  // FIXME (mfh 15 May 2014) At some point, we want to change
7892  // graphs and matrices so that their DistObject Map
7893  // (this->getMap()) may differ from their row Map. This will
7894  // make redistribution for 2-D distributions more efficient. I
7895  // hesitate to change this check, because I'm not sure how much
7896  // the code here depends on getMap() and getRowMap() being the
7897  // same.
7898  TEUCHOS_TEST_FOR_EXCEPTION(
7899  ! destMat->getRowMap ()->isSameAs (*MyRowMap), std::invalid_argument,
7900  "Tpetra::CrsMatrix::transferAndFillComplete: The (row) Map of the "
7901  "input argument 'destMat' is not the same as the (row) Map specified "
7902  "by the input argument 'rowTransfer'.");
7903  TEUCHOS_TEST_FOR_EXCEPTION(
7904  ! destMat->checkSizes (*this), std::invalid_argument,
7905  "Tpetra::CrsMatrix::transferAndFillComplete: You provided a nonnull "
7906  "destination matrix, but checkSizes() indicates that it is not a legal "
7907  "legal target for redistribution from the source matrix (*this). This "
7908  "may mean that they do not have the same dimensions.");
7909  }
7910 
7911  // If forward mode (the default), then *this's (row) Map must be
7912  // the same as the source Map of the Transfer. If reverse mode,
7913  // then *this's (row) Map must be the same as the target Map of
7914  // the Transfer.
7915  //
7916  // FIXME (mfh 15 May 2014) At some point, we want to change graphs
7917  // and matrices so that their DistObject Map (this->getMap()) may
7918  // differ from their row Map. This will make redistribution for
7919  // 2-D distributions more efficient. I hesitate to change this
7920  // check, because I'm not sure how much the code here depends on
7921  // getMap() and getRowMap() being the same.
7922  TEUCHOS_TEST_FOR_EXCEPTION(
7923  ! (reverseMode || getRowMap ()->isSameAs (*rowTransfer.getSourceMap ())),
7924  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7925  "rowTransfer->getSourceMap() must match this->getRowMap() in forward mode.");
7926  TEUCHOS_TEST_FOR_EXCEPTION(
7927  ! (! reverseMode || getRowMap ()->isSameAs (*rowTransfer.getTargetMap ())),
7928  std::invalid_argument, "Tpetra::CrsMatrix::transferAndFillComplete: "
7929  "rowTransfer->getTargetMap() must match this->getRowMap() in reverse mode.");
7930 
7931  // checks for domainTransfer
7932  TEUCHOS_TEST_FOR_EXCEPTION(
7933  ! xferDomainAsImport.is_null() && ! xferDomainAsImport->getTargetMap()->isSameAs(*domainMap),
7934  std::invalid_argument,
7935  "Tpetra::CrsMatrix::transferAndFillComplete: The target map of the 'domainTransfer' input "
7936  "argument must be the same as the rebalanced domain map 'domainMap'");
7937 
7938  TEUCHOS_TEST_FOR_EXCEPTION(
7939  ! xferDomainAsExport.is_null() && ! xferDomainAsExport->getSourceMap()->isSameAs(*domainMap),
7940  std::invalid_argument,
7941  "Tpetra::CrsMatrix::transferAndFillComplete: The source map of the 'domainTransfer' input "
7942  "argument must be the same as the rebalanced domain map 'domainMap'");
7943 
7944  // The basic algorithm here is:
7945  //
7946  // 1. Call the moral equivalent of "Distor.do" to handle the import.
7947  // 2. Copy all the Imported and Copy/Permuted data into the raw
7948  // CrsMatrix / CrsGraphData pointers, still using GIDs.
7949  // 3. Call an optimized version of MakeColMap that avoids the
7950  // Directory lookups (since the importer knows who owns all the
7951  // GIDs) AND reindexes to LIDs.
7952  // 4. Call expertStaticFillComplete()
7953 
7954  // Get information from the Importer
7955  const size_t NumSameIDs = rowTransfer.getNumSameIDs();
7956  ArrayView<const LO> ExportLIDs = reverseMode ?
7957  rowTransfer.getRemoteLIDs () : rowTransfer.getExportLIDs ();
7958  auto RemoteLIDs = reverseMode ?
7959  rowTransfer.getExportLIDs_dv() : rowTransfer.getRemoteLIDs_dv();
7960  auto PermuteToLIDs = reverseMode ?
7961  rowTransfer.getPermuteFromLIDs_dv() : rowTransfer.getPermuteToLIDs_dv();
7962  auto PermuteFromLIDs = reverseMode ?
7963  rowTransfer.getPermuteToLIDs_dv() : rowTransfer.getPermuteFromLIDs_dv();
7964  Distributor& Distor = rowTransfer.getDistributor ();
7965 
7966  // Owning PIDs
7967  Teuchos::Array<int> SourcePids;
7968 
7969  // Temp variables for sub-communicators
7970  RCP<const map_type> ReducedRowMap, ReducedColMap,
7971  ReducedDomainMap, ReducedRangeMap;
7972  RCP<const Comm<int> > ReducedComm;
7973 
7974  // If the user gave us a null destMat, then construct the new
7975  // destination matrix. We will replace its column Map later.
7976  if (destMat.is_null ()) {
7977  destMat = rcp (new this_CRS_type (MyRowMap, 0, matrixparams));
7978  }
7979 
7980  /***************************************************/
7981  /***** 1) First communicator restriction phase ****/
7982  /***************************************************/
7983  if (restrictComm) {
7984 #ifdef HAVE_TPETRA_MMM_TIMINGS
7985  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrictComm")));
7986 #endif
7987  ReducedRowMap = MyRowMap->removeEmptyProcesses ();
7988  ReducedComm = ReducedRowMap.is_null () ?
7989  Teuchos::null :
7990  ReducedRowMap->getComm ();
7991  destMat->removeEmptyProcessesInPlace (ReducedRowMap);
7992 
7993  ReducedDomainMap = MyRowMap.getRawPtr () == MyDomainMap.getRawPtr () ?
7994  ReducedRowMap :
7995  MyDomainMap->replaceCommWithSubset (ReducedComm);
7996  ReducedRangeMap = MyRowMap.getRawPtr () == MyRangeMap.getRawPtr () ?
7997  ReducedRowMap :
7998  MyRangeMap->replaceCommWithSubset (ReducedComm);
7999 
8000  // Reset the "my" maps
8001  MyRowMap = ReducedRowMap;
8002  MyDomainMap = ReducedDomainMap;
8003  MyRangeMap = ReducedRangeMap;
8004 
8005  // Update my PID, if we've restricted the communicator
8006  if (! ReducedComm.is_null ()) {
8007  MyPID = ReducedComm->getRank ();
8008  }
8009  else {
8010  MyPID = -2; // For debugging
8011  }
8012  }
8013  else {
8014  ReducedComm = MyRowMap->getComm ();
8015  }
8016 
8017 
8018 
8019  /***************************************************/
8020  /***** 2) From Tpetra::DistObject::doTransfer() ****/
8021  /***************************************************/
8022  // Get the owning PIDs
8023  RCP<const import_type> MyImporter = getGraph ()->getImporter ();
8024 
8025  // check whether domain maps of source matrix and base domain map is the same
8026  bool bSameDomainMap = BaseDomainMap->isSameAs (*getDomainMap ());
8027 
8028  if (! restrictComm && ! MyImporter.is_null () && bSameDomainMap ) {
8029 #ifdef HAVE_TPETRA_MMM_TIMINGS
8030  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs same map")));
8031 #endif
8032  // Same domain map as source matrix
8033  //
8034  // NOTE: This won't work for restrictComm (because the Import
8035  // doesn't know the restricted PIDs), though writing an
8036  // optimized version for that case would be easy (Import an
8037  // IntVector of the new PIDs). Might want to add this later.
8038  Import_Util::getPids (*MyImporter, SourcePids, false);
8039  }
8040  else if (restrictComm && ! MyImporter.is_null () && bSameDomainMap) {
8041  // Same domain map as source matrix (restricted communicator)
8042  // We need one import from the domain to the column map
8043 #ifdef HAVE_TPETRA_MMM_TIMINGS
8044  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs restricted comm")));
8045 #endif
8046  IntVectorType SourceDomain_pids(getDomainMap (),true);
8047  IntVectorType SourceCol_pids(getColMap());
8048  // SourceDomain_pids contains the restricted pids
8049  SourceDomain_pids.putScalar(MyPID);
8050 
8051  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8052  SourcePids.resize (getColMap ()->getLocalNumElements ());
8053  SourceCol_pids.get1dCopy (SourcePids ());
8054  }
8055  else if (MyImporter.is_null ()) {
8056  // Matrix has no off-process entries
8057 #ifdef HAVE_TPETRA_MMM_TIMINGS
8058  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs all local entries")));
8059 #endif
8060  SourcePids.resize (getColMap ()->getLocalNumElements ());
8061  SourcePids.assign (getColMap ()->getLocalNumElements (), MyPID);
8062  }
8063  else if ( ! MyImporter.is_null () &&
8064  ! domainTransfer.is_null () ) {
8065  // general implementation for rectangular matrices with
8066  // domain map different than SourceMatrix domain map.
8067  // User has to provide a DomainTransfer object. We need
8068  // two communications (import/export)
8069 #ifdef HAVE_TPETRA_MMM_TIMINGS
8070  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs rectangular case")));
8071 #endif
8072 
8073  // TargetDomain_pids lives on the rebalanced new domain map
8074  IntVectorType TargetDomain_pids (domainMap);
8075  TargetDomain_pids.putScalar (MyPID);
8076 
8077  // SourceDomain_pids lives on the non-rebalanced old domain map
8078  IntVectorType SourceDomain_pids (getDomainMap ());
8079 
8080  // SourceCol_pids lives on the non-rebalanced old column map
8081  IntVectorType SourceCol_pids (getColMap ());
8082 
8083  if (! reverseMode && ! xferDomainAsImport.is_null() ) {
8084  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8085  }
8086  else if (reverseMode && ! xferDomainAsExport.is_null() ) {
8087  SourceDomain_pids.doExport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8088  }
8089  else if (! reverseMode && ! xferDomainAsExport.is_null() ) {
8090  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsExport, INSERT);
8091  }
8092  else if (reverseMode && ! xferDomainAsImport.is_null() ) {
8093  SourceDomain_pids.doImport (TargetDomain_pids, *xferDomainAsImport, INSERT);
8094  }
8095  else {
8096  TEUCHOS_TEST_FOR_EXCEPTION(
8097  true, std::logic_error, "Tpetra::CrsMatrix::"
8098  "transferAndFillComplete: Should never get here! "
8099  "Please report this bug to a Tpetra developer.");
8100  }
8101  SourceCol_pids.doImport (SourceDomain_pids, *MyImporter, INSERT);
8102  SourcePids.resize (getColMap ()->getLocalNumElements ());
8103  SourceCol_pids.get1dCopy (SourcePids ());
8104  }
8105  else if ( ! MyImporter.is_null () &&
8106  BaseDomainMap->isSameAs (*BaseRowMap) &&
8107  getDomainMap ()->isSameAs (*getRowMap ())) {
8108  // We can use the rowTransfer + SourceMatrix's Import to find out who owns what.
8109 #ifdef HAVE_TPETRA_MMM_TIMINGS
8110  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs query import")));
8111 #endif
8112 
8113  IntVectorType TargetRow_pids (domainMap);
8114  IntVectorType SourceRow_pids (getRowMap ());
8115  IntVectorType SourceCol_pids (getColMap ());
8116 
8117  TargetRow_pids.putScalar (MyPID);
8118  if (! reverseMode && xferAsImport != nullptr) {
8119  SourceRow_pids.doExport (TargetRow_pids, *xferAsImport, INSERT);
8120  }
8121  else if (reverseMode && xferAsExport != nullptr) {
8122  SourceRow_pids.doExport (TargetRow_pids, *xferAsExport, INSERT);
8123  }
8124  else if (! reverseMode && xferAsExport != nullptr) {
8125  SourceRow_pids.doImport (TargetRow_pids, *xferAsExport, INSERT);
8126  }
8127  else if (reverseMode && xferAsImport != nullptr) {
8128  SourceRow_pids.doImport (TargetRow_pids, *xferAsImport, INSERT);
8129  }
8130  else {
8131  TEUCHOS_TEST_FOR_EXCEPTION(
8132  true, std::logic_error, "Tpetra::CrsMatrix::"
8133  "transferAndFillComplete: Should never get here! "
8134  "Please report this bug to a Tpetra developer.");
8135  }
8136 
8137  SourceCol_pids.doImport (SourceRow_pids, *MyImporter, INSERT);
8138  SourcePids.resize (getColMap ()->getLocalNumElements ());
8139  SourceCol_pids.get1dCopy (SourcePids ());
8140  }
8141  else {
8142  TEUCHOS_TEST_FOR_EXCEPTION(
8143  true, std::invalid_argument, "Tpetra::CrsMatrix::"
8144  "transferAndFillComplete: This method only allows either domainMap == "
8145  "getDomainMap (), or (domainMap == rowTransfer.getTargetMap () and "
8146  "getDomainMap () == getRowMap ()).");
8147  }
8148 
8149  // Tpetra-specific stuff
8150  size_t constantNumPackets = destMat->constantNumberOfPackets ();
8151  {
8152 #ifdef HAVE_TPETRA_MMM_TIMINGS
8153  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC reallocate buffers")));
8154 #endif
8155  if (constantNumPackets == 0) {
8156  destMat->reallocArraysForNumPacketsPerLid (ExportLIDs.size (),
8157  RemoteLIDs.view_host().size ());
8158  }
8159  else {
8160  // There are a constant number of packets per element. We
8161  // already know (from the number of "remote" (incoming)
8162  // elements) how many incoming elements we expect, so we can
8163  // resize the buffer accordingly.
8164  const size_t rbufLen = RemoteLIDs.view_host().size() * constantNumPackets;
8165  destMat->reallocImportsIfNeeded (rbufLen, false, nullptr);
8166  }
8167  }
8168 
8169  // Pack & Prepare w/ owning PIDs
8170  {
8171 #ifdef HAVE_TPETRA_MMM_TIMINGS
8172  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC pack and prepare")));
8173 #endif
8174  if (debug) {
8175  using Teuchos::outArg;
8176  using Teuchos::REDUCE_MAX;
8177  using Teuchos::reduceAll;
8178  using std::cerr;
8179  using std::endl;
8180  RCP<const Teuchos::Comm<int> > comm = this->getComm ();
8181  const int myRank = comm->getRank ();
8182 
8183  std::ostringstream errStrm;
8184  int lclErr = 0;
8185  int gblErr = 0;
8186 
8187  Teuchos::ArrayView<size_t> numExportPacketsPerLID;
8188  try {
8189  // packAndPrepare* methods modify numExportPacketsPerLID_.
8190  destMat->numExportPacketsPerLID_.modify_host ();
8191  numExportPacketsPerLID =
8192  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8193  }
8194  catch (std::exception& e) {
8195  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw: "
8196  << e.what () << std::endl;
8197  lclErr = 1;
8198  }
8199  catch (...) {
8200  errStrm << "Proc " << myRank << ": getArrayViewFromDualView threw "
8201  "an exception not a subclass of std::exception" << std::endl;
8202  lclErr = 1;
8203  }
8204 
8205  if (! comm.is_null ()) {
8206  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8207  }
8208  if (gblErr != 0) {
8209  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8210  TEUCHOS_TEST_FOR_EXCEPTION(
8211  true, std::runtime_error, "getArrayViewFromDualView threw an "
8212  "exception on at least one process.");
8213  }
8214 
8215  if (verbose) {
8216  std::ostringstream os;
8217  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8218  << std::endl;
8219  std::cerr << os.str ();
8220  }
8221  try {
8223  destMat->exports_,
8224  numExportPacketsPerLID,
8225  ExportLIDs,
8226  SourcePids,
8227  constantNumPackets);
8228  }
8229  catch (std::exception& e) {
8230  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw: "
8231  << e.what () << std::endl;
8232  lclErr = 1;
8233  }
8234  catch (...) {
8235  errStrm << "Proc " << myRank << ": packCrsMatrixWithOwningPIDs threw "
8236  "an exception not a subclass of std::exception" << std::endl;
8237  lclErr = 1;
8238  }
8239 
8240  if (verbose) {
8241  std::ostringstream os;
8242  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8243  << std::endl;
8244  std::cerr << os.str ();
8245  }
8246 
8247  if (! comm.is_null ()) {
8248  reduceAll<int, int> (*comm, REDUCE_MAX, lclErr, outArg (gblErr));
8249  }
8250  if (gblErr != 0) {
8251  ::Tpetra::Details::gathervPrint (cerr, errStrm.str (), *comm);
8252  TEUCHOS_TEST_FOR_EXCEPTION(
8253  true, std::runtime_error, "packCrsMatrixWithOwningPIDs threw an "
8254  "exception on at least one process.");
8255  }
8256  }
8257  else {
8258  // packAndPrepare* methods modify numExportPacketsPerLID_.
8259  destMat->numExportPacketsPerLID_.modify_host ();
8260  Teuchos::ArrayView<size_t> numExportPacketsPerLID =
8261  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8262  if (verbose) {
8263  std::ostringstream os;
8264  os << *verbosePrefix << "Calling packCrsMatrixWithOwningPIDs"
8265  << std::endl;
8266  std::cerr << os.str ();
8267  }
8269  destMat->exports_,
8270  numExportPacketsPerLID,
8271  ExportLIDs,
8272  SourcePids,
8273  constantNumPackets);
8274  if (verbose) {
8275  std::ostringstream os;
8276  os << *verbosePrefix << "Done with packCrsMatrixWithOwningPIDs"
8277  << std::endl;
8278  std::cerr << os.str ();
8279  }
8280  }
8281  }
8282 
8283  // Do the exchange of remote data.
8284  {
8285 #ifdef HAVE_TPETRA_MMM_TIMINGS
8286  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC getOwningPIDs exchange remote data")));
8287 #endif
8288  if (! communication_needed) {
8289  if (verbose) {
8290  std::ostringstream os;
8291  os << *verbosePrefix << "Communication not needed" << std::endl;
8292  std::cerr << os.str ();
8293  }
8294  }
8295  else {
8296  if (reverseMode) {
8297  if (constantNumPackets == 0) { // variable number of packets per LID
8298  if (verbose) {
8299  std::ostringstream os;
8300  os << *verbosePrefix << "Reverse mode, variable # packets / LID"
8301  << std::endl;
8302  std::cerr << os.str ();
8303  }
8304  // Make sure that host has the latest version, since we're
8305  // using the version on host. If host has the latest
8306  // version, syncing to host does nothing.
8307  destMat->numExportPacketsPerLID_.sync_host ();
8308  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8309  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8310  destMat->numImportPacketsPerLID_.sync_host ();
8311  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8312  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8313 
8314  if (verbose) {
8315  std::ostringstream os;
8316  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8317  << std::endl;
8318  std::cerr << os.str ();
8319  }
8320  Distor.doReversePostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8321  destMat->numImportPacketsPerLID_.view_host());
8322  if (verbose) {
8323  std::ostringstream os;
8324  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8325  << std::endl;
8326  std::cerr << os.str ();
8327  }
8328 
8329  size_t totalImportPackets = 0;
8330  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8331  totalImportPackets += numImportPacketsPerLID[i];
8332  }
8333 
8334  // Reallocation MUST go before setting the modified flag,
8335  // because it may clear out the flags.
8336  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8337  verbosePrefix.get ());
8338  destMat->imports_.modify_host ();
8339  auto hostImports = destMat->imports_.view_host();
8340  // This is a legacy host pack/unpack path, so use the host
8341  // version of exports_.
8342  destMat->exports_.sync_host ();
8343  auto hostExports = destMat->exports_.view_host();
8344  if (verbose) {
8345  std::ostringstream os;
8346  os << *verbosePrefix << "Calling 4-arg doReversePostsAndWaits"
8347  << std::endl;
8348  std::cerr << os.str ();
8349  }
8350  Distor.doReversePostsAndWaits (hostExports,
8351  numExportPacketsPerLID,
8352  hostImports,
8353  numImportPacketsPerLID);
8354  if (verbose) {
8355  std::ostringstream os;
8356  os << *verbosePrefix << "Finished 4-arg doReversePostsAndWaits"
8357  << std::endl;
8358  std::cerr << os.str ();
8359  }
8360  }
8361  else { // constant number of packets per LID
8362  if (verbose) {
8363  std::ostringstream os;
8364  os << *verbosePrefix << "Reverse mode, constant # packets / LID"
8365  << std::endl;
8366  std::cerr << os.str ();
8367  }
8368  destMat->imports_.modify_host ();
8369  auto hostImports = destMat->imports_.view_host();
8370  // This is a legacy host pack/unpack path, so use the host
8371  // version of exports_.
8372  destMat->exports_.sync_host ();
8373  auto hostExports = destMat->exports_.view_host();
8374  if (verbose) {
8375  std::ostringstream os;
8376  os << *verbosePrefix << "Calling 3-arg doReversePostsAndWaits"
8377  << std::endl;
8378  std::cerr << os.str ();
8379  }
8380  Distor.doReversePostsAndWaits (hostExports,
8381  constantNumPackets,
8382  hostImports);
8383  if (verbose) {
8384  std::ostringstream os;
8385  os << *verbosePrefix << "Finished 3-arg doReversePostsAndWaits"
8386  << std::endl;
8387  std::cerr << os.str ();
8388  }
8389  }
8390  }
8391  else { // forward mode (the default)
8392  if (constantNumPackets == 0) { // variable number of packets per LID
8393  if (verbose) {
8394  std::ostringstream os;
8395  os << *verbosePrefix << "Forward mode, variable # packets / LID"
8396  << std::endl;
8397  std::cerr << os.str ();
8398  }
8399  // Make sure that host has the latest version, since we're
8400  // using the version on host. If host has the latest
8401  // version, syncing to host does nothing.
8402  destMat->numExportPacketsPerLID_.sync_host ();
8403  Teuchos::ArrayView<const size_t> numExportPacketsPerLID =
8404  getArrayViewFromDualView (destMat->numExportPacketsPerLID_);
8405  destMat->numImportPacketsPerLID_.sync_host ();
8406  Teuchos::ArrayView<size_t> numImportPacketsPerLID =
8407  getArrayViewFromDualView (destMat->numImportPacketsPerLID_);
8408  if (verbose) {
8409  std::ostringstream os;
8410  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8411  << std::endl;
8412  std::cerr << os.str ();
8413  }
8414  Distor.doPostsAndWaits(destMat->numExportPacketsPerLID_.view_host(), 1,
8415  destMat->numImportPacketsPerLID_.view_host());
8416  if (verbose) {
8417  std::ostringstream os;
8418  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8419  << std::endl;
8420  std::cerr << os.str ();
8421  }
8422 
8423  size_t totalImportPackets = 0;
8424  for (Array_size_type i = 0; i < numImportPacketsPerLID.size (); ++i) {
8425  totalImportPackets += numImportPacketsPerLID[i];
8426  }
8427 
8428  // Reallocation MUST go before setting the modified flag,
8429  // because it may clear out the flags.
8430  destMat->reallocImportsIfNeeded (totalImportPackets, verbose,
8431  verbosePrefix.get ());
8432  destMat->imports_.modify_host ();
8433  auto hostImports = destMat->imports_.view_host();
8434  // This is a legacy host pack/unpack path, so use the host
8435  // version of exports_.
8436  destMat->exports_.sync_host ();
8437  auto hostExports = destMat->exports_.view_host();
8438  if (verbose) {
8439  std::ostringstream os;
8440  os << *verbosePrefix << "Calling 4-arg doPostsAndWaits"
8441  << std::endl;
8442  std::cerr << os.str ();
8443  }
8444  Distor.doPostsAndWaits (hostExports,
8445  numExportPacketsPerLID,
8446  hostImports,
8447  numImportPacketsPerLID);
8448  if (verbose) {
8449  std::ostringstream os;
8450  os << *verbosePrefix << "Finished 4-arg doPostsAndWaits"
8451  << std::endl;
8452  std::cerr << os.str ();
8453  }
8454  }
8455  else { // constant number of packets per LID
8456  if (verbose) {
8457  std::ostringstream os;
8458  os << *verbosePrefix << "Forward mode, constant # packets / LID"
8459  << std::endl;
8460  std::cerr << os.str ();
8461  }
8462  destMat->imports_.modify_host ();
8463  auto hostImports = destMat->imports_.view_host();
8464  // This is a legacy host pack/unpack path, so use the host
8465  // version of exports_.
8466  destMat->exports_.sync_host ();
8467  auto hostExports = destMat->exports_.view_host();
8468  if (verbose) {
8469  std::ostringstream os;
8470  os << *verbosePrefix << "Calling 3-arg doPostsAndWaits"
8471  << std::endl;
8472  std::cerr << os.str ();
8473  }
8474  Distor.doPostsAndWaits (hostExports,
8475  constantNumPackets,
8476  hostImports);
8477  if (verbose) {
8478  std::ostringstream os;
8479  os << *verbosePrefix << "Finished 3-arg doPostsAndWaits"
8480  << std::endl;
8481  std::cerr << os.str ();
8482  }
8483  }
8484  }
8485  }
8486  }
8487 
8488  /*********************************************************************/
8489  /**** 3) Copy all of the Same/Permute/Remote data into CSR_arrays ****/
8490  /*********************************************************************/
8491 
8492  bool runOnHost = std::is_same_v<typename device_type::memory_space, Kokkos::HostSpace> && !useKokkosPath;
8493 
8494  Teuchos::Array<int> RemotePids;
8495  if (runOnHost) {
8496  Teuchos::Array<int> TargetPids;
8497  // Backwards compatibility measure. We'll use this again below.
8498 
8499  // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been)
8500  // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8501  // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8502  destMat->numImportPacketsPerLID_.modify_host(); //FIXME
8503 
8504 # ifdef HAVE_TPETRA_MMM_TIMINGS
8505  RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8506 # endif
8507  ArrayRCP<size_t> CSR_rowptr;
8508  ArrayRCP<GO> CSR_colind_GID;
8509  ArrayRCP<LO> CSR_colind_LID;
8510  ArrayRCP<Scalar> CSR_vals;
8511 
8512  destMat->imports_.sync_device ();
8513  destMat->numImportPacketsPerLID_.sync_device ();
8514 
8515  size_t N = BaseRowMap->getLocalNumElements ();
8516 
8517  auto RemoteLIDs_d = RemoteLIDs.view_device();
8518  auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8519  auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8520 
8522  *this,
8523  RemoteLIDs_d,
8524  destMat->imports_.view_device(), //hostImports
8525  destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
8526  NumSameIDs,
8527  PermuteToLIDs_d,
8528  PermuteFromLIDs_d,
8529  N,
8530  MyPID,
8531  CSR_rowptr,
8532  CSR_colind_GID,
8533  CSR_vals,
8534  SourcePids(),
8535  TargetPids);
8536 
8537  // If LO and GO are the same, we can reuse memory when
8538  // converting the column indices from global to local indices.
8539  if (typeid (LO) == typeid (GO)) {
8540  CSR_colind_LID = Teuchos::arcp_reinterpret_cast<LO> (CSR_colind_GID);
8541  }
8542  else {
8543  CSR_colind_LID.resize (CSR_colind_GID.size());
8544  }
8545  CSR_colind_LID.resize (CSR_colind_GID.size());
8546 
8547  // On return from unpackAndCombineIntoCrsArrays TargetPids[i] == -1 for locally
8548  // owned entries. Convert them to the actual PID.
8549  // JHU FIXME This can be done within unpackAndCombineIntoCrsArrays with a parallel_for.
8550  for(size_t i=0; i<static_cast<size_t>(TargetPids.size()); i++)
8551  {
8552  if(TargetPids[i] == -1) TargetPids[i] = MyPID;
8553  }
8554 #ifdef HAVE_TPETRA_MMM_TIMINGS
8555  tmCopySPRdata = Teuchos::null;
8556 #endif
8557  /**************************************************************/
8558  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8559  /**************************************************************/
8560  // Call an optimized version of makeColMap that avoids the
8561  // Directory lookups (since the Import object knows who owns all
8562  // the GIDs).
8563  if (verbose) {
8564  std::ostringstream os;
8565  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8566  << std::endl;
8567  std::cerr << os.str ();
8568  }
8569  {
8570 #ifdef HAVE_TPETRA_MMM_TIMINGS
8571  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8572 #endif
8573  Import_Util::lowCommunicationMakeColMapAndReindexSerial(CSR_rowptr (),
8574  CSR_colind_LID (),
8575  CSR_colind_GID (),
8576  BaseDomainMap,
8577  TargetPids,
8578  RemotePids,
8579  MyColMap);
8580  }
8581 
8582  if (verbose) {
8583  std::ostringstream os;
8584  os << *verbosePrefix << "restrictComm="
8585  << (restrictComm ? "true" : "false") << std::endl;
8586  std::cerr << os.str ();
8587  }
8588 
8589  /*******************************************************/
8590  /**** 4) Second communicator restriction phase ****/
8591  /*******************************************************/
8592  {
8593 #ifdef HAVE_TPETRA_MMM_TIMINGS
8594  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8595 #endif
8596  if (restrictComm) {
8597  ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
8598  ReducedRowMap :
8599  MyColMap->replaceCommWithSubset (ReducedComm);
8600  MyColMap = ReducedColMap; // Reset the "my" maps
8601  }
8602 
8603  // Replace the col map
8604  if (verbose) {
8605  std::ostringstream os;
8606  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8607  std::cerr << os.str ();
8608  }
8609  destMat->replaceColMap (MyColMap);
8610 
8611  // Short circuit if the processor is no longer in the communicator
8612  //
8613  // NOTE: Epetra modifies all "removed" processes so they
8614  // have a dummy (serial) Map that doesn't touch the original
8615  // communicator. Duplicating that here might be a good idea.
8616  if (ReducedComm.is_null ()) {
8617  if (verbose) {
8618  std::ostringstream os;
8619  os << *verbosePrefix << "I am no longer in the communicator; "
8620  "returning" << std::endl;
8621  std::cerr << os.str ();
8622  }
8623  return;
8624  }
8625  }
8626 
8627  /***************************************************/
8628  /**** 5) Sort ****/
8629  /***************************************************/
8630  if ((! reverseMode && xferAsImport != nullptr) ||
8631  (reverseMode && xferAsExport != nullptr)) {
8632  if (verbose) {
8633  std::ostringstream os;
8634  os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8635  std::cerr << os.str ();
8636  }
8637 #ifdef HAVE_TPETRA_MMM_TIMINGS
8638  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8639 #endif
8640  Import_Util::sortCrsEntries (CSR_rowptr(),
8641  CSR_colind_LID(),
8642  CSR_vals());
8643  }
8644  else if ((! reverseMode && xferAsExport != nullptr) ||
8645  (reverseMode && xferAsImport != nullptr)) {
8646  if (verbose) {
8647  std::ostringstream os;
8648  os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8649  << endl;
8650  std::cerr << os.str();
8651  }
8652 #ifdef HAVE_TPETRA_MMM_TIMINGS
8653  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8654 #endif
8655  Import_Util::sortAndMergeCrsEntries (CSR_rowptr(),
8656  CSR_colind_LID(),
8657  CSR_vals());
8658  if (CSR_rowptr[N] != static_cast<size_t>(CSR_vals.size())) {
8659  CSR_colind_LID.resize (CSR_rowptr[N]);
8660  CSR_vals.resize (CSR_rowptr[N]);
8661  }
8662  }
8663  else {
8664  TEUCHOS_TEST_FOR_EXCEPTION(
8665  true, std::logic_error, "Tpetra::CrsMatrix::"
8666  "transferAndFillComplete: Should never get here! "
8667  "Please report this bug to a Tpetra developer.");
8668  }
8669  /***************************************************/
8670  /**** 6) Reset the colmap and the arrays ****/
8671  /***************************************************/
8672 
8673  if (verbose) {
8674  std::ostringstream os;
8675  os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8676  std::cerr << os.str ();
8677  }
8678 
8679  // Call constructor for the new matrix (restricted as needed)
8680  //
8681  // NOTE (mfh 15 May 2014) This should work fine for the Kokkos
8682  // refactor version of CrsMatrix, though it reserves the right to
8683  // make a deep copy of the arrays.
8684  {
8685 #ifdef HAVE_TPETRA_MMM_TIMINGS
8686  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8687 #endif
8688  destMat->setAllValues (CSR_rowptr, CSR_colind_LID, CSR_vals);
8689  }
8690 
8691  } else {
8692  // run on device
8693 
8694 
8695  // Backwards compatibility measure. We'll use this again below.
8696 
8697  // TODO JHU Need to track down why numImportPacketsPerLID_ has not been correctly marked as modified on host (which it has been)
8698  // TODO JHU somewhere above, e.g., call to Distor.doPostsAndWaits().
8699  // TODO JHU This only becomes apparent as we begin to convert TAFC to run on device.
8700  destMat->numImportPacketsPerLID_.modify_host(); //FIXME
8701 
8702 # ifdef HAVE_TPETRA_MMM_TIMINGS
8703  RCP<TimeMonitor> tmCopySPRdata = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC unpack-count-resize + copy same-perm-remote data"))));
8704 # endif
8705  ArrayRCP<size_t> CSR_rowptr;
8706  ArrayRCP<GO> CSR_colind_GID;
8707  ArrayRCP<LO> CSR_colind_LID;
8708  ArrayRCP<Scalar> CSR_vals;
8709 
8710  destMat->imports_.sync_device ();
8711  destMat->numImportPacketsPerLID_.sync_device ();
8712 
8713  size_t N = BaseRowMap->getLocalNumElements ();
8714 
8715  auto RemoteLIDs_d = RemoteLIDs.view_device();
8716  auto PermuteToLIDs_d = PermuteToLIDs.view_device();
8717  auto PermuteFromLIDs_d = PermuteFromLIDs.view_device();
8718 
8719  Kokkos::View<size_t*,device_type> CSR_rowptr_d;
8720  Kokkos::View<GO*,device_type> CSR_colind_GID_d;
8721  Kokkos::View<LO*,device_type> CSR_colind_LID_d;
8722  Kokkos::View<impl_scalar_type*,device_type> CSR_vals_d;
8723  Kokkos::View<int*,device_type> TargetPids_d;
8724 
8726  *this,
8727  RemoteLIDs_d,
8728  destMat->imports_.view_device(), //hostImports
8729  destMat->numImportPacketsPerLID_.view_device(), //numImportPacketsPerLID
8730  NumSameIDs,
8731  PermuteToLIDs_d,
8732  PermuteFromLIDs_d,
8733  N,
8734  MyPID,
8735  CSR_rowptr_d,
8736  CSR_colind_GID_d,
8737  CSR_vals_d,
8738  SourcePids(),
8739  TargetPids_d);
8740 
8741  Kokkos::resize (CSR_colind_LID_d, CSR_colind_GID_d.size());
8742 
8743 #ifdef HAVE_TPETRA_MMM_TIMINGS
8744  tmCopySPRdata = Teuchos::null;
8745 #endif
8746  /**************************************************************/
8747  /**** 4) Call Optimized MakeColMap w/ no Directory Lookups ****/
8748  /**************************************************************/
8749  // Call an optimized version of makeColMap that avoids the
8750  // Directory lookups (since the Import object knows who owns all
8751  // the GIDs).
8752  if (verbose) {
8753  std::ostringstream os;
8754  os << *verbosePrefix << "Calling lowCommunicationMakeColMapAndReindex"
8755  << std::endl;
8756  std::cerr << os.str ();
8757  }
8758  {
8759 #ifdef HAVE_TPETRA_MMM_TIMINGS
8760  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC makeColMap")));
8761 #endif
8762  Import_Util::lowCommunicationMakeColMapAndReindex(CSR_rowptr_d,
8763  CSR_colind_LID_d,
8764  CSR_colind_GID_d,
8765  BaseDomainMap,
8766  TargetPids_d,
8767  RemotePids,
8768  MyColMap);
8769  }
8770 
8771  if (verbose) {
8772  std::ostringstream os;
8773  os << *verbosePrefix << "restrictComm="
8774  << (restrictComm ? "true" : "false") << std::endl;
8775  std::cerr << os.str ();
8776  }
8777 
8778  /*******************************************************/
8779  /**** 4) Second communicator restriction phase ****/
8780  /*******************************************************/
8781  {
8782 #ifdef HAVE_TPETRA_MMM_TIMINGS
8783  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC restrict colmap")));
8784 #endif
8785  if (restrictComm) {
8786  ReducedColMap = (MyRowMap.getRawPtr () == MyColMap.getRawPtr ()) ?
8787  ReducedRowMap :
8788  MyColMap->replaceCommWithSubset (ReducedComm);
8789  MyColMap = ReducedColMap; // Reset the "my" maps
8790  }
8791 
8792  // Replace the col map
8793  if (verbose) {
8794  std::ostringstream os;
8795  os << *verbosePrefix << "Calling replaceColMap" << std::endl;
8796  std::cerr << os.str ();
8797  }
8798  destMat->replaceColMap (MyColMap);
8799 
8800  // Short circuit if the processor is no longer in the communicator
8801  //
8802  // NOTE: Epetra modifies all "removed" processes so they
8803  // have a dummy (serial) Map that doesn't touch the original
8804  // communicator. Duplicating that here might be a good idea.
8805  if (ReducedComm.is_null ()) {
8806  if (verbose) {
8807  std::ostringstream os;
8808  os << *verbosePrefix << "I am no longer in the communicator; "
8809  "returning" << std::endl;
8810  std::cerr << os.str ();
8811  }
8812  return;
8813  }
8814  }
8815 
8816  /***************************************************/
8817  /**** 5) Sort ****/
8818  /***************************************************/
8819 
8820  if ((! reverseMode && xferAsImport != nullptr) ||
8821  (reverseMode && xferAsExport != nullptr)) {
8822  if (verbose) {
8823  std::ostringstream os;
8824  os << *verbosePrefix << "Calling sortCrsEntries" << endl;
8825  std::cerr << os.str ();
8826  }
8827 #ifdef HAVE_TPETRA_MMM_TIMINGS
8828  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortCrsEntries")));
8829 #endif
8830  Import_Util::sortCrsEntries (CSR_rowptr_d,
8831  CSR_colind_LID_d,
8832  CSR_vals_d);
8833  }
8834  else if ((! reverseMode && xferAsExport != nullptr) ||
8835  (reverseMode && xferAsImport != nullptr)) {
8836  if (verbose) {
8837  std::ostringstream os;
8838  os << *verbosePrefix << "Calling sortAndMergeCrsEntries"
8839  << endl;
8840  std::cerr << os.str();
8841  }
8842 #ifdef HAVE_TPETRA_MMM_TIMINGS
8843  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC sortAndMergeCrsEntries")));
8844 #endif
8845  Import_Util::sortAndMergeCrsEntries (CSR_rowptr_d,
8846  CSR_colind_LID_d,
8847  CSR_vals_d);
8848  }
8849  else {
8850  TEUCHOS_TEST_FOR_EXCEPTION(
8851  true, std::logic_error, "Tpetra::CrsMatrix::"
8852  "transferAndFillComplete: Should never get here! "
8853  "Please report this bug to a Tpetra developer.");
8854  }
8855 
8856  /***************************************************/
8857  /**** 6) Reset the colmap and the arrays ****/
8858  /***************************************************/
8859 
8860  if (verbose) {
8861  std::ostringstream os;
8862  os << *verbosePrefix << "Calling destMat->setAllValues" << endl;
8863  std::cerr << os.str ();
8864  }
8865 
8866  {
8867 #ifdef HAVE_TPETRA_MMM_TIMINGS
8868  Teuchos::TimeMonitor MMrc(*TimeMonitor::getNewTimer(prefix + std::string("TAFC setAllValues")));
8869 #endif
8870  destMat->setAllValues (CSR_rowptr_d, CSR_colind_LID_d, CSR_vals_d);
8871  }
8872 
8873  } //if (runOnHost) .. else ..
8874 
8875  /***************************************************/
8876  /**** 7) Build Importer & Call ESFC ****/
8877  /***************************************************/
8878 #ifdef HAVE_TPETRA_MMM_TIMINGS
8879  RCP<TimeMonitor> tmIESFC = rcp(new TimeMonitor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC build importer and esfc"))));
8880 #endif
8881  // Pre-build the importer using the existing PIDs
8882  Teuchos::ParameterList esfc_params;
8883 
8884  RCP<import_type> MyImport;
8885 
8886  // Fulfill the non-blocking allreduce on reduced_mismatch.
8887  if (iallreduceRequest.get () != nullptr) {
8888  if (verbose) {
8889  std::ostringstream os;
8890  os << *verbosePrefix << "Calling iallreduceRequest->wait()"
8891  << endl;
8892  std::cerr << os.str ();
8893  }
8894  iallreduceRequest->wait ();
8895  if (reduced_mismatch != 0) {
8896  isMM = false;
8897  }
8898  }
8899 
8900  if( isMM ) {
8901 #ifdef HAVE_TPETRA_MMM_TIMINGS
8902  Teuchos::TimeMonitor MMisMM (*TimeMonitor::getNewTimer(prefix + std::string("isMM Block")));
8903 #endif
8904  // Combine all type1/2/3 lists, [filter them], then call the expert import constructor.
8905 
8906  if (verbose) {
8907  std::ostringstream os;
8908  os << *verbosePrefix << "Getting CRS pointers" << endl;
8909  std::cerr << os.str ();
8910  }
8911 
8912  Teuchos::ArrayRCP<LocalOrdinal> type3LIDs;
8913  Teuchos::ArrayRCP<int> type3PIDs;
8914  auto rowptr = getCrsGraph()->getLocalRowPtrsHost();
8915  auto colind = getCrsGraph()->getLocalIndicesHost();
8916 
8917  if (verbose) {
8918  std::ostringstream os;
8919  os << *verbosePrefix << "Calling reverseNeighborDiscovery" << std::endl;
8920  std::cerr << os.str ();
8921  }
8922 
8923  {
8924 #ifdef HAVE_TPETRA_MMM_TIMINGS
8925  TimeMonitor tm_rnd (*TimeMonitor::getNewTimer(prefix + std::string("isMMrevNeighDis")));
8926 #endif
8927  Import_Util::reverseNeighborDiscovery(*this,
8928  rowptr,
8929  colind,
8930  rowTransfer,
8931  MyImporter,
8932  MyDomainMap,
8933  type3PIDs,
8934  type3LIDs,
8935  ReducedComm);
8936  }
8937 
8938  if (verbose) {
8939  std::ostringstream os;
8940  os << *verbosePrefix << "Done with reverseNeighborDiscovery" << std::endl;
8941  std::cerr << os.str ();
8942  }
8943 
8944  Teuchos::ArrayView<const int> EPID1 = MyImporter.is_null() ? Teuchos::ArrayView<const int>() : MyImporter->getExportPIDs();
8945  Teuchos::ArrayView<const LO> ELID1 = MyImporter.is_null() ? Teuchos::ArrayView<const LO>() : MyImporter->getExportLIDs();
8946 
8947  Teuchos::ArrayView<const int> TEPID2 = rowTransfer.getExportPIDs(); // row matrix
8948  Teuchos::ArrayView<const LO> TELID2 = rowTransfer.getExportLIDs();
8949 
8950  const int numCols = getGraph()->getColMap()->getLocalNumElements(); // may be dup
8951  // from EpetraExt_MMHelpers.cpp: build_type2_exports
8952  std::vector<bool> IsOwned(numCols,true);
8953  std::vector<int> SentTo(numCols,-1);
8954  if (! MyImporter.is_null ()) {
8955  for (auto && rlid : MyImporter->getRemoteLIDs()) { // the remoteLIDs must be from sourcematrix
8956  IsOwned[rlid]=false;
8957  }
8958  }
8959 
8960  std::vector<std::pair<int,GO> > usrtg;
8961  usrtg.reserve(TEPID2.size());
8962 
8963  {
8964  const auto& colMap = * (this->getColMap ()); // *this is sourcematrix
8965  for (Array_size_type i = 0; i < TEPID2.size (); ++i) {
8966  const LO row = TELID2[i];
8967  const int pid = TEPID2[i];
8968  for (auto j = rowptr[row]; j < rowptr[row+1]; ++j) {
8969  const int col = colind[j];
8970  if (IsOwned[col] && SentTo[col] != pid) {
8971  SentTo[col] = pid;
8972  GO gid = colMap.getGlobalElement (col);
8973  usrtg.push_back (std::pair<int,GO> (pid, gid));
8974  }
8975  }
8976  }
8977  }
8978 
8979 // This sort can _not_ be omitted.
8980  std::sort(usrtg.begin(),usrtg.end()); // default comparator does the right thing, now sorted in gid order
8981  auto eopg = std ::unique(usrtg.begin(),usrtg.end());
8982  // 25 Jul 2018: Could just ignore the entries at and after eopg.
8983  usrtg.erase(eopg,usrtg.end());
8984 
8985  const Array_size_type type2_us_size = usrtg.size();
8986  Teuchos::ArrayRCP<int> EPID2=Teuchos::arcp(new int[type2_us_size],0,type2_us_size,true);
8987  Teuchos::ArrayRCP< LO> ELID2=Teuchos::arcp(new LO[type2_us_size],0,type2_us_size,true);
8988 
8989  int pos=0;
8990  for(auto && p : usrtg) {
8991  EPID2[pos]= p.first;
8992  ELID2[pos]= this->getDomainMap()->getLocalElement(p.second);
8993  pos++;
8994  }
8995 
8996  Teuchos::ArrayView<int> EPID3 = type3PIDs();
8997  Teuchos::ArrayView< LO> ELID3 = type3LIDs();
8998  GO InfGID = std::numeric_limits<GO>::max();
8999  int InfPID = INT_MAX;
9000 #ifdef TPETRA_MIN3
9001 # undef TPETRA_MIN3
9002 #endif // TPETRA_MIN3
9003 #define TPETRA_MIN3(x,y,z) ((x)<(y)?(std::min(x,z)):(std::min(y,z)))
9004  int i1=0, i2=0, i3=0;
9005  int Len1 = EPID1.size();
9006  int Len2 = EPID2.size();
9007  int Len3 = EPID3.size();
9008 
9009  int MyLen=Len1+Len2+Len3;
9010  Teuchos::ArrayRCP<LO> userExportLIDs = Teuchos::arcp(new LO[MyLen],0,MyLen,true);
9011  Teuchos::ArrayRCP<int> userExportPIDs = Teuchos::arcp(new int[MyLen],0,MyLen,true);
9012  int iloc = 0; // will be the size of the userExportLID/PIDs
9013 
9014  while(i1 < Len1 || i2 < Len2 || i3 < Len3){
9015  int PID1 = (i1<Len1)?(EPID1[i1]):InfPID;
9016  int PID2 = (i2<Len2)?(EPID2[i2]):InfPID;
9017  int PID3 = (i3<Len3)?(EPID3[i3]):InfPID;
9018 
9019  GO GID1 = (i1<Len1)?getDomainMap()->getGlobalElement(ELID1[i1]):InfGID;
9020  GO GID2 = (i2<Len2)?getDomainMap()->getGlobalElement(ELID2[i2]):InfGID;
9021  GO GID3 = (i3<Len3)?getDomainMap()->getGlobalElement(ELID3[i3]):InfGID;
9022 
9023  int MIN_PID = TPETRA_MIN3(PID1,PID2,PID3);
9024  GO MIN_GID = TPETRA_MIN3( ((PID1==MIN_PID)?GID1:InfGID), ((PID2==MIN_PID)?GID2:InfGID), ((PID3==MIN_PID)?GID3:InfGID));
9025 #ifdef TPETRA_MIN3
9026 # undef TPETRA_MIN3
9027 #endif // TPETRA_MIN3
9028  bool added_entry=false;
9029 
9030  if(PID1 == MIN_PID && GID1 == MIN_GID){
9031  userExportLIDs[iloc]=ELID1[i1];
9032  userExportPIDs[iloc]=EPID1[i1];
9033  i1++;
9034  added_entry=true;
9035  iloc++;
9036  }
9037  if(PID2 == MIN_PID && GID2 == MIN_GID){
9038  if(!added_entry) {
9039  userExportLIDs[iloc]=ELID2[i2];
9040  userExportPIDs[iloc]=EPID2[i2];
9041  added_entry=true;
9042  iloc++;
9043  }
9044  i2++;
9045  }
9046  if(PID3 == MIN_PID && GID3 == MIN_GID){
9047  if(!added_entry) {
9048  userExportLIDs[iloc]=ELID3[i3];
9049  userExportPIDs[iloc]=EPID3[i3];
9050  iloc++;
9051  }
9052  i3++;
9053  }
9054  }
9055 
9056  if (verbose) {
9057  std::ostringstream os;
9058  os << *verbosePrefix << "Create Import" << std::endl;
9059  std::cerr << os.str ();
9060  }
9061 
9062 #ifdef HAVE_TPETRA_MMM_TIMINGS
9063  auto ismmIctor(*TimeMonitor::getNewTimer(prefix + std::string("isMMIportCtor")));
9064 #endif
9065  Teuchos::RCP<Teuchos::ParameterList> plist = rcp(new Teuchos::ParameterList());
9066  // 25 Jul 2018: Test for equality with the non-isMM path's Import object.
9067  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9068  MyImport = rcp ( new import_type (MyDomainMap,
9069  MyColMap,
9070  RemotePids,
9071  userExportLIDs.view(0,iloc).getConst(),
9072  userExportPIDs.view(0,iloc).getConst(),
9073  plist)
9074  );
9075 
9076  if (verbose) {
9077  std::ostringstream os;
9078  os << *verbosePrefix << "Call expertStaticFillComplete" << std::endl;
9079  std::cerr << os.str ();
9080  }
9081 
9082  {
9083 #ifdef HAVE_TPETRA_MMM_TIMINGS
9084  TimeMonitor esfc (*TimeMonitor::getNewTimer(prefix + std::string("isMM::destMat->eSFC")));
9085  esfc_params.set("Timer Label",label+std::string("isMM eSFC"));
9086 #endif
9087  if(!params.is_null())
9088  esfc_params.set("compute global constants",params->get("compute global constants",true));
9089  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap, MyImport,Teuchos::null,rcp(new Teuchos::ParameterList(esfc_params)));
9090 
9091  }
9092 
9093  } // if(isMM)
9094  else {
9095 #ifdef HAVE_TPETRA_MMM_TIMINGS
9096  TimeMonitor MMnotMMblock (*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMblock")));
9097 #endif
9098  if (verbose) {
9099  std::ostringstream os;
9100  os << *verbosePrefix << "Create Import" << std::endl;
9101  std::cerr << os.str ();
9102  }
9103 
9104 #ifdef HAVE_TPETRA_MMM_TIMINGS
9105  TimeMonitor notMMIcTor(*TimeMonitor::getNewTimer(prefix + std::string("TAFC notMMCreateImporter")));
9106 #endif
9107  Teuchos::RCP<Teuchos::ParameterList> mypars = rcp(new Teuchos::ParameterList);
9108  mypars->set("Timer Label","notMMFrom_tAFC");
9109  if ((MyDomainMap != MyColMap) && (!MyDomainMap->isSameAs(*MyColMap)))
9110  MyImport = rcp (new import_type (MyDomainMap, MyColMap, RemotePids, mypars));
9111 
9112  if (verbose) {
9113  std::ostringstream os;
9114  os << *verbosePrefix << "Call expertStaticFillComplete" << endl;
9115  std::cerr << os.str ();
9116  }
9117 
9118 #ifdef HAVE_TPETRA_MMM_TIMINGS
9119  TimeMonitor esfcnotmm(*TimeMonitor::getNewTimer(prefix + std::string("notMMdestMat->expertStaticFillComplete")));
9120  esfc_params.set("Timer Label",prefix+std::string("notMM eSFC"));
9121 #else
9122  esfc_params.set("Timer Label",std::string("notMM eSFC"));
9123 #endif
9124 
9125  if (!params.is_null ()) {
9126  esfc_params.set ("compute global constants",
9127  params->get ("compute global constants", true));
9128  }
9129  destMat->expertStaticFillComplete (MyDomainMap, MyRangeMap,
9130  MyImport, Teuchos::null,
9131  rcp (new Teuchos::ParameterList (esfc_params)));
9132  }
9133 
9134 #ifdef HAVE_TPETRA_MMM_TIMINGS
9135  tmIESFC = Teuchos::null;
9136 #endif
9137 
9138  if (verbose) {
9139  std::ostringstream os;
9140  os << *verbosePrefix << "Done" << endl;
9141  std::cerr << os.str ();
9142  }
9143  } //transferAndFillComplete
9144 
9145 
// Single-Import overload of the import-and-fill-complete member (the class
// reference names it importAndFillComplete; the declarator lines carrying the
// member name and the first parameter, destMatrix, are absent from this
// listing).
//
// Thin forwarding wrapper: hands every argument to transferAndFillComplete,
// using `importer` as the transfer object and Teuchos::null in the third
// argument slot (the slot the two-Import overload fills with its
// domainImporter).
//
// importer  - Import object forwarded unchanged.
// domainMap - forwarded unchanged.
// rangeMap  - forwarded unchanged.
// params    - forwarded unchanged; when non-null, transferAndFillComplete
//             reads "compute global constants" from it (default true).
9146  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9147  void
9150  const import_type& importer,
9151  const Teuchos::RCP<const map_type>& domainMap,
9152  const Teuchos::RCP<const map_type>& rangeMap,
9153  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9154  {
9155  transferAndFillComplete (destMatrix, importer, Teuchos::null, domainMap, rangeMap, params);
9156  }
9157 
// Two-Import overload (row Import plus domain Import). The declarator lines
// carrying the member name and the first parameter, destMatrix, are absent
// from this listing; presumably this is importAndFillComplete -- confirm
// against the class declaration.
//
// Thin forwarding wrapper around transferAndFillComplete: `rowImporter` is
// passed as the transfer object and `domainImporter` is passed in the third
// argument slot, wrapped with Teuchos::rcpFromRef.
// NOTE(review): rcpFromRef presumably yields a non-owning RCP, so
// domainImporter would need to outlive this call -- confirm.
//
// domainMap, rangeMap, params - forwarded unchanged.
9158  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9159  void
9162  const import_type& rowImporter,
9163  const import_type& domainImporter,
9164  const Teuchos::RCP<const map_type>& domainMap,
9165  const Teuchos::RCP<const map_type>& rangeMap,
9166  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9167  {
9168  transferAndFillComplete (destMatrix, rowImporter, Teuchos::rcpFromRef(domainImporter), domainMap, rangeMap, params);
9169  }
9170 
// Single-Export overload of the export-and-fill-complete member (the class
// reference names it exportAndFillComplete; the declarator lines carrying the
// member name and the first parameter, destMatrix, are absent from this
// listing).
//
// Thin forwarding wrapper: hands every argument to transferAndFillComplete,
// using `exporter` as the transfer object and Teuchos::null in the third
// argument slot (the slot the two-Export overload fills with its
// domainExporter).
//
// exporter  - Export object forwarded unchanged.
// domainMap - forwarded unchanged.
// rangeMap  - forwarded unchanged.
// params    - forwarded unchanged; when non-null, transferAndFillComplete
//             reads "compute global constants" from it (default true).
9171  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9172  void
9175  const export_type& exporter,
9176  const Teuchos::RCP<const map_type>& domainMap,
9177  const Teuchos::RCP<const map_type>& rangeMap,
9178  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9179  {
9180  transferAndFillComplete (destMatrix, exporter, Teuchos::null, domainMap, rangeMap, params);
9181  }
9182 
// Two-Export overload (row Export plus domain Export). The declarator lines
// carrying the member name and the first parameter, destMatrix, are absent
// from this listing; presumably this is exportAndFillComplete -- confirm
// against the class declaration.
//
// Thin forwarding wrapper around transferAndFillComplete: `rowExporter` is
// passed as the transfer object and `domainExporter` is passed in the third
// argument slot, wrapped with Teuchos::rcpFromRef.
// NOTE(review): rcpFromRef presumably yields a non-owning RCP, so
// domainExporter would need to outlive this call -- confirm.
//
// domainMap, rangeMap, params - forwarded unchanged.
9183  template <class Scalar, class LocalOrdinal, class GlobalOrdinal, class Node>
9184  void
9187  const export_type& rowExporter,
9188  const export_type& domainExporter,
9189  const Teuchos::RCP<const map_type>& domainMap,
9190  const Teuchos::RCP<const map_type>& rangeMap,
9191  const Teuchos::RCP<Teuchos::ParameterList>& params) const
9192  {
9193  transferAndFillComplete (destMatrix, rowExporter, Teuchos::rcpFromRef(domainExporter), domainMap, rangeMap, params);
9194  }
9195 
9196 } // namespace Tpetra
9197 
9198 //
9199 // Explicit instantiation macro
9200 //
9201 // Must be expanded from within the Tpetra namespace!
9202 //
9203 
// Expands to an explicit instantiation of the class template
// CrsMatrix<SCALAR, LO, GO, NODE>. The name CrsMatrix is unqualified, so the
// expansion site must be able to see it (i.e. be inside namespace Tpetra).
9204 #define TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR,LO,GO,NODE) \
9205  \
9206  template class CrsMatrix< SCALAR , LO , GO , NODE >;
9207 
// Expands to an explicit instantiation of the member function template
// CrsMatrix<SI, LO, GO, NODE>::convert<SO>() const, whose return type is
// Teuchos::RCP<CrsMatrix<SO, LO, GO, NODE>>.
//   SO - Scalar type of the returned matrix.
//   SI - Scalar type of the matrix convert() is called on.
9208 #define TPETRA_CRSMATRIX_CONVERT_INSTANT(SO,SI,LO,GO,NODE) \
9209  \
9210  template Teuchos::RCP< CrsMatrix< SO , LO , GO , NODE > > \
9211  CrsMatrix< SI , LO , GO , NODE >::convert< SO > () const;
9212 
// Expands to a body-less `template<>` declaration of the nonmember function
// importAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// form taking a single Import (`importer`) plus domain Map, range Map, and a
// ParameterList. The Import/Map template arguments are spelled through the
// matrix's local_ordinal_type / global_ordinal_type / node_type typedefs.
// NOTE(review): `template<>` followed by a declaration is explicit-
// specialization syntax rather than `template ...;` explicit instantiation,
// despite the _INSTANT suffix -- confirm this is intended.
9213 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9214  template<> \
9215  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9216  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9217  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9218  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9219  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& importer, \
9220  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9221  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9222  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9223  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9224  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9225  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9226  const Teuchos::RCP<Teuchos::ParameterList>& params);
9227 
// Expands to a body-less `template<>` declaration of the nonmember function
// importAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// form taking two Import objects (`rowImporter` and `domainImporter`) plus
// domain Map, range Map, and a ParameterList.
// NOTE(review): `template<>` followed by a declaration is explicit-
// specialization syntax rather than `template ...;` explicit instantiation,
// despite the _INSTANT suffix -- confirm this is intended.
9228 #define TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9229  template<> \
9230  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9231  importAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9232  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9233  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9234  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowImporter, \
9235  const Import<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9236  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9237  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainImporter, \
9238  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9239  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9240  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9241  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9242  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9243  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9244  const Teuchos::RCP<Teuchos::ParameterList>& params);
9245 
9246 
// Expands to a body-less `template<>` declaration of the nonmember function
// exportAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// form taking a single Export (`exporter`) plus domain Map, range Map, and a
// ParameterList. The Export/Map template arguments are spelled through the
// matrix's local_ordinal_type / global_ordinal_type / node_type typedefs.
// NOTE(review): `template<>` followed by a declaration is explicit-
// specialization syntax rather than `template ...;` explicit instantiation,
// despite the _INSTANT suffix -- confirm this is intended.
9247 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9248  template<> \
9249  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9250  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9251  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9252  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9253  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& exporter, \
9254  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9255  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9256  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9257  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9258  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9259  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9260  const Teuchos::RCP<Teuchos::ParameterList>& params);
9261 
// Expands to a body-less `template<>` declaration of the nonmember function
// exportAndFillCompleteCrsMatrix for CrsMatrix<SCALAR, LO, GO, NODE>, in the
// form taking two Export objects (`rowExporter` and `domainExporter`) plus
// domain Map, range Map, and a ParameterList.
// NOTE(review): `template<>` followed by a declaration is explicit-
// specialization syntax rather than `template ...;` explicit instantiation,
// despite the _INSTANT suffix -- confirm this is intended.
9262 #define TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9263  template<> \
9264  Teuchos::RCP<CrsMatrix<SCALAR, LO, GO, NODE> > \
9265  exportAndFillCompleteCrsMatrix (const Teuchos::RCP<const CrsMatrix<SCALAR, LO, GO, NODE> >& sourceMatrix, \
9266  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9267  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9268  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& rowExporter, \
9269  const Export<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9270  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9271  CrsMatrix<SCALAR, LO, GO, NODE>::node_type>& domainExporter, \
9272  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9273  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9274  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& domainMap, \
9275  const Teuchos::RCP<const Map<CrsMatrix<SCALAR, LO, GO, NODE>::local_ordinal_type, \
9276  CrsMatrix<SCALAR, LO, GO, NODE>::global_ordinal_type, \
9277  CrsMatrix<SCALAR, LO, GO, NODE>::node_type> >& rangeMap, \
9278  const Teuchos::RCP<Teuchos::ParameterList>& params);
9279 
9280 
// Umbrella macro for one (SCALAR, LO, GO, NODE) combination. It expands, in
// order, to: the CrsMatrix class instantiation, the single-Import and
// single-Export *AndFillCompleteCrsMatrix declarations, then the two-Import
// and two-Export variants. It does NOT expand
// TPETRA_CRSMATRIX_CONVERT_INSTANT; that macro has to be invoked separately.
9281 #define TPETRA_CRSMATRIX_INSTANT(SCALAR, LO, GO ,NODE) \
9282  TPETRA_CRSMATRIX_MATRIX_INSTANT(SCALAR, LO, GO, NODE) \
9283  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9284  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT(SCALAR, LO, GO, NODE) \
9285  TPETRA_CRSMATRIX_IMPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE) \
9286  TPETRA_CRSMATRIX_EXPORT_AND_FILL_COMPLETE_INSTANT_TWO(SCALAR, LO, GO, NODE)
9287 
9288 #endif // TPETRA_CRSMATRIX_DEF_HPP
Communication plan for data redistribution from a uniquely-owned to a (possibly) multiply-owned distr...
Teuchos::RCP< const map_type > getRowMap() const override
Returns the Map that describes the row distribution in this graph.
bool hasColMap() const override
Whether the matrix has a well-defined column Map.
Declaration and generic definition of traits class that tells Tpetra::CrsMatrix how to pack and unpac...
bool indicesAreSorted_
Whether the graph's indices are sorted in each row, on this process.
global_size_t getGlobalNumCols() const override
The number of global columns in the matrix.
Impl::CreateMirrorViewFromUnmanagedHostArray< ValueType, OutputDeviceType >::output_view_type create_mirror_view_from_raw_host_array(const OutputDeviceType &, ValueType *inPtr, const size_t inSize, const bool copy=true, const char label[]="")
Variant of Kokkos::create_mirror_view that takes a raw host 1-d array as input.
Functor for the ABSMAX CombineMode of Import and Export operations.
void checkInternalState() const
Check that this object's state is sane; throw if it's not.
Sparse matrix that presents a row-oriented interface that lets users read or modify entries...
void copyOffsets(const OutputViewType &dst, const InputViewType &src)
Copy row offsets (in a sparse graph or matrix) from src to dst. The offsets may have different types...
CrsGraph< LocalOrdinal, GlobalOrdinal, Node > crs_graph_type
The CrsGraph specialization suitable for this CrsMatrix specialization.
void replaceColMap(const Teuchos::RCP< const map_type > &newColMap)
Replace the matrix's column Map with the given Map.
virtual LocalOrdinal replaceGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceGlobalValues.
virtual bool supportsRowViews() const override
Return true if getLocalRowView() and getGlobalRowView() are valid for this object.
void replaceDomainMapAndImporter(const Teuchos::RCP< const map_type > &newDomainMap, Teuchos::RCP< const import_type > &newImporter)
Replace the current domain Map and Import with the given objects.
local_inds_dualv_type::t_host::const_type getLocalIndsViewHost(const RowInfo &rowinfo) const
Get a const, locally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myRo...
void merge2(IT1 &indResultOut, IT2 &valResultOut, IT1 indBeg, IT1 indEnd, IT2 valBeg, IT2)
Merge values in place, additively, with the same index.
static size_t mergeRowIndicesAndValues(size_t rowLen, local_ordinal_type *cols, impl_scalar_type *vals)
Merge duplicate row indices in the given row, along with their corresponding values.
void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
Teuchos::RCP< const map_type > getRangeMap() const override
The range Map of this matrix.
global_size_t getGlobalNumRows() const override
Number of global elements in the row map of this matrix.
size_t insertGlobalIndicesImpl(const local_ordinal_type lclRow, const global_ordinal_type inputGblColInds[], const size_t numInputInds)
Insert global indices, using an input local row index.
std::map< GlobalOrdinal, std::pair< Teuchos::Array< GlobalOrdinal >, Teuchos::Array< Scalar > > > nonlocals_
Nonlocal data added using insertGlobalValues().
static KOKKOS_INLINE_FUNCTION size_t unpackValue(LO &outVal, const char inBuf[])
Unpack the given value from the given input buffer.
void sortAndMergeIndicesAndValues(const bool sorted, const bool merged)
Sort and merge duplicate local column indices in all rows on the calling process, along with their co...
typename device_type::execution_space execution_space
The Kokkos execution space.
void packNew(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &exportLIDs, Kokkos::DualView< char *, buffer_device_type > &exports, const Kokkos::DualView< size_t *, buffer_device_type > &numPacketsPerLID, size_t &constantNumPackets) const
Pack this object's data for an Import or Export.
virtual void insertGlobalValuesImpl(crs_graph_type &graph, RowInfo &rowInfo, const GlobalOrdinal gblColInds[], const impl_scalar_type vals[], const size_t numInputEnt)
Common implementation detail of insertGlobalValues and insertGlobalValuesFiltered.
Declaration of Tpetra::Details::Profiling, a scope guard for Kokkos Profiling.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Get the number of entries in the given row (local index).
typename Kokkos::ArithTraits< impl_scalar_type >::mag_type mag_type
Type of a norm result.
void putScalar(const Scalar &value)
Set all values in the multivector with the given value.
size_t getNumVectors() const
Number of columns in the multivector.
void getGlobalRowView(GlobalOrdinal GlobalRow, global_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant, nonpersisting view of a row of this matrix, using global row and column indices...
size_t getLocalLength() const
Local number of rows on the calling process.
Declaration of a function that prints strings from each process.
void setAllToScalar(const Scalar &alpha)
Set all matrix entries equal to alpha.
bool isConstantStride() const
Whether this multivector has constant stride between columns.
Traits class for packing / unpacking data of type T.
void replaceRangeMapAndExporter(const Teuchos::RCP< const map_type > &newRangeMap, Teuchos::RCP< const export_type > &newExporter)
Replace the current Range Map and Export with the given objects.
virtual size_t getNumEntriesInLocalRow(LocalOrdinal localRow) const =0
The current number of entries on the calling process in the specified local row.
void scale(const Scalar &alpha)
Scale the matrix's values: this := alpha*this.
Teuchos::RCP< const map_type > getDomainMap() const override
Returns the Map associated with the domain of this graph.
size_t getLocalNumCols() const override
The number of columns connected to the locally owned rows of this matrix.
Declare and define Tpetra::Details::copyOffsets, an implementation detail of Tpetra (in particular...
void fillLocalGraphAndMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local graph and matrix.
void resumeFill(const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Resume operations that may change the values or structure of the matrix.
void leftScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the left with the given Vector.
bool noRedundancies_
Whether the graph's indices are non-redundant (merged) in each row, on this process.
GlobalOrdinal global_ordinal_type
The type of each global index in the matrix.
void padCrsArrays(const RowPtr &rowPtrBeg, const RowPtr &rowPtrEnd, Indices &indices_wdv, const Padding &padding, const int my_rank, const bool verbose)
Determine if the row pointers and indices arrays need to be resized to accommodate new entries...
bool isDistributed() const
Whether this is a globally distributed object.
void reindexColumns(crs_graph_type *const graph, const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
Teuchos::RCP< const crs_graph_type > getCrsGraph() const
This matrix's graph, as a CrsGraph.
global_inds_dualv_type::t_host::const_type getGlobalIndsViewHost(const RowInfo &rowinfo) const
Get a const, globally indexed view of the locally owned row myRow, such that rowinfo = getRowInfo(myR...
static bool debug()
Whether Tpetra is in debug mode.
size_t getGlobalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, over all processes in the matrix's communicator...
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRangeMap() const =0
The Map associated with the range of this operator, which must be compatible with Y...
void applyNonTranspose(const MV &X_in, MV &Y_in, Scalar alpha, Scalar beta) const
Special case of apply() for mode == Teuchos::NO_TRANS.
Teuchos::RCP< CrsMatrix< T, LocalOrdinal, GlobalOrdinal, Node > > convert() const
Return another CrsMatrix with the same entries, but converted to a different Scalar type T...
Scalar scalar_type
The type of each entry in the matrix.
Allocation information for a locally owned row in a CrsGraph or CrsMatrix.
void swap(CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &matrix)
Swaps the data from *this with the data and maps from crsMatrix.
void fillLocalMatrix(const Teuchos::RCP< Teuchos::ParameterList > &params)
Fill data into the local matrix.
local_inds_wdv_type lclIndsUnpacked_wdv
Local ordinals of column indices for all rows. Valid when isLocallyIndexed is true. If OptimizedStorage...
void verbosePrintArray(std::ostream &out, const ArrayType &x, const char name[], const size_t maxNumToPrint)
Print min(x.size(), maxNumToPrint) entries of x.
bool isGloballyIndexed() const override
Whether the graph's column indices are stored as global indices.
void leftScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Left-scale a KokkosSparse::CrsMatrix.
Teuchos_Ordinal Array_size_type
Size type for Teuchos Array objects.
void globalAssemble()
Communicate nonlocal contributions to other processes.
void getLocalDiagCopy(Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &diag) const override
Get a constant, nonpersisting view of a row of this matrix, using local row and column indices...
size_t findGlobalIndices(const RowInfo &rowInfo, const Teuchos::ArrayView< const global_ordinal_type > &indices, std::function< void(const size_t, const size_t, const size_t)> fun) const
Finds indices in the given row.
KokkosSparse::CrsMatrix< impl_scalar_type, local_ordinal_type, device_type, void, typename local_graph_device_type::size_type > local_matrix_device_type
The specialization of Kokkos::CrsMatrix that represents the part of the sparse matrix on each MPI pro...
virtual LocalOrdinal sumIntoLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoLocalValues.
void sort(View &view, const size_t &size)
Convenience wrapper for std::sort for host-accessible views.
void gathervPrint(std::ostream &out, const std::string &s, const Teuchos::Comm< int > &comm)
On Process 0 in the given communicator, print strings from each process in that communicator, in rank order.
bool isFillActive() const
Whether the matrix is not fill complete.
Teuchos::RCP< MV > importMV_
Column Map MultiVector used in apply().
Declare and define Tpetra::Details::copyConvert, an implementation detail of Tpetra (in particular...
bool isStaticGraph() const
Indicates that the graph is static, so that new entries cannot be added to this matrix.
size_t global_size_t
Global size_t object.
bool hasTransposeApply() const override
Whether apply() allows applying the transpose or conjugate transpose.
void reindexColumns(const Teuchos::RCP< const map_type > &newColMap, const Teuchos::RCP< const import_type > &newImport=Teuchos::null, const bool sortIndicesInEachRow=true)
Reindex the column indices in place, and replace the column Map. Optionally, replace the Import objec...
void deep_copy(MultiVector< DS, DL, DG, DN > &dst, const MultiVector< SS, SL, SG, SN > &src)
Copy the contents of the MultiVector src into dst.
dual_view_type::t_host::const_type getLocalViewHost(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on host. This requires that ther...
size_t getLocalMaxNumRowEntries() const override
Maximum number of entries in any row of the matrix, on this process.
void exportAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const export_type &exporter, const Teuchos::RCP< const map_type > &domainMap=Teuchos::null, const Teuchos::RCP< const map_type > &rangeMap=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Export from this to the given destination matrix, and make the result fill complete.
static KOKKOS_INLINE_FUNCTION size_t packValue(char outBuf[], const LO &inVal)
Pack the given value of type value_type into the given output buffer of bytes (char).
Insert new values that don't currently exist.
values_dualv_type::t_dev::const_type getValuesViewDevice(const RowInfo &rowinfo) const
Get a const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow)...
bool isFillComplete() const override
Whether the matrix is fill complete.
bool isSorted() const
Whether graph indices in all rows are known to be sorted.
Teuchos::RCP< const Teuchos::Comm< int > > getComm() const override
The communicator over which the matrix is distributed.
Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > createOneToOne(const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &M)
Nonmember constructor for a contiguous Map with user-defined weights and a user-specified, possibly nondefault Kokkos Node type.
void importAndFillComplete(Teuchos::RCP< CrsMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > &destMatrix, const import_type &importer, const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null) const
Import from this to the given destination matrix, and make the result fill complete.
void scale(const Scalar &alpha)
Scale in place: this = alpha*this.
virtual void getGlobalRowCopy(GlobalOrdinal GlobalRow, nonconst_global_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const =0
Get a copy of the given global row's entries.
Teuchos::RCP< const map_type > rowMap_
The Map describing the distribution of rows of the graph.
TPETRA_DETAILS_ALWAYS_INLINE local_matrix_device_type getLocalMatrixDevice() const
The local sparse matrix.
num_row_entries_type k_numRowEntries_
The number of local entries in each locally owned row.
global_ordinal_type getGlobalElement(local_ordinal_type localIndex) const
The global index corresponding to the given local index.
bool isNodeLocalElement(local_ordinal_type localIndex) const
Whether the given local index is valid for this Map on the calling process.
Functions for manipulating CRS arrays.
Kokkos::View< size_t *, Kokkos::LayoutLeft, device_type >::HostMirror num_row_entries_type
Row offsets for "1-D" storage.
GlobalOrdinal getIndexBase() const override
The index base for global indices for this matrix.
Declare and define the functions Tpetra::Details::computeOffsetsFromCounts and Tpetra::computeOffsets...
Communication plan for data redistribution from a (possibly) multiply-owned to a uniquely-owned distr...
void unpackAndCombineIntoCrsArrays(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, const size_t constantNumPackets, const CombineMode combineMode, const size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs, size_t TargetNumRows, size_t TargetNumNonzeros, const int MyTargetPID, const Teuchos::ArrayView< size_t > &CRS_rowptr, const Teuchos::ArrayView< GO > &CRS_colind, const Teuchos::ArrayView< const int > &SourcePids, Teuchos::Array< int > &TargetPids)
unpackAndCombineIntoCrsArrays
void sort2(const IT1 &first1, const IT1 &last1, const IT2 &first2, const bool stableSort=false)
Sort the first array, and apply the resulting permutation to the second array.
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getDomainMap() const =0
The Map associated with the domain of this operator, which must be compatible with X...
Teuchos::RCP< MV > getColumnMapMultiVector(const MV &X_domainMap, const bool force=false) const
Create a (or fetch a cached) column Map MultiVector.
#define TPETRA_ABUSE_WARNING(throw_exception_test, Exception, msg)
Handle an abuse warning, according to HAVE_TPETRA_THROW_ABUSE_WARNINGS and HAVE_TPETRA_PRINT_ABUSE_WA...
bool isNodeGlobalElement(global_ordinal_type globalIndex) const
Whether the given global index is owned by this Map on the calling process.
void packCrsMatrixNew(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports, const Kokkos::DualView< size_t *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &numPacketsPerLID, const Kokkos::DualView< const LO *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exportLIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication, for "new" DistObject inter...
void describe(Teuchos::FancyOStream &out, const Teuchos::EVerbosityLevel verbLevel=Teuchos::Describable::verbLevel_default) const override
Print this object with the given verbosity level to the given output stream.
Teuchos::RCP< const RowGraph< LocalOrdinal, GlobalOrdinal, Node > > getGraph() const override
This matrix's graph, as a RowGraph.
static bool verbose()
Whether Tpetra is in verbose mode.
CombineMode
Rule for combining data in an Import or Export.
Sum new values.
void unpackAndCombine(const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &importLIDs, Kokkos::DualView< char *, buffer_device_type > imports, Kokkos::DualView< size_t *, buffer_device_type > numPacketsPerLID, const size_t constantNumPackets, const CombineMode CM) override
Unpack the imported column indices and values, and combine into matrix.
bool isFillComplete() const override
Whether fillComplete() has been called and the graph is in compute mode.
bool haveGlobalConstants() const
Returns true if globalConstants have been computed; false otherwise.
bool isGloballyIndexed() const override
Whether the matrix is globally indexed on the calling process.
RowInfo getRowInfoFromGlobalRowIndex(const global_ordinal_type gblRow) const
Get information about the locally owned row with global index gblRow.
LocalOrdinal sumIntoGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using global indices.
Utility functions for packing and unpacking sparse matrix entries.
void copyConvert(const OutputViewType &dst, const InputViewType &src)
Copy values from the 1-D Kokkos::View src, to the 1-D Kokkos::View dst, of the same length...
virtual Teuchos::RCP< RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > > add(const Scalar &alpha, const RowMatrix< Scalar, LocalOrdinal, GlobalOrdinal, Node > &A, const Scalar &beta, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &domainMap, const Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params) const override
Implementation of RowMatrix::add: return alpha*A + beta*this.
bool fillComplete_
Whether the matrix is fill complete.
local_ordinal_type replaceGlobalValues(const global_ordinal_type globalRow, const Kokkos::View< const global_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using global indices.
Replace old value with maximum of magnitudes of old and new values.
virtual LocalOrdinal sumIntoGlobalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const GlobalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts, const bool atomic=useAtomicUpdatesByDefault)
Implementation detail of sumIntoGlobalValues.
Abstract base class for objects that can be the source of an Import or Export operation.
local_ordinal_type sumIntoLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals, const bool atomic=useAtomicUpdatesByDefault)
Sum into one or more sparse matrix entries, using local row and column indices.
typename Node::device_type device_type
The Kokkos device type.
size_t getNumEntriesInLocalRow(local_ordinal_type localRow) const override
Number of entries in the sparse matrix in the given local row, on the calling (MPI) process...
static LocalMapType::local_ordinal_type getDiagCopyWithoutOffsets(const DiagType &D, const LocalMapType &rowMap, const LocalMapType &colMap, const CrsMatrixType &A)
Given a locally indexed, local sparse matrix, and corresponding local row and column Maps...
Teuchos::RCP< MV > getRowMapMultiVector(const MV &Y_rangeMap, const bool force=false) const
Create a (or fetch a cached) row Map MultiVector.
std::string description() const override
A one-line description of this object.
void getLocalDiagOffsets(Teuchos::ArrayRCP< size_t > &offsets) const
Get offsets of the diagonal entries in the matrix.
void apply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, Teuchos::ETransp mode=Teuchos::NO_TRANS, Scalar alpha=Teuchos::ScalarTraits< Scalar >::one(), Scalar beta=Teuchos::ScalarTraits< Scalar >::zero()) const override
Compute a sparse matrix-MultiVector multiply.
size_t getLocalNumEntries() const override
The local number of entries in this matrix.
LO getLocalDiagCopyWithoutOffsetsNotFillComplete(::Tpetra::Vector< SC, LO, GO, NT > &diag, const ::Tpetra::RowMatrix< SC, LO, GO, NT > &A, const bool debug=false)
Given a locally indexed, global sparse matrix, extract the matrix's diagonal entries into a Tpetra::V...
Replace existing values with new values.
void fillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Tell the matrix that you are done changing its structure or values, and that you are ready to do comp...
Teuchos::RCP< const map_type > getRangeMap() const override
Returns the Map associated with the range of this graph.
dual_view_type::t_dev::const_type getLocalViewDevice(Access::ReadOnlyStruct) const
Return a read-only, up-to-date view of this MultiVector's local data on device. This requires that th...
Replace old values with zero.
const row_ptrs_host_view_type & getRowPtrsUnpackedHost() const
Get the unpacked row pointers on host. Lazily make a copy from device.
std::string combineModeToString(const CombineMode combineMode)
Human-readable string representation of the given CombineMode.
void insertGlobalValues(const GlobalOrdinal globalRow, const Teuchos::ArrayView< const GlobalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals)
Insert one or more entries into the matrix, using global column indices.
static size_t rowImbalanceThreshold()
Threshold for deciding if a local matrix is "imbalanced" in the number of entries per row...
bool isLocallyComplete() const
Is this Export or Import locally complete?
virtual LocalOrdinal replaceLocalValuesImpl(impl_scalar_type rowVals[], const crs_graph_type &graph, const RowInfo &rowInfo, const LocalOrdinal inds[], const impl_scalar_type newVals[], const LocalOrdinal numElts)
Implementation detail of replaceLocalValues.
Declaration and definition of Tpetra::Details::leftScaleLocalCrsMatrix.
RowInfo getRowInfo(const local_ordinal_type myRow) const
Get information about the locally owned row with local index myRow.
values_dualv_type::t_dev getValuesViewDeviceNonConst(const RowInfo &rowinfo)
Get a non-const Device view of the locally owned values row myRow, such that rowinfo = getRowInfo(myR...
std::string dualViewStatusToString(const DualViewType &dv, const char name[])
Return the status of the given Kokkos::DualView, as a human-readable string.
virtual void removeEmptyProcessesInPlace(const Teuchos::RCP< const map_type > &newMap) override
Remove processes owning zero rows from the Maps and their communicator.
virtual bool checkSizes(const SrcDistObject &source) override
Compare the source and target (this) objects for compatibility.
local_map_type getLocalMap() const
Get the LocalMap for Kokkos-Kernels.
A distributed graph accessed by rows (adjacency lists) and stored sparsely.
typename row_matrix_type::impl_scalar_type impl_scalar_type
The type used internally in place of Scalar.
size_t unpackAndCombineWithOwningPIDsCount(const CrsGraph< LO, GO, NT > &sourceGraph, const Teuchos::ArrayView< const LO > &importLIDs, const Teuchos::ArrayView< const typename CrsGraph< LO, GO, NT >::packet_type > &imports, const Teuchos::ArrayView< const size_t > &numPacketsPerLID, size_t constantNumPackets, CombineMode combineMode, size_t numSameIDs, const Teuchos::ArrayView< const LO > &permuteToLIDs, const Teuchos::ArrayView< const LO > &permuteFromLIDs)
Special version of Tpetra::Details::unpackCrsGraphAndCombine that also unpacks owning process ranks...
A parallel distribution of indices over processes.
void getLocalRowCopy(LocalOrdinal LocalRow, nonconst_local_inds_host_view_type &Indices, nonconst_values_host_view_type &Values, size_t &NumEntries) const override
Fill given arrays with a deep copy of the locally owned entries of the matrix in a given row...
void doExport(const SrcDistObject &source, const Export< LocalOrdinal, GlobalOrdinal, Node > &exporter, const CombineMode CM, const bool restrictedMode=false)
Export data into this object using an Export object ("forward mode").
Teuchos::RCP< const map_type > getDomainMap() const override
The domain Map of this matrix.
Teuchos::ArrayView< typename DualViewType::t_dev::value_type > getArrayViewFromDualView(const DualViewType &x)
Get a Teuchos::ArrayView which views the host Kokkos::View of the input 1-D Kokkos::DualView.
static KOKKOS_INLINE_FUNCTION size_t packValueCount(const LO &)
Number of bytes required to pack or unpack the given value of type value_type.
void insertLocalValues(const LocalOrdinal localRow, const Teuchos::ArrayView< const LocalOrdinal > &cols, const Teuchos::ArrayView< const Scalar > &vals, const CombineMode CM=ADD)
Insert one or more entries into the matrix, using local column indices.
Internal functions and macros designed for use with Tpetra::Import and Tpetra::Export objects...
Details::EStorageStatus storageStatus_
Status of the matrix's storage, when not in a fill-complete state.
A read-only, row-oriented interface to a sparse matrix.
void rightScale(const Vector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &x) override
Scale the matrix on the right with the given Vector.
void replaceDomainMap(const Teuchos::RCP< const map_type > &newDomainMap)
Replace the current domain Map with the given objects.
Scalar operator()(const Scalar &x, const Scalar &y)
Return the maximum of the magnitudes (absolute values) of x and y.
local_ordinal_type getLocalElement(global_ordinal_type globalIndex) const
The local index corresponding to the given global index.
void getLocalRowView(LocalOrdinal LocalRow, local_inds_host_view_type &indices, values_host_view_type &values) const override
Get a constant view of a row of this matrix, using local row and column indices.
values_dualv_type::t_host::const_type getValuesViewHost(const RowInfo &rowinfo) const
Get a const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow).
bool isLocallyIndexed() const override
Whether the graph's column indices are stored as local indices.
A distributed dense vector.
Declaration of Tpetra::Details::iallreduce.
void reduce()
Sum values of a locally replicated multivector across all processes.
Declaration and definition of Tpetra::Details::castAwayConstDualView, an implementation detail of Tpe...
std::shared_ptr< CommRequest > iallreduce(const InputViewType &sendbuf, const OutputViewType &recvbuf, const ::Teuchos::EReductionType op, const ::Teuchos::Comm< int > &comm)
Nonblocking all-reduce, for either rank-1 or rank-0 Kokkos::View objects.
void applyTranspose(const MV &X_in, MV &Y_in, const Teuchos::ETransp mode, Scalar alpha, Scalar beta) const
Special case of apply() for mode != Teuchos::NO_TRANS.
OffsetsViewType::non_const_value_type computeOffsetsFromCounts(const ExecutionSpace &execSpace, const OffsetsViewType &ptr, const CountsViewType &counts)
Compute offsets from counts.
void allocateValues(ELocalGlobal lg, GraphAllocationStatus gas, const bool verbose)
Allocate values (and optionally indices) using the Node.
Kokkos::DualView< ValueType *, DeviceType > castAwayConstDualView(const Kokkos::DualView< const ValueType *, DeviceType > &input_dv)
Cast away const-ness of a 1-D Kokkos::DualView.
void expertStaticFillComplete(const Teuchos::RCP< const map_type > &domainMap, const Teuchos::RCP< const map_type > &rangeMap, const Teuchos::RCP< const import_type > &importer=Teuchos::null, const Teuchos::RCP< const export_type > &exporter=Teuchos::null, const Teuchos::RCP< Teuchos::ParameterList > &params=Teuchos::null)
Perform a fillComplete on a matrix that already has data.
virtual Teuchos::RCP< const map_type > getMap() const
The Map describing the parallel distribution of this object.
size_t getNumEntriesInGlobalRow(GlobalOrdinal globalRow) const override
Number of entries in the sparse matrix in the given global row, on the calling (MPI) process...
static size_t verbosePrintCountThreshold()
Number of entries below which arrays, lists, etc. will be printed in debug mode.
local_matrix_device_type::values_type::const_type getLocalValuesDevice(Access::ReadOnlyStruct s) const
Get the Kokkos local values on device, read only.
void setAllValues(const typename local_graph_device_type::row_map_type &ptr, const typename local_graph_device_type::entries_type::non_const_type &ind, const typename local_matrix_device_type::values_type &val)
Set the local matrix using three (compressed sparse row) arrays.
bool isLocallyIndexed() const override
Whether the matrix is locally indexed on the calling process.
Teuchos::RCP< const map_type > getColMap() const override
The Map that describes the column distribution in this matrix.
local_ordinal_type replaceLocalValues(const local_ordinal_type localRow, const Kokkos::View< const local_ordinal_type *, Kokkos::AnonymousSpace > &inputInds, const Kokkos::View< const impl_scalar_type *, Kokkos::AnonymousSpace > &inputVals)
Replace one or more entries' values, using local row and column indices.
Declaration and definition of Tpetra::Details::rightScaleLocalCrsMatrix.
Declaration and definition of Tpetra::Details::getEntryOnHost.
void packCrsMatrixWithOwningPIDs(const CrsMatrix< ST, LO, GO, NT > &sourceMatrix, Kokkos::DualView< char *, typename DistObject< char, LO, GO, NT >::buffer_device_type > &exports_dv, const Teuchos::ArrayView< size_t > &numPacketsPerLID, const Teuchos::ArrayView< const LO > &exportLIDs, const Teuchos::ArrayView< const int > &sourcePIDs, size_t &constantNumPackets)
Pack specified entries of the given local sparse matrix for communication.
void replaceRangeMap(const Teuchos::RCP< const map_type > &newRangeMap)
Replace the current range Map with the given objects.
std::shared_ptr< local_multiply_op_type > getLocalMultiplyOperator() const
The local sparse matrix operator (a wrapper of getLocalMatrixDevice() that supports local matrix-vect...
Teuchos::RCP< const map_type > colMap_
The Map describing the distribution of columns of the graph.
LocalOrdinal local_ordinal_type
The type of each local index in the matrix.
mag_type getFrobeniusNorm() const override
Compute and return the Frobenius norm of the matrix.
std::unique_ptr< std::string > createPrefix(const int myRank, const char prefix[])
Create string prefix for each line of verbose output.
Definition: Tpetra_Util.cpp:71
virtual Teuchos::RCP< const Map< LocalOrdinal, GlobalOrdinal, Node > > getRowMap() const =0
The Map that describes the distribution of rows over processes.
Accumulate new values into existing values (may not be supported in all classes).
void localApply(const MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &X, MultiVector< Scalar, LocalOrdinal, GlobalOrdinal, Node > &Y, const Teuchos::ETransp mode=Teuchos::NO_TRANS, const Scalar &alpha=Teuchos::ScalarTraits< Scalar >::one(), const Scalar &beta=Teuchos::ScalarTraits< Scalar >::zero()) const
Compute the local part of a sparse matrix-(Multi)Vector multiply.
void rightScaleLocalCrsMatrix(const LocalSparseMatrixType &A_lcl, const ScalingFactorsViewType &scalingFactors, const bool assumeSymmetric, const bool divide=true)
Right-scale a KokkosSparse::CrsMatrix.
bool isStorageOptimized() const
Returns true if storage has been optimized.
Description of Tpetra's behavior.
virtual void copyAndPermute(const SrcDistObject &source, const size_t numSameIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteToLIDs, const Kokkos::DualView< const local_ordinal_type *, buffer_device_type > &permuteFromLIDs, const CombineMode CM) override
values_dualv_type::t_host getValuesViewHostNonConst(const RowInfo &rowinfo)
Get a non-const Host view of the locally owned values row myRow, such that rowinfo = getRowInfo(myRow...
Functions that wrap Kokkos::create_mirror_view, in order to avoid deep copies when not necessary...
Declaration of Tpetra::Details::Behavior, a class that describes Tpetra's behavior.
size_t getLocalNumRows() const override
The number of matrix rows owned by the calling process.
Teuchos::RCP< MV > exportMV_
Row Map MultiVector used in apply().
Teuchos::RCP< const map_type > getRowMap() const override
The Map that describes the row distribution in this matrix.
global_size_t getGlobalNumEntries() const override
The global number of entries in this matrix.